diff --git a/include/tvm/ir/base_expr.h b/include/tvm/ir/base_expr.h
new file mode 100644
index 000000000000..0a844bb3ba8e
--- /dev/null
+++ b/include/tvm/ir/base_expr.h
@@ -0,0 +1,320 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/ir/base_expr.h
+ * \brief Base expression and primitive type nodes.
+ */
+#ifndef TVM_IR_BASE_EXPR_H_
+#define TVM_IR_BASE_EXPR_H_
+
+#include <tvm/ffi/cast.h>
+#include <tvm/ffi/dtype.h>
+#include <tvm/ffi/reflection/registry.h>
+#include <tvm/ir/source_map.h>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace tvm {
+
+/*!
+ * \brief Type is the base type of all types.
+ *
+ * TVM's type system contains the following subclasses:
+ *
+ * - PrimType: type of primitive type values used in the low-level IR.
+ * - FuncType: type of a function.
+ * - TensorType: type of certain Tensor values in the expression.
+ *
+ * There are also advanced types to support generic (polymorphic) types.
+ * \sa Type
+ */
+class TypeNode : public ffi::Object {
+ public:
+  /*!
+   * \brief Span that points to the original source code.
+   *        Reserved debug information.
+   */
+  mutable Span span;
+
+  static void RegisterReflection() {
+    namespace refl = tvm::ffi::reflection;
+    // span does not participate in structural equality and hashing.
+    refl::ObjectDef<TypeNode>().def_ro("span", &TypeNode::span, refl::DefaultValue(Span()),
+                                       refl::AttachFieldFlag::SEqHashIgnore());
+  }
+
+  static constexpr TVMFFISEqHashKind _type_s_eq_hash_kind = kTVMFFISEqHashKindTreeNode;
+
+  static constexpr const uint32_t _type_child_slots = 14;
+  TVM_FFI_DECLARE_OBJECT_INFO("ir.Type", TypeNode, ffi::Object);
+};
+
+/*!
+ * \brief Nullable managed reference to TypeNode.
+ * \sa TypeNode
+ */
+class Type : public ffi::ObjectRef {
+ public:
+  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Type, ffi::ObjectRef, TypeNode);
+};
+
+/*!
+ * \brief Primitive data types used in the low-level IR.
+ *
+ * PrimType represents POD values and handles that are
+ * not automatically managed by the runtime.
+ *
+ * \sa PrimType
+ */
+class PrimTypeNode final : public TypeNode {
+ public:
+  /*!
+   * \brief The raw DLPack dtype (code, bits, lanes) represented by this primitive type.
+   */
+  DLDataType dtype;
+
+  static void RegisterReflection() {
+    namespace refl = tvm::ffi::reflection;
+    refl::ObjectDef<PrimTypeNode>().def_ro("dtype", &PrimTypeNode::dtype);
+  }
+  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("ir.PrimType", PrimTypeNode, TypeNode);
+};
+
+/*!
+ * \brief Non-nullable managed reference to PrimTypeNode.
+ * \sa PrimTypeNode
+ */
+class PrimType final : public Type {
+ public:
+  /*!
+   * \brief Construct from a raw DLPack dtype.
+   * \param dtype The corresponding DLPack dtype.
+   */
+  TVM_DLL explicit PrimType(DLDataType dtype);
+
+  /*!
+   * \brief Construct from DLPack dtype fields.
+   * \param code The DLPack dtype code.
+   * \param bits The scalar bit width.
+   * \param lanes The fixed lane count.
+   */
+  TVM_DLL PrimType(DLDataTypeCode code, int bits, int lanes = 1);
+
+  /*! \brief Construct a signed integer type with fixed lanes. */
+  TVM_DLL static PrimType Int(int bits, int lanes = 1);
+  /*! \brief Construct an unsigned integer type with fixed lanes. */
+  TVM_DLL static PrimType UInt(int bits, int lanes = 1);
+  /*! \brief Construct a floating-point type with fixed lanes. */
+  TVM_DLL static PrimType Float(int bits, int lanes = 1);
+  /*! \brief Construct a bfloat type with fixed lanes. */
+  TVM_DLL static PrimType BFloat(int bits, int lanes = 1);
+  /*! \brief Construct a boolean type with fixed lanes. */
+  TVM_DLL static PrimType Bool(int lanes = 1);
+  /*! \brief Construct an opaque handle type. */
+  TVM_DLL static PrimType Handle(int bits = 64, int lanes = 1);
+  /*! \brief Construct the void sentinel type, encoded as handle(0, 0). */
+  TVM_DLL static PrimType Void();
+  /*!
+   * \brief Construct a scalable vector type.
+   * \param code The DLPack dtype code.
+   * \param bits The scalar bit width.
+   * \param lanes The positive vscale factor to encode in the DLPack lane field.
+   */
+  TVM_DLL static PrimType ScalableVector(DLDataTypeCode code, int bits, int lanes);
+
+  /*! \return The DLPack dtype code. */
+  TVM_FFI_INLINE DLDataTypeCode code() const {
+    return static_cast<DLDataTypeCode>(static_cast<int>(get()->dtype.code));
+  }
+
+  /*! \return The scalar bit width. */
+  TVM_FFI_INLINE int32_t bits() const { return get()->dtype.bits; }
+
+  /*!
+   * \return The fixed lane count.
+   * \note Throws InternalError when the encoded lane field is negative (scalable vector encoding).
+   */
+  TVM_FFI_INLINE int32_t lanes() const {
+    int16_t encoded_lanes = static_cast<int16_t>(get()->dtype.lanes);
+    if (TVM_FFI_PREDICT_FALSE(encoded_lanes < 0)) {
+      TVM_FFI_THROW(InternalError)
+          << "Can't fetch the lanes of a scalable vector at a compile time.";
+    }
+    return encoded_lanes;
+  }
+
+  /*!
+   * \brief Check the scalar element code and bit width.
+   * \note Lane count and scalable-vector encoding are intentionally ignored.
+   */
+  TVM_FFI_INLINE bool MatchesElementType(DLDataTypeCode code, int bits) const {
+    DLDataType dtype = get()->dtype;
+    return dtype.code == static_cast<uint8_t>(code) && dtype.bits == bits;
+  }
+
+  /*!
+   * \brief Check whether the dtype code matches any of the provided DLPack codes.
+   * \note Bit width and lanes are intentionally ignored.
+   */
+  template <typename... Codes>
+  TVM_FFI_INLINE bool MatchesCode(Codes... codes) const {
+    uint8_t dtype_code = get()->dtype.code;
+    return ((dtype_code == static_cast<uint8_t>(codes)) || ...);
+  }
+
+  /*! \brief Whether this type is a scalar (encoded lanes == 1); vectors and void are excluded. */
+  TVM_FFI_INLINE bool IsScalar() const {
+    int16_t encoded_lanes = static_cast<int16_t>(get()->dtype.lanes);
+    return encoded_lanes == 1;
+  }
+
+  /*! \brief Whether this type is the void sentinel `handle(0, 0)`. */
+  TVM_FFI_INLINE bool IsVoid() const {
+    DLDataType dtype = get()->dtype;
+    return dtype.code == static_cast<uint8_t>(DLDataTypeCode::kDLOpaqueHandle) && dtype.bits == 0 &&
+           static_cast<int16_t>(dtype.lanes) == 0;
+  }
+
+  /*! \brief Whether this type is an opaque handle, excluding the void sentinel. */
+  TVM_FFI_INLINE bool IsHandle() const {
+    return this->code() == DLDataTypeCode::kDLOpaqueHandle && !this->IsVoid();
+  }
+
+  /*! \brief Whether this type is a scalable vector (encoded lane field below -1). */
+  TVM_FFI_INLINE bool IsScalableVector() const {
+    return static_cast<int16_t>(get()->dtype.lanes) < -1;
+  }
+
+  /*! \brief Whether this type is a fixed-length vector (lane count above 1). */
+  TVM_FFI_INLINE bool IsFixedLengthVector() const {
+    return static_cast<int16_t>(get()->dtype.lanes) > 1;
+  }
+
+  /*!
+   * \brief Return the number of bytes needed to store one value of this type.
+   *
+   * This uses the same packed sub-byte dtype sizing rule as runtime tensors.
+   * Scalable vector types have no compile-time storage size and are rejected.
+   */
+  TVM_DLL size_t StorageBytes() const;
+
+  /*! \brief Return the same type with a different dtype code, preserving bits and lanes. */
+  TVM_FFI_INLINE PrimType WithCode(DLDataTypeCode code) const {
+    DLDataType dtype = get()->dtype;
+    int16_t encoded_lanes = static_cast<int16_t>(dtype.lanes);
+    if (encoded_lanes < -1) {
+      return ScalableVector(code, dtype.bits, -encoded_lanes);
+    }
+    return PrimType(code, dtype.bits, encoded_lanes);
+  }
+
+  /*! \brief Return the same type with a different scalar bit width, preserving code and lanes. */
+  TVM_FFI_INLINE PrimType WithBits(int bits) const {
+    DLDataType dtype = get()->dtype;
+    int16_t encoded_lanes = static_cast<int16_t>(dtype.lanes);
+    if (encoded_lanes < -1) {
+      return ScalableVector(this->code(), bits, -encoded_lanes);
+    }
+    return PrimType(this->code(), bits, encoded_lanes);
+  }
+
+  /*! \brief Return the same scalar element type with a fixed lane count. */
+  TVM_FFI_INLINE PrimType WithLanes(int lanes) const {
+    return PrimType(this->code(), this->bits(), lanes);
+  }
+
+  /*! \return The vscale factor of a scalable vector type; throws InternalError otherwise. */
+  TVM_FFI_INLINE int32_t VScaleFactor() const {
+    int16_t encoded_lanes = static_cast<int16_t>(get()->dtype.lanes);
+    if (encoded_lanes >= -1) {
+      TVM_FFI_THROW(InternalError) << "A fixed length vector doesn't have a vscale factor.";
+    }
+    return -encoded_lanes;
+  }
+
+  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NOTNULLABLE(PrimType, Type, PrimTypeNode);
+};
+
+inline bool operator==(const PrimType& lhs, const PrimType& rhs) {
+  return lhs->dtype == rhs->dtype;  // value equality on the DLPack dtype, not reference identity
+}
+
+inline bool operator!=(const PrimType& lhs, const PrimType& rhs) { return !(lhs == rhs); }
+
+/*!
+ * \brief Base node of all the expressions.
+ * \sa BaseExpr
+ */
+class BaseExprNode : public ffi::Object {
+ public:
+  /*!
+   * \brief Span that points to the original source code.
+   *        Reserved debug information.
+   */
+  mutable Span span;
+
+  /*!
+   * \brief The deduced or annotated type of the expression.
+   *
+   * This field is intentionally nullable because type information may
+   * be populated by later analysis passes instead of expression
+   * constructors.
+   */
+  mutable Type ty;
+
+  static void RegisterReflection() {
+    namespace refl = tvm::ffi::reflection;
+    // span and ty do not participate in structural equality and hashing.
+    refl::ObjectDef<BaseExprNode>()
+        .def_ro("span", &BaseExprNode::span, refl::DefaultValue(Span()),
+                refl::AttachFieldFlag::SEqHashIgnore())
+        .def_ro("ty", &BaseExprNode::ty, refl::DefaultValue(Type()),
+                refl::AttachFieldFlag::SEqHashIgnore());
+  }
+
+  static constexpr TVMFFISEqHashKind _type_s_eq_hash_kind = kTVMFFISEqHashKindTreeNode;
+
+  static constexpr const uint32_t _type_child_slots = 64;
+  TVM_FFI_DECLARE_OBJECT_INFO("ir.BaseExpr", BaseExprNode, ffi::Object);
+};
+
+/*!
+ * \brief Nullable managed reference to BaseExprNode.
+ * \sa BaseExprNode
+ */
+class BaseExpr : public ffi::ObjectRef {
+ public:
+  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(BaseExpr, ffi::ObjectRef, BaseExprNode);
+};
+
+namespace ffi {
+template <>
+inline constexpr bool use_default_type_traits_v<PrimType> = false;
+// Custom traits: a raw DLDataType fallback value is converted via PrimType(dtype).
+template <>
+struct TypeTraits<PrimType> : public ObjectRefWithFallbackTraitsBase<PrimType, DLDataType> {
+  TVM_FFI_INLINE static PrimType ConvertFallbackValue(DLDataType dtype) { return PrimType(dtype); }
+};
+}  // namespace ffi
+
+}  // namespace tvm
+
+#endif  // TVM_IR_BASE_EXPR_H_
diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h
index b81e4c2feda7..70e1ffeb480c 100644
--- a/include/tvm/ir/expr.h
+++ b/include/tvm/ir/expr.h
@@ -24,12 +24,13 @@
 #ifndef TVM_IR_EXPR_H_
 #define TVM_IR_EXPR_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/dataclass.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ffi/string.h>
+#include <tvm/ir/base_expr.h>
 #include <tvm/ir/cow.h>
 #include <tvm/ir/source_map.h>
-#include <tvm/runtime/data_type.h>
 
 #include <algorithm>
 #include <functional>
@@ -54,82 +55,6 @@ class VirtualDevice;
  * There are also advanced types to support generic(polymorphic types).
  * \sa Type
  */
-class TypeNode : public ffi::Object {
- public:
-  /*!
-   * \brief Span that points to the original source code.
-   *        Reserved debug information.
-   */
-  mutable Span span;
-
-  static void RegisterReflection() {
-    namespace refl = tvm::ffi::reflection;
-    // span do not participate in structural equal and hash.
-    refl::ObjectDef<TypeNode>().def_ro("span", &TypeNode::span, refl::DefaultValue(Span()),
-                                       refl::AttachFieldFlag::SEqHashIgnore());
-  }
-
-  static constexpr TVMFFISEqHashKind _type_s_eq_hash_kind = kTVMFFISEqHashKindTreeNode;
-
-  static constexpr const uint32_t _type_child_slots = 14;
-  TVM_FFI_DECLARE_OBJECT_INFO("ir.Type", TypeNode, ffi::Object);
-};
-
-/*!
- * \brief Managed reference to TypeNode.
- * \sa TypeNode
- */
-class Type : public ffi::ObjectRef {
- public:
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Type, ffi::ObjectRef, TypeNode);
-};
-
-/*!
- * \brief Base type of all the expressions.
- * \sa Expr
- */
-class BaseExprNode : public ffi::Object {
- public:
-  /*!
-   * \brief Span that points to the original source code.
-   *        Reserved debug information.
-   */
-  mutable Span span;
-
-  /*!
-   * \brief The deduced or annotated type of the expression.
-   *
-   * This field is intentionally nullable because type information may
-   * be populated by later analysis passes instead of expression
-   * constructors.
-   */
-  mutable Type ty;
-
-  static void RegisterReflection() {
-    namespace refl = tvm::ffi::reflection;
-    // span and ty do not participate in structural equal and hash.
-    refl::ObjectDef<BaseExprNode>()
-        .def_ro("span", &BaseExprNode::span, refl::DefaultValue(Span()),
-                refl::AttachFieldFlag::SEqHashIgnore())
-        .def_ro("ty", &BaseExprNode::ty, refl::DefaultValue(Type()),
-                refl::AttachFieldFlag::SEqHashIgnore());
-  }
-
-  static constexpr TVMFFISEqHashKind _type_s_eq_hash_kind = kTVMFFISEqHashKindTreeNode;
-
-  static constexpr const uint32_t _type_child_slots = 64;
-  TVM_FFI_DECLARE_OBJECT_INFO("ir.BaseExpr", BaseExprNode, ffi::Object);
-};
-
-/*!
- * \brief Managed reference to BaseExprNode.
- * \sa BaseExprNode
- */
-class BaseExpr : public ffi::ObjectRef {
- public:
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(BaseExpr, ffi::ObjectRef, BaseExprNode);
-};
-
 /*!
  * \brief Base node of all primitive expressions.
  *
@@ -144,25 +69,16 @@ class BaseExpr : public ffi::ObjectRef {
  */
 class PrimExprNode : public BaseExprNode {
  public:
-  /*!
-   * \brief The runtime data type of the primitive expression.
-   *
-   * runtime::DataType(dtype) provides coarse grained type information
-   * during compile time and runtime. It is eagerly built in
-   * PrimExpr expression construction and can be used for
-   * quick type checking.
-   *
-   * dtype is sufficient to decide the Type of the PrimExpr
-   * when it corresponds to POD value types such as i32.
-   *
-   * When dtype is DataType::Handle(), the expression could corresponds to
-   * a more fine-grained Type, and we can get the type by running lazy type inference.
-   */
-  DataType dtype;
+  /*! \return The inherited BaseExprNode::ty as a PrimType; it must be a defined PrimType. */
+  PrimType ty() const {  // hides the BaseExprNode::ty field, hence the qualified accesses below
+    TVM_FFI_DCHECK(this->BaseExprNode::ty.defined());
+    TVM_FFI_DCHECK(this->BaseExprNode::ty->IsInstance<PrimTypeNode>());
+    return ffi::GetRef<PrimType>(static_cast<const PrimTypeNode*>(this->BaseExprNode::ty.get()));
+  }
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
-    refl::ObjectDef<PrimExprNode>().def_ro("dtype", &PrimExprNode::dtype);
+    refl::ObjectDef<PrimExprNode>();
   }
 
   static constexpr const uint32_t _type_child_slots = 40;
@@ -186,8 +102,13 @@ class PrimExpr : public BaseExpr {
    */
   TVM_DLL PrimExpr(float value);  // NOLINT(*)
 
-  /*! \return the data type of this expression. */
-  DataType dtype() const { return static_cast<const PrimExprNode*>(get())->dtype; }
+  /*!
+   * \brief Get the primitive type of this expression.
+   * \return The PrimType stored on the underlying PrimExprNode.
+   * \note Delegates to PrimExprNode::ty() so the defined/PrimType checks live in one place;
+   *       the expression itself must be non-null.
+   */
+  PrimType ty() const { return static_cast<const PrimExprNode*>(get())->ty(); }
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(PrimExpr, BaseExpr, PrimExprNode);
 
@@ -554,11 +475,11 @@ class IntImm : public PrimExpr {
  public:
   /*!
    * \brief Constructor.
-   * \param dtype The data type of the value.
+   * \param value_ty The primitive type of the value.
    * \param value The internal value.
    * \param span The location of this object in the source code.
    */
-  TVM_DLL IntImm(DataType dtype, int64_t value, Span span = Span());
+  TVM_DLL IntImm(PrimType value_ty, int64_t value, Span span = Span());
 
   /*!
    * \brief Construct a scalar boolean constant.
@@ -566,7 +487,7 @@ class IntImm : public PrimExpr {
    * \param span The location of this object in the source code.
    */
   static IntImm Bool(bool value, Span span = Span()) {
-    return IntImm(DataType::Bool(), value, span);
+    return IntImm(PrimType::Bool(), value, span);
   }
 
   /*!
@@ -575,7 +496,7 @@ class IntImm : public PrimExpr {
    * \param span The location of this object in the source code.
    */
   static IntImm Int32(int64_t value, Span span = Span()) {
-    return IntImm(DataType::Int(32), value, span);
+    return IntImm(PrimType::Int(32), value, span);
   }
 
   /*!
@@ -584,7 +505,7 @@ class IntImm : public PrimExpr {
    * \param span The location of this object in the source code.
    */
   static IntImm Int64(int64_t value, Span span = Span()) {
-    return IntImm(DataType::Int(64), value, span);
+    return IntImm(PrimType::Int(64), value, span);
   }
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(IntImm, PrimExpr, IntImmNode);
@@ -616,11 +537,11 @@ class FloatImm : public PrimExpr {
  public:
   /*!
    * \brief Constructor.
-   * \param dtype The data type of the value.
+   * \param value_ty The primitive type of the value.
    * \param value The internal value.
    * \param span The location in the source code.
    */
-  TVM_DLL FloatImm(DataType dtype, double value, Span span = Span());
+  TVM_DLL FloatImm(PrimType value_ty, double value, Span span = Span());
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(FloatImm, PrimExpr, FloatImmNode);
   TVM_DEFINE_OBJECT_REF_COW_METHOD(FloatImmNode);
@@ -688,11 +609,11 @@ inline constexpr bool use_default_type_traits_v<IntImm> = false;
 template <>
 struct TypeTraits<IntImm> : public ObjectRefWithFallbackTraitsBase<IntImm, int64_t> {
   TVM_FFI_INLINE static IntImm ConvertFallbackValue(int64_t value) {
-    auto dtype =
+    auto value_ty =
         (value > std::numeric_limits<int>::max() || value < std::numeric_limits<int>::min())
-            ? DataType::Int(64)
-            : DataType::Int(32);
-    return IntImm(dtype, value);
+            ? PrimType::Int(64)
+            : PrimType::Int(32);
+    return IntImm(value_ty, value);
   }
 };
 
@@ -702,7 +623,7 @@ inline constexpr bool use_default_type_traits_v<FloatImm> = false;
 template <>
 struct TypeTraits<FloatImm> : public ObjectRefWithFallbackTraitsBase<FloatImm, double> {
   TVM_FFI_INLINE static FloatImm ConvertFallbackValue(double value) {
-    return FloatImm(runtime::DataType::Float(32), value);
+    return FloatImm(PrimType::Float(32), value);
   }
 };
 
diff --git a/include/tvm/ir/type.h b/include/tvm/ir/type.h
index 9c56d0376405..f63b5d261500 100644
--- a/include/tvm/ir/type.h
+++ b/include/tvm/ir/type.h
@@ -26,21 +26,19 @@
  *
  * This file contains types that are common across IR variants.
  *
- * ## Relation between Type and runtime::DataType
+ * ## Relation between Type and DLPack dtype
  *
- * Besides Type, we also store a dtype field in the low-level PrimExpr.
- * runtime::DataType(dtype) provides coarse grained type information
- * during compile time and runtime. It is eagerly built in
- * low-level expression construction and can be used for
- * quick type checking in the low-level IR.
- * For example, when an Expr's dtype is int32,
- * we know for sure that its type is also int32.
+ * PrimExpr stores a PrimType in its `ty` field, backed by a DLPack
+ * `DLDataType`. This provides coarse grained scalar/vector element type
+ * information during compile time and runtime. It is eagerly built in
+ * low-level expression construction and can be used for quick type checking
+ * in the low-level IR. For example, when an Expr's dtype is int32, we know
+ * for sure that its PrimType is also int32.
  *
  * On the other hand, Type provides more fine grained information.
- * For example, a low level expression can have DataType::Handle() as
- * its dtype and MemRef[float32] as its type.
- * Types are usually lazily constructed via type checking,
- * so they may not readily be available during IR construction.
+ * For example, a low-level expression can have a handle dtype while a
+ * node-specific type annotation records a PointerType to a float32
+ * element.
  *
  * The unified Type serves as a common bridge across IR dialects.
  * For example, we require all the functions to have a type signature,
@@ -49,55 +47,16 @@
 #ifndef TVM_IR_TYPE_H_
 #define TVM_IR_TYPE_H_
 
-#include <tvm/ffi/cast.h>
 #include <tvm/ffi/container/array.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/ir/expr.h>
+#include <tvm/ir/base_expr.h>
 #include <tvm/ir/source_map.h>
-#include <tvm/runtime/data_type.h>
 
 #include <string>
 
 namespace tvm {
 
-/*!
- * \brief Primitive data types used in the low-level IR.
- *
- * PrimType represents POD-values and handles that are
- * not automatically managed by the runtime.
- *
- * \sa PrimType
- */
-class PrimTypeNode : public TypeNode {
- public:
-  /*!
-   * \brief The corresponding dtype field.
-   */
-  runtime::DataType dtype;
-
-  static void RegisterReflection() {
-    namespace refl = tvm::ffi::reflection;
-    refl::ObjectDef<PrimTypeNode>().def_ro("dtype", &PrimTypeNode::dtype);
-  }
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("ir.PrimType", PrimTypeNode, TypeNode);
-};
-
-/*
- * \brief Managed reference to PrimTypeNode.
- * \sa PrimTypeNode
- */
-class PrimType : public Type {
- public:
-  /*!
-   * \brief Constructor
-   * \param dtype The corresponding dtype.
-   * \param span The span
-   */
-  TVM_DLL explicit PrimType(runtime::DataType dtype, Span span = Span());
-
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NOTNULLABLE(PrimType, Type, PrimTypeNode);
-};
-
 /*!
  * \brief Low-level raw pointer type.
  *
diff --git a/include/tvm/relax/attrs/create.h b/include/tvm/relax/attrs/create.h
index 14a3402f2503..76ef219a862c 100644
--- a/include/tvm/relax/attrs/create.h
+++ b/include/tvm/relax/attrs/create.h
@@ -31,7 +31,7 @@ namespace relax {
 
 /*! \brief Attributes used in full/full_like, ones/ones_like, and zeros/zeros_like operators */
 struct InitAttrs : public AttrsNode {
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
diff --git a/include/tvm/relax/attrs/datatype.h b/include/tvm/relax/attrs/datatype.h
index f67223edb546..aeac65e64484 100644
--- a/include/tvm/relax/attrs/datatype.h
+++ b/include/tvm/relax/attrs/datatype.h
@@ -31,7 +31,7 @@ namespace relax {
 
 /*! \brief Attributes used in astype operator */
 struct AstypeAttrs : public AttrsNode {
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -42,7 +42,7 @@ struct AstypeAttrs : public AttrsNode {
 
 /*! \brief Attributes used in wrap_param operator */
 struct WrapParamAttrs : public AttrsNode {
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
diff --git a/include/tvm/relax/attrs/image.h b/include/tvm/relax/attrs/image.h
index c9a720374036..8f512f28e55f 100644
--- a/include/tvm/relax/attrs/image.h
+++ b/include/tvm/relax/attrs/image.h
@@ -39,7 +39,7 @@ struct Resize2DAttrs : public AttrsNode {
   double cubic_alpha;
   int cubic_exclude;
   double extrapolation_value;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -88,7 +88,7 @@ struct Resize3DAttrs : public AttrsNode {
   double cubic_alpha;
   int cubic_exclude;
   double extrapolation_value;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
diff --git a/include/tvm/relax/attrs/linear_algebra.h b/include/tvm/relax/attrs/linear_algebra.h
index 817885edb871..19a5982bfe12 100644
--- a/include/tvm/relax/attrs/linear_algebra.h
+++ b/include/tvm/relax/attrs/linear_algebra.h
@@ -31,7 +31,7 @@ namespace relax {
 
 /*! \brief Attributes for matmul operator */
 struct MatmulAttrs : public AttrsNode {
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
diff --git a/include/tvm/relax/attrs/nn.h b/include/tvm/relax/attrs/nn.h
index 52d9c40d742d..aa3c0f4736f0 100644
--- a/include/tvm/relax/attrs/nn.h
+++ b/include/tvm/relax/attrs/nn.h
@@ -38,7 +38,7 @@ struct Conv1DAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -82,7 +82,7 @@ struct Conv2DAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -128,7 +128,7 @@ struct Conv3DAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -177,7 +177,7 @@ struct Conv1DTransposeAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -226,7 +226,7 @@ struct Conv2DTransposeAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -277,7 +277,7 @@ struct Conv3DTransposeAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
diff --git a/include/tvm/relax/attrs/qdq.h b/include/tvm/relax/attrs/qdq.h
index 83ec2223c3c7..be95b9e7b8ed 100644
--- a/include/tvm/relax/attrs/qdq.h
+++ b/include/tvm/relax/attrs/qdq.h
@@ -31,7 +31,7 @@ namespace relax {
 
 /*! \brief Attributes for relax.quantize/relax.dequantize operator */
 struct QuantizeAttrs : public AttrsNode {
-  DataType out_dtype;
+  DLDataType out_dtype;
   int axis;
 
   static void RegisterReflection() {
diff --git a/include/tvm/relax/attrs/sampling.h b/include/tvm/relax/attrs/sampling.h
index 11bbfb6eba31..07b7de25e553 100644
--- a/include/tvm/relax/attrs/sampling.h
+++ b/include/tvm/relax/attrs/sampling.h
@@ -31,13 +31,13 @@ namespace relax {
 
 /*! \brief Attributes used in multinomial_from_uniform operator */
 struct MultinomialFromUniformAttrs : public AttrsNode {
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
     refl::ObjectDef<MultinomialFromUniformAttrs>().def_ro(
         "dtype", &MultinomialFromUniformAttrs::dtype, "Data type of the output indices.",
-        refl::DefaultValue(DataType::Int(64)));
+        refl::DefaultValue((DLDataType{kDLInt, 64, 1})));
   }
   TVM_FFI_DECLARE_OBJECT_INFO_FINAL("relax.attrs.MultinomialFromUniformAttrs",
                                     MultinomialFromUniformAttrs, AttrsNode);
diff --git a/include/tvm/relax/attrs/sorting.h b/include/tvm/relax/attrs/sorting.h
index e8bf65d55a43..ef21bf9a637e 100644
--- a/include/tvm/relax/attrs/sorting.h
+++ b/include/tvm/relax/attrs/sorting.h
@@ -54,7 +54,7 @@ struct SortAttrs : public AttrsNode {
 struct ArgsortAttrs : public AttrsNode {
   int axis;
   bool descending;
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -68,7 +68,7 @@ struct ArgsortAttrs : public AttrsNode {
                 "If it is not specified, it defaults to the ascending order.",
                 refl::DefaultValue(false))
         .def_ro("dtype", &ArgsortAttrs::dtype, "DType of the output indices.",
-                refl::DefaultValue(DataType::Void()));
+                refl::DefaultValue((DLDataType{kDLOpaqueHandle, 0, 0})));
   }
   TVM_FFI_DECLARE_OBJECT_INFO_FINAL("relax.attrs.ArgsortAttrs", ArgsortAttrs, AttrsNode);
 };  // struct ArgsortAttrs
@@ -79,7 +79,7 @@ struct TopKAttrs : public AttrsNode {
   int axis;
   bool largest;
   ffi::String ret_type;
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -98,7 +98,7 @@ struct TopKAttrs : public AttrsNode {
                 "By default, return the largest k elements.",
                 refl::DefaultValue(true))
         .def_ro("dtype", &TopKAttrs::dtype, "Data type of the output indices.",
-                refl::DefaultValue(DataType::Void()));
+                refl::DefaultValue((DLDataType{kDLOpaqueHandle, 0, 0})));
   }
   TVM_FFI_DECLARE_OBJECT_INFO_FINAL("relax.attrs.TopKAttrs", TopKAttrs, AttrsNode);
 };  // struct TopKAttrs
diff --git a/include/tvm/relax/attrs/statistical.h b/include/tvm/relax/attrs/statistical.h
index 66996c802cc3..a815e0e07e51 100644
--- a/include/tvm/relax/attrs/statistical.h
+++ b/include/tvm/relax/attrs/statistical.h
@@ -50,7 +50,7 @@ struct StatisticalAttrs : public AttrsNode {
 /*! \brief Attributes used in scan operators like cumsum, cumprod */
 struct ScanopAttrs : public AttrsNode {
   ffi::Optional<int64_t> axis;
-  DataType dtype;
+  DLDataType dtype;
   bool exclusive = false;
 
   static void RegisterReflection() {
diff --git a/include/tvm/relax/dataflow_pattern.h b/include/tvm/relax/dataflow_pattern.h
index 27894da3addd..0511395f8a67 100644
--- a/include/tvm/relax/dataflow_pattern.h
+++ b/include/tvm/relax/dataflow_pattern.h
@@ -116,8 +116,8 @@ class DFPattern : public ffi::ObjectRef {
   TVM_DLL AttrPattern HasAttr(const ffi::Map<ffi::String, Any>& attrs) const;
   /*! \brief Syntatic Sugar for creating a TypePattern */
   TVM_DLL TypePattern HasType(const Type& ty) const;
-  /*! \brief Syntatic Sugar for creating a DataTypePattern with a DataType */
-  TVM_DLL DataTypePattern HasDtype(const DataType& dtype) const;
+  /*! \brief Syntactic sugar for creating a DataTypePattern with a dtype */
+  TVM_DLL DataTypePattern HasDtype(DLDataType dtype) const;
   /*! \brief Syntatic Sugar for creating a DataTypePattern with a data type's name */
   TVM_DLL DataTypePattern HasDtype(const std::string& dtype) const;
   /*! \brief Syntatic Sugar for creating a ShapePattern */
@@ -860,7 +860,7 @@ class SameShapeConstraint : public DFConstraint {
 class DataTypePatternNode : public DFPatternNode {
  public:
   DFPattern pattern; /*!< The root pattern to match */
-  DataType dtype;    /*!< The data type to match */
+  DLDataType dtype;  /*!< The data type to match */
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -878,7 +878,7 @@ class DataTypePatternNode : public DFPatternNode {
  */
 class DataTypePattern : public DFPattern {
  public:
-  TVM_DLL DataTypePattern(DFPattern pattern, DataType dtype);
+  TVM_DLL DataTypePattern(DFPattern pattern, DLDataType dtype);
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(DataTypePattern, DFPattern, DataTypePatternNode);
 };
 
diff --git a/include/tvm/relax/distributed/global_info.h b/include/tvm/relax/distributed/global_info.h
index 62ff904fc1a4..0347ec3b85a8 100644
--- a/include/tvm/relax/distributed/global_info.h
+++ b/include/tvm/relax/distributed/global_info.h
@@ -25,6 +25,7 @@
 #ifndef TVM_RELAX_DISTRIBUTED_GLOBAL_INFO_H_
 #define TVM_RELAX_DISTRIBUTED_GLOBAL_INFO_H_
 
+#include <tvm/ffi/container/shape.h>
 #include <tvm/ir/expr.h>
 #include <tvm/ir/module.h>
 namespace tvm {
diff --git a/include/tvm/relax/expr.h b/include/tvm/relax/expr.h
index 937091255b6f..0b75bf27a7d2 100644
--- a/include/tvm/relax/expr.h
+++ b/include/tvm/relax/expr.h
@@ -471,7 +471,7 @@ class StringImm : public LeafExpr {
 class DataTypeImmNode : public LeafExprNode {
  public:
   /*! \brief The data value. */
-  DataType value;
+  DLDataType value{kDLOpaqueHandle, 0, 0};  // defaults to void
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -491,7 +491,7 @@ class DataTypeImm : public LeafExpr {
    * \param value The value input.
    * \param span The source span of the expression.
    */
-  TVM_DLL explicit DataTypeImm(DataType value, Span span = Span());
+  TVM_DLL explicit DataTypeImm(DLDataType value, Span span = Span());
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(DataTypeImm, LeafExpr, DataTypeImmNode);
   TVM_DEFINE_OBJECT_REF_COW_METHOD(DataTypeImmNode);
diff --git a/include/tvm/relax/transform.h b/include/tvm/relax/transform.h
index d0d0d1bb5441..5c757ba15161 100644
--- a/include/tvm/relax/transform.h
+++ b/include/tvm/relax/transform.h
@@ -663,9 +663,8 @@ TVM_DLL Pass DataflowUseInplaceCalls();
  *
  * \note Mainly operates within dataflow blocks. ConvertToDataflow may need to be called first.
  */
-TVM_DLL Pass
-ToMixedPrecision(const DataType& out_dtype,
-                 ffi::Optional<ffi::Array<ffi::String>> fp16_input_names = std::nullopt);
+TVM_DLL Pass ToMixedPrecision(
+    DLDataType out_dtype, ffi::Optional<ffi::Array<ffi::String>> fp16_input_names = std::nullopt);
 
 /*!
  * \brief Rewrite a Relax module for executing with CUDA graph. This pass identifies
diff --git a/include/tvm/relax/type.h b/include/tvm/relax/type.h
index 9c27b627a7d6..a77a3cc66c38 100644
--- a/include/tvm/relax/type.h
+++ b/include/tvm/relax/type.h
@@ -124,7 +124,7 @@ class ShapeTypeNode : public TypeNode {
    * \brief The number of dimension of the shape, can be unknown.
    * \sa kUnknownNDim
    */
-  int ndim;
+  int ndim{kUnknownNDim};
 
   /*! \return Whether the type contains unknown ndim. */
   bool IsUnknownNdim() const { return ndim == kUnknownNDim; }
@@ -174,19 +174,19 @@ class TensorTypeNode : public TypeNode {
    *  is expected to be executed.
    */
   ffi::Optional<VDevice> vdevice;
-  /*! \brief The content data type, use void to denote the dtype is unknown. */
-  DataType dtype;
+  /*! \brief The content dtype, use void to denote the dtype is unknown. */
+  tvm::PrimType dtype{DLDataType{kDLOpaqueHandle, 0, 0}};
   /*!
    * \brief The number of dimension of the tensor, can be unknown.
    * \sa kUnknownNDim
    */
-  int ndim;
+  int ndim{kUnknownNDim};
 
   /*! \return Whether the type contains unknown ndim. */
   bool IsUnknownNdim() const { return ndim == kUnknownNDim; }
 
   /*! \return Whether the type contains unknown dtype. */
-  bool IsUnknownDtype() const { return dtype.is_void(); }
+  bool IsUnknownDtype() const { return dtype->dtype == DLDataType{kDLOpaqueHandle, 0, 0}; }
 
   /*! \return Shape if it is known. */
   ffi::Optional<ffi::Array<PrimExpr>> GetShape() const {
@@ -230,7 +230,7 @@ class TensorType : public Type {
    *
    * \note shape must already be normalized.
    */
-  TVM_DLL TensorType(Expr shape, DataType dtype, ffi::Optional<VDevice> vdevice = std::nullopt,
+  TVM_DLL TensorType(Expr shape, tvm::PrimType dtype, ffi::Optional<VDevice> vdevice = std::nullopt,
                      Span span = Span());
 
   /*!
@@ -240,7 +240,7 @@ class TensorType : public Type {
    * \param vdevice The virtual device.
    * \param span The span of the AST.
    */
-  TVM_DLL TensorType(DataType dtype, int ndim, ffi::Optional<VDevice> vdevice = std::nullopt,
+  TVM_DLL TensorType(tvm::PrimType dtype, int ndim, ffi::Optional<VDevice> vdevice = std::nullopt,
                      Span span = Span());
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NOTNULLABLE(TensorType, Type, TensorTypeNode);
diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h
deleted file mode 100644
index 9f230cac824e..000000000000
--- a/include/tvm/runtime/data_type.h
+++ /dev/null
@@ -1,522 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
- * \file tvm/runtime/data_type.h
- * \brief Primitive runtime data type.
- */
-// Acknowledgement: DataType structure design originates from Halide.
-#ifndef TVM_RUNTIME_DATA_TYPE_H_
-#define TVM_RUNTIME_DATA_TYPE_H_
-
-#include <tvm/ffi/container/shape.h>
-#include <tvm/ffi/dtype.h>
-#include <tvm/ffi/error.h>
-#include <tvm/runtime/base.h>
-
-#include <cstring>
-#include <string>
-#include <type_traits>
-
-namespace tvm {
-namespace runtime {
-
-/*!
- * \brief Runtime primitive data type.
- *
- *  This class is a thin wrapper of DLDataType.
- *  We also make use of DataType in compiler to store quick hint
- */
-class DataType {
- public:
-  /*!
-   * \brief Type code for the DataType.
-   *
-   * DLPack consistency:
-   * 1) kInt is consistent with kDLInt
-   * 2) kUInt is consistent with kDLUInt
-   * 3) kFloat is consistent with kDLFloat
-   */
-  enum TypeCode {
-    kInt = kDLInt,
-    kUInt = kDLUInt,
-    kFloat = kDLFloat,
-    kHandle = kDLOpaqueHandle,
-    kBFloat = kDLBfloat,
-    kBool = kDLBool,
-    kFloat8_e3m4 = kDLFloat8_e3m4,
-    kFloat8_e4m3 = kDLFloat8_e4m3,
-    kFloat8_e4m3b11fnuz = kDLFloat8_e4m3b11fnuz,
-    kFloat8_e4m3fn = kDLFloat8_e4m3fn,
-    kFloat8_e4m3fnuz = kDLFloat8_e4m3fnuz,
-    kFloat8_e5m2 = kDLFloat8_e5m2,
-    kFloat8_e5m2fnuz = kDLFloat8_e5m2fnuz,
-    kFloat8_e8m0fnu = kDLFloat8_e8m0fnu,
-    kFloat6_e2m3fn = kDLFloat6_e2m3fn,
-    kFloat6_e3m2fn = kDLFloat6_e3m2fn,
-    kFloat4_e2m1fn = kDLFloat4_e2m1fn,
-    kCustomBegin = 129
-  };
-  /*! \brief default constructor */
-  DataType() { data_ = DataType::Void(); }
-  /*!
-   * \brief Constructor
-   * \param dtype The DLDataType
-   */
-  explicit DataType(DLDataType dtype) : data_(dtype) {}
-  /*!
-   * \brief Constructor
-   * \param code The type code.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes.
-   * \param is_scalable Whether the data type is scalable.
-   */
-  DataType(int code, int bits, int lanes, bool is_scalable = false) {
-    data_.code = static_cast<uint8_t>(code);
-    data_.bits = static_cast<uint8_t>(bits);
-    if (is_scalable) {
-      TVM_FFI_ICHECK(lanes > 1) << "Invalid value for vscale factor" << lanes;
-    }
-    data_.lanes = is_scalable ? static_cast<uint16_t>(-lanes) : static_cast<uint16_t>(lanes);
-    if (code == kBFloat) {
-      TVM_FFI_ICHECK_EQ(bits, 16);
-    }
-    if (code == kFloat8_e3m4 || code == kFloat8_e4m3 || code == kFloat8_e4m3b11fnuz ||
-        code == kFloat8_e4m3fn || code == kFloat8_e4m3fnuz || code == kFloat8_e5m2 ||
-        code == kFloat8_e5m2fnuz || code == kFloat8_e8m0fnu) {
-      TVM_FFI_ICHECK_EQ(bits, 8);
-    }
-    if (code == kFloat6_e2m3fn || code == kFloat6_e3m2fn) {
-      TVM_FFI_ICHECK_EQ(bits, 6);
-    }
-    if (code == kFloat4_e2m1fn) {
-      TVM_FFI_ICHECK_EQ(bits, 4);
-    }
-  }
-  /*! \return The type code. */
-  int code() const { return static_cast<int>(data_.code); }
-  /*! \return number of bits in the data. */
-  int bits() const { return static_cast<int>(data_.bits); }
-  /*! \return number of bytes to store each scalar. */
-  int bytes() const { return (bits() + 7) / 8; }
-  /*! \return number of lanes in the data. */
-  int lanes() const {
-    int lanes_as_int = static_cast<int16_t>(data_.lanes);
-    if (lanes_as_int < 0) {
-      TVM_FFI_THROW(InternalError)
-          << "Can't fetch the lanes of a scalable vector at a compile time.";
-    }
-    return lanes_as_int;
-  }
-  /*! \return the integer multiplier of vscale in a scalable vector. */
-  int vscale_factor() const {
-    int lanes_as_int = static_cast<int16_t>(data_.lanes);
-    if (lanes_as_int >= -1) {
-      TVM_FFI_THROW(InternalError) << "A fixed length vector doesn't have a vscale factor.";
-    }
-    return -lanes_as_int;
-  }
-  /*! \return get vscale factor or lanes depending on scalability of the vector. */
-  int get_lanes_or_vscale_factor() const {
-    return is_scalable_vector() ? vscale_factor() : lanes();
-  }
-  /*! \return whether type is a scalar type. */
-  bool is_scalar() const { return !is_scalable_vector() && lanes() == 1; }
-  /*! \return whether type is a bool type. */
-  bool is_bool() const { return code() == DataType::kBool; }
-  /*! \return whether type can be used in a predicate expression. */
-  bool is_predicate_dtype() const { return is_bool() || (is_uint() && bits() == 1); }
-  /*! \return whether type is a float type. */
-  bool is_float() const { return code() == DataType::kFloat; }
-  /*! \return whether type is a bfloat type. */
-  bool is_bfloat() const { return code() == DataType::kBFloat; }
-  /*! \return whether type is any 8-bit custom Float8 variant. */
-  bool is_float8() const {
-    return bits() == 8 &&
-           (code() == DataType::kFloat8_e3m4 || code() == DataType::kFloat8_e4m3 ||
-            code() == DataType::kFloat8_e4m3b11fnuz || code() == DataType::kFloat8_e4m3fn ||
-            code() == DataType::kFloat8_e4m3fnuz || code() == DataType::kFloat8_e5m2 ||
-            code() == DataType::kFloat8_e5m2fnuz || code() == DataType::kFloat8_e8m0fnu);
-  }
-  /*! \return whether type is any 6-bit custom Float6 variant. */
-  bool is_float6() const {
-    return bits() == 6 &&
-           (code() == DataType::kFloat6_e2m3fn || code() == DataType::kFloat6_e3m2fn);
-  }
-  /*! \return whether type is the 4-bit custom Float4_e2m1fn variant. */
-  bool is_float4() const { return bits() == 4 && code() == DataType::kFloat4_e2m1fn; }
-  /*! \return whether type is Float8E3M4. */
-  bool is_float8_e3m4() const { return bits() == 8 && code() == DataType::kFloat8_e3m4; }
-  /*! \return whether type is Float8E4M3. */
-  bool is_float8_e4m3() const { return bits() == 8 && code() == DataType::kFloat8_e4m3; }
-  /*! \return whether type is Float8E4M3B11FNUZ. */
-  bool is_float8_e4m3b11fnuz() const {
-    return bits() == 8 && code() == DataType::kFloat8_e4m3b11fnuz;
-  }
-  /*! \return whether type is Float8E4M3FN. */
-  bool is_float8_e4m3fn() const { return bits() == 8 && code() == DataType::kFloat8_e4m3fn; }
-  /*! \return whether type is Float8E4M3FNUZ. */
-  bool is_float8_e4m3fnuz() const { return bits() == 8 && code() == DataType::kFloat8_e4m3fnuz; }
-  /*! \return whether type is Float8E5M2. */
-  bool is_float8_e5m2() const { return bits() == 8 && code() == DataType::kFloat8_e5m2; }
-  /*! \return whether type is Float8E5M2FNUZ. */
-  bool is_float8_e5m2fnuz() const { return bits() == 8 && code() == DataType::kFloat8_e5m2fnuz; }
-  /*! \return whether type is Float8E8M0FNU. */
-  bool is_float8_e8m0fnu() const { return bits() == 8 && code() == DataType::kFloat8_e8m0fnu; }
-  /*! \return whether type is Float6E2M3FN. */
-  bool is_float6_e2m3fn() const { return bits() == 6 && code() == DataType::kFloat6_e2m3fn; }
-  /*! \return whether type is Float6E3M2FN. */
-  bool is_float6_e3m2fn() const { return bits() == 6 && code() == DataType::kFloat6_e3m2fn; }
-  /*! \return whether type is Float4E2M1FN. */
-  bool is_float4_e2m1fn() const { return bits() == 4 && code() == DataType::kFloat4_e2m1fn; }
-  /*! \return whether type is a float16 type. */
-  bool is_float16() const { return is_float() && bits() == 16; }
-  /*! \return whether type is a bfloat16 type. */
-  bool is_bfloat16() const { return code() == DataType::kBFloat && bits() == 16; }
-  /*! \return whether type is an int type. */
-  bool is_int() const { return code() == DataType::kInt; }
-  /*! \return whether type is an uint type. */
-  bool is_uint() const { return code() == DataType::kUInt; }
-  /*! \return whether type is a handle type. */
-  bool is_handle() const { return code() == DataType::kHandle && !is_void(); }
-  /*! \return whether type is a vector type. */
-  bool is_scalable_or_fixed_length_vector() const {
-    int encoded_lanes = static_cast<int16_t>(data_.lanes);
-    return (encoded_lanes < -1) || (1 < encoded_lanes);
-  }
-  /*! \return Whether the type is a fixed length vector. */
-  bool is_fixed_length_vector() const { return static_cast<int16_t>(data_.lanes) > 1; }
-  /*! \return Whether the type is a scalable vector. */
-  bool is_scalable_vector() const { return static_cast<int16_t>(data_.lanes) < -1; }
-  /*! \return whether type is a vector type. */
-  bool is_vector() const { return lanes() > 1; }
-  /*! \return whether type is a bool vector type. */
-  bool is_vector_bool() const { return is_scalable_or_fixed_length_vector() && is_bool(); }
-  /*! \return whether type is a Void type. */
-  bool is_void() const {
-    return code() == DataType::kHandle && bits() == 0 && static_cast<int16_t>(data_.lanes) == 0;
-  }
-  /*!
-   * \brief Create a new data type by change lanes to a specified value.
-   * \param lanes The target number of lanes.
-   * \return the result type.
-   */
-  DataType with_lanes(int lanes) const { return DataType(data_.code, data_.bits, lanes); }
-  /*!
-   * \brief Create a new scalable vector data type by changing the vscale multiplier to a specified
-   * value. We'll use the data_.lanes field for this value. \param vscale_factor The vscale
-   * multiplier. \return A copy of the old DataType with the number of scalable lanes.
-   */
-  DataType with_scalable_vscale_factor(int vscale_factor) const {
-    return DataType(data_.code, data_.bits, -vscale_factor);
-  }
-  /*!
-   * \brief Create a new data type by change bits to a specified value.
-   * \param bits The target number of bits.
-   * \return the result type.
-   */
-  DataType with_bits(int bits) const { return DataType(data_.code, bits, data_.lanes); }
-  /*!
-   * \brief Get the scalar version of the type.
-   * \return the result type.
-   */
-  DataType element_of() const { return with_lanes(1); }
-  /*!
-   * \brief Assignment operator.
-   */
-  DataType& operator=(const DataType& rhs) {
-    if (this == &rhs) {
-      return *this;
-    }
-    data_ = rhs.data_;
-    return *this;
-  }
-  /*!
-   * \brief Equal comparator.
-   * \param other The data type to compare against.
-   * \return The comparison result.
-   */
-  bool operator==(const DataType& other) const {
-    return data_.code == other.data_.code && data_.bits == other.data_.bits &&
-           data_.lanes == other.data_.lanes;
-  }
-  /*!
-   * \brief NotEqual comparator.
-   * \param other The data type to compare against.
-   * \return The comparison result.
-   */
-  bool operator!=(const DataType& other) const { return !operator==(other); }
-  /*!
-   * \brief Converter to DLDataType
-   * \return the result.
-   */
-  operator DLDataType() const { return data_; }
-
-  /*!
-   * \brief Construct an int type.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes.
-   * \return The constructed data type.
-   */
-  static DataType Int(int bits, int lanes = 1) { return DataType(kDLInt, bits, lanes); }
-  /*!
-   * \brief Construct an uint type.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes.
-   * \param is_scalable Whether the data type is scalable.
-   * \return The constructed data type.
-   */
-  static DataType UInt(int bits, int lanes = 1, bool is_scalable = false) {
-    return DataType(kDLUInt, bits, lanes, is_scalable);
-  }
-  /*!
-   * \brief Construct an float type.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float(int bits, int lanes = 1) { return DataType(kDLFloat, bits, lanes); }
-  /*!
-   * \brief Construct an bfloat type.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType BFloat(int bits, int lanes = 1) { return DataType(kDLBfloat, bits, lanes); }
-  /*!
-   * \brief Construct float8 e3m4 datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E3M4(int lanes = 1) { return DataType(kFloat8_e3m4, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e4m3 datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E4M3(int lanes = 1) { return DataType(kFloat8_e4m3, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e4m3b11fnuz datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E4M3B11FNUZ(int lanes = 1) {
-    return DataType(kFloat8_e4m3b11fnuz, 8, lanes);
-  }
-
-  /*!
-   * \brief Construct float8 e4m3fn datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E4M3FN(int lanes = 1) { return DataType(kFloat8_e4m3fn, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e4m3fnuz datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E4M3FNUZ(int lanes = 1) { return DataType(kFloat8_e4m3fnuz, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e5m2 datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E5M2(int lanes = 1) { return DataType(kFloat8_e5m2, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e5m2fnuz datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E5M2FNUZ(int lanes = 1) { return DataType(kFloat8_e5m2fnuz, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e8m0fnu datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E8M0FNU(int lanes = 1) { return DataType(kFloat8_e8m0fnu, 8, lanes); }
-
-  /*!
-   * \brief Construct float6 e2m3fn datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float6E2M3FN(int lanes = 1) { return DataType(kFloat6_e2m3fn, 6, lanes); }
-
-  /*!
-   * \brief Construct float6 e3m2fn datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float6E3M2FN(int lanes = 1) { return DataType(kFloat6_e3m2fn, 6, lanes); }
-
-  /*!
-   * \brief Construct float4 e2m1fn datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float4E2M1FN(int lanes = 1) { return DataType(kFloat4_e2m1fn, 4, lanes); }
-  /*!
-   * \brief Construct a bool type.
-   * \param lanes The number of lanes.
-   * \param is_scalable Whether the data type is scalable.
-   * \return The constructed data type.
-   */
-  static DataType Bool(int lanes = 1, bool is_scalable = false) {
-    return DataType(kDLBool, 8, lanes, is_scalable);
-  }
-  /*!
-   * \brief Construct a handle type.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Handle(int bits = 64, int lanes = 1) { return DataType(kHandle, bits, lanes); }
-  /*!
-   * \brief Construct a Void type.
-   * \return The constructed data type.
-   */
-  static DataType Void() { return DataType(kHandle, 0, 0); }
-  /*!
-   * \brief Get the corresponding type of TVMShapeIndex.
-   * \return The type of TVM shape index.
-   */
-  static DataType ShapeIndex() {
-    if (std::is_signed<ffi::Shape::index_type>::value) {
-      return DataType::Int(sizeof(ffi::Shape::index_type) * 8);
-    } else {
-      return DataType::UInt(sizeof(ffi::Shape::index_type) * 8);
-    }
-  }
-
- private:
-  DLDataType data_;
-};
-
-/*!
- * \brief Get the number of bytes needed in a vector.
- * \param dtype The data type.
- * \return Number of bytes needed.
- */
-inline int GetVectorBytes(DataType dtype) {
-  int data_bits = dtype.bits() * dtype.lanes();
-  // allow bool to exist
-  if (dtype == DataType::Bool() || dtype == DataType::Int(4) || dtype == DataType::UInt(4) ||
-      dtype == DataType::Int(1) || dtype == DataType::Float4E2M1FN() ||
-      dtype == DataType::Float6E2M3FN() || dtype == DataType::Float6E3M2FN()) {
-    return 1;
-  }
-  TVM_FFI_ICHECK_EQ(data_bits % 8, 0U) << "Need to load/store by multiple of bytes";
-  return data_bits / 8;
-}
-
-/*!
- * \brief Check whether type matches the given spec.
- * \param t The type
- * \param code The type code.
- * \param bits The number of bits to be matched.
- * \param lanes The number of lanes in the type.
- */
-inline bool TypeMatch(DLDataType t, int code, int bits, int lanes = 1) {
-  return t.code == code && t.bits == bits && t.lanes == lanes;
-}
-/*!
- * \brief Check whether two types are equal .
- * \param lhs The left operand.
- * \param rhs The right operand.
- */
-inline bool TypeEqual(DLDataType lhs, DLDataType rhs) {
-  return lhs.code == rhs.code && lhs.bits == rhs.bits && lhs.lanes == rhs.lanes;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const DataType& dtype) {  // NOLINT(*)
-  return os << dtype.operator DLDataType();
-}
-}  // namespace runtime
-
-using DataType = runtime::DataType;
-
-namespace ffi {
-
-// runtime::DataType
-template <>
-struct TypeTraits<runtime::DataType> : public TypeTraitsBase {
-  static constexpr int32_t field_static_type_index = TypeIndex::kTVMFFIDataType;
-
-  TVM_FFI_INLINE static void CopyToAnyView(const runtime::DataType& src, TVMFFIAny* result) {
-    // clear padding part to ensure the equality check can always check the v_uint64 part
-    result->v_uint64 = 0;
-    result->zero_padding = 0;
-    result->type_index = TypeIndex::kTVMFFIDataType;
-    result->v_dtype = src;
-  }
-
-  TVM_FFI_INLINE static void MoveToAny(runtime::DataType src, TVMFFIAny* result) {
-    // clear padding part to ensure the equality check can always check the v_uint64 part
-    result->v_uint64 = 0;
-    result->zero_padding = 0;
-    result->type_index = TypeIndex::kTVMFFIDataType;
-    result->v_dtype = src;
-  }
-
-  TVM_FFI_INLINE static std::optional<runtime::DataType> TryCastFromAnyView(const TVMFFIAny* src) {
-    auto opt_dtype = TypeTraits<DLDataType>::TryCastFromAnyView(src);
-    if (opt_dtype) {
-      return runtime::DataType(opt_dtype.value());
-    }
-    return std::nullopt;
-  }
-
-  TVM_FFI_INLINE static bool CheckAnyStrict(const TVMFFIAny* src) {
-    return TypeTraits<DLDataType>::CheckAnyStrict(src);
-  }
-
-  TVM_FFI_INLINE static runtime::DataType CopyFromAnyViewAfterCheck(const TVMFFIAny* src) {
-    return runtime::DataType(TypeTraits<DLDataType>::CopyFromAnyViewAfterCheck(src));
-  }
-
-  TVM_FFI_INLINE static std::string TypeStr() { return ffi::StaticTypeKey::kTVMFFIDataType; }
-
-  TVM_FFI_INLINE static std::string TypeSchema() {
-    return R"({"type":")" + std::string(ffi::StaticTypeKey::kTVMFFIDataType) + R"("})";
-  }
-};
-
-}  // namespace ffi
-}  // namespace tvm
-
-namespace std {
-template <>
-struct hash<tvm::DataType> {
-  inline int cantor_pairing_function(int a, int b) const { return (a + b) * (a + b + 1) / 2 + b; }
-  std::size_t operator()(tvm::DataType const& dtype) const {
-    int a = dtype.code();
-    int b = dtype.bits();
-    int c = dtype.lanes();
-    int d = cantor_pairing_function(a, b);
-    return cantor_pairing_function(c, d);
-  }
-};
-}  // namespace std
-
-#endif  //  TVM_RUNTIME_DATA_TYPE_H_
diff --git a/include/tvm/runtime/disco/builtin.h b/include/tvm/runtime/disco/builtin.h
index a9487c866acc..9d66a09507c5 100644
--- a/include/tvm/runtime/disco/builtin.h
+++ b/include/tvm/runtime/disco/builtin.h
@@ -19,8 +19,8 @@
 #ifndef TVM_RUNTIME_DISCO_BUILTIN_H_
 #define TVM_RUNTIME_DISCO_BUILTIN_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/module.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/tensor.h>
 
 #include <string>
@@ -70,7 +70,7 @@ TVM_RUNTIME_DLL ffi::Module LoadVMModule(std::string path, ffi::Optional<Device>
  * \param device The device the Tensor is created on. If None, use the thread local default device
  * \return The Tensor created
  */
-TVM_RUNTIME_DLL Tensor DiscoEmptyTensor(ffi::Shape shape, DataType dtype,
+TVM_RUNTIME_DLL Tensor DiscoEmptyTensor(ffi::Shape shape, DLDataType dtype,
                                         ffi::Optional<Device> device);
 /*!
  * \brief Perform an allreduce operation using the underlying communication library
diff --git a/include/tvm/runtime/tensor.h b/include/tvm/runtime/tensor.h
index d3497c8ff78f..cb93c4abd741 100644
--- a/include/tvm/runtime/tensor.h
+++ b/include/tvm/runtime/tensor.h
@@ -26,10 +26,10 @@
 
 #include <tvm/ffi/container/shape.h>
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/optional.h>
 #include <tvm/ffi/string.h>
 #include <tvm/runtime/base.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/support/io.h>
 #include <tvm/support/serializer.h>
@@ -59,7 +59,7 @@ class Tensor : public tvm::ffi::Tensor {
   Tensor(const ffi::Tensor& other) : tvm::ffi::Tensor(other) {}        // NOLINT(*)
 
   ffi::ShapeView Shape() const { return this->shape(); }
-  runtime::DataType DataType() const { return runtime::DataType(this->dtype()); }
+  DLDataType DataType() const { return this->dtype(); }
 
   // DLPack handling
   static Tensor FromDLPack(DLManagedTensor* tensor) {
diff --git a/include/tvm/runtime/vm/bytecode.h b/include/tvm/runtime/vm/bytecode.h
index 0f1927e0cbcb..ea246da5d354 100644
--- a/include/tvm/runtime/vm/bytecode.h
+++ b/include/tvm/runtime/vm/bytecode.h
@@ -24,8 +24,8 @@
 #ifndef TVM_RUNTIME_VM_BYTECODE_H_
 #define TVM_RUNTIME_VM_BYTECODE_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
-#include <tvm/runtime/data_type.h>
 
 #include <iostream>
 #include <vector>
diff --git a/include/tvm/runtime/vm/tensor_cache_support.h b/include/tvm/runtime/vm/tensor_cache_support.h
index ea997f0755bd..b112043c376f 100644
--- a/include/tvm/runtime/vm/tensor_cache_support.h
+++ b/include/tvm/runtime/vm/tensor_cache_support.h
@@ -54,7 +54,7 @@ struct TensorCacheMetadata {
       /*! \brief Shape of the parameter */
       ffi::Shape shape;
       /*! \brief Data type of the parameter */
-      DataType dtype;
+      DLDataType dtype{kDLOpaqueHandle, 0, 0};  // defaults to void
       /*! \brief Format of the parameter */
       std::string format;
       /*! \brief Number of bytes */
diff --git a/include/tvm/s_tir/data_layout.h b/include/tvm/s_tir/data_layout.h
index 48836c5a53d5..ee6d51832dba 100644
--- a/include/tvm/s_tir/data_layout.h
+++ b/include/tvm/s_tir/data_layout.h
@@ -140,10 +140,10 @@ class SLayout : public ffi::ObjectRef {
    *        the corresponding lower case with factor size
    *        indicates the split dimension.
    *        return undefined layout if "__undef__" is passed.
-   * \param dtype The dtype of generated axes vars in the returned layout.
+   * \param index_ty The type of generated axes vars in the returned layout.
    *        It is required to be integer type.
    */
-  TVM_DLL SLayout(const std::string& name, DataType dtype = DataType::Int(32));  // NOLINT(*)
+  TVM_DLL SLayout(const std::string& name, PrimType index_ty = PrimType::Int(32));  // NOLINT(*)
 
   /*!
    * \brief access the internal node container
diff --git a/include/tvm/s_tir/meta_schedule/arg_info.h b/include/tvm/s_tir/meta_schedule/arg_info.h
index 463e73b0e246..a346a73dd441 100644
--- a/include/tvm/s_tir/meta_schedule/arg_info.h
+++ b/include/tvm/s_tir/meta_schedule/arg_info.h
@@ -20,9 +20,9 @@
 #define TVM_S_TIR_META_SCHEDULE_ARG_INFO_H_
 
 #include <tvm/ffi/container/shape.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/module.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/tirx/function.h>
 
 namespace tvm {
@@ -77,7 +77,7 @@ class ArgInfo : public ffi::ObjectRef {
 class TensorInfoNode : public ArgInfoNode {
  public:
   /*! \brief The data type of the tensor. */
-  runtime::DataType dtype;
+  DLDataType dtype{kDLOpaqueHandle, 0, 0};  // defaults to void
   /*! \brief The shape of the tensor. */
   ffi::Shape shape;
 
@@ -104,7 +104,7 @@ class TensorInfo : public ArgInfo {
    * \param dtype The data type of the tensor argument.
    * \param shape The shape tuple of the tensor argument.
    */
-  TVM_DLL explicit TensorInfo(runtime::DataType dtype, ffi::Shape shape);
+  TVM_DLL explicit TensorInfo(DLDataType dtype, ffi::Shape shape);
   /*!
    * \brief Parse the argument information from a JSON object.
    * \param json_obj The json object to parse.
diff --git a/include/tvm/script/printer/config.h b/include/tvm/script/printer/config.h
index beea4042470c..e0ed32d38094 100644
--- a/include/tvm/script/printer/config.h
+++ b/include/tvm/script/printer/config.h
@@ -30,10 +30,11 @@
 #include <tvm/ffi/any.h>
 #include <tvm/ffi/container/array.h>
 #include <tvm/ffi/container/map.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/access_path.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ffi/string.h>
-#include <tvm/runtime/data_type.h>
+#include <tvm/runtime/base.h>
 
 #include <string>
 
@@ -53,15 +54,15 @@ class PrinterConfigNode : public ffi::Object {
    */
   ffi::String module_alias = "cls";
   /*! \brief Default buffer dtype */
-  DataType buffer_dtype = DataType::Float(32);
+  DLDataType buffer_dtype = DLDataType{kDLFloat, 32, 1};
   /*! \brief Default data type of integer literals */
-  DataType int_dtype = DataType::Int(32);
+  DLDataType int_dtype = DLDataType{kDLInt, 32, 1};
   /*!
    * \brief Default data type of float literals. Right now we always print out the explicit type
    * of floating point values, so setting it to Void means we do not print without the
    * T.float32/T.float64 wrapper.
    */
-  DataType float_dtype = DataType::Void();
+  DLDataType float_dtype = DLDataType{kDLOpaqueHandle, 0, 0};
   /*! \brief Whether or not to verbose print expressions. */
   bool verbose_expr = false;
   /*! \brief Number of spaces used for indentation*/
diff --git a/include/tvm/script/printer/doc.h b/include/tvm/script/printer/doc.h
index 2389c1b50d15..bc90e5365734 100644
--- a/include/tvm/script/printer/doc.h
+++ b/include/tvm/script/printer/doc.h
@@ -19,10 +19,11 @@
 #ifndef TVM_SCRIPT_PRINTER_DOC_H_
 #define TVM_SCRIPT_PRINTER_DOC_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/access_path.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/expr.h>
-#include <tvm/runtime/data_type.h>
+#include <tvm/ir/type.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/script/printer/config.h>
 
@@ -293,7 +294,7 @@ class LiteralDoc : public ExprDoc {
    * \param p The object path
    */
   static LiteralDoc Float(double v, const ffi::Optional<AccessPath>& p) {
-    return LiteralDoc(FloatImm(DataType::Float(64), v), p);
+    return LiteralDoc(FloatImm(PrimType::Float(64), v), p);
   }
   /*!
    * \brief Create a LiteralDoc to represent string.
@@ -308,8 +309,9 @@ class LiteralDoc : public ExprDoc {
    * \param v The string value.
    * \param p The object path
    */
-  static LiteralDoc DataType(const runtime::DataType& v, const ffi::Optional<AccessPath>& p) {
-    std::string dtype = v.is_void() ? "void" : ffi::DLDataTypeToString(v);
+  static LiteralDoc DataType(DLDataType v, const ffi::Optional<AccessPath>& p) {
+    std::string dtype =
+        v == DLDataType{kDLOpaqueHandle, 0, 0} ? "void" : ffi::DLDataTypeToString(v);
     return LiteralDoc::Str(dtype, p);
   }
   /*!
diff --git a/include/tvm/script/printer/ir_docsifier.h b/include/tvm/script/printer/ir_docsifier.h
index 98249c6f30bd..e9c82265ff27 100644
--- a/include/tvm/script/printer/ir_docsifier.h
+++ b/include/tvm/script/printer/ir_docsifier.h
@@ -333,7 +333,7 @@ inline TDoc IRDocsifierNode::AsDoc(const Any& value, const AccessPath& path) con
       return LiteralDoc::Str(string_value, path).as_or_throw<TDoc>();
     }
     case ffi::TypeIndex::kTVMFFIDataType:
-      return LiteralDoc::DataType(value.as<runtime::DataType>().value(), path).as_or_throw<TDoc>();
+      return LiteralDoc::DataType(value.as<DLDataType>().value(), path).as_or_throw<TDoc>();
     case ffi::TypeIndex::kTVMFFIDevice:
       return LiteralDoc::Device(value.as<DLDevice>().value(), path).as_or_throw<TDoc>();
     default: {
diff --git a/include/tvm/te/operation.h b/include/tvm/te/operation.h
index c9d35a77fe99..ba5267a8ce85 100644
--- a/include/tvm/te/operation.h
+++ b/include/tvm/te/operation.h
@@ -34,6 +34,7 @@
 
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 namespace tvm {
@@ -67,11 +68,11 @@ class TVM_DLL OperationNode : public ffi::Object {
   /*! \return number of outputs */
   virtual int num_outputs() const = 0;
   /*!
-   * \brief Get data type. i-th output tensor.
+   * \brief Get the primitive element type of the i-th output tensor.
    * \param i The output index.
-   * \return type of i-th output.
+   * \return primitive element type of i-th output.
    */
-  virtual DataType output_dtype(size_t i) const = 0;
+  virtual PrimType output_dtype(size_t i) const = 0;
   /*!
    * \brief Get shape of i-th output tensor.
    * \param i The output index.
@@ -101,11 +102,11 @@ class PlaceholderOpNode : public OperationNode {
  public:
   /*! \brief The shape of the input */
   ffi::Array<PrimExpr> shape;
-  /*! \brief The data type of the input. */
-  DataType dtype;
+  /*! \brief The dtype of the input. */
+  PrimType dtype{DLDataType{kDLOpaqueHandle, 0, 0}};
   // override behavior.
   int num_outputs() const final;
-  DataType output_dtype(size_t i) const final;
+  PrimType output_dtype(size_t i) const final;
   ffi::Array<PrimExpr> output_shape(size_t i) const final;
   ffi::Array<Tensor> InputTensors() const final;
 
@@ -124,7 +125,9 @@ class PlaceholderOpNode : public OperationNode {
  */
 class PlaceholderOp : public Operation {
  public:
-  TVM_DLL PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, DataType dtype);
+  TVM_DLL PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, PrimType dtype);
+  PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, DLDataType dtype)
+      : PlaceholderOp(std::move(name), std::move(shape), PrimType(dtype)) {}
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(PlaceholderOp, Operation, PlaceholderOpNode);
 };
@@ -162,7 +165,7 @@ class TVM_DLL ComputeOpNode : public BaseComputeOpNode {
   ComputeOpNode() {}
   // override functions
   int num_outputs() const final;
-  DataType output_dtype(size_t i) const final;
+  PrimType output_dtype(size_t i) const final;
   ffi::Array<Tensor> InputTensors() const final;
 
   static void RegisterReflection() {
@@ -217,7 +220,7 @@ class ScanOpNode : public OperationNode {
   ScanOpNode() {}
   // override behavior.
   int num_outputs() const final;
-  DataType output_dtype(size_t i) const final;
+  PrimType output_dtype(size_t i) const final;
   ffi::Array<PrimExpr> output_shape(size_t i) const final;
   ffi::Array<Tensor> InputTensors() const final;
 
@@ -266,7 +269,7 @@ class ExternOpNode : public OperationNode {
   ExternOpNode() {}
   // override functions
   int num_outputs() const final;
-  DataType output_dtype(size_t i) const final;
+  PrimType output_dtype(size_t i) const final;
   ffi::Array<PrimExpr> output_shape(size_t i) const final;
   ffi::Array<Tensor> InputTensors() const final;
 
@@ -299,7 +302,7 @@ class ExternOp : public Operation {
  * \param name_hint The name hint for the expression
  * \param t The type of the expression
  */
-TVM_DLL Var var(std::string name_hint, DataType t = DataType::Int(32));
+TVM_DLL Var var(std::string name_hint, PrimType t = PrimType::Int(32));
 
 /*!
  * \brief Create a new IterVar that represents an axis in thread.
@@ -329,9 +332,14 @@ using FBatchCompute = std::function<ffi::Array<PrimExpr>(const ffi::Array<Var>&
  * \param dtype the data type of the tensor.
  * \param name The name of the Tensor.
  */
-TVM_DLL Tensor placeholder(ffi::Array<PrimExpr> shape, DataType dtype = DataType::Float(32),
+TVM_DLL Tensor placeholder(ffi::Array<PrimExpr> shape, PrimType dtype = PrimType::Float(32),
                            std::string name = "placeholder");
 
+inline Tensor placeholder(ffi::Array<PrimExpr> shape, DLDataType dtype,
+                          std::string name = "placeholder") {
+  return placeholder(std::move(shape), PrimType(dtype), std::move(name));
+}
+
 /*!
  * \brief Construct a new tensor by computing over shape,
  *  using the computation rule: result_tensor[axis] = fcompute(axis)
diff --git a/include/tvm/te/tensor.h b/include/tvm/te/tensor.h
index ed07a35fb2da..760d308623f8 100644
--- a/include/tvm/te/tensor.h
+++ b/include/tvm/te/tensor.h
@@ -71,8 +71,8 @@ class TensorNode : public DataProducerNode {
  public:
   /*! \brief The shape of the tensor */
   ffi::Array<PrimExpr> shape;
-  /*! \brief data type in the content of the tensor */
-  DataType dtype;
+  /*! \brief dtype in the content of the tensor */
+  PrimType dtype{DLDataType{kDLOpaqueHandle, 0, 0}};
   /*! \brief the source operation, can be None */
   Operation op;
   /*! \brief the output index from source operation */
@@ -82,7 +82,7 @@ class TensorNode : public DataProducerNode {
 
   ffi::Array<PrimExpr> GetShape() const final { return shape; }
 
-  DataType GetDataType() const final { return dtype; }
+  PrimType GetDataType() const final { return dtype; }
 
   TVM_DLL PrimExpr ToPrimExpr() const final;
 
@@ -108,7 +108,9 @@ class Tensor : public DataProducer {
   inline PrimExpr IndexTensor(ffi::Array<PrimExpr> indices, bool support_negative_indices) const;
 
  public:
-  TVM_DLL Tensor(ffi::Array<PrimExpr> shape, DataType dtype, Operation op, int value_index);
+  TVM_DLL Tensor(ffi::Array<PrimExpr> shape, PrimType dtype, Operation op, int value_index);
+  Tensor(ffi::Array<PrimExpr> shape, DLDataType dtype, Operation op, int value_index)
+      : Tensor(std::move(shape), PrimType(dtype), std::move(op), value_index) {}
   /*!
    * \brief check if two tensors equals each other.
    * \param other tensor to be checked.
diff --git a/include/tvm/tirx/buffer.h b/include/tvm/tirx/buffer.h
index 1456787d688b..71d4c974dbb8 100644
--- a/include/tvm/tirx/buffer.h
+++ b/include/tvm/tirx/buffer.h
@@ -40,11 +40,20 @@ namespace tirx {
 #define TVM_INDEX_DEFAULT_I64 1
 #endif
 /*! \brief if TVM_INDEX_DEFAULT_I64 is set, return int64, otherwise return int32 */
-inline DataType DefaultIndexType() {
+inline PrimType DefaultIndexPrimType() {
 #if TVM_INDEX_DEFAULT_I64
-  return DataType::Int(64);
+  static const PrimType default_index_ty = PrimType::Int(64);
 #else
-  return DataType::Int(32);
+  static const PrimType default_index_ty = PrimType::Int(32);
+#endif
+  return default_index_ty;
+}
+
+inline DLDataType DefaultIndexType() {
+#if TVM_INDEX_DEFAULT_I64
+  return DLDataType{kDLInt, 64, 1};
+#else
+  return DLDataType{kDLInt, 32, 1};
 #endif
 }
 
@@ -67,8 +76,8 @@ class BufferNode : public ffi::Object {
    * \sa data_alignment The alignment of data in bytes.
    */
   Var data;
-  /*! \brief data type in the content of the tensor */
-  DataType dtype;
+  /*! \brief dtype in the content of the tensor */
+  PrimType dtype{DLDataType{kDLOpaqueHandle, 0, 0}};
   /*! \brief The type of the buffer prior to flattening
    *
    * This contains the shape as it is accessed by
@@ -147,10 +156,13 @@ class BufferNode : public ffi::Object {
   }
 
   /*! \return preferred index type for this buffer node */
-  DataType DefaultIndexType() const {
-    return shape.size() != 0 ? shape[0].dtype() : tvm::tirx::DefaultIndexType();
+  DLDataType DefaultIndexType() const {
+    return shape.size() != 0 ? shape[0].ty()->dtype : tvm::tirx::DefaultIndexType();
   }
 
+  /*! \return primitive element type for compiler-side uses. */
+  PrimType ElementType() const { return dtype; }
+
   /*! \brief Determine the offset in the buffer of the given index.
    *
    * Returns the buffer offset, in number of elements of type dtype,
@@ -176,11 +188,19 @@ class Buffer : public ffi::ObjectRef {
  public:
   // User can specify data_alignment and offset_factor to be 0
   // A default value will be picked.
-  TVM_DLL Buffer(Var data, DataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<PrimExpr> strides,
+  TVM_DLL Buffer(Var data, PrimType dtype, ffi::Array<PrimExpr> shape, ffi::Array<PrimExpr> strides,
                  PrimExpr elem_offset, ffi::String name, int data_alignment, int offset_factor,
                  BufferType buffer_type, ffi::Array<IntImm> axis_separators = {},
                  Span span = Span(), ffi::Optional<Layout> layout = std::nullopt,
                  ffi::Array<PrimExpr> allocated_addr = {});
+  Buffer(Var data, DLDataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<PrimExpr> strides,
+         PrimExpr elem_offset, ffi::String name, int data_alignment, int offset_factor,
+         BufferType buffer_type, ffi::Array<IntImm> axis_separators = {}, Span span = Span(),
+         ffi::Optional<Layout> layout = std::nullopt, ffi::Array<PrimExpr> allocated_addr = {})
+      : Buffer(std::move(data), PrimType(dtype), std::move(shape), std::move(strides),
+               std::move(elem_offset), std::move(name), data_alignment, offset_factor, buffer_type,
+               std::move(axis_separators), std::move(span), std::move(layout),
+               std::move(allocated_addr)) {}
 
   /*!
    * \brief Return a new buffer that is equivalent with current one
@@ -205,7 +225,7 @@ class Buffer : public ffi::ObjectRef {
    * \param offset The offset of ptr.
    * \param input_extent The extent of ptr.
    */
-  TVM_DLL PrimExpr access_ptr(int access_mask, DataType ptr_type = DataType::Handle(),
+  TVM_DLL PrimExpr access_ptr(int access_mask, PrimType ptr_type = PrimType::Handle(),
                               int content_lanes = 1, PrimExpr offset = IntImm::Int32(0),
                               ffi::Optional<PrimExpr> input_extent = std::nullopt) const;
   /*!
@@ -215,7 +235,7 @@ class Buffer : public ffi::ObjectRef {
    * \param predicate A vector mask of boolean values indicating which lanes of a vector are to be
    * loaded. The number lanes of the mask must be equal to the number of lanes in being loaded.
    */
-  TVM_DLL PrimExpr vload(ffi::Array<PrimExpr> begin, DataType dtype,
+  TVM_DLL PrimExpr vload(ffi::Array<PrimExpr> begin, PrimType dtype,
                          ffi::Optional<PrimExpr> predicate = std::nullopt) const;
   /*!
    * \brief Create a Stmt that does a vector store at begin index.
@@ -267,7 +287,11 @@ class Buffer : public ffi::ObjectRef {
   /*!
    * \brief Return a new buffer with the dtype.
    */
-  TVM_DLL Buffer with_dtype(DataType dtype) const;
+  TVM_DLL Buffer with_dtype(PrimType dtype) const;
+  Buffer with_dtype(DLDataType dtype) const { return with_dtype(PrimType(dtype)); }
+
+  /*! \return primitive element type for compiler-side uses. */
+  PrimType ElementType() const { return (*this)->ElementType(); }
 
   /*!
    * \brief Return a new buffer with the data.
@@ -289,11 +313,20 @@ class Buffer : public ffi::ObjectRef {
  * \return The created buffer.
  * \sa Buffer for complete constructor.
  */
-TVM_DLL Buffer decl_buffer(ffi::Array<PrimExpr> shape, DataType dtype = DataType::Float(32),
+TVM_DLL Buffer decl_buffer(ffi::Array<PrimExpr> shape,
+                           DLDataType dtype = DLDataType{kDLFloat, 32, 1},
                            ffi::String name = "buffer", ffi::String storage_scope = "",
                            ffi::Optional<ffi::Array<IntImm>> axis_separators = std::nullopt,
                            Span span = Span());
 
+inline Buffer decl_buffer(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String name = "buffer",
+                          ffi::String storage_scope = "",
+                          ffi::Optional<ffi::Array<IntImm>> axis_separators = std::nullopt,
+                          Span span = Span()) {
+  return decl_buffer(std::move(shape), dtype->dtype, std::move(name), std::move(storage_scope),
+                     std::move(axis_separators), std::move(span));
+}
+
 /*!
  * \brief Base node for data producers.
  *
@@ -316,10 +349,10 @@ class DataProducerNode : public PrimExprConvertibleNode {
    */
   virtual ffi::Array<PrimExpr> GetShape() const = 0;
   /*!
-   * \brief Get the data type of the result.
-   * \return The data type.
+   * \brief Get the primitive element type of the result.
+   * \return The primitive element type.
    */
-  virtual DataType GetDataType() const = 0;
+  virtual PrimType GetDataType() const = 0;
   /*!
    * \brief Get the name hint of the data producer.
    * \return The data type.
@@ -350,10 +383,18 @@ class DataProducer : public PrimExprConvertible {
  * \param compact If the statement has already bound to a compact buffer.
  * \param memory_scope memory scope of the buffer
  */
-TVM_DLL tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, DataType dtype,
+TVM_DLL tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, DLDataType dtype,
                                                std::string name, int data_alignment,
                                                int offset_factor, bool compact,
                                                std::string memory_scope = "");
+
+inline tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, PrimType dtype,
+                                              std::string name, int data_alignment,
+                                              int offset_factor, bool compact,
+                                              std::string memory_scope = "") {
+  return BufferWithOffsetAlignment(std::move(shape), dtype->dtype, std::move(name), data_alignment,
+                                   offset_factor, compact, std::move(memory_scope));
+}
 }  // namespace tirx
 }  // namespace tvm
 #endif  // TVM_TIR_BUFFER_H_
diff --git a/include/tvm/tirx/expr.h b/include/tvm/tirx/expr.h
index cd51108b0d23..bf4c9004e84d 100644
--- a/include/tvm/tirx/expr.h
+++ b/include/tvm/tirx/expr.h
@@ -27,13 +27,13 @@
 
 #include <tvm/ffi/container/array.h>
 #include <tvm/ffi/container/map.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/string.h>
 #include <tvm/ir/attrs.h>
 #include <tvm/ir/cow.h>
 #include <tvm/ir/expr.h>
 #include <tvm/ir/node_functor.h>
 #include <tvm/runtime/base.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/tirx/buffer.h>
 #include <tvm/tirx/var.h>
 
@@ -96,7 +96,7 @@ class CastNode : public PrimExprNode {
  */
 class Cast : public PrimExpr {
  public:
-  TVM_DLL Cast(DataType dtype, PrimExpr value, Span span = Span());
+  TVM_DLL Cast(PrimType value_ty, PrimExpr value, Span span = Span());
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Cast, PrimExpr, CastNode);
   TVM_DEFINE_OBJECT_REF_COW_METHOD(CastNode);
 };
@@ -752,9 +752,9 @@ class CallNode : public PrimExprNode {
  */
 class Call : public PrimExpr {
  public:
-  TVM_DLL Call(DataType dtype, RelaxExpr op, ffi::Array<PrimExpr> args, Attrs attrs = Attrs(),
+  TVM_DLL Call(PrimType ret_ty, RelaxExpr op, ffi::Array<PrimExpr> args, Attrs attrs = Attrs(),
                Span span = Span());
-  TVM_DLL Call(DataType dtype, RelaxExpr op, ffi::Array<PrimExpr> args, Span span);
+  TVM_DLL Call(PrimType ret_ty, RelaxExpr op, ffi::Array<PrimExpr> args, Span span);
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Call, PrimExpr, CallNode);
   TVM_DEFINE_OBJECT_REF_COW_METHOD(CallNode);
 };
diff --git a/include/tvm/tirx/op.h b/include/tvm/tirx/op.h
index 416aff73ee29..be827b9ef534 100644
--- a/include/tvm/tirx/op.h
+++ b/include/tvm/tirx/op.h
@@ -39,6 +39,7 @@
 #include <algorithm>
 #include <limits>
 #include <type_traits>
+#include <utility>
 
 namespace tvm {
 
@@ -58,34 +59,36 @@ namespace tvm {
 /*!
  * \brief Get the type of the expression under the unified type system.
  *
- * This function could return a more refined type than
- * the runtime type provided by expr->dtype
+ * This function could return a more refined type than the runtime dtype
+ * implied by PrimExpr::ty().
  *
  * \param expr The input parameter.
  * \return The result type.
  *
- * \sa tvm/ir/type.h for discussion about the relation between Type and runtime::DataType.
+ * \sa tvm/ir/type.h for discussion about the relation between Type and DLPack dtype.
  */
 TVM_DLL Type GetType(const PrimExpr& expr);
 
 /*!
- * \brief Get the type corresponding to DataType
- * \param dtype The data type
+ * \brief Get the type corresponding to a runtime DLPack dtype.
+ * \param dtype The runtime dtype.
  * \return The result type
  *
- * \sa tvm/ir/type.h for discussion about the relation between Type and runtime::DataType.
+ * \sa tvm/ir/type.h for discussion about the relation between Type and DLPack dtype.
  */
-TVM_DLL Type GetTypeFromRuntimeDataType(const DataType& dtype);
+TVM_DLL Type GetTypeFromRuntimeDataType(DLDataType dtype);
 
 /*!
- * \brief Get the implied DataType for storing values with type during runtime.
+ * \brief Get the DLPack dtype implied by the type for storing its values at runtime.
  *
  * \param type The input type.
- * \return The result runtime::DataType.
+ * \return The result DLPack dtype.
  *
- * \sa tvm/ir/type.h for discussion about the relation between Type and runtime::DataType.
+ * \sa tvm/ir/type.h for discussion about the relation between Type and DLPack dtype.
  */
-TVM_DLL runtime::DataType GetRuntimeDataType(const Type& type);
+TVM_DLL DLDataType GetRuntimeDLDataType(const Type& type);
+
+inline DLDataType GetRuntimeDataType(const Type& type) { return GetRuntimeDLDataType(type); }
 
 /*!
  * \brief Return the value.
@@ -120,27 +123,27 @@ TVM_DLL PrimExpr break_loop(Span span = Span());
 
 /*!
  * Query the maximum possible value of dtype.
- * \param dtype The data type.
+ * \param dtype The primitive type.
  * \param span The location of this operation in the source.
  * \return the maximum possible value in this format.
  */
-TVM_DLL PrimExpr max_value(const DataType& dtype, Span span = Span());
+TVM_DLL PrimExpr max_value(PrimType dtype, Span span = Span());
 
 /*!
  * Query the minimum possible value of dtype.
- * \param dtype The data type.
+ * \param dtype The primitive type.
  * \param span The location of this operation in the source.
  * \return the minimum possible value in this format.
  */
-TVM_DLL PrimExpr min_value(const DataType& dtype, Span span = Span());
+TVM_DLL PrimExpr min_value(PrimType dtype, Span span = Span());
 
 /*!
  * Get the value of infinity.
- * \param dtype The data type.
+ * \param dtype The primitive type.
  * \param span The location of this operation in the source.
  * \return the infinity value in this format.
  */
-TVM_DLL PrimExpr infinity(const DataType& dtype, Span span = Span());
+TVM_DLL PrimExpr infinity(PrimType dtype, Span span = Span());
 
 /*!
  * \brief cast value to type.
@@ -151,7 +154,7 @@ TVM_DLL PrimExpr infinity(const DataType& dtype, Span span = Span());
  * \return The result expression.
  * \note This function may return value if the type is the same.
  */
-TVM_DLL PrimExpr cast(const DataType& t, PrimExpr value, Span span = Span());
+TVM_DLL PrimExpr cast(PrimType t, PrimExpr value, Span span = Span());
 /*!
  * \brief perform reinterpret cast value to type.
  *
@@ -161,7 +164,7 @@ TVM_DLL PrimExpr cast(const DataType& t, PrimExpr value, Span span = Span());
  * \return The result expression.
  * \note This function may return value if the type is the same.
  */
-TVM_DLL PrimExpr reinterpret(const DataType& t, PrimExpr value, Span span = Span());
+TVM_DLL PrimExpr reinterpret(PrimType t, PrimExpr value, Span span = Span());
 /*!
  * \brief add operator
  *
@@ -691,13 +694,13 @@ TVM_DLL PrimExpr trunc(PrimExpr x, Span span = Span());
 
 /*!
  * \brief Construct a large uint constant by its low 32 bits and high 32bits.
- * \param dtype The final data type.
+ * \param value_ty The final primitive type.
  * \param low The lower 32 bits.
  * \param high The higher 32 bits.
  * \param span The location of this operation in the source.
  * \return The constructed expression.
  */
-TVM_DLL PrimExpr LargeUIntImm(DataType dtype, int64_t low, int64_t high, Span span = Span());
+TVM_DLL PrimExpr LargeUIntImm(PrimType value_ty, int64_t low, int64_t high, Span span = Span());
 
 /*!
  * \brief Execute a multiplication between two Q-numbers x and y
@@ -731,29 +734,35 @@ TVM_DLL PrimExpr q_multiply_shift(PrimExpr x, PrimExpr y, PrimExpr q, PrimExpr s
  */
 TVM_DLL PrimExpr fast_erf_float_expr(PrimExpr arg, int bits);
 
-inline void CheckMathUnaryOpInputDType(const char* op_name, DataType dtype) {
-  TVM_FFI_CHECK(dtype.is_float() || dtype.is_bfloat16(), TypeError)
+inline void CheckMathUnaryOpInputDType(const char* op_name, const PrimType& dtype) {
+  TVM_FFI_CHECK(dtype.code() == DLDataTypeCode::kDLFloat ||
+                    dtype.MatchesElementType(DLDataTypeCode::kDLBfloat, 16),
+                TypeError)
       << "tirx." << op_name << " only supports floating-point inputs, but got " << dtype;
 }
 
 // Intrinsic operators
-#define TVM_DECLARE_INTRIN_UNARY_WITH_CHECK(OpName, CheckInputDType)         \
-  inline PrimExpr OpName(PrimExpr x, Span span = Span()) {                   \
-    static const Op op = Op::Get("tirx." #OpName);                           \
-    CheckInputDType(#OpName, x.dtype());                                     \
-    if (x.dtype().is_bfloat16()) {                                           \
-      DataType bf16_dtype = x.dtype();                                       \
-      DataType fp32_dtype(kDLFloat, 32, bf16_dtype.lanes());                 \
-      PrimExpr x_fp32 = tirx::Cast(fp32_dtype, {x}, span);                   \
-      PrimExpr result_fp32 = tirx::Call(fp32_dtype, op, {x_fp32}, {}, span); \
-      return tirx::Cast(bf16_dtype, {result_fp32}, span);                    \
-    } else {                                                                 \
-      return tirx::Call(x.dtype(), op, {x}, {}, span);                       \
-    }                                                                        \
+#define TVM_DECLARE_INTRIN_UNARY_WITH_CHECK(OpName, CheckInputDType)                        \
+  inline PrimExpr OpName(PrimExpr x, Span span = Span()) {                                  \
+    static const Op op = Op::Get("tirx." #OpName);                                          \
+    PrimType x_ty = x.ty();                                                                 \
+    CheckInputDType(#OpName, x_ty);                                                         \
+    if (x_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {                           \
+      PrimType bf16_ty = x_ty;                                                              \
+      PrimType f32_ty =                                                                     \
+          x_ty.IsScalableVector()                                                           \
+              ? PrimType::ScalableVector(DLDataTypeCode::kDLFloat, 32, x_ty.VScaleFactor()) \
+              : PrimType::Float(32, x_ty.lanes());                                          \
+      PrimExpr x_fp32 = tirx::Cast(f32_ty, x, span);                                        \
+      PrimExpr result_fp32 = tirx::Call(f32_ty, op, {x_fp32}, {}, span);                    \
+      return tirx::Cast(bf16_ty, result_fp32, span);                                        \
+    } else {                                                                                \
+      return tirx::Call(x_ty, op, {x}, {}, span);                                           \
+    }                                                                                       \
   }
 
 #define TVM_DECLARE_INTRIN_UNARY(OpName) \
-  TVM_DECLARE_INTRIN_UNARY_WITH_CHECK(OpName, [](const char*, DataType) {})
+  TVM_DECLARE_INTRIN_UNARY_WITH_CHECK(OpName, [](const char*, const PrimType&) {})
 
 #define TVM_DECLARE_FLOAT_INTRIN_UNARY(OpName) \
   TVM_DECLARE_INTRIN_UNARY_WITH_CHECK(OpName, CheckMathUnaryOpInputDType)
@@ -787,7 +796,7 @@ TVM_DECLARE_INTRIN_UNARY(clz);
 #define TVM_DECLARE_INTRIN_BINARY(OpName)                              \
   inline PrimExpr OpName(PrimExpr x, PrimExpr y, Span span = Span()) { \
     static const Op op = Op::Get("tirx." #OpName);                     \
-    return tirx::Call(x.dtype(), op, {x, y}, {}, span);                \
+    return tirx::Call(x.ty(), op, {x, y}, {}, span);                   \
   }
 
 TVM_DECLARE_INTRIN_BINARY(atan2);
@@ -804,7 +813,7 @@ namespace tirx {
  * \param element_type The corresponding element type.
  * \return The check results
  */
-inline bool IsPointerType(const Type& type, const DataType& element_type) {
+inline bool IsPointerType(const Type& type, DLDataType element_type) {
   if (!type.defined()) return false;
   if (const auto* ptr_type = type.as<PointerTypeNode>()) {
     if (const auto* prim_type = ptr_type->element_type.as<PrimTypeNode>()) {
@@ -832,7 +841,7 @@ inline bool IsPointerType(const Type& type, const DataType& element_type) {
 template <typename ValueType,
           typename = typename std::enable_if<std::is_standard_layout<ValueType>::value &&
                                              std::is_trivial<ValueType>::value>::type>
-inline PrimExpr MakeConst(DataType dtype, ValueType value, Span span = Span());
+inline PrimExpr MakeConst(PrimType dtype, ValueType value, Span span = Span());
 /*!
  * \brief Make a constant handle value.
  * \param value The integer payload to reinterpret as a handle.
@@ -970,9 +979,12 @@ inline bool is_no_op(const tirx::Stmt& stmt) {
 }
 
 template <typename ValueType>
-inline PrimExpr MakeConstScalar(DataType dtype, ValueType value, Span span = Span()) {
-  if (dtype.is_int() || dtype.is_bool()) return IntImm(dtype, static_cast<int64_t>(value), span);
-  if (dtype.is_uint()) {
+inline PrimExpr MakeConstScalar(PrimType dtype, ValueType value, Span span = Span()) {
+  DLDataTypeCode code = dtype.code();
+  if (code == DLDataTypeCode::kDLInt || code == DLDataTypeCode::kDLBool) {
+    return IntImm(dtype, static_cast<int64_t>(value), span);
+  }
+  if (code == DLDataTypeCode::kDLUInt) {
     // Use IntImm if it is a small integer
     uint64_t uval = static_cast<uint64_t>(value);
     if (value < static_cast<ValueType>(0)) {
@@ -986,8 +998,13 @@ inline PrimExpr MakeConstScalar(DataType dtype, ValueType value, Span span = Spa
       return LargeUIntImm(dtype, static_cast<int64_t>(low), static_cast<int64_t>(high), span);
     }
   }
-  if (dtype.is_float() || dtype.is_bfloat16() || dtype.is_float8() || dtype.is_float6() ||
-      dtype.is_float4()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLFloat, DLDataTypeCode::kDLFloat8_e3m4,
+                        DLDataTypeCode::kDLFloat8_e4m3, DLDataTypeCode::kDLFloat8_e4m3b11fnuz,
+                        DLDataTypeCode::kDLFloat8_e4m3fn, DLDataTypeCode::kDLFloat8_e4m3fnuz,
+                        DLDataTypeCode::kDLFloat8_e5m2, DLDataTypeCode::kDLFloat8_e5m2fnuz,
+                        DLDataTypeCode::kDLFloat8_e8m0fnu, DLDataTypeCode::kDLFloat6_e2m3fn,
+                        DLDataTypeCode::kDLFloat6_e3m2fn, DLDataTypeCode::kDLFloat4_e2m1fn) ||
+      dtype.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     return FloatImm(dtype, static_cast<double>(value), span);
   }
   TVM_FFI_THROW(InternalError) << "cannot make const for type " << dtype;
@@ -995,27 +1012,26 @@ inline PrimExpr MakeConstScalar(DataType dtype, ValueType value, Span span = Spa
 }
 
 template <>
-inline PrimExpr MakeConstScalar(DataType dtype, bool value, Span span) {
+inline PrimExpr MakeConstScalar(PrimType dtype, bool value, Span span) {
   return MakeConstScalar(dtype, static_cast<int>(value), span);
 }
 
 template <typename ValueType, typename>
-inline PrimExpr MakeConst(DataType dtype, ValueType value, Span span) {
-  if (dtype.is_scalar()) {
+inline PrimExpr MakeConst(PrimType dtype, ValueType value, Span span) {
+  if (!dtype.IsScalableVector() && !dtype.IsFixedLengthVector()) {
     return MakeConstScalar(dtype, value, span);
-  } else {
-    if (dtype.is_fixed_length_vector()) {
-      return tirx::Broadcast(MakeConstScalar(dtype.element_of(), value, span), dtype.lanes(), span);
-    } else {
-      PrimExpr lanes = tirx::Mul(tirx::Call(DataType::Int(32), tirx::builtin::vscale(), {}),
-                                 dtype.vscale_factor());
-      return tirx::Broadcast(MakeConstScalar(dtype.element_of(), value, span), lanes, span);
-    }
   }
+  PrimType elem_ty = dtype.WithLanes(1);
+  if (dtype.IsFixedLengthVector()) {
+    return tirx::Broadcast(MakeConstScalar(elem_ty, value, span), dtype.lanes(), span);
+  }
+  PrimExpr lanes =
+      tirx::Mul(tirx::Call(PrimType::Int(32), tirx::builtin::vscale(), {}), dtype.VScaleFactor());
+  return tirx::Broadcast(MakeConstScalar(elem_ty, value, span), lanes, span);
 }
 
 inline PrimExpr ConstHandle(int64_t value, Span span) {
-  return reinterpret(DataType::Handle(), IntImm(DataType::UInt(64), value, span));
+  return reinterpret(PrimType::Handle(), IntImm(PrimType::UInt(64), value, span));
 }
 
 }  // namespace tirx
@@ -1027,17 +1043,13 @@ inline PrimExpr ConstHandle(int64_t value, Span span) {
     return a;                                       \
   }
 
-#define TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(Name)                                   \
-  inline PrimExpr Name(const PrimExpr& a, float b) { return Name(a, PrimExpr(b)); } \
-  inline PrimExpr Name(float a, const PrimExpr& b) { return Name(PrimExpr(a), b); } \
-  inline PrimExpr Name(int a, const PrimExpr& b) {                                  \
-    return Name(tirx::MakeConst(b.dtype(), a), b);                                  \
-  }                                                                                 \
-  inline PrimExpr Name(const PrimExpr& a, int b) {                                  \
-    return Name(a, tirx::MakeConst(a.dtype(), b));                                  \
-  }                                                                                 \
-  inline PrimExpr Name(const PrimExpr& a, double b) {                               \
-    return Name(a, FloatImm(DataType::Float(64), b));                               \
+#define TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(Name)                                                \
+  inline PrimExpr Name(const PrimExpr& a, float b) { return Name(a, PrimExpr(b)); }              \
+  inline PrimExpr Name(float a, const PrimExpr& b) { return Name(PrimExpr(a), b); }              \
+  inline PrimExpr Name(int a, const PrimExpr& b) { return Name(tirx::MakeConst(b.ty(), a), b); } \
+  inline PrimExpr Name(const PrimExpr& a, int b) { return Name(a, tirx::MakeConst(a.ty(), b)); } \
+  inline PrimExpr Name(const PrimExpr& a, double b) {                                            \
+    return Name(a, FloatImm(PrimType::Float(64), b));                                            \
   }
 
 #define TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD_SPANNED(Name)                 \
@@ -1048,13 +1060,13 @@ inline PrimExpr ConstHandle(int64_t value, Span span) {
     return Name(PrimExpr(a), b, span);                                    \
   }                                                                       \
   inline PrimExpr Name(int a, const PrimExpr& b, Span span = Span()) {    \
-    return Name(tirx::MakeConst(b.dtype(), a), b, span);                  \
+    return Name(tirx::MakeConst(b.ty(), a), b, span);                     \
   }                                                                       \
   inline PrimExpr Name(const PrimExpr& a, int b, Span span = Span()) {    \
-    return Name(a, tirx::MakeConst(a.dtype(), b), span);                  \
+    return Name(a, tirx::MakeConst(a.ty(), b), span);                     \
   }                                                                       \
   inline PrimExpr Name(const PrimExpr& a, double b, Span span = Span()) { \
-    return Name(a, FloatImm(DataType::Float(64), b), span);               \
+    return Name(a, FloatImm(PrimType::Float(64), b), span);               \
   }
 
 #define TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(Name)                             \
@@ -1069,18 +1081,16 @@ inline PrimExpr ConstHandle(int64_t value, Span span) {
     return Name(PrimExpr(a), b, span);                                  \
   }
 
-#define TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(Name) \
-  inline PrimExpr Name(const PrimExpr& a, int b) { \
-    return Name(a, tirx::MakeConst(a.dtype(), b)); \
-  }                                                \
-  inline PrimExpr Name(int a, const PrimExpr& b) { return Name(tirx::MakeConst(b.dtype(), a), b); }
+#define TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(Name)                                               \
+  inline PrimExpr Name(const PrimExpr& a, int b) { return Name(a, tirx::MakeConst(a.ty(), b)); } \
+  inline PrimExpr Name(int a, const PrimExpr& b) { return Name(tirx::MakeConst(b.ty(), a), b); }
 
 #define TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD_SPANNED(Name)             \
   inline PrimExpr Name(const PrimExpr& a, int b, Span span = Span()) { \
-    return Name(a, tirx::MakeConst(a.dtype(), b), span);               \
+    return Name(a, tirx::MakeConst(a.ty(), b), span);                  \
   }                                                                    \
   inline PrimExpr Name(int a, const PrimExpr& b, Span span = Span()) { \
-    return Name(tirx::MakeConst(b.dtype(), a), b, span);               \
+    return Name(tirx::MakeConst(b.ty(), a), b, span);                  \
   }
 
 TVM_DEFINE_ASSIGN_OP_OVERLOAD(operator+=, operator+);
diff --git a/include/tvm/tirx/script/builder/ir.h b/include/tvm/tirx/script/builder/ir.h
index ad18d7ac4001..684653134a55 100644
--- a/include/tvm/tirx/script/builder/ir.h
+++ b/include/tvm/tirx/script/builder/ir.h
@@ -57,7 +57,7 @@ using tvm::tirx::Var;
  * \param axis_separators The separators between input axes when generating flattened output axes.
  * \return The declared buffer.
  */
-Buffer BufferDecl(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String buffer_name,
+Buffer BufferDecl(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String buffer_name,
                   ffi::Optional<Var> data, ffi::Optional<ffi::Array<PrimExpr>> strides,
                   ffi::Optional<PrimExpr> elem_offset, ffi::String storage_scope, int align,
                   int offset_factor, ffi::String buffer_type,
@@ -122,7 +122,7 @@ Type FuncRet(Type ret_type);
  * \return The matched buffer.
  */
 Buffer MatchBuffer(ffi::ObjectRef param, ffi::Array<PrimExpr> shape,
-                   DataType dtype = DataType::Float(32), ffi::Optional<Var> data = std::nullopt,
+                   PrimType dtype = PrimType::Float(32), ffi::Optional<Var> data = std::nullopt,
                    ffi::Array<PrimExpr> strides = {}, PrimExpr elem_offset = PrimExpr(),
                    ffi::String storage_scope = "global", int align = -1, int offset_factor = 0,
                    ffi::String buffer_type = "default",
@@ -197,7 +197,7 @@ void BlockAttrs(ffi::Map<ffi::String, ffi::Any> attrs);
  * T.prim_func(tirx=True).
  */
 ffi::Variant<Buffer, AllocBufferFrame> SBlockAllocBuffer(
-    ffi::Array<PrimExpr> shape, DataType dtype = DataType::Float(32),
+    ffi::Array<PrimExpr> shape, PrimType dtype = PrimType::Float(32),
     ffi::Optional<Var> data = std::nullopt, ffi::Array<PrimExpr> strides = {},
     PrimExpr elem_offset = PrimExpr(), ffi::String storage_scope = "", int align = -1,
     int offset_factor = 0, ffi::String buffer_type = "default",
@@ -213,7 +213,7 @@ namespace axis {
  * \param dtype The data type of the iteration variable.
  * \return The iteration variable.
  */
-Var Spatial(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+Var Spatial(Range dom, PrimExpr binding, PrimType dtype = PrimType::Int(32));
 
 /*!
  * \brief The reduced block axis defining function.
@@ -222,7 +222,7 @@ Var Spatial(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
  * \param dtype The data type of the iteration variable.
  * \return The iteration variable.
  */
-Var Reduce(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+Var Reduce(Range dom, PrimExpr binding, PrimType dtype = PrimType::Int(32));
 
 /*!
  * \brief The scanning block axis defining function.
@@ -231,7 +231,7 @@ Var Reduce(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
  * \param dtype The data type of the iteration variable.
  * \return The iteration variable.
  */
-Var Scan(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+Var Scan(Range dom, PrimExpr binding, PrimType dtype = PrimType::Int(32));
 
 /*!
  * \brief The opaque block axis defining function.
@@ -240,7 +240,7 @@ Var Scan(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
  * \param dtype The data type of the iteration variable.
  * \return The iteration variable.
  */
-Var Opaque(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+Var Opaque(Range dom, PrimExpr binding, PrimType dtype = PrimType::Int(32));
 
 /*!
  * \brief The block axis remapping function.
@@ -250,7 +250,7 @@ Var Opaque(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
  * \return The iteration variables.
  */
 ffi::Array<Var> Remap(ffi::String kinds, ffi::Array<PrimExpr> bindings,
-                      DataType dtype = DataType::Int(32));
+                      PrimType dtype = PrimType::Int(32));
 
 }  // namespace axis
 
@@ -412,7 +412,7 @@ ElseFrame Else();
  * \param layout The layout of the buffer.
  * \return The declaration frame.
  */
-DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String buffer_name,
+DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String buffer_name,
                            ffi::Optional<Var> data, ffi::Optional<ffi::Array<PrimExpr>> strides,
                            ffi::Optional<PrimExpr> elem_offset, ffi::String storage_scope,
                            int align, int offset_factor, ffi::String buffer_type,
@@ -428,7 +428,7 @@ DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::Stri
  * \param annotations Optional annotations for the allocation.
  * \return The allocated buffer.
  */
-Buffer AllocBuffer(ffi::Array<PrimExpr> shape, DataType dtype = DataType::Float(32),
+Buffer AllocBuffer(ffi::Array<PrimExpr> shape, PrimType dtype = PrimType::Float(32),
                    ffi::String storage_scope = "global",
                    ffi::Optional<ffi::Map<ffi::String, ffi::Any>> annotations = std::nullopt);
 
@@ -465,7 +465,7 @@ ComposeOpFrame ComposeOp(ffi::Map<ffi::String, Buffer> workspace,
  * \param dtype The data type of the variable.
  * \return The result variable which gets bound to the thread env.
  */
-Var EnvThread(ffi::String thread_tag, DataType dtype = DataType::Int(32));
+Var EnvThread(ffi::String thread_tag, PrimType dtype = PrimType::Int(32));
 
 /*!
  * \brief Store data in a buffer.
@@ -494,21 +494,20 @@ void Evaluate(PrimExpr value);
  * \param is_size_var Whether the pointer is a size var.
  *
  * \param is_unknown_type Used to distinguish between
- * `PrimType(DataType::Handle())` and
- * `PointerType(PrimType(DataType::Void()))`.  If true, resolve dtype
+ * `PrimType::Handle()` and `PointerType(PrimType(DLDataType{kDLOpaqueHandle, 0, 0}))`
+ * (`Void()` below denotes `DLDataType{kDLOpaqueHandle, 0, 0}`). If true, resolve dtype
  * of `Void()` as `PrimType`, and if false resolve dtype of `Void()`
  * as a `PointerType`.
  *
  * \return The pointer.
  */
-inline Var Handle(runtime::DataType dtype = runtime::DataType::Void(),
-                  ffi::String storage_scope = "global", bool is_size_var = false,
-                  bool is_unknown_type = false) {
+inline Var Handle(PrimType dtype = PrimType::Handle(), ffi::String storage_scope = "global",
+                  bool is_size_var = false, bool is_unknown_type = false) {
   Type type_annotation{nullptr};
   if (is_unknown_type && storage_scope == "global") {
-    type_annotation = PrimType(runtime::DataType::Handle());
+    type_annotation = PrimType::Handle();
   } else {
-    type_annotation = PointerType(PrimType(dtype), storage_scope);
+    type_annotation = PointerType(dtype, storage_scope);
   }
   return is_size_var ? tvm::tirx::SizeVar("", type_annotation)
                      : tvm::tirx::Var("", type_annotation);
@@ -519,67 +518,67 @@ inline Var TensorMap() { return tvm::tirx::Var("", PointerType(TensorMapType()))
 #define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName, DType)                                 \
   inline PrimExpr FuncName(ffi::Optional<PrimExpr> expr = std::nullopt,                     \
                            bool is_size_var = false) {                                      \
-    DataType dtype = DType;                                                                 \
+    PrimType dtype(DType);                                                                  \
     return expr.defined()                                                                   \
                ? tvm::cast(dtype, expr.value())                                             \
                : (is_size_var ? tvm::tirx::SizeVar("", dtype) : tvm::tirx::Var("", dtype)); \
   }
 
-#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(DType, FDType) \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##8, FDType(8));      \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##16, FDType(16));    \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##32, FDType(32));    \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##64, FDType(64));
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(BFloat, DataType::BFloat);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(Float, DataType::Float);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(UInt, DataType::UInt);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(Int, DataType::Int);
-
-#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(FuncName, FDType, Size) \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x2, FDType(Size, 2))      \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x4, FDType(Size, 4));     \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x8, FDType(Size, 8));     \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x16, FDType(Size, 16));   \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x32, FDType(Size, 32));   \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x64, FDType(Size, 64));
-
-#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(DType, FDType) \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##8, FDType, 8);      \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##16, FDType, 16);    \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##32, FDType, 32);    \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##64, FDType, 64);
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(BFloat, DataType::BFloat);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(Float, DataType::Float);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(UInt, DataType::UInt);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(Int, DataType::Int);
-
-#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(DType, FDType) \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType, FDType(1));                    \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x2, FDType(2));                \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x4, FDType(4));                \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x8, FDType(8));                \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x16, FDType(16));              \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x32, FDType(32));              \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x64, FDType(64));
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E3M4, DataType::Float8E3M4);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3, DataType::Float8E4M3);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3B11FNUZ, DataType::Float8E4M3B11FNUZ);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3FN, DataType::Float8E4M3FN);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3FNUZ, DataType::Float8E4M3FNUZ);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E5M2, DataType::Float8E5M2);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E5M2FNUZ, DataType::Float8E5M2FNUZ);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E8M0FNU, DataType::Float8E8M0FNU);
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float6E2M3FN, DataType::Float6E2M3FN);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float6E3M2FN, DataType::Float6E3M2FN);
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float4E2M1FN, DataType::Float4E2M1FN);
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(Boolean, DataType::Bool());
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(Void, DataType::Void());
+#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(DType, Code)               \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##8, (DLDataType{Code, 8, 1}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##16, (DLDataType{Code, 16, 1})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##32, (DLDataType{Code, 32, 1})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##64, (DLDataType{Code, 64, 1}));
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(BFloat, kDLBfloat);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(Float, kDLFloat);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(UInt, kDLUInt);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(Int, kDLInt);
+
+#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(FuncName, Code, Size)             \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x2, (DLDataType{Code, Size, 2}))    \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x4, (DLDataType{Code, Size, 4}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x8, (DLDataType{Code, Size, 8}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x16, (DLDataType{Code, Size, 16})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x32, (DLDataType{Code, Size, 32})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x64, (DLDataType{Code, Size, 64}));
+
+#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(DType, Code) \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##8, Code, 8);      \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##16, Code, 16);    \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##32, Code, 32);    \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##64, Code, 64);
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(BFloat, kDLBfloat);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(Float, kDLFloat);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(UInt, kDLUInt);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(Int, kDLInt);
+
+#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(DType, Code, Bits)  \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType, (DLDataType{Code, Bits, 1}));       \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x2, (DLDataType{Code, Bits, 2}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x4, (DLDataType{Code, Bits, 4}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x8, (DLDataType{Code, Bits, 8}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x16, (DLDataType{Code, Bits, 16})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x32, (DLDataType{Code, Bits, 32})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x64, (DLDataType{Code, Bits, 64}));
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E3M4, kDLFloat8_e3m4, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3, kDLFloat8_e4m3, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3B11FNUZ, kDLFloat8_e4m3b11fnuz, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3FN, kDLFloat8_e4m3fn, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3FNUZ, kDLFloat8_e4m3fnuz, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E5M2, kDLFloat8_e5m2, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E5M2FNUZ, kDLFloat8_e5m2fnuz, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E8M0FNU, kDLFloat8_e8m0fnu, 8);
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float6E2M3FN, kDLFloat6_e2m3fn, 6);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float6E3M2FN, kDLFloat6_e3m2fn, 6);
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float4E2M1FN, kDLFloat4_e2m1fn, 4);
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(Boolean, (DLDataType{kDLBool, 8, 1}));
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(Void, (DLDataType{kDLOpaqueHandle, 0, 0}));
 
 #undef TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST
 
diff --git a/include/tvm/tirx/stmt.h b/include/tvm/tirx/stmt.h
index 1ed4d5acac54..7eb004f8cf25 100644
--- a/include/tvm/tirx/stmt.h
+++ b/include/tvm/tirx/stmt.h
@@ -1282,7 +1282,7 @@ inline bool IsPragmaKey(const std::string& attr_key) {
  * \param span The location of this object in the source code.
  * \return Expr a expression with dtype.
  */
-TVM_DLL PrimExpr TypeAnnotation(DataType dtype, Span span = Span());
+TVM_DLL PrimExpr TypeAnnotation(PrimType dtype, Span span = Span());
 
 // overload printing of for type.
 TVM_DLL std::ostream& operator<<(std::ostream& os, ForKind kind);
diff --git a/include/tvm/tirx/var.h b/include/tvm/tirx/var.h
index 8c536ef0d668..3a4746a3f6a2 100644
--- a/include/tvm/tirx/var.h
+++ b/include/tvm/tirx/var.h
@@ -24,9 +24,9 @@
 #ifndef TVM_TIR_VAR_H_
 #define TVM_TIR_VAR_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ir/cow.h>
 #include <tvm/ir/expr.h>
-#include <tvm/runtime/data_type.h>
 
 #include <functional>
 #include <string>
@@ -57,7 +57,7 @@ class VarNode : public PrimExprNode {
    *
    * It is an optional field that provides a refined type of the variable than dtype.
    *
-   * \sa tvm/ir/type.h for discussion of relations between runtime::DataType and Type.
+   * \sa tvm/ir/type.h for discussion of relations between DLPack dtype and Type.
    */
   Type type_annotation;
 
@@ -84,7 +84,7 @@ class Var : public PrimExpr {
    * \param dtype data type
    * \param span The location of this object in the source code.
    */
-  TVM_DLL explicit Var(ffi::String name_hint = "v", DataType dtype = DataType::Int(32),
+  TVM_DLL explicit Var(ffi::String name_hint = "v", PrimType dtype = PrimType::Int(32),
                        Span span = Span());
   /*!
    * \brief Constructor which provides a more detailed type annotation.
@@ -110,7 +110,7 @@ class Var : public PrimExpr {
    * \param dtype The specified dtype
    * \return The new variable
    */
-  TVM_DLL Var copy_with_dtype(DataType dtype) const;
+  TVM_DLL Var copy_with_dtype(PrimType dtype) const;
 
   /*!
    * \brief Get pointer to the internal value.
@@ -150,7 +150,7 @@ class SizeVar : public Var {
    * \param t data type
    * \param span The location of this object in the source code.
    */
-  TVM_DLL explicit SizeVar(ffi::String name_hint = "s", DataType t = DataType::Int(32),
+  TVM_DLL explicit SizeVar(ffi::String name_hint = "s", PrimType t = PrimType::Int(32),
                            Span span = Span());
   /*!
    * \brief Constructor which provides a more detailed type annotation.
diff --git a/include/tvm/topi/broadcast.h b/include/tvm/topi/broadcast.h
index b0c6ac8f6722..26bf7c100ca5 100644
--- a/include/tvm/topi/broadcast.h
+++ b/include/tvm/topi/broadcast.h
@@ -252,7 +252,8 @@ TOPI_DEFINE_BCAST_OP(divide, { return div(a, b); });
  * \return The result.
  */
 TOPI_DEFINE_BCAST_OP(floor_divide, {
-  if (a.dtype().is_int() || a.dtype().is_uint()) {
+  PrimType a_ty = a.ty();
+  if (a_ty.code() == DLDataTypeCode::kDLInt || a_ty.code() == DLDataTypeCode::kDLUInt) {
     return floordiv(a, b);
   } else {
     return floor(div(a, b));
@@ -287,7 +288,8 @@ TOPI_DEFINE_BCAST_OP(log_add_exp, { return logaddexp(a, b); });
  * \return The result.
  */
 TOPI_DEFINE_BCAST_OP(trunc_divide, {
-  if (a.dtype().is_int() || a.dtype().is_uint()) {
+  PrimType a_ty = a.ty();
+  if (a_ty.code() == DLDataTypeCode::kDLInt || a_ty.code() == DLDataTypeCode::kDLUInt) {
     return truncdiv(a, b);
   } else {
     return trunc(div(a, b));
@@ -319,7 +321,8 @@ TOPI_DEFINE_BCAST_OP(mod, { return truncmod(a, b); });
  * \return The result.
  */
 TOPI_DEFINE_BCAST_OP(floor_mod, {
-  if (a.dtype().is_int() || a.dtype().is_uint()) {
+  PrimType a_ty = a.ty();
+  if (a_ty.code() == DLDataTypeCode::kDLInt || a_ty.code() == DLDataTypeCode::kDLUInt) {
     return floormod(a, b);
   } else {
     return a - floor_divide(a, b) * b;
@@ -338,7 +341,8 @@ TOPI_DEFINE_BCAST_OP(floor_mod, {
  * \return The result.
  */
 TOPI_DEFINE_BCAST_OP(trunc_mod, {
-  if (a.dtype().is_int() || a.dtype().is_uint()) {
+  PrimType a_ty = a.ty();
+  if (a_ty.code() == DLDataTypeCode::kDLInt || a_ty.code() == DLDataTypeCode::kDLUInt) {
     return truncmod(a, b);
   } else {
     return a - trunc_divide(a, b) * b;
diff --git a/include/tvm/topi/contrib/cublas.h b/include/tvm/topi/contrib/cublas.h
index 3590b7a54458..18ad4320f489 100644
--- a/include/tvm/topi/contrib/cublas.h
+++ b/include/tvm/topi/contrib/cublas.h
@@ -48,7 +48,7 @@ inline Tensor cublas_matmul(const Tensor& lhs, const Tensor& rhs, bool transa, b
   auto m = transb ? rhs->shape[0] : rhs->shape[1];
 
   return make_extern(
-      {{n, m}}, {lhs->dtype}, {lhs, rhs},
+      {{n, m}}, {lhs->GetDataType()}, {lhs, rhs},
       [&](ffi::Array<Buffer> ins, ffi::Array<Buffer> outs) {
         return call_packed({StringImm("tvm.contrib.cublas.matmul"), pack_buffer(ins[0]),
                             pack_buffer(ins[1]), pack_buffer(outs[0]), transa, transb});
@@ -73,7 +73,7 @@ inline Tensor cublas_batch_matmul(const Tensor& lhs, const Tensor& rhs, bool tra
   auto m = transb ? rhs->shape[1] : rhs->shape[2];
 
   return make_extern(
-      {{b, n, m}}, {lhs->dtype}, {lhs, rhs},
+      {{b, n, m}}, {lhs->GetDataType()}, {lhs, rhs},
       [&](ffi::Array<Buffer> ins, ffi::Array<Buffer> outs) {
         return call_packed({StringImm("tvm.contrib.cublas.batch_matmul"), pack_buffer(ins[0]),
                             pack_buffer(ins[1]), pack_buffer(outs[0]), transa, transb});
diff --git a/include/tvm/topi/detail/broadcast.h b/include/tvm/topi/detail/broadcast.h
index c9dce9eb7489..7c990c5c6e1a 100644
--- a/include/tvm/topi/detail/broadcast.h
+++ b/include/tvm/topi/detail/broadcast.h
@@ -42,10 +42,10 @@ struct BroadcastHelper {
   std::deque<tvm::tirx::Var> vars2;
 };
 
-static inline DataType CommonType(DataType type1, DataType type2) {
-  TVM_FFI_ICHECK(type1.is_scalar() && type2.is_scalar());
+static inline PrimType CommonType(const PrimType& type1, const PrimType& type2) {
+  TVM_FFI_ICHECK(type1.IsScalar() && type2.IsScalar());  // scalar operands only
   TVM_FFI_ICHECK(type1.code() == type2.code());
-  return DataType(type1.code(), std::max(type1.bits(), type2.bits()), /*lanes=*/1);
+  return type1.bits() < type2.bits() ? type1.WithBits(type2.bits()) : type1;  // wider bit width
 }
 
 inline BroadcastHelper BroadcastShape(const tvm::ffi::Array<tvm::PrimExpr>& shape1,
@@ -56,15 +56,15 @@ inline BroadcastHelper BroadcastShape(const tvm::ffi::Array<tvm::PrimExpr>& shap
   tvm::PrimExpr one(1);
   int i;
 
-  auto cast_if_needed = [](DataType to_type, PrimExpr expr) {
-    return to_type != expr.dtype() ? cast(to_type, expr) : expr;
+  auto cast_if_needed = [](PrimType to_type, PrimExpr expr) {
+    return to_type == expr.ty() ? expr : cast(to_type, expr);  // skip cast when types match
   };
 
   for (i = 1; i <= std::min(s1_size, s2_size); ++i) {
     // TODO(@icemelon9): Need to revisit this part
     const IntImmNode* static_size1 = shape1[s1_size - i].as<IntImmNode>();
     const IntImmNode* static_size2 = shape2[s2_size - i].as<IntImmNode>();
-    DataType common_type = CommonType(shape1[s1_size - i].dtype(), shape2[s2_size - i].dtype());
+    PrimType common_type = CommonType(shape1[s1_size - i].ty(), shape2[s2_size - i].ty());
 
     bh.all_vars.push_front(tvm::tirx::Var("dim", common_type));
     if (topi::detail::EqualCheck(shape1[s1_size - i], shape2[s2_size - i])) {
@@ -104,7 +104,7 @@ inline BroadcastHelper BroadcastShape(const tvm::ffi::Array<tvm::PrimExpr>& shap
   auto& shape = (s1_size > s2_size) ? shape1 : shape2;
   auto& vars = (s1_size > s2_size) ? bh.vars1 : bh.vars2;
   for (; i <= max_size; ++i) {
-    bh.all_vars.push_front(tvm::tirx::Var("v", shape[max_size - 1].dtype()));
+    bh.all_vars.push_front(tvm::tirx::Var("v", shape[max_size - i].ty()));  // dtype of this dim
     bh.common_shape.push_front(shape[max_size - i]);
     vars.push_front(bh.all_vars[0]);
   }
@@ -130,7 +130,7 @@ inline tvm::ffi::Array<tvm::PrimExpr> InputIndexFromBroadcast(
     // Only inject 0 here if we have not yet reached the dimension of I
     // (i.e. this must be a 1)
     if (!found && (ovars.size() - i) <= expected_dims) {
-      ivars.push_back(tvm::IntImm(ovars[i].dtype(), 0));
+      ivars.push_back(tvm::IntImm(ovars[i].ty(), 0));
     }
   }
   TVM_FFI_ICHECK(expected_dims == ivars.size());
diff --git a/include/tvm/topi/detail/extern.h b/include/tvm/topi/detail/extern.h
index 161d5291c38e..b0ce2d713bee 100644
--- a/include/tvm/topi/detail/extern.h
+++ b/include/tvm/topi/detail/extern.h
@@ -28,6 +28,7 @@
 #include <tvm/tirx/builtin.h>
 
 #include <string>
+#include <utility>
 #include <vector>
 
 namespace tvm {
@@ -61,7 +62,7 @@ using FExtern = std::function<PrimExpr(ffi::Array<Buffer>, ffi::Array<Buffer>)>;
  * element of out_types.
  */
 inline ffi::Array<Tensor> make_extern(const ffi::Array<ffi::Array<PrimExpr>>& out_shapes,
-                                      const std::vector<DataType>& out_types,
+                                      const std::vector<PrimType>& out_types,
                                       const ffi::Array<Tensor>& inputs, FExtern fextern,
                                       std::string name, std::string tag,
                                       ::tvm::ffi::Map<ffi::String, ffi::Any> attrs) {
@@ -100,10 +101,10 @@ inline ffi::Array<Tensor> make_extern(const ffi::Array<ffi::Array<PrimExpr>>& ou
 inline PrimExpr pack_buffer(Buffer buf) {
   TVM_FFI_ICHECK_GT(buf->shape.size(), 0) << "buf shape must have at least one element";
   auto shape =
-      tvm::tirx::Call(DataType::Handle(), tvm::tirx::builtin::tvm_stack_make_shape(), buf->shape);
+      tvm::tirx::Call(PrimType::Handle(), tvm::tirx::builtin::tvm_stack_make_shape(), buf->shape);
   PrimExpr strides;
   if (buf->strides.size() > 0) {
-    strides = tvm::tirx::Call(DataType::Handle(), tvm::tirx::builtin::tvm_stack_make_shape(),
+    strides = tvm::tirx::Call(PrimType::Handle(), tvm::tirx::builtin::tvm_stack_make_shape(),
                               buf->strides);
   } else {
     strides = 0;
@@ -112,9 +113,9 @@ inline PrimExpr pack_buffer(Buffer buf) {
                                  shape,
                                  strides,
                                  IntImm::Int32(static_cast<int64_t>(buf->shape.size())),
-                                 MakeConst(buf->dtype, 0),
+                                 MakeConst(PrimType(buf->dtype), 0),
                                  buf->elem_offset};
-  return tvm::tirx::Call(DataType::Handle(), tvm::tirx::builtin::tvm_stack_make_array(), pack_args);
+  return tvm::tirx::Call(PrimType::Handle(), tvm::tirx::builtin::tvm_stack_make_array(), pack_args);
 }
 
 /*!
@@ -127,7 +128,7 @@ inline PrimExpr pack_buffer(Buffer buf) {
  * \return An expression representing the invocation
  */
 inline PrimExpr call_packed(ffi::Array<PrimExpr> args) {
-  return tvm::tirx::Call(DataType::Int(32), tvm::tirx::builtin::tvm_call_packed(), args);
+  return tvm::tirx::Call(PrimType::Int(32), tvm::tirx::builtin::tvm_call_packed(), args);
 }
 
 }  // namespace detail
diff --git a/include/tvm/topi/detail/strided_slice.h b/include/tvm/topi/detail/strided_slice.h
index 19ee79a2086f..95ab3a38cbc0 100644
--- a/include/tvm/topi/detail/strided_slice.h
+++ b/include/tvm/topi/detail/strided_slice.h
@@ -91,7 +91,7 @@ inline ffi::Array<PrimExpr> StridedSliceCanonicalizeBegin(const ffi::Array<PrimE
                                                           const std::vector<int64_t>& begin,
                                                           const std::vector<int64_t>& strides,
                                                           const ffi::Array<int64_t>& axes,
-                                                          DataType dtype,
+                                                          PrimType dtype,
                                                           std::string slice_mode = "end") {
   ffi::Array<PrimExpr> begin_expr;
   for (size_t i = 0; i < axes.size(); ++i) {
@@ -140,9 +140,9 @@ inline ffi::Array<PrimExpr> StridedSliceOutputShape(
           static_cast<int>((interval + std::abs(strides[i]) - 1) / std::abs(strides[i]));
       TVM_FFI_ICHECK(strides[i] < 0 ? (end_i <= begin_i) : (begin_i <= end_i))
           << ": Input [Begin=" << begin[i] << ", End=" << end[i] << "] is invalid for axis=" << i;
-      out_shape.Set(ax, cast(out_shape[i].dtype(), PrimExpr(slice_size)));
+      out_shape.Set(ax, cast(out_shape[i].ty(), PrimExpr(slice_size)));
     } else {
-      out_shape.Set(ax, tvm::tirx::Var("dim", out_shape[i]->dtype));
+      out_shape.Set(ax, tvm::tirx::Var("dim", out_shape[i].ty()));
     }
   }
 
diff --git a/include/tvm/topi/detail/tensor_utils.h b/include/tvm/topi/detail/tensor_utils.h
index d67ad6359434..82649cd0b387 100644
--- a/include/tvm/topi/detail/tensor_utils.h
+++ b/include/tvm/topi/detail/tensor_utils.h
@@ -70,10 +70,10 @@ inline PrimExpr bilinear_sample_nchw(const Tensor& input, const ffi::Array<PrimE
   auto in_y = indices[2];
   auto in_x = indices[3];
 
-  auto y_low = tvm::cast(DataType::Int(32), tvm::floor(in_y));
+  auto y_low = tvm::cast(PrimType::Int(32), tvm::floor(in_y));
   auto y_high = y_low + 1;
 
-  auto x_low = tvm::cast(DataType::Int(32), tvm::floor(in_x));
+  auto x_low = tvm::cast(PrimType::Int(32), tvm::floor(in_x));
   auto x_high = x_low + 1;
 
   auto wy_h = in_y - y_low;
@@ -114,10 +114,10 @@ inline PrimExpr bilinear_sample_nhwc(const Tensor& input, const ffi::Array<PrimE
   auto in_y = indices[1];
   auto in_x = indices[2];
 
-  auto y_low = tvm::cast(DataType::Int(32), tvm::floor(in_y));
+  auto y_low = tvm::cast(PrimType::Int(32), tvm::floor(in_y));
   auto y_high = y_low + 1;
 
-  auto x_low = tvm::cast(DataType::Int(32), tvm::floor(in_x));
+  auto x_low = tvm::cast(PrimType::Int(32), tvm::floor(in_x));
   auto x_high = x_low + 1;
 
   auto wy_h = in_y - y_low;
diff --git a/include/tvm/topi/elemwise.h b/include/tvm/topi/elemwise.h
index 57225af9b493..b47204b46c25 100644
--- a/include/tvm/topi/elemwise.h
+++ b/include/tvm/topi/elemwise.h
@@ -82,22 +82,23 @@ TOPI_DECLARE_UNARY_OP(isinf);
 inline Tensor fast_tanh_float(const Tensor& in, std::string name, std::string tag) {
   // Clamp the inputs to the range [-9, 9] since anything outside
   // this range is +/-1.0f in single-precision.
-  auto x = maximum(MakeConst(in->dtype, -9.0), minimum(MakeConst(in->dtype, 9.0), in));
+  PrimType input_type = in->GetDataType();
+  auto x = maximum(MakeConst(input_type, -9.0), minimum(MakeConst(input_type, 9.0), in));
 
   // The monomial coefficients of the numerator polynomial (odd).
-  auto alpha_1 = MakeConst(in->dtype, 4.89352455891786e-03);
-  auto alpha_3 = MakeConst(in->dtype, 6.37261928875436e-04);
-  auto alpha_5 = MakeConst(in->dtype, 1.48572235717979e-05);
-  auto alpha_7 = MakeConst(in->dtype, 5.12229709037114e-08);
-  auto alpha_9 = MakeConst(in->dtype, -8.60467152213735e-11);
-  auto alpha_11 = MakeConst(in->dtype, 2.00018790482477e-13);
-  auto alpha_13 = MakeConst(in->dtype, -2.76076847742355e-16);
+  auto alpha_1 = MakeConst(input_type, 4.89352455891786e-03);
+  auto alpha_3 = MakeConst(input_type, 6.37261928875436e-04);
+  auto alpha_5 = MakeConst(input_type, 1.48572235717979e-05);
+  auto alpha_7 = MakeConst(input_type, 5.12229709037114e-08);
+  auto alpha_9 = MakeConst(input_type, -8.60467152213735e-11);
+  auto alpha_11 = MakeConst(input_type, 2.00018790482477e-13);
+  auto alpha_13 = MakeConst(input_type, -2.76076847742355e-16);
 
   // The monomial coefficients of the denominator polynomial (even).
-  auto beta_0 = MakeConst(in->dtype, 4.89352518554385e-03);
-  auto beta_2 = MakeConst(in->dtype, 2.26843463243900e-03);
-  auto beta_4 = MakeConst(in->dtype, 1.18534705686654e-04);
-  auto beta_6 = MakeConst(in->dtype, 1.19825839466702e-06);
+  auto beta_0 = MakeConst(input_type, 4.89352518554385e-03);
+  auto beta_2 = MakeConst(input_type, 2.26843463243900e-03);
+  auto beta_4 = MakeConst(input_type, 1.18534705686654e-04);
+  auto beta_6 = MakeConst(input_type, 1.19825839466702e-06);
 
   return compute(
       x->shape,
@@ -130,7 +131,7 @@ inline Tensor fast_tanh_float(const Tensor& in, std::string name, std::string ta
  */
 inline Tensor fast_tanh(const Tensor& x, std::string name = "T_fast_tanh",
                         std::string tag = kElementWise) {
-  if (x->dtype == DataType::Float(32)) {
+  if (x->GetDataType().MatchesElementType(DLDataTypeCode::kDLFloat, 32)) {
     // invoke fast_tanh_float implementation
     return fast_tanh_float(x, name, tag);
   } else {
@@ -209,9 +210,10 @@ inline Tensor sign(const Tensor& x, std::string name = "T_sign", std::string tag
   return compute(
       x->shape,
       [&](const ffi::Array<Var>& i) {
-        PrimExpr zero = MakeConst(x->dtype, 0);
-        PrimExpr one = MakeConst(x->dtype, 1);
-        PrimExpr minus_one = MakeConst(x->dtype, -1);
+        PrimType x_type(x->GetDataType());
+        PrimExpr zero = MakeConst(x_type, 0);
+        PrimExpr one = MakeConst(x_type, 1);
+        PrimExpr minus_one = MakeConst(x_type, -1);
         auto s1 = tvm::tirx::Select((x(i) < zero), minus_one, zero);
         auto s2 = tvm::tirx::Select((x(i) > zero), one, s1);
         return s2;
@@ -232,7 +234,7 @@ inline Tensor rsqrt(const Tensor& x, std::string name = "tensor", std::string ta
   return compute(
       x->shape,
       [&](const ffi::Array<Var>& i) {
-        PrimExpr one = MakeConst(x->dtype, 1);
+        PrimExpr one = MakeConst(x->GetDataType(), 1);
         return one / tvm::sqrt(x(i));
       },
       name, tag);
@@ -255,8 +257,9 @@ inline Tensor clip(const Tensor& x, const PrimExpr& a_min, const PrimExpr& a_max
   return compute(
       x->shape,
       [&](const ffi::Array<Var>& i) {
-        auto min_val = tvm::cast(x->dtype, a_min);
-        auto max_val = tvm::cast(x->dtype, a_max);
+        PrimType x_type(x->GetDataType());
+        auto min_val = tvm::cast(x_type, a_min);
+        auto max_val = tvm::cast(x_type, a_max);
         return tvm::max(tvm::min(x(i), max_val), min_val);  // NOLINT(*)
       },
       name, tag);
@@ -274,16 +277,30 @@ inline Tensor clip(const Tensor& x, const PrimExpr& a_min, const PrimExpr& a_max
  *
  * \return A Tensor whose op member is the cast operation
  */
-inline Tensor cast(const Tensor& x, DataType type, std::string name = "T_cast",
+inline Tensor cast(const Tensor& x, PrimType type, std::string name, std::string tag);
+
+/*!
+ * \brief Overload of cast that accepts a raw DLDataType.
+ *
+ * Wraps \p type in a PrimType and forwards every other argument
+ * to the PrimType overload declared above.
+ */
+inline Tensor cast(const Tensor& x, DLDataType type, std::string name = "T_cast",
+                   std::string tag = kElementWise) {
+  return cast(x, PrimType(type), std::move(name), std::move(tag));
+}
+
+inline Tensor cast(const Tensor& x, PrimType type, std::string name = "T_cast",
                    std::string tag = kElementWise) {
   return compute(
       x->shape,
       [&](const ffi::Array<Var>& i) -> PrimExpr {
         auto expr = x(i);
-        if (expr.dtype().code() == type.code() && expr.dtype().bits() == type.bits()) {
-          if (expr.dtype().lanes() == type.lanes()) {
+        PrimType expr_ty = expr.ty();
+        if (expr_ty.MatchesElementType(type.code(), type.bits())) {
+          if (expr_ty.lanes() == type.lanes()) {
             return expr;
-          } else if (expr.dtype().lanes() == 1 && type.is_vector()) {
+          } else if (expr_ty.lanes() == 1 && type.IsFixedLengthVector()) {
             return tvm::tirx::Broadcast(expr, type.lanes());
           }
         }
@@ -303,7 +314,20 @@ inline Tensor cast(const Tensor& x, DataType type, std::string name = "T_cast",
  *
  * \return A Tensor whose op member is the reinterpret operation
  */
-inline Tensor reinterpret(const Tensor& x, DataType type, std::string name = "tensor",
+inline Tensor reinterpret(const Tensor& x, PrimType type, std::string name, std::string tag);
+
+/*!
+ * \brief Overload of reinterpret that accepts a raw DLDataType.
+ *
+ * Wraps \p type in a PrimType and forwards every other argument
+ * to the PrimType overload declared above.
+ */
+inline Tensor reinterpret(const Tensor& x, DLDataType type, std::string name = "tensor",
+                          std::string tag = kElementWise) {
+  return reinterpret(x, PrimType(type), std::move(name), std::move(tag));
+}
+
+inline Tensor reinterpret(const Tensor& x, PrimType type, std::string name = "tensor",
                           std::string tag = kElementWise) {
   return compute(
       x->shape, [&](const ffi::Array<Var>& i) { return reinterpret(type, x(i)); }, name, tag);
@@ -344,7 +362,21 @@ inline Tensor elemwise_sum(const ffi::Array<Tensor>& xs, std::string name = "T_e
  *
  * \return A Tensor whose op member is the full operation
  */
-inline Tensor full(const ffi::Array<PrimExpr>& shape, DataType dtype, const PrimExpr fill_value,
+inline Tensor full(const ffi::Array<PrimExpr>& shape, PrimType dtype, const PrimExpr fill_value,
+                   std::string name, std::string tag);
+
+/*!
+ * \brief Overload of full that accepts a raw DLDataType.
+ *
+ * Wraps \p dtype in a PrimType and forwards every other argument
+ * to the PrimType overload declared above.
+ */
+inline Tensor full(const ffi::Array<PrimExpr>& shape, DLDataType dtype, const PrimExpr fill_value,
+                   std::string name = "T_full", std::string tag = kElementWise) {
+  return full(shape, PrimType(dtype), fill_value, std::move(name), std::move(tag));
+}
+
+inline Tensor full(const ffi::Array<PrimExpr>& shape, PrimType dtype, const PrimExpr fill_value,
                    std::string name = "T_full", std::string tag = kElementWise) {
   PrimExpr ev = cast(dtype, fill_value);
   if (!ev.defined()) {
@@ -366,7 +392,7 @@ inline Tensor full(const ffi::Array<PrimExpr>& shape, DataType dtype, const Prim
  */
 inline Tensor full_like(const Tensor& x, const PrimExpr fill_value,
                         std::string name = "T_full_like", std::string tag = kElementWise) {
-  PrimExpr ev = cast(x->dtype, fill_value);
+  PrimExpr ev = cast(x->GetDataType(), fill_value);
   return compute(x->shape, [&](const ffi::Array<Var>& i) { return ev; }, name, tag);
 }
 
@@ -392,19 +418,17 @@ inline Tensor full_like(const Tensor& x, const PrimExpr fill_value,
  * y = exp(f) = 1 + 2 * P(x**2)/(Q(x**2) - P(x**2))
  */
 inline Tensor fast_exp_float32(const Tensor& _x, std::string name, std::string tag) {
-  auto x_hi = FloatImm(DataType::Float(32), 88.3762626647950f);
-  auto x_lo = FloatImm(DataType::Float(32), -88.3762626647949f);
-  auto log2e = FloatImm(DataType::Float(32), 1.44269504088896341f);
-  auto ln2 = FloatImm(DataType::Float(32), 0.6931471805599453f);
-  PrimExpr p[6] = {FloatImm(DataType::Float(32), 1.9875691500E-4f),
-                   FloatImm(DataType::Float(32), 1.3981999507E-3f),
-                   FloatImm(DataType::Float(32), 8.3334519073E-3f),
-                   FloatImm(DataType::Float(32), 4.1665795894E-2f),
-                   FloatImm(DataType::Float(32), 1.6666665459E-1f),
-                   FloatImm(DataType::Float(32), 5.0000001201E-1f)};
-  auto one = FloatImm(DataType::Float(32), 1.0f);
-  auto one_half = FloatImm(DataType::Float(32), 0.5f);
-  auto b = FloatImm(DataType::Float(32), 127.0f);
+  PrimType f32_ty = PrimType::Float(32);
+  auto x_hi = FloatImm(f32_ty, 88.3762626647950f);
+  auto x_lo = FloatImm(f32_ty, -88.3762626647949f);
+  auto log2e = FloatImm(f32_ty, 1.44269504088896341f);
+  auto ln2 = FloatImm(f32_ty, 0.6931471805599453f);
+  PrimExpr p[6] = {FloatImm(f32_ty, 1.9875691500E-4f), FloatImm(f32_ty, 1.3981999507E-3f),
+                   FloatImm(f32_ty, 8.3334519073E-3f), FloatImm(f32_ty, 4.1665795894E-2f),
+                   FloatImm(f32_ty, 1.6666665459E-1f), FloatImm(f32_ty, 5.0000001201E-1f)};
+  auto one = FloatImm(f32_ty, 1.0f);
+  auto one_half = FloatImm(f32_ty, 0.5f);
+  auto b = FloatImm(f32_ty, 127.0f);
 
   return compute(
       _x->shape,
@@ -419,7 +443,7 @@ inline Tensor fast_exp_float32(const Tensor& _x, std::string name, std::string t
             (((((p[0] * f + p[1]) * f + p[2]) * f + p[3]) * f + p[4]) * f + p[5]) * f * f + f + one;
         // Return 2^m * exp(r).
         auto ef =
-            tvm::reinterpret(DataType::Float(32), ::tvm::cast(DataType::Int(32), n + b) << 23);
+            tvm::reinterpret(PrimType::Float(32), ::tvm::cast(PrimType::Int(32), n + b) << 23);
         return ::tvm::max(ef * y, _x(i));  // NOLINT(*)
       },
       name, tag);
@@ -437,7 +461,7 @@ inline Tensor fast_exp_float32(const Tensor& _x, std::string name, std::string t
  */
 inline Tensor fast_exp(const Tensor& x, std::string name = "T_fast_exp",
                        std::string tag = kElementWise) {
-  if (x->dtype == DataType::Float(32)) {
+  if (x->GetDataType().MatchesElementType(DLDataTypeCode::kDLFloat, 32)) {
     auto ret = fast_exp_float32(x, name, tag);
     return ret;
   } else {
@@ -474,10 +498,11 @@ inline Tensor fast_erf_float16(const Tensor& data, std::string name, std::string
  */
 inline Tensor fast_erf(const Tensor& x, std::string name = "T_fast_erf",
                        std::string tag = kElementWise) {
-  if (x->dtype == DataType::Float(32)) {
+  PrimType x_type(x->GetDataType());
+  if (x_type.MatchesElementType(DLDataTypeCode::kDLFloat, 32)) {
     auto ret = fast_erf_float32(x, name, tag);
     return ret;
-  } else if (x->dtype == DataType::Float(16)) {
+  } else if (x_type.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
     auto ret = fast_erf_float16(x, name, tag);
     return ret;
   } else {
diff --git a/include/tvm/topi/nn.h b/include/tvm/topi/nn.h
index 0a448620dae3..b864bfe53ea3 100644
--- a/include/tvm/topi/nn.h
+++ b/include/tvm/topi/nn.h
@@ -57,7 +57,7 @@ inline tvm::te::Tensor relu(const tvm::te::Tensor& t, T threshold = static_cast<
   return tvm::te::compute(
       t->shape,
       [&](const tvm::ffi::Array<tvm::tirx::Var>& i) {
-        auto threshold_const = tvm::tirx::MakeConst(t->dtype, threshold);
+        auto threshold_const = tvm::tirx::MakeConst(tvm::PrimType(t->dtype), threshold);
         return tvm::max(t(i), threshold_const);
       },
       name, tag);
@@ -80,7 +80,7 @@ inline tvm::te::Tensor leaky_relu(const tvm::te::Tensor& t, double alpha = 0.1,
       t->shape,
       [&](const tvm::ffi::Array<tvm::tirx::Var>& i) {
         auto value = t(i);
-        auto calpha = tvm::tirx::MakeConst(value.dtype(), alpha);
+        auto calpha = tvm::tirx::MakeConst(value.ty(), alpha);
         return tvm::tirx::Select(value > 0, value, value * calpha);
       },
       name, tag);
@@ -171,10 +171,10 @@ inline tvm::te::Tensor pad(
   tvm::ffi::Array<tvm::PrimExpr> pad_after_int32;
 
   for (const auto& ele : pad_before) {
-    pad_before_int32.push_back(tvm::cast(tvm::DataType::Int(32), ele));
+    pad_before_int32.push_back(tvm::cast(tvm::PrimType::Int(32), ele));
   }
   for (const auto& ele : pad_after) {
-    pad_after_int32.push_back(tvm::cast(tvm::DataType::Int(32), ele));
+    pad_after_int32.push_back(tvm::cast(tvm::PrimType::Int(32), ele));
   }
 
   tvm::ffi::Array<tvm::PrimExpr> output_shape;
@@ -194,7 +194,7 @@ inline tvm::te::Tensor pad(
   }
 
   if (!pad_value.defined()) {
-    pad_value = tvm::tirx::MakeConst(t->dtype, 0);
+    pad_value = tvm::tirx::MakeConst(tvm::PrimType(t->dtype), 0);
   }
 
   auto l = [&](tvm::ffi::Array<tvm::tirx::Var> ovars) {
@@ -495,19 +495,19 @@ inline tvm::te::Tensor space_to_batch_nd(const tvm::te::Tensor& data,
   tvm::ffi::Array<tvm::PrimExpr> pad_after_int32;
 
   // pad size for batch dimension is 0
-  pad_before_int32.push_back(tvm::cast(tvm::DataType::Int(32), 0));
-  pad_after_int32.push_back(tvm::cast(tvm::DataType::Int(32), 0));
+  pad_before_int32.push_back(tvm::cast(tvm::PrimType::Int(32), 0));
+  pad_after_int32.push_back(tvm::cast(tvm::PrimType::Int(32), 0));
   // insert pad sizes given for spatial dimensions
   for (const auto& ele : pad_before) {
-    pad_before_int32.push_back(tvm::cast(tvm::DataType::Int(32), ele));
+    pad_before_int32.push_back(tvm::cast(tvm::PrimType::Int(32), ele));
   }
   for (const auto& ele : pad_after) {
-    pad_after_int32.push_back(tvm::cast(tvm::DataType::Int(32), ele));
+    pad_after_int32.push_back(tvm::cast(tvm::PrimType::Int(32), ele));
   }
 
   // pad the input with paddings provided
   if (!pad_value.defined()) {
-    pad_value = tvm::tirx::MakeConst(data->dtype, 0);
+    pad_value = tvm::tirx::MakeConst(tvm::PrimType(data->dtype), 0);
   }
   padded_t = pad(data, pad_before_int32, pad_after_int32, pad_value);
 
@@ -629,9 +629,9 @@ inline tvm::te::Tensor batch_to_space_nd(const tvm::te::Tensor& data,
   // Crop the start and end of dimensions of out
   ffi::Array<ffi::Optional<IntImm>> begin_idx, end_idx;
   ffi::Array<IntImm> strides;
-  DataType index_dtype = DataType::Int(64);
+  PrimType index_ty = PrimType::Int(64);
   for (size_t i = 0; i < r_p_shape.size(); ++i) {
-    strides.push_back(IntImm(index_dtype, 1));
+    strides.push_back(IntImm(index_ty, 1));
     if (i > 0 && i <= num_block_dims) {
       // prepare begin and end index for spatial dimensions
       int64_t begin_i = GetConstInt(crop_begin_list[i - 1]);
@@ -640,12 +640,12 @@ inline tvm::te::Tensor batch_to_space_nd(const tvm::te::Tensor& data,
       TVM_FFI_ICHECK_GT(out_i, (begin_i + end_i))
           << "Incorrect crop sizes for (" << i << ")th dim, can not crop more than"
           << " output size" << out_i << " vs " << (begin_i + end_i);
-      begin_idx.push_back(IntImm(index_dtype, begin_i));
-      end_idx.push_back(IntImm(index_dtype, out_i - end_i));
+      begin_idx.push_back(IntImm(index_ty, begin_i));
+      end_idx.push_back(IntImm(index_ty, out_i - end_i));
     } else {
       // ignore the batch and remaining dimension
-      begin_idx.push_back(IntImm(index_dtype, 0));
-      end_idx.push_back(IntImm(index_dtype, GetConstInt(r_p_shape[i])));
+      begin_idx.push_back(IntImm(index_ty, 0));
+      end_idx.push_back(IntImm(index_ty, GetConstInt(r_p_shape[i])));
     }
   }
 
@@ -677,7 +677,7 @@ inline Tensor nll_loss(const Tensor& predictions, const Tensor& targets, const T
         [&](const tvm::ffi::Array<tvm::tirx::Var>& target_indices) {
           auto c = targets();
           return tvm::tirx::Select(c != ignore_index, -predictions(c) * weights(c),
-                                   tvm::tirx::MakeConst(predictions->dtype, 0));
+                                   tvm::tirx::MakeConst(tvm::PrimType(predictions->dtype), 0));
         },
         name, tag);
     if (reduction == "mean") {
@@ -686,7 +686,7 @@ inline Tensor nll_loss(const Tensor& predictions, const Tensor& targets, const T
           [&](const tvm::ffi::Array<tvm::tirx::Var>& target_indices) {
             auto c = targets();
             return tvm::tirx::Select(c != ignore_index, weights(c),
-                                     tvm::tirx::MakeConst(predictions->dtype, 0));
+                                     tvm::tirx::MakeConst(tvm::PrimType(predictions->dtype), 0));
           },
           name, tag);
       return topi::divide(T, W);
@@ -705,7 +705,7 @@ inline Tensor nll_loss(const Tensor& predictions, const Tensor& targets, const T
           pred_indices.push_back(target_indices[i]);  // indices for multidimensional loss
         }
         return tvm::tirx::Select(c != ignore_index, -predictions(pred_indices) * weights(c),
-                                 tvm::tirx::MakeConst(predictions->dtype, 0));
+                                 tvm::tirx::MakeConst(tvm::PrimType(predictions->dtype), 0));
       },
       name, tag);
   TVM_FFI_ICHECK(T->shape.size() != 0);
@@ -715,7 +715,7 @@ inline Tensor nll_loss(const Tensor& predictions, const Tensor& targets, const T
         [&](const tvm::ffi::Array<tvm::tirx::Var>& target_indices) {
           auto c = targets(target_indices);
           return tvm::tirx::Select(c != ignore_index, weights(c),
-                                   tvm::tirx::MakeConst(predictions->dtype, 0));
+                                   tvm::tirx::MakeConst(tvm::PrimType(predictions->dtype), 0));
         },
         name, tag);
     return topi::divide(topi::sum(T, tvm::ffi::Array<int64_t>(nullptr)),
diff --git a/include/tvm/topi/nn/bnn.h b/include/tvm/topi/nn/bnn.h
index 5faed879c005..56a6f3aaa815 100644
--- a/include/tvm/topi/nn/bnn.h
+++ b/include/tvm/topi/nn/bnn.h
@@ -71,14 +71,14 @@ inline tvm::te::Tensor binarize_pack(const tvm::te::Tensor& data, int axis,
           start_idx.push_back(i == static_cast<size_t>(axis) ? indices[i] * 32
                                                              : static_cast<PrimExpr>(indices[i]));
         }
-        PrimExpr packed = IntImm(DataType::UInt(32), 0);
+        PrimExpr packed = IntImm(PrimType::UInt(32), 0);
         for (size_t j = 0; j < 32; ++j) {
           ffi::Array<PrimExpr> idx;
           for (size_t i = 0; i < n; ++i) {
             idx.push_back(i == static_cast<size_t>(axis) ? start_idx[i] + static_cast<int>(j)
                                                          : start_idx[i]);
           }
-          auto sign = tvm::cast(DataType::UInt(32), data(idx) >= 0);
+          auto sign = tvm::cast(PrimType::UInt(32), data(idx) >= 0);
           packed = (packed | sign);
           if (j == 31) {
             return packed;
@@ -101,8 +101,8 @@ inline tvm::te::Tensor binarize_pack(const tvm::te::Tensor& data, int axis,
 inline tvm::te::Tensor binary_dense(const tvm::te::Tensor& data, const tvm::te::Tensor& weight) {
   TVM_FFI_ICHECK_EQ(data->shape.size(), 2) << "binary_dense requires 2-D data";
   TVM_FFI_ICHECK_EQ(weight->shape.size(), 2) << "binary_dense requires 2-D weight";
-  TVM_FFI_ICHECK_EQ(data->dtype, DataType::UInt(32)) << "binary_dense requires uint32 data";
-  TVM_FFI_ICHECK_EQ(weight->dtype, DataType::UInt(32)) << "binary_dense requires uint32 weight";
+  TVM_FFI_ICHECK_EQ(data->dtype, PrimType::UInt(32)) << "binary_dense requires uint32 data";
+  TVM_FFI_ICHECK_EQ(weight->dtype, PrimType::UInt(32)) << "binary_dense requires uint32 weight";
 
   auto batch = data->shape[0];
   auto in_dim = data->shape[1];
diff --git a/include/tvm/topi/nn/dense.h b/include/tvm/topi/nn/dense.h
index be0030cd40d5..2c7b2330505e 100644
--- a/include/tvm/topi/nn/dense.h
+++ b/include/tvm/topi/nn/dense.h
@@ -46,7 +46,7 @@ using namespace tvm::te;
  * \return Tensor with shape [batch, out_dim]
  */
 inline tvm::te::Tensor dense(const tvm::te::Tensor& data, const tvm::te::Tensor& weight,
-                             const tvm::te::Tensor& bias, const DataType& out_dtype) {
+                             const tvm::te::Tensor& bias, const PrimType& out_dtype) {
   TVM_FFI_ICHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data";
   TVM_FFI_ICHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight";
   if (bias.defined()) {
diff --git a/include/tvm/topi/nn/dilate.h b/include/tvm/topi/nn/dilate.h
index 0c8ea395c701..f45543eda337 100644
--- a/include/tvm/topi/nn/dilate.h
+++ b/include/tvm/topi/nn/dilate.h
@@ -95,7 +95,7 @@ inline Tensor dilate(const Tensor& x, ffi::Array<PrimExpr> strides, double dilat
         if (not_zero.size() > 0) {
           auto all_not_zero = all(not_zero);
           return tvm::if_then_else(all_not_zero, x(index_tuple),
-                                   MakeConst(x->dtype, dilation_value));
+                                   MakeConst(PrimType(x->dtype), dilation_value));
         }
         return x(index_tuple);
       },
diff --git a/include/tvm/topi/nn/group_norm.h b/include/tvm/topi/nn/group_norm.h
index 4962587a9396..7a778dea8ce5 100644
--- a/include/tvm/topi/nn/group_norm.h
+++ b/include/tvm/topi/nn/group_norm.h
@@ -45,9 +45,9 @@ inline Tensor group_norm(const Tensor& data, const Tensor& gamma, const Tensor&
   const auto& beta_type = beta.defined() ? beta->dtype : data_type;
   TVM_FFI_ICHECK(data_type == gamma_type && data_type == beta_type)
       << "group_norm: data, gamma and beta must have the same type";
-  TVM_FFI_ICHECK(data_type == DataType::Float(32) || data_type == DataType::Float(16))
+  TVM_FFI_ICHECK(data_type == PrimType::Float(32) || data_type == PrimType::Float(16))
       << "group_norm: only support float32 and float16 for now";
-  bool is_float16 = data_type == DataType::Float(16);
+  bool is_float16 = data_type == PrimType::Float(16);
   // reshape data C -> G, C/G
   int ndim = data->shape.size();
   channel_axis = GetRealAxis(static_cast<int>(ndim), ffi::Array<int64_t>({channel_axis}))[0];
@@ -65,7 +65,7 @@ inline Tensor group_norm(const Tensor& data, const Tensor& gamma, const Tensor&
   }
   Tensor data_reshaped;
   if (is_float16) {
-    data_reshaped = cast(reshape(data, new_shape), DataType::Float(32));
+    data_reshaped = cast(reshape(data, new_shape), PrimType::Float(32));
   } else {
     data_reshaped = reshape(data, new_shape);
   }
@@ -126,7 +126,7 @@ inline Tensor group_norm(const Tensor& data, const Tensor& gamma, const Tensor&
 
   auto temp_x = temp_x_x2[0];
   auto temp_x2 = temp_x_x2[1];
-  PrimExpr reduce_extent = FloatImm(DataType::Float(32), 1);
+  PrimExpr reduce_extent = FloatImm(PrimType::Float(32), 1);
   for (auto axis : new_axes) {
     reduce_extent *= data_reshaped->shape[axis];
   }
@@ -142,10 +142,12 @@ inline Tensor group_norm(const Tensor& data, const Tensor& gamma, const Tensor&
     gamma_indices = {indices[channel_axis], indices[channel_axis + 1]};
     auto mean = temp_x(non_reduce_indices) / reduce_extent;
     auto var = temp_x2(non_reduce_indices) / reduce_extent - mean * mean;
-    PrimExpr group_norm =
-        (data_reshaped(indices) - mean) * tvm::rsqrt(var + MakeConst(data->dtype, epsilon));
+    // Use var's own type for epsilon (as instance_norm and layer_norm do): with float16
+    // input the statistics are float32, and a float16 epsilon would lose precision.
+    PrimExpr group_norm =
+        (data_reshaped(indices) - mean) * tvm::rsqrt(var + MakeConst(var.ty(), epsilon));
     if (is_float16) {
-      group_norm = Cast(DataType::Float(16), group_norm);
+      group_norm = Cast(PrimType::Float(16), group_norm);
     }
     if (gamma.defined()) {
       group_norm = topi::multiply(group_norm, gamma_reshaped(gamma_indices));
diff --git a/include/tvm/topi/nn/instance_norm.h b/include/tvm/topi/nn/instance_norm.h
index 60361e8bc681..e246d97a59df 100644
--- a/include/tvm/topi/nn/instance_norm.h
+++ b/include/tvm/topi/nn/instance_norm.h
@@ -58,9 +58,9 @@ inline Tensor instance_norm(const Tensor& data, const Tensor& gamma, const Tenso
   const auto& beta_type = beta.defined() ? beta->dtype : data_type;
   TVM_FFI_ICHECK(data_type == gamma_type && data_type == beta_type)
       << "instance_norm: data, gamma and beta must have the same type";
-  TVM_FFI_ICHECK(data_type == DataType::Float(32) || data_type == DataType::Float(16))
+  TVM_FFI_ICHECK(data_type == PrimType::Float(32) || data_type == PrimType::Float(16))
       << "instance_norm: only support float32 and float16 for now";
-  bool is_float16 = data_type == DataType::Float(16);
+  bool is_float16 = data_type == PrimType::Float(16);
   // sum x and x^2
   auto ndim = data->shape.size();
   TVM_FFI_ICHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor";
@@ -69,9 +69,10 @@ inline Tensor instance_norm(const Tensor& data, const Tensor& gamma, const Tenso
   auto target_shape =
       MakeReduceTargetShape(real_axis, data, /*keepdims=*/false, /*atleast1d=*/true);
   auto func = MakeTupleSumReducer();
+  PrimType f32_ty = PrimType::Float(32);
 
-  auto compute = [ndim, is_float16, &real_axis, &reduce_axes, &func,
-                  &data](const ffi::Array<Var>& indices) {
+  auto compute = [ndim, is_float16, &real_axis, &reduce_axes, &func, &data,
+                  f32_ty](const ffi::Array<Var>& indices) {
     ffi::Array<PrimExpr> eval_range;
     int arg_counter = 0;
     int red_counter = 0;
@@ -86,15 +87,14 @@ inline Tensor instance_norm(const Tensor& data, const Tensor& gamma, const Tenso
         arg_counter++;
       }
     }
-    auto square = [is_float16](const PrimExpr& x) {
+    auto square = [is_float16, f32_ty](const PrimExpr& x) {
       if (is_float16) {
-        return Cast(DataType::Float(32), x) * Cast(DataType::Float(32), x);
+        return Cast(f32_ty, x) * Cast(f32_ty, x);
       }
       return x * x;
     };
     if (is_float16) {
-      return func({Cast(DataType::Float(32), data(eval_range)), square(data(eval_range))},
-                  reduce_axes, nullptr);
+      return func({Cast(f32_ty, data(eval_range)), square(data(eval_range))}, reduce_axes, nullptr);
     } else {
       return func({data(eval_range), square(data(eval_range))}, reduce_axes, nullptr);
     }
@@ -106,7 +106,7 @@ inline Tensor instance_norm(const Tensor& data, const Tensor& gamma, const Tenso
   auto temp_x = temp_x_x2[0];
   auto temp_x2 = temp_x_x2[1];
 
-  auto reduce_extent = MakeConst(data->dtype, 1);
+  auto reduce_extent = MakeConst(PrimType(data->dtype), 1);
   for (int i : real_axis) {
     reduce_extent *= data->shape[i];
   }
@@ -124,9 +124,9 @@ inline Tensor instance_norm(const Tensor& data, const Tensor& gamma, const Tenso
     channel = indices[channel_axis];
     auto mean = temp_x(non_reduce_indices) / reduce_extent;
     auto var = temp_x2(non_reduce_indices) / reduce_extent - mean * mean;
-    auto instance_norm = (data(indices) - mean) * tvm::rsqrt(var + MakeConst(var->dtype, epsilon));
+    auto instance_norm = (data(indices) - mean) * tvm::rsqrt(var + MakeConst(var.ty(), epsilon));
     if (is_float16) {
-      instance_norm = Cast(DataType::Float(16), instance_norm);
+      instance_norm = Cast(PrimType::Float(16), instance_norm);
     }
     instance_norm = topi::multiply(instance_norm, gamma(channel));
     if (beta.defined()) {
diff --git a/include/tvm/topi/nn/layer_norm.h b/include/tvm/topi/nn/layer_norm.h
index fb8155ef654a..8a995d7b91fe 100644
--- a/include/tvm/topi/nn/layer_norm.h
+++ b/include/tvm/topi/nn/layer_norm.h
@@ -57,9 +57,9 @@ inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor&
   const auto& beta_type = beta.defined() ? beta->dtype : data_type;
   TVM_FFI_ICHECK(data_type == gamma_type && data_type == beta_type)
       << "layer_norm: data, gamma and beta must have the same type";
-  TVM_FFI_ICHECK(data_type == DataType::Float(32) || data_type == DataType::Float(16))
+  TVM_FFI_ICHECK(data_type == PrimType::Float(32) || data_type == PrimType::Float(16))
       << "layer_norm: only support float32 and float16 for now";
-  bool is_float16 = data_type == DataType::Float(16);
+  bool is_float16 = data_type == PrimType::Float(16);
   // Two-pass algorithm for improved numerical stability:
   //   pass1: mean = E[x]
   //   pass2: var = E[(x - mean)^2]
@@ -69,6 +69,7 @@ inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor&
   auto reduce_axes = MakeReduceAxes(real_axis, data);
   auto target_shape =
       MakeReduceTargetShape(real_axis, data, /*keepdims=*/false, /*atleast1d=*/false);
+  PrimType f32_ty = PrimType::Float(32);
 
   auto make_eval_range = [&real_axis, &reduce_axes,
                           ndim](const ffi::Array<Var>& non_reduce_indices) {
@@ -91,17 +92,17 @@ inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor&
 
   Tensor temp_sum = te::compute(
       target_shape,
-      [is_float16, &data, &reduce_axes, &make_eval_range](const ffi::Array<Var>& indices) {
+      [is_float16, &data, &reduce_axes, &make_eval_range, f32_ty](const ffi::Array<Var>& indices) {
         auto eval_range = make_eval_range(indices);
         PrimExpr x = data(eval_range);
         if (is_float16) {
-          x = Cast(DataType::Float(32), x);
+          x = Cast(f32_ty, x);
         }
         return sum(x, reduce_axes);
       },
       data->op->name + "_sum", kCommReduce);
 
-  DataType reduce_dtype = is_float16 ? DataType::Float(32) : data->dtype;
+  PrimType reduce_dtype = is_float16 ? f32_ty : PrimType(data->dtype);  // reuse hoisted f32_ty
   PrimExpr reduce_extent = MakeConst(reduce_dtype, 1);
   for (int i : real_axis) {
     reduce_extent *= data->shape[i];
@@ -115,12 +116,12 @@ inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor&
 
   Tensor temp_var_sum = te::compute(
       target_shape,
-      [is_float16, &data, &reduce_axes, &make_eval_range,
-       &temp_mean](const ffi::Array<Var>& indices) {
+      [is_float16, &data, &reduce_axes, &make_eval_range, &temp_mean,
+       f32_ty](const ffi::Array<Var>& indices) {
         auto eval_range = make_eval_range(indices);
         PrimExpr x = data(eval_range);
         if (is_float16) {
-          x = Cast(DataType::Float(32), x);
+          x = Cast(f32_ty, x);
         }
         PrimExpr diff = x - temp_mean(indices);
         return sum(diff * diff, reduce_axes);
@@ -138,9 +139,9 @@ inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor&
     }
     auto mean = temp_mean(non_reduce_indices);
     auto var = temp_var_sum(non_reduce_indices) / reduce_extent;
-    auto layer_norm = (data(indices) - mean) * rsqrt(var + MakeConst(var->dtype, epsilon));
+    auto layer_norm = (data(indices) - mean) * rsqrt(var + MakeConst(var.ty(), epsilon));
     if (is_float16) {
-      layer_norm = Cast(DataType::Float(16), layer_norm);
+      layer_norm = Cast(PrimType::Float(16), layer_norm);
     }
     layer_norm = topi::multiply(layer_norm, gamma(reduce_indices));
     if (beta.defined()) {
diff --git a/include/tvm/topi/nn/local_response_norm.h b/include/tvm/topi/nn/local_response_norm.h
index 7407448f88c5..4f411076387d 100644
--- a/include/tvm/topi/nn/local_response_norm.h
+++ b/include/tvm/topi/nn/local_response_norm.h
@@ -55,7 +55,8 @@ inline Tensor lrn(const Tensor& data, int size, int axis = 1, float alpha = 0.00
   TVM_FFI_ICHECK_EQ(data->shape.size(), 4) << "LRN requires 4-D input";
   TVM_FFI_ICHECK_EQ(size % 2, 1) << "size should be odd number";
   TVM_FFI_ICHECK(axis == 1 || axis == 3) << "axis should be 1 or 3 for NCHW and NHWC";
-  TVM_FFI_ICHECK(data->dtype.is_float()) << "datatype should be float";
+  // LRN only requires a floating-point element kind; lane encoding is irrelevant here.
+  TVM_FFI_ICHECK_EQ(data->dtype.code(), DLDataTypeCode::kDLFloat) << "datatype should be float";
   auto input_shape = data->shape;
   ffi::Array<PrimExpr> pad_before{0, 0, 0, 0};
   ffi::Array<PrimExpr> pad_after{0, 0, 0, 0};
@@ -79,9 +80,9 @@ inline Tensor lrn(const Tensor& data, int size, int axis = 1, float alpha = 0.00
         },
         "tensor", "sqr_sum");
   }
-  PrimExpr alpha_imm = tvm::te::MakeConst(data->dtype, alpha);
-  PrimExpr beta_imm = tvm::te::MakeConst(data->dtype, beta);
-  PrimExpr bias_imm = tvm::te::MakeConst(data->dtype, bias);
+  PrimExpr alpha_imm = tvm::te::MakeConst(PrimType(data->dtype), alpha);
+  PrimExpr beta_imm = tvm::te::MakeConst(PrimType(data->dtype), beta);
+  PrimExpr bias_imm = tvm::te::MakeConst(PrimType(data->dtype), bias);
   auto sqrt_sum_up = tvm::te::compute(
       input_shape,
       [&](Var i, Var j, Var k, Var l) {
diff --git a/include/tvm/topi/nn/pooling.h b/include/tvm/topi/nn/pooling.h
index e8410d8add22..91b10e7d8df9 100644
--- a/include/tvm/topi/nn/pooling.h
+++ b/include/tvm/topi/nn/pooling.h
@@ -117,7 +117,8 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x,
         tvm::te::reduce_axis(Range(0, (kernel_width + stride_width - 1) / stride_width), "ww");
 
     auto argmax = MakeArgmaxReducer();
-    auto pad_x = do_pad ? pad(x, pad_before, pad_after, tvm::min_value(x->dtype), "pad_temp") : x;
+    auto pad_x =
+        do_pad ? pad(x, pad_before, pad_after, tvm::min_value(PrimType(x->dtype)), "pad_temp") : x;
 
     auto mp_argmax = tvm::te::compute(
         out_shape,
@@ -145,17 +146,17 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x,
           out_idx.Set(width_axis, (inds[width_axis] + pad_left) / stride_width - windoww);
 
           PrimExpr out_idx_lower_h = tirx::Select(
-              pad_inds[height_axis] < kernel_height, IntImm(pad_inds[height_axis].dtype(), 0),
+              pad_inds[height_axis] < kernel_height, IntImm(pad_inds[height_axis].ty(), 0),
               (pad_inds[height_axis] - kernel_height) / stride_height + 1);
           PrimExpr out_idx_lower_w = tirx::Select(
-              pad_inds[width_axis] < kernel_width, IntImm(pad_inds[width_axis].dtype(), 0),
+              pad_inds[width_axis] < kernel_width, IntImm(pad_inds[width_axis].ty(), 0),
               (pad_inds[width_axis] - kernel_width) / stride_width + 1);
 
           return tvm::sum(
               tvm::if_then_else(tirx::And(tirx::And(out_idx[height_axis] >= out_idx_lower_h,
                                                     out_idx[width_axis] >= out_idx_lower_w),
                                           mp_inds(out_idx) == idx),
-                                out_grad(out_idx), MakeConst(x->dtype, 0)),
+                                out_grad(out_idx), MakeConst(PrimType(x->dtype), 0)),
               {windowh, windoww});
         },
         "T_pool_grad", "pool_grad_max");
@@ -176,10 +177,10 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x,
           out_idx.Set(width_axis, (pad_w_idx / stride_width - windoww));
 
           PrimExpr out_idx_lower_h =
-              tirx::Select(pad_h_idx < kernel_height, IntImm(pad_h_idx.dtype(), 0),
+              tirx::Select(pad_h_idx < kernel_height, IntImm(pad_h_idx.ty(), 0),
                            (pad_h_idx - kernel_height) / stride_height + 1);
           PrimExpr out_idx_lower_w =
-              tirx::Select(pad_w_idx < kernel_width, IntImm(pad_w_idx.dtype(), 0),
+              tirx::Select(pad_w_idx < kernel_width, IntImm(pad_w_idx.ty(), 0),
                            (pad_w_idx - kernel_width) / stride_width + 1);
 
           PrimExpr divide_factor;  // number of pooled elements
@@ -191,16 +192,17 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x,
 
             PrimExpr h_end = min(h_start + kernel_height, height);
             PrimExpr w_end = min(w_start + kernel_width, width);
-            h_start = max(h_start, IntImm(h_start.dtype(), 0));
-            w_start = max(w_start, IntImm(w_start.dtype(), 0));
-            divide_factor = max((h_end - h_start) * (w_end - w_start), MakeConst(h_end.dtype(), 1));
+            h_start = max(h_start, IntImm(h_start.ty(), 0));
+            w_start = max(w_start, IntImm(w_start.ty(), 0));
+            divide_factor = max((h_end - h_start) * (w_end - w_start), MakeConst(h_end.ty(), 1));
           }
           return tvm::sum(
               tvm::if_then_else(tirx::And(tirx::And(out_idx[height_axis] >= out_idx_lower_h,
                                                     out_idx[height_axis] < out_height),
                                           tirx::And(out_idx[width_axis] >= out_idx_lower_w,
                                                     out_idx[width_axis] < out_width)),
-                                out_grad(out_idx) / divide_factor, MakeConst(out_grad->dtype, 0)),
+                                out_grad(out_idx) / divide_factor,
+                                MakeConst(PrimType(out_grad->dtype), 0)),
               {windowh, windoww});
         },
         "T_pool_grad", "pool_grad_avg");
@@ -384,9 +386,9 @@ inline Tensor adaptive_pool_impl(const Tensor& x, const ffi::Array<PrimExpr>& ou
           ffi::Array<tirx::IterVar> reduce_axes;
           std::tie(indices, reduce_axes) = get_iter_vars(output, false);
 
-          PrimExpr divide_factor = tvm::cast(x->dtype, 1);
+          PrimExpr divide_factor = tvm::cast(PrimType(x->dtype), 1);
           for (size_t i = 0; i < n_dim; ++i) {
-            divide_factor *= tvm::cast(DataType::Int(32), reduce_axes[i]->dom->extent);
+            divide_factor *= tvm::cast(PrimType::Int(32), reduce_axes[i]->dom->extent);
           }
 
           return div(pool_sum(indices), divide_factor);
@@ -582,7 +584,8 @@ inline Tensor pool_impl_nd(const Tensor& x, const ffi::Array<PrimExpr>& kernel_s
 
   ffi::Map<ffi::String, ffi::Any> attrs;
   if (pool_type == kMaxPool) {
-    auto temp = do_pad ? pad(x, pad_before, pad_after, tvm::min_value(x->dtype), "pad_temp") : x;
+    auto temp =
+        do_pad ? pad(x, pad_before, pad_after, tvm::min_value(PrimType(x->dtype)), "pad_temp") : x;
     attrs.Set("schedule_rule", tvm::ffi::String("meta_schedule.pool_max"));
     return tvm::te::compute(
         out_shape,
@@ -657,7 +660,7 @@ inline Tensor pool_impl_nd(const Tensor& x, const ffi::Array<PrimExpr>& kernel_s
               // number that represents the number of steps along the dilated kernel to reach a
               // non-padded value. Otherwise this should be 0.
               PrimExpr jumps_to_non_pad = (dilation[i] - 1 - start[i]) / dilation[i];
-              jumps_to_non_pad = max(jumps_to_non_pad, IntImm(jumps_to_non_pad.dtype(), 0));
+              jumps_to_non_pad = max(jumps_to_non_pad, IntImm(jumps_to_non_pad.ty(), 0));
 
               end[i] = min(end[i], data_shape[ii] - 1);
               num_el *= (end[i] - (start[i] + dilation[i] * jumps_to_non_pad)) / dilation[i] + 1;
diff --git a/include/tvm/topi/nn/rms_norm.h b/include/tvm/topi/nn/rms_norm.h
index 294d82054e3e..29f46918a754 100644
--- a/include/tvm/topi/nn/rms_norm.h
+++ b/include/tvm/topi/nn/rms_norm.h
@@ -54,8 +54,8 @@ inline Tensor rms_norm(const Tensor& data, const Tensor& weight, const ffi::Arra
   const auto& weight_type = weight.defined() ? weight->dtype : data_type;
   TVM_FFI_ICHECK(data_type == weight_type) << "rms_norm: data and weight must have the same type";
 
-  const auto& data_fp32 = cast(data, DataType::Float(32));
-  const auto& weight_fp32 = cast(weight, DataType::Float(32));
+  const auto& data_fp32 = cast(data, PrimType::Float(32));
+  const auto& weight_fp32 = cast(weight, PrimType::Float(32));
 
   auto square = multiply(data_fp32, data_fp32);
   auto square_sum = sum(square, axis, /*keepdims=*/false, /*atleast1d=*/true);
@@ -63,7 +63,7 @@ inline Tensor rms_norm(const Tensor& data, const Tensor& weight, const ffi::Arra
   auto ndim = data_fp32->shape.size();
   TVM_FFI_ICHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor";
   auto real_axis = GetRealAxis(static_cast<int>(ndim), axis);
-  auto reduce_extent = MakeConst(data_fp32->dtype, 1);
+  auto reduce_extent = MakeConst(PrimType(data_fp32->dtype), 1);
   for (int i : real_axis) {
     reduce_extent *= data_fp32->shape[i];
   }
@@ -74,8 +74,8 @@ inline Tensor rms_norm(const Tensor& data, const Tensor& weight, const ffi::Arra
         non_reduce_indices.push_back(indices[i]);
       }
     }
-    auto output =
-        tvm::rsqrt(square_sum(non_reduce_indices) / reduce_extent + MakeConst(data_type, epsilon));
+    auto output = tvm::rsqrt(square_sum(non_reduce_indices) / reduce_extent +
+                             MakeConst(PrimType(data_type), epsilon));
     return output;
   };
   auto rsqrt_shape = ffi::Array<PrimExpr>();
diff --git a/include/tvm/topi/reduction.h b/include/tvm/topi/reduction.h
index e6b4c5af1dea..fbea4a57eabf 100644
--- a/include/tvm/topi/reduction.h
+++ b/include/tvm/topi/reduction.h
@@ -259,7 +259,7 @@ inline Tensor CommReduceIdx(const Tensor& data, const ffi::Optional<ffi::Array<i
 using FCombine = std::function<ffi::Array<PrimExpr>(ffi::Array<Var> lhs, ffi::Array<Var> rhs)>;
 
 /*! \brief An initializer function for a reduction */
-using FIdentity = std::function<ffi::Array<PrimExpr>(std::vector<DataType> types)>;
+using FIdentity = std::function<ffi::Array<PrimExpr>(std::vector<PrimType> types)>;
 
 /*!
  * \brief Create a commutative reducer for a reduction
@@ -275,10 +275,10 @@ inline FCommReduce MakeCommReducer(FCombine fcombine, FIdentity fidentity,
   return [fcombine, fidentity, name](ffi::Array<PrimExpr> exprs, const ffi::Array<IterVar>& axis,
                                      PrimExpr* condition) {
     ffi::Array<Var> lhs, rhs;
-    std::vector<DataType> dtypes;
+    std::vector<PrimType> dtypes;
 
     for (size_t i = 0; i < exprs.size(); ++i) {
-      auto dtype = exprs[i].dtype();
+      PrimType dtype = exprs[i].ty();
       dtypes.push_back(dtype);
       lhs.push_back(var(name + "_lhs_" + std::to_string(i), dtype));
       rhs.push_back(var(name + "_rhs_" + std::to_string(i), dtype));
@@ -330,7 +330,8 @@ inline PrimExpr ProdOp(PrimExpr source, ffi::Array<IterVar> axis, ffi::Array<Pri
  */
 inline Tensor sum(const Tensor& data, const ffi::Optional<ffi::Array<int64_t>>& axis,
                   bool keepdims = false, bool atleast1d = false) {
-  if (data->dtype.is_bool()) {
+  // Reduction dispatch only depends on boolean element kind; lane encoding is irrelevant here.
+  if (data->dtype.code() == DLDataTypeCode::kDLBool) {
     return CommReduce(data, axis, tvm::any, keepdims, atleast1d);
   } else {
     return CommReduce(data, axis, tvm::sum, keepdims, atleast1d);
@@ -477,7 +478,7 @@ inline FCommReduce MakeArgminReducer(bool select_last_index = false) {
     result.push_back(tvm::tirx::Select(is_smaller, lhs[1], rhs[1]));    // val
     return result;
   };
-  auto fidentity = [&](std::vector<DataType> types) {
+  auto fidentity = [&](std::vector<PrimType> types) {
     ffi::Array<PrimExpr> result;
     result.push_back(tvm::tirx::MakeConst(types[0], -1));  // idx
     result.push_back(tvm::max_value(types[1]));            // val
@@ -539,7 +540,7 @@ inline FCommReduce MakeArgmaxReducer(bool select_last_index = false) {
     result.push_back(tvm::tirx::Select(is_bigger, lhs[1], rhs[1]));     // val
     return result;
   };
-  auto fidentity = [&](std::vector<DataType> types) {
+  auto fidentity = [&](std::vector<PrimType> types) {
     ffi::Array<PrimExpr> result;
     result.push_back(tvm::tirx::MakeConst(types[0], -1));  // idx
     result.push_back(tvm::min_value(types[1]));            // val
@@ -601,7 +602,7 @@ inline FCommReduce MakeTupleSumReducer() {
     }
     return result;
   };
-  auto fidentity = [](std::vector<DataType> types) {
+  auto fidentity = [](std::vector<PrimType> types) {
     ffi::Array<PrimExpr> result;
     for (size_t i = 0; i < types.size(); ++i) {
       result.push_back(tvm::tirx::MakeConst(types[i], 0));
diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index e216cf86ced4..f2ede7af8aa0 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -44,8 +44,8 @@
 #include <utility>
 #include <vector>
 
+#include "tvm/ffi/dtype.h"
 #include "tvm/ir/expr.h"
-#include "tvm/runtime/data_type.h"
 #include "tvm/tirx/expr.h"
 #include "tvm/tirx/op.h"
 #include "tvm/tirx/var.h"
@@ -338,7 +338,8 @@ inline Tensor reshape(const Tensor& x, ffi::Array<PrimExpr> newshape,
   // If either the input shape or the target shape contains a zero, return an empty tensor.
   if (is_empty_shape(target_shape) || is_empty_shape(x->shape)) {
     return compute(
-        target_shape, [&](const ffi::Array<Var>& indices) { return tvm::cast(x->dtype, 0); }, name,
+        target_shape,
+        [&](const ffi::Array<Var>& indices) { return tvm::cast(PrimType(x->dtype), 0); }, name,
         tag);
   } else {
     return compute(
@@ -679,7 +680,7 @@ inline PrimExpr CanonicalizeIndex(PrimExpr index, PrimExpr extent, PrimExpr stri
   if (index->IsInstance<tvm::IntImmNode>() && extent->IsInstance<tvm::IntImmNode>() &&
       stride->IsInstance<tvm::IntImmNode>()) {
     return tvm::IntImm(
-        tvm::DataType::Int(64),
+        tvm::PrimType::Int(64),
         StaticCanonicalizeIndex(GetConstInt(index), GetConstInt(extent), GetConstInt(stride)));
   }
   return DynamicCanonicalizeIndex(index, extent, stride);
@@ -835,14 +836,14 @@ inline te::Tensor dynamic_strided_slice(const te::Tensor& x, const te::Tensor& b
                                         bool assume_inbound = true,
                                         std::string name = "T_strided_slice_dynamic",
                                         std::string tag = topi::kInjective) {
-  DataType index_dtype = begin->shape[0]->dtype;
+  PrimType index_ty = begin->shape[0].ty();
   const int64_t num_dynamic_axes = begin->shape[0].as<IntImmNode>()->value;
   TVM_FFI_ICHECK_EQ(end->shape[0].as<IntImmNode>()->value, num_dynamic_axes);
   TVM_FFI_ICHECK_EQ(strides->shape[0].as<IntImmNode>()->value, num_dynamic_axes);
 
   ffi::Array<PrimExpr> begin_expr, end_expr, strides_expr;
   for (int64_t i = 0; i < num_dynamic_axes; ++i) {
-    auto ind = MakeConst(index_dtype, i);
+    auto ind = MakeConst(index_ty, i);
     begin_expr.push_back(begin(ind));
     end_expr.push_back(end(ind));
     strides_expr.push_back(strides(ind));
@@ -874,10 +875,10 @@ inline ffi::Array<PrimExpr> StridedSliceOutputShape(const ffi::Array<PrimExpr>&
                  axes.size() == strides.size());
   std::vector<int64_t> begin_vec, end_vec, strides_vec;
   std::tie(begin_vec, end_vec, strides_vec) = ConvertToVec(begin, end, strides, slice_mode);
-  DataType index_dtype =
-      (begin.size() > 0 && begin[0].defined()) ? begin[0].value()->dtype : DataType::Int(64);
+  PrimType index_ty =
+      (begin.size() > 0 && begin[0].defined()) ? begin[0].value().ty() : PrimType::Int(64);
   auto begin_canonicalized =
-      StridedSliceCanonicalizeBegin(ishape, begin_vec, strides_vec, axes, index_dtype, slice_mode);
+      StridedSliceCanonicalizeBegin(ishape, begin_vec, strides_vec, axes, index_ty, slice_mode);
   return StridedSliceOutputShape(ishape, begin_vec, end_vec, strides_vec, axes, slice_mode,
                                  begin_canonicalized, true);
 }
@@ -924,10 +925,10 @@ inline Tensor strided_slice_with_axes(
   std::vector<int64_t> begin_vec, end_vec, strides_vec;
   std::tie(begin_vec, end_vec, strides_vec) = ConvertToVec(begin, end, strides, slice_mode);
 
-  DataType index_dtype =
-      (begin.size() > 0 && begin[0].defined()) ? begin[0].value()->dtype : DataType::Int(64);
+  PrimType index_ty =
+      (begin.size() > 0 && begin[0].defined()) ? begin[0].value().ty() : PrimType::Int(64);
   auto begin_expr = StridedSliceCanonicalizeBegin(x->shape, begin_vec, strides_vec, normalized_axes,
-                                                  index_dtype, slice_mode);
+                                                  index_ty, slice_mode);
   auto out_shape = StridedSliceOutputShape(x->shape, begin_vec, end_vec, strides_vec,
                                            normalized_axes, slice_mode, begin_expr);
 
@@ -938,7 +939,7 @@ inline Tensor strided_slice_with_axes(
         for (size_t i = 0; i < out_shape.size(); ++i) real_indices.push_back(indices[i]);
         for (size_t i = 0; i < normalized_axes.size(); ++i) {
           int64_t ax = normalized_axes[i];
-          auto stride = MakeConst(strides[i]->dtype, strides_vec[i]);
+          auto stride = MakeConst(strides[i]->ty(), strides_vec[i]);
           PrimExpr ind = indices[ax] * stride + begin_expr[i];
           real_indices.Set(ax, ind);
         }
@@ -972,11 +973,11 @@ inline Tensor strided_slice(const Tensor& x, const ffi::Array<ffi::Optional<IntI
   ffi::Array<ffi::Optional<IntImm>> end_full(end);
   ffi::Array<IntImm> strides_full(strides);
 
-  DataType index_dtype =
-      (begin.size() > 0 && begin[0].defined()) ? begin[0].value()->dtype : DataType::Int(64);
-  const IntImm one = IntImm(index_dtype, 1);
-  const IntImm zero = IntImm(index_dtype, 0);
-  const IntImm max_range = max_value(index_dtype).as_or_throw<IntImm>();
+  PrimType index_ty =
+      (begin.size() > 0 && begin[0].defined()) ? begin[0].value().ty() : PrimType::Int(64);
+  const IntImm one = IntImm(index_ty, 1);
+  const IntImm zero = IntImm(index_ty, 0);
+  const IntImm max_range = max_value(index_ty).as_or_throw<IntImm>();
 
   for (size_t i = strides.size(); i < src_tensor_dim; ++i) {
     strides_full.push_back(one);
@@ -1073,7 +1074,8 @@ inline Tensor take(const Tensor& a, const Tensor& indices, int batch_dims,
         [&](const ffi::Array<Var>& out_index) {
           auto idx = tvm::if_then_else(
               indices(out_index) < 0 || indices(out_index) >= a_size,
-              tvm::FloatImm(a->dtype, std::numeric_limits<float>::quiet_NaN()), indices(out_index));
+              tvm::FloatImm(tvm::PrimType(a->dtype), std::numeric_limits<float>::quiet_NaN()),
+              indices(out_index));
           return a(UnravelIndex(idx, a_shape));
         },
         name, tag);
@@ -1116,9 +1118,9 @@ inline Tensor sequence_mask(const Tensor& data, const Tensor& valid_length, doub
         auto tid = out_index[axis];
         auto bid = out_index[1 - axis];
         len_index.push_back(bid);
-        PrimExpr ret =
-            tvm::if_then_else(tvm::cast(valid_length->dtype, tid) >= valid_length(len_index),
-                              tvm::tirx::MakeConst(data->dtype, mask_value), data(out_index));
+        PrimExpr ret = tvm::if_then_else(
+            tvm::cast(PrimType(valid_length->dtype), tid) >= valid_length(len_index),
+            tvm::tirx::MakeConst(PrimType(data->dtype), mask_value), data(out_index));
         return ret;
       },
       name, tag);
@@ -1293,7 +1295,7 @@ inline Tensor take(const Tensor& a, ffi::Variant<Tensor, PrimExpr> indices, int
           PrimExpr in_bounds = idx >= 0 && idx < axis_dim;
           return tvm::if_then_else(
               in_bounds, a(real_indices),
-              tvm::tirx::MakeConst(a->dtype, std::numeric_limits<float>::quiet_NaN()));
+              tvm::tirx::MakeConst(PrimType(a->dtype), std::numeric_limits<float>::quiet_NaN()));
         },
         name, tag);
   } else {  // mode == "wrap"
@@ -1443,8 +1445,8 @@ inline Tensor tile(const Tensor& x, ffi::Array<int64_t> reps, std::string name =
 
   if (is_empty_shape(new_shape)) {
     return compute(
-        new_shape, [&](const ffi::Array<Var>& indices) { return tvm::cast(x->dtype, 0); }, name,
-        tag);
+        new_shape, [&](const ffi::Array<Var>& indices) { return tvm::cast(PrimType(x->dtype), 0); },
+        name, tag);
   } else {
     return compute(
         new_shape,
@@ -1478,8 +1480,8 @@ inline Tensor dyn_tile(const Tensor& x, ffi::Array<PrimExpr> new_shape, size_t r
   size_t ndim = x->shape.size();
   if (is_empty_shape(new_shape)) {
     return compute(
-        new_shape, [&](const ffi::Array<Var>& indices) { return tvm::cast(x->dtype, 0); }, name,
-        tag);
+        new_shape, [&](const ffi::Array<Var>& indices) { return tvm::cast(PrimType(x->dtype), 0); },
+        name, tag);
   } else {
     return compute(
         new_shape,
@@ -1526,7 +1528,9 @@ inline Tensor gather(const Tensor& data, int axis, const Tensor& indices,
     size_t indices_dim_i = static_cast<size_t>(GetConstInt(indices->shape[axis]));
     TVM_FFI_ICHECK_GE(indices_dim_i, 1);
   }
-  TVM_FFI_ICHECK(indices->dtype.is_int() || indices->dtype.is_uint());
+  // Index tensors are validated by integer element kind; vector lane encoding is irrelevant here.
+  PrimType indices_ty = indices->dtype;
+  TVM_FFI_ICHECK(indices_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt));
 
   ffi::Array<PrimExpr> out_shape;
   for (size_t i = 0; i < ndim_i; ++i) {
@@ -1593,10 +1597,13 @@ inline Tensor gather_nd(const Tensor& data, const Tensor& indices, int batch_dim
         }
         for (size_t i = 0; i < indices_dim0; ++i) {
           indices_position.Set(0, IntImm::Int32(i));
-          if (indices->dtype.is_int() || indices->dtype.is_uint()) {
+          // Index tensors are validated by integer element kind; vector lane encoding is
+          // irrelevant for choosing whether an index cast is needed.
+          PrimType indices_ty = indices->dtype;
+          if (indices_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
             real_indices.push_back(indices(indices_position));
           } else {
-            real_indices.push_back(tvm::cast(tvm::DataType::Int(32), indices(indices_position)));
+            real_indices.push_back(tvm::cast(tvm::PrimType::Int(32), indices(indices_position)));
           }
         }
         if (real_indices.size() == ndim_d) {
@@ -1740,10 +1747,15 @@ inline Tensor tensordot(const Tensor& A, const tvm::te::Tensor& B, ffi::Array<Pr
 }
 
 inline Tensor arange(const PrimExpr& start, const PrimExpr& stop, const PrimExpr& step,
-                     DataType dtype, std::string name = "T_arange", std::string tag = kInjective) {
+                     PrimType dtype, std::string name = "T_arange", std::string tag = kInjective) {
   arith::Analyzer analyzer;
   PrimExpr num_elem;
-  bool is_all_int = start.dtype().is_int() && stop.dtype().is_int() && step.dtype().is_int();
+  PrimType start_ty = start.ty();
+  PrimType stop_ty = stop.ty();
+  PrimType step_ty = step.ty();
+  bool is_all_int = start_ty.code() == DLDataTypeCode::kDLInt &&
+                    stop_ty.code() == DLDataTypeCode::kDLInt &&
+                    step_ty.code() == DLDataTypeCode::kDLInt;
   if (is_all_int && analyzer->CanProveGreaterEqual(step, 1)) {
     // fast path for integer arange when step is positive
     num_elem = tvm::floordiv((stop - start + step - 1), step);
@@ -1752,8 +1764,8 @@ inline Tensor arange(const PrimExpr& start, const PrimExpr& stop, const PrimExpr
     num_elem = tvm::floordiv((start - stop - step - 1), -step);
   } else {
     // fallback path for non-integer or step of unknown sign
-    num_elem = tvm::cast(DefaultIndexType(),
-                         tvm::ceil(tvm::cast(tvm::DataType::Float(32), stop - start) / step));
+    num_elem = tvm::cast(PrimType(DefaultIndexType()),
+                         tvm::ceil(tvm::cast(tvm::PrimType::Float(32), stop - start) / step));
   }
   num_elem = analyzer->Simplify(num_elem);
 
@@ -1845,7 +1857,8 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout,
         for (size_t i = 0; i < src.ndim(); ++i) {
           in_range = in_range && (src_indices[i] < src->shape[i]);
         }
-        return if_then_else(in_range, src(src_indices), tvm::cast(src->dtype, PrimExpr(0)));
+        return if_then_else(in_range, src(src_indices),
+                            tvm::cast(PrimType(src->dtype), PrimExpr(0)));
       },
       name, tag, attrs);
 }
@@ -1960,7 +1973,7 @@ inline Tensor meta_schedule_layout_transform(
   ffi::Array<Range> iter_domain;
   iter_domain.reserve(src->shape.size());
   for (const PrimExpr& e : src->shape) {
-    iter_domain.push_back(Range::FromMinExtent(IntImm(e->dtype, 0), e));
+    iter_domain.push_back(Range::FromMinExtent(IntImm(e.ty(), 0), e));
   }
   ffi::Array<PrimExpr> post_transform_shape = index_map->MapShape(src->shape, analyzer);
   return compute(
@@ -1980,7 +1993,7 @@ inline Tensor meta_schedule_layout_transform(
  * \param tag output tensor tag.
  * \return Tensor of input shape.
  */
-inline Tensor shape(const Tensor& src, DataType dtype, const std::string name = "T_shape",
+inline Tensor shape(const Tensor& src, PrimType dtype, const std::string name = "T_shape",
                     const std::string tag = kInjective) {
   int ndim = static_cast<int>(src->shape.size());
   ffi::Array<PrimExpr> out_shape{ndim};
@@ -1997,6 +2010,11 @@ inline Tensor shape(const Tensor& src, DataType dtype, const std::string name =
       name, tag);
 }
 
+inline Tensor shape(const Tensor& src, DLDataType dtype, const std::string name = "T_shape",
+                    const std::string tag = kInjective) {
+  return shape(src, PrimType(dtype), name, tag);  // forwards to the PrimType overload
+}
+
 /*!
  * \brief Get the size of input tensor.
  * \param src the input tensor.
@@ -2005,7 +2023,7 @@ inline Tensor shape(const Tensor& src, DataType dtype, const std::string name =
  * \param tag output tensor tag.
  * \return Tensor of input shape.
  */
-inline te::Tensor tensor_size(const te::Tensor& src, const DataType& dtype,
+inline te::Tensor tensor_size(const te::Tensor& src, PrimType dtype,
                               const std::string& name = "tensor_size",
                               const std::string& tag = kInjective) {
   int ndim = static_cast<int>(src->shape.size());
@@ -2022,6 +2040,12 @@ inline te::Tensor tensor_size(const te::Tensor& src, const DataType& dtype,
       name, tag);
 }
 
+inline te::Tensor tensor_size(const te::Tensor& src, DLDataType dtype,
+                              const std::string& name = "tensor_size",
+                              const std::string& tag = kInjective) {
+  return tensor_size(src, PrimType(dtype), name, tag);  // forwards to the PrimType overload
+}
+
 /*!
  * \brief Returns a one-hot tensor where the locations repsented by indices take value on_value,
     other locations take value off_value.
@@ -2037,7 +2061,7 @@ inline te::Tensor tensor_size(const te::Tensor& src, const DataType& dtype,
  * \return one-hot tensor.
  */
 inline Tensor one_hot(const Tensor& indices, const PrimExpr on_value, const PrimExpr off_value,
-                      int depth, int axis, const DataType& dtype,
+                      int depth, int axis, PrimType dtype,
                       ffi::Array<PrimExpr> oshape = ffi::Array<PrimExpr>(),
                       const std::string name = "T_one_hot", const std::string tag = kInjective) {
   int true_axis = (axis == -1) ? indices->shape.size() : axis;
@@ -2073,6 +2097,14 @@ inline Tensor one_hot(const Tensor& indices, const PrimExpr on_value, const Prim
       name, tag);
 }
 
+inline Tensor one_hot(const Tensor& indices, const PrimExpr on_value, const PrimExpr off_value,
+                      int depth, int axis, DLDataType dtype,
+                      ffi::Array<PrimExpr> oshape = ffi::Array<PrimExpr>(),
+                      const std::string name = "T_one_hot", const std::string tag = kInjective) {
+  return one_hot(indices, on_value, off_value, depth, axis, PrimType(dtype), std::move(oshape),
+                 name, tag);  // forwards to the PrimType overload
+}
+
 /*!
  * \brief Get a dense tensor.
  * \param sparse_indices sparse_indices[i] contains sparse_values[i] will be placed.
@@ -2088,7 +2120,9 @@ inline Tensor sparse_to_dense(const Tensor& sparse_indices,
                               const PrimExpr& default_value,
                               const std::string name = "T_sparse_to_dense",
                               const std::string tag = kInjective) {
-  TVM_FFI_ICHECK(sparse_indices->dtype.is_int()) << "sparse_indices only accepts integer values";
+  // Sparse indices are validated by signed integer element kind; lane encoding is irrelevant here.
+  TVM_FFI_ICHECK_EQ(sparse_indices->dtype.code(), DLDataTypeCode::kDLInt)
+      << "sparse_indices only accepts integer values";
   TVM_FFI_ICHECK_LE(sparse_indices->shape.size(), 3)
       << "sparse_indices tensor should be 0D, 1D, or 2D only";
   TVM_FFI_ICHECK_LE(sparse_values->shape.size(), 2)
diff --git a/python/tvm/ir/expr.py b/python/tvm/ir/expr.py
index 4fbebeddd0f5..dd463150fd51 100644
--- a/python/tvm/ir/expr.py
+++ b/python/tvm/ir/expr.py
@@ -43,7 +43,18 @@ class PrimExpr(BaseExpr):
     optimizations and integer analysis.
     """
 
-    dtype: str
+    @property
+    def dtype(self):
+        """Legacy accessor that reports the runtime dtype of this expression.
+
+        Returns ``None`` when ``ty`` is unset, ``ty.dtype`` when the type
+        exposes one, and ``"handle"`` otherwise.  Prefer ``expr.ty`` in new code.
+        """
+        ty = self.ty
+        if ty is None:
+            return None
+        # Types without a ``dtype`` attribute are reported as "handle".
+        return getattr(ty, "dtype", "handle")
 
 
 @tvm_ffi.register_object("ir.RelaxExpr")
diff --git a/python/tvm/ir/type.py b/python/tvm/ir/type.py
index 567ebafa2d5c..96548439d70e 100644
--- a/python/tvm/ir/type.py
+++ b/python/tvm/ir/type.py
@@ -53,6 +53,35 @@ class PrimType(Type):
     def __init__(self, dtype):
         self.__init_handle_by_constructor__(_ffi_api.PrimType, dtype)
 
+    def __eq__(self, other):
+        # Strings compare against the dtype; everything else defers to the base class.
+        is_dtype_name = isinstance(other, str)
+        return self.dtype == other if is_dtype_name else super().__eq__(other)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)  # negation of __eq__, so str operands are handled too
+
+    def __hash__(self):
+        # Hash the dtype string so the hash agrees with the str branch of __eq__.
+        return hash(str(self.dtype))
+
+    def __str__(self):
+        return str(self.dtype)  # printed form is the string form of the dtype
+
+    def matches_code(self, *codes) -> bool:
+        """Tell whether this type's DLPack dtype code equals one of ``codes``."""
+        own_code = self.dtype.type_code
+        return any(own_code == int(candidate) for candidate in codes)
+
+    def matches_element_type(self, code, bits: int) -> bool:
+        """Check that the element code and bit width both equal the arguments."""
+        elem = self.dtype
+        return (elem.type_code, elem.bits) == (int(code), bits)
+
+    def is_scalar(self) -> bool:
+        """Return True when ``self.dtype.lanes`` equals 1, i.e. a single fixed lane."""
+        return self.dtype.lanes == 1
+
 
 @tvm_ffi.register_object("ir.PointerType")
 class PointerType(Type):
diff --git a/python/tvm/relax/frontend/nn/extern.py b/python/tvm/relax/frontend/nn/extern.py
index 9c8efce690f1..6c7f3dc72c9f 100644
--- a/python/tvm/relax/frontend/nn/extern.py
+++ b/python/tvm/relax/frontend/nn/extern.py
@@ -145,7 +145,7 @@ def shape_dtype_inference(a, b):
 
         // those headers are guaranteed to be available
         #include <dlpack/dlpack.h>
-        #include <tvm/runtime/data_type.h>
+        #include <tvm/ffi/dtype.h>
         #include <tvm/ffi/function.h>
 
         namespace {
diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
index f987f48d4251..b9ab88da0b43 100644
--- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
+++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
@@ -29,6 +29,7 @@
 import tvm_ffi
 
 from tvm import relax, tirx
+from tvm.runtime import DataTypeCode
 
 
 class BaseFXGraphImporter(metaclass=abc.ABCMeta):
@@ -566,7 +567,7 @@ def _pow(self, node: fx.Node) -> relax.Var:
         if (
             isinstance(lhs, relax.Expr)
             and isinstance(lhs.ty, relax.TensorType)
-            and "int" in lhs.ty.dtype
+            and lhs.ty.dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT)
             and isinstance(rhs, int)
             and not isinstance(rhs, bool)
             and rhs >= 0
@@ -1607,7 +1608,7 @@ def transpose_and_reshape_back(tensor):
         if attn_mask is not None:
             attn_mask = self.env[attn_mask]
             msg = "Only a float mask is supported for the attn_mask input."
-            assert "float" in attn_mask.ty.dtype, msg
+            assert attn_mask.ty.dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT), msg
 
         attention_output = self.block_builder.emit(
             relax.op.nn.attention(query, key, value, bias=attn_mask, causal_mask=causal_mask)
diff --git a/python/tvm/relax/op/_op_gradient.py b/python/tvm/relax/op/_op_gradient.py
index 5aae26b75f20..809a2c19ee9e 100644
--- a/python/tvm/relax/op/_op_gradient.py
+++ b/python/tvm/relax/op/_op_gradient.py
@@ -22,6 +22,7 @@
 
 from tvm import relax
 from tvm.arith import Analyzer
+from tvm.ir import PrimType
 from tvm.relax.type import ShapeType
 
 from ...tirx import PrimExpr
@@ -81,6 +82,8 @@ def _get_dtype(expr: Expr) -> str:
         raise RuntimeError(
             f"Get the dtype of {expr} failed. Please normalize it first and ensure it is a Tensor."
         ) from error
+    if isinstance(dtype, PrimType):
+        dtype = dtype.dtype
     return dtype
 
 
diff --git a/python/tvm/relax/op/create.py b/python/tvm/relax/op/create.py
index 9d28ed92f9c5..1bbeeee8f272 100644
--- a/python/tvm/relax/op/create.py
+++ b/python/tvm/relax/op/create.py
@@ -17,6 +17,7 @@
 """Creation operators."""
 
 from tvm import DataType, DataTypeCode
+from tvm.ir import PrimType
 from tvm.ir.expr import PrimExpr
 
 from ..expr import Expr, PrimValue, ShapeExpr
@@ -267,7 +268,12 @@ def is_int(expr):
             return True
         if isinstance(expr, PrimValue):
             expr = expr.value
-        return isinstance(expr, PrimExpr) and DataType(expr.dtype).type_code == DataTypeCode.INT  # type: ignore
+        if isinstance(expr, PrimExpr):
+            dtype = expr.dtype  # type: ignore
+            if isinstance(dtype, PrimType):
+                dtype = dtype.dtype
+            return DataType(dtype).type_code == DataTypeCode.INT
+        return False
 
     if dtype is None:
         args = (start, end, step)
diff --git a/python/tvm/relax/op/manipulate.py b/python/tvm/relax/op/manipulate.py
index 4b787c265bc3..43a2bd400351 100644
--- a/python/tvm/relax/op/manipulate.py
+++ b/python/tvm/relax/op/manipulate.py
@@ -19,6 +19,7 @@
 from collections.abc import Callable
 
 from tvm.ir.expr import PrimExpr
+from tvm.runtime import DataTypeCode
 from tvm.tirx import FloatImm, IndexMap, IntImm
 
 from ..expr import Expr, PrimValue, ShapeExpr
@@ -151,10 +152,12 @@ def layout_transform(
     if pad_value is None:
         pass
     elif not isinstance(pad_value, PrimValue):
-        if "int" in x_dtype and isinstance(pad_value, int):
-            pad_value = IntImm(x_dtype, pad_value)
-        elif "float" in x_dtype and (isinstance(pad_value, int | float)):
-            pad_value = FloatImm(x_dtype, float(pad_value))
+        if x_dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT) and isinstance(pad_value, int):
+            pad_value = IntImm(x_dtype.dtype, pad_value)
+        elif x_dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT) and (
+            isinstance(pad_value, int | float)
+        ):
+            pad_value = FloatImm(x_dtype.dtype, float(pad_value))
         pad_value = PrimValue(pad_value)
 
     if axis_separators is None:
diff --git a/python/tvm/relax/transform/legalize_ops/common.py b/python/tvm/relax/transform/legalize_ops/common.py
index 1b7d1179a521..f464c248e363 100644
--- a/python/tvm/relax/transform/legalize_ops/common.py
+++ b/python/tvm/relax/transform/legalize_ops/common.py
@@ -20,6 +20,7 @@
 
 import tvm
 from tvm import te
+from tvm.runtime import DataTypeCode
 from tvm.tirx import FloatImm, IntImm
 
 from ...block_builder import BlockBuilder
@@ -38,9 +39,6 @@
 LegalizeFunc = Callable[[BlockBuilder, Call], Expr]
 
 
-##################### Utilities #####################
-
-
 def _try_convert_to_scalar_const(
     expr: Expr, python_native: bool = False
 ) -> Expr | FloatImm | IntImm | bool | float | int:
@@ -69,13 +67,14 @@ def _try_convert_to_scalar_const(
         # get the value of the scalar constant
         value = expr.data.numpy()[()].item()
         dtype = expr.ty.dtype
+        dtype_str = str(dtype.dtype)
         if python_native:
             return value
         # preserve the data type of the constant
-        if dtype.startswith("float"):
-            return tvm.tirx.FloatImm(dtype, value)
-        elif dtype.startswith("int") or dtype.startswith("uint") or dtype.startswith("bool"):
-            return tvm.tirx.IntImm(dtype, value)
+        if dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT):
+            return tvm.tirx.FloatImm(dtype_str, value)
+        elif dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT, DataTypeCode.BOOL):
+            return tvm.tirx.IntImm(dtype_str, value)
     return expr
 
 
diff --git a/python/tvm/relax/transform/legalize_ops/manipulate.py b/python/tvm/relax/transform/legalize_ops/manipulate.py
index f0cc8977d4ef..a59b1f9fe52e 100644
--- a/python/tvm/relax/transform/legalize_ops/manipulate.py
+++ b/python/tvm/relax/transform/legalize_ops/manipulate.py
@@ -19,7 +19,7 @@
 """Default legalization function for manipulate operators."""
 
 import tvm
-from tvm import relax, s_tir, te, tirx, topi
+from tvm import DataTypeCode, relax, s_tir, te, tirx, topi
 from tvm.relax.op.base import call_tir
 from tvm.relax.type import TensorType
 from tvm.relax.utils import gen_call_tir_inputs
@@ -337,7 +337,7 @@ def set_axis_sep(axis_sep: list, sch: s_tir.schedule, buffer_type: str):
     if pad_value is not None:
         pad_value = pad_value.value
     else:
-        if "int" in call.args[0].ty.dtype:
+        if call.args[0].ty.dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT):
             pad_value = 0
         else:
             pad_value = 0.0
diff --git a/python/tvm/relax/transform/legalize_ops/qdq.py b/python/tvm/relax/transform/legalize_ops/qdq.py
index aa86f6fca2c3..7a825e300e40 100644
--- a/python/tvm/relax/transform/legalize_ops/qdq.py
+++ b/python/tvm/relax/transform/legalize_ops/qdq.py
@@ -19,6 +19,7 @@
 
 import tvm
 from tvm import te, tirx
+from tvm.runtime import DataTypeCode
 
 from ...block_builder import BlockBuilder
 from ...expr import Call, Expr
@@ -140,7 +141,11 @@ def dequantize_compute(*indices):
                 zp_value = zp[(0,) * len(zp.shape)]
             else:
                 zp_value = zp[indices[axis]]
-            dtype = "float32" if "float" in data.dtype else "int32"
+            dtype = (
+                "float32"
+                if data.dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT)
+                else "int32"
+            )
             sub = te.subtract(data[indices].astype(dtype), zp_value)
             out = te.multiply(sub, scale_value.astype("float32"))
             if out_dtype == "float32":
diff --git a/python/tvm/relax/type.py b/python/tvm/relax/type.py
index ad8f469826ef..305f01750306 100644
--- a/python/tvm/relax/type.py
+++ b/python/tvm/relax/type.py
@@ -21,7 +21,7 @@
 import tvm_ffi
 from tvm_ffi import Array
 
-from tvm.ir import EnvFunc, PrimExpr, Span, TupleType, VDevice
+from tvm.ir import EnvFunc, PrimExpr, PrimType, Span, TupleType, VDevice
 
 from . import _ffi_api
 from .expr import Expr, ShapeExpr, Type
@@ -92,7 +92,7 @@ class TensorType(Type):
     """
 
     shape: Expr | None
-    dtype: str
+    dtype: PrimType
     vdevice: VDevice | None
     ndim: int
     span: Span
@@ -100,13 +100,15 @@ class TensorType(Type):
     def __init__(
         self,
         shape: Expr | None | list[PrimExpr] = None,
-        dtype: str = "float32",
+        dtype: str | PrimType | None = "float32",
         vdevice: VDevice | None | str = None,
         ndim: int = -1,
         span: Span = None,
     ) -> None:
         if isinstance(shape, list | tuple | Array):
             shape = ShapeExpr(shape)
+        if dtype is not None and not isinstance(dtype, PrimType):
+            dtype = PrimType(dtype)
         self.__init_handle_by_constructor__(
             _ffi_api.TensorType,
             shape,
diff --git a/python/tvm/runtime/object_generic.py b/python/tvm/runtime/object_generic.py
index 51c8805f9445..505613d0372e 100644
--- a/python/tvm/runtime/object_generic.py
+++ b/python/tvm/runtime/object_generic.py
@@ -66,5 +66,9 @@ def const(value, dtype=None, span=None):
     if dtype is None:
         dtype = _scalar_type_inference(value)
     if dtype == "uint64" and value >= (1 << 63):
-        return _ffi_node_api.LargeUIntImm(dtype, value & ((1 << 32) - 1), value >> 32, span)
+        from tvm.ir import PrimType  # pylint: disable=import-outside-toplevel
+
+        return _ffi_node_api.LargeUIntImm(
+            PrimType(dtype), value & ((1 << 32) - 1), value >> 32, span
+        )
     return _ffi_node_api._const(value, dtype, span)
diff --git a/python/tvm/s_tir/schedule/schedule.py b/python/tvm/s_tir/schedule/schedule.py
index 7f191df98d84..25b81239189d 100644
--- a/python/tvm/s_tir/schedule/schedule.py
+++ b/python/tvm/s_tir/schedule/schedule.py
@@ -24,7 +24,7 @@
 
 from tvm.error import register_error
 from tvm.ir import GlobalVar, IRModule, PrimExpr
-from tvm.runtime import Object
+from tvm.runtime import DataTypeCode, Object
 from tvm.tirx import Buffer, FloatImm, For, IntImm, PrimFunc, SBlock
 from tvm.tirx.function import IndexMap
 
@@ -3465,10 +3465,14 @@ def two_elementwise_transformed_intermediate_buffer(a: T.handle, c: T.handle) ->
             # buffer's type.  If the default `tvm.runtime.convert`
             # behavior is applied, these would be converted to
             # int32/float32, which may not match the buffer's type.
-            if "int" in buffer_obj.dtype and isinstance(pad_value, int):
-                pad_value = IntImm(buffer_obj.dtype, pad_value)
-            elif "float" in buffer_obj.dtype and isinstance(pad_value, float):
-                pad_value = FloatImm(buffer_obj.dtype, pad_value)
+            if buffer_obj.dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT) and isinstance(
+                pad_value, int
+            ):
+                pad_value = IntImm(buffer_obj.dtype.dtype, pad_value)
+            elif buffer_obj.dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT) and (
+                isinstance(pad_value, float)
+            ):
+                pad_value = FloatImm(buffer_obj.dtype.dtype, pad_value)
             pad_value = IndexMap.from_func(
                 lambda *indices: pad_value,
                 ndim=len(index_map.final_indices),
diff --git a/python/tvm/script/parser/core/evaluator.py b/python/tvm/script/parser/core/evaluator.py
index 4d38292b9b56..0461e56ec984 100644
--- a/python/tvm/script/parser/core/evaluator.py
+++ b/python/tvm/script/parser/core/evaluator.py
@@ -396,7 +396,11 @@ def _eval_if_exp(self, fields: dict[str, Any]) -> Any:
         orelse = self._eval_expr(fields["orelse"])
         if isinstance(test, bool):
             return body if test else orelse
-        elif isinstance(test, tvm.tirx.PrimExpr) and test.dtype == "bool":
+        elif (
+            isinstance(test, tvm.tirx.PrimExpr)
+            and isinstance(test.ty, tvm.ir.PrimType)
+            and test.ty.matches_code(tvm.DataTypeCode.BOOL)
+        ):
             return tvm.tirx.op.if_then_else(test, body, orelse)
         else:
             raise TypeError(f"Expected Python bool or TIR bool, but got {type(test)}")
diff --git a/python/tvm/te/tensor.py b/python/tvm/te/tensor.py
index 531915c6798a..b7238cf07eda 100644
--- a/python/tvm/te/tensor.py
+++ b/python/tvm/te/tensor.py
@@ -19,6 +19,7 @@
 # pylint: disable=invalid-name
 import tvm_ffi
 
+from tvm.ir import PrimType
 from tvm.runtime import Object, ObjectConvertible
 from tvm.tirx import DataProducer
 from tvm.tirx import expr as _expr
@@ -49,6 +50,10 @@ def dtype(self):
         """Data content of the tensor."""
         return self.tensor.dtype
 
+    def expr_ty(self):
+        """Expression type, obtained by delegating to ``self.tensor.expr_ty()``."""
+        return self.tensor.expr_ty()
+
 
 @tvm_ffi.register_object("te.Tensor")
 class Tensor(DataProducer, _expr.ExprOp):
@@ -86,6 +91,15 @@ def ndim(self):
         """Dimension of the tensor."""
         return len(self.shape)
 
+    @property
+    def dtype(self):
+        """Result of ``_ffi_api.TensorDType`` for this tensor, wrapped in a ``PrimType``."""
+        return PrimType(_ffi_api.TensorDType(self))
+
+    def expr_ty(self):
+        """Return ``self.dtype`` as the expression type of the tensor."""
+        return self.dtype
+
     @property
     def name(self):
         op = self.op
diff --git a/python/tvm/tirx/buffer.py b/python/tvm/tirx/buffer.py
index 4caf154547fa..43023b4c3cb9 100644
--- a/python/tvm/tirx/buffer.py
+++ b/python/tvm/tirx/buffer.py
@@ -352,7 +352,7 @@ def _infer_shape(shape):
             shape = args
             assert all(
                 isinstance(arg, int)
-                or (isinstance(arg, PrimExpr) and arg.dtype in ["int32", "int64"])
+                or (isinstance(arg, PrimExpr) and arg.ty.dtype in ["int32", "int64"])
                 for arg in shape
             ), "shape must be a list of integers or PrimExprs with dtype int32 or int64"
             # Safely get optional keyword arguments
@@ -462,7 +462,7 @@ def permute(self, *dims) -> "Buffer":
 
     def __getitem__(self, indices):
         from ..arith import Analyzer  # pylint: disable=import-outside-toplevel
-        from .expr import BufferLoad, Ramp, const  # pylint: disable=import-outside-toplevel
+        from .expr import BufferLoad, Ramp  # pylint: disable=import-outside-toplevel
         from .stmt import BufferRegion  # pylint: disable=import-outside-toplevel
 
         if not isinstance(indices, tuple | list):
@@ -483,7 +483,8 @@ def __getitem__(self, indices):
                 else:
                     region.append(
                         Range.from_min_extent(
-                            index, const(1, index.dtype) if isinstance(index, PrimExpr) else 1
+                            index,
+                            tvm.tirx.expr.IntImm(index.ty, 1) if isinstance(index, PrimExpr) else 1,
                         )
                     )
             if has_implicit_slice:
@@ -499,7 +500,7 @@ def __getitem__(self, indices):
                     step = 1 if index.step is None else index.step
                     # We should ensure the dtype of start is the same with that of step.
                     if isinstance(start, tvm.tirx.expr.PrimExpr) and isinstance(step, int):
-                        step = tvm.tirx.expr.IntImm(start.dtype, step)
+                        step = tvm.tirx.expr.IntImm(start.ty, step)
                     lanes = analyzer.simplify((stop - start + step - 1) // step)
                     if lanes == 1:
                         expr_indices.append(start)
@@ -540,11 +541,11 @@ def decl_buffer(
         layout = TileLayout(S[tuple(shape)]) if shape else None
 
     if offset_factor != 0 and elem_offset is None:
-        shape_dtype = shape[0].dtype if shape and hasattr(shape[0], "dtype") else "int32"
-        elem_offset = Var(f"{name}_elem_offset", shape_dtype)
+        shape_ty = shape[0].ty if shape and isinstance(shape[0], PrimExpr) else "int32"
+        elem_offset = Var(f"{name}_elem_offset", shape_ty)
     if data is None:
         # Bool is represented as uint1 in the IR, but stored as int8
-        storage_type = PrimType(dtype)
+        storage_type = dtype if isinstance(dtype, PrimType) else PrimType(dtype)
         storage_type = PrimType("int8") if storage_type.dtype == "bool" else storage_type
         data = Var(name, PointerType(storage_type, scope), span)
     return _ffi_api.Buffer(  # type: ignore
diff --git a/python/tvm/tirx/expr.py b/python/tvm/tirx/expr.py
index a97171e436ae..ec744acf5093 100644
--- a/python/tvm/tirx/expr.py
+++ b/python/tvm/tirx/expr.py
@@ -34,7 +34,7 @@
 from tvm import ir
 from tvm.ir import Op, PrimExpr
 from tvm.ir.base import Span
-from tvm.runtime import DataType, DataTypeCode, Object, ObjectConvertible, Scriptable, const
+from tvm.runtime import DataTypeCode, Object, ObjectConvertible, Scriptable, const
 
 from . import _ffi_api
 from . import generic as _generic
@@ -56,13 +56,17 @@ def div_ambiguity_error() -> RuntimeError:
 def _dtype_is_int(value):
     if isinstance(value, int):
         return True
-    return isinstance(value, ExprOp) and DataType(value.dtype).type_code == DataTypeCode.INT  # type: ignore
+    if not isinstance(value, ExprOp):
+        return False
+    return value.expr_ty().matches_code(DataTypeCode.INT)
 
 
 def _dtype_is_float(value):
     if isinstance(value, float):
         return True
-    return isinstance(value, ExprOp) and DataType(value.dtype).type_code == DataTypeCode.FLOAT  # type: ignore
+    if not isinstance(value, ExprOp):
+        return False
+    return value.expr_ty().matches_code(DataTypeCode.FLOAT)
 
 
 class ExprOp:
@@ -70,6 +74,13 @@ class ExprOp:
 
     # TODO(tkonolige): use inspect to add source information to these objects
 
+    def expr_ty(self) -> ir.PrimType:
+        """Return this object's ``ty`` attribute; raise TypeError unless it is a PrimType."""
+        prim_ty = getattr(self, "ty", None)
+        if not isinstance(prim_ty, ir.PrimType):
+            raise TypeError(f"Cannot determine PrimType for {type(self).__name__}")
+        return prim_ty
+
     def __add__(self, other: PrimExpr) -> PrimExpr:
         return _generic.add(self, other)
 
@@ -121,7 +132,7 @@ def __rmod__(self, other: PrimExpr) -> PrimExpr:
         return _ffi_api._OpFloorMod(other, self, None)  # type: ignore
 
     def __neg__(self) -> PrimExpr:
-        neg_one = const(-1, self.dtype)  # type: ignore
+        neg_one = const(-1, self.expr_ty().dtype)
         return self.__mul__(neg_one)
 
     def __lshift__(self, other: PrimExpr) -> PrimExpr:
@@ -204,7 +215,7 @@ def equal(self, other: PrimExpr, span: Span | None = None) -> bool:
         """
         return _ffi_api._OpEQ(self, other, span)  # type: ignore
 
-    def astype(self, dtype: str, span: Span | None = None) -> PrimExpr:
+    def astype(self, dtype: str | ir.PrimType, span: Span | None = None) -> PrimExpr:
         """Cast the expression to other type.
 
         Parameters
@@ -259,6 +270,10 @@ def asobject(self) -> PrimExpr:
         """Convert object."""
         return _ffi_api._OpEQ(self.a, self.b, self.span)  # type: ignore
 
+    def expr_ty(self) -> ir.PrimType:
+        """Type of the equality result: always a new ``ir.PrimType("bool")``."""
+        return ir.PrimType("bool")
+
     def __repr__(self) -> str:
         return f"EqualOp({self.a!r}, {self.b!r})"
 
@@ -299,6 +314,10 @@ def asobject(self) -> PrimExpr:
         """Convert object."""
         return _ffi_api._OpNE(self.a, self.b, self.span)  # type: ignore
 
+    def expr_ty(self) -> ir.PrimType:
+        """Type of the inequality result: always a new ``ir.PrimType("bool")``."""
+        return ir.PrimType("bool")
+
     def __repr__(self) -> str:
         return f"NotEqualOp({self.a!r}, {self.b!r})"
 
@@ -458,12 +477,10 @@ def __init__(
                 raise TypeError("dom need to be Range")
 
         name = var if var is not None else "iter"
-        dtype = "int32" if dom is None else dom.extent.dtype
+        dtype = "int32" if dom is None else dom.extent.ty
         var = Var(name, dtype=dtype, span=span) if not isinstance(var, Var) else var
         if dom is not None:
-            assert var.dtype == dom.extent.dtype, (
-                "IterVar's Var dtype must match its domain's extent's dtype"
-            )
+            assert var.ty == dom.extent.ty, "IterVar's Var type must match its domain's extent type"
         self.__init_handle_by_constructor__(
             _ffi_api.IterVar,
             dom,
@@ -473,6 +490,10 @@ def __init__(
             span,  # type: ignore
         )
 
+    def expr_ty(self) -> ir.PrimType:
+        """Type of the iteration variable, read from ``self.var.ty``."""
+        return self.var.ty
+
 
 @tvm_ffi.register_object("tirx.CommReducer")
 class CommReducer(Object, Scriptable):
@@ -595,7 +616,9 @@ class FloatImm(ConstExpr):
 
     value: float
 
-    def __init__(self, dtype: str, value: float, span: Span | None = None) -> None:
+    def __init__(self, dtype: str | ir.PrimType, value: float, span: Span | None = None) -> None:
+        if isinstance(dtype, ir.PrimType):
+            dtype = dtype.dtype
         self.__init_handle_by_constructor__(
             tvm.ir._ffi_api.FloatImm,
             dtype,
@@ -625,7 +648,9 @@ class IntImm(ConstExpr):
 
     value: int
 
-    def __init__(self, dtype: str, value: int, span: Span | None = None) -> None:
+    def __init__(self, dtype: str | ir.PrimType, value: int, span: Span | None = None) -> None:
+        if isinstance(dtype, ir.PrimType):
+            dtype = dtype.dtype
         self.__init_handle_by_constructor__(
             tvm.ir._ffi_api.IntImm,
             dtype,
@@ -702,7 +727,9 @@ class Cast(PrimExprWithOp):
 
     value: PrimExpr
 
-    def __init__(self, dtype, value, span: Span | None = None) -> None:
+    def __init__(self, dtype: str | ir.PrimType, value, span: Span | None = None) -> None:
+        if isinstance(dtype, ir.PrimType):
+            dtype = dtype.dtype
         self.__init_handle_by_constructor__(_ffi_api.Cast, dtype, value, span)  # type: ignore
 
 
@@ -1313,7 +1340,7 @@ class Call(PrimExprWithOp):
 
     def __init__(
         self,
-        dtype: str,
+        dtype: str | ir.PrimType,
         op: Op | str,
         args: list[PrimExpr],
         attrs: ir.Attrs | dict | None = None,
@@ -1332,6 +1359,8 @@ def __init__(
             op = Op.get(op)
         if isinstance(attrs, dict):
             attrs = ir.make_node("ir.DictAttrs", **attrs)
+        if not isinstance(dtype, ir.PrimType):
+            dtype = ir.PrimType(dtype)
         if attrs:
             self.__init_handle_by_constructor__(  # type: ignore
                 _ffi_api.CallWithAttrs, dtype, op, args, attrs, span
diff --git a/python/tvm/tirx/layout.py b/python/tvm/tirx/layout.py
index 29a19d746dee..11d1e140ae16 100644
--- a/python/tvm/tirx/layout.py
+++ b/python/tvm/tirx/layout.py
@@ -332,10 +332,10 @@ def _get_default_strides(data: list[int | PrimExpr], stride: int = 1) -> tuple:
         # produce for int64-shaped buffers (otherwise the last stride stays a
         # Python ``int`` -> int32 IntImm and breaks structural-equal).
         for t in data:
-            if isinstance(t, PrimExpr) and t.dtype != "int32":
+            if isinstance(t, PrimExpr) and t.ty.dtype != "int32":
                 from .expr import IntImm  # pylint: disable=import-outside-toplevel
 
-                stride = IntImm(t.dtype, stride)
+                stride = IntImm(t.ty, stride)
                 break
         res = list()
         for t in reversed(data):
diff --git a/python/tvm/tirx/op.py b/python/tvm/tirx/op.py
index a7a2889c444b..9a54e915bb0b 100644
--- a/python/tvm/tirx/op.py
+++ b/python/tvm/tirx/op.py
@@ -31,7 +31,7 @@
 
 from . import _ffi_api
 from .buffer import Buffer
-from .expr import BufferLoad, Call, CommReducer, IntImm, PrimExprWithOp, Var
+from .expr import BufferLoad, Call, CommReducer, ExprOp, IntImm, PrimExprWithOp, Var
 
 tir = tirx  # alias for backward compat with upstream tir.convert() calls
 
@@ -57,6 +57,24 @@ def _canonical_device_intrin_name(func_name: str) -> str:
     return func_name
 
 
+def _primexpr_ty(expr):
+    """Look up an expression's PrimType: its ``ty`` field first, then ``expr_ty()``."""
+    field_ty = getattr(expr, "ty", None)
+    if isinstance(field_ty, tvm.ir.PrimType):
+        return field_ty
+    if not isinstance(expr, ExprOp):
+        raise TypeError(f"Cannot determine PrimExpr type for {type(expr).__name__}")
+    return expr.expr_ty()
+
+
+def _primexpr_dtype(expr):
+    """Give the ``dtype`` held by the expression's PrimType, bypassing ``PrimExpr.dtype``."""
+    prim_ty = _primexpr_ty(expr)
+    if isinstance(prim_ty, tvm.ir.PrimType):
+        return prim_ty.dtype
+    raise TypeError(f"Expected PrimType for {type(expr).__name__}, but got {prim_ty}")
+
+
 def _pack_buffer(buf, span=None):
     """Build intrinsics that packs the buffer."""
     shape = Call("handle", "tirx.tvm_stack_make_shape", buf.shape, span=span)
@@ -187,7 +205,7 @@ def call_cpacked(*args, span=None):
     return Call("int32", Op.get("tirx.tvm_call_cpacked"), call_args, span=span)
 
 
-def call_intrin(dtype, func_name, *args, attrs=None, span=None):
+def call_intrin(dtype: str | tvm.ir.PrimType, func_name, *args, attrs=None, span=None):
     """Build expression by calling an intrinsic function.
 
     Intrinsics can be overloaded with multiple data types via
@@ -272,8 +290,9 @@ def call_extern(dtype, func_name, *args, span=None):
 
 def _require_float_arg(op_name, x):
     x = tirx.convert(x)
-    if "float" not in x.dtype and "bfloat" not in x.dtype:
-        raise TypeError(f"tirx.{op_name} only supports floating-point inputs, but got {x.dtype}")
+    arg_dtype = _primexpr_dtype(x)
+    if not ("float" in arg_dtype or "bfloat" in arg_dtype):
+        raise TypeError(f"tirx.{op_name} only supports floating-point inputs, but got {arg_dtype}")
     return x
 
 
@@ -476,8 +495,8 @@ def call_tir(global_var: tvm.ir.GlobalVar, *args):
     dtype = "void"
     if global_var.ty is not None:
         ret_ty = global_var.ty.ret
-        if hasattr(ret_ty, "dtype"):
-            dtype = ret_ty.dtype
+        if isinstance(ret_ty, tvm.ir.PrimType):
+            dtype = ret_ty
 
     return Call(dtype=dtype, op=global_var, args=args)
 
@@ -680,7 +699,7 @@ def tvm_thread_invariant(cond):
         The call expression.
     """
     assert isinstance(cond, PrimExpr)
-    return call_intrin(cond.dtype, "tirx.tvm_thread_invariant", cond)
+    return call_intrin(_primexpr_ty(cond), "tirx.tvm_thread_invariant", cond)
 
 
 def tvm_storage_sync(storage_scope, is_load=False, num_blocks=-1):
@@ -742,7 +761,9 @@ def tvm_warp_shuffle(mask, value, warp_id, width, warp_size):
     call : PrimExpr
         The call expression.
     """
-    return call_intrin(value.dtype, "tirx.tvm_warp_shuffle", mask, value, warp_id, width, warp_size)
+    return call_intrin(
+        _primexpr_ty(value), "tirx.tvm_warp_shuffle", mask, value, warp_id, width, warp_size
+    )
 
 
 def tvm_warp_shuffle_up(mask, value, offset, width, warp_size):
@@ -768,7 +789,7 @@ def tvm_warp_shuffle_up(mask, value, offset, width, warp_size):
         The call expression.
     """
     return call_intrin(
-        value.dtype, "tirx.tvm_warp_shuffle_up", mask, value, offset, width, warp_size
+        _primexpr_ty(value), "tirx.tvm_warp_shuffle_up", mask, value, offset, width, warp_size
     )
 
 
@@ -795,7 +816,7 @@ def tvm_warp_shuffle_down(mask, value, offset, width, warp_size):
         The call expression.
     """
     return call_intrin(
-        value.dtype, "tirx.tvm_warp_shuffle_down", mask, value, offset, width, warp_size
+        _primexpr_ty(value), "tirx.tvm_warp_shuffle_down", mask, value, offset, width, warp_size
     )
 
 
@@ -821,7 +842,7 @@ def tvm_warp_shuffle_xor(mask, value, lane_mask, width, warp_size):
         The call expression.
     """
     return call_intrin(
-        value.dtype, "tirx.tvm_warp_shuffle_xor", mask, value, lane_mask, width, warp_size
+        _primexpr_ty(value), "tirx.tvm_warp_shuffle_xor", mask, value, lane_mask, width, warp_size
     )
 
 
@@ -1208,7 +1229,8 @@ def trace(args, trace_action="tvm.default_trace_action"):
         raise Exception("tvm.tirx.trace consumes the args as list type")
     call_args = [_pack_buffer(x) if isinstance(x, Buffer) else x for x in args]
     call_args.insert(0, trace_action)
-    return tvm.tirx.Call(args[-1].dtype, Op.get("tirx.tvm_call_trace_packed"), call_args)
+    dtype = _primexpr_ty(args[-1]) if isinstance(args[-1], PrimExpr) else args[-1].dtype
+    return tvm.tirx.Call(dtype, Op.get("tirx.tvm_call_trace_packed"), call_args)
 
 
 def min_value(dtype, span=None):
@@ -1304,7 +1326,7 @@ def exp(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.exp", x)
+    return call_intrin(_primexpr_ty(x), "tirx.exp", x)
 
 
 def exp2(x):
@@ -1321,7 +1343,7 @@ def exp2(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.exp2", x)
+    return call_intrin(_primexpr_ty(x), "tirx.exp2", x)
 
 
 def exp10(x):
@@ -1338,7 +1360,7 @@ def exp10(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.exp10", x)
+    return call_intrin(_primexpr_ty(x), "tirx.exp10", x)
 
 
 def fma(x, y, z):
@@ -1363,7 +1385,7 @@ def fma(x, y, z):
     x = tir.convert(x)
     y = tir.convert(y)
     z = tir.convert(z)
-    return call_intrin(x.dtype, "tirx.fma", x, y, z)
+    return call_intrin(_primexpr_ty(x), "tirx.fma", x, y, z)
 
 
 def erf(x):
@@ -1380,7 +1402,7 @@ def erf(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.erf", x)
+    return call_intrin(_primexpr_ty(x), "tirx.erf", x)
 
 
 def tanh(x):
@@ -1397,7 +1419,7 @@ def tanh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.tanh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.tanh", x)
 
 
 def sigmoid(x):
@@ -1414,7 +1436,7 @@ def sigmoid(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.sigmoid", x)
+    return call_intrin(_primexpr_ty(x), "tirx.sigmoid", x)
 
 
 def log(x):
@@ -1431,7 +1453,7 @@ def log(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.log", x)
+    return call_intrin(_primexpr_ty(x), "tirx.log", x)
 
 
 def log2(x):
@@ -1448,7 +1470,7 @@ def log2(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.log2", x)
+    return call_intrin(_primexpr_ty(x), "tirx.log2", x)
 
 
 def log10(x):
@@ -1465,7 +1487,7 @@ def log10(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.log10", x)
+    return call_intrin(_primexpr_ty(x), "tirx.log10", x)
 
 
 def log1p(x):
@@ -1482,7 +1504,7 @@ def log1p(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.log1p", x)
+    return call_intrin(_primexpr_ty(x), "tirx.log1p", x)
 
 
 def tan(x):
@@ -1499,7 +1521,7 @@ def tan(x):
         The result.
     """
     x = _require_float_arg("tan", x)
-    return call_intrin(x.dtype, "tirx.tan", x)
+    return call_intrin(_primexpr_ty(x), "tirx.tan", x)
 
 
 def cos(x):
@@ -1516,7 +1538,7 @@ def cos(x):
         The result.
     """
     x = _require_float_arg("cos", x)
-    return call_intrin(x.dtype, "tirx.cos", x)
+    return call_intrin(_primexpr_ty(x), "tirx.cos", x)
 
 
 def cosh(x):
@@ -1533,7 +1555,7 @@ def cosh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.cosh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.cosh", x)
 
 
 def acos(x):
@@ -1550,7 +1572,7 @@ def acos(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.acos", x)
+    return call_intrin(_primexpr_ty(x), "tirx.acos", x)
 
 
 def acosh(x):
@@ -1567,7 +1589,7 @@ def acosh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.acosh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.acosh", x)
 
 
 def sin(x):
@@ -1584,7 +1606,7 @@ def sin(x):
         The result.
     """
     x = _require_float_arg("sin", x)
-    return call_intrin(x.dtype, "tirx.sin", x)
+    return call_intrin(_primexpr_ty(x), "tirx.sin", x)
 
 
 def sinh(x):
@@ -1601,7 +1623,7 @@ def sinh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.sinh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.sinh", x)
 
 
 def asin(x):
@@ -1618,7 +1640,7 @@ def asin(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.asin", x)
+    return call_intrin(_primexpr_ty(x), "tirx.asin", x)
 
 
 def asinh(x):
@@ -1635,7 +1657,7 @@ def asinh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.asinh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.asinh", x)
 
 
 def atan(x):
@@ -1652,7 +1674,7 @@ def atan(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.atan", x)
+    return call_intrin(_primexpr_ty(x), "tirx.atan", x)
 
 
 def atanh(x):
@@ -1669,7 +1691,7 @@ def atanh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.atanh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.atanh", x)
 
 
 def atan2(x1, x2):
@@ -1690,7 +1712,7 @@ def atan2(x1, x2):
     """
     x1 = tir.convert(x1)
     x2 = tir.convert(x2)
-    return call_intrin(x1.dtype, "tirx.atan2", x1, x2)
+    return call_intrin(_primexpr_ty(x1), "tirx.atan2", x1, x2)
 
 
 def sqrt(x):
@@ -1707,7 +1729,7 @@ def sqrt(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.sqrt", x)
+    return call_intrin(_primexpr_ty(x), "tirx.sqrt", x)
 
 
 def rsqrt(x):
@@ -1724,7 +1746,7 @@ def rsqrt(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.rsqrt", x)
+    return call_intrin(_primexpr_ty(x), "tirx.rsqrt", x)
 
 
 def clz(x):
@@ -1971,7 +1993,7 @@ def nextafter(x1, x2):
     """
     x1 = tir.convert(x1)
     x2 = tir.convert(x2)
-    return call_intrin(x1.dtype, "tirx.nextafter", x1, x2)  # type: ignore
+    return call_intrin(_primexpr_ty(x1), "tirx.nextafter", x1, x2)  # type: ignore
 
 
 def hypot(x1, x2):
@@ -1992,7 +2014,7 @@ def hypot(x1, x2):
     """
     x1 = tir.convert(x1)
     x2 = tir.convert(x2)
-    return call_intrin(x1.dtype, "tirx.hypot", x1, x2)  # type: ignore
+    return call_intrin(_primexpr_ty(x1), "tirx.hypot", x1, x2)  # type: ignore
 
 
 def copysign(x1, x2):
@@ -2013,7 +2035,7 @@ def copysign(x1, x2):
     """
     x1 = tir.convert(x1)
     x2 = tir.convert(x2)
-    return call_intrin(x1.dtype, "tirx.copysign", x1, x2)  # type: ignore
+    return call_intrin(_primexpr_ty(x1), "tirx.copysign", x1, x2)  # type: ignore
 
 
 def ldexp(x1, x2):
@@ -2034,7 +2056,7 @@ def ldexp(x1, x2):
     """
     x1 = tir.convert(x1)
     x2 = tir.convert(x2)
-    return call_intrin(x1.dtype, "tirx.ldexp", x1, x2)  # type: ignore
+    return call_intrin(_primexpr_ty(x1), "tirx.ldexp", x1, x2)  # type: ignore
 
 
 def likely(cond, span=None):
@@ -2086,7 +2108,7 @@ def selector(var, pred, span=None):
     active domain for which ``pred`` is true. It is intended for compiler
     metadata and should not survive to executable codegen.
     """
-    return call_intrin(var.dtype, "tirx.selector", var, pred, span=span)
+    return call_intrin(_primexpr_ty(var), "tirx.selector", var, pred, span=span)
 
 
 def isnan(x, span=None):
@@ -2223,7 +2245,7 @@ def popcount(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.popcount", x)
+    return call_intrin(_primexpr_ty(x), "tirx.popcount", x)
 
 
 def q_multiply_shift(x, y, q, s):
@@ -2356,7 +2378,7 @@ def fmod(x, y):
     """
     x = tir.convert(x)
     y = tir.convert(y)
-    return call_intrin(x.dtype, "tirx.fmod", x, y)
+    return call_intrin(_primexpr_ty(x), "tirx.fmod", x, y)
 
 
 def if_then_else(cond, t, f, span=None):
@@ -2667,7 +2689,7 @@ def _make_reduce(expr, axis, where=None, init=None):
             rhs = []
             dtypes = []
             for i in range(size):
-                dtype = expr[i].dtype
+                dtype = _primexpr_dtype(expr[i])
                 dtypes.append(dtype)
                 lname = code.co_varnames[0] + "_" + str(i)
                 lhs.append(Var(lname, dtype))
@@ -2680,7 +2702,7 @@ def _make_reduce(expr, axis, where=None, init=None):
         else:
             assert isinstance(expr, tvm.ir.PrimExpr)
             size = 1
-            dtype = expr.dtype
+            dtype = _primexpr_dtype(expr)
             lvar = Var(code.co_varnames[0], dtype)
             rvar = Var(code.co_varnames[1], dtype)
             result = [fcombine(lvar, rvar)]
diff --git a/python/tvm/tirx/script/builder/external_kernel.py b/python/tvm/tirx/script/builder/external_kernel.py
index d56ed9ea0384..68e597d3f8ff 100644
--- a/python/tvm/tirx/script/builder/external_kernel.py
+++ b/python/tvm/tirx/script/builder/external_kernel.py
@@ -28,6 +28,7 @@
 
 from tvm import __version__ as tvm_version
 from tvm import tirx
+from tvm.ir import PrimExpr
 from tvm.runtime import Module, const
 from tvm.support import nvcc
 
@@ -136,8 +137,10 @@ def compile_to_device_module(  # pylint: disable=arguments-differ
             "threadIdx.y",
             "threadIdx.z",
         ][: len(grid[1])]
-        runtime_args = [arg if hasattr(arg, "dtype") else const(arg) for arg in args]
-        kernel_arg_types = [arg.dtype for arg in runtime_args]
+        runtime_args = [arg if isinstance(arg, PrimExpr) else const(arg) for arg in args]
+        kernel_arg_types = [
+            str(arg.ty.dtype) if isinstance(arg, PrimExpr) else arg.dtype for arg in runtime_args
+        ]
         runtime_args = runtime_args + list(grid[0]) + list(grid[1])
 
         # Reuse compilation path from SourceModule
diff --git a/python/tvm/tirx/script/builder/ir.py b/python/tvm/tirx/script/builder/ir.py
index 2c18c61136b8..12db12aa99db 100644
--- a/python/tvm/tirx/script/builder/ir.py
+++ b/python/tvm/tirx/script/builder/ir.py
@@ -520,7 +520,7 @@ def match_buffer(
             raise ValueError("Shape must be specified when binding input param")
     shape = (shape,) if isinstance(shape, PrimExpr | Integral) else shape
     if strides is not None:
-        idx_dtype = shape[0].dtype if isinstance(shape[0], PrimExpr) else "int32"
+        idx_dtype = shape[0].ty if isinstance(shape[0], PrimExpr) else "int32"
         strides = [Var(s, idx_dtype) if isinstance(s, str) else s for s in strides]
     else:
         strides = []
@@ -1012,8 +1012,8 @@ def _as_range(dom: ir.Range | list[PrimExpr]) -> ir.Range:
         if isinstance(extent, tir.IntImm):
             return ir.Range.from_min_extent(dom[0], extent)
         return ir.Range(dom[0], dom[1])
-    if hasattr(dom, "dtype"):
-        return ir.Range(IntImm(dom.dtype, 0), dom)
+    if isinstance(dom, PrimExpr):
+        return ir.Range(IntImm(dom.ty, 0), dom)
     return ir.Range(0, dom)
 
 
@@ -1204,8 +1204,8 @@ def serial(
             annotations["disable_unroll"] = True
     if stop is None:
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     return _ffi_api.Serial(start, stop, annotations, step)  # type: ignore[attr-defined] # pylint: disable=no-member
@@ -1241,8 +1241,8 @@ def parallel(
     """
     if stop is None:
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     return _ffi_api.Parallel(start, stop, annotations, step)  # type: ignore[attr-defined] # pylint: disable=no-member
@@ -1278,8 +1278,8 @@ def vectorized(
     """
     if stop is None:
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     return _ffi_api.Vectorized(start, stop, annotations, step)  # type: ignore[attr-defined] # pylint: disable=no-member
@@ -1315,8 +1315,8 @@ def unroll(
     """
     if stop is None:
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     return _ffi_api.Unroll(start, stop, annotations, step)  # type: ignore[attr-defined] # pylint: disable=no-member
@@ -1355,14 +1355,14 @@ def thread_binding(
             raise ValueError("Thread cannot be None for thread_binding")
         thread = stop
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     elif stop is None:
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     return _ffi_api.ThreadBinding(  # type: ignore[attr-defined] # pylint: disable=no-member
@@ -1502,7 +1502,8 @@ def as_var(self, rhs_dtype=None):
             else:
                 raise TypeError(f"Invalid type for T.let: {self.type_spec}")
         elif rhs_dtype is not None:
-            return Var("", ir.PrimType(rhs_dtype))
+            rhs_ty = rhs_dtype if isinstance(rhs_dtype, Type) else ir.PrimType(rhs_dtype)
+            return Var("", rhs_ty)
         else:
             raise TypeError("T.let requires either a type or an RHS value")
 
@@ -2799,7 +2800,7 @@ def comm_reducer(combiner: Callable, identity: list[PrimExpr]) -> CommReducer:
         if isinstance(i, int):
             args.append(Var(name, "int32"))
         else:
-            args.append(Var(name, i.dtype))
+            args.append(Var(name, i.ty))
     res = combiner(*args)
     if not isinstance(res, tuple):
         res = (res,)
@@ -2986,19 +2987,19 @@ class WebGPUNamespace:
     def subgroup_shuffle(var, lane):
         if isinstance(var, Buffer):
             var = var[0]
-        return _tir_op.call_intrin(var.dtype, "tirx.webgpu.subgroup_shuffle", var, lane)
+        return _tir_op.call_intrin(var.ty, "tirx.webgpu.subgroup_shuffle", var, lane)
 
     @staticmethod
     def subgroup_shuffle_up(var, delta):
         if isinstance(var, Buffer):
             var = var[0]
-        return _tir_op.call_intrin(var.dtype, "tirx.webgpu.subgroup_shuffle_up", var, delta)
+        return _tir_op.call_intrin(var.ty, "tirx.webgpu.subgroup_shuffle_up", var, delta)
 
     @staticmethod
     def subgroup_shuffle_down(var, delta):
         if isinstance(var, Buffer):
             var = var[0]
-        return _tir_op.call_intrin(var.dtype, "tirx.webgpu.subgroup_shuffle_down", var, delta)
+        return _tir_op.call_intrin(var.ty, "tirx.webgpu.subgroup_shuffle_down", var, delta)
 
 
 webgpu = WebGPUNamespace()
diff --git a/python/tvm/tirx/script/parser/operation.py b/python/tvm/tirx/script/parser/operation.py
index dac8f06ebf80..4f362b7d3acf 100644
--- a/python/tvm/tirx/script/parser/operation.py
+++ b/python/tvm/tirx/script/parser/operation.py
@@ -17,7 +17,8 @@
 """The tirx expression operation registration"""
 
 from tvm import tirx
-from tvm.runtime import DataType, DataTypeCode
+from tvm.ir import PrimType
+from tvm.runtime import DataTypeCode
 from tvm.script.parser._core import OpMethod, doc, register_op
 from tvm.tirx import IntImm
 from tvm.tirx.expr import FloatImm
@@ -26,12 +27,20 @@
 def _register_expr_op(ty: type):  # pylint: disable=invalid-name
     ty._dispatch_type = ty  # pylint: disable=protected-access
 
+    def _expr_ty(expr):
+        ty = expr.ty if isinstance(expr, tirx.PrimExpr) else None
+        if not isinstance(ty, PrimType):
+            ty = expr.expr_ty()
+        if not isinstance(ty, PrimType):
+            raise TypeError(f"Expected a PrimType expression, but got {ty}")
+        return ty
+
     def _and(a, b):
         if isinstance(a, bool):
             a = IntImm("bool", a)
         if isinstance(b, bool):
             b = IntImm("bool", b)
-        if DataType(a.dtype).lanes > 1 or DataType(b.dtype).lanes > 1:
+        if not _expr_ty(a).is_scalar() or not _expr_ty(b).is_scalar():
             return a & b
         else:
             return tirx.And(a, b)
@@ -41,58 +50,56 @@ def _or(a, b):
             a = IntImm("bool", a)
         if isinstance(b, bool):
             b = IntImm("bool", b)
-        if DataType(a.dtype).lanes > 1 or DataType(b.dtype).lanes > 1:
+        if not _expr_ty(a).is_scalar() or not _expr_ty(b).is_scalar():
             return a | b
         else:
             return tirx.Or(a, b)
 
-    def _get_type_str(dtype: str):
-        if DataType(dtype).lanes == 1:
-            return dtype
-        index = dtype.find("x")
-        return dtype[0:index]
+    def _get_type_str(ty: PrimType):
+        dtype_str = str(ty.dtype)
+        if ty.is_scalar():
+            return dtype_str
+        index = dtype_str.find("x")
+        return dtype_str[0:index]
 
     def _auto_broadcast(a, b, op):
         if isinstance(a, int):
-            if hasattr(b, "dtype"):
-                if (
-                    DataType(b.dtype).type_code == DataTypeCode.INT
-                    or DataType(b.dtype).type_code == DataTypeCode.UINT
-                    or DataType(b.dtype).type_code == DataTypeCode.BOOL
-                ):
-                    a = IntImm(_get_type_str(b.dtype), a)
-                elif DataType(b.dtype).type_code == DataTypeCode.FLOAT:
-                    a = FloatImm(_get_type_str(b.dtype), a)
+            if isinstance(b, tirx.PrimExpr) or hasattr(b, "expr_ty"):
+                b_ty = _expr_ty(b)
+                if b_ty.matches_code(DataTypeCode.INT, DataTypeCode.UINT, DataTypeCode.BOOL):
+                    a = IntImm(_get_type_str(b_ty), a)
+                elif b_ty.matches_code(DataTypeCode.FLOAT):
+                    a = FloatImm(_get_type_str(b_ty), a)
             elif isinstance(b, float):
                 a = FloatImm("float32", a)
             else:
                 a = IntImm("int32", a)
         elif isinstance(a, float):
-            if DataType(b.dtype).type_code == DataTypeCode.FLOAT:
-                a = FloatImm(_get_type_str(b.dtype), a)
+            b_ty = _expr_ty(b)
+            if b_ty.matches_code(DataTypeCode.FLOAT):
+                a = FloatImm(_get_type_str(b_ty), a)
             else:
                 a = FloatImm("float32", a)
 
         assert isinstance(a, tirx.PrimExpr), "Operand should be a PrimExpr."
         if isinstance(b, int):
-            if (
-                DataType(a.dtype).type_code == DataTypeCode.INT
-                or DataType(a.dtype).type_code == DataTypeCode.UINT
-                or DataType(a.dtype).type_code == DataTypeCode.BOOL
-            ):
-                b = IntImm(_get_type_str(a.dtype), b)
-            elif DataType(a.dtype).type_code == DataTypeCode.FLOAT:
-                b = FloatImm(_get_type_str(a.dtype), b)
+            a_ty = _expr_ty(a)
+            if a_ty.matches_code(DataTypeCode.INT, DataTypeCode.UINT, DataTypeCode.BOOL):
+                b = IntImm(_get_type_str(a_ty), b)
+            elif a_ty.matches_code(DataTypeCode.FLOAT):
+                b = FloatImm(_get_type_str(a_ty), b)
         elif isinstance(b, float):
-            b = FloatImm(_get_type_str(a.dtype), b)
+            b = FloatImm(_get_type_str(_expr_ty(a)), b)
 
-        if DataType(a.dtype).lanes == DataType(b.dtype).lanes:
+        a_ty = _expr_ty(a)
+        b_ty = _expr_ty(b)
+        if a_ty.dtype.lanes == b_ty.dtype.lanes:
             return op(a, b)
-        elif DataType(a.dtype).lanes == 1 and DataType(a.dtype).lanes != DataType(b.dtype).lanes:
-            broadcast_a = tirx.Broadcast(a, DataType(b.dtype).lanes)
+        elif a_ty.is_scalar() and a_ty.dtype.lanes != b_ty.dtype.lanes:
+            broadcast_a = tirx.Broadcast(a, b_ty.dtype.lanes)
             return op(broadcast_a, b)
-        elif DataType(b.dtype).lanes == 1 and DataType(a.dtype).lanes != DataType(b.dtype).lanes:
-            broadcast_b = tirx.Broadcast(b, DataType(a.dtype).lanes)
+        elif b_ty.is_scalar() and a_ty.dtype.lanes != b_ty.dtype.lanes:
+            broadcast_b = tirx.Broadcast(b, a_ty.dtype.lanes)
             return op(a, broadcast_b)
         else:
             raise TypeError("do not know how to deal with it.")
diff --git a/python/tvm/tirx/script/parser/parser.py b/python/tvm/tirx/script/parser/parser.py
index 54c18db374d8..b2f2b30063a8 100644
--- a/python/tvm/tirx/script/parser/parser.py
+++ b/python/tvm/tirx/script/parser/parser.py
@@ -225,13 +225,13 @@ def bind_assign_value(self: Parser, node: doc.expr, var_name: str, value: Any) -
             value = tvm.tirx.const(value)
         if not isinstance(value, tvm.tirx.StringImm):
             # x = expr -> scalar (auto-typed from value)
-            scalar = T.local_scalar(dtype=str(value.dtype))
+            scalar = T.local_scalar(dtype=str(value.ty.dtype))
             IRBuilder.name(var_name, scalar.scalar.buffer)
             T.buffer_store(scalar.scalar.buffer, value, [0])
             return scalar.scalar
         else:
             # StringImm: x = expr -> immutable Bind var
-            ann_var = tvm.tirx.Var(var_name, value.dtype)
+            ann_var = tvm.tirx.Var(var_name, value.ty)
             IRBuilder.name(var_name, ann_var)
             T.Bind(value, var=ann_var)
             return ann_var
@@ -539,7 +539,7 @@ def visit_ann_assign(self: Parser, node: doc.AnnAssign) -> None:
         if raw_ann.type_spec is not None:
             ann_var = raw_ann.as_var()
         else:
-            ann_var = raw_ann.as_var(rhs_dtype=rhs.dtype)
+            ann_var = raw_ann.as_var(rhs_dtype=rhs.ty)
         if not isinstance(ann_var, Var):
             self.report_error(node.annotation, "Annotation should resolve to Var")
         self.eval_assign(target=lhs, source=ann_var, bind_value=bind_assign_value)
@@ -619,7 +619,7 @@ def visit_function_def(self: Parser, node: doc.FunctionDef) -> None:
             if node.returns is not None:
                 ret_type = self.eval_expr(node.returns)
                 if callable(ret_type):
-                    ret_type = PrimType(ret_type().dtype)
+                    ret_type = ret_type().ty
                 T.func_ret(ret_type)
             with self.with_dispatch_token("tirx"):
                 # TODO: handle different types of arguments:
@@ -888,7 +888,7 @@ def visit_tvm_declare_function(self: Parser, node: doc.FunctionDef) -> GlobalVar
         if node.returns is not None:
             ret_type = self.eval_expr(node.returns)
             if callable(ret_type):
-                ret_type = PrimType(ret_type().dtype)
+                ret_type = ret_type().ty
 
         arg_annotations = []
         for arg in node.args.args:
diff --git a/python/tvm/tirx/stmt.py b/python/tvm/tirx/stmt.py
index 532bf35b254a..543ff99fed66 100644
--- a/python/tvm/tirx/stmt.py
+++ b/python/tvm/tirx/stmt.py
@@ -35,7 +35,7 @@
 
 from tvm.ir import Op, PrimExpr, Range, Span
 from tvm.runtime import Object, Scriptable, const
-from tvm.tirx import FloatImm
+from tvm.tirx import FloatImm, IntImm
 
 from . import _ffi_api
 from .buffer import Buffer
@@ -656,7 +656,7 @@ def __getitem__(self, indices):
                 new_min = old_range.min + index
                 new_region.append(
                     Range.from_min_extent(
-                        new_min, const(1, index.dtype) if isinstance(index, PrimExpr) else 1
+                        new_min, IntImm(index.ty, 1) if isinstance(index, PrimExpr) else 1
                     )
                 )
         # Fill remaining dimensions with their original ranges
diff --git a/python/tvm/topi/math.py b/python/tvm/topi/math.py
index d3e8991c85c7..6088c4baa800 100644
--- a/python/tvm/topi/math.py
+++ b/python/tvm/topi/math.py
@@ -18,7 +18,7 @@
 
 # pylint: disable=redefined-builtin,unused-argument
 import tvm
-from tvm import DataType, DataTypeCode, te
+from tvm import DataTypeCode, te
 from tvm.tirx import PrimExpr
 
 from . import cpp, tag
@@ -26,11 +26,15 @@
 
 
 def _require_float_tensor(op_name, x):
-    if DataType(x.dtype).type_code not in (DataTypeCode.FLOAT, DataTypeCode.BFLOAT):
+    if not x.dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT):
         raise TypeError(f"topi.{op_name} only supports floating-point inputs, but got {x.dtype}")
     return x
 
 
+def _is_integer_tensor(x):
+    return x.dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT)
+
+
 @tvm.te.tag_scope(tag=tag.ELEMWISE)
 def identity(x):
     """Take identity of input x.
@@ -478,7 +482,7 @@ def log(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int"):
+    if x.dtype.matches_code(DataTypeCode.INT):
         x = te.compute(x.shape, lambda *i: x(*i).astype("float32"))
     return te.compute(x.shape, lambda *i: te.log(x(*i)), tag=tag.ELEMWISE)
 
@@ -496,7 +500,7 @@ def log2(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int"):
+    if x.dtype.matches_code(DataTypeCode.INT):
         x = te.compute(x.shape, lambda *i: x(*i).astype("float32"))
     return te.compute(x.shape, lambda *i: te.log2(x(*i)), tag=tag.ELEMWISE)
 
@@ -514,7 +518,7 @@ def log10(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int"):
+    if x.dtype.matches_code(DataTypeCode.INT):
         x = te.compute(x.shape, lambda *i: x(*i).astype("float32"))
     return te.compute(x.shape, lambda *i: te.log10(x(*i)), tag=tag.ELEMWISE)
 
@@ -533,7 +537,7 @@ def sqrt(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int"):
+    if x.dtype.matches_code(DataTypeCode.INT):
         x = te.compute(x.shape, lambda *i: x(*i).astype("float32"))
     return te.compute(x.shape, lambda *i: te.sqrt(x(*i)))
 
@@ -552,7 +556,7 @@ def rsqrt(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int"):
+    if x.dtype.matches_code(DataTypeCode.INT):
         x = te.compute(x.shape, lambda *i: x(*i).astype("float32"))
     return te.compute(x.shape, lambda *i: te.rsqrt(x(*i)))
 
@@ -798,7 +802,7 @@ def fast_exp(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int") or x.dtype.startswith("uint"):
+    if _is_integer_tensor(x):
         x = cast(x, "float32")
     return cpp.fast_exp(x, x.dtype, tag.ELEMWISE)
 
@@ -816,7 +820,7 @@ def fast_tanh(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int") or x.dtype.startswith("uint"):
+    if _is_integer_tensor(x):
         x = cast(x, "float32")
     return cpp.fast_tanh(x, x.dtype, tag.ELEMWISE)
 
@@ -855,24 +859,26 @@ def ceil_log2(x):
     if not isinstance(x, tvm.tirx.PrimExpr):
         x = tvm.tirx.const(x)
 
-    if "float" in x.dtype:
+    if x.ty.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT):
         return tvm.tirx.ceil(tvm.tirx.log2(x))
 
     target = tvm.target.Target.current()
 
-    if "vulkan" in target.kind.name:
-        clz = tvm.tirx.clz(x)
-        bits = int(x.dtype[-2:])
-        res = tvm.tirx.if_then_else(x & (x - 1) == 0, bits - clz - 1, bits - clz)
-        if res.dtype != x.dtype:
-            return cast(res, x.dtype)
-        return res
-
-    if "adreno" in str(target.attrs.get("device", "")) or target.kind.name in [
-        "metal",
-        "rocm",
-        "webgpu",
-    ]:
-        return cast(tvm.tirx.ceil(tvm.tirx.log2(cast(x, "float32"))), x.dtype)
+    if target is not None:
+        target_name = target.kind.name
+        if "vulkan" in target_name:
+            clz = tvm.tirx.clz(x)
+            bits = x.ty.dtype.bits
+            res = tvm.tirx.if_then_else(x & (x - 1) == 0, bits - clz - 1, bits - clz)
+            if res.dtype != x.dtype:
+                return cast(res, x.dtype)
+            return res
+
+        if "adreno" in str(target.attrs.get("device", "")) or target_name in [
+            "metal",
+            "rocm",
+            "webgpu",
+        ]:
+            return cast(tvm.tirx.ceil(tvm.tirx.log2(cast(x, "float32"))), x.dtype)
 
     return cast(tvm.tirx.ceil(tvm.tirx.log2(cast(x, "float64"))), x.dtype)
diff --git a/python/tvm/topi/scatter.py b/python/tvm/topi/scatter.py
index bf5b86599854..de35577c4d85 100644
--- a/python/tvm/topi/scatter.py
+++ b/python/tvm/topi/scatter.py
@@ -18,7 +18,7 @@
 # ruff: noqa: E741
 """ScatterND operator"""
 
-from tvm import te, tirx  # hide redefinition of min and max
+from tvm import DataTypeCode, te, tirx  # hide redefinition of min and max
 from tvm.arith.analyzer import Analyzer
 from tvm.script.ir_builder import IRBuilder
 from tvm.script.ir_builder import tirx as T
@@ -49,7 +49,7 @@ def _verify_scatter_nd_inputs(data, indices, updates):
             f"of out_shape[{i}] ({data.shape[i]})."
         )
 
-    assert "int" in indices.dtype, (
+    assert indices.dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT), (
         f"Indices must be a tensor of integers, but its elements are {indices.dtype}."
     )
 
diff --git a/python/tvm/topi/sort.py b/python/tvm/topi/sort.py
index 81821e462dcf..846573db5036 100644
--- a/python/tvm/topi/sort.py
+++ b/python/tvm/topi/sort.py
@@ -110,7 +110,7 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
         f = tvm.compile(s, [data, out], "llvm")
         dev = tvm.cpu()
         tvm_data = tvm.runtime.tensor(np_data, dev)
-        tvm_out = tvm.runtime.tensor(np.zeros(dshape, dtype=data.dtype), dev)
+        tvm_out = tvm.runtime.tensor(np.zeros(dshape, dtype=data.dtype.dtype), dev)
         f(tvm_data, tvm_out)
     """
     data_buf = tvm.tirx.decl_buffer(
diff --git a/src/arith/analyzer.cc b/src/arith/analyzer.cc
index fc59f891e1bf..94eb8788846b 100644
--- a/src/arith/analyzer.cc
+++ b/src/arith/analyzer.cc
@@ -73,7 +73,8 @@ void AnalyzerObj::Bind(const Var& var, const Range& range, bool allow_override)
 void AnalyzerObj::MarkGlobalNonNegValue(const PrimExpr& value) {
   // decompose value as symbol * scale + offset
   int64_t offset = 0;
-  PrimExpr symbol_scale = tirx::MakeConst(value.dtype(), 0);
+  PrimType value_ty = value.ty();
+  PrimExpr symbol_scale = tirx::MakeConst(value_ty, 0);
 
   auto fcollect_sum = [&](PrimExpr val, int sign) {
     if (const auto* intimm = val.as<IntImmNode>()) {
@@ -90,7 +91,7 @@ void AnalyzerObj::MarkGlobalNonNegValue(const PrimExpr& value) {
 
   // split out the symbol and non-symbolic part
   int64_t cscale = 1;
-  PrimExpr symbol = tirx::MakeConst(value.dtype(), 1);
+  PrimExpr symbol = tirx::MakeConst(value_ty, 1);
   auto fcollect_prod = [&](PrimExpr val) {
     if (const auto* intimm = val.as<IntImmNode>()) {
       cscale *= intimm->value;
@@ -110,7 +111,7 @@ void AnalyzerObj::MarkGlobalNonNegValue(const PrimExpr& value) {
     Var var = ffi::GetRef<Var>(var_ptr);
     // skip non-index type, keep it to be compatible
     // with any_dim that do not represent any value
-    if (!IsIndexType(var.dtype())) return;
+    if (!IsIndexTypedExpr(var)) return;
     bool allow_override = true;
     // mark the constant bound is sufficient
     // we cannot mark interval set as that will cause relaxation of the var
@@ -169,7 +170,7 @@ bool AnalyzerObj::CanProveEqual(const PrimExpr& lhs, const PrimExpr& rhs) {
   const auto* clhs = lhs.as<IntImmNode>();
   const auto* crhs = rhs.as<IntImmNode>();
   if (clhs && crhs) return clhs->value == crhs->value;
-  if (lhs->dtype.is_handle() || rhs->dtype.is_handle()) {
+  if (lhs->ty().IsHandle() || rhs->ty().IsHandle()) {
     return lhs.same_as(rhs);
   }
   return CanProve(lhs - rhs == 0);
@@ -189,7 +190,7 @@ bool AnalyzerObj::CanProveLessEqualThanSymbolicShapeValue(const PrimExpr& lhs,
     }
   };
   UnpackReduction<tirx::MulNode>(shape, fcollect);
-  PrimExpr const_shape_bound = IntImm(shape.dtype(), std::abs(cscale));
+  PrimExpr const_shape_bound = IntImm(shape.ty(), std::abs(cscale));
   if (this->CanProve(lhs <= const_shape_bound, ProofStrength::kSymbolicBound)) return true;
   return false;
 }
diff --git a/src/arith/bound_deducer.cc b/src/arith/bound_deducer.cc
index 475a687cd462..01d50da56e41 100644
--- a/src/arith/bound_deducer.cc
+++ b/src/arith/bound_deducer.cc
@@ -96,7 +96,8 @@ class BoundDeducer : public ExprFunctor<void(const PrimExpr&)> {
   void VisitExprDefault_(const ffi::Object* op) final { success_ = false; }
 
   SignType GetSignType(const PrimExpr& e) {
-    if (e.dtype().is_uint()) {
+    PrimType e_ty = e.ty();
+    if (e_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
       return kPositive;
     }
     return expr_map_[e].GetSignType();
diff --git a/src/arith/canonical_simplify.cc b/src/arith/canonical_simplify.cc
index 12344cffd1d8..17a6ba022e2b 100644
--- a/src/arith/canonical_simplify.cc
+++ b/src/arith/canonical_simplify.cc
@@ -83,14 +83,14 @@ inline PrimExpr DivImpl(PrimExpr a, PrimExpr b, DivMode mode) {
  * \param analyzer The analyzer
  * \return whether value fits in dtype
  */
-bool CastIsSafe(DataType dtype, PrimExpr value, AnalyzerObj* analyzer) {
-  if (!IsIndexType(dtype)) {
+bool CastIsSafe(PrimType dtype, PrimExpr value, AnalyzerObj* analyzer) {
+  if (!IsIndexType(dtype->dtype)) {
     return false;
   }
   ConstIntBound bound = analyzer->const_int_bound(value);
   int64_t ubound = max_value(dtype).as_or_throw<IntImm>()->value;
   int64_t lbound = min_value(dtype).as_or_throw<IntImm>()->value;
-  if (value.dtype().bits() <= dtype.bits() ||  // upcast is safe
+  if (value.ty().bits() <= dtype.bits() ||  // upcast is safe
       (bound->max_value <= ubound && bound->min_value >= lbound)) {
     return true;
   }
@@ -128,7 +128,7 @@ class SplitExprNode : public CanonicalExprNode {
 
   PrimExpr NormalizeWithScale(int64_t sscale) const {
     PrimExpr res = this->index;
-    DataType dtype = this->dtype;
+    PrimType dtype = this->ty();
     if (this->scale == 0) {
       return IntImm(dtype, 0);
     }
@@ -140,7 +140,7 @@ class SplitExprNode : public CanonicalExprNode {
     }
     sscale *= this->scale;
     if (sscale != 1) {
-      TVM_FFI_ICHECK(!dtype.is_uint() || sscale > 0);
+      TVM_FFI_ICHECK(dtype.code() != DLDataTypeCode::kDLUInt || sscale > 0);
       res = res * MakeConst(dtype, sscale);
     }
     return res;
@@ -156,12 +156,12 @@ class SplitExprNode : public CanonicalExprNode {
    * \param analyzer The analyzer
    * \return whether the cast can be safely pushed to children
    */
-  bool CanPushCastToChildren(DataType dtype, AnalyzerObj* analyzer) const {
+  bool CanPushCastToChildren(PrimType dtype, AnalyzerObj* analyzer) const {
     // cast(dtype, index % upper_factor / lower_factor * scale) ==
     // cast(dtype, index) % upper_factor / lower_factor * scale
     // iff it is an upcast (dtype.bits >= self.dtype.bits) or all of
     // its intermediate results fit in the range of dtype
-    if (dtype.bits() >= this->dtype.bits()) {
+    if (dtype.bits() >= this->ty().bits()) {
       return true;  // upcast is safe
     }
     PrimExpr res = this->index;
@@ -172,20 +172,20 @@ class SplitExprNode : public CanonicalExprNode {
       return false;
     }
     if (this->upper_factor != SplitExprNode::kPosInf) {
-      res = ModImpl(res, MakeConst(this->dtype, this->upper_factor), div_mode);
+      res = ModImpl(res, MakeConst(this->ty(), this->upper_factor), div_mode);
       if (!CastIsSafe(dtype, res, analyzer)) {
         return false;
       }
     }
     if (this->lower_factor != 1) {
-      res = DivImpl(res, MakeConst(this->dtype, this->lower_factor), div_mode);
+      res = DivImpl(res, MakeConst(this->ty(), this->lower_factor), div_mode);
       if (!CastIsSafe(dtype, res, analyzer)) {
         return false;
       }
     }
     if (this->scale != 1) {
-      TVM_FFI_ICHECK(!this->dtype.is_uint() || this->scale > 0);
-      res = res * MakeConst(this->dtype, this->scale);
+      TVM_FFI_ICHECK(this->ty().code() != DLDataTypeCode::kDLUInt || this->scale > 0);
+      res = res * MakeConst(this->ty(), this->scale);
       if (!CastIsSafe(dtype, res, analyzer)) {
         return false;
       }
@@ -197,9 +197,9 @@ class SplitExprNode : public CanonicalExprNode {
    * \brief self = cast(dtype, self)
    * \param dtype The target datatype
    */
-  void PushCastToChildren(DataType dtype) {
+  void PushCastToChildren(PrimType dtype) {
     this->index = cast(dtype, this->index);
-    this->dtype = dtype;
+    this->BaseExprNode::ty = dtype;
   }
 
   inline bool IndexEqual(const SplitExpr& other) const;
@@ -252,9 +252,9 @@ class SumExprNode : public CanonicalExprNode {
   PrimExpr Normalize() const final {
     // quick path 1.
     if (this->args.size() == 0) {
-      return MakeConst(this->dtype, this->base);
+      return MakeConst(this->ty(), this->base);
     }
-    return Normalize_(this->dtype, SimplifySplitExprs(args), base);
+    return Normalize_(this->ty(), SimplifySplitExprs(args), base);
   }
   /*!
    * \brief Whether self is divisible by scale.
@@ -334,14 +334,14 @@ class SumExprNode : public CanonicalExprNode {
    * \param analyzer The analyzer
    * \return whether the cast can be safely pushed to children
    */
-  bool CanPushCastToChildren(DataType dtype, AnalyzerObj* analyzer) const {
+  bool CanPushCastToChildren(PrimType dtype, AnalyzerObj* analyzer) const {
     bool is_min_value = dtype.bits() == 64 ? base == std::numeric_limits<int64_t>::lowest()
                                            : base == -(1LL << (dtype.bits() - 1));
     // cast(dtype, arg_1 + arg_2 + ... arg_n) ==
     // cast(dtype, arg_1) + ... + cast(dtype, arg_n)
     // iff it is an upcast (dtype.bits >= self.dtype.bits) or all of
     // its intermediate results fit in the range of dtype
-    if (dtype.bits() >= this->dtype.bits()) {
+    if (dtype.bits() >= this->ty().bits()) {
       return true;  // upcast is safe
     }
     PrimExpr res = IntImm(dtype, 0);
@@ -386,11 +386,11 @@ class SumExprNode : public CanonicalExprNode {
    * \brief self = cast(dtype, self)
    * \param dtype The target datatype
    */
-  void PushCastToChildren(DataType dtype) {
+  void PushCastToChildren(PrimType dtype) {
     for (auto& arg : args) {
       arg.CopyOnWrite()->PushCastToChildren(dtype);
     }
-    this->dtype = dtype;
+    this->BaseExprNode::ty = dtype;
   }
   TVM_FFI_DECLARE_OBJECT_INFO_FINAL("arith.SumExpr", SumExprNode, CanonicalExprNode);
 
@@ -496,7 +496,7 @@ class SumExprNode : public CanonicalExprNode {
     std::stable_sort(args.begin(), args.end(), fcompare);
     return args;
   }
-  static PrimExpr Normalize_(DataType dtype, const std::vector<SplitExpr>& args, int64_t base) {
+  static PrimExpr Normalize_(PrimType dtype, const std::vector<SplitExpr>& args, int64_t base) {
     bool is_min_value = dtype.bits() == 64 ? base == std::numeric_limits<int64_t>::lowest()
                                            : base == -(1LL << (dtype.bits() - 1));
     // Positive scales first
@@ -648,7 +648,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl {
       expr = op->Normalize();
     }
     ffi::ObjectPtr<SplitExprNode> n = ffi::make_object<SplitExprNode>();
-    n->dtype = expr.dtype();
+    n->BaseExprNode::ty = expr.ty();
     n->index = std::move(expr);
     n->div_mode = kTruncDiv;
     return SplitExpr(n);
@@ -685,7 +685,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl {
       return op.value();
     }
     ffi::ObjectPtr<SumExprNode> n = ffi::make_object<SumExprNode>();
-    n->dtype = expr.dtype();
+    n->BaseExprNode::ty = expr.ty();
     if (const auto* op = expr.as<IntImmNode>()) {
       n->base = op->value;
       return SumExpr(n);
@@ -699,7 +699,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl {
 };
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const AddNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -723,7 +723,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const AddNode* op) {
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const SubNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -747,7 +747,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const SubNode* op) {
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const MulNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -794,8 +794,8 @@ void CanonicalSimplifier::Impl::SeparateDivisibleParts(const SumExprNode* psum,
                                                        SumExpr* out_non_divisible) {
   auto divisible = ffi::make_object<SumExprNode>();
   auto non_divisible = ffi::make_object<SumExprNode>();
-  divisible->dtype = psum->dtype;
-  non_divisible->dtype = psum->dtype;
+  divisible->BaseExprNode::ty = psum->ty();
+  non_divisible->BaseExprNode::ty = psum->ty();
 
   if (psum->base % coeff == 0) {
     divisible->base = psum->base;
@@ -834,11 +834,11 @@ SplitExpr CanonicalSimplifier::Impl::SplitDivConst(SplitExpr lhs, int64_t cval,
       return lhs;
     } else if (lhs->upper_factor <= (lhs->lower_factor * scaled_cval)) {
       // (x % c1) / c2  => 0 when c2 >= c1
-      return ToSplitExpr(IntImm(lhs.dtype(), 0));
+      return ToSplitExpr(IntImm(lhs.ty(), 0));
     } else {
       // move the upper_factor modular into index.
       lhs.CopyOnWrite()->index =
-          ModImpl(lhs->index, MakeConst(lhs.dtype(), lhs->upper_factor), div_mode);
+          ModImpl(lhs->index, MakeConst(lhs.ty(), lhs->upper_factor), div_mode);
       lhs.CopyOnWrite()->upper_factor = SplitExprNode::kPosInf;
       lhs.CopyOnWrite()->scale = 1;
       lhs.CopyOnWrite()->lower_factor *= scaled_cval;
@@ -862,8 +862,9 @@ bool CanonicalSimplifier::Impl::ProdDivSimplify(PrimExpr* plhs, PrimExpr* prhs,
   if (prhs->as<IntImmNode>()) return false;
   // collect lhs products and try to eliminate by matching them to prod in rhs
   ffi::Array<ffi::Optional<PrimExpr>> lhs_prods;
-  PrimExpr new_rhs = MakeConst(prhs->dtype(), 1);
-  PrimExpr new_common_scale = MakeConst(prhs->dtype(), 1);
+  PrimType rhs_ty = prhs->ty();
+  PrimExpr new_rhs = MakeConst(rhs_ty, 1);
+  PrimExpr new_common_scale = MakeConst(rhs_ty, 1);
   int64_t lhs_cscale = 1, rhs_cscale = 1;
   int num_elimination = 0;
 
@@ -905,18 +906,19 @@ bool CanonicalSimplifier::Impl::ProdDivSimplify(PrimExpr* plhs, PrimExpr* prhs,
   if (num_elimination == 0 && cscale_gcd == 1) return false;
 
   // construct prod via canonical form
-  PrimExpr new_lhs = MakeConst(plhs->dtype(), 1);
+  PrimType lhs_ty = plhs->ty();
+  PrimExpr new_lhs = MakeConst(lhs_ty, 1);
   for (ffi::Optional<PrimExpr> val : lhs_prods) {
     if (val.defined()) new_lhs = new_lhs * val.value();
   }
-  *plhs = new_lhs * MakeConst(plhs->dtype(), lhs_cscale);
-  *prhs = new_rhs * MakeConst(prhs->dtype(), rhs_cscale);
-  *common_scale = new_common_scale * MakeConst(prhs->dtype(), cscale_gcd);
+  *plhs = new_lhs * MakeConst(lhs_ty, lhs_cscale);
+  *prhs = new_rhs * MakeConst(rhs_ty, rhs_cscale);
+  *common_scale = new_common_scale * MakeConst(rhs_ty, cscale_gcd);
   return true;
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const DivNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
 
@@ -958,7 +960,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const DivNode* op) {
       // if a >= 0 && a < cval, then result == 0
       auto cbound = analyzer_->const_int_bound(Normalize(a));
       if (cbound->min_value >= 0 && cbound->max_value < cval) {
-        return IntImm(a.dtype(), 0);
+        return IntImm(a.ty(), 0);
       }
     }
     return SplitDivConst(ToSplitExpr(std::move(a)), cval, kTruncDiv);
@@ -980,7 +982,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const DivNode* op) {
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   PrimExpr a = this->CanonicalMutate(op->a);
@@ -1019,7 +1021,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
       // if a >= 0 && a < cval, then result == 0
       auto cbound = analyzer_->const_int_bound(Normalize(a));
       if (cbound->min_value >= 0 && cbound->max_value < cval) {
-        return IntImm(a.dtype(), 0);
+        return IntImm(a.ty(), 0);
       }
     }
     // Identity: floordiv(floormod(index, m*n), n) = floormod(floordiv(index, n), m)
@@ -1049,7 +1051,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
             }
             // Apply floormod(floordiv_result, m) to complete the identity
             PrimExpr div_result = Normalize(lhs);
-            return this->VisitExpr(floormod(div_result, MakeConst(a.dtype(), new_mod)));
+            return this->VisitExpr(floormod(div_result, MakeConst(a.ty(), new_mod)));
           }
         }
       }
@@ -1095,8 +1097,8 @@ SplitExpr CanonicalSimplifier::Impl::SplitModConst(SplitExpr lhs, int64_t cval,
       // Perhaps there are more chances in simplifying the index
       // Do a recursive call to simplify the mod with the new factor.
       if (new_upper_factor < lhs->upper_factor && lhs->upper_factor != SplitExprNode::kPosInf) {
-        auto updated = ToSplitExpr(this->VisitExpr(
-            ModImpl(lhs->index, MakeConst(lhs.dtype(), new_upper_factor), div_mode)));
+        auto updated = ToSplitExpr(
+            this->VisitExpr(ModImpl(lhs->index, MakeConst(lhs.ty(), new_upper_factor), div_mode)));
         // re-apply the lower_factor
         if (lhs->lower_factor != 1) {
           auto ret = SplitDivConst(updated, lhs->lower_factor, div_mode);
@@ -1126,7 +1128,7 @@ SplitExpr CanonicalSimplifier::Impl::SplitModConst(SplitExpr lhs, int64_t cval,
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ModNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -1144,7 +1146,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ModNode* op) {
       SumExpr lhs, extra;
       SeparateDivisibleParts(psum, cval, &lhs, &extra);
       if (extra->IsZero()) {
-        return IntImm(a.dtype(), 0);
+        return IntImm(a.ty(), 0);
       }
       // both lhs and extra are non-negative
       if (analyzer_->CanProveGreaterEqual(lhs->Normalize(), 0) &&
@@ -1200,7 +1202,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ModNode* op) {
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -1362,7 +1364,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ReduceNode* op) {
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const CastNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -1370,15 +1372,15 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const CastNode* op) {
   // PushCastToChildren
   if (value.as<SumExprNode>()) {
     SumExpr se = value.as_or_throw<SumExpr>();
-    if (se->CanPushCastToChildren(op->dtype, analyzer_)) {
-      se.CopyOnWrite()->PushCastToChildren(op->dtype);
+    if (se->CanPushCastToChildren(op->ty(), analyzer_)) {
+      se.CopyOnWrite()->PushCastToChildren(op->ty());
       return se;
     }
   }
   if (value.as<SplitExprNode>()) {
     SplitExpr se = value.as_or_throw<SplitExpr>();
-    if (se->CanPushCastToChildren(op->dtype, analyzer_)) {
-      se.CopyOnWrite()->PushCastToChildren(op->dtype);
+    if (se->CanPushCastToChildren(op->ty(), analyzer_)) {
+      se.CopyOnWrite()->PushCastToChildren(op->ty());
       return se;
     }
   }
@@ -1411,8 +1413,8 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const LTNode* op) {
     }
     SumExpr divisible, extra;
     SeparateDivisibleParts(lhs, gcd, &divisible, &extra);
-    DataType dtype = divisible->dtype;
-    TVM_FFI_ICHECK(extra->dtype == dtype);
+    PrimType dtype = divisible->ty();
+    TVM_FFI_ICHECK(extra->ty()->dtype == dtype->dtype);
     PrimExpr normal_extra = extra->Normalize();
     if (this->analyzer_->CanProve(normal_extra < MakeConst(dtype, gcd)) &&
         this->analyzer_->CanProve(normal_extra >= IntImm(dtype, 0))) {
diff --git a/src/arith/const_fold.h b/src/arith/const_fold.h
index fb1055660e3b..4793538316a3 100644
--- a/src/arith/const_fold.h
+++ b/src/arith/const_fold.h
@@ -72,18 +72,29 @@ inline ffi::Optional<PrimExpr> TryConstFold(PrimExpr a);
  * \param type The type to represent index.
  * \return the checked result.
  */
-inline bool IsIndexType(const DataType& type) {
-  return type.is_int() && !type.is_scalable_or_fixed_length_vector() &&
-         (type.bits() == 32 || type.bits() == 64);
+inline bool IsIndexType(DLDataType type) {  // true only for a scalar (one-lane) int32 or int64
+  return type.code == static_cast<uint8_t>(DLDataTypeCode::kDLInt) &&
+         (type.bits == 32 || type.bits == 64) && type.lanes == 1;
+}
+
+inline bool IsIndexTypedExpr(const PrimExprNode* expr) {  // IsIndexType() on the node's type
+  TVM_FFI_DCHECK(expr != nullptr);
+  TVM_FFI_DCHECK(expr->BaseExprNode::ty.defined());
+  const auto* prim_ty = expr->BaseExprNode::ty.as<PrimTypeNode>();
+  TVM_FFI_DCHECK(prim_ty != nullptr);  // guards the dereference on the next line
+  return IsIndexType(prim_ty->dtype);
+}
+
+inline bool IsIndexTypedExpr(const PrimExpr& expr) {  // forwards to the node-pointer overload
+  return IsIndexTypedExpr(static_cast<const PrimExprNode*>(expr.get()));
 }
 
 /*! \brief Helper to get const folding result repr in int64. */
-inline int64_t GetFoldResultInt64Repr(int64_t x, const DataType& dtype) {
+inline int64_t GetFoldResultInt64Repr(int64_t x, const PrimType& dtype) {
   if (dtype.bits() < 64) {
     x &= (1LL << dtype.bits()) - 1;
   }
-  if (dtype.is_int()) {
-    // get sign extended value of integer with specified bits
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {  // sign-extend x from dtype.bits() bits
     int64_t m = 1LL << (dtype.bits() - 1);
     x = (x ^ m) - m;
   }
@@ -118,32 +129,30 @@ inline double GetFoldResultDoubleRepr(float x) {
   const FloatImmNode* fb = b.as<FloatImmNode>(); \
   BODY;
 
-#define TVM_INDEX_CONST_PROPAGATION(BODY)                 \
-  const IntImmNode* pa = a.as<IntImmNode>();              \
-  const IntImmNode* pb = b.as<IntImmNode>();              \
-  const DataType& ta = a.dtype();                         \
-  const DataType& tb = b.dtype();                         \
-  if (arith::IsIndexType(ta) && arith::IsIndexType(tb)) { \
-    BODY;                                                 \
+#define TVM_INDEX_CONST_PROPAGATION(BODY)                         \
+  const IntImmNode* pa = a.as<IntImmNode>();                      \
+  const IntImmNode* pb = b.as<IntImmNode>();                      \
+  if (arith::IsIndexTypedExpr(a) && arith::IsIndexTypedExpr(b)) { \
+    BODY;                                                         \
   }
 
 // specialization of constant folders.
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Add>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       int64_t res = pa->value + pb->value;
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa && pa->value == 0) return b;
     if (pb && pb->value == 0) return a;
     if (fa && fb) {
-      if (rtype.bits() == 32) {
-        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) +
-                                                       static_cast<float>(fb->value)));
-      } else if (rtype.bits() == 64) {
-        return FloatImm(rtype, fa->value + fb->value);
+      if (result_ty.bits() == 32) {
+        return FloatImm(result_ty, GetFoldResultDoubleRepr(static_cast<float>(fa->value) +
+                                                           static_cast<float>(fb->value)));
+      } else if (result_ty.bits() == 64) {
+        return FloatImm(result_ty, fa->value + fb->value);
       }
     }
     if (fa && fa->value == 0) return b;
@@ -155,22 +164,22 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Add>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Sub>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    TVM_FFI_ICHECK(!((pa && pa->dtype.is_uint() && pa->value == 0U) &&
-                     (pb && pb->dtype.is_uint() && pb->value > 0U)))
+    TVM_FFI_ICHECK(!((pa && pa->ty().MatchesCode(DLDataTypeCode::kDLUInt) && pa->value == 0U) &&
+                     (pb && pb->ty().MatchesCode(DLDataTypeCode::kDLUInt) && pb->value > 0U)))
         << "Checked failed. Minuend 's value is 0U and it's dtype is uint "
         << "while Subtrahend's dtype is uint; which will cause a negative uint";
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       int64_t res = pa->value - pb->value;
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pb && pb->value == 0) return a;
     if (fa && fb) {
-      if (rtype.bits() == 32) {
-        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) -
-                                                       static_cast<float>(fb->value)));
-      } else if (rtype.bits() == 64) {
-        return FloatImm(rtype, fa->value - fb->value);
+      if (result_ty.bits() == 32) {
+        return FloatImm(result_ty, GetFoldResultDoubleRepr(static_cast<float>(fa->value) -
+                                                           static_cast<float>(fb->value)));
+      } else if (result_ty.bits() == 64) {
+        return FloatImm(result_ty, fa->value - fb->value);
       }
     }
     if (fb && fb->value == 0) return a;
@@ -181,10 +190,10 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Sub>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Mul>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       int64_t res = pa->value * pb->value;
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa) {
       if (pa->value == 1) return b;
@@ -195,11 +204,11 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Mul>(PrimExpr a, PrimExpr b) {
       if (pb->value == 0) return b;
     }
     if (fa && fb) {
-      if (rtype.bits() == 32) {
-        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) *
-                                                       static_cast<float>(fb->value)));
-      } else if (rtype.bits() == 64) {
-        return FloatImm(rtype, fa->value * fb->value);
+      if (result_ty.bits() == 32) {
+        return FloatImm(result_ty, GetFoldResultDoubleRepr(static_cast<float>(fa->value) *
+                                                           static_cast<float>(fb->value)));
+      } else if (result_ty.bits() == 64) {
+        return FloatImm(result_ty, fa->value * fb->value);
       }
     }
     if (fa) {
@@ -217,13 +226,13 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Mul>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Div>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       // due to division and mod can have different modes
       // NOTE: this will assumes truc div.
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
       int64_t res = pa->value / pb->value;
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa) {
       if (pa->value == 0) return a;
@@ -234,11 +243,11 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Div>(PrimExpr a, PrimExpr b) {
     }
     if (fa && fb) {
       TVM_FFI_ICHECK_NE(fb->value, 0) << "Divide by zero";
-      if (rtype.bits() == 32) {
-        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) /
-                                                       static_cast<float>(fb->value)));
-      } else if (rtype.bits() == 64) {
-        return FloatImm(rtype, fa->value / fb->value);
+      if (result_ty.bits() == 32) {
+        return FloatImm(result_ty, GetFoldResultDoubleRepr(static_cast<float>(fa->value) /
+                                                           static_cast<float>(fb->value)));
+      } else if (result_ty.bits() == 64) {
+        return FloatImm(result_ty, fa->value / fb->value);
       }
     }
     if (fa && fa->value == 0) return a;
@@ -253,18 +262,18 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Div>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Mod>(PrimExpr a, PrimExpr b) {
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
       int64_t res = pa->value % pb->value;
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa) {
       if (pa->value == 0) return a;
     }
     if (pb) {
       // MakeConst can handle both vector and scalar types.
-      if (pb->value == 1) return tirx::MakeConst(rtype, 0);
+      if (pb->value == 1) return tirx::MakeConst(result_ty, 0);
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
     }
   });
@@ -274,11 +283,11 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Mod>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::FloorDiv>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
       int64_t res = arith::floordiv(pa->value, pb->value);
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa) {
       if (pa->value == 0) return a;
@@ -288,11 +297,12 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::FloorDiv>(PrimExpr a, PrimExpr
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
     }
     if (fa && fb && fb->value != 0) {
-      if (rtype.bits() == 32) {
-        return FloatImm(rtype, GetFoldResultDoubleRepr(std::floor(static_cast<float>(fa->value) /
-                                                                  static_cast<float>(fb->value))));
-      } else if (rtype.bits() == 64) {
-        return FloatImm(rtype, std::floor(fa->value / fb->value));
+      if (result_ty.bits() == 32) {
+        return FloatImm(result_ty,
+                        GetFoldResultDoubleRepr(std::floor(static_cast<float>(fa->value) /
+                                                           static_cast<float>(fb->value))));
+      } else if (result_ty.bits() == 64) {
+        return FloatImm(result_ty, std::floor(fa->value / fb->value));
       } else {
         return std::nullopt;
       }
@@ -309,18 +319,18 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::FloorDiv>(PrimExpr a, PrimExpr
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::FloorMod>(PrimExpr a, PrimExpr b) {
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
       int64_t res = arith::floormod(pa->value, pb->value);
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa) {
       if (pa->value == 0) return a;
     }
     if (pb) {
       // MakeConst can handle both vector and scalar types.
-      if (pb->value == 1) return tirx::MakeConst(rtype, 0);
+      if (pb->value == 1) return tirx::MakeConst(result_ty, 0);
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
     }
   });
@@ -330,9 +340,9 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::FloorMod>(PrimExpr a, PrimExpr
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Min>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, std::min(pa->value, pb->value));
-    if (fa && fb) return FloatImm(rtype, std::min(fa->value, fb->value));
+    PrimType result_ty = a.ty();
+    if (pa && pb) return IntImm(result_ty, std::min(pa->value, pb->value));
+    if (fa && fb) return FloatImm(result_ty, std::min(fa->value, fb->value));
   });
   if (a.same_as(b)) return a;
   return std::nullopt;
@@ -341,9 +351,9 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Min>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Max>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, std::max(pa->value, pb->value));
-    if (fa && fb) return FloatImm(rtype, std::max(fa->value, fb->value));
+    PrimType result_ty = a.ty();
+    if (pa && pb) return IntImm(result_ty, std::max(pa->value, pb->value));
+    if (fa && fb) return FloatImm(result_ty, std::max(fa->value, fb->value));
   });
   if (a.same_as(b)) return a;
   return std::nullopt;
diff --git a/src/arith/const_int_bound.cc b/src/arith/const_int_bound.cc
index 4d700564ea05..3e8087af0eff 100644
--- a/src/arith/const_int_bound.cc
+++ b/src/arith/const_int_bound.cc
@@ -151,7 +151,7 @@ class ConstIntBoundAnalyzer::Impl
 
   // Override visitor behaviors
   Entry VisitExprDefault_(const ffi::Object* op) final {
-    return Everything(static_cast<const PrimExprNode*>(op)->dtype);
+    return Everything(static_cast<const PrimExprNode*>(op)->ty());
   }
 
   Entry VisitExpr(const PrimExpr& expr) final {
@@ -167,7 +167,7 @@ class ConstIntBoundAnalyzer::Impl
     if (bound_) {
       auto val = bound_->find(expr);
       if (val != bound_->end()) {
-        auto everything = Everything(expr->dtype);
+        auto everything = Everything(expr->ty());
         TVM_FFI_ICHECK(
             (val->second->min_value == res.min_value && val->second->max_value == res.max_value) ||
             (val->second->min_value == everything.min_value &&
@@ -203,7 +203,7 @@ class ConstIntBoundAnalyzer::Impl
       a = VisitExpr(op->value);
     }
 
-    Entry b = Everything(op->dtype);
+    Entry b = Everything(op->ty());
     return Intersect(a, b);
   }
 
@@ -263,7 +263,7 @@ class ConstIntBoundAnalyzer::Impl
   Entry VisitExpr_(const DivNode* op) final {
     Entry a = VisitExpr(op->a);
     Entry b = AssumeNoZeroDivisor(VisitExpr(op->b));
-    return HandleDivision(a, b, op->dtype, InfAwareDiv);
+    return HandleDivision(a, b, op->ty(), InfAwareDiv);
   }
 
   Entry VisitExpr_(const ModNode* op) final {
@@ -312,14 +312,14 @@ class ConstIntBoundAnalyzer::Impl
       TVM_FFI_ICHECK(!b.is_const(0)) << "mod by zero";
       // mod by negative value is rare,
       // and we just use the simpliest rule.
-      return Everything(op->dtype);
+      return Everything(op->ty());
     }
   }
 
   Entry VisitExpr_(const FloorDivNode* op) final {
     Entry a = VisitExpr(op->a);
     Entry b = AssumeNoZeroDivisor(VisitExpr(op->b));
-    return HandleDivision(a, b, op->dtype, InfAwareFloorDiv);
+    return HandleDivision(a, b, op->ty(), InfAwareFloorDiv);
   }
 
   Entry VisitExpr_(const FloorModNode* op) final {
@@ -385,7 +385,7 @@ class ConstIntBoundAnalyzer::Impl
       int64_t b_max_cap = InfAwareAdd(b.max_value, -1);
       return Intersect(MakeBound(std::min(static_cast<int64_t>(0), b_min_cap),
                                  std::max(static_cast<int64_t>(0), b_max_cap)),
-                       Everything(op->dtype));
+                       Everything(op->ty()));
     }
   }
 
@@ -424,7 +424,7 @@ class ConstIntBoundAnalyzer::Impl
     } else if (op->op.same_as(tirx::builtin::bitwise_and())) {
       return VisitBitwiseAnd(op);
     } else {
-      return Everything(op->dtype);
+      return Everything(op->ty());
     }
   }
 
@@ -434,7 +434,7 @@ class ConstIntBoundAnalyzer::Impl
     if (it != var_map_.end()) {
       return it->second;
     } else {
-      return Everything(op->dtype);
+      return Everything(op->ty());
     }
   }
 
@@ -456,7 +456,7 @@ class ConstIntBoundAnalyzer::Impl
       // If either operand can negative, we may run into undefined
       // behavior for some targets.  In these cases, avoid making any
       // assumptions about the result.
-      return Everything(op->dtype);
+      return Everything(op->ty());
     }
 
     return BinaryOpBoundary(a, b, InfAwareLeftShift);
@@ -481,7 +481,7 @@ class ConstIntBoundAnalyzer::Impl
       if (a.min_value >= 0) {
         return MakeBound(0, a.max_value);
       }
-      return Everything(op->dtype);
+      return Everything(op->ty());
     }
   }
 
@@ -549,7 +549,7 @@ class ConstIntBoundAnalyzer::Impl
    * \return The result.
    */
   template <typename F>
-  static Entry HandleDivision(Entry a, Entry b, DataType dt, const F& op) {
+  static Entry HandleDivision(Entry a, Entry b, PrimType dt, const F& op) {
     // Here we have a / b.
     // The largest value of the division will be for the smallest (with
     // respect to the absolute value) value of b. If the range of b starts
@@ -557,7 +557,7 @@ class ConstIntBoundAnalyzer::Impl
     // be closer to 0, because BinaryOpBoundary only checks end-points of
     // the domain ranges.
     // If the range of b contains 0, then some infinity will be involved
-    if (b.min_value <= 0 && 0 <= b.max_value && dt.is_int()) {
+    if (b.min_value <= 0 && 0 <= b.max_value && dt.code() == DLDataTypeCode::kDLInt) {
       Entry b_neg = b.min_value < 0 ? MakeBound(b.min_value, -1) : Everything(dt);
       Entry b_pos = b.max_value > 0 ? MakeBound(1, b.max_value) : Everything(dt);
 
@@ -566,7 +566,7 @@ class ConstIntBoundAnalyzer::Impl
 
       return MakeBound(std::min(e_neg.min_value, e_pos.min_value),
                        std::max(e_neg.max_value, e_pos.max_value));
-    } else if (b.min_value == 0 && dt.is_uint()) {
+    } else if (b.min_value == 0 && dt.code() == DLDataTypeCode::kDLUInt) {
       // uints only have one sided bounds
       Entry assumed_b = MakeBound(1, b.max_value);
       return BinaryOpBoundary(a, assumed_b, op);
@@ -727,16 +727,17 @@ class ConstIntBoundAnalyzer::Impl
    * \param dtype The data type.
    * \return Bound that represent everything dtype can represent.
    */
-  static Entry Everything(DataType dtype) {
-    if (!dtype.is_int() && !dtype.is_uint() && !dtype.is_bool()) {
+  static Entry Everything(PrimType dtype) {
+    if (dtype.code() != DLDataTypeCode::kDLInt && dtype.code() != DLDataTypeCode::kDLUInt &&
+        dtype.code() != DLDataTypeCode::kDLBool) {
       return MakeBound(kNegInf, kPosInf);
     }
-    if (dtype.is_bool()) {
+    if (dtype.code() == DLDataTypeCode::kDLBool) {
       return MakeBound(0, 1);
     }
     Entry ret;
-    int64_t vbits = dtype.bits() - static_cast<int>(dtype.is_int());
-    if (dtype.is_uint()) {
+    int64_t vbits = dtype.bits() - static_cast<int>(dtype.code() == DLDataTypeCode::kDLInt);
+    if (dtype.code() == DLDataTypeCode::kDLUInt) {
       ret.min_value = 0;
     } else {
       if (vbits >= 63) {
@@ -800,7 +801,7 @@ class ConstIntBoundAnalyzer::Impl
   static ffi::Optional<PrimExpr> FindCeilLog2Arg(const CastNode* op) {
     static const Op& ceil_op = Op::Get("tirx.ceil");
     static const Op& log2_op = Op::Get("tirx.log2");
-    if (op->dtype.is_int()) {
+    if (op->ty().code() == DLDataTypeCode::kDLInt) {
       if (auto as_call = op->value.as<CallNode>()) {
         if (as_call->op.same_as(ceil_op)) {
           PrimExpr ceil_arg = as_call->args[0];
diff --git a/src/arith/detect_linear_equation.cc b/src/arith/detect_linear_equation.cc
index 5e77dca59405..f7e04ee0ebf5 100644
--- a/src/arith/detect_linear_equation.cc
+++ b/src/arith/detect_linear_equation.cc
@@ -54,10 +54,10 @@ class LinearEqDetector : public ExprFunctor<LinearEqEntry(const PrimExpr&, const
     *ret = VisitExpr(e, e);
     if (fail_) return false;
     if (!ret->base.defined()) {
-      ret->base = IntImm(var_.dtype(), 0);
+      ret->base = IntImm(var_.ty(), 0);
     }
     if (!ret->coeff.defined()) {
-      ret->coeff = IntImm(var_.dtype(), 0);
+      ret->coeff = IntImm(var_.ty(), 0);
     }
     return true;
   }
@@ -101,8 +101,8 @@ class LinearEqDetector : public ExprFunctor<LinearEqEntry(const PrimExpr&, const
   LinearEqEntry VisitExpr_(const VarNode* op, const PrimExpr& e) final {
     LinearEqEntry ret;
     if (op == var_.get()) {
-      auto dtype = op->dtype;
-      ret.coeff = MakeConst(DataType::Int(dtype.bits(), dtype.lanes()), 1);
+      PrimType dtype = op->ty();
+      ret.coeff = MakeConst(PrimType::Int(dtype.bits(), dtype.lanes()), 1);
     } else {
       ret.base = e;
     }
@@ -194,19 +194,21 @@ bool DetectClipBound(const PrimExpr& cond,
   bool is_eq = false;
   PrimExpr canonical;
   if (const LTNode* op = cond.as<LTNode>()) {
-    if (!op->a.dtype().is_int()) return false;
-    canonical = op->b - op->a - MakeConst(op->a.dtype(), 1);
+    PrimType a_ty = op->a.ty();
+    if (a_ty.code() != DLDataTypeCode::kDLInt) return false;
+    canonical = op->b - op->a - MakeConst(a_ty, 1);
   } else if (const LENode* op = cond.as<LENode>()) {
-    if (!op->a.dtype().is_int()) return false;
+    if (op->a.ty().code() != DLDataTypeCode::kDLInt) return false;
     canonical = op->b - op->a;
   } else if (const GTNode* op = cond.as<GTNode>()) {
-    if (!op->a.dtype().is_int()) return false;
-    canonical = op->a - op->b - MakeConst(op->a.dtype(), 1);
+    PrimType a_ty = op->a.ty();
+    if (a_ty.code() != DLDataTypeCode::kDLInt) return false;
+    canonical = op->a - op->b - MakeConst(a_ty, 1);
   } else if (const GENode* op = cond.as<GENode>()) {
-    if (!op->a.dtype().is_int()) return false;
+    if (op->a.ty().code() != DLDataTypeCode::kDLInt) return false;
     canonical = op->a - op->b;
   } else if (const EQNode* op = cond.as<EQNode>()) {
-    if (!op->a.dtype().is_int()) return false;
+    if (op->a.ty().code() != DLDataTypeCode::kDLInt) return false;
     canonical = op->a - op->b;
     is_eq = true;
   } else {
diff --git a/src/arith/int_constraints.cc b/src/arith/int_constraints.cc
index 55db4fc774b6..b517324f378d 100644
--- a/src/arith/int_constraints.cc
+++ b/src/arith/int_constraints.cc
@@ -74,7 +74,8 @@ ffi::Array<PrimExpr> AsConditions(const ffi::Array<Var>& variables,
 
 IntGroupBounds::IntGroupBounds(PrimExpr coef, ffi::Array<PrimExpr> lower,
                                ffi::Array<PrimExpr> equal, ffi::Array<PrimExpr> upper) {
-  TVM_FFI_ICHECK(coef.dtype().is_int() || coef.dtype().is_uint())
+  PrimType coef_ty = coef.ty();
+  TVM_FFI_ICHECK(coef_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))
       << "Coefficient in IntGroupBounds must be integers";
   ffi::ObjectPtr<IntGroupBoundsNode> node = ffi::make_object<IntGroupBoundsNode>();
   node->coef = std::move(coef);
@@ -86,7 +87,7 @@ IntGroupBounds::IntGroupBounds(PrimExpr coef, ffi::Array<PrimExpr> lower,
 
 IntGroupBounds IntGroupBounds::FromRange(const Range& r) {
   Analyzer analyzer;
-  PrimExpr coef = tirx::MakeConst(r->min.dtype(), 1);
+  PrimExpr coef = tirx::MakeConst(r->min.ty(), 1);
   ffi::Array<PrimExpr> equal;
   ffi::Array<PrimExpr> lower;
   ffi::Array<PrimExpr> upper;
@@ -232,7 +233,8 @@ IntConstraints::IntConstraints(ffi::Array<Var> variables, ffi::Map<Var, Range> r
   }
   TVM_FFI_ICHECK(relations.defined());
   for (const auto& var : variables) {
-    TVM_FFI_ICHECK(var.dtype().is_int() || var.dtype().is_uint())
+    PrimType var_ty = var.ty();
+    TVM_FFI_ICHECK(var_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))
         << "Variables in IntConstraints must be integers";
   }
   node->variables = std::move(variables);
diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc
index d7bf32442497..b3d111ffa7a8 100644
--- a/src/arith/int_set.cc
+++ b/src/arith/int_set.cc
@@ -50,8 +50,8 @@ using tirx::MakeConst;
 
 TVM_FFI_STATIC_INIT_BLOCK() { IntervalSetNode::RegisterReflection(); }
 
-PrimExpr SymbolicLimits::pos_inf_ = Var("pos_inf", DataType::Handle());
-PrimExpr SymbolicLimits::neg_inf_ = Var("neg_inf", DataType::Handle());
+PrimExpr SymbolicLimits::pos_inf_ = Var("pos_inf", PrimType::Handle());
+PrimExpr SymbolicLimits::neg_inf_ = Var("neg_inf", PrimType::Handle());
 
 IntervalSet::IntervalSet(PrimExpr min_value, PrimExpr max_value) {
   auto node = ffi::make_object<IntervalSetNode>();
@@ -72,8 +72,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 IntervalSet Intersect(AnalyzerObj* analyzer, IntervalSet a, IntervalSet b) {
   PrimExpr max_value = min(a->max_value, b->max_value);
   PrimExpr min_value = max(a->min_value, b->min_value);
-  if ((max_value.dtype().is_int() || max_value.dtype().is_uint()) &&
-      (min_value.dtype().is_int() || min_value.dtype().is_uint()) &&
+  PrimType max_ty = max_value.ty();
+  PrimType min_ty = min_value.ty();
+  if (max_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt) &&
+      min_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt) &&
       analyzer->CanProve(max_value < min_value)) {
     return IntervalSet::Empty();
   } else {
@@ -121,7 +123,7 @@ TVM_DECLARE_LOGICAL_OP(Not);
  */
 template <typename Op, typename OpNode>
 inline IntervalSet Combine(AnalyzerObj* analyzer, IntervalSet a, IntervalSet b, const OpNode* op) {
-  DataType dtype = op->dtype;
+  PrimType dtype = op->ty();
   if (a->IsSinglePoint() && b->IsSinglePoint()) {
     PrimExpr expr;
     if (auto res = TryConstFold<Op>(a->min_value, b->min_value)) {
@@ -195,7 +197,7 @@ inline IntervalSet Combine<tirx::Mul>(AnalyzerObj* analyzer, IntervalSet a, Inte
       return IntervalSet(min_value, max_value);
     } else if (a->HasUpperBound() && a->HasLowerBound()) {
       using tirx::Select;
-      PrimExpr sign = b->min_value >= IntImm(b->min_value.dtype().element_of(), 0);
+      PrimExpr sign = b->min_value >= IntImm(b->min_value.ty().WithLanes(1), 0);
       PrimExpr e1 = a->min_value * b->min_value;
       PrimExpr e2 = a->max_value * b->min_value;
       return IntervalSet(Select(sign, e1, e2), Select(sign, e2, e1));
@@ -229,7 +231,7 @@ inline IntervalSet Combine<tirx::Div>(AnalyzerObj* analyzer, IntervalSet a, Inte
       return IntervalSet(min_value, max_value);
     } else if (a->HasUpperBound() && a->HasLowerBound()) {
       using tirx::Select;
-      PrimExpr sign = b->min_value >= IntImm(b->min_value.dtype().element_of(), 0);
+      PrimExpr sign = b->min_value >= IntImm(b->min_value.ty().WithLanes(1), 0);
       PrimExpr e1 = a->min_value / b->min_value;
       PrimExpr e2 = a->max_value / b->min_value;
       return IntervalSet(Select(sign, e1, e2), Select(sign, e2, e1));
@@ -258,7 +260,7 @@ inline IntervalSet Combine<tirx::Mod>(AnalyzerObj* analyzer, IntervalSet a, Inte
     // is the case of our application.
     // TODO(tqchen): add bound constraints for a.
     if (analyzer->CanProveGreaterEqual(divisor, 0)) {
-      return IntervalSet(IntImm(divisor.dtype(), 0), divisor - 1);
+      return IntervalSet(IntImm(divisor.ty(), 0), divisor - 1);
     } else {
       PrimExpr bound = abs(divisor) - 1;
       return IntervalSet(-bound, bound);
@@ -292,7 +294,7 @@ inline IntervalSet Combine<tirx::FloorDiv>(AnalyzerObj* analyzer, IntervalSet a,
       return IntervalSet(min_value, max_value);
     } else if (a->HasUpperBound() && a->HasLowerBound()) {
       using tirx::Select;
-      PrimExpr sign = b->min_value >= IntImm(b->min_value.dtype().element_of(), 0);
+      PrimExpr sign = b->min_value >= IntImm(b->min_value.ty().WithLanes(1), 0);
       PrimExpr e1 = floordiv(a->min_value, b->min_value);
       PrimExpr e2 = floordiv(a->max_value, b->min_value);
       return IntervalSet(Select(sign, e1, e2), Select(sign, e2, e1));
@@ -323,7 +325,7 @@ inline IntervalSet Combine<tirx::FloorMod>(AnalyzerObj* analyzer, IntervalSet a,
         auto qmin = a->HasLowerBound() ? floordiv(a->min_value, divisor) : neg_inf();
         // We can compare +/- inf against each other, but cannot use
         // operator== between the symbolic limits and an integer.
-        bool compatible_dtypes = !(qmin.dtype().is_handle() ^ qmax.dtype().is_handle());
+        bool compatible_dtypes = !(qmin.ty().IsHandle() ^ qmax.ty().IsHandle());
         if (compatible_dtypes && analyzer->CanProve(qmax == qmin)) {
           auto tmax = a->max_value - divisor * qmin;
           auto tmin = a->min_value - divisor * qmin;
@@ -348,12 +350,13 @@ inline IntervalSet Combine<tirx::FloorMod>(AnalyzerObj* analyzer, IntervalSet a,
             int64_t max_mod_result = max_quotient * gcd + (dividend_mod->base % gcd);
 
             if (max_mod_result >= 0 && max_mod_result < div_val) {
-              return IntervalSet(IntImm(op->dtype, 0), IntImm(op->dtype, max_mod_result));
+              PrimType result_ty = op->ty();
+              return IntervalSet(IntImm(result_ty, 0), IntImm(result_ty, max_mod_result));
             }
           }
         }
       }
-      return IntervalSet(IntImm(divisor.dtype(), 0), divisor - 1);
+      return IntervalSet(IntImm(divisor.ty(), 0), divisor - 1);
     } else {
       PrimExpr bound = abs(divisor) - 1;
       return IntervalSet(-bound, bound);
@@ -522,7 +525,7 @@ class IntervalSetEvaluator : public ExprFunctor<IntervalSet(const PrimExpr&)> {
     IntervalSet base = Eval(op->base);
     PVar<IntImm> stride;
     if (stride.Match(op->stride)) {
-      DataType t = op->base.dtype();
+      PrimType t = op->base.ty();
       int64_t vstride = stride.Eval()->value;
       if (op->lanes->IsInstance<IntImmNode>()) {
         int lanes = static_cast<int>(op->lanes.as_or_throw<IntImm>()->value);
@@ -569,18 +572,19 @@ class IntervalSetEvaluator : public ExprFunctor<IntervalSet(const PrimExpr&)> {
     // short cut for the int set.
     if (value_set->min_value.same_as(value_set->max_value)) {
       if (value_set->IsEmpty()) return value_set;
-      return IntervalSet::SinglePoint(cast(op->dtype, value_set->min_value));
+      return IntervalSet::SinglePoint(cast(op->ty(), value_set->min_value));
     }
     PrimExpr min_value =
-        value_set->HasLowerBound() ? cast(op->dtype, value_set->min_value) : neg_inf();
+        value_set->HasLowerBound() ? cast(op->ty(), value_set->min_value) : neg_inf();
     PrimExpr max_value =
-        value_set->HasUpperBound() ? cast(op->dtype, value_set->max_value) : pos_inf();
+        value_set->HasUpperBound() ? cast(op->ty(), value_set->max_value) : pos_inf();
     return IntervalSet(min_value, max_value);
   }
 
   IntervalSet VisitExpr_(const BufferLoadNode* op) final {
-    if (!(op->dtype.is_int() || op->dtype.is_uint())) {
-      DLOG(WARNING) << "cannot evaluate set BufferLoad which loads from a " << op->dtype
+    PrimType op_ty = op->ty();
+    if (!op_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
+      DLOG(WARNING) << "cannot evaluate set BufferLoad which loads from a " << op_ty->dtype
                     << " buffer";
       return IntervalSet::Everything();
     }
@@ -1048,7 +1052,7 @@ IntSet EvalSet(PrimExpr e, const ffi::Map<Var, IntSet>& dom_map) {
 
 IntSet IntSet::Vector(PrimExpr x) {
   // short cut: simply get single point
-  if (!x.dtype().is_scalable_or_fixed_length_vector()) {
+  if (!x.ty().IsScalableVector() && !x.ty().IsFixedLengthVector()) {
     return IntSet::SinglePoint(x);
   } else {
     // vector case.
@@ -1068,7 +1072,9 @@ IntSet EvalSet(PrimExpr e, const std::unordered_map<const VarNode*, IntSet>& dom
 
 IntSet EvalSet(Range r, const ffi::Map<Var, IntSet>& dom_map) {
   Analyzer ana;
-  if ((r->min->dtype.is_int() || r->min->dtype.is_uint()) && ana->CanProveEqual(r->extent, 1)) {
+  PrimType min_ty = r->min.ty();
+  if (min_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt) &&
+      ana->CanProveEqual(r->extent, 1)) {
     return EvalSet(r->min, dom_map);
   }
   IntervalSetEvaluator m(ana.get(), dom_map);
diff --git a/src/arith/ir_mutator_with_analyzer.cc b/src/arith/ir_mutator_with_analyzer.cc
index 8dcef7a75a80..d6a264288b16 100644
--- a/src/arith/ir_mutator_with_analyzer.cc
+++ b/src/arith/ir_mutator_with_analyzer.cc
@@ -54,7 +54,7 @@ void AppendFloorDivConstraints(const FloorDivNode* div, int64_t value, CompareKi
   int64_t divisor_value = 0;
   if (!TryGetIntImm(div->b, &divisor_value) || divisor_value <= 0) return;
 
-  DataType dtype = div->a.dtype();
+  PrimType dtype = div->a.ty();
   PrimExpr divisor = MakeConst(dtype, divisor_value);
   PrimExpr k = MakeConst(dtype, value);
   PrimExpr lo = k * divisor;
@@ -117,7 +117,8 @@ void CollectDerivedConstraintFacts(const PrimExpr& condition, std::vector<PrimEx
   }
   if (const auto* call = condition.as<CallNode>()) {
     if (call->op.same_as(tirx::builtin::bitwise_and()) && call->args.size() == 2 &&
-        call->args[0].dtype().is_bool() && call->args[1].dtype().is_bool()) {
+        call->args[0].ty().MatchesElementType(DLDataTypeCode::kDLBool, 8) &&
+        call->args[1].ty().MatchesElementType(DLDataTypeCode::kDLBool, 8)) {
       CollectDerivedConstraintFacts(call->args[0], out);
       CollectDerivedConstraintFacts(call->args[1], out);
       return;
@@ -260,7 +261,7 @@ Stmt IRMutatorWithAnalyzer::VisitStmt_(const AttrStmtNode* op) {
     if (op->attr_key == tirx::attr::thread_extent || op->attr_key == s_tir::attr::virtual_thread) {
       IterVar iv = op->node.as_or_throw<IterVar>();
       TVM_FFI_ICHECK_NE(iv->thread_tag.length(), 0U);
-      Range dom = Range::FromMinExtent(IntImm(op->value.dtype(), 0), op->value);
+      Range dom = Range::FromMinExtent(IntImm(op->value.ty(), 0), op->value);
       analyzer_->Bind(iv->var, dom);
       iter_vars_.Set(iv->var, dom);
     }
@@ -313,7 +314,7 @@ PrimExpr IRMutatorWithAnalyzer::VisitExpr_(const CallNode* op) {
         false_value.same_as(op->args[2])) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      return Call(op->dtype, op->op, {cond, true_value, false_value}, op->attrs, op->span);
+      return Call(op->ty(), op->op, {cond, true_value, false_value}, op->attrs, op->span);
     }
   }
   return StmtExprMutator::VisitExpr_(op);
diff --git a/src/arith/ir_visitor_with_analyzer.cc b/src/arith/ir_visitor_with_analyzer.cc
index ffe9c73bd6f2..0313dbfe4271 100644
--- a/src/arith/ir_visitor_with_analyzer.cc
+++ b/src/arith/ir_visitor_with_analyzer.cc
@@ -79,7 +79,7 @@ void IRVisitorWithAnalyzer::VisitStmt_(const AttrStmtNode* op) {
     if (op->attr_key == tirx::attr::thread_extent || op->attr_key == s_tir::attr::virtual_thread) {
       IterVar iv = op->node.as_or_throw<IterVar>();
       TVM_FFI_ICHECK_NE(iv->thread_tag.length(), 0U);
-      analyzer_->Bind(iv->var, Range::FromMinExtent(IntImm(op->value->dtype, 0), op->value));
+      analyzer_->Bind(iv->var, Range::FromMinExtent(IntImm(op->value.ty(), 0), op->value));
     }
     StmtExprVisitor::VisitStmt_(op);
   });
diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc
index c7f8819f944f..430a4ec5c839 100644
--- a/src/arith/iter_affine_map.cc
+++ b/src/arith/iter_affine_map.cc
@@ -66,8 +66,8 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 IterSplitExpr::IterSplitExpr(IterMark source) {
   auto n = ffi::make_object<IterSplitExprNode>();
-  auto one = MakeConst(source->source->dtype, 1);
-  n->dtype = source->source->dtype;
+  auto one = MakeConst(source->source.ty(), 1);
+  n->BaseExprNode::ty = source->source.ty();
   n->source = std::move(source);
   n->extent = n->source->extent;
   n->lower_factor = one;
@@ -77,8 +77,8 @@ IterSplitExpr::IterSplitExpr(IterMark source) {
 
 IterSplitExpr::IterSplitExpr(IterMark source, PrimExpr scale) {
   auto n = ffi::make_object<IterSplitExprNode>();
-  auto one = MakeConst(source->source->dtype, 1);
-  n->dtype = source->source->dtype;
+  auto one = MakeConst(source->source.ty(), 1);
+  n->BaseExprNode::ty = source->source.ty();
   n->source = std::move(source);
   n->extent = n->source->extent;
   n->lower_factor = one;
@@ -89,7 +89,7 @@ IterSplitExpr::IterSplitExpr(IterMark source, PrimExpr scale) {
 IterSplitExpr::IterSplitExpr(IterMark source, PrimExpr lower_factor, PrimExpr extent,
                              PrimExpr scale) {
   auto n = ffi::make_object<IterSplitExprNode>();
-  n->dtype = source->source->dtype;
+  n->BaseExprNode::ty = source->source.ty();
   n->source = std::move(source);
   n->lower_factor = std::move(lower_factor);
   n->extent = std::move(extent);
@@ -109,7 +109,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 IterSumExpr::IterSumExpr(ffi::Array<IterSplitExpr> args, PrimExpr base) {
   auto n = ffi::make_object<IterSumExprNode>();
-  n->dtype = base->dtype;
+  n->BaseExprNode::ty = base.ty();
   n->args = std::move(args);
   n->base = std::move(base);
   data_ = std::move(n);
@@ -563,7 +563,7 @@ class IterMapRewriter : public ExprMutator {
                                                IterMapLevel check_level) {
     std::vector<bool> used(splits.size(), false);
     std::vector<IterSplitExpr> iters;
-    PrimExpr expected_lower_factor = MakeConst(mark->source->dtype, 1);
+    PrimExpr expected_lower_factor = MakeConst(mark->source.ty(), 1);
 
     for (size_t i = 0; i < splits.size(); ++i) {
       size_t j = 0;
@@ -694,7 +694,7 @@ class IterMapRewriter : public ExprMutator {
       PrimExpr iter_min = mark_offset;
       PrimExpr iter_max = iter_min + mark->extent;
       // the delta of iter_min when it is updated when the lower bound predicate is present
-      PrimExpr iter_min_delta = IntImm(iter_min.dtype(), 0);
+      PrimExpr iter_min_delta = IntImm(iter_min.ty(), 0);
       if (predicate_induced_min.defined()) {
         iter_min_delta = max(predicate_induced_min.value(), iter_min) - iter_min;
         iter_min = max(predicate_induced_min.value(), iter_min);
@@ -788,7 +788,7 @@ class IterMapRewriter : public ExprMutator {
     for (IterSplitExpr split : expr->args) {
       int64_t symbol_prod_count = 0;
       int64_t cscale = 1;
-      PrimExpr res = tirx::MakeConst(split.dtype(), 1);
+      PrimExpr res = tirx::MakeConst(split.ty(), 1);
       auto fcollect = [&](PrimExpr val) {
         if (const auto* intimm = val.as<IntImmNode>()) {
           cscale *= intimm->value;
@@ -799,7 +799,7 @@ class IterMapRewriter : public ExprMutator {
       };
       UnpackReduction<tirx::MulNode>(split->scale, fcollect);
       if (cscale != 1) {
-        res = res * tirx::MakeConst(res.dtype(), cscale);
+        res = res * tirx::MakeConst(res.ty(), cscale);
       }
       split.CopyOnWrite()->scale = res;
       items.emplace_back(Item{cscale, symbol_prod_count, split});
@@ -830,7 +830,7 @@ class IterMapRewriter : public ExprMutator {
     if (auto op = expr.as<IterSumExpr>()) {
       return op.value();
     } else if (auto op = expr.as<IterSplitExpr>()) {
-      return IterSumExpr({op.value()}, IntImm(expr->dtype, 0));
+      return IterSumExpr({op.value()}, IntImm(expr.ty(), 0));
     } else {
       TVM_FFI_ICHECK(!expr->IsInstance<IterMapExprNode>());
       return IterSumExpr({}, expr);
@@ -1103,8 +1103,8 @@ class IterMapRewriter : public ExprMutator {
     std::vector<IterSplitExpr> flattened_iters, grouped_iters;
 
     // check if it can be remapped into a fused pattern.
-    PrimExpr expected_extra_base = IntImm(expr.dtype(), 0);
-    PrimExpr tail_extent = IntImm(expr.dtype(), 0);
+    PrimExpr expected_extra_base = IntImm(expr.ty(), 0);
+    PrimExpr tail_extent = IntImm(expr.ty(), 0);
     PrimExpr expected_scale = base_scale;
     int first_possible_unit_extent_pos = FindFirstPossibleUnitExtentIndex(expr);
 
@@ -1200,10 +1200,10 @@ class IterMapRewriter : public ExprMutator {
     IterSumExpr structured_form = expr, flattened_form = expr;
     flattened_form.CopyOnWrite()->args =
         ffi::Array<IterSplitExpr>(flattened_iters.rbegin(), flattened_iters.rend());
-    flattened_form.CopyOnWrite()->base = IntImm(expr.dtype(), 0);
+    flattened_form.CopyOnWrite()->base = IntImm(expr.ty(), 0);
     structured_form.CopyOnWrite()->args =
         ffi::Array<IterSplitExpr>(grouped_iters.rbegin(), grouped_iters.rend());
-    structured_form.CopyOnWrite()->base = IntImm(expr.dtype(), 0);
+    structured_form.CopyOnWrite()->base = IntImm(expr.ty(), 0);
     auto it = sum_fuse_map_.find(flattened_form);
     if (it != sum_fuse_map_.end()) {
       // old iter
@@ -1245,7 +1245,7 @@ class IterMapRewriter : public ExprMutator {
     if (sign > 0) {
       lhs->args.push_back(rhs);
     } else {
-      rhs.CopyOnWrite()->scale = IntImm(rhs->scale.dtype(), 0) - rhs->scale;
+      rhs.CopyOnWrite()->scale = IntImm(rhs->scale.ty(), 0) - rhs->scale;
       lhs->args.push_back(rhs);
     }
   }
@@ -1332,8 +1332,10 @@ bool MatchBoundConstraints(PrimExpr pred, ffi::Map<Var, Range>* input_iters,
     PrimExpr lhs_expr = lhs.Eval();
     PrimExpr rhs_expr = rhs.Eval();
     // we only accept predicate of integers
-    if (!((lhs_expr->dtype.is_int() || lhs_expr->dtype.is_uint()) &&
-          (rhs_expr->dtype.is_int() || rhs_expr->dtype.is_uint()))) {
+    PrimType lhs_ty = lhs_expr.ty();
+    PrimType rhs_ty = rhs_expr.ty();
+    if (!(lhs_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt) &&
+          rhs_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))) {
       return false;
     }
     // determine iter and bound, if we can not distinguish them simply,
@@ -1563,7 +1565,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const VarNode* op) {
 }
 
 PrimExpr IterMapRewriter::VisitExpr_(const AddNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Parent::VisitExpr_(op);
   }
   PrimExpr a = this->DirectMutate(op->a);
@@ -1596,7 +1598,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const AddNode* op) {
 }
 
 PrimExpr IterMapRewriter::VisitExpr_(const SubNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Parent::VisitExpr_(op);
   }
 
@@ -1631,7 +1633,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const SubNode* op) {
 }
 
 PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Parent::VisitExpr_(op);
   }
   // normalize
@@ -1677,7 +1679,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) {
 IterSumExpr IterMapRewriter::PreprocessDividend(IterMapExpr dividend, PrimExpr original_dividend) {
   if (dividend->IsInstance<IterSplitExprNode>()) {
     auto split = dividend.as_or_throw<IterSplitExpr>();
-    return IterSumExpr({split}, IntImm(split.dtype(), 0));
+    return IterSumExpr({split}, IntImm(split.ty(), 0));
   } else if (dividend->IsInstance<IterSumExprNode>()) {
     auto sum = dividend.as_or_throw<IterSumExpr>();
     if (sum->args.empty()) {
@@ -1880,12 +1882,12 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr base, P
     } else if (CanProveDivisible(rhs, lhs->scale) && is_zero(base)) {
       // floordiv(x*c1, c1*c2) = floordiv(x, c2), c2=rhs/scale
       rhs = floordiv(rhs, lhs->scale);
-      lhs.CopyOnWrite()->scale = MakeConst(rhs->dtype, 1);
+      lhs.CopyOnWrite()->scale = MakeConst(rhs.ty(), 1);
     } else if (CanProveDivisible(rhs, lhs->scale) && CanProveDivisible(base, lhs->scale)) {
       // floordiv(x*c1 + y*c1, c1*c2) = floordiv(x+y, c2), c2=rhs/scale
       base = floordiv(base, lhs->scale);
       rhs = floordiv(rhs, lhs->scale);
-      lhs.CopyOnWrite()->scale = MakeConst(rhs->dtype, 1);
+      lhs.CopyOnWrite()->scale = MakeConst(rhs.ty(), 1);
     } else {
       // mark as unresolved.
       ErrorLogger(this) << "Cannot represent as IterMap: the numerator's scaling factor, "
@@ -1931,7 +1933,7 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr base, P
     new_split = IterSplitExpr(IterMark(padded, padded->extent),
                               /* lower_factor = */ rhs,
                               /* extent = */ analyzer_->Simplify(ceildiv(padded->extent, rhs)),
-                              /* scale = */ MakeConst(rhs->dtype, 1));
+                              /* scale = */ MakeConst(rhs.ty(), 1));
   }
 
   auto new_base = analyzer_->Simplify(floordiv(base - left_pad, rhs), 6);
@@ -1944,7 +1946,7 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr base, P
 }
 
 PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Parent::VisitExpr_(op);
   }
 
@@ -1987,13 +1989,13 @@ PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr base, P
 
   if (is_one(rhs)) {
     // floormod(x, 1) = 0
-    return IntImm(lhs->dtype, 0);
+    return IntImm(lhs.ty(), 0);
   }
 
   if (!is_one(lhs->scale)) {
     if (CanProveDivisible(lhs->scale, rhs) && CanProveDivisible(base, rhs)) {
       // floormod(x*c1*c2, c1) = 0
-      return IntImm(lhs->dtype, 0);
+      return IntImm(lhs.ty(), 0);
     } else if (CanProveDivisible(rhs, lhs->scale) && is_zero(base)) {
       // floormod(x*c1, c1*c2) = (floormod(x, c2)) * c1, where c2 = rhs/scale
       rhs = floordiv(rhs, lhs->scale);
@@ -2028,7 +2030,7 @@ PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr base, P
 }
 
 PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Parent::VisitExpr_(op);
   }
 
@@ -2113,7 +2115,7 @@ class IterMapToExprNormalizer : public ExprMutator {
       // simplify trivial iters like `vi \in [0, 1)`, which can be useful for subsequent analysis
       // like tensorization.
       if (is_one(expr->extent) && !is_one(expr->source->extent)) {
-        return IntImm(expr->extent->dtype, 0);
+        return IntImm(expr->extent.ty(), 0);
       }
       return floordiv(source, expr->lower_factor) * expr->scale;
     } else {
@@ -2255,13 +2257,13 @@ class SubspaceDivider {
     IterSplitExpr GetInnerAsSplit() const { return GetAsSplit(inner, inner_extent); }
 
     static DivisionResult Inner(const IterMapExpr& iter, const PrimExpr& extent) {
-      auto dtype = iter.dtype();
+      PrimType dtype = iter.ty();
       return DivisionResult(IterSumExpr({}, IntImm(dtype, 0)), IntImm(dtype, 1), iter, extent,
                             Kind::kInner);
     }
 
     static DivisionResult Outer(const IterMapExpr& iter, const PrimExpr& extent) {
-      auto dtype = iter.dtype();
+      PrimType dtype = iter.ty();
       return DivisionResult(iter, extent, IterSumExpr({}, IntImm(dtype, 0)), IntImm(dtype, 1),
                             Kind::kOuter);
     }
@@ -2285,7 +2287,7 @@ class SubspaceDivider {
 
   // Divide an IterSumExpr
   DivisionResult DivideIterSumExpr(const IterSumExpr& expr, const PrimExpr& mark_extent) {
-    auto dtype = expr.dtype();
+    PrimType dtype = expr.ty();
     if (expr->args.empty()) {
       // base
       return DivisionResult(IterSumExpr({}, IntImm(dtype, 0)), IntImm(dtype, 1),
@@ -2377,7 +2379,7 @@ class SubspaceDivider {
   // args are sorted from inner to outer
   static IterMark MarkFromArgsAndBase(const std::vector<IterSplitExpr>& args, PrimExpr base) {
     std::vector<IterSplitExpr> res;
-    PrimExpr extent = MakeConst(base.dtype(), 1);
+    PrimExpr extent = MakeConst(base.ty(), 1);
     for (const IterSplitExpr& it : args) {
       IterSplitExpr arg = it;
       arg.CopyOnWrite()->scale = extent;
@@ -2431,7 +2433,7 @@ class SubspaceDivider {
       bool encountered_boundary = mark_division.IsOuter();
       std::vector<bool> used(splits.size(), false);
       std::vector<IterSplitExpr> inner_iters, outer_iters;
-      PrimExpr expected_lower_factor = MakeConst(expr->source->source->dtype, 1);
+      PrimExpr expected_lower_factor = MakeConst(expr->source->source.ty(), 1);
       // find the boundary of outer and inner, like case 1 above
       for (size_t i = 0; i < splits.size(); ++i) {
         size_t j = 0;
diff --git a/src/arith/pattern_match.h b/src/arith/pattern_match.h
index bb1ebd54cca7..dda8e704cfed 100644
--- a/src/arith/pattern_match.h
+++ b/src/arith/pattern_match.h
@@ -199,7 +199,10 @@ class PVar : public Pattern<PVar<T>> {
   // Store PVars by reference in the expression.
   using Nested = const PVar<T>&;
 
-  void InitMatch_() const { filled_ = false; }
+  void InitMatch_() const {
+    value_ = nullptr;
+    filled_ = false;
+  }
 
   bool Match_(const T& value) const {
     if (!filled_) {
@@ -207,7 +210,7 @@ class PVar : public Pattern<PVar<T>> {
       filled_ = true;
       return true;
     } else {
-      return PEqualChecker<T>()(value_, value);
+      return PEqualChecker<T>()(value_.value(), value);
     }
   }
 
@@ -223,14 +226,14 @@ class PVar : public Pattern<PVar<T>> {
 
   T Eval() const {
     TVM_FFI_ICHECK(filled_);
-    return value_;
+    return value_.value();
   }
 
-  T EvalOr(const T& default_value) const { return filled_ ? value_ : default_value; }
+  T EvalOr(const T& default_value) const { return filled_ ? value_.value() : default_value; }
 
  protected:
   /*! \brief The matched value */
-  mutable T value_;
+  mutable ffi::Optional<T> value_;
   /*! \brief whether the variable has been filled */
   mutable bool filled_{false};
 };
@@ -282,7 +285,7 @@ class PVarWithDataType : public PVarWithCheck<PVarWithDataType<T, DType>, T> {
  public:
   explicit PVarWithDataType(const DType& dtype) : dtype_(dtype) {}
 
-  bool Match_(const T& value) const { return dtype_.Match_(value->dtype); }
+  bool Match_(const T& value) const { return dtype_.Match_(value.ty()); }
 
  protected:
   typename DType::Nested dtype_;
@@ -291,15 +294,15 @@ class PVarWithDataType : public PVarWithCheck<PVarWithDataType<T, DType>, T> {
 /*!
  * \brief Pattern variable container for data type with lanes.
  */
-class PVecDataType : public PVarWithCheck<PVecDataType, DataType> {
+class PVecDataType : public PVarWithCheck<PVecDataType, PrimType> {
  public:
   /*! \brief construct vector dtype placeholder with element type check */
-  explicit PVecDataType(const DataType& elem_dtype) : elem_dtype_(elem_dtype) {}
+  explicit PVecDataType(PrimType elem_dtype) : elem_dtype_(elem_dtype) {}
 
-  bool Match_(const DataType& dtype) const { return dtype.code() == elem_dtype_.code(); }
+  bool Match_(PrimType dtype) const { return dtype.code() == elem_dtype_.code(); }
 
  protected:
-  DataType elem_dtype_;
+  PrimType elem_dtype_;
 };
 
 /*!
@@ -377,7 +380,7 @@ class PConstWithTypeLike : public Pattern<PConstWithTypeLike<TA>> {
     }
   }
 
-  PrimExpr Eval() const { return tirx::MakeConst(ref_.Eval().dtype(), value_); }
+  PrimExpr Eval() const { return tirx::MakeConst(ref_.Eval().ty(), value_); }
 
  private:
   typename TA::Nested ref_;
@@ -540,7 +543,7 @@ class PCastExpr : public Pattern<PCastExpr<DType, TA>> {
 
   bool Match_(const ffi::ObjectRef& node) const {
     if (const tirx::CastNode* ptr = node.as<tirx::CastNode>()) {
-      if (!dtype_.Match_(ptr->dtype)) return false;
+      if (!dtype_.Match_(ptr->ty())) return false;
       if (!value_.Match_(ptr->value)) return false;
       return true;
     } else {
@@ -558,7 +561,7 @@ class PCastExpr : public Pattern<PCastExpr<DType, TA>> {
 /*!
  * \brief Construct a cast pattern.
  *
- * \param dtype The target data type, can be PVar<DataType> or PConst<DataType>.
+ * \param dtype The target data type, can be PVar<PrimType> or PConst<PrimType>.
  * \param value The input type.
  *
  * \return The result pattern.
@@ -780,7 +783,7 @@ class PCallExpr : public Pattern<PCallExpr<Op, TArgs...>> {
 #define TVM_PATTERN_BINARY_INTRIN(FuncName, OpName, IntrinOpName)                         \
   struct OpName {                                                                         \
     static PrimExpr Eval(ffi::Array<PrimExpr> args) {                                     \
-      return tirx::Call(args[0].dtype(), GetOp(), args);                                  \
+      return tirx::Call(args[0].ty(), GetOp(), args);                                     \
     }                                                                                     \
     static const Op& GetOp() { return tirx::builtin::IntrinOpName(); }                    \
   };                                                                                      \
@@ -799,7 +802,7 @@ TVM_PATTERN_BINARY_INTRIN(operator^, PBitwiseXorOp, bitwise_xor);
 #define TVM_PATTERN_UNARY_INTRIN(FuncName, OpName, IntrinOpName)       \
   struct OpName {                                                      \
     static PrimExpr Eval(ffi::Array<PrimExpr> args) {                  \
-      return tirx::Call(args[0].dtype(), GetOp(), args);               \
+      return tirx::Call(args[0].ty(), GetOp(), args);                  \
     }                                                                  \
     static const Op& GetOp() { return tirx::builtin::IntrinOpName(); } \
   };                                                                   \
@@ -813,7 +816,7 @@ TVM_PATTERN_UNARY_INTRIN(operator~, PBitwiseNotOp, bitwise_not);
 // if_then_else
 struct PIfThenElseOp {
   static PrimExpr Eval(ffi::Array<PrimExpr> args) {
-    return tirx::Call(args[1].dtype(), GetOp(), args);
+    return tirx::Call(args[1].ty(), GetOp(), args);
   }
   static const Op& GetOp() { return tirx::builtin::if_then_else(); }
 };
@@ -841,7 +844,7 @@ inline PCallExpr<PIfThenElseOp, TCond, TA, TB> if_then_else(const Pattern<TCond>
 
 // vscale
 struct PVscaleOp {
-  static PrimExpr Eval() { return tirx::Call(DataType::Int(32), GetOp(), {}); }
+  static PrimExpr Eval() { return tirx::Call(PrimType::Int(32), GetOp(), {}); }
   static const Op& GetOp() { return tirx::builtin::vscale(); }
 };
 
diff --git a/src/arith/product_normal_form.h b/src/arith/product_normal_form.h
index 40d02c1952b7..79e040287fa7 100644
--- a/src/arith/product_normal_form.h
+++ b/src/arith/product_normal_form.h
@@ -79,7 +79,8 @@ inline void UnpackSum(const PrimExpr& value, FLeaf fleaf, int sign = 1) {
  */
 inline PrimExpr MulAndNormalize(const PrimExpr& lhs, const PrimExpr& rhs) {
   int64_t cscale = 1;
-  PrimExpr res = tirx::MakeConst(lhs.dtype(), 1);
+  PrimType lhs_ty = lhs.ty();
+  PrimExpr res = tirx::MakeConst(lhs_ty, 1);
   auto fcollect = [&](PrimExpr val) {
     if (const auto* intimm = val.as<IntImmNode>()) {
       cscale *= intimm->value;
@@ -90,7 +91,7 @@ inline PrimExpr MulAndNormalize(const PrimExpr& lhs, const PrimExpr& rhs) {
   UnpackReduction<tirx::MulNode>(lhs, fcollect);
   UnpackReduction<tirx::MulNode>(rhs, fcollect);
   if (cscale != 1) {
-    res = res * tirx::MakeConst(res.dtype(), cscale);
+    res = res * tirx::MakeConst(res.ty(), cscale);
   }
   return res;
 }
diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index fa3ba0b519d6..07ea2c7a7778 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -425,7 +425,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AddNode* op) {
   // Pattern var for lanes in broadcast and ramp
   PVar<PrimExpr> lanes;
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(ramp(b1, s1, lanes) + ramp(b2, s2, lanes), ramp(b1 + b2, s1 + s2, lanes));
     TVM_TRY_REWRITE(ramp(b1, s1, lanes) + broadcast(x, lanes), ramp(b1 + x, s1, lanes));
     TVM_TRY_REWRITE(broadcast(x, lanes) + ramp(b1, s1, lanes), ramp(x + b1, s1, lanes));
@@ -433,7 +433,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AddNode* op) {
     TVM_TRY_REWRITE_IF(x + broadcast(c4, lanes), x, c4.Eval()->value == 0.0f);
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Index rules
     // cancelation rules
     TVM_TRY_REWRITE((x - y) + y, x);
@@ -535,7 +535,7 @@ std::function<void()> RewriteSimplifier::Impl::EnterConstraint(const PrimExpr& c
     if (SideEffect(subconstraint) <= CallEffectKind::kPure) {
       literal_constraints_.push_back(subconstraint);
       PrimExpr negation;
-      if (subconstraint.dtype().is_bool()) {
+      if (subconstraint.ty().MatchesElementType(DLDataTypeCode::kDLBool, 8)) {
         // We could apply NormalizeBooleanOperators during
         // TryMatchLiteralConstraint, but that would require
         // performing a rewrite of each expression being checked.
@@ -543,7 +543,7 @@ std::function<void()> RewriteSimplifier::Impl::EnterConstraint(const PrimExpr& c
         // applied.
         negation = NormalizeBooleanOperators(Not(subconstraint));
       } else {
-        negation = subconstraint == IntImm(subconstraint.dtype(), 0);
+        negation = subconstraint == IntImm(subconstraint.ty(), 0);
       }
       literal_constraints_.push_back(Not(negation));
     }
@@ -575,14 +575,14 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const SubNode* op) {
   PVar<PrimExpr> lanes;
 
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(ramp(b1, s1, lanes) - ramp(b2, s2, lanes), ramp(b1 - b2, s1 - s2, lanes));
     TVM_TRY_REWRITE(ramp(b1, s1, lanes) - broadcast(x, lanes), ramp(b1 - x, s1, lanes));
     TVM_TRY_REWRITE(broadcast(x, lanes) - ramp(b1, s1, lanes), ramp(x - b1, 0 - s1, lanes));
     TVM_TRY_REWRITE(broadcast(x, lanes) - broadcast(y, lanes), broadcast(x - y, lanes));
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Index rules
     // cancelation rules
     TVM_TRY_REWRITE(matches_one_of((x + y) - y, (y + x) - y), x);
@@ -765,7 +765,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MulNode* op) {
   // Pattern var for lanes in broadcast and ramp
   PVar<PrimExpr> lanes;
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(broadcast(x, lanes) * broadcast(y, lanes), broadcast(x * y, lanes));
     TVM_TRY_REWRITE(matches_one_of(ramp(b1, s1, lanes) * broadcast(x, lanes),
                                    broadcast(x, lanes) * ramp(b1, s1, lanes)),
@@ -773,7 +773,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MulNode* op) {
     TVM_TRY_REWRITE_IF(broadcast(c3, lanes) * x, broadcast(c3, lanes), c3.Eval()->value == 0.0f);
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // constant simplification rule
     TVM_TRY_REWRITE((x + c1) * c2, x * c2 + c1 * c2);
     TVM_TRY_REWRITE((x * c1) * c2, x * (c1 * c2));
@@ -803,7 +803,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) {
   PVar<PrimExpr> lanes;
 
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     // NOTE: use div as the pattern also works for float.
     TVM_TRY_REWRITE(div(broadcast(x, lanes), broadcast(y, lanes)), broadcast(div(x, y), lanes));
     // ramp / bcast
@@ -827,7 +827,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) {
     }
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Be-aware of the division rules:
     // We adopt the default C division uses truncation instead of floordiv.
     // This means most rules need to check non-negativeness of the operands.
@@ -839,7 +839,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) {
     if (truncdiv(c1, c2).Match(ret)) {
       int64_t c1val = c1.Eval()->value;
       int64_t c2val = c2.Eval()->value;
-      return MakeConst(op->dtype, truncdiv(c1val, c2val));
+      return MakeConst(op->ty(), truncdiv(c1val, c2val));
     }
 
     // while it is always true for trunc div
@@ -957,7 +957,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const ModNode* op) {
   PVar<PrimExpr> lanes;
 
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(truncmod(broadcast(x, lanes), broadcast(y, lanes)),
                     broadcast(truncmod(x, y), lanes));
 
@@ -994,7 +994,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const ModNode* op) {
     }
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Be-aware of the division rules:
     // We adopt the default C division uses truncation instead of floordiv.
     // This means most rules need to check non-negativeness of the operands.
@@ -1019,7 +1019,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const ModNode* op) {
     // canonicalization: x % c == x % (-c) for truncated division
     // NOTE: trunc div required
     TVM_TRY_RECURSIVE_REWRITE_IF(
-        truncmod(x, c1), truncmod(x, PConst<PrimExpr>(MakeConst(op->dtype, -c1.Eval()->value))),
+        truncmod(x, c1), truncmod(x, PConst<PrimExpr>(MakeConst(op->ty(), -c1.Eval()->value))),
         c1.Eval()->value < 0);
 
     // try modular analysis
@@ -1046,7 +1046,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
   PVar<PrimExpr> lanes;
 
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(floordiv(broadcast(x, lanes), broadcast(y, lanes)),
                     broadcast(floordiv(x, y), lanes));
     // ramp // bcast
@@ -1077,7 +1077,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
     }
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Be-aware of the division rules: this is floor division.
     TVM_TRY_REWRITE_IF(floordiv(floordiv(x, c1), c2), floordiv(x, c1 * c2),
                        c1.Eval()->value > 0 && c2.Eval()->value > 0);
@@ -1198,7 +1198,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
   PVar<PrimExpr> lanes;
 
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(floormod(broadcast(x, lanes), broadcast(y, lanes)),
                     broadcast(floormod(x, y), lanes));
 
@@ -1238,7 +1238,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
     }
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Be-aware of the division rules: we use floordiv/floormod here
     TVM_TRY_REWRITE_IF(floormod(x * c1, c2), floormod(x * floormod(c1, c2), c2),
                        c2.Eval()->value != 0);
@@ -1314,12 +1314,12 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MinNode* op) {
   PVar<PrimExpr> lanes;
 
   // vector rule
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(min(broadcast(x, lanes), broadcast(y, lanes)), broadcast(min(x, y), lanes));
     TVM_TRY_REWRITE(min(min(x, broadcast(y, lanes)), broadcast(z, lanes)),
                     min(x, broadcast(min(y, z), lanes)));
   }
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     TVM_TRY_REWRITE(min(x, x), x);
 
     // constant int bound
@@ -1498,12 +1498,12 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MaxNode* op) {
   PVar<PrimExpr> lanes;
 
   // vector rule
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(max(broadcast(x, lanes), broadcast(y, lanes)), broadcast(max(x, y), lanes));
     TVM_TRY_REWRITE(max(max(x, broadcast(y, lanes)), broadcast(z, lanes)),
                     max(x, broadcast(max(y, z), lanes)));
   }
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     TVM_TRY_REWRITE(max(x, x), x);
 
     // constant int bound
@@ -1686,10 +1686,10 @@ ffi::Optional<PrimExpr> RewriteSimplifier::Impl::TryMatchLiteralConstraint(
   ExprDeepEqual expr_equal;
   for (const auto& constraint : literal_constraints_) {
     if (expr_equal(constraint, expr)) {
-      return MakeConst(expr->dtype, true);
+      return MakeConst(expr->ty(), true);
     }
     if (expr_equal(constraint, negation)) {
-      return MakeConst(expr->dtype, false);
+      return MakeConst(expr->ty(), false);
     }
   }
   return std::nullopt;
@@ -1715,20 +1715,20 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(EQ ret) {
   // Pattern var match IntImm
   PVar<IntImm> c1, c2;
   PVar<PrimExpr> lanes;
-  PConst<PrimExpr> ctrue(MakeConst(ret->dtype, true));
+  PConst<PrimExpr> ctrue(MakeConst(ret->ty(), true));
 
   // vector rule
-  if (ret->dtype.is_scalable_or_fixed_length_vector()) {
+  if (ret->ty().IsScalableVector() || ret->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(broadcast(x, lanes) == broadcast(y, lanes), broadcast(x == y, lanes));
   }
 
-  if (IsIndexType(ret->a.dtype())) {
+  if (IsIndexTypedExpr(ret->a)) {
     CompareResult result = TryCompare(ret->a, ret->b);
     if (result == CompareResult::kEQ) {
-      return MakeConst(ret->dtype, true);
+      return MakeConst(ret->ty(), true);
     } else if (result == CompareResult::kNE || result == CompareResult::kGT ||
                result == CompareResult::kLT) {
-      return MakeConst(ret->dtype, false);
+      return MakeConst(ret->ty(), false);
     }
     TVM_TRY_REWRITE(c1 == x, x == c1);
 
@@ -1758,13 +1758,13 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const NENode* op) {
   if (auto const_res = TryConstFold<NE>(op->a, op->b)) return const_res.value();
   if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
 
-  if (IsIndexType(op->a.dtype())) {
+  if (IsIndexTypedExpr(op->a)) {
     CompareResult result = TryCompare(op->a, op->b);
     if (result == CompareResult::kNE || result == CompareResult::kGT ||
         result == CompareResult::kLT) {
-      return MakeConst(op->dtype, true);
+      return MakeConst(op->ty(), true);
     } else if (result == CompareResult::kEQ) {
-      return MakeConst(op->dtype, false);
+      return MakeConst(op->ty(), false);
     } else if (result == CompareResult::kGE) {
       // Known: a >= b
       //
@@ -1802,13 +1802,13 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LENode* op) {
   // (floordiv(A,B)<x) in these cases instead.
   ret = ApplyRewriteRules(Not(ApplyRewriteRules(LT(op->b, op->a))));
 
-  if (auto op = ret.as<LENode>(); op && IsIndexType(op->a.dtype())) {
+  if (auto op = ret.as<LENode>(); op && IsIndexTypedExpr(op->a)) {
     CompareResult result = TryCompare(op->a, op->b);
     if (result == CompareResult::kLE || result == CompareResult::kLT ||
         result == CompareResult::kEQ) {
-      return MakeConst(op->dtype, true);
+      return MakeConst(op->ty(), true);
     } else if (result == CompareResult::kGT) {
-      return MakeConst(op->dtype, false);
+      return MakeConst(op->ty(), false);
     } else if (result == CompareResult::kNE) {
       // Known: a != b
       //
@@ -1857,19 +1857,19 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(LT ret) {
   PVar<PrimExpr> lanes;
 
   // vector rule
-  if (ret->dtype.is_scalable_or_fixed_length_vector()) {
+  if (ret->ty().IsScalableVector() || ret->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(broadcast(x, lanes) < broadcast(y, lanes), broadcast(x < y, lanes));
     TVM_TRY_REWRITE(ramp(x, s1, lanes) < ramp(y, s1, lanes), broadcast(x < y, lanes));
   }
 
-  if (IsIndexType(ret->a.dtype())) {
+  if (IsIndexTypedExpr(ret->a)) {
     CompareResult result = TryCompare(ret->a, ret->b);
     if (result == CompareResult::kLT) {
-      return MakeConst(ret->dtype, true);
+      return MakeConst(ret->ty(), true);
     }
     if (result == CompareResult::kEQ || result == CompareResult::kGT ||
         result == CompareResult::kGE) {
-      return MakeConst(ret->dtype, false);
+      return MakeConst(ret->ty(), false);
     }
 
     // clang-format off
@@ -1987,9 +1987,9 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(LT ret) {
       } else if (diff == 1) {
         return lhs <= rhs;
       } else if (diff < 0 && rhs_offset != 0) {
-        return lhs + MakeConst(lhs.dtype(), -diff) < rhs;
+        return lhs + MakeConst(lhs.ty(), -diff) < rhs;
       } else if (diff > 0 && lhs_offset != 0) {
-        return lhs < rhs + MakeConst(rhs.dtype(), diff);
+        return lhs < rhs + MakeConst(rhs.ty(), diff);
       }
 
       return std::nullopt;
@@ -2024,7 +2024,7 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(Not ret) {
   // Pattern var to match any expression
   PVar<PrimExpr> x, y;
   PVar<PrimExpr> lanes;
-  if (ret->dtype.is_scalable_or_fixed_length_vector()) {
+  if (ret->ty().IsScalableVector() || ret->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(!broadcast(x, lanes), broadcast(!x, lanes));
   }
 
@@ -2100,11 +2100,11 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
   PVar<IntImm> c1, c2, c3;
   PVar<PrimExpr> lanes;
 
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(broadcast(x, lanes) && broadcast(y, lanes), broadcast(x && y, lanes));
   }
 
-  auto cfalse = PConst<PrimExpr>(MakeConst(op->dtype, false));
+  auto cfalse = PConst<PrimExpr>(MakeConst(op->ty(), false));
   TVM_TRY_REWRITE(x == y && x != y, cfalse);
   TVM_TRY_REWRITE(x != y && x == y, cfalse);
   TVM_TRY_REWRITE(x && !x, cfalse);
@@ -2248,11 +2248,11 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const OrNode* op) {
   PVar<IntImm> c1, c2;
   PVar<PrimExpr> lanes;
 
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(broadcast(x, lanes) || broadcast(y, lanes), broadcast(x || y, lanes));
   }
 
-  auto ctrue = PConst<PrimExpr>(MakeConst(op->dtype, true));
+  auto ctrue = PConst<PrimExpr>(MakeConst(op->ty(), true));
 
   TVM_TRY_REWRITE(x == y || x != y, ctrue);
   TVM_TRY_REWRITE(x != y || x == y, ctrue);
@@ -2319,12 +2319,14 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) {
   static const Op& ceil_op = Op::Get("tirx.ceil");
   static const Op& log2_op = Op::Get("tirx.log2");
   static const Op& clz_op = Op::Get("tirx.clz");
+  PrimType ret_ty = ffi::GetRef<PrimExpr>(op).ty();
   if (op->op.same_as(ceil_op)) {
     PrimExpr ceil_arg = op->args[0];
     if (auto arg_int = op->args[0].as<IntImmNode>()) {
-      return cast(op->dtype, IntImm(arg_int->dtype, arg_int->value));
+      return cast(ret_ty, IntImm(ffi::GetRef<PrimExpr>(arg_int).ty(), arg_int->value));
     } else if (auto arg_float = ceil_arg.as<FloatImmNode>()) {
-      return cast(op->dtype, FloatImm(arg_float->dtype, std::ceil(arg_float->value)));
+      return cast(ret_ty,
+                  FloatImm(ffi::GetRef<PrimExpr>(arg_float).ty(), std::ceil(arg_float->value)));
     } else if (auto arg_call = ceil_arg.as<CallNode>()) {
       // ceil(log2(cast(n,"float64"))) is used as the implementation of
       // topi.math.ceil_log2, and appears in iteration bounds.
@@ -2334,17 +2336,17 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) {
           // ceil(log2(n)) can be simplified, and should produce the
           // same integer result regardless of the target's rounding
           // conventions.
-          return FloatImm(op->dtype, std::ceil(std::log2(as_float->value)));
+          return FloatImm(ret_ty, std::ceil(std::log2(as_float->value)));
         }
       }
     }
   } else if (op->op.same_as(clz_op)) {
     if (const auto* arg_int = op->args[0].as<IntImmNode>()) {
-      int bits = arg_int->dtype.bits();
-      if (arg_int->value == 0) return MakeConst(op->dtype, bits);
+      int bits = arg_int->ty().bits();
+      if (arg_int->value == 0) return MakeConst(ret_ty, bits);
       for (int i = bits - 1; i >= 0; --i) {
         if ((int64_t(1) << i) & arg_int->value) {
-          return IntImm(op->dtype, bits - i - 1);
+          return IntImm(ret_ty, bits - i - 1);
         }
       }
       TVM_FFI_THROW(InternalError) << "Should not reach here";
@@ -2373,7 +2375,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) {
       // Only check constant cases to avoid recursion
       if (is_const_number(inner_else_expr) && is_const_number(else_expr) &&
           analyzer_->CanProve(inner_else_expr == else_expr)) {
-        return Call(op->dtype, op->op, {cond && inner_cond, inner_then_expr, else_expr}, op->attrs,
+        return Call(ret_ty, op->op, {cond && inner_cond, inner_then_expr, else_expr}, op->attrs,
                     op->span);
       }
     }
@@ -2384,7 +2386,9 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) {
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const VarNode* op) {
   Var var = ffi::GetRef<Var>(op);
-  if (op->dtype == DataType::Bool()) {
+  PrimType op_ty = op->ty();
+  if (op_ty.MatchesElementType(DLDataTypeCode::kDLBool, 8) && !op_ty.IsScalableVector() &&
+      !op_ty.IsFixedLengthVector()) {
     if (auto match = TryMatchLiteralConstraint(var)) {
       return match.value();
     }
@@ -2400,7 +2404,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const VarNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CastNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<CastNode>();
-  return cast(op->dtype, op->value);
+  return cast(ret.ty(), op->value);
 }
 
 bool RewriteSimplifier::Impl::CanInlineLet(const LetNode* op) {
diff --git a/src/arith/solve_linear_equation.cc b/src/arith/solve_linear_equation.cc
index 27144c674b9f..fd507ccdd658 100644
--- a/src/arith/solve_linear_equation.cc
+++ b/src/arith/solve_linear_equation.cc
@@ -24,9 +24,9 @@
 #include <tvm/arith/analyzer.h>
 #include <tvm/arith/int_solver.h>
 #include <tvm/arith/pattern.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/logging.h>
 #include <tvm/tirx/expr.h>
 #include <tvm/tirx/op.h>
@@ -133,10 +133,10 @@ void SmithNormalFormDiag(std::vector<std::vector<int64_t>>* S, std::vector<std::
           (*S)[i][j] = new_i_j;
         }
         // We have to do the same with rhs
-        PrimExpr ea = tirx::MakeConst((*y)[index].dtype(), a);
-        PrimExpr eb = tirx::MakeConst((*y)[i].dtype(), b);
-        PrimExpr e_m_g = tirx::MakeConst((*y)[i].dtype(), m_g);
-        PrimExpr e_n_g = tirx::MakeConst((*y)[index].dtype(), n_g);
+        PrimExpr ea = tirx::MakeConst((*y)[index].ty(), a);
+        PrimExpr eb = tirx::MakeConst((*y)[i].ty(), b);
+        PrimExpr e_m_g = tirx::MakeConst((*y)[i].ty(), m_g);
+        PrimExpr e_n_g = tirx::MakeConst((*y)[index].ty(), n_g);
         PrimExpr new_index_rhs = ea * (*y)[index] + eb * (*y)[i];
         PrimExpr new_i_rhs = e_n_g * (*y)[index] - e_m_g * (*y)[i];
         (*y)[index] = new_index_rhs;
@@ -193,10 +193,10 @@ void SmithNormalFormDiag(std::vector<std::vector<int64_t>>* S, std::vector<std::
           (*V)[i][j] = new_i_j;
         }
         // And apply reverse transformations to new_to_old.
-        PrimExpr ea = tirx::MakeConst((*x)[j].dtype(), a);
-        PrimExpr eb = tirx::MakeConst((*x)[index].dtype(), b);
-        PrimExpr e_m_g = tirx::MakeConst((*x)[index].dtype(), m_g);
-        PrimExpr e_n_g = tirx::MakeConst((*x)[j].dtype(), n_g);
+        PrimExpr ea = tirx::MakeConst((*x)[j].ty(), a);
+        PrimExpr eb = tirx::MakeConst((*x)[index].ty(), b);
+        PrimExpr e_m_g = tirx::MakeConst((*x)[index].ty(), m_g);
+        PrimExpr e_n_g = tirx::MakeConst((*x)[j].ty(), n_g);
         PrimExpr new_index = e_m_g * (*x)[index] + e_n_g * (*x)[j];
         PrimExpr new_j = eb * (*x)[index] - ea * (*x)[j];
         (*x)[index] = new_index;
@@ -395,7 +395,7 @@ IntConstraintsTransform SolveLinearEquations(const IntConstraints& system_to_sol
       if (const VarNode* v_old = to_old.as<VarNode>()) {
         name_hint += "_" + v_old->name_hint;
       }
-      Var v = Var(name_hint, V_inv_x[j].dtype());
+      Var v = Var(name_hint, V_inv_x[j].ty());
       solution_for_V_inv_x.push_back(v);
       new_vars.push_back(v);
       new_to_old_map.Set(v, to_old);
@@ -403,12 +403,12 @@ IntConstraintsTransform SolveLinearEquations(const IntConstraints& system_to_sol
       // The j-th variable is just a single value, don't create a tvm variable
       // S^{-1}_{nxm} Uy_{mxn}
       if (S[j][j] >= 0) {
-        PrimExpr a = tirx::MakeConst(Uy[j].dtype(), S[j][j]);
+        PrimExpr a = tirx::MakeConst(Uy[j].ty(), S[j][j]);
         solution_for_V_inv_x.push_back(analyzer_problem->Simplify(floordiv(Uy[j], a)));
       } else {
         // This is required because some simplifiers
         // have problems with dividing by negative numbers
-        PrimExpr a = tirx::MakeConst(Uy[j].dtype(), -S[j][j]);
+        PrimExpr a = tirx::MakeConst(Uy[j].ty(), -S[j][j]);
         solution_for_V_inv_x.push_back(analyzer_problem->Simplify(floordiv(-Uy[j], a)));
       }
     }
@@ -416,9 +416,9 @@ IntConstraintsTransform SolveLinearEquations(const IntConstraints& system_to_sol
 
   // V V^{-1} x = x
   for (size_t i = 0; i < num_vars; ++i) {
-    PrimExpr e = IntImm(system_to_solve->variables[i].dtype(), 0);
+    PrimExpr e = IntImm(system_to_solve->variables[i].ty(), 0);
     for (size_t j = 0; j < num_vars; ++j) {
-      e = e + tirx::MakeConst(e.dtype(), V[i][j]) * solution_for_V_inv_x[j];
+      e = e + tirx::MakeConst(e.ty(), V[i][j]) * solution_for_V_inv_x[j];
     }
     e = analyzer_problem->Simplify(e);
     old_to_new_map.Set(system_to_solve->variables[i], e);
diff --git a/src/arith/solve_linear_inequality.cc b/src/arith/solve_linear_inequality.cc
index 80d064f71157..14b1affb9927 100644
--- a/src/arith/solve_linear_inequality.cc
+++ b/src/arith/solve_linear_inequality.cc
@@ -24,9 +24,9 @@
 #include <tvm/arith/analyzer.h>
 #include <tvm/arith/int_solver.h>
 #include <tvm/arith/pattern.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/tirx/analysis.h>
 #include <tvm/tirx/expr.h>
 #include <tvm/tirx/op.h>
@@ -91,10 +91,12 @@ class NormalizeComparisons : public ExprMutator {
   template <class T>
   PrimExpr Make(const PrimExpr& a, const PrimExpr& b) {
+    PrimType a_ty = a.ty();
     // rewrite LT to LE for ints
-    if (std::is_same<T, LT>::value && (a.dtype().is_int() || a.dtype().is_uint())) {
-      return LE(analyzer_->Simplify(a - b + 1), IntImm(a.dtype(), 0));
+    if (std::is_same<T, LT>::value &&
+        (a_ty.code() == DLDataTypeCode::kDLInt || a_ty.code() == DLDataTypeCode::kDLUInt)) {
+      return LE(analyzer_->Simplify(a - b + 1), IntImm(a_ty, 0));
     }
-    return T(analyzer_->Simplify(a - b), IntImm(a.dtype(), 0));
+    return T(analyzer_->Simplify(a - b), IntImm(a_ty, 0));
   }
   arith::Analyzer analyzer_;
 };
@@ -248,11 +250,12 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
     for (const auto& pos : coef_pos) {
       for (const auto& neg : coef_neg) {
         auto first_gcd = ExtendedEuclidean(pos.first, -neg.first, &gcd_x, &gcd_y);
-        PrimExpr c_pos = MakeConst(v.dtype(), neg.first / first_gcd);
-        PrimExpr c_neg = MakeConst(v.dtype(), pos.first / first_gcd);
+        PrimType v_ty = v.ty();
+        PrimExpr c_pos = MakeConst(v_ty, neg.first / first_gcd);
+        PrimExpr c_neg = MakeConst(v_ty, pos.first / first_gcd);
         // eliminate the current variable
         PrimExpr new_lhs = c_neg * neg.second - c_pos * pos.second;
-        PrimExpr new_ineq = LE(new_lhs, IntImm(pos.second.dtype(), 0));
+        PrimExpr new_ineq = LE(new_lhs, IntImm(pos.second.ty(), 0));
         // we need rewrite_simplify -> canonical_simplify -> rewrite_simplify
         // to help simplify things like (((y + 10) - (-1*(y - 20))) <= 0) => y - 5 <= 0
         // with steps = 2 it's (y*2) - 10 <= 0
@@ -281,7 +284,7 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
     lower_bounds.reserve(coef_neg.size());
 
     for (const auto& pos : coef_pos) {
-      PrimExpr bound = MakeConst(v.dtype(), -coef_lcm / pos.first) * pos.second;
+      PrimExpr bound = MakeConst(v.ty(), -coef_lcm / pos.first) * pos.second;
       bound = analyzer->Simplify(bound, kSimplifyRewriteCanonicalRewrite);
       // Don't add if any of the existing bounds is better
       if (std::any_of(upper_bounds.begin(), upper_bounds.end(),
@@ -302,7 +305,7 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
       upper_bounds.push_back(bound);
     }
     for (const auto& neg : coef_neg) {
-      PrimExpr bound = MakeConst(v.dtype(), -coef_lcm / neg.first) * neg.second;
+      PrimExpr bound = MakeConst(v.ty(), -coef_lcm / neg.first) * neg.second;
       bound = analyzer->Simplify(bound, kSimplifyRewriteCanonicalRewrite);
       // Don't add if any of the existing bounds is better
       if (std::any_of(lower_bounds.begin(), lower_bounds.end(),
@@ -330,7 +333,7 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
     std::sort(equal_list.begin(), equal_list.end(), ExprLess());
 
     // Write it to the result.
-    IntGroupBounds bnds(MakeConst(v.dtype(), coef_lcm),
+    IntGroupBounds bnds(MakeConst(v.ty(), coef_lcm),
                         ffi::Array<PrimExpr>(lower_bounds.begin(), lower_bounds.end()),
                         ffi::Array<PrimExpr>(equal_list.begin(), equal_list.end()),
                         ffi::Array<PrimExpr>(upper_bounds.begin(), upper_bounds.end()));
@@ -509,7 +512,7 @@ IntConstraintsTransform SolveInequalitiesDeskewRange(const IntConstraints& inequ
                            analyzer->Simplify(var - Substitute(best_range->min, res_dst_to_src)));
 
         // Add the new var to the resulting axis
-        auto range = Range(IntImm(new_var.dtype(), 0), best_range->extent);
+        auto range = Range(IntImm(new_var.ty(), 0), best_range->extent);
         res_variables.push_back(new_var);
         res_ranges.Set(new_var, range);
 
diff --git a/src/arith/transitive_comparison_analyzer.cc b/src/arith/transitive_comparison_analyzer.cc
index 20fd05169f43..7b740d6229c2 100644
--- a/src/arith/transitive_comparison_analyzer.cc
+++ b/src/arith/transitive_comparison_analyzer.cc
@@ -615,7 +615,8 @@ CompareResult TransitiveComparisonAnalyzer::Impl::TryCompare(const PrimExpr& lhs
                                                              const PrimExpr& rhs_expr,
                                                              bool propagate_inequalities) const {
   // Currently only supports integer checks
-  if (!lhs_expr.dtype().is_int() || !rhs_expr.dtype().is_int()) {
+  if (!lhs_expr.ty().MatchesCode(DLDataTypeCode::kDLInt) ||
+      !rhs_expr.ty().MatchesCode(DLDataTypeCode::kDLInt)) {
     return CompareResult::kUnknown;
   }
 
diff --git a/src/arith/unwrap_vector_expr.cc b/src/arith/unwrap_vector_expr.cc
index e9245c48a102..dfe7a3cf404b 100644
--- a/src/arith/unwrap_vector_expr.cc
+++ b/src/arith/unwrap_vector_expr.cc
@@ -58,14 +58,16 @@ class Scalarizer : public ExprMutator {
     }
   }
   PrimExpr VisitExpr_(const LetNode* op) final {
-    if (op->value.dtype().lanes() == 1) {
+    PrimType value_ty = op->value.ty();
+    if (value_ty.lanes() == 1) {
       return ExprMutator::VisitExpr_(op);
     }
 
     auto it = let_var_remap_.find(op->var.get());
     TVM_FFI_ICHECK(it == let_var_remap_.end()) << "Duplicate binding of variable " << op->var;
 
-    Var new_var(op->var->name_hint + "_scalar", op->var.dtype().element_of());
+    PrimType var_ty = op->var.ty();
+    Var new_var(op->var->name_hint + "_scalar", var_ty.WithLanes(1));
     let_var_remap_[op->var.get()] = new_var;
 
     PrimExpr value = this->VisitExpr(op->value);
diff --git a/src/arith/z3_prover.cc b/src/arith/z3_prover.cc
index 604815c97955..9ceb156dead8 100644
--- a/src/arith/z3_prover.cc
+++ b/src/arith/z3_prover.cc
@@ -50,10 +50,10 @@
 #include <vector>
 
 #include "tvm/ffi/cast.h"
+#include "tvm/ffi/dtype.h"
 #include "tvm/ffi/object.h"
 #include "tvm/ffi/string.h"
 #include "tvm/ir/expr.h"
-#include "tvm/runtime/data_type.h"
 #include "z3++.h"
 
 namespace tvm::arith {
@@ -147,14 +147,14 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
   /// @brief Create a Free z3 expression from PrimExprNode
   z3::expr Create(const PrimExprNode* op) {
     auto ref = ffi::GetRef<PrimExpr>(op);
-    auto dtype = op->dtype;
+    PrimType dtype = op->ty();
     std::string name = ns.GetNewName(ref);
     /// TVM max_val can't handle uint64 max correctly, so we special case it here
-    if (dtype.is_bool()) {
+    if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
       return ctx->bool_const(name.c_str());
     } else {
       z3::expr e = ctx->int_const(name.c_str());
-      if (dtype.is_uint() && dtype.bits() == 64) {
+      if (dtype.MatchesCode(DLDataTypeCode::kDLUInt) && dtype.bits() == 64) {
         solver.add(ctx->int_val(0) <= e && e <= ctx->int_val((uint64_t)UINT64_MAX));
       } else {
         auto min_val = min_value(dtype).as_or_throw<IntImm>()->value;
@@ -249,7 +249,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
     // solver) must degrade to "cannot prove" instead of escaping to the caller.
     try {
       if (CheckTrivilBadCases(expr)) return false;
-      if (!IsValidDType(expr->dtype)) return false;
+      if (!IsValidType(expr.ty())) return false;
       z3::expr_vector constr(*ctx);
       constr.push_back(!ConvertBool(expr));
       auto result = solver.check(constr);
@@ -263,7 +263,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
   /// @brief Binded
   /// @brief Bind a variable to a value or a range
   void Bind(const Var& var, const PrimExpr& value, bool allow_override = false) {
-    if (!IsValidDType(var->dtype)) return;
+    if (!IsValidType(var.ty())) return;
     scope_stack_.back().push_back(Scope{Scope::BindValue, var, value});
     // we add the binding whenever the value is pure,
     // because non-pure parts are handling by creating free variables in VisitExpr
@@ -272,7 +272,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
 
   /// @brief Bind a variable to a range
   void Bind(const Var& var, const Range& range, bool allow_override = false) {
-    if (!IsValidDType(var->dtype)) return;
+    if (!IsValidType(var.ty())) return;
     scope_stack_.back().push_back(
         Scope{Scope::BindRange, var, PrimExpr(), range->min, range->extent});
     // 1. Create a placeholder for the var, and save it in the memo
@@ -427,7 +427,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
    * \return Number of satisfying values, -1 on error, -2 if min_consecutive constraint not met
    */
   int64_t CountSatisfyingValues(const Var& var, int64_t max_count, int64_t min_consecutive = 1) {
-    if (!IsValidDType(var->dtype)) {
+    if (!IsValidType(var.ty())) {
       return -1;
     }
 
@@ -550,12 +550,14 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
     }
     return e->IsInstance<CallNode>() || e->IsInstance<BufferLoadNode>() ||
            e->IsInstance<ProducerLoadNode>() || e->IsInstance<ReduceNode>() ||
-           (e->IsInstance<CastNode>() && !IsValidDType(e.as_or_throw<Cast>()->value->dtype));
+           (e->IsInstance<CastNode>() && !IsValidType(e.as_or_throw<Cast>()->value.ty()));
   }
 
   /// @brief Check if the dtype is valid for z3 integer operations
-  static bool IsValidDType(const DataType& dtype) {
-    return (dtype.is_int() || dtype.is_uint() || dtype.is_bool()) && dtype.lanes() == 1;
+  static bool IsValidType(const PrimType& dtype) {
+    return dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                             DLDataTypeCode::kDLBool) &&
+           dtype.lanes() == 1;
   }
 
   /// @brief Visit the expression and convert it into z3 integer expression
@@ -581,7 +583,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
   /// @brief Helper function to visit binary arithmetic operations
   z3::expr VisitArith(Z3BinOp signed_op, const PrimExprNode* op, const PrimExpr& a,
                       const PrimExpr& b) {
-    if (IsValidDType(a->dtype) && IsValidDType(b->dtype)) {
+    if (IsValidType(a.ty()) && IsValidType(b.ty())) {
       return signed_op(VisitInt(a), VisitInt(b));
     } else {
       return Create(op);
@@ -589,14 +591,14 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
   }
 
   z3::expr VisitExpr_(const LetNode* op) override {
-    if (IsValidDType(op->var->dtype)) {
+    if (IsValidType(op->var.ty())) {
       memo_.emplace(op->var, VisitInt(op->value));
     }
     return VisitExpr(op->body);
   }
   z3::expr VisitExpr_(const CastNode* op) override {
     // if the inner dtype is valid, we just visit it
-    if (IsValidDType(op->value->dtype) && IsValidDType(op->dtype)) {
+    if (IsValidType(op->value.ty()) && IsValidType(op->ty())) {
       return VisitInt(op->value);
     } else {
       // otherwise, we create a new free z3 variable
@@ -696,7 +698,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
     } else if (op->op.same_as(tirx::builtin::shift_right())) {
       return VisitShiftOp(z3::ashr, op);
     } else if (op->op.same_as(tirx::builtin::if_then_else()) && op->args.size() == 3 &&
-               IsValidDType(op->args[1]->dtype) && IsValidDType(op->args[2]->dtype)) {
+               IsValidType(op->args[1].ty()) && IsValidType(op->args[2].ty())) {
       // tir.if_then_else(cond, a, b) is a select-like ternary.
       return z3::ite(VisitBool(op->args[0]), VisitInt(op->args[1]), VisitInt(op->args[2]));
     } else {
@@ -715,9 +717,9 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
 
     const PrimExpr& a = op->args[0];
     const PrimExpr& b = op->args[1];
-    unsigned bit_width = std::max(op->args[0].dtype().bits(), op->args[1].dtype().bits());
+    unsigned bit_width = std::max(op->args[0].ty().bits(), op->args[1].ty().bits());
 
-    if (IsValidDType(a->dtype) && IsValidDType(b->dtype)) {
+    if (IsValidType(a.ty()) && IsValidType(b.ty())) {
       return z3::bv2int(
           op_func(z3::int2bv(bit_width, VisitInt(a)), z3::int2bv(bit_width, VisitInt(b))), true);
     } else {
@@ -734,9 +736,9 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
 
     const PrimExpr& a = op->args[0];
 
-    if (IsValidDType(a->dtype)) {
+    if (IsValidType(a.ty())) {
       // Cast integer to bit-vector, apply bitwise not, then cast back.
-      unsigned bit_width = a.dtype().bits();
+      unsigned bit_width = a.ty().bits();
       z3::expr a_int = VisitInt(a);
       z3::expr a_bv = z3::int2bv(bit_width, a_int);
       return z3::bv2int(~a_bv, true);
@@ -756,7 +758,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
     const PrimExpr& b = op->args[1];
 
     // Shift operations require integer types for both operands
-    if (IsValidDType(a->dtype) && IsValidDType(b->dtype)) {
+    if (IsValidType(a.ty()) && IsValidType(b.ty())) {
       z3::expr a_expr = VisitInt(a);
       z3::expr b_expr = VisitInt(b);
 
@@ -765,7 +767,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
       // matching push/pop in this path, so the assertion would permanently
       // poison the shared solver and make all subsequent unrelated proofs about
       // `b` unsound.
-      unsigned bit_width = std::max(a.dtype().bits(), b.dtype().bits());
+      unsigned bit_width = std::max(a.ty().bits(), b.ty().bits());
       z3::expr a_bv = z3::int2bv(bit_width, a_expr);
       z3::expr b_bv = z3::int2bv(bit_width, b_expr);
 
diff --git a/src/backend/cuda/codegen/codegen_cuda.cc b/src/backend/cuda/codegen/codegen_cuda.cc
index 0f2838014b28..0d70d9aef3fd 100644
--- a/src/backend/cuda/codegen/codegen_cuda.cc
+++ b/src/backend/cuda/codegen/codegen_cuda.cc
@@ -56,13 +56,32 @@ bool IsOp(const tirx::CallNode* call, const Op& compat_op, const char* canonical
   return op_node != nullptr && op_node->name == canonical_name;
 }
 
+bool IsCUDAFloat8(DLDataTypeCode code) {
+  return code == DLDataTypeCode::kDLFloat8_e3m4 || code == DLDataTypeCode::kDLFloat8_e4m3 ||
+         code == DLDataTypeCode::kDLFloat8_e4m3b11fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e4m3fn || code == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e5m2 || code == DLDataTypeCode::kDLFloat8_e5m2fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e8m0fnu;
+}
+
+bool IsCUDAFloat6(DLDataTypeCode code) {
+  return code == DLDataTypeCode::kDLFloat6_e2m3fn || code == DLDataTypeCode::kDLFloat6_e3m2fn;
+}
+
+bool IsCUDAFloat4(DLDataTypeCode code) { return code == DLDataTypeCode::kDLFloat4_e2m1fn; }
+
+bool IsCUDAPackedFloat(DLDataTypeCode code) {
+  return IsCUDAFloat8(code) || IsCUDAFloat6(code) || IsCUDAFloat4(code);
+}
+
 }  // namespace
 
-std::string GetFP8Type(DataType type) {
+std::string GetFP8Type(DLDataType type) {
+  PrimType type_ty(type);
   std::stringstream stream;
-  int32_t lanes = type.lanes();
+  int32_t lanes = type_ty.lanes();
   std::string vec;
-  if (type.is_scalar()) {
+  if (type_ty.IsScalar()) {
     vec = "";
   } else if (lanes == 2) {
     vec = "x2";
@@ -78,11 +97,12 @@ std::string GetFP8Type(DataType type) {
   }
   stream << "__nv_fp8";
   std::string suffix;
-  if (type.code() == DataType::kFloat8_e4m3fn) {
+  DLDataTypeCode code = type_ty.code();
+  if (code == DLDataTypeCode::kDLFloat8_e4m3fn) {
     suffix = "_e4m3";
-  } else if (type.code() == DataType::kFloat8_e5m2) {
+  } else if (code == DLDataTypeCode::kDLFloat8_e5m2) {
     suffix = "_e5m2";
-  } else if (type.code() == DataType::kFloat8_e8m0fnu) {
+  } else if (code == DLDataTypeCode::kDLFloat8_e8m0fnu) {
     suffix = "_e8m0";
   } else {
     TVM_FFI_THROW(InternalError) << "Unsupported FP8 type in CUDA codegen";
@@ -91,11 +111,12 @@ std::string GetFP8Type(DataType type) {
   return stream.str();
 }
 
-std::string GetFP6Type(DataType type) {
+std::string GetFP6Type(DLDataType type) {
+  PrimType type_ty(type);
   std::stringstream stream;
-  int32_t lanes = type.lanes();
+  int32_t lanes = type_ty.lanes();
   std::string vec;
-  if (type.is_scalar()) {
+  if (type_ty.IsScalar()) {
     vec = "";
   } else if (lanes == 2) {
     vec = "x2";
@@ -110,9 +131,10 @@ std::string GetFP6Type(DataType type) {
   }
   stream << "__nv_fp6";
   std::string suffix;
-  if (type.code() == DataType::kFloat6_e2m3fn) {
+  DLDataTypeCode code = type_ty.code();
+  if (code == DLDataTypeCode::kDLFloat6_e2m3fn) {
     suffix = "_e2m3";
-  } else if (type.code() == DataType::kFloat6_e3m2fn) {
+  } else if (code == DLDataTypeCode::kDLFloat6_e3m2fn) {
     suffix = "_e3m2";
   } else {
     TVM_FFI_THROW(InternalError) << "Unsupported FP6 type in CUDA codegen";
@@ -121,11 +143,12 @@ std::string GetFP6Type(DataType type) {
   return stream.str();
 }
 
-std::string GetFP4Type(DataType type) {
+std::string GetFP4Type(DLDataType type) {
+  PrimType type_ty(type);
   std::stringstream stream;
-  int32_t lanes = type.lanes();
+  int32_t lanes = type_ty.lanes();
   std::string vec;
-  if (type.is_scalar()) {
+  if (type_ty.IsScalar()) {
     vec = "";
   } else if (lanes == 2) {
     vec = "x2";
@@ -140,7 +163,8 @@ std::string GetFP4Type(DataType type) {
   }
   stream << "__nv_fp4";
   std::string suffix;
-  if (type.code() == DataType::kFloat4_e2m1fn) {
+  DLDataTypeCode code = type_ty.code();
+  if (code == DLDataTypeCode::kDLFloat4_e2m1fn) {
     suffix = "_e2m1";
   } else {
     TVM_FFI_THROW(InternalError) << "Unsupported FP4 type in CUDA codegen";
@@ -299,31 +323,34 @@ void CodeGenCUDA::BindThreadIndex(const IterVar& iv) {
                                    ";\" : \"=r\"(ctaid) :);\n"
                                    "  return ctaid;\n"
                                    "}\n");
-    var_idmap_[iv->var.get()] = CastFromTo(func_name + "()", DataType::UInt(32), iv->var.dtype());
+    var_idmap_[iv->var.get()] =
+        CastFromTo(func_name + "()", DLDataType{kDLUInt, 32, 1}, iv->var.ty()->dtype);
   } else {
-    var_idmap_[iv->var.get()] = CastFromTo(iv->thread_tag, DataType::UInt(32), iv->var.dtype());
+    var_idmap_[iv->var.get()] =
+        CastFromTo(iv->thread_tag, DLDataType{kDLUInt, 32, 1}, iv->var.ty()->dtype);
   }
 }
 
-void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenCUDA::PrintType(DLDataType raw_t, std::ostream& os) {  // NOLINT(*)
+  PrimType t(raw_t);
   int lanes = t.lanes();
-  if (t.is_handle()) {
-    TVM_FFI_ICHECK(t.is_scalar()) << "do not yet support vector types";
+  if (t.IsHandle()) {
+    TVM_FFI_ICHECK(t.IsScalar()) << "do not yet support vector types";
     os << "void*";
     return;
   }
 
-  if (t.is_void()) {
+  if (t.IsVoid()) {
     os << "void";
     return;
   }
 
   bool fail = false;
-  if (t.is_float()) {
+  if (t.code() == DLDataTypeCode::kDLFloat) {
     switch (t.bits()) {
       case 16:
         codegen_tags_.insert("fp16");
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "half";
         } else if (lanes <= 8) {
           TVM_FFI_ICHECK_EQ(lanes % 2, 0) << "Only support an even number of lanes for half type";
@@ -360,15 +387,15 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         fail = true;
         break;
     }
-    if (!fail && (t.is_scalar() || t.bits() == 16)) return;
+    if (!fail && (t.IsScalar() || t.bits() == 16)) return;
     if (!fail && (lanes > 4 && lanes <= 8 && t.bits() == 32)) return;
     if (!fail && (lanes >= 2 && lanes <= 4)) {
       os << lanes;
       return;
     }
-  } else if (t.is_bfloat16()) {
+  } else if (t.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     codegen_tags_.insert("bf16");
-    if (t.is_scalar()) {
+    if (t.IsScalar()) {
       os << "nv_bfloat16";
     } else if (lanes <= 8) {
       TVM_FFI_ICHECK_EQ(lanes % 2, 0) << "only support even lane for bfloat16 type";
@@ -381,57 +408,57 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       fail = true;
     }
     if (!fail) return;
-  } else if (t.is_float8()) {
+  } else if (IsCUDAFloat8(t.code())) {
     codegen_tags_.insert("fp8");
-    if (t.lanes() <= 4) {
-      os << GetFP8Type(t);
+    if (lanes <= 4) {
+      os << GetFP8Type(raw_t);
     } else {
-      os << "uint" << t.lanes() / 4;
+      os << "uint" << lanes / 4;
     }
     return;
-  } else if (t.is_float6()) {
+  } else if (IsCUDAFloat6(t.code())) {
     codegen_tags_.insert("fp6");
-    if (t.lanes() <= 4) {
-      os << GetFP6Type(t);
+    if (lanes <= 4) {
+      os << GetFP6Type(raw_t);
     } else {
       fail = true;
     }
     return;
-  } else if (t.is_float4()) {
+  } else if (IsCUDAFloat4(t.code())) {
     codegen_tags_.insert("fp4");
-    if (t.lanes() <= 4) {
-      os << GetFP4Type(t);
+    if (lanes <= 4) {
+      os << GetFP4Type(raw_t);
     } else {
       fail = true;
     }
     return;
-  } else if (t == DataType::Bool()) {
+  } else if (raw_t == DLDataType{kDLBool, 8, 1}) {
     os << "bool";
     return;
-  } else if (t.is_vector_bool()) {
+  } else if (t.code() == DLDataTypeCode::kDLBool && lanes > 1) {
     // CUDA does not support bool vectors.
     // Use ushort vectors to represent instead.
-    int n = t.lanes();
+    int n = lanes;
     if (n <= 4) {
       os << "ushort" << n;
       return;
     }
-  } else if (t.is_uint() || t.is_int()) {
-    if (t.is_uint()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)) {
+    if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << "u";
     }
     switch (t.bits()) {
       case 1: {
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "int";
           return;
-        } else if (t.lanes() == 8) {
+        } else if (lanes == 8) {
           os << "int8_t";
           return;
-        } else if (t.lanes() == 16) {
+        } else if (lanes == 16) {
           os << "int16_t";
           return;
-        } else if (t.lanes() == 32) {
+        } else if (lanes == 32) {
           os << "int";
           return;
         } else {
@@ -439,23 +474,23 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         }
       }
       case 4: {
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "int";
           return;
-        } else if (t.lanes() == 4) {
+        } else if (lanes == 4) {
           os << "int16_t";
           return;
-        } else if (t.lanes() == 8) {
+        } else if (lanes == 8) {
           // directly 8 4-bit int in integer.
           os << "int";
           return;
-        } else if (t.lanes() == 16) {
+        } else if (lanes == 16) {
           os << "int2";
           return;
-        } else if (t.lanes() == 32) {
+        } else if (lanes == 32) {
           os << "int4";
           return;
-        } else if (t.lanes() == 64) {
+        } else if (lanes == 64) {
           os << "int8";
           return;
         } else {
@@ -463,7 +498,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         }
       }
       case 8: {
-        if (t.lanes() == 4) {
+        if (lanes == 4) {
           // directly 4 8 bit int in integer.
           codegen_tags_.insert("int8");
 
@@ -472,15 +507,15 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
           // into 32-bit data.
           os << "int";
           return;
-        } else if (t.lanes() == 8) {
+        } else if (lanes == 8) {
           codegen_tags_.insert("int8");
           os << "int2";
           return;
-        } else if (t.lanes() == 16) {
+        } else if (lanes == 16) {
           codegen_tags_.insert("int8");
           os << "int4";
           return;
-        } else if (!t.is_uint() && t.is_scalar()) {
+        } else if (!t.MatchesCode(DLDataTypeCode::kDLUInt) && t.IsScalar()) {
           os << "signed char";
           break;
         } else {
@@ -489,11 +524,11 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         }
       }
       case 16: {
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "short";
-        } else if (t.lanes() <= 4) {
+        } else if (lanes <= 4) {
           os << "short" << lanes;
-        } else if (t.lanes() <= 8) {
+        } else if (lanes <= 8) {
           // Emit CUDA code to access int16 vector elements.
           //
           // short4 is stored as int2
@@ -503,9 +538,8 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
           // s4.z is emitted as *(short2*)(&(i2.y)).x
           // s4.w is emitted as *(short2*)(&(i2.y)).y
           //
-          TVM_FFI_ICHECK_EQ(t.lanes() % 2, 0)
-              << "only support even lane for shorT type with lanes > 4";
-          os << "int" << t.lanes() / 2;
+          TVM_FFI_ICHECK_EQ(lanes % 2, 0) << "only support even lane for shorT type with lanes > 4";
+          os << "int" << lanes / 2;
         } else {
           fail = true;
         }
@@ -515,11 +549,11 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         break;
       }
       case 32: {
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "int";
-        } else if (t.lanes() <= 4) {
-          os << "int" << t.lanes();
-        } else if (t.lanes() <= 8) {
+        } else if (lanes <= 4) {
+          os << "int" << lanes;
+        } else if (lanes <= 8) {
           // Emit CUDA code to access int32 vector elements for 4 < lanes <= 8.
           //
           // int8 is stored as longlong4
@@ -538,13 +572,13 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         break;
       }
       case 64: {
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "int64_t";
-        } else if (t.lanes() == 2) {
+        } else if (lanes == 2) {
           os << "longlong2";
-        } else if (t.lanes() == 3) {
+        } else if (lanes == 3) {
           os << "longlong3";
-        } else if (t.lanes() == 4) {
+        } else if (lanes == 4) {
           os << "longlong4";
         }
         return;
@@ -561,15 +595,16 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       return;
     }
   }
-  TVM_FFI_THROW(InternalError) << "Cannot convert type " << t << " to CUDA type";
+  TVM_FFI_THROW(InternalError) << "Cannot convert type " << ffi::DLDataTypeToString(raw_t)
+                               << " to CUDA type";
 }
 
-void CodeGenCUDA::PrintVecConstructor(DataType t, std::ostream& os) {
+void CodeGenCUDA::PrintVecConstructor(DLDataType t, std::ostream& os) {
   os << "make_";
   PrintType(t, os);
 }
 
-void CodeGenCUDA::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, PrimExpr rhs,
+void CodeGenCUDA::PrintVecBinaryOp(const std::string& op, DLDataType t, PrimExpr lhs, PrimExpr rhs,
                                    std::ostream& os) {  // NOLINT(*)
   // Declare the result.
   std::string sret = name_supply_->FreshName("_");
@@ -579,22 +614,22 @@ void CodeGenCUDA::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr l
   int ssa_scope = BeginScope();
   {
     // Unpack into individual ops.
-    std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.dtype());
-    std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.dtype());
+    std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.ty()->dtype);
+    std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.ty()->dtype);
 
-    for (int i = 0, lanes = t.lanes(); i < lanes; ++i) {
+    for (int i = 0, lanes = PrimType(t).lanes(); i < lanes; ++i) {
       std::ostringstream value_temp;
       if (isalpha(op[0])) {
         value_temp << op << "(";
-        PrintVecElemLoad(vlhs, lhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vlhs, lhs.ty()->dtype, i, value_temp);
         value_temp << ", ";
-        PrintVecElemLoad(vrhs, rhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vrhs, rhs.ty()->dtype, i, value_temp);
         value_temp << ")";
       } else {
         value_temp << "(";
-        PrintVecElemLoad(vlhs, lhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vlhs, lhs.ty()->dtype, i, value_temp);
         value_temp << op;
-        PrintVecElemLoad(vrhs, rhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vrhs, rhs.ty()->dtype, i, value_temp);
         value_temp << ")";
       }
       PrintVecElemStore(sret, t, i, value_temp.str());
@@ -604,55 +639,58 @@ void CodeGenCUDA::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr l
   os << sret;
 }
 
-void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DataType t, int i,
+void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                                    std::ostream& os) {  // NOLINT(*)
-  if (t.is_scalar()) {
+  PrimType t_ty(t);
+  int lanes = t_ty.lanes();
+  if (t_ty.IsScalar()) {
     os << vec;
     return;
   }
 
   static const char access[] = {'x', 'y', 'z', 'w'};
-  TVM_FFI_ICHECK(i >= 0 && i < (t.bits() == 8 ? 16 : (t.bits() == 16 || t.bits() == 32) ? 8 : 4));
-  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
-    std::string type_name = t.is_int() ? "signed char" : "unsigned char";
-    if (t.lanes() == 2 || t.lanes() == 3) {
-      os << vec << "." << access[i % t.lanes()];
+  TVM_FFI_ICHECK(i >= 0 && i < (t.bits == 8 ? 16 : (t.bits == 16 || t.bits == 32) ? 8 : 4));
+  if (t.bits == 8 && (t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))) {
+    std::string type_name =
+        t_ty.MatchesCode(DLDataTypeCode::kDLInt) ? "signed char" : "unsigned char";
+    if (lanes == 2 || lanes == 3) {
+      os << vec << "." << access[i % lanes];
     } else {
-      std::string ac = t.lanes() == 4 ? vec : (vec + "." + access[i / 4]);
+      std::string ac = lanes == 4 ? vec : (vec + "." + access[i / 4]);
       os << "(reinterpret_cast<const " << type_name << "*>(&(" << ac << "))[" << (i % 4) << "])";
     }
-  } else if (t.is_float16()) {
-    if (t.lanes() <= 4) {
+  } else if (t_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
+    if (lanes <= 4) {
       os << vec << "." << access[i];
     } else {
       os << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
     }
-  } else if (t.is_bfloat16()) {
-    if (t.lanes() <= 4) {
+  } else if (t_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
+    if (lanes <= 4) {
       os << vec << "." << access[i];
     } else {
       os << "((nv_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
     }
-  } else if (t.lanes() > 4 && t.lanes() <= 8) {
+  } else if (lanes > 4 && lanes <= 8) {
     std::string type_name;
-    if (t.bits() == 16) {
-      if (t.is_int()) {
+    if (t.bits == 16) {
+      if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
         type_name = "short";
-      } else if (t.is_uint()) {
+      } else if (t_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
         type_name = "ushort";
       }
-    } else if (t.bits() == 32) {
-      if (t.is_int()) {
+    } else if (t.bits == 32) {
+      if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
         type_name = "int";
-      } else if (t.is_uint()) {
+      } else if (t_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
         type_name = "uint";
-      } else if (t.is_float()) {
+      } else if (t_ty.code() == DLDataTypeCode::kDLFloat) {
         type_name = "float";
       }
     }
     TVM_FFI_ICHECK(!type_name.empty());
     os << "((" << type_name << "2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
-  } else if (t.is_float4_e2m1fn()) {
+  } else if (t_ty.code() == DLDataTypeCode::kDLFloat4_e2m1fn) {
     os << "([](__nv_fp4_storage_t v) { __nv_fp4_e2m1 t; t.__x = v; return t; })((" << vec
        << ".__x >> " << i * 4 << ") & 0xF)";
   } else {
@@ -660,50 +698,53 @@ void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DataType t, int i,
   }
 }
 
-void CodeGenCUDA::PrintVecElemStore(const std::string& vec, DataType t, int i,
+void CodeGenCUDA::PrintVecElemStore(const std::string& vec, DLDataType t, int i,
                                     const std::string& value) {
+  PrimType t_ty(t);
+  int lanes = t_ty.lanes();
   this->PrintIndent();
   static const char access[] = {'x', 'y', 'z', 'w'};
-  TVM_FFI_ICHECK(i >= 0 && i < (t.bits() == 8 ? 16 : (t.bits() == 16 || t.bits() == 32) ? 8 : 4));
-  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
-    if (t.lanes() == 2 || t.lanes() == 3) {
-      stream << vec << '.' << access[i % t.lanes()] << "="
+  TVM_FFI_ICHECK(i >= 0 && i < (t.bits == 8 ? 16 : (t.bits == 16 || t.bits == 32) ? 8 : 4));
+  if (t.bits == 8 && (t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))) {
+    if (lanes == 2 || lanes == 3) {
+      stream << vec << '.' << access[i % lanes] << "="
              << "(" << value << ");\n";
     } else {
-      std::string ac = t.lanes() == 4 ? vec : (vec + "." + access[i / 4]);
-      std::string type_name = t.is_int() ? "signed char" : "unsigned char";
+      std::string ac = lanes == 4 ? vec : (vec + "." + access[i / 4]);
+      std::string type_name =
+          t_ty.MatchesCode(DLDataTypeCode::kDLInt) ? "signed char" : "unsigned char";
       stream << "reinterpret_cast<" << type_name << "*>(&(" << ac << "))[" << (i % 4) << "] = ("
              << type_name << ")(" << value << ");\n";
     }
-  } else if (t.is_float16()) {
-    if (t.lanes() <= 4) {
+  } else if (t_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
+    if (lanes <= 4) {
       stream << vec << "." << access[i] << " = " << value << ";\n";
     } else {
       stream << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2] << " = "
              << value << ";\n";
     }
 
-  } else if (t.is_bfloat16()) {
-    if (t.lanes() <= 4) {
+  } else if (t_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
+    if (lanes <= 4) {
       stream << vec << "." << access[i] << " = " << value << ";\n";
     } else {
       stream << "((nv_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2]
              << " = " << value << ";\n";
     }
-  } else if (t.lanes() > 4 && t.lanes() <= 8) {
+  } else if (lanes > 4 && lanes <= 8) {
     std::string type_name;
-    if (t.bits() == 16) {
-      if (t.is_int()) {
+    if (t.bits == 16) {
+      if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
         type_name = "short";
-      } else if (t.is_uint()) {
+      } else if (t_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
         type_name = "ushort";
       }
-    } else if (t.bits() == 32) {
-      if (t.is_int()) {
+    } else if (t.bits == 32) {
+      if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
         type_name = "int";
-      } else if (t.is_uint()) {
+      } else if (t_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
         type_name = "uint";
-      } else if (t.is_float()) {
+      } else if (t_ty.code() == DLDataTypeCode::kDLFloat) {
         type_name = "float";
       }
     }
@@ -766,15 +807,19 @@ void CodeGenCUDA::PrintStorageScope(const std::string& scope, std::ostream& os)
   }
 }
 
-std::string CodeGenCUDA::CastFromTo(std::string value, DataType from, DataType target) {
+std::string CodeGenCUDA::CastFromTo(std::string value, DLDataType from, DLDataType target) {
   if (from == target) return value;
+  PrimType from_ty(from);
+  PrimType target_ty(target);
   std::ostringstream os;
   os << "((";
   this->PrintType(target, os);
   os << ")";
-  if (from.is_float16() && (target.is_int() || target.is_uint()) && target.bits() == 8) {
+  if (from_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) &&
+      (target_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) &&
+      target.bits == 8) {
     os << "(";
-    if (target.is_uint()) {
+    if (target_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << "u";
     }
     os << "int)";
@@ -794,33 +839,22 @@ void CodeGenCUDA::AddUtilFunction(const std::string& func_name, const std::strin
 }
 
 void CodeGenCUDA::VisitExpr_(const CastNode* op, std::ostream& os) {
-  DataType from_ty = op->value.dtype();
-  DataType target_ty = op->dtype;
+  DLDataType from_dtype = op->value.ty()->dtype;
+  DLDataType target_dtype = op->ty()->dtype;
+  PrimType from_ty(from_dtype);
+  PrimType target_ty(target_dtype);
   TVM_FFI_ICHECK_EQ(target_ty.lanes(), from_ty.lanes());
 
   // Emit simple C-style type conversion.
-  if (from_ty.is_scalar()) return CodeGenC::VisitExpr_(op, os);
-
-  if (target_ty.code() == DataType::kFloat8_e3m4 || target_ty.code() == DataType::kFloat8_e4m3 ||
-      target_ty.code() == DataType::kFloat8_e4m3b11fnuz ||
-      target_ty.code() == DataType::kFloat8_e4m3fn ||
-      target_ty.code() == DataType::kFloat8_e4m3fnuz ||
-      target_ty.code() == DataType::kFloat8_e5m2 ||
-      target_ty.code() == DataType::kFloat8_e5m2fnuz ||
-      target_ty.code() == DataType::kFloat8_e8m0fnu ||
-      target_ty.code() == DataType::kFloat4_e2m1fn ||
-
-      from_ty.code() == DataType::kFloat8_e3m4 || from_ty.code() == DataType::kFloat8_e4m3 ||
-      from_ty.code() == DataType::kFloat8_e4m3b11fnuz ||
-      from_ty.code() == DataType::kFloat8_e4m3fn || from_ty.code() == DataType::kFloat8_e4m3fnuz ||
-      from_ty.code() == DataType::kFloat8_e5m2 || from_ty.code() == DataType::kFloat8_e5m2fnuz ||
-      from_ty.code() == DataType::kFloat8_e8m0fnu || from_ty.code() == DataType::kFloat4_e2m1fn) {
+  if (from_ty.IsScalar()) return CodeGenC::VisitExpr_(op, os);
+
+  if (IsCUDAPackedFloat(target_ty.code()) || IsCUDAPackedFloat(from_ty.code())) {
     std::ostringstream val;
-    if (target_ty.code() == DataType::kBFloat && target_ty.lanes() == 2) {
+    if (target_ty.code() == DLDataTypeCode::kDLBfloat && target_ty.lanes() == 2) {
       val << "cast_to_nv_bfloat162(" << PrintExpr(op->value) << ")";
     } else {
       val << "(";
-      PrintType(target_ty, val);
+      PrintType(target_dtype, val);
       val << ")(" << PrintExpr(op->value) << ")";
     }
     os << val.str();
@@ -831,18 +865,18 @@ void CodeGenCUDA::VisitExpr_(const CastNode* op, std::ostream& os) {
   // too compact to read. Emit this as vectorized unary ops.
   std::string sret = name_supply_->FreshName("_");
   this->PrintIndent();
-  this->PrintType(target_ty, stream);
+  this->PrintType(target_dtype, stream);
   stream << ' ' << sret << ";\n";
   {
-    std::string src = SSAGetID(PrintExpr(op->value), from_ty);
+    std::string src = SSAGetID(PrintExpr(op->value), from_dtype);
     for (int i = 0, lanes = from_ty.lanes(); i < lanes; ++i) {
       std::ostringstream val;
       val << "(";
-      PrintType(target_ty.element_of(), val);
+      PrintType(DLDataType{target_dtype.code, target_dtype.bits, 1}, val);
       val << ")(";
-      PrintVecElemLoad(src, from_ty, i, val);
+      PrintVecElemLoad(src, from_dtype, i, val);
       val << ")";
-      PrintVecElemStore(sret, target_ty, i, val.str());
+      PrintVecElemStore(sret, target_dtype, i, val.str());
     }
   }
   os << sret;
@@ -851,8 +885,9 @@ void CodeGenCUDA::VisitExpr_(const CastNode* op, std::ostream& os) {
 void CodeGenCUDA::PrintCallExtern(Type ret_type, ffi::String global_symbol,
                                   const ffi::Array<PrimExpr>& args, bool skip_first_arg,
                                   std::ostream& os) {  // NOLINT(*)
-  DataType ret_dtype = GetRuntimeDataType(ret_type);
-  if (ret_dtype.is_fixed_length_vector()) {
+  DLDataType ret_dtype = GetRuntimeDataType(ret_type);
+  PrimType ret_ty(ret_dtype);
+  if (ret_ty.IsFixedLengthVector()) {
     //
     // Emit an unsupported vector call
     //
@@ -881,17 +916,17 @@ void CodeGenCUDA::PrintCallExtern(Type ret_type, ffi::String global_symbol,
       std::vector<std::string> sargs;
       size_t arg_begin = static_cast<size_t>(skip_first_arg);
       for (size_t i = arg_begin; i < args.size(); ++i) {
-        std::string val = SSAGetID(PrintExpr(args[i]), args[i].dtype());
+        std::string val = SSAGetID(PrintExpr(args[i]), args[i].ty()->dtype);
         sargs.push_back(std::move(val));
       }
 
       // Emit a scalar call for each lane.
-      for (int i = 0; i < ret_dtype.lanes(); ++i) {
+      for (int i = 0; i < ret_ty.lanes(); ++i) {
         std::ostringstream scall;
         scall << global_symbol << "(";
         for (size_t j = 0; j < sargs.size(); ++j) {
           if (j > 0) scall << ", ";
-          PrintVecElemLoad(sargs[j], args[arg_begin + j].dtype(), i, scall);
+          PrintVecElemLoad(sargs[j], args[arg_begin + j].ty()->dtype, i, scall);
         }
         scall << ")";
         PrintVecElemStore(sret, ret_dtype, i, scall.str());
@@ -1196,7 +1231,7 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
     std::string local_ptr = this->PrintExpr(op->args[3]);
     std::string local_offset = this->PrintExpr(op->args[4]);
     std::string smem_ptr = this->PrintExpr(op->args[5]);
-    if (trans && op->dtype.bits() == 8) {
+    if (trans && op->ty()->dtype.bits == 8) {
       // ldmatrix can't transpose 8-bit elements (it assumes 16-bit), so
       // synthesize the equivalent manual gather loop. args[6] is the
       // shared-memory stride for this fallback.
@@ -1317,39 +1352,46 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
            << guard << ")\n";
     stream << ");\n";
   } else if (op->op.same_as(builtin::reinterpret())) {
-    DataType tgt_dtype = op->dtype;
-    DataType src_dtype = op->args[0]->dtype;
+    DLDataType tgt_dtype = op->ty()->dtype;
+    DLDataType src_dtype = op->args[0].ty()->dtype;
+    PrimType tgt_ty(tgt_dtype);
+    PrimType src_ty(src_dtype);
     PrimExpr value = op->args[0];
 
-    if (src_dtype.is_handle() && tgt_dtype.is_scalar() &&
-        (tgt_dtype.is_uint() || tgt_dtype.is_int()) && tgt_dtype.bits() == 64) {
+    if (src_ty.IsHandle() && tgt_ty.IsScalar() &&
+        tgt_ty.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt) &&
+        tgt_dtype.bits == 64) {
       os << "reinterpret_cast<";
       this->PrintType(tgt_dtype, os);
       os << ">(" << PrintExpr(value) << ")";
       return;
     }
-    if (tgt_dtype.is_handle() && src_dtype.is_scalar() &&
-        (src_dtype.is_uint() || src_dtype.is_int()) && src_dtype.bits() == 64) {
+    if (tgt_ty.IsHandle() && src_ty.IsScalar() &&
+        src_ty.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt) &&
+        src_dtype.bits == 64) {
       os << "reinterpret_cast<void*>(" << PrintExpr(value) << ")";
       return;
     }
 
     // Handle float4_e2m1fn reinterpret
-    if (!src_dtype.is_float4_e2m1fn() && !tgt_dtype.is_float4_e2m1fn()) {
+    if (!IsCUDAFloat4(src_ty.code()) && !IsCUDAFloat4(tgt_ty.code())) {
       return CodeGenC::VisitExpr_(op, os);
     }
     if (src_dtype == tgt_dtype ||
-        tgt_dtype.lanes() * tgt_dtype.bits() == src_dtype.lanes() * src_dtype.bits()) {
+        tgt_ty.lanes() * tgt_dtype.bits == src_ty.lanes() * src_dtype.bits) {
       return CodeGenC::VisitExpr_(op, os);
     }
-    TVM_FFI_ICHECK_EQ(tgt_dtype.lanes(), src_dtype.lanes())
+    TVM_FFI_ICHECK_EQ(tgt_ty.lanes(), src_ty.lanes())
         << "E2M1 float4 reinterpret expects source and target to have the same number of lanes. "
-        << "Source dtype: " << src_dtype << ", Target dtype: " << tgt_dtype;
-    TVM_FFI_ICHECK_EQ(tgt_dtype.bytes(), src_dtype.bytes())
+        << "Source dtype: " << ffi::DLDataTypeToString(src_dtype)
+        << ", Target dtype: " << ffi::DLDataTypeToString(tgt_dtype);
+    // Compare per-element storage bytes; whole-vector bit totals are known to differ here.
+    TVM_FFI_ICHECK_EQ((tgt_dtype.bits + 7) / 8, (src_dtype.bits + 7) / 8)
         << "E2M1 float4 reinterpret expects source and target to have the same number of bytes. "
-        << "Source dtype: " << src_dtype << ", Target dtype: " << tgt_dtype;
+        << "Source dtype: " << ffi::DLDataTypeToString(src_dtype)
+        << ", Target dtype: " << ffi::DLDataTypeToString(tgt_dtype);
 
-    int lanes = tgt_dtype.lanes();
+    int lanes = tgt_ty.lanes();
 
     int ssa_scope = BeginScope();
     if (lanes == 1) {
@@ -1360,47 +1402,47 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
       this->PrintType(tgt_dtype, os);
       os << " *)(&(" << rhs << ")))";
     } else if (lanes == 2) {
-      if (tgt_dtype.is_float4_e2m1fn()) {
+      if (IsCUDAFloat4(tgt_ty.code())) {
         // We view the source as an uint16, and then extract bits of two fp4 numbers,
         // and finally reinterpret the result as fp4x2.
-        value = tirx::Call(DataType::UInt(16), tirx::builtin::reinterpret(), {value});
-        tirx::Var temp_var("temp_var", DataType::UInt(16));
+        value = tirx::Call(PrimType::UInt(16), tirx::builtin::reinterpret(), {value});
+        tirx::Var temp_var("temp_var", PrimType::UInt(16));
         value = tirx::Let(temp_var, value,
-                          tirx::Cast(DataType::UInt(8),
-                                     (temp_var & IntImm(DataType::UInt(16), 0xF)) |
-                                         ((temp_var >> 4) & IntImm(DataType::UInt(16), 0xF0))));
+                          tirx::Cast(PrimType::UInt(8),
+                                     (temp_var & IntImm(PrimType::UInt(16), 0xF)) |
+                                         ((temp_var >> 4) & IntImm(PrimType::UInt(16), 0xF0))));
       } else {
-        value = tirx::Cast(DataType::UInt(16),
-                           tirx::Call(DataType::UInt(8), tirx::builtin::reinterpret(), {value}));
-        tirx::Var temp_var("temp_var", DataType::UInt(16));
+        value = tirx::Cast(PrimType::UInt(16),
+                           tirx::Call(PrimType::UInt(8), tirx::builtin::reinterpret(), {value}));
+        tirx::Var temp_var("temp_var", PrimType::UInt(16));
         value = tirx::Let(temp_var, value,
-                          (temp_var & IntImm(DataType::UInt(16), 0xF)) |
-                              ((temp_var & IntImm(DataType::UInt(16), 0xF0)) << 4));
+                          (temp_var & IntImm(PrimType::UInt(16), 0xF)) |
+                              ((temp_var & IntImm(PrimType::UInt(16), 0xF0)) << 4));
       }
-      os << PrintExpr(tirx::Call(tgt_dtype, tirx::builtin::reinterpret(), {value}));
+      os << PrintExpr(tirx::Call(PrimType(tgt_dtype), tirx::builtin::reinterpret(), {value}));
     } else if (lanes == 4) {
-      if (tgt_dtype.is_float4_e2m1fn()) {
+      if (IsCUDAFloat4(tgt_ty.code())) {
         // We view the source as an uint32, and then extract bits of four fp4 numbers,
         // and finally reinterpret the result as fp4x4.
-        value = tirx::Call(DataType::UInt(32), tirx::builtin::reinterpret(), {value});
-        tirx::Var temp_var("temp_var", DataType::UInt(32));
+        value = tirx::Call(PrimType::UInt(32), tirx::builtin::reinterpret(), {value});
+        tirx::Var temp_var("temp_var", PrimType::UInt(32));
         value = tirx::Let(temp_var, value,
-                          tirx::Cast(DataType::UInt(16),
-                                     (temp_var & IntImm(DataType::UInt(32), 0xF)) |
-                                         ((temp_var >> 4) & IntImm(DataType::UInt(32), 0xF0)) |
-                                         ((temp_var >> 8) & IntImm(DataType::UInt(32), 0xF00)) |
-                                         ((temp_var >> 12) & IntImm(DataType::UInt(32), 0xF000))));
+                          tirx::Cast(PrimType::UInt(16),
+                                     (temp_var & IntImm(PrimType::UInt(32), 0xF)) |
+                                         ((temp_var >> 4) & IntImm(PrimType::UInt(32), 0xF0)) |
+                                         ((temp_var >> 8) & IntImm(PrimType::UInt(32), 0xF00)) |
+                                         ((temp_var >> 12) & IntImm(PrimType::UInt(32), 0xF000))));
       } else {
-        value = tirx::Cast(DataType::UInt(32),
-                           tirx::Call(DataType::UInt(16), tirx::builtin::reinterpret(), {value}));
-        tirx::Var temp_var("temp_var", DataType::UInt(32));
+        value = tirx::Cast(PrimType::UInt(32),
+                           tirx::Call(PrimType::UInt(16), tirx::builtin::reinterpret(), {value}));
+        tirx::Var temp_var("temp_var", PrimType::UInt(32));
         value = tirx::Let(temp_var, value,
-                          (temp_var & IntImm(DataType::UInt(32), 0xF)) |
-                              ((temp_var & IntImm(DataType::UInt(32), 0xF0)) << 4) |
-                              ((temp_var & IntImm(DataType::UInt(32), 0xF00)) << 8) |
-                              ((temp_var & IntImm(DataType::UInt(32), 0xF000)) << 12));
+                          (temp_var & IntImm(PrimType::UInt(32), 0xF)) |
+                              ((temp_var & IntImm(PrimType::UInt(32), 0xF0)) << 4) |
+                              ((temp_var & IntImm(PrimType::UInt(32), 0xF00)) << 8) |
+                              ((temp_var & IntImm(PrimType::UInt(32), 0xF000)) << 12));
       }
-      os << PrintExpr(tirx::Call(tgt_dtype, tirx::builtin::reinterpret(), {value}));
+      os << PrintExpr(tirx::Call(PrimType(tgt_dtype), tirx::builtin::reinterpret(), {value}));
     } else {
       TVM_FFI_THROW(InternalError)
           << "Invalid number of lanes for float4_e2m1fn reinterpret: " << lanes;
@@ -1411,7 +1453,8 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
 
     const PrimExpr& arg = op->args[0];
     const auto* var_node = arg.as<VarNode>();
-    DataType dtype = op->dtype;
+    DLDataType dtype = op->ty()->dtype;
+    PrimType dtype_ty(dtype);
     bool is_string = op->args[2].as<IntImmNode>()->value;
     bool is_scalar = op->args[3].as<IntImmNode>()->value;
     int num_dims = op->args[4].as<IntImmNode>()->value;
@@ -1432,22 +1475,23 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
     if (is_scalar) {
       // Scalar printing logic
       std::string format_specifier;
-      bool is_float16 = dtype.is_float() && dtype.bits() == 16;
-      if (dtype.is_float())
+      bool is_float16 = dtype_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16);
+      if (dtype_ty.code() == DLDataTypeCode::kDLFloat)
         format_specifier = "%f";
-      else if (dtype.is_int())
+      else if (dtype_ty.MatchesCode(DLDataTypeCode::kDLInt))
         format_specifier = "%d";
-      else if (dtype.is_uint())
+      else if (dtype_ty.MatchesCode(DLDataTypeCode::kDLUInt))
         format_specifier = "%u";
       else
-        TVM_FFI_THROW(InternalError) << "Unsupported data type for scalar print: " << dtype;
+        TVM_FFI_THROW(InternalError)
+            << "Unsupported data type for scalar print: " << ffi::DLDataTypeToString(dtype);
 
       std::string print_arg = var_node ? ("*" + GetVarID(var_node)) : PrintExpr(arg);
       os << "// print_buffer starts (scalar)\n"
          << "if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n"
-         << "  printf(\"Scalar (dtype: " << dtype << "): " << format_specifier << "\\n\\n\", "
-         << (is_float16 ? "static_cast<float>(" : "") << print_arg << (is_float16 ? ")" : "")
-         << ");\n"
+         << "  printf(\"Scalar (dtype: " << ffi::DLDataTypeToString(dtype)
+         << "): " << format_specifier << "\\n\\n\", " << (is_float16 ? "static_cast<float>(" : "")
+         << print_arg << (is_float16 ? ")" : "") << ");\n"
          << "}\n"
          << "// print_buffer ends\n";
       return;
@@ -1460,19 +1504,20 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
 
     std::string format_specifier;
     bool is_float16 = false;
-    if (dtype.is_float()) {
-      if (dtype.bits() == 16) {
+    if (dtype_ty.code() == DLDataTypeCode::kDLFloat) {
+      if (dtype.bits == 16) {
         format_specifier = "%f";
         is_float16 = true;
       } else {
         format_specifier = "%f";
       }
-    } else if (dtype.is_int()) {
+    } else if (dtype_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
       format_specifier = "%d";
-    } else if (dtype.is_uint()) {
+    } else if (dtype_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
       format_specifier = "%u";
     } else {
-      TVM_FFI_THROW(InternalError) << "Unsupported data type for print: " << dtype;
+      TVM_FFI_THROW(InternalError)
+          << "Unsupported data type for print: " << ffi::DLDataTypeToString(dtype);
     }
 
     TVM_FFI_ICHECK(var_node) << "Formatted print is only supported for buffer variables.";
@@ -1485,7 +1530,7 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
     for (int i = 0; i < num_dims; ++i) {
       os << PrintExpr(shape[i]) << (i < num_dims - 1 ? "," : "");
     }
-    os << "), dtype=" << dtype << "):\\n\");\n";
+    os << "), dtype=" << ffi::DLDataTypeToString(dtype) << "):\\n\");\n";
 
     std::vector<std::string> loop_vars;
     for (int i = 0; i < num_dims; ++i) {
@@ -1572,7 +1617,7 @@ void CodeGenCUDA::VisitStmt_(const AttrStmtNode* op) {
         << "For CUDA, the index of an async queue must be 0.";
     this->VisitStmt(op->body);
     static const Op& ptx_cp_async_commit_group_op = Op::Get("tirx.ptx.cp_async_commit_group");
-    auto commit_group = Call(DataType::Void(), ptx_cp_async_commit_group_op, {});
+    auto commit_group = Call(PrimType::Void(), ptx_cp_async_commit_group_op, {});
     this->PrintIndent();
     this->VisitExpr(commit_group, this->stream);
     this->stream << ";\n";
@@ -1584,7 +1629,7 @@ void CodeGenCUDA::VisitStmt_(const AttrStmtNode* op) {
         << "For CUDA, the index of an async queue must be 0.";
     auto wait_cnt = wait_attrs.second;
     static const Op& ptx_cp_async_wait_group_op = Op::Get("tirx.ptx.cp_async_wait_group");
-    auto wait_group = Call(DataType::Void(), ptx_cp_async_wait_group_op, {wait_cnt});
+    auto wait_group = Call(PrimType::Void(), ptx_cp_async_wait_group_op, {wait_cnt});
     this->PrintIndent();
     this->VisitExpr(wait_group, this->stream);
     this->stream << ";\n";
@@ -1614,19 +1659,23 @@ void CodeGenCUDA::VisitStmt_(const AllocBufferNode* op) {
   this->PrintIndent();
   std::string scope = GetPtrStorageScope(op->buffer->data);
   const VarNode* buffer = op->buffer->data.as<VarNode>();
-  DataType dtype = op->buffer->dtype;
+  DLDataType dtype = op->buffer->dtype->dtype;
 
   if (scope.find("wmma.") == 0) {
     if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") {
-      TVM_FFI_ICHECK(dtype == DataType::Float(16) || dtype == DataType::Int(8) ||
-                     dtype == DataType::UInt(8) || dtype == DataType::Int(4) ||
-                     dtype == DataType::UInt(4) || dtype == DataType::Int(1) ||
-                     dtype == DataType::BFloat(16))
+      bool supported_wmma_input_dtype =
+          dtype == DLDataType{kDLFloat, 16, 1} || dtype == DLDataType{kDLInt, 8, 1} ||
+          dtype == DLDataType{kDLUInt, 8, 1} || dtype == DLDataType{kDLInt, 4, 1} ||
+          dtype == DLDataType{kDLUInt, 4, 1} || dtype == DLDataType{kDLInt, 1, 1} ||
+          dtype == DLDataType{kDLBfloat, 16, 1};
+      TVM_FFI_ICHECK(supported_wmma_input_dtype)
           << "Matrix_a and matrix_b only support half or char or unsigned char "
           << "or uint4 or int4 or int1 type for now";
     } else {
-      TVM_FFI_ICHECK(dtype == DataType::Float(16) || dtype == DataType::Float(32) ||
-                     dtype == DataType::Int(32))
+      bool supported_wmma_accumulator_dtype = dtype == DLDataType{kDLFloat, 16, 1} ||
+                                              dtype == DLDataType{kDLFloat, 32, 1} ||
+                                              dtype == DLDataType{kDLInt, 32, 1};
+      TVM_FFI_ICHECK(supported_wmma_accumulator_dtype)
           << "Accumulator only support half, float and int type for now";
     }
     PrintWmmaScope(scope, dtype, buffer, stream);
@@ -1662,9 +1711,11 @@ void CodeGenCUDA::VisitStmt_(const AllocBufferNode* op) {
     if (scope.find("wmma.") == 0) {
       constant_size = GetWmmaFragmentSize(scope, buffer, constant_size);
     }
-    if ((dtype == DataType::Int(4) || dtype == DataType::UInt(4) || dtype == DataType::Int(1)) &&
-        scope == "shared") {
-      constant_size = constant_size / (32 / dtype.bits());
+    bool is_packed_integer_dtype = dtype == DLDataType{kDLInt, 4, 1} ||
+                                   dtype == DLDataType{kDLUInt, 4, 1} ||
+                                   dtype == DLDataType{kDLInt, 1, 1};
+    if (is_packed_integer_dtype && scope == "shared") {
+      constant_size = constant_size / (32 / dtype.bits);
     }
     stream << ' ' << vid << '[' << constant_size << "];\n";
   }
@@ -1693,9 +1744,10 @@ void CodeGenCUDA::VisitStmt_(const EvaluateNode* op) {
 }
 
 void CodeGenCUDA::VisitExpr_(const RampNode* op, std::ostream& os) {
-  int lanes = op->dtype.lanes();
+  PrimType op_ty = op->ty();
+  int lanes = op_ty.lanes();
   if (lanes <= 4) {
-    PrintVecConstructor(op->dtype, os);
+    PrintVecConstructor(op->ty()->dtype, os);
     os << "(";
     for (int i = 0; i < lanes; i++) {
       os << "(" << PrintExpr(op->base) << ")"
@@ -1710,16 +1762,16 @@ void CodeGenCUDA::VisitExpr_(const RampNode* op, std::ostream& os) {
   // constructor argument layout does not match TIR vector lane layout.
   std::string sret = name_supply_->FreshName("_");
   this->PrintIndent();
-  this->PrintType(op->dtype, stream);
+  this->PrintType(op->ty()->dtype, stream);
   stream << ' ' << sret << ";\n";
   int ssa_scope = BeginScope();
   {
-    std::string vbase = SSAGetID(PrintExpr(op->base), op->base.dtype());
-    std::string vstride = SSAGetID(PrintExpr(op->stride), op->stride.dtype());
+    std::string vbase = SSAGetID(PrintExpr(op->base), op->base.ty()->dtype);
+    std::string vstride = SSAGetID(PrintExpr(op->stride), op->stride.ty()->dtype);
     for (int i = 0; i < lanes; ++i) {
       std::ostringstream value_temp;
       value_temp << "(" << vbase << ")+(" << vstride << "*" << i << ")";
-      PrintVecElemStore(sret, op->dtype, i, value_temp.str());
+      PrintVecElemStore(sret, op->ty()->dtype, i, value_temp.str());
     }
   }
   EndScope(ssa_scope);
@@ -1727,14 +1779,16 @@ void CodeGenCUDA::VisitExpr_(const RampNode* op, std::ostream& os) {
 }
 
 void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NOLINT(*)
-  int lanes = op->dtype.lanes();
-  if ((op->dtype.is_int() || op->dtype.is_uint()) && op->dtype.bits() == 8 && lanes == 4) {
+  PrimType op_ty = op->ty();
+  int lanes = op_ty.lanes();
+  if ((op_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) && op_ty.bits() == 8 &&
+      lanes == 4) {
     // make_int8x4
     const int64_t* p = as_const_int(op->value);
     TVM_FFI_ICHECK(p);
     int64_t v = *p & 0xFF;
     v = (v << 24) | (v << 16) | (v << 8) | v;
-    if (op->dtype.is_uint()) {
+    if (op_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << "(uint)" << v;
     } else {
       os << "(int)" << v;
@@ -1742,9 +1796,9 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
     return;
   }
 
-  if (op->dtype.is_float16()) {
+  if (op_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
     std::string v = PrintExpr(op->value);
-    PrintVecConstructor(op->dtype, os);
+    PrintVecConstructor(op->ty()->dtype, os);
     os << '(';
     if (lanes <= 4) {
       for (int i = 0; i < lanes / 2; ++i) {
@@ -1761,9 +1815,9 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
     return;
   }
 
-  if (op->dtype.is_bfloat16()) {
+  if (op_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     std::string v = PrintExpr(op->value);
-    PrintVecConstructor(op->dtype, os);
+    PrintVecConstructor(op->ty()->dtype, os);
     os << '(';
     if (lanes > 4) {
       for (int i = 0; i < lanes / 2; ++i) {
@@ -1780,12 +1834,11 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
     return;
   }
 
-  if (op->dtype.is_float8() || op->dtype.is_float4()) {
-    int lanes = op->dtype.lanes();
+  if (IsCUDAFloat8(op_ty.code()) || IsCUDAFloat4(op_ty.code())) {
     TVM_FFI_ICHECK(lanes == 1 || lanes == 2 || lanes == 4);
     std::string v = PrintExpr(op->value);
     // Implicit conversion from float back to fp8
-    PrintType(op->dtype, os);
+    PrintType(op->ty()->dtype, os);
     os << "(make_float" << lanes << "(";
     for (int i = 0; i < lanes; ++i) {
       if (i != 0) os << ", ";
@@ -1795,7 +1848,7 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
     return;
   }
 
-  if ((op->dtype.is_int() || op->dtype.is_uint()) && op->dtype.bits() == 4) {
+  if ((op_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) && op_ty.bits() == 4) {
     bool fail = false;
     const int64_t* p = as_const_int(op->value);
     TVM_FFI_ICHECK(p);
@@ -1803,7 +1856,7 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
 
     if (lanes == 4) {
       v = (v << 12) | (v << 8) | (v << 4) | v;
-      if (op->dtype.is_uint()) {
+      if (op_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
         os << "(uint16_t)" << v;
       } else {
         os << "(int16_t)" << v;
@@ -1811,17 +1864,17 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
     } else {
       v = (v << 28) | (v << 24) | (v << 20) | (v << 16) | (v << 12) | (v << 8) | (v << 4) | v;
       if (lanes == 8) {
-        if (op->dtype.is_uint()) {
+        if (op_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
           os << "(uint)" << v;
         } else {
           os << "(int)" << v;
         }
       } else if (lanes == 16 || lanes == 32) {
-        PrintVecConstructor(op->dtype, os);
+        PrintVecConstructor(op->ty()->dtype, os);
         os << '(';
         for (int i = 0; i < lanes / 8; ++i) {
           if (i != 0) os << ", ";
-          if (op->dtype.is_uint()) {
+          if (op_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
             os << "(uint)" << v;
           } else {
             os << "(int)" << v;
@@ -1839,7 +1892,7 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
   }
 
   std::string v = PrintExpr(op->value);
-  PrintVecConstructor(op->dtype, os);
+  PrintVecConstructor(op->ty()->dtype, os);
   os << '(';
   for (int i = 0; i < lanes; ++i) {
     if (i != 0) os << ", ";
@@ -1849,47 +1902,51 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
 }
 
 void CodeGenCUDA::VisitExpr_(const SelectNode* op, std::ostream& os) {
+  PrimType op_ty = op->ty();
   // Non-vector cases.
-  if (!op->dtype.is_fixed_length_vector()) {
+  if (!op_ty.IsFixedLengthVector()) {
     CodeGenC::VisitExpr_(op, os);
     return;
   }
 
   // Codegen vector condition case by serializing the select op.
-  TVM_FFI_ICHECK(op->false_value->dtype == op->dtype && op->true_value->dtype == op->dtype &&
-                 op->dtype.lanes() == op->condition.dtype().lanes());
+  // Compare the underlying dtype values rather than the PrimType handles themselves.
+  TVM_FFI_ICHECK(op->false_value.ty()->dtype == op_ty->dtype &&
+                 op->true_value.ty()->dtype == op_ty->dtype &&
+                 op_ty.lanes() == op->condition.ty().lanes());
 
   std::string r_var = name_supply_->FreshName("_");
   this->PrintIndent();
-  this->PrintType(op->dtype, stream);
+  this->PrintType(op->ty()->dtype, stream);
   stream << ' ' << r_var << ";\n";
   {
-    std::string c_var = SSAGetID(PrintExpr(op->condition), op->dtype);
-    std::string t_var = SSAGetID(PrintExpr(op->true_value), op->dtype);
-    std::string f_var = SSAGetID(PrintExpr(op->false_value), op->dtype);
+    std::string c_var = SSAGetID(PrintExpr(op->condition), op->ty()->dtype);
+    std::string t_var = SSAGetID(PrintExpr(op->true_value), op->ty()->dtype);
+    std::string f_var = SSAGetID(PrintExpr(op->false_value), op->ty()->dtype);
 
     // The condition is stored as an ushort vector.
-    int lanes = op->dtype.lanes();
-    DataType memory_ty(DataType::TypeCode::kUInt, 16, lanes);
+    int lanes = op_ty.lanes();
+    DLDataType memory_dtype{kDLUInt, 16, static_cast<uint16_t>(lanes)};
 
     for (int i = 0; i < lanes; ++i) {
       std::ostringstream item;
       item << "(bool(";
-      PrintVecElemLoad(c_var, memory_ty, i, item);
+      PrintVecElemLoad(c_var, memory_dtype, i, item);
       item << ")?";
-      PrintVecElemLoad(t_var, op->dtype, i, item);
+      PrintVecElemLoad(t_var, op->ty()->dtype, i, item);
       item << ':';
-      PrintVecElemLoad(f_var, op->dtype, i, item);
+      PrintVecElemLoad(f_var, op->ty()->dtype, i, item);
       item << ')';
-      PrintVecElemStore(r_var, op->dtype, i, item.str());
+      PrintVecElemStore(r_var, op->ty()->dtype, i, item.str());
     }
   }
   os << r_var;
 }
 
 inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p) {  // NOLINT(*)
+  PrimType op_ty = op->ty();
   // Type code is kBFloat
-  if (op->dtype.is_bfloat16()) {
+  if (op_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     os << "__float2bfloat16_rn";
     os << '(' << std::hexfloat << op->value << 'f';
     os << "/*" << std::scientific << op->value << "*/";
@@ -1897,15 +1952,15 @@ inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p)
     return;
   }
   // Type code is kFloat8_e5m2 or kE4M4Float
-  if (op->dtype.is_float8() || op->dtype.is_float4()) {
-    p->PrintType(op->dtype, os);
+  if (IsCUDAFloat8(op_ty.code()) || IsCUDAFloat4(op_ty.code())) {
+    p->PrintType(op->ty()->dtype, os);
     os << '(' << std::hexfloat << op->value << 'f';
     os << "/*" << std::scientific << op->value << "*/";
     os << ')';
     return;
   }
   // Type code is kFloat
-  switch (op->dtype.bits()) {
+  switch (op_ty.bits()) {
     case 64: {
       std::ostringstream temp;
       if (std::isinf(op->value)) {
@@ -1945,13 +2000,14 @@ inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p)
     }
     case 16: {
       os << "__float2half_rn" << '(';
-      FloatImm const_f32 = FloatImm(DataType::Float(32), op->value);
+      FloatImm const_f32 = FloatImm(PrimType::Float(32), op->value);
       PrintConst(const_f32.get(), os, p);
       os << ')';
       break;
     }
     default:
-      TVM_FFI_THROW(InternalError) << "Bad bit-width for float: " << op->dtype << "\n";
+      TVM_FFI_THROW(InternalError)
+          << "Bad bit-width for float: " << ffi::DLDataTypeToString(op->ty()->dtype) << "\n";
   }
 }
 
@@ -1959,25 +2015,27 @@ void CodeGenCUDA::VisitExpr_(const FloatImmNode* op, std::ostream& os) {  // NOL
   PrintConst(op, os, this);
 }
 
-void CodeGenCUDA::PrintWmmaScope(const std::string& scope, DataType t, const VarNode* variable,
+void CodeGenCUDA::PrintWmmaScope(const std::string& scope, DLDataType t, const VarNode* variable,
                                  std::ostream& os) {
+  PrimType t_ty(t);
   std::stringstream type;
   PrintType(t, type);
   TVM_FFI_ICHECK(fragment_shapes.count(variable))
       << "Cannot find shape of the wmma fragment " << variable->name_hint;
   std::string shape_str = fragment_shapes.at(variable);
-  if ((t.is_int() || t.is_uint()) && t.bits() < 8 && t.lanes() == 1) {
+  if ((t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) && t.bits < 8 &&
+      t_ty.lanes() == 1) {
     type.str(std::string());
-    if (t.is_int()) {
-      if (t.bits() == 4) {
+    if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
+      if (t.bits == 4) {
         type << "nvcuda::wmma::experimental::precision::s4";
-      } else if (t.bits() == 1) {
+      } else if (t.bits == 1) {
         type << "nvcuda::wmma::experimental::precision::b1";
       } else {
         TVM_FFI_THROW(InternalError) << "Unhandled interger type for wmma fragment!";
       }
-    } else if (t.is_uint()) {
-      if (t.bits() == 4) {
+    } else if (t_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
+      if (t.bits == 4) {
         type << "nvcuda::wmma::experimental::precision::u4";
       } else {
         TVM_FFI_THROW(InternalError) << "Unhandled interger type for wmma fragment!";
@@ -2029,20 +2087,25 @@ void CodeGenCUDA::HandleVolatileLoads(const std::string& value, const BufferLoad
   // Cast away volatile qualifier for fp16 types. That is, only loads and
   // stores are volatile. The loaded objects are not marked as volatile.
   //
-  if ((op->dtype.is_float16() || op->dtype.is_bfloat16()) && IsVolatile(op->buffer->data.get())) {
+  PrimType op_ty = op->ty();
+  if ((op_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) ||
+       op_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) &&
+      IsVolatile(op->buffer->data.get())) {
     os << "(";
-    PrintType(op->dtype, os);
+    PrintType(op_ty->dtype, os);
     os << ")(" << value << ")";
   } else {
     os << value;
   }
 }
 
-void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& value,
+void CodeGenCUDA::PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value,
                                        std::ostream& os) {
-  TVM_FFI_ICHECK_GT(t.lanes(), 1);
-  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
-    if (!(t.lanes() == 2 || t.lanes() == 3)) {
+  PrimType t_ty(t);
+  int lanes = t_ty.lanes();
+  TVM_FFI_ICHECK_GT(lanes, 1);
+  if (t.bits == 8 && (t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))) {
+    if (!(lanes == 2 || lanes == 3)) {
       if (i != 0) {
         os << "|";
       }
@@ -2051,12 +2114,12 @@ void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& val
     }
   }
 
-  if (t.is_float16()) {
+  if (t_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
     if (i == 0) {
       PrintVecConstructor(t, os);
       os << '(';
     }
-    if (i == t.lanes() - 1) {
+    if (i == lanes - 1) {
       os << value << ")";
     } else {
       os << value << ",";
@@ -2064,12 +2127,12 @@ void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& val
     return;
   }
 
-  if (t.is_bfloat16()) {
+  if (t_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     if (i == 0) {
       PrintVecConstructor(t, os);
       os << '(';
     }
-    if (i == t.lanes() - 1) {
+    if (i == lanes - 1) {
       os << value << ")";
     } else {
       os << value << ",";
@@ -2082,7 +2145,7 @@ void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& val
     os << "(";
   }
   os << value;
-  if (i != t.lanes() - 1) {
+  if (i != lanes - 1) {
     os << ",";
   } else {
     os << ")";
diff --git a/src/backend/cuda/codegen/codegen_cuda.h b/src/backend/cuda/codegen/codegen_cuda.h
index 92ca3cab34a4..94f86614e45e 100644
--- a/src/backend/cuda/codegen/codegen_cuda.h
+++ b/src/backend/cuda/codegen/codegen_cuda.h
@@ -56,16 +56,17 @@ class CodeGenCUDA final : public CodeGenC {
   void VisitStmt_(const WhileNode* op) final;
   void PrintStorageSync(const CallNode* op) final;
   void PrintStorageScope(const std::string& scope, std::ostream& os) final;  // NOLINT(*)
-  void PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, PrimExpr rhs,
-                        std::ostream& os) final;       // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;  // NOLINT(*)
-  void PrintVecConstructor(DataType t, std::ostream& os) final;
-  void PrintVecElemLoad(const std::string& vec, DataType t, int i,
+  void PrintVecBinaryOp(const std::string& op, DLDataType t, PrimExpr lhs, PrimExpr rhs,
+                        std::ostream& os) final;         // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;  // NOLINT(*)
+  void PrintVecConstructor(DLDataType t, std::ostream& os) final;
+  void PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                         std::ostream& os) final;  // NOLINT(*)
-  void PrintVecElemStore(const std::string& vec, DataType t, int i, const std::string& value) final;
+  void PrintVecElemStore(const std::string& vec, DLDataType t, int i,
+                         const std::string& value) final;
   void BindThreadIndex(const IterVar& iv) final;  // NOLINT(*)
-  void PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os) final;
-  std::string CastFromTo(std::string value, DataType from, DataType target) final;
+  void PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value, std::ostream& os) final;
+  std::string CastFromTo(std::string value, DLDataType from, DLDataType target) final;
   void AddUtilFunction(const std::string& name, const std::string& code);
   // overload visitor
   void VisitExpr_(const RampNode* op, std::ostream& os) final;       // NOLINT(*)
@@ -129,7 +130,7 @@ class CodeGenCUDA final : public CodeGenC {
   std::unordered_map<const VarNode*, std::string> fragment_shapes;
   std::unordered_map<const VarNode*, std::string> fragment_layouts;
   friend void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p);
-  void PrintWmmaScope(const std::string& scope, DataType t, const VarNode* variable,
+  void PrintWmmaScope(const std::string& scope, DLDataType t, const VarNode* variable,
                       std::ostream& os);
   int32_t GetWmmaFragmentSize(const std::string& scope, const VarNode* variable, int32_t size);
 };
diff --git a/src/backend/cuda/codegen/intrin_rule_cuda.cc b/src/backend/cuda/codegen/intrin_rule_cuda.cc
index dc8d4a020e1e..ea2d0abfa80e 100644
--- a/src/backend/cuda/codegen/intrin_rule_cuda.cc
+++ b/src/backend/cuda/codegen/intrin_rule_cuda.cc
@@ -34,8 +34,8 @@ namespace intrin {
 using tirx::FLowerIntrinsic;
 
 struct CUDAMath {
-  std::string operator()(DataType t, std::string name) const {
-    if (t.is_float()) {
+  std::string operator()(PrimType t, std::string name) const {
+    if (t.code() == DLDataTypeCode::kDLFloat) {
       switch (t.bits()) {
         case 64:
           // Use nearbyint (ties-to-even) for round to match constant-folding semantics.
@@ -56,7 +56,7 @@ struct CUDAMath {
         default:
           return "";
       }
-    } else if (t.is_bfloat16()) {
+    } else if (t.code() == DLDataTypeCode::kDLBfloat && t.bits() == 16) {
       if (name == "fabs") {
         return "__habs";
       } else if (name == "round") {
@@ -64,7 +64,7 @@ struct CUDAMath {
       } else {
         return "h" + name;
       }
-    } else if (t.is_int() || t.is_uint()) {
+    } else if (t.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
       switch (t.bits()) {
         case 32:
           return "__" + name;
@@ -79,8 +79,8 @@ struct CUDAMath {
 };
 
 struct CUDAFastMath : public CUDAMath {
-  std::string operator()(DataType t, std::string name) const {
-    if (t.is_float() && t.bits() == 32) {
+  std::string operator()(PrimType t, std::string name) const {
+    if (t.code() == DLDataTypeCode::kDLFloat && t.bits() == 32) {
       return "__" + name + 'f';
     } else {
       return CUDAMath::operator()(t, name);
@@ -90,8 +90,8 @@ struct CUDAFastMath : public CUDAMath {
 };
 
 struct CUDAFastMathTan : public CUDAMath {
-  std::string operator()(DataType t, std::string name) const {
-    if (t.is_float()) {
+  std::string operator()(PrimType t, std::string name) const {
+    if (t.code() == DLDataTypeCode::kDLFloat) {
       switch (t.bits()) {
         case 64:
           return name;
@@ -110,8 +110,8 @@ struct CUDAFastMathTan : public CUDAMath {
 };
 
 struct CUDAPopcount {
-  std::string operator()(DataType t, std::string name) const {
-    if (t.is_uint()) {
+  std::string operator()(PrimType t, std::string name) const {
+    if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
       switch (t.bits()) {
         case 32:
           return "__popc";
@@ -126,7 +126,7 @@ struct CUDAPopcount {
 };
 
 struct CUDAWarpIntrinsic {
-  const Op operator()(DataType t, const Op& orig_op) const {
+  const Op operator()(PrimType t, const Op& orig_op) const {
     if (orig_op.same_as(builtin::tvm_warp_shuffle())) {
       static const Op& cuda_shfl_sync_op = Op::Get("tirx.cuda.__shfl_sync");
       return cuda_shfl_sync_op;
@@ -147,7 +147,7 @@ struct CUDAWarpIntrinsic {
 static PrimExpr DispatchCUDAWarpActiveMask(const PrimExpr& e) {
   const CallNode* call = e.as<CallNode>();
   static const Op& cuda_active_mask_op = Op::Get("tirx.cuda.__activemask");
-  return Call(call->dtype, cuda_active_mask_op, call->args);
+  return Call(e.ty(), cuda_active_mask_op, call->args);
 }
 
 template <typename T>
@@ -156,7 +156,7 @@ static PrimExpr DispatchCUDAShuffle(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   TVM_FFI_ICHECK_EQ(call->args.size(), 5);  // mask, value, warp_id, width, warp_size
   ffi::Array<PrimExpr> cuda_args{{call->args[0], call->args[1], call->args[2], call->args[3]}};
-  return Call(call->dtype, T()(call->dtype, call->op.as_or_throw<Op>()), cuda_args);
+  return Call(e.ty(), T()(e.ty(), call->op.as_or_throw<Op>()), cuda_args);
 }
 
 void RegisterCudaIntrinRules() {
diff --git a/src/backend/cuda/codegen/llvm/codegen_nvptx.cc b/src/backend/cuda/codegen/llvm/codegen_nvptx.cc
index e523e2b22aab..eb84f10fda10 100644
--- a/src/backend/cuda/codegen/llvm/codegen_nvptx.cc
+++ b/src/backend/cuda/codegen/llvm/codegen_nvptx.cc
@@ -87,7 +87,7 @@ class CodeGenNVPTX : public CodeGenLLVM {
     }
 
     auto storage_scope = runtime::StorageScope::Create(GetPtrStorageScope(op->buffer->data));
-    DataType dtype = op->buffer->dtype;
+    PrimType dtype = op->buffer->dtype;
 
     if (storage_scope.rank == runtime::StorageRank::kShared && storage_scope.tag == ".dyn") {
       // Shared memory: address space == 3
@@ -230,7 +230,8 @@ class CodeGenNVPTX : public CodeGenLLVM {
 // corresponding nvvm intrinsic. Return true if the match is successful.
 static bool GetWarpShuffleIntrinsic(const CallNode* op, llvm::Intrinsic::ID* id) {
   // Only 32 bit data type is supported.
-  if (op->dtype.is_fixed_length_vector() || op->dtype.bits() != 32) {
+  PrimType op_ty = op->ty();
+  if (op_ty.IsFixedLengthVector() || op_ty.bits() != 32) {
     return false;
   }
 
@@ -253,7 +254,7 @@ static bool GetWarpShuffleIntrinsic(const CallNode* op, llvm::Intrinsic::ID* id)
     return false;
   }
 
-  *id = ids[offset + op->dtype.is_float()];
+  *id = ids[offset + (op_ty.code() == DLDataTypeCode::kDLFloat)];
   return true;
 }
 
@@ -279,10 +280,11 @@ llvm::Value* CodeGenNVPTX::CreateIntrinsic(const CallNode* op) {
     auto val = llvm::InlineAsm::get(fty, "activemask.b32 %0", "=r", true);
     return builder_->CreateCall(val);
   } else if (op->op.same_as(builtin::atomic_add())) {
-    TVM_FFI_ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now";
+    PrimType value_ty = op->args[1].ty();
+    TVM_FFI_ICHECK(value_ty.bits() == 32) << "Only supports 32 bit atomic for now";
     llvm::Value* v0 = MakeValue(op->args[0]);
     llvm::Value* v1 = MakeValue(op->args[1]);
-    if (op->args[1]->dtype.is_float()) {
+    if (value_ty.code() == DLDataTypeCode::kDLFloat) {
       return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, llvm::MaybeAlign(),
                                        llvm::AtomicOrdering::Monotonic);
     }
diff --git a/src/backend/cuda/codegen/llvm/intrin_rule_nvptx.cc b/src/backend/cuda/codegen/llvm/intrin_rule_nvptx.cc
index d8706a94b181..13d6f7d95a3b 100644
--- a/src/backend/cuda/codegen/llvm/intrin_rule_nvptx.cc
+++ b/src/backend/cuda/codegen/llvm/intrin_rule_nvptx.cc
@@ -38,7 +38,8 @@ inline PrimExpr DispatchPureExternLibDevice(const PrimExpr& e) {
   using namespace tirx;
   const CallNode* call = e.as<CallNode>();
   TVM_FFI_ICHECK(call != nullptr);
-  TVM_FFI_ICHECK(call->dtype.bits() == 32 || call->dtype.bits() == 64)
+  PrimType call_ty = call->ty();
+  TVM_FFI_ICHECK(call_ty.bits() == 32 || call_ty.bits() == 64)
       << "Only support float32 or float64.";
 
   const OpNode* op = call->op.as<OpNode>();
@@ -48,13 +49,13 @@ inline PrimExpr DispatchPureExternLibDevice(const PrimExpr& e) {
 
   std::ostringstream intrinsic_name;
   intrinsic_name << "__nv_" << name.substr(5);
-  if (call->dtype.bits() == 32) intrinsic_name << "f";
+  if (call_ty.bits() == 32) intrinsic_name << "f";
 
   ffi::Array<PrimExpr> new_args = {StringImm(intrinsic_name.str())};
   for (auto arg : call->args) {
     new_args.push_back(arg);
   }
-  return Call(call->dtype, builtin::call_pure_extern(), new_args);
+  return Call(call->ty(), builtin::call_pure_extern(), new_args);
 }
 
 namespace llvm {
@@ -73,7 +74,7 @@ TVM_REGISTER_OP("tirx.round")
       const CallNode* call = e.as<CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       static const Op& nearbyint_op = Op::Get("tirx.nearbyint");
-      auto new_call = Call(call->dtype, nearbyint_op, call->args);
+      auto new_call = Call(call->ty(), nearbyint_op, call->args);
       return DispatchPureExternLibDevice(new_call);
     });
 
diff --git a/src/backend/cuda/runtime/cuda_device_api.cc b/src/backend/cuda/runtime/cuda_device_api.cc
index 68ae39de56bf..44d1acff4937 100644
--- a/src/backend/cuda/runtime/cuda_device_api.cc
+++ b/src/backend/cuda/runtime/cuda_device_api.cc
@@ -426,7 +426,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
     TVM_FFI_ICHECK_GE(args.size(), 4) << "init_cuTensorMap expects at least 4 arguments";
     size_t arg_cnt = 0;
     CUtensorMap* tensor_map = static_cast<CUtensorMap*>(args[arg_cnt++].cast<void*>());
-    runtime::DataType tensor_dtype = args[arg_cnt++].cast<runtime::DataType>();
+    DLDataType tensor_dtype = args[arg_cnt++].cast<DLDataType>();
     int32_t raw_tensor_rank = args[arg_cnt++].cast<int32_t>();
     TVM_FFI_ICHECK_GT(raw_tensor_rank, 0) << "tensorRank must be non-zero";
     TVM_FFI_ICHECK_LE(raw_tensor_rank, 5)
@@ -478,13 +478,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
     auto l2_promotion_kind = static_cast<CUtensorMapL2promotion>(args[arg_cnt++].cast<int>());
     auto oob_fill_kind = static_cast<CUtensorMapFloatOOBfill>(args[arg_cnt++].cast<int>());
 
-    TVM_FFI_ICHECK_EQ(tensor_dtype.lanes(), 1)
+    TVM_FFI_ICHECK_EQ(tensor_dtype.lanes, 1)
         << "Expect tensor_dtype to have lanes=1, but get " << tensor_dtype;
+    uint64_t tensor_dtype_bytes = (static_cast<uint64_t>(tensor_dtype.bits) + 7) / 8;
     CUtensorMapDataType cu_dtype;
-    switch (tensor_dtype.code()) {
-      case DataType::kInt:
+    switch (tensor_dtype.code) {
+      case kDLInt:
         // int
-        switch (tensor_dtype.bits()) {
+        switch (tensor_dtype.bits) {
           case 8:
             cu_dtype = CU_TENSOR_MAP_DATA_TYPE_UINT8;
             break;
@@ -499,9 +500,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                 << "Unsupported data type " << ffi::DLDataTypeToString(tensor_dtype);
         }
         break;
-      case DataType::kUInt:
+      case kDLUInt:
         // unsigned int
-        switch (tensor_dtype.bits()) {
+        switch (tensor_dtype.bits) {
           case 8:
             cu_dtype = CU_TENSOR_MAP_DATA_TYPE_UINT8;
             break;
@@ -519,9 +520,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                 << "Unsupported data type " << ffi::DLDataTypeToString(tensor_dtype);
         }
         break;
-      case DataType::kFloat:
+      case kDLFloat:
         // float
-        switch (tensor_dtype.bits()) {
+        switch (tensor_dtype.bits) {
           case 16:
             cu_dtype = CU_TENSOR_MAP_DATA_TYPE_FLOAT16;
             break;
@@ -536,9 +537,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                 << "Unsupported data type " << ffi::DLDataTypeToString(tensor_dtype);
         }
         break;
-      case DataType::kBFloat:
+      case kDLBfloat:
         // bfloat
-        switch (tensor_dtype.bits()) {
+        switch (tensor_dtype.bits) {
           case 16:
             cu_dtype = CU_TENSOR_MAP_DATA_TYPE_BFLOAT16;
             break;
@@ -547,15 +548,15 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                 << "Unsupported data type " << ffi::DLDataTypeToString(tensor_dtype);
         }
         break;
-      case DataType::kFloat8_e4m3fn:
+      case kDLFloat8_e4m3fn:
         // NV float8 e4m3
         cu_dtype = CU_TENSOR_MAP_DATA_TYPE_UINT8;
         break;
-      case DataType::kFloat8_e5m2:
+      case kDLFloat8_e5m2:
         // NV float8 e5m2
         cu_dtype = CU_TENSOR_MAP_DATA_TYPE_UINT8;
         break;
-      case DataType::kFloat4_e2m1fn:
+      case kDLFloat4_e2m1fn:
 #if (CUDA_VERSION >= 12080)
         // Packed FP4 in GMEM, unpacked into SMEM/TMEM-facing tiles.
         cu_dtype = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B;
@@ -674,7 +675,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
           << "globalDim[0] must be a multiple of 2 for packed 16U4 align8 format";
     }
     if (interleaved_kind == CU_TENSOR_MAP_INTERLEAVE_NONE && !is_packed_dtype) {
-      uint64_t inner_box_bytes = static_cast<uint64_t>(box_dim[0]) * tensor_dtype.bytes();
+      uint64_t inner_box_bytes = static_cast<uint64_t>(box_dim[0]) * tensor_dtype_bytes;
       TVM_FFI_ICHECK_EQ(inner_box_bytes % 16, 0)
           << "boxDim[0] * elementSizeInBytes(tensorDataType) must be a multiple of 16 bytes";
     }
@@ -694,15 +695,15 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
     if (interleaved_kind == CU_TENSOR_MAP_INTERLEAVE_NONE && !is_packed_dtype &&
         swizzle_kind == CU_TENSOR_MAP_SWIZZLE_32B) {
-      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype.bytes(), 32)
+      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype_bytes, 32)
           << "CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32.";
     } else if (interleaved_kind == CU_TENSOR_MAP_INTERLEAVE_NONE && !is_packed_dtype &&
                swizzle_kind == CU_TENSOR_MAP_SWIZZLE_64B) {
-      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype.bytes(), 64)
+      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype_bytes, 64)
           << "CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64.";
     } else if (interleaved_kind == CU_TENSOR_MAP_INTERLEAVE_NONE && !is_packed_dtype &&
                is_128b_swizzle) {
-      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype.bytes(), 128)
+      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype_bytes, 128)
           << "CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= "
              "128.";
     }
diff --git a/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc b/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc
index 017796918444..17aba2d3fc40 100644
--- a/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc
+++ b/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc
@@ -66,6 +66,11 @@
 namespace tvm {
 namespace codegen {
 
+TVM_FFI_INLINE int GetVectorBytes(const PrimType& dtype) {
+  TVM_FFI_ICHECK(dtype.IsFixedLengthVector() || dtype.IsScalar());
+  return static_cast<int>(dtype.StorageBytes());
+}
+
 // Hexagon code generation
 class CodeGenHexagon final : public CodeGenCPU {
  public:
@@ -97,12 +102,12 @@ class CodeGenHexagon final : public CodeGenCPU {
   void CreatePrintf(const std::string& format, llvm::ArrayRef<llvm::Value*> format_args) final;
 
  private:
-  TypedPointer CreateBufferPtr(llvm::Value* buffer_ptr, DataType buffer_element_dtype,
-                               llvm::ArrayRef<llvm::Value*> indices, DataType value_dtype) final;
+  TypedPointer CreateBufferPtr(llvm::Value* buffer_ptr, PrimType buffer_element_dtype,
+                               llvm::ArrayRef<llvm::Value*> indices, PrimType value_dtype) final;
 
   bool IsQHLFunction(const std::string& func);
 
-  llvm::Value* VectorLookupLoad(Buffer buffer, DataType buffer_type, ffi::Array<PrimExpr> indices);
+  llvm::Value* VectorLookupLoad(Buffer buffer, PrimType buffer_type, ffi::Array<PrimExpr> indices);
   llvm::Value* Intrinsic(llvm::Intrinsic::ID, llvm::ArrayRef<llvm::Value*> args);
   std::vector<std::string> fqhl_list_ = {
       "tvm_vect_qhmath_hvx_cos_ahf",     "tvm_vect_qhmath_hvx_tanh_ahf",
@@ -149,8 +154,9 @@ void CodeGenHexagon::InitTarget() {
 llvm::Value* CodeGenHexagon::CreateCallExternQHL(Type ret_type, ffi::String global_symbol,
                                                  const ffi::Array<PrimExpr>& args,
                                                  bool skip_first_arg) {
-  int num_lanes = args[1].dtype().lanes();
-  int vector_length = native_vector_bits_ / args[1].dtype().bits();
+  PrimType arg_ty = args[1].ty();
+  int num_lanes = arg_ty.lanes();
+  int vector_length = native_vector_bits_ / arg_ty.bits();
   num_lanes = ((num_lanes + vector_length - 1) / vector_length) * vector_length;
   std::vector<llvm::Value*> vect_split;
   for (int i = 0; i < num_lanes / vector_length; ++i) {
@@ -181,8 +187,9 @@ bool CodeGenHexagon::IsQHLFunction(const std::string& func) {
 llvm::Value* CodeGenHexagon::CreateCallExtern(Type ret_type, ffi::String global_symbol,
                                               const ffi::Array<PrimExpr>& args,
                                               bool skip_first_arg) {
-  int num_lanes = args[1].dtype().lanes();
-  int vector_length = native_vector_bits_ / args[1].dtype().bits();
+  PrimType arg_ty = args[1].ty();
+  int num_lanes = arg_ty.lanes();
+  int vector_length = native_vector_bits_ / arg_ty.bits();
   if (IsQHLFunction(global_symbol) && (num_lanes > vector_length))
     return CreateCallExternQHL(ret_type, global_symbol, args, skip_first_arg);
   return CodeGenCPU::CreateCallExtern(ret_type, global_symbol, args, skip_first_arg);
@@ -192,7 +199,7 @@ llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) {
   if (!op->buffer.same_as(op->buffer->data)) {
     // Check if we can generate a vector lookup.
     if (!op->indices[0].as<RampNode>()) {
-      if (auto* vlut = VectorLookupLoad(op->buffer, op->dtype, op->indices)) {
+      if (auto* vlut = VectorLookupLoad(op->buffer, op->ty(), op->indices)) {
         return vlut;
       }
     }
@@ -261,9 +268,9 @@ void CodeGenHexagon::CreatePrintf(const std::string& format,
 }
 
 CodeGenLLVM::TypedPointer CodeGenHexagon::CreateBufferPtr(llvm::Value* buffer_ptr,
-                                                          DataType buffer_element_dtype,
+                                                          PrimType buffer_element_dtype,
                                                           llvm::ArrayRef<llvm::Value*> indices,
-                                                          DataType value_dtype) {
+                                                          PrimType value_dtype) {
   // Flat indices get delegated to the LLVM codegen.
   if (indices.size() == 1) {
     return CodeGenCPU::CreateBufferPtr(buffer_ptr, buffer_element_dtype, indices, value_dtype);
@@ -274,7 +281,7 @@ CodeGenLLVM::TypedPointer CodeGenHexagon::CreateBufferPtr(llvm::Value* buffer_pt
       << "-d buffer indices";
 
   // Use the first index to identify the pointer.
-  DataType dtype_void_ptr = DataType::Handle();
+  PrimType dtype_void_ptr = PrimType::Handle();
   CodeGenLLVM::TypedPointer buffer_chunk_ptr_ptr =
       CodeGenCPU::CreateBufferPtr(buffer_ptr, dtype_void_ptr, {indices[0]}, dtype_void_ptr);
   llvm::Value* buffer_chunk_ptr =
@@ -317,10 +324,11 @@ llvm::Value* CodeGenHexagon::Intrinsic(llvm::Intrinsic::ID IntID,
   return builder_->CreateCall(intf_callee, conv_args);
 }
 
-llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_type,
+llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, PrimType buffer_type,
                                               ffi::Array<PrimExpr> indices) {
   PrimExpr index = indices[0];
-  if (!index.dtype().is_fixed_length_vector()) {
+  PrimType index_ty = index.ty();
+  if (!index_ty.IsFixedLengthVector()) {
     return nullptr;
   }
 
@@ -329,16 +337,16 @@ llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_typ
   int table_elem_count = arith::Analyzer()->Simplify(buffer->shape[0]).as<IntImmNode>()->value;
   if (table_elem_count <= 0 || table_elem_count > 256) return nullptr;
 
-  auto int32 = DataType::Int(32);
+  auto int32 = PrimType::Int(32);
   auto native_vector_bytes = native_vector_bits_ / 8;
 
   // Indexes
-  llvm::Value* trunc = MakeValue(Cast(index.dtype().with_bits(8), index));
+  llvm::Value* trunc = MakeValue(Cast(index_ty.WithBits(8), index));
   llvm::Value* index_pad = CreateVecPad(trunc, native_vector_bytes);
 
   // Values
   std::vector<llvm::Value*> vloads;
-  DataType table_type = buffer_type.with_lanes(table_elem_count);
+  PrimType table_type = buffer_type.WithLanes(table_elem_count);
 
   auto table_all =
       MakeValue(BufferLoad(buffer, {
@@ -347,7 +355,8 @@ llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_typ
 
   // The number of value vectors should be a power of 2.
   int table_vec_count = llvm::PowerOf2Ceil(GetVectorBytes(table_type) / native_vector_bytes);
-  int table_vec_length = native_vector_bytes / buffer_type.bytes();
+  // Slice length is counted in elements, so divide by the per-element byte size.
+  int table_vec_length = native_vector_bytes / ((buffer_type.bits() + 7) / 8);
   for (int i = 0; i != table_vec_count; ++i) {
     // CreateVecSlice will generate undefs for elements outside the source vector.
     vloads.push_back(CreateVecSlice(table_all, i * table_vec_length, table_vec_length));
diff --git a/src/backend/hexagon/codegen/llvm/intrin_rule_hexagon.cc b/src/backend/hexagon/codegen/llvm/intrin_rule_hexagon.cc
index 3e46e322a881..928df03f38aa 100644
--- a/src/backend/hexagon/codegen/llvm/intrin_rule_hexagon.cc
+++ b/src/backend/hexagon/codegen/llvm/intrin_rule_hexagon.cc
@@ -50,7 +50,7 @@ inline PrimExpr TVMExternCall(const tirx::CallNode* call, const std::string& fna
   for (PrimExpr arg : call->args) {
     new_args.push_back(arg);
   }
-  return tirx::Call(call->dtype, tirx::builtin::call_pure_extern(), new_args);
+  return tirx::Call(call->ty(), tirx::builtin::call_pure_extern(), new_args);
 }
 
 template <std::string& tvm_wrapper, unsigned id, int num_sign>
@@ -72,14 +72,16 @@ inline PrimExpr DispatchTVMQHLWrapperFp16(const PrimExpr& e) {
 
   // Enable QHL library for FP16 data type
   const PrimExpr& x = call->args[0];
-  if (x->dtype.is_float16() && x->dtype.is_vector() && useqhl) {
+  PrimType x_ty = x.ty();
+  if (x_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) &&
+      (x_ty.IsFixedLengthVector() || x_ty.IsScalableVector()) && useqhl) {
     return TVMExternCall(call, tvm_wrapper);
   }
 #endif
-  new_args.push_back(IntImm(DataType::UInt(32), id));
-  new_args.push_back(IntImm(DataType::UInt(32), num_sign));
+  new_args.push_back(IntImm(PrimType::UInt(32), id));
+  new_args.push_back(IntImm(PrimType::UInt(32), num_sign));
   new_args.insert(new_args.end(), call->args.begin(), call->args.end());
-  return tirx::Call(call->dtype, tirx::builtin::call_llvm_pure_intrin(), new_args);
+  return tirx::Call(call->ty(), tirx::builtin::call_llvm_pure_intrin(), new_args);
 }
 
 void RegisterHexagonIntrinRules() {
@@ -117,6 +119,7 @@ TVM_REGISTER_OP("tirx.tanh")
       const tirx::CallNode* call = e.as<tirx::CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       const PrimExpr& x = call->args[0];
+      PrimType x_ty = x.ty();
 
 #if ENABLE_QHL
       // Check target for qfloat enablement
@@ -130,14 +133,15 @@ TVM_REGISTER_OP("tirx.tanh")
       }
 
       // Enable QHL library for FP16 data type
-      if (x->dtype.is_float16() && x->dtype.is_vector() && useqhl) {
+      if (x_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) &&
+          (x_ty.IsFixedLengthVector() || x_ty.IsScalableVector()) && useqhl) {
         std::string tvm_wrapper("tvm_vect_qhmath_hvx_tanh_ahf");
         return TVMExternCall(call, tvm_wrapper);
       }
 #endif
-      PrimExpr one = tirx::MakeConst(x.dtype(), 1);
-      PrimExpr two = tirx::MakeConst(x.dtype(), 2);
-      PrimExpr neg_two = tirx::MakeConst(x.dtype(), -2);
+      PrimExpr one = tirx::MakeConst(x_ty, 1);
+      PrimExpr two = tirx::MakeConst(x_ty, 2);
+      PrimExpr neg_two = tirx::MakeConst(x_ty, -2);
 
       PrimExpr exp_neg2x = exp(neg_two * x);
       PrimExpr exp_pos2x = exp(two * x);
@@ -145,7 +149,7 @@ TVM_REGISTER_OP("tirx.tanh")
       PrimExpr tanh_pos = (one - exp_neg2x) / (one + exp_neg2x);
       PrimExpr tanh_neg = (exp_pos2x - one) / (exp_pos2x + one);
       // MakeConst can handle both vector and scalar types.
-      PrimExpr tanh_x = tirx::Select(x >= tirx::MakeConst(x.dtype(), 0), tanh_pos, tanh_neg);
+      PrimExpr tanh_x = tirx::Select(x >= tirx::MakeConst(x_ty, 0), tanh_pos, tanh_neg);
       return tanh_x;
     });
 
@@ -154,6 +158,7 @@ TVM_REGISTER_OP("tirx.tan")
       const tirx::CallNode* call = e.as<tirx::CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       const PrimExpr& x = call->args[0];
+      PrimType x_ty = x.ty();
 #if ENABLE_QHL
       // Check target for qfloat enablement
       const auto f = tvm::ffi::Function::GetGlobal("target.TargetCurrent");
@@ -166,7 +171,8 @@ TVM_REGISTER_OP("tirx.tan")
       }
 
       // Enable QHL library for FP16 data type
-      if (x->dtype.is_float16() && x->dtype.is_vector() && useqhl) {
+      if (x_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) &&
+          (x_ty.IsFixedLengthVector() || x_ty.IsScalableVector()) && useqhl) {
         std::string tvm_wrapper("tvm_vect_qhmath_hvx_tan_ahf");
         return TVMExternCall(call, tvm_wrapper);
       }
@@ -184,6 +190,7 @@ TVM_REGISTER_OP("tirx.sigmoid")
       const tirx::CallNode* call = e.as<tirx::CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       const PrimExpr& x = call->args[0];
+      PrimType x_ty = x.ty();
 #if ENABLE_QHL
       // Check target for qfloat enablement
       const auto f = tvm::ffi::Function::GetGlobal("target.TargetCurrent");
@@ -195,21 +202,22 @@ TVM_REGISTER_OP("tirx.sigmoid")
         useqhl = tstring.find("+hvx-qfloat") != std::string::npos;
       }
 
-      PrimExpr MinBound = tirx::MakeConst(x.dtype(), -8);
-      PrimExpr MaxBound = tirx::MakeConst(x.dtype(), 8);
+      PrimExpr MinBound = tirx::MakeConst(x_ty, -8);
+      PrimExpr MaxBound = tirx::MakeConst(x_ty, 8);
       const PrimExpr v1 = tirx::Max(x, MinBound);
       const PrimExpr v2 = tirx::Min(v1, MaxBound);
 
       ffi::Array<tvm::PrimExpr> new_args = {v2};
-      const tirx::Call new_call = tirx::Call(call->dtype, call->op, new_args);
+      const tirx::Call new_call = tirx::Call(call->ty(), call->op, new_args);
 
       // Enable QHL library for FP16 data type
-      if (x->dtype.is_float16() && x->dtype.is_vector() && useqhl) {
+      if (x_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) &&
+          (x_ty.IsFixedLengthVector() || x_ty.IsScalableVector()) && useqhl) {
         std::string tvm_wrapper("tvm_vect_qhmath_hvx_sigmoid_ahf");
         return TVMExternCall(new_call.get(), tvm_wrapper);
       }
 #endif
-      PrimExpr one = tirx::MakeConst(x.dtype(), 1);
+      PrimExpr one = tirx::MakeConst(x_ty, 1);
       return one / (one + exp(-x));
     });
 
diff --git a/src/backend/hexagon/runtime/ops/conv2d_fp16_hvx.cc b/src/backend/hexagon/runtime/ops/conv2d_fp16_hvx.cc
index d555fb77cfae..c063ae62b1bd 100644
--- a/src/backend/hexagon/runtime/ops/conv2d_fp16_hvx.cc
+++ b/src/backend/hexagon/runtime/ops/conv2d_fp16_hvx.cc
@@ -21,8 +21,8 @@
 #include <hexagon_types.h>
 #include <hvx_hexagon_protos.h>
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/runtime/base.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
 
 #include <algorithm>
@@ -469,7 +469,7 @@ int conv2d_packed_fp16(void*, TVMFFIAny* args, int num_args, TVMFFIAny* out_val)
   // Prepare zero_block
   int64_t block_nbytes = 2048;
   void* zero_block = device_api->AllocDataSpace(conv_utils::hexagon_device, 1, &block_nbytes,
-                                                tvm::runtime::DataType::UInt(8), vtcm_scope);
+                                                DLDataType{kDLUInt, 8, 1}, vtcm_scope);
   memset(zero_block, 0, 2048);
 
   // FIXME: Setting bias to zero_block: this works for up to 256 output channels.
diff --git a/src/backend/metal/codegen/codegen_metal.cc b/src/backend/metal/codegen/codegen_metal.cc
index 3f483f79aaed..e6ef1647e5bf 100644
--- a/src/backend/metal/codegen/codegen_metal.cc
+++ b/src/backend/metal/codegen/codegen_metal.cc
@@ -46,7 +46,7 @@ void CodeGenMetal::InitFuncState(const PrimFunc& f) {
   CodeGenC::InitFuncState(f);
   // analyze the data;
   for (Var arg : f->params) {
-    if (arg.dtype().is_handle()) {
+    if (arg.ty().IsHandle()) {
       alloc_storage_scope_[arg.get()] = "global";
     }
   }
@@ -97,7 +97,7 @@ void CodeGenMetal::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
   }
   for (size_t i = 0; i < func->params.size(); ++i, ++num_buffer) {
     Var v = func->params[i];
-    if (!v.dtype().is_handle()) break;
+    if (!v.ty().IsHandle()) break;
     this->stream << "  ";
     std::string vid = AllocVarID(v.get());
     auto it = alloc_storage_scope_.find(v.get());
@@ -126,24 +126,24 @@ void CodeGenMetal::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
     decl_stream << "struct " << arg_buf_type << " {\n";
     for (size_t i = num_buffer; i < func->params.size(); ++i) {
       Var v = func->params[i];
-      TVM_FFI_ICHECK(!v.dtype().is_handle());
+      TVM_FFI_ICHECK(!v.ty().IsHandle());
       std::string vid = AllocVarID(v.get());
       std::ostringstream vref;
-      if (v.dtype().bits() == 32) {
+      if (v.ty().bits() == 32) {
         decl_stream << "  ";
-        PrintType(v.dtype(), decl_stream);
+        PrintType(v.ty()->dtype, decl_stream);
         decl_stream << " " << vid << "[2];\n";
         vref << varg << "." << vid << "[0]";
-      } else if (v.dtype().bits() == 64) {
+      } else if (v.ty().bits() == 64) {
         decl_stream << "  ";
-        PrintType(v.dtype(), decl_stream);
+        PrintType(v.ty()->dtype, decl_stream);
         decl_stream << " " << vid << ";\n";
         vref << varg << "." << vid;
       } else {
         // For non 32bit type, ref through arg union.
         decl_stream << "  __TVMArgUnion " << vid << ";\n";
         vref << varg << "." << vid << ".v_";
-        PrintType(v.dtype(), vref);
+        PrintType(v.ty()->dtype, vref);
       }
       var_idmap_[v.get()] = vref.str();
     }
@@ -165,10 +165,14 @@ void CodeGenMetal::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
   if (work_dim != 0) {
     // use ushort by default for now
     stream << "  ";
-    PrintType(DataType::UInt(thread_index_bits_, work_dim), stream);
+    PrintType(DLDataType{kDLUInt, static_cast<uint8_t>(thread_index_bits_),
+                         static_cast<uint16_t>(work_dim)},
+              stream);
     stream << " blockIdx [[threadgroup_position_in_grid]],\n";
     stream << "  ";
-    PrintType(DataType::UInt(thread_index_bits_, work_dim), stream);
+    PrintType(DLDataType{kDLUInt, static_cast<uint8_t>(thread_index_bits_),
+                         static_cast<uint16_t>(work_dim)},
+              stream);
     stream << " threadIdx [[thread_position_in_threadgroup]]\n";
   }
   thread_work_dim_ = work_dim;
@@ -190,28 +194,29 @@ void CodeGenMetal::BindThreadIndex(const IterVar& iv) {
   if (thread_work_dim_ <= 1) {
     vname = vname.substr(0, iv->thread_tag.length() - 2);
   }
-  var_idmap_[iv->var.get()] =
-      CastFromTo(vname, DataType::UInt(thread_index_bits_), iv->var.dtype());
+  var_idmap_[iv->var.get()] = CastFromTo(
+      vname, DLDataType{kDLUInt, static_cast<uint8_t>(thread_index_bits_), 1}, iv->var.ty()->dtype);
 }
 
-void CodeGenMetal::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenMetal::PrintType(DLDataType raw_t, std::ostream& os) {  // NOLINT(*)
+  PrimType t(raw_t);
   int lanes = t.lanes();
-  if (t.is_handle()) {
+  if (t.IsHandle()) {
     TVM_FFI_ICHECK_EQ(lanes, 1) << "do not yet support vector types";
     os << "void*";
     return;
   }
 
-  if (t.is_void()) {
+  if (t.IsVoid()) {
     os << "void";
     return;
   }
-  if (t == DataType::Bool()) {
+  if (raw_t == DLDataType{kDLBool, 8, 1}) {
     os << "bool";
     return;
   }
   bool fail = false;
-  if (t.is_float()) {
+  if (t.code() == DLDataTypeCode::kDLFloat) {
     // Need to care about sizes and alignment of half3/float3 because tirx representation might not
     // be aware of Metal half3/float3 details and can treat them as just three elements,
     // while sizes and alignmnents of half3/float3 are one element more (half3-8 bytes/
@@ -239,8 +244,8 @@ void CodeGenMetal::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       os << lanes;
       return;
     }
-  } else if (t.is_uint() || t.is_int()) {
-    if (t.is_uint()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)) {
+    if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << 'u';
     }
     switch (t.bits()) {
@@ -268,11 +273,12 @@ void CodeGenMetal::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       os << lanes;
       return;
     }
-  } else if (t.is_bfloat16()) {
+  } else if (t.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     os << "bfloat";
     return;
   }
-  TVM_FFI_THROW(InternalError) << "Cannot convert type " << t << " to Metal type";
+  TVM_FFI_THROW(InternalError) << "Cannot convert type " << ffi::DLDataTypeToString(raw_t)
+                               << " to Metal type";
 }
 
 void CodeGenMetal::PrintStorageSync(const CallNode* op) {
@@ -288,12 +294,12 @@ void CodeGenMetal::PrintStorageSync(const CallNode* op) {
   }
 }
 
-void CodeGenMetal::PrintVecElemLoad(const std::string& vec, DataType t, int i,
+void CodeGenMetal::PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                                     std::ostream& os) {  // NOLINT(*)
   os << vec << "[" << i << "]";
 }
 
-void CodeGenMetal::PrintVecElemStore(const std::string& vec, DataType t, int i,
+void CodeGenMetal::PrintVecElemStore(const std::string& vec, DLDataType t, int i,
                                      const std::string& value) {
   this->PrintIndent();
   stream << vec << "[" << i << "]"
@@ -328,11 +334,14 @@ void CodeGenMetal::VisitStmt_(const AllocBufferNode* op) {
 
   auto scope = GetPtrStorageScope(op->buffer->data);
   alloc_storage_scope_[op->buffer->data.get()] = scope;
-  DataType dtype = op->buffer->dtype;
+  DLDataType dtype = op->buffer->dtype->dtype;
   if (scope == "metal.simdgroup") {
-    TVM_FFI_ICHECK(dtype == DataType::Float(16) || dtype == DataType::Float(32) ||
-                   dtype == DataType::BFloat(16))
-        << "Only float16, float32, and bfloat16 are supported, but got " << dtype;
+    bool supported_simdgroup_dtype = dtype == DLDataType{kDLFloat, 16, 1} ||
+                                     dtype == DLDataType{kDLFloat, 32, 1} ||
+                                     dtype == DLDataType{kDLBfloat, 16, 1};
+    TVM_FFI_ICHECK(supported_simdgroup_dtype)
+        << "Only float16, float32, and bfloat16 are supported, but got "
+        << ffi::DLDataTypeToString(dtype);
     TVM_FFI_ICHECK(constant_size % 64 == 0)
         << "Only 8x8 matrix is supported, but got " << constant_size << " bytes\n";
 
@@ -360,8 +369,8 @@ void CodeGenMetal::VisitExpr_(const SelectNode* op, std::ostream& os) {  // NOLI
 
 void CodeGenMetal::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NOLINT(*)
   std::string v = PrintExpr(op->value);
-  int lanes = op->dtype.lanes();
-  PrintType(op->dtype, os);
+  int lanes = op->ty().lanes();
+  PrintType(op->ty()->dtype, os);
   os << "(";
   for (int i = 0; i < lanes; ++i) {
     if (i != 0) os << ", ";
@@ -422,7 +431,7 @@ void CodeGenMetal::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT
   } else if (op->op.same_as(builtin::reinterpret())) {
     // generate as_type<TYPE>(ARG)
     os << "(as_type<";
-    this->PrintType(op->dtype, os);
+    this->PrintType(op->ty()->dtype, os);
     os << ">(";
     this->PrintExpr(op->args[0], os);
     os << "))";
@@ -442,9 +451,9 @@ void CodeGenMetal::VisitExpr_(const FloatImmNode* op, std::ostream& os) {  // NO
     temp << "NAN";
   } else {
     temp << std::scientific << op->value;
-    if (op->dtype.bits() == 32)
+    if (op->ty().bits() == 32)
       temp << 'f';
-    else if (op->dtype.bits() == 16)
+    else if (op->ty().bits() == 16)
       temp << 'h';
   }
   MarkConst(temp.str());
diff --git a/src/backend/metal/codegen/codegen_metal.h b/src/backend/metal/codegen/codegen_metal.h
index b92608aecfa1..ffa9a321aa43 100644
--- a/src/backend/metal/codegen/codegen_metal.h
+++ b/src/backend/metal/codegen/codegen_metal.h
@@ -43,13 +43,14 @@ class CodeGenMetal final : public CodeGenC {
   void InitFuncState(const PrimFunc& f) final;
   void PrintStorageScope(const std::string& scope, std::ostream& os) final;  // NOLINT(*)
   void PrintStorageSync(const CallNode* op) final;                           // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;                        // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;                      // NOLINT(*)
   void BindThreadIndex(const IterVar& iv) final;                             // NOLINT(*)
   // print load of single element
-  void PrintVecElemLoad(const std::string& vec, DataType t, int i,
+  void PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                         std::ostream& os) final;  // NOLINT(*)
   // print store of single element.
-  void PrintVecElemStore(const std::string& vec, DataType t, int i, const std::string& value) final;
+  void PrintVecElemStore(const std::string& vec, DLDataType t, int i,
+                         const std::string& value) final;
   // overload visitor
   void VisitStmt_(const AllocBufferNode* op) final;                  // NOLINT(*)
   void VisitExpr_(const SelectNode* op, std::ostream& os) final;     // NOLINT(*)
diff --git a/src/backend/metal/codegen/intrin_rule_metal.cc b/src/backend/metal/codegen/intrin_rule_metal.cc
index c807ac4c2e8a..999fe526f04e 100644
--- a/src/backend/metal/codegen/intrin_rule_metal.cc
+++ b/src/backend/metal/codegen/intrin_rule_metal.cc
@@ -31,7 +31,7 @@ namespace intrin {
 using tirx::FLowerIntrinsic;
 
 struct MetalWarpIntrinsic {
-  const Op operator()(DataType t, const Op& orig_op) const {
+  const Op operator()(PrimType t, const Op& orig_op) const {
     if (orig_op.same_as(builtin::tvm_warp_shuffle())) {
       static const Op& metal_simd_shuffle_op = Op::Get("tirx.metal.simd_shuffle");
       return metal_simd_shuffle_op;
@@ -52,7 +52,7 @@ static PrimExpr DispatchMetalShuffle(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   TVM_FFI_ICHECK_EQ(call->args.size(), 5);  // mask, value, warp_id, width, warp_size
   ffi::Array<PrimExpr> metal_args{{call->args[1], call->args[2]}};
-  return Call(call->dtype, T()(call->dtype, call->op.as_or_throw<Op>()), metal_args);
+  return Call(e.ty(), T()(e.ty(), call->op.as_or_throw<Op>()), metal_args);
 }
 
 void RegisterMetalIntrinRules() {
@@ -81,7 +81,7 @@ TVM_REGISTER_OP("tirx.round")
       for (auto arg : call->args) {
         new_args.push_back(arg);
       }
-      return tirx::Call(call->dtype, tirx::builtin::call_pure_extern(), new_args);
+      return tirx::Call(e.ty(), tirx::builtin::call_pure_extern(), new_args);
     });
 
 TVM_REGISTER_OP("tirx.nearbyint")
diff --git a/src/backend/opencl/codegen/codegen_opencl.cc b/src/backend/opencl/codegen/codegen_opencl.cc
index 51719785195b..001d4a33b081 100644
--- a/src/backend/opencl/codegen/codegen_opencl.cc
+++ b/src/backend/opencl/codegen/codegen_opencl.cc
@@ -84,7 +84,7 @@ void CodeGenOpenCL::InitFuncState(const PrimFunc& f) {
       // Storage scope qualifiers for textures are inferred
       // and set prior to function codegen.
       continue;
-    } else if (arg.dtype().is_handle()) {
+    } else if (arg.ty().IsHandle()) {
       alloc_storage_scope_[arg.get()] = "global";
     }
   }
@@ -189,26 +189,27 @@ void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) {
   } else {
     os << "get_group_id(" << ts.dim_index << ")";
   }
-  var_idmap_[iv->var.get()] = CastFromTo(os.str(), DataType::UInt(64), iv->var.dtype());
+  var_idmap_[iv->var.get()] = CastFromTo(os.str(), DLDataType{kDLUInt, 64, 1}, iv->var.ty()->dtype);
 }
 
-void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenOpenCL::PrintType(DLDataType raw_t, std::ostream& os) {  // NOLINT(*)
+  PrimType t(raw_t);
   int lanes = t.lanes();
-  if (t.is_handle()) {
+  if (t.IsHandle()) {
     TVM_FFI_ICHECK_EQ(lanes, 1) << "do not yet support vector types";
     os << "void*";
     return;
   }
-  if (t.is_void()) {
+  if (t.IsVoid()) {
     os << "void";
     return;
   }
-  if (t == DataType::Bool()) {
+  if (raw_t == DLDataType{kDLBool, 8, 1}) {
     os << "bool";
     return;
   }
   bool fail = false;
-  if (t.is_float()) {
+  if (t.code() == DLDataTypeCode::kDLFloat) {
     switch (t.bits()) {
       case 16:
         os << "half";
@@ -230,14 +231,14 @@ void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       os << lanes;
       return;
     }
-  } else if (t.is_bool()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLBool)) {
     os << "uint";
     if (!fail && ((lanes >= 2 && lanes <= 4) || lanes == 8 || lanes == 16)) {
       os << lanes;
       return;
     }
-  } else if (t.is_uint() || t.is_int()) {
-    if (t.is_uint()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)) {
+    if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << 'u';
     }
     switch (t.bits()) {
@@ -266,7 +267,8 @@ void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       return;
     }
   }
-  TVM_FFI_THROW(InternalError) << "Cannot convert type " << t << " to OpenCL type";
+  TVM_FFI_THROW(InternalError) << "Cannot convert type " << ffi::DLDataTypeToString(raw_t)
+                               << " to OpenCL type";
 }
 
 void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) {  // NOLINT(*)
@@ -286,41 +288,44 @@ void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) {  // NOLINT(*
   }
 }
 
-void CodeGenOpenCL::PrintVecAddr(const BufferNode* buffer, DataType t, PrimExpr base,
+void CodeGenOpenCL::PrintVecAddr(const BufferNode* buffer, DLDataType t, PrimExpr base,
                                  std::ostream& os) {  // NOLINT(*)
   const VarNode* buffer_var = buffer->data.get();
-  if (!HandleTypeMatch(buffer_var, t.element_of())) {
+  DLDataType elem_type{t.code, t.bits, 1};
+  if (!HandleTypeMatch(buffer_var, elem_type)) {
     os << '(';
     auto it = alloc_storage_scope_.find(buffer_var);
     if (it != alloc_storage_scope_.end()) {
       PrintStorageScope(it->second, os);
     }
-    PrintType(t.element_of(), os);
+    PrintType(elem_type, os);
     os << "*)";
   }
   os << GetVarID(buffer_var) << " + ";
   PrintExpr(base, os);
 }
-std::string CodeGenOpenCL::GetVecLoad(DataType t, const BufferNode* buffer, PrimExpr base) {
+std::string CodeGenOpenCL::GetVecLoad(DLDataType t, const BufferNode* buffer, PrimExpr base) {
   std::ostringstream os;
-  os << "vload" << t.lanes() << "(0, ";
+  os << "vload" << PrimType(t).lanes() << "(0, ";
   PrintVecAddr(buffer, t, base, os);
   os << ")";
   return os.str();
 }
 
-void CodeGenOpenCL::PrintVecStore(const BufferNode* buffer, DataType t, PrimExpr base,
+void CodeGenOpenCL::PrintVecStore(const BufferNode* buffer, DLDataType t, PrimExpr base,
                                   const std::string& value) {
   this->PrintIndent();
-  stream << "vstore" << t.lanes() << "(" << value << ", 0, ";
+  stream << "vstore" << PrimType(t).lanes() << "(" << value << ", 0, ";
   PrintVecAddr(buffer, t, base, stream);
   stream << ");\n";
 }
 
-void CodeGenOpenCL::PrintVecElemLoadExpr(DataType t, int i, const std::string& value,
+void CodeGenOpenCL::PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value,
                                          std::ostream& os) {  // NOLINT(*)
-  TVM_FFI_ICHECK_GT(t.lanes(), 1);
-  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
+  PrimType t_ty(t);
+  int lanes = t_ty.lanes();
+  TVM_FFI_ICHECK_GT(lanes, 1);
+  if (t.bits == 8 && (t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))) {
     if (i != 0) {
       os << "|";
     }
@@ -334,7 +339,7 @@ void CodeGenOpenCL::PrintVecElemLoadExpr(DataType t, int i, const std::string& v
     os << ")(";
   }
   os << value;
-  if (i != t.lanes() - 1) {
+  if (i != lanes - 1) {
     os << ",";
   } else {
     os << "))";
@@ -376,14 +381,14 @@ void CodeGenOpenCL::PrintRestrict(const Var& v, std::ostream& os) {
   }
 }
 
-std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType target) {
+std::string CodeGenOpenCL::CastFromTo(std::string value, DLDataType from, DLDataType target) {
   if (from == target) return value;
   return CastTo(value, target);
 }
 
-std::string CodeGenOpenCL::CastTo(std::string value, DataType target) {
+std::string CodeGenOpenCL::CastTo(std::string value, DLDataType target) {
   std::ostringstream os;
-  if (target == DataType::Bool()) {
+  if (target == DLDataType{kDLBool, 8, 1}) {
     os << "(";
     os << "(";
     this->PrintType(target, os);
@@ -422,7 +427,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
     if (it != alloc_storage_scope_.end()) {
       PrintStorageScope(it->second, os);
     }
-    this->PrintType(load->dtype.element_of(), os);
+    this->PrintType(DLDataType{load->ty()->dtype.code, load->ty()->dtype.bits, 1}, os);
     os << " *)" << this->GetVarID(load->buffer->data.get()) << " + ";
     this->PrintExpr(load->indices[0], os);
     os << ')';
@@ -434,13 +439,14 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
     const int channel_size = op->args[4].as_or_throw<IntImm>()->value;
     TVM_FFI_ICHECK(channel_size == 64 || channel_size == 128)
         << "Unsupported Channel Size: " << channel_size;
-    DataType channel_type = runtime::GetChannelType(channel_size);
+    DLDataType channel_type = runtime::GetChannelType(channel_size);
 
-    DataType buffer_type = ptr_type->element_type.as<PrimTypeNode>()->dtype;
+    DLDataType buffer_type = ptr_type->element_type.as<PrimTypeNode>()->dtype;
     std::stringstream ss;
     this->PrintExpr(op->args[5], ss);
     std::string value;
-    value = this->SSAGetID(ss.str(), buffer_type.with_lanes(channel_size / buffer_type.bits()));
+    value = this->SSAGetID(ss.str(),
+                           PrimType(buffer_type).WithLanes(channel_size / buffer_type.bits)->dtype);
     if (channel_size == 64) {
       os << "write_imageh(";
     } else if (channel_size == 128) {
@@ -467,11 +473,11 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
     enable_compliant_texture_reads_ = true;
     std::stringstream ss;
     const int channel_size = op->args[4].as_or_throw<IntImm>()->value;
-    const int data_lanes = channel_size / op->dtype.bits();
+    const int data_lanes = channel_size / op->ty().bits();
     TVM_FFI_ICHECK(channel_size == 64 || channel_size == 128)
         << "Unsupported Channel Size: " << channel_size;
     ss << "as_";
-    this->PrintType(op->dtype.with_lanes(data_lanes), ss);
+    this->PrintType(op->ty().WithLanes(data_lanes)->dtype, ss);
     ss << "(";
     if (channel_size == 64) {
       ss << "READ_IMAGEH(";
@@ -493,7 +499,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
     this->PrintExpr(IntImm::Int32(0), ss);
     ss << "))))";
 
-    std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(data_lanes));
+    std::string rhs = SSAGetID(ss.str(), op->ty().WithLanes(data_lanes)->dtype);
     if (auto ramp = op->args.back().as<RampNode>()) {
       if (ramp->base.as<IntImmNode>() && *tirx::as_const_int(ramp->base) == 0 &&
           *tirx::as_const_int(ramp->lanes) == data_lanes &&
@@ -501,10 +507,10 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
         os << rhs;
       } else if (*tirx::as_const_int(ramp->stride) == 1) {
         os << "(*(";
-        this->PrintType(op->dtype.with_lanes(*tirx::as_const_int(ramp->lanes)), os);
+        this->PrintType(op->ty().WithLanes(*tirx::as_const_int(ramp->lanes))->dtype, os);
         os << "*)";
         os << "((";
-        this->PrintType(op->dtype.with_lanes(1), os);
+        this->PrintType(op->ty().WithLanes(1)->dtype, os);
         os << "*)&" << rhs << " + ";
         this->PrintExpr(ramp->base, os);
         os << "))";
@@ -513,7 +519,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
       }
     } else {
       os << "((";
-      this->PrintType(op->dtype.with_lanes(1), os);
+      this->PrintType(op->ty().WithLanes(1)->dtype, os);
       os << "*)&" << rhs << ")[";
       this->PrintExpr(op->args.back(), os);
       os << "]";
@@ -521,7 +527,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
   } else if (op->op.same_as(builtin_call_extern_) || op->op.same_as(builtin_call_pure_extern_)) {
     auto func = op->args[0].as_or_throw<StringImm>();
     // Enable atomics extension if used.
-    if (func->value == "atomic_add" && op->dtype.is_float()) {
+    if (func->value == "atomic_add" && op->ty().code() == DLDataTypeCode::kDLFloat) {
       enable_atomics_ = true;
       this->PrintCallExtern(GetType(ffi::GetRef<PrimExpr>(op)), "atomic_add_float_emu", op->args,
                             true, os);
@@ -540,9 +546,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
 
 void CodeGenOpenCL::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NOLINT(*)
   std::string v = PrintExpr(op->value);
-  int lanes = op->dtype.lanes();
+  int lanes = op->ty().lanes();
   os << "((";
-  PrintType(op->dtype, os);
+  PrintType(op->ty()->dtype, os);
   os << ")(";
   for (int i = 0; i < lanes; ++i) {
     if (i != 0) os << ", ";
@@ -553,9 +559,9 @@ void CodeGenOpenCL::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  //
 
 void CodeGenOpenCL::VisitExpr_(const RampNode* op, std::ostream& os) {  // NOLINT(*)
   os << "((";
-  PrintType(op->dtype, os);
+  PrintType(op->ty()->dtype, os);
   os << ")(";
-  int lanes = op->dtype.lanes();
+  int lanes = op->ty().lanes();
   for (int i = 0; i < lanes; i++) {
     os << "(" << PrintExpr(op->base) << ")"
        << "+(" << PrintExpr(op->stride) << "*" << i << ")";
@@ -579,18 +585,18 @@ void CodeGenOpenCL::VisitExpr_(const FloatImmNode* op, std::ostream& os) {  // N
 
 template <typename T>
 inline void PrintBinaryExpr(const T* op, const char* opstr, std::ostream& os, CodeGenOpenCL* p) {
-  if (op->dtype.lanes() == 1) {
+  if (op->ty().lanes() == 1) {
     os << opstr << "((";
-    p->PrintType(op->a->dtype, os);
+    p->PrintType(op->a.ty()->dtype, os);
     os << ")";
     p->PrintExpr(op->a, os);
     os << ", (";
-    p->PrintType(op->b->dtype, os);
+    p->PrintType(op->b.ty()->dtype, os);
     os << ")";
     p->PrintExpr(op->b, os);
     os << ')';
   } else {
-    p->PrintVecBinaryOp(opstr, op->dtype, op->a, op->b, os);
+    p->PrintVecBinaryOp(opstr, op->ty()->dtype, op->a, op->b, os);
   }
 }
 
@@ -604,14 +610,16 @@ void CodeGenOpenCL::VisitExpr_(const MaxNode* op, std::ostream& os) {
 
 void CodeGenOpenCL::VisitExpr_(const ModNode* op, std::ostream& os) {  // NOLINT(*)
   std::string opstr;
-  if (op->dtype.is_int() || op->dtype.is_uint()) {
+  PrimType op_ty = op->ty();
+  if (op_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     opstr = "%";
   } else {
-    TVM_FFI_ICHECK(op->dtype.is_float())
-        << "Expected floating point or integer dtype in Mod, but got " << op->dtype;
+    TVM_FFI_ICHECK(op_ty.code() == DLDataTypeCode::kDLFloat)
+        << "Expected floating point or integer dtype in Mod, but got "
+        << ffi::DLDataTypeToString(op_ty->dtype);
     opstr = "fmod";
   }
-  if (op->dtype.lanes() == 1) {
+  if (op_ty.lanes() == 1) {
     if (isalpha(opstr.c_str()[0])) {
       os << opstr.c_str() << '(';
       this->PrintExpr(op->a, os);
@@ -626,7 +634,7 @@ void CodeGenOpenCL::VisitExpr_(const ModNode* op, std::ostream& os) {  // NOLINT
       os << ')';
     }
   } else {
-    this->PrintVecBinaryOp(opstr.c_str(), op->dtype, op->a, op->b, os);
+    this->PrintVecBinaryOp(opstr.c_str(), op_ty->dtype, op->a, op->b, os);
   }
 }
 
@@ -634,11 +642,11 @@ void CodeGenOpenCL::VisitExpr_(const AndNode* op, std::ostream& os) {
   std::ostringstream oss;
   os << "(";
   this->PrintExpr(op->a, oss);
-  os << CastTo(oss.str(), op->dtype);
+  os << CastTo(oss.str(), op->ty()->dtype);
   oss.str("");
   os << " && ";
   this->PrintExpr(op->b, oss);
-  os << CastTo(oss.str(), op->dtype);
+  os << CastTo(oss.str(), op->ty()->dtype);
   os << ")";
 }
 
@@ -646,11 +654,11 @@ void CodeGenOpenCL::VisitExpr_(const OrNode* op, std::ostream& os) {
   std::ostringstream oss;
   os << "(";
   this->PrintExpr(op->a, oss);
-  os << CastTo(oss.str(), op->dtype);
+  os << CastTo(oss.str(), op->ty()->dtype);
   oss.str("");
   os << " || ";
   this->PrintExpr(op->b, oss);
-  os << CastTo(oss.str(), op->dtype);
+  os << CastTo(oss.str(), op->ty()->dtype);
   os << ")";
 }
 
@@ -658,18 +666,19 @@ void CodeGenOpenCL::VisitExpr_(const SelectNode* op, std::ostream& os) {
   std::ostringstream oss;
   os << "select(";
   PrintExpr(op->false_value, oss);
-  os << CastFromTo(oss.str(), op->false_value.dtype(), op->dtype);
+  os << CastFromTo(oss.str(), op->false_value.ty()->dtype, op->ty()->dtype);
   oss.str("");
   os << ", ";
   PrintExpr(op->true_value, oss);
-  os << CastFromTo(oss.str(), op->true_value.dtype(), op->dtype);
+  os << CastFromTo(oss.str(), op->true_value.ty()->dtype, op->ty()->dtype);
   oss.str("");
   os << ", ";
   PrintExpr(op->condition, oss);
-  if (op->dtype.is_float()) {
-    os << CastTo(oss.str(), DataType::Int(op->dtype.bits(), op->dtype.lanes()));
+  if (op->ty().code() == DLDataTypeCode::kDLFloat) {
+    os << CastTo(oss.str(), DLDataType{kDLInt, static_cast<uint8_t>(op->ty().bits()),
+                                       static_cast<uint16_t>(op->ty().lanes())});
   } else {
-    os << CastFromTo(oss.str(), op->condition.dtype(), op->dtype);
+    os << CastFromTo(oss.str(), op->condition.ty()->dtype, op->ty()->dtype);
   }
   os << ")";
 }
diff --git a/src/backend/opencl/codegen/codegen_opencl.h b/src/backend/opencl/codegen/codegen_opencl.h
index d588a18c2029..47667e30663a 100644
--- a/src/backend/opencl/codegen/codegen_opencl.h
+++ b/src/backend/opencl/codegen/codegen_opencl.h
@@ -46,20 +46,20 @@ class CodeGenOpenCL final : public CodeGenC {
   void BindThreadIndex(const IterVar& iv) final;                             // NOLINT(*)
   void PrintStorageScope(const std::string& scope, std::ostream& os) final;  // NOLINT(*)
   void PrintStorageSync(const CallNode* op) final;                           // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;                        // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;                      // NOLINT(*)
   void PrintType(const Type& type, std::ostream& os) final;                  // NOLINT(*)
-  std::string GetVecLoad(DataType t, const BufferNode* buffer, PrimExpr base) final;
-  void PrintVecStore(const BufferNode* buffer, DataType t, PrimExpr base,
+  std::string GetVecLoad(DLDataType t, const BufferNode* buffer, PrimExpr base) final;
+  void PrintVecStore(const BufferNode* buffer, DLDataType t, PrimExpr base,
                      const std::string& value) final;  // NOLINT(*)
-  void PrintVecElemLoadExpr(DataType t, int i, const std::string& value,
+  void PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value,
                             std::ostream& os) final;  // NOLINT(*)
   // the address of load/store
-  void PrintVecAddr(const BufferNode* buffer, DataType t, PrimExpr base,
-                    std::ostream& os);                                           // NOLINT(*)
-  void PrintRestrict(const Var& v, std::ostream& os) final;                      // NOLINT(*)
-  std::string CastFromTo(std::string value, DataType from, DataType target);     // NOLINT(*)
-  std::string CastTo(std::string value, DataType target);                        // NOLINT(*)
-  void SetTextureScope(const std::unordered_map<const VarNode*, std::string>&);  // NOLINT(*)
+  void PrintVecAddr(const BufferNode* buffer, DLDataType t, PrimExpr base,
+                    std::ostream& os);                                            // NOLINT(*)
+  void PrintRestrict(const Var& v, std::ostream& os) final;                       // NOLINT(*)
+  std::string CastFromTo(std::string value, DLDataType from, DLDataType target);  // NOLINT(*)
+  std::string CastTo(std::string value, DLDataType target);                       // NOLINT(*)
+  void SetTextureScope(const std::unordered_map<const VarNode*, std::string>&);   // NOLINT(*)
 
   // overload visitor
   void VisitStmt_(const AllocBufferNode* op) final;                  // NOLINT(*)
diff --git a/src/backend/opencl/codegen/intrin_rule_opencl.cc b/src/backend/opencl/codegen/intrin_rule_opencl.cc
index f0f58be84d10..669fd1863b39 100644
--- a/src/backend/opencl/codegen/intrin_rule_opencl.cc
+++ b/src/backend/opencl/codegen/intrin_rule_opencl.cc
@@ -42,7 +42,7 @@ static PrimExpr DispatchIntelShuffle(const PrimExpr& e) {
       << "Intel warp shuffle dose not support width != warp_size";
   ffi::Array<PrimExpr> opencl_args{
       {StringImm("intel_sub_group_shuffle"), call->args[1], call->args[2]}};
-  return Call(call->dtype, builtin::call_pure_extern(), opencl_args);
+  return Call(e.ty(), builtin::call_pure_extern(), opencl_args);
 }
 
 void RegisterOpenCLIntrinRules() {
@@ -75,7 +75,7 @@ TVM_REGISTER_OP("tirx.round")
       for (auto arg : call->args) {
         new_args.push_back(arg);
       }
-      return tirx::Call(call->dtype, tirx::builtin::call_pure_extern(), new_args);
+      return tirx::Call(e.ty(), tirx::builtin::call_pure_extern(), new_args);
     });
 
 TVM_REGISTER_OP("tirx.nearbyint")
diff --git a/src/backend/opencl/runtime/opencl_common.h b/src/backend/opencl/runtime/opencl_common.h
index 3b99fa166def..4fc7ce85e383 100644
--- a/src/backend/opencl/runtime/opencl_common.h
+++ b/src/backend/opencl/runtime/opencl_common.h
@@ -186,24 +186,25 @@ inline const char* CLGetErrorString(cl_int error) {
 }
 
 inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) {
-  DataType dtype(data_type);
-  dtype = dtype.with_lanes(1);
+  DLDataType dtype = data_type;
+  // OpenCL image channel type depends on the scalar element type, not vector lanes.
+  dtype.lanes = 1;
 
-  if (dtype == DataType::Float(32)) {
+  if (dtype == DLDataType{kDLFloat, 32, 1}) {
     return CL_FLOAT;
-  } else if (dtype == DataType::Float(16)) {
+  } else if (dtype == DLDataType{kDLFloat, 16, 1}) {
     return CL_HALF_FLOAT;
-  } else if (dtype == DataType::Int(8)) {
+  } else if (dtype == DLDataType{kDLInt, 8, 1}) {
     return CL_SIGNED_INT8;
-  } else if (dtype == DataType::Int(16)) {
+  } else if (dtype == DLDataType{kDLInt, 16, 1}) {
     return CL_SIGNED_INT16;
-  } else if (dtype == DataType::Int(32)) {
+  } else if (dtype == DLDataType{kDLInt, 32, 1}) {
     return CL_SIGNED_INT32;
-  } else if (dtype == DataType::UInt(8)) {
+  } else if (dtype == DLDataType{kDLUInt, 8, 1}) {
     return CL_UNSIGNED_INT8;
-  } else if (dtype == DataType::UInt(16)) {
+  } else if (dtype == DLDataType{kDLUInt, 16, 1}) {
     return CL_UNSIGNED_INT16;
-  } else if (dtype == DataType::UInt(32)) {
+  } else if (dtype == DLDataType{kDLUInt, 32, 1}) {
     return CL_UNSIGNED_INT32;
   }
   TVM_FFI_THROW(InternalError) << "data type is not supported in OpenCL runtime yet: " << dtype;
diff --git a/src/backend/opencl/runtime/opencl_device_api.cc b/src/backend/opencl/runtime/opencl_device_api.cc
index eeb8e95ad543..0b53a1915192 100644
--- a/src/backend/opencl/runtime/opencl_device_api.cc
+++ b/src/backend/opencl/runtime/opencl_device_api.cc
@@ -779,14 +779,12 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                     int64_t height = shape[1];
                     int64_t depth = shape[2];
                     int64_t channel_size = args[7].cast<int64_t>();
-                    DataType channel_type = GetChannelType(channel_size);
+                    DLDataType channel_type = GetChannelType(channel_size);
                     Device dev;
                     dev.device_type = static_cast<DLDeviceType>(device_type);
                     dev.device_id = device_id;
                     DLDataType type_hint;
-                    type_hint.code = channel_type.code();
-                    type_hint.bits = channel_type.bits();
-                    type_hint.lanes = channel_type.lanes();
+                    type_hint = channel_type;
 
                     *rv = OpenCLWorkspace::Global()->AllocDataSpace(
                         dev, static_cast<size_t>(width), static_cast<size_t>(height),
diff --git a/src/backend/opencl/runtime/texture.h b/src/backend/opencl/runtime/texture.h
index a8711805cbfa..3aa2d3681142 100644
--- a/src/backend/opencl/runtime/texture.h
+++ b/src/backend/opencl/runtime/texture.h
@@ -120,15 +120,13 @@ size_t GetTextureMemorySize(T shape, int bits, int lanes, std::string mem_scope,
 /*!
  * \brief Returns the standard channel datatype for any given type.
  * \param channel_size The Number of bits in a Channel
- * \return DataType to be used in the codegen.
+ * \return DLDataType to be used in the codegen.
  */
-inline DataType GetChannelType(size_t channel_size) {
-  DataType channel_type;
-
+inline DLDataType GetChannelType(size_t channel_size) {
   if (channel_size == 128)
-    return DataType::Float(32, 4);
+    return DLDataType{kDLFloat, 32, 4};
   else if (channel_size == 64)
-    return DataType::Float(16, 4);
+    return DLDataType{kDLFloat, 16, 4};
 
   TVM_FFI_THROW(InternalError) << "Unsupported Channel Size: " << channel_size;
 }
diff --git a/src/backend/rocm/codegen/llvm/codegen_amdgpu.cc b/src/backend/rocm/codegen/llvm/codegen_amdgpu.cc
index 22ce75cddade..6f70343f46a4 100644
--- a/src/backend/rocm/codegen/llvm/codegen_amdgpu.cc
+++ b/src/backend/rocm/codegen/llvm/codegen_amdgpu.cc
@@ -100,7 +100,7 @@ class CodeGenAMDGPU : public CodeGenLLVM {
     llvm::Value* buf = nullptr;
     StorageInfo& info = alloc_storage_info_[op->buffer->data.get()];
     auto storage_scope = runtime::StorageScope::Create(GetPtrStorageScope(op->buffer->data));
-    DataType dtype = op->buffer->dtype;
+    PrimType dtype = op->buffer->dtype;
 
     if (storage_scope.rank == runtime::StorageRank::kShared && storage_scope.tag == ".dyn") {
       LOG(WARNING) << "Dynamic shared memory support for rocm is experimental.";
@@ -188,7 +188,7 @@ class CodeGenAMDGPU : public CodeGenLLVM {
     llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), intrin_id);
 #endif
     llvm::Value* result = builder_->CreateCall(f, {});
-    return this->CreateCast(DataType::Int(32), iv->var->dtype, result);
+    return this->CreateCast(PrimType::Int(32), iv->var.ty(), result);
   }
 
   llvm::Value* CreateStorageSync(const CallNode* op) final {
@@ -220,10 +220,11 @@ class CodeGenAMDGPU : public CodeGenLLVM {
 
   llvm::Value* CreateIntrinsic(const CallNode* op) final {
     if (op->op.same_as(builtin::atomic_add())) {
-      TVM_FFI_ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now";
+      PrimType value_ty = op->args[1].ty();
+      TVM_FFI_ICHECK(value_ty.bits() == 32) << "Only supports 32 bit atomic for now";
       llvm::Value* v0 = MakeValue(op->args[0]);
       llvm::Value* v1 = MakeValue(op->args[1]);
-      if (op->args[1]->dtype.is_float()) {
+      if (value_ty.MatchesCode(DLDataTypeCode::kDLFloat)) {
         return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, llvm::MaybeAlign(),
                                          llvm::AtomicOrdering::Monotonic);
       }
diff --git a/src/backend/rocm/codegen/llvm/intrin_rule_rocm.cc b/src/backend/rocm/codegen/llvm/intrin_rule_rocm.cc
index 4859fd5f4a24..db0f113b9c8b 100644
--- a/src/backend/rocm/codegen/llvm/intrin_rule_rocm.cc
+++ b/src/backend/rocm/codegen/llvm/intrin_rule_rocm.cc
@@ -50,14 +50,14 @@ inline PrimExpr DispatchPureExternOCML(const PrimExpr& e) {
   TVM_FFI_ICHECK_EQ(name.substr(0, 5), "tirx.");
 
   std::ostringstream intrinsic_name;
-  intrinsic_name << "__ocml_" << name.substr(5) << "_f" << call->dtype.bits();
+  intrinsic_name << "__ocml_" << name.substr(5) << "_f" << call->ty().bits();
 
   ffi::Array<PrimExpr> new_args = {StringImm(intrinsic_name.str())};
   for (auto arg : call->args) {
     new_args.push_back(arg);
   }
 
-  return Call(call->dtype, builtin::call_pure_extern(), new_args);
+  return Call(call->ty(), builtin::call_pure_extern(), new_args);
 }
 
 inline PrimExpr DispatchShuffle(const PrimExpr& e) {
@@ -66,15 +66,17 @@ inline PrimExpr DispatchShuffle(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   TVM_FFI_ICHECK_EQ(call->args.size(), 5);  // mask, value, warp_id, width, warp_size
   PrimExpr var = call->args[1];
-  TVM_FFI_ICHECK_EQ(var.dtype().bits(), 32);
+  PrimType var_ty = var.ty();
+  TVM_FFI_ICHECK_EQ(var_ty.bits(), 32);
 
   // get own lane in self (__lane_id)
   PrimExpr minus_one = IntImm::Int32(-1);
   PrimExpr zero = IntImm::Int32(0);
-  PrimExpr lo = Call(DataType::Int(32), builtin::call_pure_extern(),
+  PrimType i32_ty = PrimType::Int(32);
+  PrimExpr lo = Call(i32_ty, builtin::call_pure_extern(),
                      {StringImm("llvm.amdgcn.mbcnt.lo"), minus_one, zero});
-  PrimExpr self = Call(DataType::Int(32), builtin::call_pure_extern(),
-                       {StringImm("llvm.amdgcn.mbcnt.hi"), minus_one, lo});
+  PrimExpr self =
+      Call(i32_ty, builtin::call_pure_extern(), {StringImm("llvm.amdgcn.mbcnt.hi"), minus_one, lo});
 
   // compute lane to get from
   PrimExpr width = call->args[3];
@@ -93,12 +95,12 @@ inline PrimExpr DispatchShuffle(const PrimExpr& e) {
     index = Select((self & (width - 1)) + delta >= width, self, index);
   }
   // reinterprete var as int32
-  bool is_int32 = var.dtype().is_int() && var.dtype().bits() == 32;
-  PrimExpr source = is_int32 ? var : reinterpret(DataType::Int(32), var);
-  PrimExpr res = Call(DataType::Int(32), builtin::call_pure_extern(),
+  bool is_int32 = var_ty.MatchesElementType(DLDataTypeCode::kDLInt, 32);
+  PrimExpr source = is_int32 ? var : reinterpret(i32_ty, var);
+  PrimExpr res = Call(i32_ty, builtin::call_pure_extern(),
                       {StringImm("llvm.amdgcn.ds.bpermute"), index << 2, source});
   if (!is_int32) {
-    res = reinterpret(var.dtype(), res);
+    res = reinterpret(var_ty, res);
   }
   return res;
 }
diff --git a/src/backend/trn/codegen/codegen_trn.cc b/src/backend/trn/codegen/codegen_trn.cc
index eb9d7ca4b437..631df21f8b08 100644
--- a/src/backend/trn/codegen/codegen_trn.cc
+++ b/src/backend/trn/codegen/codegen_trn.cc
@@ -110,7 +110,7 @@ void CodeGenTrainium::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
   size_t num_buffer = 0;
   for (size_t i = 0; i < func->params.size(); ++i, ++num_buffer) {
     Var v = func->params[i];
-    if (!v.dtype().is_handle()) {
+    if (!v.ty().IsHandle()) {
       LOG(FATAL) << "Trainium codegen currently only support buffer arguments";
     };
     std::string vid = AllocVarID(v.get());
@@ -137,16 +137,17 @@ void CodeGenTrainium::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
   this->EndScope(func_scope);
 }
 
-void CodeGenTrainium::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenTrainium::PrintType(DLDataType raw_t, std::ostream& os) {  // NOLINT(*)
+  PrimType t(raw_t);
   int lanes = t.lanes();
   TVM_FFI_ICHECK(lanes == 1) << "Trainium codegen does not support vector types";
-  TVM_FFI_ICHECK(!t.is_handle()) << "Trainium codegen does not support handle type";
-  TVM_FFI_ICHECK(!t.is_void()) << "Trainium codegen does not support void type";
-  if (t == DataType::Bool()) {
+  TVM_FFI_ICHECK(!t.IsHandle()) << "Trainium codegen does not support handle type";
+  TVM_FFI_ICHECK(!t.IsVoid()) << "Trainium codegen does not support void type";
+  if (t.MatchesCode(DLDataTypeCode::kDLBool)) {
     os << "np.bool";
     return;
   }
-  if (t.is_float()) {
+  if (t.MatchesCode(DLDataTypeCode::kDLFloat)) {
     switch (t.bits()) {
       case 16:
         os << "np.float16";
@@ -160,13 +161,13 @@ void CodeGenTrainium::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
     }
     return;
   }
-  if (t.is_uint() || t.is_int()) {
+  if (t.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)) {
     if (t.bits() == 1) {
       os << "np.bool";
       return;
     }
     os << "np.";
-    if (t.is_uint()) {
+    if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << 'u';
     }
     switch (t.bits()) {
@@ -188,11 +189,11 @@ void CodeGenTrainium::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
     }
     return;
   }
-  if (t.is_bfloat16()) {
+  if (t.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     os << "nl.bfloat16";
     return;
   }
-  LOG(FATAL) << "Cannot convert type " << t << " to Trainium type";
+  LOG(FATAL) << "Cannot convert type " << raw_t << " to Trainium type";
 }
 
 std::string CodeGenTrainium::GetStorageScopeStr(const std::string& scope) {  // NOLINT(*)
@@ -215,7 +216,7 @@ void CodeGenTrainium::VisitStmt_(const AllocBufferNode* op) {
   this->PrintIndent();
   auto scope = GetPtrStorageScope(op->buffer->data);
   std::ostringstream dtype_os;
-  PrintType(op->buffer->dtype, dtype_os);
+  PrintType(op->buffer->dtype->dtype, dtype_os);
   std::string dtype_str = dtype_os.str();
   if (scope == "trn.psum") {
     stream << vid << " = nl.ndarray(shape=[";
@@ -589,7 +590,7 @@ void CodeGenTrainium::VisitExpr_(const VarNode* op, std::ostream& os) {  // NOLI
 }
 
 void CodeGenTrainium::VisitExpr_(const CastNode* op, std::ostream& os) {
-  ctx_.dst_dtype = op->dtype;
+  ctx_.dst_dtype = op->ty();
   CodeGenTrainium::VisitExpr(op->value, os);
 }
 
diff --git a/src/backend/trn/codegen/codegen_trn.h b/src/backend/trn/codegen/codegen_trn.h
index 2c3b5fd37393..ec4eaad29cce 100644
--- a/src/backend/trn/codegen/codegen_trn.h
+++ b/src/backend/trn/codegen/codegen_trn.h
@@ -41,7 +41,7 @@ struct NKIInstructionCtx {
   bool is_matmul_input = false;
   int buffer_index = -1;
   int used_var_cnt = 0;
-  DataType dst_dtype;
+  PrimType dst_dtype = PrimType::Void();
   PrimExpr mask;
   bool tensorizing = false;
 };
@@ -57,7 +57,7 @@ class CodeGenTrainium final : public CodeGenC {
   void InitFuncState(const PrimFunc& f) final;
   std::string GetStorageScopeStr(const std::string& scope);           // NOLINT(*)
   void VisitExpr_(const VarNode* op, std::ostream& os) final;         // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;                 // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;               // NOLINT(*)
   void VisitStmt_(const AllocBufferNode* op) final;                   // NOLINT(*)
   void VisitStmt_(const AttrStmtNode* op) final;                      // NOLINT(*)
   void VisitStmt_(const ForNode* op) final;                           // NOLINT(*)
diff --git a/src/backend/trn/transform/lower_trainium_layout.cc b/src/backend/trn/transform/lower_trainium_layout.cc
index ad4b206a48b2..fb1d92c5215d 100644
--- a/src/backend/trn/transform/lower_trainium_layout.cc
+++ b/src/backend/trn/transform/lower_trainium_layout.cc
@@ -176,8 +176,8 @@ class TrainiumLayoutApplier : public arith::IRMutatorWithAnalyzer {
       flattened = buf.GetFlattenedBuffer();
       writer = flattened.CopyOnWrite();
     }
-    if (flattened->dtype == DataType::Bool()) {
-      writer->dtype = DataType::Int(8);
+    if (flattened->dtype->dtype == DLDataType{kDLBool, 8, 1}) {
+      writer->dtype = PrimType::Int(8);
     }
     for (size_t i = 0; i < flattened->shape.size(); ++i) {
       writer->shape.Set(i, analyzer_->canonical_simplify(flattened->shape[i]));
@@ -191,28 +191,30 @@ class TrainiumLayoutApplier : public arith::IRMutatorWithAnalyzer {
 
   Stmt VisitStmt_(const BufferStoreNode* op) final {
     BufferStore store = StmtExprMutator::VisitStmt_(op).as_or_throw<BufferStore>();
-    bool store_returns_bool = (op->value.dtype() == DataType::Bool());
+    PrimType store_value_ty = op->value.ty();
+    bool store_returns_bool = (store_value_ty->dtype == DLDataType{kDLBool, 8, 1});
     store = VisitBufferAccess(store);
 
     if (store_returns_bool) {
-      TVM_FFI_ICHECK_EQ(store->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(store->buffer->dtype->dtype, (DLDataType{kDLInt, 8, 1}))
           << "Expected int8 backing array for boolean tensor";
       auto writer = store.CopyOnWrite();
-      writer->value = tvm::cast(DataType::Int(8), store->value);
+      writer->value = tvm::cast(PrimType::Int(8), store->value);
       return std::move(store);
     }
     return std::move(store);
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
-    bool load_returns_bool = (op->dtype == DataType::Bool());
+    PrimType load_ty = op->ty();
+    bool load_returns_bool = (load_ty->dtype == DLDataType{kDLBool, 8, 1});
     BufferLoad load = StmtExprMutator::VisitExpr_(op).as_or_throw<BufferLoad>();
     load = VisitBufferAccess(load);
     if (load_returns_bool) {
-      TVM_FFI_ICHECK_EQ(load->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(load->buffer->dtype->dtype, (DLDataType{kDLInt, 8, 1}))
           << "Expected int8 backing array for boolean tensor";
-      load.CopyOnWrite()->dtype = DataType::Int(8);
-      return tvm::cast(DataType::Bool(), load);
+      load.CopyOnWrite()->BaseExprNode::ty = PrimType::Int(8);
+      return tvm::cast(PrimType::Bool(), load);
     } else {
       return std::move(load);
     }
diff --git a/src/backend/vulkan/codegen/codegen_spirv.cc b/src/backend/vulkan/codegen/codegen_spirv.cc
index 5737c60da9dc..094e31370481 100644
--- a/src/backend/vulkan/codegen/codegen_spirv.cc
+++ b/src/backend/vulkan/codegen/codegen_spirv.cc
@@ -52,8 +52,8 @@ runtime::SPIRVShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std::s
   const uint32_t descriptor_set = 0;
 
   for (Var arg : f->params) {
-    DataType t = arg.dtype();
-    if (t.is_handle()) {
+    PrimType t = PrimType(arg.ty()->dtype);
+    if (t.IsHandle()) {
       auto* ptr = arg->type_annotation.as<PointerTypeNode>();
       TVM_FFI_ICHECK(ptr)
           << "All handles passed to the Vulkan codegen must have a type_annotation as a "
@@ -64,11 +64,11 @@ runtime::SPIRVShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std::s
           << "All handles passed to the Vulkan codegen must have a type_annotation as a "
              "PointerType, "
           << "and must point to a PrimType";
-      DataType value_storage_type = prim->dtype;
-      if (value_storage_type == DataType::Bool()) {
+      PrimType value_storage_type(prim->dtype);
+      if (value_storage_type == PrimType::Bool()) {
         // We need a physically addressable buffer type to support boolean tensors.
         // The loaded byte is cast to bool inside the LoadNode visitor below.
-        value_storage_type = boolean_storage_type_.with_lanes(value_storage_type.lanes());
+        value_storage_type = boolean_storage_type_.WithLanes(value_storage_type.lanes());
       }
       spirv::Value arg_value = builder_->BufferArgument(builder_->GetSType(value_storage_type),
                                                         descriptor_set, i_buffer++);
@@ -87,7 +87,7 @@ runtime::SPIRVShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std::s
   if (pod_args.size() != 0) {
     std::vector<spirv::SType> value_types;
     for (size_t i = 0; i < pod_args.size(); ++i) {
-      value_types.push_back(builder_->GetSType(pod_args[i].dtype()));
+      value_types.push_back(builder_->GetSType(PrimType(pod_args[i].ty()->dtype)));
     }
     if (pod_args.size() * sizeof(runtime::ArgUnion64) <= runtime::vulkan::kMaxPushConstantsBytes) {
       spirv::Value ptr = builder_->DeclarePushConstant(value_types);
@@ -150,7 +150,7 @@ spirv::Value CodeGenSPIRV::GetThreadIndex(const IterVar& iv, const PrimExpr& ext
   } else {
     v = builder_->GetWorkgroupID(ts.dim_index);
   }
-  return builder_->Cast(builder_->GetSType(iv->var.dtype()), v);
+  return builder_->Cast(builder_->GetSType(PrimType(iv->var.ty()->dtype)), v);
 }
 
 spirv::Value CodeGenSPIRV::CreateStorageSync(const CallNode* op) {
@@ -179,7 +179,7 @@ spirv::Value CodeGenSPIRV::CreateStorageSync(const CallNode* op) {
     TVM_FFI_THROW(InternalError) << "Do not support sync " << sync;
   }
 
-  auto type_int = builder_->GetSType(DataType::Int(32));
+  auto type_int = builder_->GetSType(PrimType::Int(32));
   builder_->MakeInst(spv::OpControlBarrier, builder_->IntImm(type_int, sync_scope),
                      builder_->IntImm(type_int, sync_scope),
                      builder_->IntImm(type_int, memory_semantics));
@@ -194,11 +194,11 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const VarNode* op) {
 }
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const IntImmNode* op) {
-  return builder_->IntImm(builder_->GetSType(op->dtype), op->value);
+  return builder_->IntImm(builder_->GetSType(PrimType(op->ty()->dtype)), op->value);
 }
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const FloatImmNode* op) {
-  return builder_->FloatImm(builder_->GetSType(op->dtype), op->value);
+  return builder_->FloatImm(builder_->GetSType(PrimType(op->ty()->dtype)), op->value);
 }
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const StringImmNode* op) {
@@ -206,7 +206,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const StringImmNode* op) {
 }
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const CastNode* op) {
-  return builder_->Cast(builder_->GetSType(op->dtype), MakeValue(op->value));
+  return builder_->Cast(builder_->GetSType(PrimType(op->ty()->dtype)), MakeValue(op->value));
 }
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const AddNode* op) {
@@ -308,7 +308,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     for (size_t i = 1; i < op->args.size(); ++i) {
       values.push_back(MakeValue(op->args[i]));
     }
-    return builder_->CallGLSL450(builder_->GetSType(op->dtype), inst_id, values);
+    return builder_->CallGLSL450(builder_->GetSType(PrimType(op->ty()->dtype)), inst_id, values);
   } else if (op->op.same_as(builtin::bitwise_and())) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 2U);
     spirv::Value a = MakeValue(op->args[0]);
@@ -337,20 +337,20 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 2U);
     spirv::Value a = MakeValue(op->args[0]);
     spirv::Value b = MakeValue(op->args[1]);
-    if (op->args[0].dtype().is_int()) {
+    if (PrimType(op->args[0].ty()->dtype).MatchesCode(DLDataTypeCode::kDLInt)) {
       return builder_->MakeValue(spv::OpShiftRightArithmetic, a.stype, a, b);
     } else {
       return builder_->MakeValue(spv::OpShiftRightLogical, a.stype, a, b);
     }
   } else if (op->op.same_as(builtin::reinterpret())) {
-    return builder_->MakeValue(spv::OpBitcast, builder_->GetSType(op->dtype),
+    return builder_->MakeValue(spv::OpBitcast, builder_->GetSType(PrimType(op->ty()->dtype)),
                                MakeValue(op->args[0]));
   } else if (op->op.same_as(builtin::large_uint_imm())) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 2U);
     uint64_t low = static_cast<uint64_t>(op->args[0].as_or_throw<IntImm>()->value);
     uint64_t high = static_cast<uint64_t>(op->args[1].as_or_throw<IntImm>()->value);
     uint64_t val = (high << 32U) | low;
-    return builder_->UIntImm(builder_->GetSType(op->dtype), val);
+    return builder_->UIntImm(builder_->GetSType(PrimType(op->ty()->dtype)), val);
   } else if (op->op.same_as(builtin::tvm_storage_sync())) {
     return this->CreateStorageSync(op);
   } else if (op->op.same_as(builtin::if_then_else())) {
@@ -378,7 +378,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     phi.SetIncoming(1, else_value, else_value_label);
     return phi;
   } else if (op->op.same_as(builtin::popcount())) {
-    return builder_->MakeValue(spv::OpBitCount, builder_->GetSType(op->dtype),
+    return builder_->MakeValue(spv::OpBitCount, builder_->GetSType(PrimType(op->ty()->dtype)),
                                MakeValue(op->args[0]));
   } else if (op->op.same_as(builtin::call_pure_extern())) {
     TVM_FFI_ICHECK_GE(op->args.size(), 1U);
@@ -388,7 +388,8 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
       for (size_t i = 1; i < op->args.size(); ++i) {
         values.push_back(MakeValue(op->args[i]));
       }
-      return builder_->CallKHRIntegerDotProduct(builder_->GetSType(op->dtype), values, op->dtype);
+      PrimType op_dtype(op->ty()->dtype);
+      return builder_->CallKHRIntegerDotProduct(builder_->GetSType(op_dtype), values, op_dtype);
     } else {
       TVM_FFI_THROW(InternalError)
           << "SPIR-V shader cannot make extern calls.  Graph contains extern \""
@@ -412,8 +413,9 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 6U);
     const VarNode* buffer_node = op->args[0].as<VarNode>();
     TVM_FFI_ICHECK(buffer_node && fragment_info_.count(buffer_node));
-    DataType ele_dtype = GetElementDataType(buffer_node);
-    TVM_FFI_ICHECK(ele_dtype.is_float()) << "Only floating point fragment accumulator is supported";
+    PrimType ele_dtype = GetElementDataType(buffer_node);
+    TVM_FFI_ICHECK(ele_dtype.MatchesCode(DLDataTypeCode::kDLFloat))
+        << "Only floating point fragment accumulator is supported";
     spirv::SType ele_stype = builder_->GetSType(ele_dtype);
     spirv::SType& fragment_type = fragment_info_[buffer_node].stype;
     double init = static_cast<uint64_t>(op->args[5].as_or_throw<FloatImm>()->value);
@@ -435,7 +437,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     PrimExpr dst_index = op->args[4];
     PrimExpr src_ptr_expr = op->args[5];
     int stride = static_cast<int>(op->args[6].as_or_throw<IntImm>()->value);
-    auto type_int = builder_->GetSType(DataType::Int(32));
+    auto type_int = builder_->GetSType(PrimType::Int(32));
     spirv::Value stride_val = builder_->IntImm(type_int, stride);
     std::string layout = (op->args[7].as<StringImmNode>())->value;
     spirv::SType dst_ptr_type =
@@ -443,7 +445,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     spirv::Value dst_ptr =
         builder_->StructArrayAccess(dst_ptr_type, var_map_[buffer_node], MakeValue(dst_index));
     spirv::Value src_ptr = VisitExpr(op->args[5]);
-    spirv::SType type_bool = builder_->GetSType(DataType::Bool());
+    spirv::SType type_bool = builder_->GetSType(PrimType::Bool());
     spirv::Value t_val = builder_->UIntImm(type_bool, 1);
     spirv::Value f_val = builder_->UIntImm(type_bool, 0);
     spirv::Value loaded =
@@ -494,7 +496,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     PrimExpr index = op->args[4];
     PrimExpr buffer_ptr = op->args[5];
     int stride = static_cast<int>(op->args[6].as_or_throw<IntImm>()->value);
-    auto type_int = builder_->GetSType(DataType::Int(32));
+    auto type_int = builder_->GetSType(PrimType::Int(32));
     spirv::Value stride_val = builder_->IntImm(type_int, stride);
     std::string layout = (op->args[7].as<StringImmNode>())->value;
     spirv::Value dst_ptr = VisitExpr(op->args[5]);
@@ -505,7 +507,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
         builder_->StructArrayAccess(ptr_type, var_map_[buffer_node], MakeValue(index));
     uint32_t mask = spv::MemoryAccessMaskNone;
     spirv::Value loaded = builder_->MakeValue(spv::OpLoad, fragment_type, ptr, mask);
-    spirv::SType type_bool = builder_->GetSType(DataType::Bool());
+    spirv::SType type_bool = builder_->GetSType(PrimType::Bool());
     spirv::Value t_val = builder_->UIntImm(type_bool, 1);
     spirv::Value f_val = builder_->UIntImm(type_bool, 0);
     builder_->MakeInst(spv::OpCooperativeMatrixStoreNV, dst_ptr, loaded, stride_val,
@@ -516,7 +518,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     Var buffer_var = load->buffer->data;
     const VarNode* buffer_node = buffer_var.get();
     PrimExpr index = load->indices[0];
-    DataType ele_dtype = GetElementDataType(buffer_node);
+    PrimType ele_dtype = GetElementDataType(buffer_node);
     spirv::SType ele_stype = builder_->GetSType(ele_dtype);
     spirv::Value buffer_val = MakeValue(buffer_var);
     spirv::SType ptr_type = builder_->GetPointerType(ele_stype, buffer_val.stype.storage_class);
@@ -532,11 +534,11 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
 spirv::Value CodeGenSPIRV::VisitExpr_(const RampNode* op) {
   std::vector<spirv::Value> values;
   spirv::Value base = MakeValue(op->base);
-  int lanes = op->dtype.lanes();
+  int lanes = op->ty().lanes();
   for (int i = 0; i < lanes; ++i) {
     spirv::Value v = base;
     if (i != 0) {
-      spirv::Value offset = MakeValue(MakeConst(op->stride.dtype(), i) * op->stride);
+      spirv::Value offset = MakeValue(MakeConst(op->stride.ty(), i) * op->stride);
       v = builder_->Add(v, offset);
     }
     values.push_back(v);
@@ -547,7 +549,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const RampNode* op) {
 spirv::Value CodeGenSPIRV::VisitExpr_(const BroadcastNode* op) {
   std::vector<spirv::Value> values;
   spirv::Value v = MakeValue(op->value);
-  int lanes = op->dtype.lanes();
+  int lanes = op->ty().lanes();
   for (int i = 0; i < lanes; i++) {
     values.push_back(v);
   }
@@ -560,15 +562,15 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const BufferLoadNode* op) {
   Var buffer_var = op->buffer->data;
   PrimExpr prim_index = op->indices[0];
 
-  DataType desired_read_type = op->dtype;
-  if (desired_read_type == DataType::Bool()) {
-    desired_read_type = boolean_storage_type_.with_lanes(desired_read_type.lanes());
+  PrimType desired_read_type(op->ty()->dtype);
+  if (desired_read_type == PrimType::Bool()) {
+    desired_read_type = boolean_storage_type_.WithLanes(desired_read_type.lanes());
   }
 
   auto it = storage_info_.find(buffer_var.get());
   TVM_FFI_ICHECK(it != storage_info_.end());
   StorageInfo& info = it->second;
-  info.CheckContentType(desired_read_type, prim_index.dtype().lanes());
+  info.CheckContentType(desired_read_type, PrimType(prim_index.ty()->dtype).lanes());
 
   spirv::SType content_type = builder_->GetSType(info.element_type);
   spirv::Value buffer = MakeValue(buffer_var);
@@ -588,13 +590,13 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const BufferLoadNode* op) {
     spirv::Value loaded = builder_->MakeValue(spv::OpLoad, content_type, ptr, mask);
     // OpTypeBool have no physical address/storage.  Here, cast from
     // the storage type to an OpTypeBool.
-    if (op->dtype == DataType::Bool()) {
-      auto spirv_bool = builder_->GetSType(DataType::Bool());
+    if (PrimType(op->ty()->dtype) == PrimType::Bool()) {
+      auto spirv_bool = builder_->GetSType(PrimType::Bool());
       loaded = builder_->Cast(spirv_bool, loaded);
     }
     return loaded;
 
-  } else if (desired_read_type.element_of() == info.element_type) {
+  } else if (desired_read_type.WithLanes(1) == info.element_type) {
     // Requested several elements returned as an array.  Read out each
     // element and concatenate into the result.
     std::vector<spirv::Value> values;
@@ -609,21 +611,22 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const BufferLoadNode* op) {
     TVM_FFI_THROW(InternalError) << "Cannot perform buffer access of buffer variable '"
                                  << buffer_var->name_hint << "' with element type "
                                  << info.element_type << " using index of type "
-                                 << prim_index->dtype << " to produce output of type " << op->dtype;
+                                 << PrimType(prim_index.ty()->dtype)
+                                 << " to produce output of type " << PrimType(op->ty()->dtype);
     return spirv::Value();
   }
 }
 
 void CodeGenSPIRV::Scalarize(const PrimExpr& e, std::function<void(int i, spirv::Value v)> f) {
   if (const RampNode* ramp = e.as<RampNode>()) {
-    for (int i = 0; i < ramp->dtype.lanes(); ++i) {
+    for (int i = 0; i < ramp->ty().lanes(); ++i) {
       PrimExpr offset = ramp->base + ramp->stride * i;
       f(i, MakeValue(offset));
     }
   } else {
-    spirv::SType etype = builder_->GetSType(e.dtype().element_of());
+    spirv::SType etype = builder_->GetSType(e.ty().WithLanes(1));
     spirv::Value value = MakeValue(e);
-    for (int i = 0; i < e.dtype().lanes(); ++i) {
+    for (int i = 0; i < e.ty().lanes(); ++i) {
       f(i, builder_->MakeValue(spv::OpCompositeExtract, etype, value, i));
     }
   }
@@ -635,7 +638,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const ShuffleNode* op) {
       << "of one vector with one index";
   spirv::Value vector = MakeValue(op->vectors[0]);
   int index = op->indices[0].as_or_throw<IntImm>()->value;
-  spirv::SType etype = builder_->GetSType(op->dtype);
+  spirv::SType etype = builder_->GetSType(PrimType(op->ty()->dtype));
   spirv::Value element = builder_->MakeValue(spv::OpCompositeExtract, etype, vector, index);
   return element;
 }
@@ -649,7 +652,7 @@ void CodeGenSPIRV::VisitStmt_(const BufferStoreNode* op) {
   auto it = storage_info_.find(buffer_var.get());
   TVM_FFI_ICHECK(it != storage_info_.end());
   StorageInfo& info = it->second;
-  info.CheckContentType(op->value.dtype(), prim_index.dtype().lanes());
+  info.CheckContentType(PrimType(op->value.ty()->dtype), PrimType(prim_index.ty()->dtype).lanes());
 
   spirv::SType content_type = builder_->GetSType(info.element_type);
   spirv::Value buffer = MakeValue(buffer_var);
@@ -661,16 +664,16 @@ void CodeGenSPIRV::VisitStmt_(const BufferStoreNode* op) {
     mask |= spv::MemoryAccessVolatileMask;
   }
 
-  if (op->value.dtype() == info.element_type) {
+  if (PrimType(op->value.ty()->dtype) == info.element_type) {
     // Requested store of a single value.  This may be a scalar store
     // or a vectorized store, based on the array element type.
-    TVM_FFI_ICHECK_EQ(info.element_type, op->value.dtype())
+    TVM_FFI_ICHECK_EQ(info.element_type, PrimType(op->value.ty()->dtype))
         << "Vulkan only allow one type access to the same buffer";
     spirv::Value index = MakeValue(prim_index);
     spirv::Value ptr = builder_->StructArrayAccess(ptr_type, buffer, index);
     builder_->MakeInst(spv::OpStore, ptr, value, mask);
 
-  } else if (op->value.dtype().element_of() == info.element_type) {
+  } else if (PrimType(op->value.ty()->dtype).WithLanes(1) == info.element_type) {
     // Requested store of several arbitrarily located values.  Extract
     // each value from the composite, then assign to the buffer.
     auto f = [&](int i, spirv::Value index) {
@@ -681,10 +684,10 @@ void CodeGenSPIRV::VisitStmt_(const BufferStoreNode* op) {
     this->Scalarize(prim_index, f);
 
   } else {
-    TVM_FFI_THROW(InternalError) << "Cannot store value of type " << op->value.dtype()
+    TVM_FFI_THROW(InternalError) << "Cannot store value of type " << PrimType(op->value.ty()->dtype)
                                  << " into buffer variable '" << buffer_var->name_hint
                                  << "' with element type " << info.element_type
-                                 << " using index of type " << prim_index->dtype;
+                                 << " using index of type " << PrimType(prim_index.ty()->dtype);
   }
 }
 
@@ -697,10 +700,11 @@ void CodeGenSPIRV::VisitStmt_(const ForNode* op) {
   // loop step
   spirv::Value step;
   if (op->HasTrivialStep()) {
-    step = op->loop_var.dtype().is_int() ? builder_->IntImm(init_value.stype, 1)
-                                         : builder_->UIntImm(init_value.stype, 1);
+    step = PrimType(op->loop_var.ty()->dtype).MatchesCode(DLDataTypeCode::kDLInt)
+               ? builder_->IntImm(init_value.stype, 1)
+               : builder_->UIntImm(init_value.stype, 1);
   } else {
-    step = MakeValue(tvm::cast(end->dtype, *op->step));
+    step = MakeValue(tvm::cast(end.ty(), *op->step));
   }
 
   // Must get init label after making value(to make sure they are correct)
@@ -807,7 +811,7 @@ void CodeGenSPIRV::VisitStmt_(const IfThenElseNode* op) {
 }
 
 void CodeGenSPIRV::VisitStmt_(const AllocBufferNode* op) {
-  TVM_FFI_ICHECK(!op->buffer->dtype.is_handle());
+  TVM_FFI_ICHECK(!op->buffer->dtype.IsHandle());
   const IntImmNode* dim_imm = op->buffer->shape[0].as<IntImmNode>();
   TVM_FFI_ICHECK(dim_imm) << "Can only handle constant size stack allocation in GPU";
   size_t constant_size = static_cast<size_t>(dim_imm->value);
@@ -848,7 +852,7 @@ void CodeGenSPIRV::VisitStmt_(const AllocBufferNode* op) {
       int32_t aligned_constant_size = ((constant_size + 3) & ~0x3);
       buf = builder_->Allocate(etype, static_cast<uint32_t>(aligned_constant_size), storage_class);
 
-      size_t num_bytes = op->buffer->dtype.bytes() * op->buffer->dtype.lanes() *
+      size_t num_bytes = ((op->buffer->dtype.bits() + 7) / 8) * op->buffer->dtype.lanes() *
                          static_cast<uint32_t>(aligned_constant_size);
       shared_memory_bytes_used_ += num_bytes;
     } break;
@@ -897,7 +901,7 @@ void CodeGenSPIRV::VisitStmt_(const AssertStmtNode* op) {
 
 void CodeGenSPIRV::VisitStmt_(const BindNode* op) {
   TVM_FFI_ICHECK(!var_map_.count(op->var.get()));
-  TVM_FFI_ICHECK(!op->var.dtype().is_handle());
+  TVM_FFI_ICHECK(!PrimType(op->var.ty()->dtype).IsHandle());
   var_map_[op->var.get()] = MakeValue(op->value);
   analyzer_->Bind(op->var, op->value);
 }
@@ -910,18 +914,18 @@ void CodeGenSPIRV::VisitStmt_(const SeqStmtNode* op) {
 
 void CodeGenSPIRV::VisitStmt_(const EvaluateNode* op) { MakeValue(op->value); }
 
-spirv::SType CodeGenSPIRV::GetFragmentSType(const VarNode* buffer, const DataType& dtype) {
+spirv::SType CodeGenSPIRV::GetFragmentSType(const VarNode* buffer, const PrimType& dtype) {
   TVM_FFI_ICHECK(fragment_info_.count(buffer));
   const std::string& scope = fragment_info_[buffer].scope;
   const std::string& shape_str = fragment_info_.at(buffer).shape;
   std::pair<int32_t, int32_t> dim = GetWmmaFragmentDimSize(shape_str, scope);
   int64_t size = dim.first * dim.second;
-  spirv::SType stype = builder_->GetSType(dtype.with_lanes(size), dim.first, dim.second);
+  spirv::SType stype = builder_->GetSType(dtype.WithLanes(size), dim.first, dim.second);
   fragment_info_[buffer].stype = stype;
   return stype;
 }
 
-DataType CodeGenSPIRV::GetElementDataType(const VarNode* buffer) {
+PrimType CodeGenSPIRV::GetElementDataType(const VarNode* buffer) {
   auto it = storage_info_.find(buffer);
   TVM_FFI_ICHECK(it != storage_info_.end());
   return it->second.element_type;
diff --git a/src/backend/vulkan/codegen/codegen_spirv.h b/src/backend/vulkan/codegen/codegen_spirv.h
index 46fbcb696b6f..5ade6e383908 100644
--- a/src/backend/vulkan/codegen/codegen_spirv.h
+++ b/src/backend/vulkan/codegen/codegen_spirv.h
@@ -142,7 +142,7 @@ class CodeGenSPIRV : public ExprFunctor<spirv::Value(const PrimExpr&)>,
      * buffer variable (AllocBufferNode) or of the parameter (shader
      * arguments).
      */
-    DataType element_type{DataType()};
+    PrimType element_type{PrimType::Void()};
 
     /* \brief Check that the access type matches the known type
      *
@@ -156,10 +156,10 @@ class CodeGenSPIRV : public ExprFunctor<spirv::Value(const PrimExpr&)>,
      * product of the number of lanes of the buffer element type and
      * the number of lanes of the index.
      */
-    void CheckContentType(DataType type, int index_lanes = 1) const {
+    void CheckContentType(PrimType type, int index_lanes = 1) const {
       TVM_FFI_ICHECK(element_type_known) << "Cannot check element type of buffer " << name_hint
                                          << " no previous element type defined";
-      DataType expected_type = element_type.with_lanes(index_lanes * element_type.lanes());
+      PrimType expected_type = element_type.WithLanes(index_lanes * element_type.lanes());
       TVM_FFI_ICHECK_EQ(type, expected_type)
           << "Attempted to access buffer " << name_hint << " as element type " << type
           << " using an index of size " << index_lanes << " when the element type is "
@@ -167,7 +167,7 @@ class CodeGenSPIRV : public ExprFunctor<spirv::Value(const PrimExpr&)>,
     }
 
     // Update content type if it hasn't been updated.
-    void SetContentType(DataType type, std::string name_hint) {
+    void SetContentType(PrimType type, std::string name_hint) {
       TVM_FFI_ICHECK(!element_type_known)
           << "Cannot set element type of buffer " << name_hint << " a second time.";
       this->element_type = type;
@@ -191,8 +191,8 @@ class CodeGenSPIRV : public ExprFunctor<spirv::Value(const PrimExpr&)>,
   spirv::Value CreateStorageSync(const CallNode* op);
   void Scalarize(const PrimExpr& e, std::function<void(int i, spirv::Value v)> f);
 
-  spirv::SType GetFragmentSType(const VarNode* buffer, const DataType& dtype);
-  DataType GetElementDataType(const VarNode* buffer);
+  spirv::SType GetFragmentSType(const VarNode* buffer, const PrimType& dtype);
+  PrimType GetElementDataType(const VarNode* buffer);
 
   // SPIRV-related capabilities of the target
   SPIRVSupport spirv_support_;
@@ -213,7 +213,7 @@ class CodeGenSPIRV : public ExprFunctor<spirv::Value(const PrimExpr&)>,
    * integer type supported by the device, as not all Vulkan
    * implementations support int8.
    */
-  DataType boolean_storage_type_{DataType::Int(8)};
+  PrimType boolean_storage_type_{PrimType::Int(8)};
 
   // the storage scope of allocation
   std::unordered_map<const VarNode*, StorageInfo> storage_info_;
diff --git a/src/backend/vulkan/codegen/intrin_rule_spirv.cc b/src/backend/vulkan/codegen/intrin_rule_spirv.cc
index 14287562d9e4..6deb6e0a9b61 100644
--- a/src/backend/vulkan/codegen/intrin_rule_spirv.cc
+++ b/src/backend/vulkan/codegen/intrin_rule_spirv.cc
@@ -39,12 +39,12 @@ PrimExpr CallGLSLIntrin(PrimExpr e, const ffi::Array<PrimExpr>& args) {
   TVM_FFI_ICHECK(call != nullptr);
   ffi::Array<PrimExpr> cargs;
   // intrin id.
-  cargs.push_back(IntImm(DataType::UInt(32), id));
+  cargs.push_back(IntImm(PrimType::UInt(32), id));
 
   for (PrimExpr arg : args) {
     cargs.push_back(arg);
   }
-  return tirx::Call(call->dtype, tirx::builtin::call_spirv_pure_glsl450(), cargs);
+  return tirx::Call(call->ty(), tirx::builtin::call_spirv_pure_glsl450(), cargs);
 }
 
 template <unsigned id>
@@ -166,21 +166,22 @@ TVM_REGISTER_OP("tirx.clz")
       TVM_FFI_ICHECK(call != nullptr);
       TVM_FFI_ICHECK_EQ(call->args.size(), 1);
       PrimExpr arg = call->args[0];
+      PrimType arg_ty = arg.ty();
       PrimExpr msb;
-      if (arg.dtype().bits() == 64) {
+      if (arg_ty.bits() == 64) {
         // SPIR-V FindUMsb intrinsic only supports 32 bit input
-        auto int32 = DataType::Int(32);
+        auto int32 = PrimType::Int(32);
         PrimExpr arg_hi32 = tvm::tirx::Cast(int32, arg >> 32);
         PrimExpr arg_lo32 = tvm::tirx::Cast(int32, arg);
         PrimExpr msb_hi = CallGLSLIntrin<GLSLstd450FindUMsb>(e, {arg_hi32});
         PrimExpr msb_lo = CallGLSLIntrin<GLSLstd450FindUMsb>(e, {arg_lo32});
         msb = tvm::if_then_else(arg_hi32 == 0, msb_lo, msb_hi + 32);
-      } else if (arg.dtype().bits() == 32) {
+      } else if (arg_ty.bits() == 32) {
         msb = CallGLSLIntrin<GLSLstd450FindUMsb>(e);
       } else {
         TVM_FFI_THROW(InternalError) << "SPIR-V clz only supports a 32 bit or 64 bit integer.";
       }
-      return PrimExpr(arg.dtype().bits() - 1) - msb;
+      return PrimExpr(arg_ty.bits() - 1) - msb;
     });
   // clang-format on
 }
diff --git a/src/backend/vulkan/codegen/ir_builder.cc b/src/backend/vulkan/codegen/ir_builder.cc
index f912e482761c..e986454a7f75 100644
--- a/src/backend/vulkan/codegen/ir_builder.cc
+++ b/src/backend/vulkan/codegen/ir_builder.cc
@@ -74,10 +74,10 @@ void IRBuilder::InitHeader() {
 
 void IRBuilder::InitPreDefs() {
   ext_glsl450_ = ExtInstImport("GLSL.std.450");
-  t_int32_ = DeclareType(DataType::Int(32));
-  t_uint32_ = DeclareType(DataType::UInt(32));
-  t_bool_ = DeclareType(DataType::Bool());
-  t_fp32_ = DeclareType(DataType::Float(32));
+  t_int32_ = DeclareType(PrimType::Int(32));
+  t_uint32_ = DeclareType(PrimType::UInt(32));
+  t_bool_ = DeclareType(PrimType::Bool());
+  t_fp32_ = DeclareType(PrimType::Float(32));
   const_i32_zero_ = IntImm(t_int32_, 0);
 
   // declare void, and void functions
@@ -112,14 +112,14 @@ std::vector<uint32_t> IRBuilder::Finalize() {
   return data;
 }
 
-SType IRBuilder::GetSType(const DataType& dtype, uint32_t row, uint32_t col) {
-  if (dtype == DataType::Int(32)) {
+SType IRBuilder::GetSType(const PrimType& dtype, uint32_t row, uint32_t col) {
+  if (dtype == PrimType::Int(32)) {
     return t_int32_;
-  } else if (dtype == DataType::Bool()) {
+  } else if (dtype == PrimType::Bool()) {
     return t_bool_;
-  } else if (dtype == DataType::Float(32)) {
+  } else if (dtype == PrimType::Float(32)) {
     return t_fp32_;
-  } else if (dtype == DataType::UInt(32)) {
+  } else if (dtype == PrimType::UInt(32)) {
     return t_uint32_;
   }
   uint64_t type_key;
@@ -151,7 +151,7 @@ SType IRBuilder::GetPointerType(const SType& value_type, spv::StorageClass stora
   }
   SType t;
   t.id = id_counter_++;
-  t.type = DataType::Handle();
+  t.type = PrimType::Handle();
   t.element_type_id = value_type.id;
   t.storage_class = storage_class;
   ib_.Begin(spv::OpTypePointer).AddSeq(t, storage_class, value_type).Commit(&global_);
@@ -169,11 +169,11 @@ SType IRBuilder::GetStructArrayType(const SType& value_type, uint32_t num_elems,
 
   SType arr_type;
   arr_type.id = id_counter_++;
-  arr_type.type = DataType::Handle();
+  arr_type.type = PrimType::Handle();
   arr_type.element_type_id = value_type.id;
 
   if (num_elems != 0) {
-    Value length = UIntImm(GetSType(DataType::UInt(32)), num_elems);
+    Value length = UIntImm(GetSType(PrimType::UInt(32)), num_elems);
     ib_.Begin(spv::OpTypeArray).AddSeq(arr_type, value_type, length).Commit(&global_);
   } else {
     ib_.Begin(spv::OpTypeRuntimeArray).AddSeq(arr_type, value_type).Commit(&global_);
@@ -188,7 +188,7 @@ SType IRBuilder::GetStructArrayType(const SType& value_type, uint32_t num_elems,
   // declare struct of array
   SType struct_type;
   struct_type.id = id_counter_++;
-  struct_type.type = DataType::Handle();
+  struct_type.type = PrimType::Handle();
   struct_type.element_type_id = value_type.id;
   ib_.Begin(spv::OpTypeStruct).AddSeq(struct_type, arr_type).Commit(&global_);
 
@@ -241,7 +241,7 @@ Value IRBuilder::FloatImm(const SType& dtype, double value) {
     if (data == 0)
       return GetConst_(dtype, &data);
     else
-      return Cast(dtype, FloatImm(GetSType(DataType::Float(32)), value));
+      return Cast(dtype, FloatImm(GetSType(PrimType::Float(32)), value));
   }
 }
 
@@ -270,7 +270,7 @@ Value IRBuilder::DeclareStorageVariable(const std::vector<SType>& value_types,
                                         spv::StorageClass storage_class, ValueKind kind) {
   SType struct_type;
   struct_type.id = id_counter_++;
-  struct_type.type = DataType::Handle();
+  struct_type.type = PrimType::Handle();
   ib_.Begin(spv::OpTypeStruct).Add(struct_type);
   for (const SType& vtype : value_types) {
     ib_.Add(vtype);
@@ -282,7 +282,7 @@ Value IRBuilder::DeclareStorageVariable(const std::vector<SType>& value_types,
     ib_.Begin(spv::OpMemberDecorate)
         .AddSeq(struct_type, i, spv::DecorationOffset, offset)
         .Commit(&decorate_);
-    DataType t = value_types[i].type;
+    PrimType t = value_types[i].type;
     uint32_t nbits = t.bits() * t.lanes();
     TVM_FFI_ICHECK_EQ(nbits % 8, 0);
     uint32_t bytes = (nbits / 8);
@@ -394,13 +394,11 @@ Value IRBuilder::GetBuiltInValue(spv::BuiltIn built_in, uint32_t index, const st
     }
   }
 
-  DataType data_type;
-  DataType global_arr_type;
+  PrimType data_type = PrimType::Int(32);
+  PrimType global_arr_type = data_type.WithLanes(3);
   switch (built_in) {
     case spv::BuiltInLocalInvocationId:
     case spv::BuiltInWorkgroupId:
-      data_type = DataType::Int(32);
-      global_arr_type = data_type.with_lanes(3);
       break;
 
     default:
@@ -468,7 +466,7 @@ Value IRBuilder::GetConst_(const SType& dtype, const uint64_t* pvalue) {
   }
   TVM_FFI_ICHECK_LE(dtype.type.bits(), 64);
   Value ret = NewValue(dtype, kConstant);
-  if (dtype.type == DataType::Bool()) {
+  if (dtype.type == PrimType::Bool()) {
     // bool types.
     if (*pvalue) {
       ib_.Begin(spv::OpConstantTrue).AddSeq(dtype, ret);
@@ -481,7 +479,7 @@ Value IRBuilder::GetConst_(const SType& dtype, const uint64_t* pvalue) {
     uint64_t mask = 0xFFFFFFFFUL;
     ib_.Add(static_cast<uint32_t>(pvalue[0] & mask));
     if (dtype.type.bits() > 32) {
-      if (dtype.type.is_int()) {
+      if (dtype.type.MatchesCode(DLDataTypeCode::kDLInt)) {
         int64_t sign_mask = 0xFFFFFFFFL;
         const int64_t* sign_ptr = reinterpret_cast<const int64_t*>(pvalue);
         ib_.Add(static_cast<uint32_t>((sign_ptr[0] >> 32L) & sign_mask));
@@ -495,20 +493,20 @@ Value IRBuilder::GetConst_(const SType& dtype, const uint64_t* pvalue) {
   return ret;
 }
 
-SType IRBuilder::DeclareType(const DataType& dtype, uint32_t row, uint32_t col) {
+SType IRBuilder::DeclareType(const PrimType& dtype, uint32_t row, uint32_t col) {
   AddCapabilityFor(dtype);
 
   if (dtype.lanes() == 1) {
     SType t;
     t.id = id_counter_++;
     t.type = dtype;
-    if (dtype.is_bool()) {
+    if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
       ib_.Begin(spv::OpTypeBool).Add(t).Commit(&global_);
-    } else if (dtype.is_int()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
       ib_.Begin(spv::OpTypeInt).AddSeq(t, dtype.bits(), 1).Commit(&global_);
-    } else if (dtype.is_uint()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
       ib_.Begin(spv::OpTypeInt).AddSeq(t, dtype.bits(), 0).Commit(&global_);
-    } else if (dtype.is_float()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
       ib_.Begin(spv::OpTypeFloat).AddSeq(t, dtype.bits()).Commit(&global_);
     } else {
       TVM_FFI_THROW(InternalError) << "declare type do not support handle";
@@ -518,15 +516,15 @@ SType IRBuilder::DeclareType(const DataType& dtype, uint32_t row, uint32_t col)
     SType t;
     t.id = id_counter_++;
     t.type = dtype;
-    SType base_type = GetSType(dtype.element_of());
+    SType base_type = GetSType(dtype.WithLanes(1));
 
     if (row * col == 0) {
       TVM_FFI_ICHECK((row == 0) && (col == 0));
       ib_.Begin(spv::OpTypeVector).AddSeq(t, base_type, dtype.lanes()).Commit(&global_);
     } else {
-      Value v_row = GetSpecConst(GetSType(DataType::UInt(32)), row);
-      Value v_col = GetSpecConst(GetSType(DataType::UInt(32)), col);
-      Value scope = UIntImm(GetSType(DataType::UInt(32)), spv::ScopeSubgroup);
+      Value v_row = GetSpecConst(GetSType(PrimType::UInt(32)), row);
+      Value v_col = GetSpecConst(GetSType(PrimType::UInt(32)), col);
+      Value scope = UIntImm(GetSType(PrimType::UInt(32)), spv::ScopeSubgroup);
       ib_.Begin(spv::OpTypeCooperativeMatrixNV)
           .AddSeq(t, base_type, scope, v_row, v_col)
           .Commit(&global_);
@@ -535,9 +533,9 @@ SType IRBuilder::DeclareType(const DataType& dtype, uint32_t row, uint32_t col)
   }
 }
 
-void IRBuilder::AddCapabilityFor(const DataType& dtype) {
+void IRBuilder::AddCapabilityFor(const PrimType& dtype) {
   // Declare appropriate capabilities for int/float types
-  if (dtype.is_int() || dtype.is_uint()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     if (dtype.bits() == 8) {
       TVM_FFI_ICHECK(spirv_support_.supports_int8)
           << "Vulkan target does not support Int8 capability.  "
@@ -561,7 +559,7 @@ void IRBuilder::AddCapabilityFor(const DataType& dtype) {
       capabilities_used_.insert(spv::CapabilityInt64);
     }
 
-  } else if (dtype.is_float()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     if (dtype.bits() == 16) {
       TVM_FFI_ICHECK(spirv_support_.supports_float16)
           << "Vulkan target does not support Float16 capability.  "
@@ -584,7 +582,7 @@ void IRBuilder::AddCapabilityFor(const DataType& dtype) {
   // future.  Requiring StorageBuffer8BitAccess in order to declare an
   // Int8 prevents use of an 8-bit loop iterator on a device that
   // supports Int8 but doesn't support 8-bit buffer access.
-  if (dtype.bits() == 8 && !dtype.is_bool()) {
+  if (dtype.bits() == 8 && !dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
     TVM_FFI_ICHECK(spirv_support_.supports_storage_buffer_8bit_access)
         << "Vulkan target does not support StorageBuffer8BitAccess.  "
         << "If your device supports 8-bit buffer access, "
@@ -642,7 +640,7 @@ Value IRBuilder::CallGLSL450(const SType& ret_type, uint32_t inst_id,
 }
 
 Value IRBuilder::CallKHRIntegerDotProduct(const SType& ret_type, const std::vector<Value>& args,
-                                          const DataType& dtype) {
+                                          const PrimType& dtype) {
   if (args.size() != 3) {
     TVM_FFI_THROW(InternalError) << "Unresolved arguments in SPIRV_KHR_integer_dot_product";
   }
@@ -653,9 +651,9 @@ Value IRBuilder::CallKHRIntegerDotProduct(const SType& ret_type, const std::vect
       << "If your device supports integer dot product operations, "
       << "please either add -mattr=+dotprod to the target, "
       << "or query all device parameters by adding -from_device=0.";
-  if (dtype.is_int()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     ib_.Begin(spv::OpSDotAccSatKHR).AddSeq(ret_type, val);
-  } else if (dtype.is_uint()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
     ib_.Begin(spv::OpUDotAccSatKHR).AddSeq(ret_type, val);
   } else {
     TVM_FFI_THROW(InternalError) << "Unsupported type";
@@ -674,15 +672,15 @@ Value IRBuilder::CallKHRIntegerDotProduct(const SType& ret_type, const std::vect
 
 Value IRBuilder::Concat(const std::vector<Value>& vec) {
   bool is_const = vec[0].flag == kConstant;
-  DataType etype = vec[0].stype.type;
+  PrimType etype = vec[0].stype.type;
   int lanes = etype.lanes();
   for (size_t i = 1; i < vec.size(); ++i) {
-    TVM_FFI_ICHECK_EQ(etype, vec[i].stype.type.element_of())
+    TVM_FFI_ICHECK_EQ(etype, vec[i].stype.type.WithLanes(1))
         << "Cannot concat vector of different element type";
     lanes += vec[i].stype.type.lanes();
     is_const = is_const && (vec[i].flag == kConstant);
   }
-  Value ret = NewValue(GetSType(etype.with_lanes(lanes)), kNormal);
+  Value ret = NewValue(GetSType(etype.WithLanes(lanes)), kNormal);
   if (is_const && vec.size() == static_cast<size_t>(lanes)) {
     ib_.Begin(spv::OpConstantComposite);
     ib_.AddSeq(ret.stype, ret);
@@ -704,53 +702,56 @@ Value IRBuilder::Concat(const std::vector<Value>& vec) {
 Value IRBuilder::Cast(const SType& dst_type, spirv::Value value) {
   TVM_FFI_ICHECK_NE(value.stype.id, 0U);
   if (value.stype.id == dst_type.id) return value;
-  const tvm::DataType& from = value.stype.type;
-  const tvm::DataType& to = dst_type.type;
+  const tvm::PrimType& from = value.stype.type;
+  const tvm::PrimType& to = dst_type.type;
   TVM_FFI_ICHECK_EQ(from.lanes(), to.lanes());
-  if (from == DataType::Bool()) {
-    if (to.is_int()) {
+  if (from == PrimType::Bool()) {
+    if (to.MatchesCode(DLDataTypeCode::kDLInt)) {
       return Select(value, IntImm(dst_type, 1), IntImm(dst_type, 0));
-    } else if (to.is_uint()) {
+    } else if (to.MatchesCode(DLDataTypeCode::kDLUInt)) {
       return Select(value, UIntImm(dst_type, 1), UIntImm(dst_type, 0));
-    } else if (to.is_float()) {
+    } else if (to.MatchesCode(DLDataTypeCode::kDLFloat)) {
       return MakeValue(spv::OpConvertUToF, dst_type,
                        Select(value, UIntImm(t_uint32_, 1), UIntImm(t_uint32_, 0)));
     } else {
       TVM_FFI_THROW(InternalError) << "cannot cast from " << from << " to " << to;
       return Value();
     }
-  } else if (to == DataType::Bool()) {
-    if (from.is_int()) {
+  } else if (to == PrimType::Bool()) {
+    if (from.MatchesCode(DLDataTypeCode::kDLInt)) {
       return NE(value, IntImm(value.stype, 0));
-    } else if (to.is_uint()) {
+    } else if (from.MatchesCode(DLDataTypeCode::kDLUInt)) {
       return NE(value, UIntImm(value.stype, 0));
     } else {
       TVM_FFI_THROW(InternalError) << "cannot cast from " << from << " to " << to;
       return Value();
     }
-  } else if (from.is_int() && to.is_int()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLInt) && to.MatchesCode(DLDataTypeCode::kDLInt)) {
     return MakeValue(spv::OpSConvert, dst_type, value);
-  } else if (from.is_uint() && to.is_uint()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLUInt) && to.MatchesCode(DLDataTypeCode::kDLUInt)) {
     return MakeValue(spv::OpUConvert, dst_type, value);
-  } else if (from.is_uint() && to.is_int()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLUInt) && to.MatchesCode(DLDataTypeCode::kDLInt)) {
     if (from.bits() != to.bits()) {
-      value = MakeValue(spv::OpUConvert, GetSType(from.with_bits(to.bits())), value);
+      value = MakeValue(spv::OpUConvert, GetSType(from.WithBits(to.bits())), value);
     }
     return MakeValue(spv::OpBitcast, dst_type, value);
-  } else if (from.is_int() && to.is_uint()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLInt) && to.MatchesCode(DLDataTypeCode::kDLUInt)) {
     if (from.bits() != to.bits()) {
-      value = MakeValue(spv::OpSConvert, GetSType(from.with_bits(to.bits())), value);
+      value = MakeValue(spv::OpSConvert, GetSType(from.WithBits(to.bits())), value);
     }
     return MakeValue(spv::OpBitcast, dst_type, value);
-  } else if (from.is_float() && to.is_int()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLFloat) && to.MatchesCode(DLDataTypeCode::kDLInt)) {
     return MakeValue(spv::OpConvertFToS, dst_type, value);
-  } else if (from.is_float() && to.is_uint()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLFloat) &&
+             to.MatchesCode(DLDataTypeCode::kDLUInt)) {
     return MakeValue(spv::OpConvertFToU, dst_type, value);
-  } else if (from.is_int() && to.is_float()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLInt) && to.MatchesCode(DLDataTypeCode::kDLFloat)) {
     return MakeValue(spv::OpConvertSToF, dst_type, value);
-  } else if (from.is_uint() && to.is_float()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLUInt) &&
+             to.MatchesCode(DLDataTypeCode::kDLFloat)) {
     return MakeValue(spv::OpConvertUToF, dst_type, value);
-  } else if (from.is_float() && to.is_float()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLFloat) &&
+             to.MatchesCode(DLDataTypeCode::kDLFloat)) {
     return MakeValue(spv::OpFConvert, dst_type, value);
   } else {
     TVM_FFI_THROW(InternalError) << "do not support type cast from " << from << " to " << to;
@@ -782,28 +783,28 @@ Value IRBuilder::GetSpecConst(const SType& dtype, uint64_t value) {
   return ret;
 }
 
-#define DEFINE_BUILDER_BINARY_USIGN_OP(_OpName, _Op)       \
-  Value IRBuilder::_OpName(Value a, Value b) {             \
-    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);             \
-    if (a.stype.type.is_int() || a.stype.type.is_uint()) { \
-      return MakeValue(spv::OpI##_Op, a.stype, a, b);      \
-    } else {                                               \
-      TVM_FFI_ICHECK(a.stype.type.is_float());             \
-      return MakeValue(spv::OpF##_Op, a.stype, a, b);      \
-    }                                                      \
+#define DEFINE_BUILDER_BINARY_USIGN_OP(_OpName, _Op)                                 \
+  Value IRBuilder::_OpName(Value a, Value b) {                                       \
+    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                                       \
+    if (a.stype.type.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) { \
+      return MakeValue(spv::OpI##_Op, a.stype, a, b);                                \
+    } else {                                                                         \
+      TVM_FFI_ICHECK(a.stype.type.MatchesCode(DLDataTypeCode::kDLFloat));            \
+      return MakeValue(spv::OpF##_Op, a.stype, a, b);                                \
+    }                                                                                \
   }
 
-#define DEFINE_BUILDER_BINARY_SIGN_OP(_OpName, _Op)   \
-  Value IRBuilder::_OpName(Value a, Value b) {        \
-    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);        \
-    if (a.stype.type.is_int()) {                      \
-      return MakeValue(spv::OpS##_Op, a.stype, a, b); \
-    } else if (a.stype.type.is_uint()) {              \
-      return MakeValue(spv::OpU##_Op, a.stype, a, b); \
-    } else {                                          \
-      TVM_FFI_ICHECK(a.stype.type.is_float());        \
-      return MakeValue(spv::OpF##_Op, a.stype, a, b); \
-    }                                                 \
+#define DEFINE_BUILDER_BINARY_SIGN_OP(_OpName, _Op)                       \
+  Value IRBuilder::_OpName(Value a, Value b) {                            \
+    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                            \
+    if (a.stype.type.MatchesCode(DLDataTypeCode::kDLInt)) {               \
+      return MakeValue(spv::OpS##_Op, a.stype, a, b);                     \
+    } else if (a.stype.type.MatchesCode(DLDataTypeCode::kDLUInt)) {       \
+      return MakeValue(spv::OpU##_Op, a.stype, a, b);                     \
+    } else {                                                              \
+      TVM_FFI_ICHECK(a.stype.type.MatchesCode(DLDataTypeCode::kDLFloat)); \
+      return MakeValue(spv::OpF##_Op, a.stype, a, b);                     \
+    }                                                                     \
   }
 
 DEFINE_BUILDER_BINARY_USIGN_OP(Add, Add);
@@ -813,29 +814,29 @@ DEFINE_BUILDER_BINARY_SIGN_OP(Div, Div);
 
 Value IRBuilder::Mod(Value a, Value b) {
   TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);
-  if (a.stype.type.is_int()) {
+  if (a.stype.type.MatchesCode(DLDataTypeCode::kDLInt)) {
     return MakeValue(spv::OpSRem, a.stype, a, b);
-  } else if (a.stype.type.is_uint()) {
+  } else if (a.stype.type.MatchesCode(DLDataTypeCode::kDLUInt)) {
     return MakeValue(spv::OpUMod, a.stype, a, b);
   } else {
-    TVM_FFI_ICHECK(a.stype.type.is_float());
+    TVM_FFI_ICHECK(a.stype.type.MatchesCode(DLDataTypeCode::kDLFloat));
     return MakeValue(spv::OpFRem, a.stype, a, b);
   }
 }
 
-#define DEFINE_BUILDER_CMP_OP(_OpName, _Op)                                                    \
-  Value IRBuilder::_OpName(Value a, Value b) {                                                 \
-    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                                                 \
-    TVM_FFI_ICHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes());                             \
-    const auto& bool_type = this->GetSType(DataType::Bool().with_lanes(a.stype.type.lanes())); \
-    if (a.stype.type.is_int()) {                                                               \
-      return MakeValue(spv::OpS##_Op, bool_type, a, b);                                        \
-    } else if (a.stype.type.is_uint()) {                                                       \
-      return MakeValue(spv::OpU##_Op, bool_type, a, b);                                        \
-    } else {                                                                                   \
-      TVM_FFI_ICHECK(a.stype.type.is_float());                                                 \
-      return MakeValue(spv::OpFOrd##_Op, bool_type, a, b);                                     \
-    }                                                                                          \
+#define DEFINE_BUILDER_CMP_OP(_OpName, _Op)                                                   \
+  Value IRBuilder::_OpName(Value a, Value b) {                                                \
+    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                                                \
+    TVM_FFI_ICHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes());                            \
+    const auto& bool_type = this->GetSType(PrimType::Bool().WithLanes(a.stype.type.lanes())); \
+    if (a.stype.type.MatchesCode(DLDataTypeCode::kDLInt)) {                                   \
+      return MakeValue(spv::OpS##_Op, bool_type, a, b);                                       \
+    } else if (a.stype.type.MatchesCode(DLDataTypeCode::kDLUInt)) {                           \
+      return MakeValue(spv::OpU##_Op, bool_type, a, b);                                       \
+    } else {                                                                                  \
+      TVM_FFI_ICHECK(a.stype.type.MatchesCode(DLDataTypeCode::kDLFloat));                     \
+      return MakeValue(spv::OpFOrd##_Op, bool_type, a, b);                                    \
+    }                                                                                         \
   }
 
 DEFINE_BUILDER_CMP_OP(LT, LessThan);
@@ -843,17 +844,17 @@ DEFINE_BUILDER_CMP_OP(LE, LessThanEqual);
 DEFINE_BUILDER_CMP_OP(GT, GreaterThan);
 DEFINE_BUILDER_CMP_OP(GE, GreaterThanEqual);
 
-#define DEFINE_BUILDER_CMP_UOP(_OpName, _Op)                                                   \
-  Value IRBuilder::_OpName(Value a, Value b) {                                                 \
-    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                                                 \
-    TVM_FFI_ICHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes());                             \
-    const auto& bool_type = this->GetSType(DataType::Bool().with_lanes(a.stype.type.lanes())); \
-    if (a.stype.type.is_int() || a.stype.type.is_uint()) {                                     \
-      return MakeValue(spv::OpI##_Op, bool_type, a, b);                                        \
-    } else {                                                                                   \
-      TVM_FFI_ICHECK(a.stype.type.is_float());                                                 \
-      return MakeValue(spv::OpFOrd##_Op, bool_type, a, b);                                     \
-    }                                                                                          \
+#define DEFINE_BUILDER_CMP_UOP(_OpName, _Op)                                                  \
+  Value IRBuilder::_OpName(Value a, Value b) {                                                \
+    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                                                \
+    TVM_FFI_ICHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes());                            \
+    const auto& bool_type = this->GetSType(PrimType::Bool().WithLanes(a.stype.type.lanes())); \
+    if (a.stype.type.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {          \
+      return MakeValue(spv::OpI##_Op, bool_type, a, b);                                       \
+    } else {                                                                                  \
+      TVM_FFI_ICHECK(a.stype.type.MatchesCode(DLDataTypeCode::kDLFloat));                     \
+      return MakeValue(spv::OpFOrd##_Op, bool_type, a, b);                                    \
+    }                                                                                         \
   }
 
 DEFINE_BUILDER_CMP_UOP(EQ, Equal);
@@ -861,7 +862,7 @@ DEFINE_BUILDER_CMP_UOP(NE, NotEqual);
 
 Value IRBuilder::Select(Value cond, Value a, Value b) {
   TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);
-  TVM_FFI_ICHECK_EQ(cond.stype.type.element_of(), DataType::Bool());
+  TVM_FFI_ICHECK_EQ(cond.stype.type.WithLanes(1), PrimType::Bool());
   return MakeValue(spv::OpSelect, a.stype, cond, a, b);
 }
 
diff --git a/src/backend/vulkan/codegen/ir_builder.h b/src/backend/vulkan/codegen/ir_builder.h
index 3cca1b4cfe33..7e8844682c4e 100644
--- a/src/backend/vulkan/codegen/ir_builder.h
+++ b/src/backend/vulkan/codegen/ir_builder.h
@@ -50,7 +50,7 @@ struct SType {
   /*! \brief The Id to represent type */
   uint32_t id{0};
   /*! \brief corresponding TVM type */
-  tvm::DataType type;
+  tvm::PrimType type{tvm::PrimType::Void()};
   /*! \brief content type id if it is a pointer/struct-array class */
   uint32_t element_type_id{0};
   /*! \brief The storage class, if it is a pointer */
@@ -430,7 +430,7 @@ class IRBuilder {
    * \return The result value.
    */
   Value CallKHRIntegerDotProduct(const SType& ret_type, const std::vector<Value>& args,
-                                 const DataType& dtype);
+                                 const PrimType& dtype);
 
   /*!
    * \brief Build vector by concatenating components
@@ -444,7 +444,7 @@ class IRBuilder {
    * \param dtype The data type.
    * \return The corresponding spirv type.
    */
-  SType GetSType(const tvm::DataType& dtype, uint32_t row = 0, uint32_t col = 0);
+  SType GetSType(const tvm::PrimType& dtype, uint32_t row = 0, uint32_t col = 0);
   /*!
    * \brief Get the pointer type that points to value_type
    * \param value_type.
@@ -656,11 +656,11 @@ class IRBuilder {
   Value GetConst_(const SType& dtype, const uint64_t* pvalue);
 
   // declare type
-  SType DeclareType(const DataType& dtype, uint32_t row = 0, uint32_t col = 0);
+  SType DeclareType(const PrimType& dtype, uint32_t row = 0, uint32_t col = 0);
 
   // Declare the appropriate SPIR-V capabilities and extensions to use
   // this data type.
-  void AddCapabilityFor(const DataType& dtype);
+  void AddCapabilityFor(const PrimType& dtype);
 
   /*! \brief SPIRV-related capabilities of the target
    *
diff --git a/src/backend/webgpu/codegen/codegen_webgpu.cc b/src/backend/webgpu/codegen/codegen_webgpu.cc
index 440f1f04b95e..7129aa23d2ee 100644
--- a/src/backend/webgpu/codegen/codegen_webgpu.cc
+++ b/src/backend/webgpu/codegen/codegen_webgpu.cc
@@ -68,7 +68,7 @@ class WebGPUWorkgroupInfoCollector : public StmtExprVisitor {
   void VisitExpr_(const VarNode* op) final {
     StmtExprVisitor::VisitExpr_(op);
     Var buffer_var = ffi::GetRef<Var>(op);
-    if (buffer_var.dtype().is_handle()) {
+    if (buffer_var.ty().IsHandle()) {
       info_.write_access_set.insert(buffer_var);
     }
   }
@@ -119,7 +119,7 @@ void CodeGenWebGPU::InitFuncState(const PrimFunc& f) {
   CodeGenC::InitFuncState(f);
   // analyze the data;
   for (Var arg : f->params) {
-    if (arg.dtype().is_handle()) {
+    if (arg.ty().IsHandle()) {
       alloc_storage_scope_[arg.get()] = "global";
     }
   }
@@ -174,10 +174,10 @@ runtime::FunctionInfo CodeGenWebGPU::AddFunction(const PrimFunc& f, bool skip_re
   os_param_access << "paramWriteAccess:[";
   // setup buffer argumemts
   for (Var arg : f->params) {
-    DataType t = arg.dtype();
-    func_arg_types.push_back(t);
+    PrimType t = arg.ty();
+    func_arg_types.push_back(t->dtype);
 
-    if (t.is_handle()) {
+    if (t.IsHandle()) {
       auto* ptr = arg->type_annotation.as<PointerTypeNode>();
       TVM_FFI_ICHECK(ptr)
           << "All handles passed to the CodeGenWebGPU must have a type_annotation as a "
@@ -188,11 +188,11 @@ runtime::FunctionInfo CodeGenWebGPU::AddFunction(const PrimFunc& f, bool skip_re
           << "All handles passed to the CodeGenWebGPU must have a type_annotation as a "
              "PointerType, "
           << "and must point to a PrimType";
-      DataType value_storage_type = prim->dtype;
-      if (value_storage_type == DataType::Bool()) {
+      PrimType value_storage_type(prim->dtype);
+      if (value_storage_type.MatchesCode(DLDataTypeCode::kDLBool)) {
         // We need a physically addressable buffer type to support boolean tensors.
         // The loaded byte is cast to bool inside the LoadNode visitor below.
-        value_storage_type = boolean_storage_type_.with_lanes(value_storage_type.lanes());
+        value_storage_type = boolean_storage_type_.WithLanes(value_storage_type.lanes());
       }
       std::string vid = AllocVarID(arg.get());
       std::string access_mode;
@@ -209,7 +209,7 @@ runtime::FunctionInfo CodeGenWebGPU::AddFunction(const PrimFunc& f, bool skip_re
       // add extra access mode info to launch params
       this->decl_stream << "@group(0) @binding(" << num_buffer++ << ") "
                         << "var<storage, " << access_mode << "> " << vid << " : array<";
-      this->PrintType(value_storage_type, this->decl_stream);
+      this->PrintType(value_storage_type->dtype, this->decl_stream);
       this->decl_stream << ">;\n";
     } else {
       pod_args.push_back(arg);
@@ -228,17 +228,17 @@ runtime::FunctionInfo CodeGenWebGPU::AddFunction(const PrimFunc& f, bool skip_re
 
   for (size_t i = 0; i < pod_args.size(); ++i) {
     Var v = pod_args[i];
-    TVM_FFI_ICHECK(!v.dtype().is_handle());
+    TVM_FFI_ICHECK(!v.ty().IsHandle());
     std::string vid = AllocVarID(v.get());
 
-    if (v.dtype() == DataType::Int(32)) {
+    if (v.ty() == PrimType::Int(32)) {
       this->decl_stream << "  " << vid << ": i32";
-    } else if (v.dtype() == DataType::UInt(32)) {
+    } else if (v.ty() == PrimType::UInt(32)) {
       this->decl_stream << "  " << vid << ": u32";
-    } else if (v.dtype() == DataType::Float(32)) {
+    } else if (v.ty() == PrimType::Float(32)) {
       this->decl_stream << "  " << vid << ": f32";
     } else {
-      TVM_FFI_THROW(InternalError) << "Do not support pod argument type " << v.dtype();
+      TVM_FFI_THROW(InternalError) << "Do not support pod argument type " << v.ty()->dtype;
     }
     this->decl_stream << ",\n";
     // value ref
@@ -289,13 +289,13 @@ runtime::FunctionInfo CodeGenWebGPU::AddFunction(const PrimFunc& f, bool skip_re
 void CodeGenWebGPU::BindThreadIndex(const IterVar& iv) {
   TVM_FFI_ICHECK(!var_idmap_.count(iv->var.get()));
   std::ostringstream os;
-  PrintType(iv->var.dtype(), os);
+  PrintType(iv->var.ty()->dtype, os);
   if (iv->thread_tag == "blockIdx.x") {
     // WebGPU have restriction to limit the maximum size of blockId.x to be 65535
     // We allow runtime to spread the load out to blockIdx.z so it can be a large number.
     os << "(blockIdx.z * gridDim.x + blockIdx.x)";
     std::string tidx = os.str();
-    std::string aggregated_bidx = SSAGetID(os.str(), iv->var.dtype());
+    std::string aggregated_bidx = SSAGetID(os.str(), iv->var.ty()->dtype);
     var_idmap_[iv->var.get()] = aggregated_bidx;
   } else {
     os << "(" << iv->thread_tag << ")";
@@ -305,16 +305,17 @@ void CodeGenWebGPU::BindThreadIndex(const IterVar& iv) {
   }
 }
 
-void CodeGenWebGPU::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenWebGPU::PrintType(DLDataType raw_t, std::ostream& os) {  // NOLINT(*)
+  PrimType t(raw_t);
   int lanes = t.lanes();
-  if (t.is_handle()) {
+  if (t.IsHandle()) {
     TVM_FFI_THROW(InternalError) << "Cannot print handle type in WebGPU";
   }
-  if (t.is_void()) {
+  if (t.IsVoid()) {
     os << "void";
     return;
   }
-  if (t == DataType::Bool()) {
+  if (raw_t == DLDataType{kDLBool, 8, 1}) {
     os << "bool";
     return;
   }
@@ -323,28 +324,29 @@ void CodeGenWebGPU::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
     TVM_FFI_ICHECK(lanes >= 2 && lanes <= 4)
         << "CodeGenWebGPU: only allows vector with lanes in {2, 3, 4}";
     // Currently WebGPU doesn't support `i8` and an `int8x4` is represented as a `u32`.
-    if (t.is_int() && t.bits() == 8 && lanes == 4) {
+    if (t.MatchesCode(DLDataTypeCode::kDLInt) && t.bits() == 8 && lanes == 4) {
       os << "u32";
       return;
     }
     os << "vec" << lanes << "<";
   }
 
-  if (t.is_float()) {
+  if (t.code() == DLDataTypeCode::kDLFloat) {
     TVM_FFI_ICHECK(t.bits() == 16 || t.bits() == 32) << "CodeGenWebGPU: only support f16 or f32";
     if (t.bits() == 16) {
       // Using f16 requires enable directive
       enable_fp16_ = true;
     }
     os << "f" << t.bits();
-  } else if (t.is_uint()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
     TVM_FFI_ICHECK(t.bits() != 64) << "CodeGenWebGPU: do not support u64";
     os << "u" << t.bits();
-  } else if (t.is_int()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLInt)) {
     TVM_FFI_ICHECK(t.bits() != 64) << "CodeGenWebGPU: do not support i64";
     os << "i" << t.bits();
   } else {
-    TVM_FFI_THROW(InternalError) << "CodeGenWebGPU: Cannot convert type " << t << " to WebGPU type";
+    TVM_FFI_THROW(InternalError) << "CodeGenWebGPU: Cannot convert type "
+                                 << ffi::DLDataTypeToString(raw_t) << " to WebGPU type";
   }
   if (lanes != 1) {
     os << ">";
@@ -365,18 +367,18 @@ void CodeGenWebGPU::PrintStorageSync(const CallNode* op) {
 }
 
 void CodeGenWebGPU::PrintSSAAssign(const std::string& target, const std::string& src,
-                                   DataType type) {
+                                   PrimType type) {
   stream << "let " << target << " : ";
-  PrintType(type, stream);
+  PrintType(type->dtype, stream);
   stream << " = " << src << ";\n";
 }
 
-void CodeGenWebGPU::PrintVecElemLoad(const std::string& vec, DataType t, int i,
+void CodeGenWebGPU::PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                                      std::ostream& os) {  // NOLINT(*)
   os << vec << "[" << i << "]";
 }
 
-void CodeGenWebGPU::PrintVecElemStore(const std::string& vec, DataType t, int i,
+void CodeGenWebGPU::PrintVecElemStore(const std::string& vec, DLDataType t, int i,
                                       const std::string& value) {
   this->PrintIndent();
   stream << vec << "[" << i << "] = " << value << ";\n";
@@ -384,8 +386,8 @@ void CodeGenWebGPU::PrintVecElemStore(const std::string& vec, DataType t, int i,
 
 void CodeGenWebGPU::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NOLINT(*)
   std::string v = PrintExpr(op->value);
-  int lanes = op->dtype.lanes();
-  PrintType(op->dtype, os);
+  int lanes = op->ty().lanes();
+  PrintType(op->ty()->dtype, os);
   os << "(";
   for (int i = 0; i < lanes; ++i) {
     if (i != 0) os << ", ";
@@ -395,14 +397,14 @@ void CodeGenWebGPU::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  //
 }
 
 PrimExpr CodeGenWebGPU::EnforceU32(PrimExpr value) {
-  return cast(DataType::UInt(32, value.dtype().lanes()), value);
+  return cast(PrimType::UInt(32, value.ty().lanes()), value);
 }
 
 void CodeGenWebGPU::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
   if (op->op.same_as(builtin::reinterpret())) {
     // generate bitcast<TYPE>(ARG)
     os << "bitcast<";
-    this->PrintType(op->dtype, os);
+    this->PrintType(op->ty()->dtype, os);
     os << ">(";
     this->PrintExpr(op->args[0], os);
     os << ")";
@@ -426,7 +428,7 @@ void CodeGenWebGPU::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLIN
     std::string cond = PrintExpr(op->args[0]);
     this->PrintIndent();
     this->stream << "var " << result << " : ";
-    PrintType(op->dtype, this->stream);
+    PrintType(op->ty()->dtype, this->stream);
     this->stream << ";\n";
     this->PrintIndent();
     this->stream << "if (" << cond << ") {\n";
@@ -459,7 +461,7 @@ void CodeGenWebGPU::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLIN
 }
 
 void CodeGenWebGPU::VisitExpr_(const CastNode* op, std::ostream& os) {  // NOLINT(*)
-  PrintType(op->dtype, os);
+  PrintType(op->ty()->dtype, os);
   os << "(" << PrintExpr(op->value) << ")";
 }
 
@@ -478,7 +480,7 @@ void CodeGenWebGPU::VisitExpr_(const LetNode* op, std::ostream& os) {  // NOLINT
     PrintIndent();
     std::string value = PrintExpr(op->value);
     this->stream << "let " << AllocVarID(op->var.get()) << " : ";
-    PrintType(op->var.dtype(), this->stream);
+    PrintType(op->var.ty()->dtype, this->stream);
     this->stream << " = " << value << ";\n";
   }
   os << PrintExpr(op->body);
@@ -490,18 +492,18 @@ void CodeGenWebGPU::VisitExpr_(const LetNode* op, std::ostream& os) {  // NOLINT
 }
 
 void CodeGenWebGPU::VisitExpr_(const IntImmNode* op, std::ostream& os) {  // NOLINT(*)
-  if (op->dtype.bits() == 32) {
+  if (op->ty().bits() == 32) {
     std::ostringstream temp;
-    if (op->dtype.is_int()) {
+    if (op->ty().MatchesCode(DLDataTypeCode::kDLInt)) {
       temp << op->value << "i";
     } else {
-      TVM_FFI_ICHECK(op->dtype.is_uint());
+      TVM_FFI_ICHECK(op->ty().MatchesCode(DLDataTypeCode::kDLUInt));
       temp << op->value << "u";
     }
     this->MarkConst(temp.str());
     os << temp.str();
   } else {
-    this->PrintType(op->dtype, os);
+    this->PrintType(op->ty()->dtype, os);
     os << "(" << op->value << ")";
   }
 }
@@ -509,14 +511,14 @@ void CodeGenWebGPU::VisitExpr_(const IntImmNode* op, std::ostream& os) {  // NOL
 void CodeGenWebGPU::VisitExpr_(const FloatImmNode* op, std::ostream& os) {  // NOLINT(*)
   std::ostringstream temp;
   temp << std::scientific << op->value;
-  if (op->dtype.bits() == 32) {
+  if (op->ty().bits() == 32) {
     temp << 'f';
-  } else if (op->dtype.bits() == 16) {
+  } else if (op->ty().bits() == 16) {
     // Using f16 requires enable directive
     enable_fp16_ = true;
     temp << 'h';
   } else {
-    TVM_FFI_THROW(InternalError) << "Unsupported floating point bits " << op->dtype.bits();
+    TVM_FFI_THROW(InternalError) << "Unsupported floating point bits " << op->ty().bits();
   }
   MarkConst(temp.str());
   os << temp.str();
@@ -530,39 +532,42 @@ void CodeGenWebGPU::VisitExpr_(const BufferLoadNode* op, std::ostream& os) {  //
   TVM_FFI_ICHECK_EQ(op->indices.size(), 1) << "Load from non-flat memory not supported.";
   TVM_FFI_ICHECK(!op->predicate.defined()) << "Predicated buffer load is not supported.";
 
-  DataType value_dtype = op->dtype;
+  DLDataType value_dtype = op->ty()->dtype;
+  PrimType value_ty(value_dtype);
   PrimExpr index = op->indices[0];
   Var buffer_var = op->buffer->data;
-  DataType element_dtype = op->buffer->dtype;
+  DLDataType element_dtype = op->buffer->dtype->dtype;
+  PrimType element_ty(element_dtype);
 
-  int lanes = op->dtype.lanes();
+  int lanes = value_ty.lanes();
   std::string buffer_vid = GetVarID(buffer_var.get());
 
-  if (value_dtype.lanes() == element_dtype.lanes()) {
+  if (value_ty.lanes() == element_ty.lanes()) {
     // Direct buffer loading
     // Special handle bool loading
-    if (value_dtype == DataType::Bool()) {
+    if (value_dtype == DLDataType{kDLBool, 8, 1}) {
       this->PrintType(value_dtype, os);
       os << "(";
     } else {
       TVM_FFI_ICHECK(value_dtype == element_dtype);
     }
-    TVM_FFI_ICHECK_EQ(index.dtype().lanes(), 1);
+    TVM_FFI_ICHECK_EQ(index.ty().lanes(), 1);
     os << buffer_vid << "[" << this->PrintExpr(index) << "]";
     // Special handle bool loading
-    if (value_dtype == DataType::Bool()) {
+    if (value_dtype == DLDataType{kDLBool, 8, 1}) {
       os << ")";
     }
   } else {
     // Vector load from scalar buffer
-    TVM_FFI_ICHECK_EQ(element_dtype.lanes(), 1) << "Can only vector load scalar array";
-    TVM_FFI_ICHECK(value_dtype.element_of() == element_dtype)
+    TVM_FFI_ICHECK_EQ(element_ty.lanes(), 1) << "Can only vector load scalar array";
+    DLDataType value_element_dtype{value_dtype.code, value_dtype.bits, 1};
+    TVM_FFI_ICHECK(value_element_dtype == element_dtype)
         << "WebGPU vector loading requires base type to match";
     arith::PVar<PrimExpr> base;
-    if (arith::ramp(base, 1, op->dtype.lanes()).Match(index)) {
+    if (arith::ramp(base, 1, value_ty.lanes()).Match(index)) {
       // vec3<f32>(buf[base + 0], buf[base + 1], buf[base + 2]);
-      std::string base_vid = SSAGetID(PrintExpr(base.Eval()), base.Eval().dtype());
-      PrintType(element_dtype.with_lanes(value_dtype.lanes()), os);
+      std::string base_vid = SSAGetID(PrintExpr(base.Eval()), base.Eval().ty()->dtype);
+      PrintType(element_ty.WithLanes(value_ty.lanes())->dtype, os);
       os << "(";
       for (int i = 0; i < lanes; ++i) {
         if (i != 0) os << ", ";
@@ -571,8 +576,8 @@ void CodeGenWebGPU::VisitExpr_(const BufferLoadNode* op, std::ostream& os) {  //
       os << ")";
     } else {
       // vec3<f32>(buf[index[0]], buf[index[1]], buf[index[2]]);
-      std::string index_vid = SSAGetID(PrintExpr(index), index.dtype());
-      PrintType(element_dtype.with_lanes(value_dtype.lanes()), os);
+      std::string index_vid = SSAGetID(PrintExpr(index), index.ty()->dtype);
+      PrintType(element_ty.WithLanes(value_ty.lanes())->dtype, os);
       os << "(";
       for (int i = 0; i < lanes; ++i) {
         if (i != 0) os << ", ";
@@ -593,7 +598,7 @@ void CodeGenWebGPU::VisitStmt_(const BindNode* op) {
     PrintIndent();
     std::string value = PrintExpr(op->value);
     this->stream << "let " << AllocVarID(op->var.get()) << " : ";
-    PrintType(op->var.dtype(), this->stream);
+    PrintType(op->var.ty()->dtype, this->stream);
     this->stream << " = " << value << ";\n";
   }
 }
@@ -602,14 +607,16 @@ void CodeGenWebGPU::VisitStmt_(const BufferStoreNode* op) {
   TVM_FFI_ICHECK_EQ(op->indices.size(), 1) << "Store to non-flat memory not supported.";
   TVM_FFI_ICHECK(!op->predicate.defined()) << "Predicated buffer store is not supported.";
 
-  DataType value_dtype = op->value.dtype();
-  DataType element_dtype = op->buffer->dtype;
+  DLDataType value_dtype = op->value.ty()->dtype;
+  PrimType value_ty(value_dtype);
+  DLDataType element_dtype = op->buffer->dtype->dtype;
+  PrimType element_ty(element_dtype);
   PrimExpr index = op->indices[0];
   Var buffer_var = op->buffer->data;
 
   std::string buffer_vid = GetVarID(buffer_var.get());
 
-  if (value_dtype.lanes() == element_dtype.lanes()) {
+  if (value_ty.lanes() == element_ty.lanes()) {
     // must execute print expr first
     // so we won't have recursive append to stream
     std::string index_vid = PrintExpr(index);
@@ -618,7 +625,7 @@ void CodeGenWebGPU::VisitStmt_(const BufferStoreNode* op) {
     this->PrintIndent();
     stream << buffer_vid << "[" << index_vid << "] = ";
     // special explicit conversion of bool
-    if (value_dtype == DataType::Bool()) {
+    if (value_dtype == DLDataType{kDLBool, 8, 1}) {
       PrintType(element_dtype, stream);
       stream << "(";
     } else {
@@ -626,22 +633,23 @@ void CodeGenWebGPU::VisitStmt_(const BufferStoreNode* op) {
     }
     stream << value_vid;
     // Special handle bool store
-    if (value_dtype == DataType::Bool()) {
+    if (value_dtype == DLDataType{kDLBool, 8, 1}) {
       stream << ")";
     }
     stream << ";\n";
   } else {
     // Vector store into scalar buffer
-    TVM_FFI_ICHECK_EQ(element_dtype.lanes(), 1) << "Can only vector load scalar array";
-    TVM_FFI_ICHECK(value_dtype.element_of() == element_dtype)
+    TVM_FFI_ICHECK_EQ(element_ty.lanes(), 1) << "Can only vector load scalar array";
+    DLDataType value_element_dtype{value_dtype.code, value_dtype.bits, 1};
+    TVM_FFI_ICHECK(value_element_dtype == element_dtype)
         << "WebGPU vector stire requires base type to match";
     std::string value_vid = PrintExpr(op->value);
     arith::PVar<PrimExpr> base;
-    if (arith::ramp(base, 1, value_dtype.lanes()).Match(index)) {
+    if (arith::ramp(base, 1, value_ty.lanes()).Match(index)) {
       // buf[base + 0] = value[0]
       // buf[base + 1] = value[1]
-      std::string base_vid = SSAGetID(PrintExpr(base.Eval()), base.Eval().dtype());
-      for (int i = 0; i < value_dtype.lanes(); ++i) {
+      std::string base_vid = SSAGetID(PrintExpr(base.Eval()), base.Eval().ty()->dtype);
+      for (int i = 0; i < value_ty.lanes(); ++i) {
         this->PrintIndent();
         stream << buffer_vid << "[" << base_vid << " + " << i << "] = " << value_vid << "[" << i
                << "];\n";
@@ -649,8 +657,8 @@ void CodeGenWebGPU::VisitStmt_(const BufferStoreNode* op) {
     } else {
       // buf[index[0]] = value[0]
       // buf[index[1]] = value[1]
-      std::string index_vid = SSAGetID(PrintExpr(index), index.dtype());
-      for (int i = 0; i < value_dtype.lanes(); ++i) {
+      std::string index_vid = SSAGetID(PrintExpr(index), index.ty()->dtype);
+      for (int i = 0; i < value_ty.lanes(); ++i) {
         this->PrintIndent();
         stream << buffer_vid << "[" << index_vid << "[" << i << "]] = " << value_vid << "[" << i
                << "];\n";
@@ -673,12 +681,12 @@ void CodeGenWebGPU::VisitStmt_(const AllocBufferNode* op) {
 
   if (storage_scope.rank == runtime::StorageRank::kShared) {
     this->decl_stream << "var<workgroup> " << vid << " : array<";
-    PrintType(op->buffer->dtype, this->decl_stream);
+    PrintType(op->buffer->dtype->dtype, this->decl_stream);
     this->decl_stream << ", " << constant_size << ">;\n";
   } else if (storage_scope.rank == runtime::StorageRank::kLocal) {
     this->PrintIndent();
     this->stream << "var " << vid << " : array<";
-    PrintType(op->buffer->dtype, this->stream);
+    PrintType(op->buffer->dtype->dtype, this->stream);
     this->stream << ", " << constant_size << ">;\n";
   } else {
     TVM_FFI_THROW(InternalError) << "WebGPU: Do not support storage scope: "
@@ -694,7 +702,7 @@ void CodeGenWebGPU::VisitStmt_(const ForNode* op) {
   std::string vid = AllocVarID(op->loop_var.get());
   PrintIndent();
   stream << "for (var " << vid << " : ";
-  PrintType(op->loop_var.dtype(), stream);
+  PrintType(op->loop_var.ty()->dtype, stream);
   stream << " = " << begin_str << "; " << vid << " < " << end_str << "; " << vid;
   if (step_str.empty()) {
     stream << "++";
diff --git a/src/backend/webgpu/codegen/codegen_webgpu.h b/src/backend/webgpu/codegen/codegen_webgpu.h
index 4c873ac3db18..c2179c5c48aa 100644
--- a/src/backend/webgpu/codegen/codegen_webgpu.h
+++ b/src/backend/webgpu/codegen/codegen_webgpu.h
@@ -51,16 +51,17 @@ class CodeGenWebGPU final : public CodeGenC {
   using CodeGenC::AddFunction;
   runtime::FunctionInfo AddFunction(const PrimFunc& f, bool skip_readonly_decl);  // NOLINT(*)
   void InitFuncState(const PrimFunc& f) final;
-  void PrintStorageSync(const CallNode* op) final;     // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;  // NOLINT(*)
-  void BindThreadIndex(const IterVar& iv) final;       // NOLINT(*)
+  void PrintStorageSync(const CallNode* op) final;       // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;  // NOLINT(*)
+  void BindThreadIndex(const IterVar& iv) final;         // NOLINT(*)
 
   // assignment printing
-  void PrintSSAAssign(const std::string& target, const std::string& src, DataType type) final;
+  void PrintSSAAssign(const std::string& target, const std::string& src, PrimType type) final;
 
   // overload printing vector element load/store
-  void PrintVecElemLoad(const std::string& vec, DataType t, int i, std::ostream& os) final;
-  void PrintVecElemStore(const std::string& vec, DataType t, int i, const std::string& value) final;
+  void PrintVecElemLoad(const std::string& vec, DLDataType t, int i, std::ostream& os) final;
+  void PrintVecElemStore(const std::string& vec, DLDataType t, int i,
+                         const std::string& value) final;
 
   // overload visitor
   void VisitExpr_(const BroadcastNode* op, std::ostream& os) final;   // NOLINT(*)
@@ -90,7 +91,7 @@ class CodeGenWebGPU final : public CodeGenC {
   /*!
    * \brief Storage type of bool values.
    */
-  DataType boolean_storage_type_{DataType::Int(8)};
+  PrimType boolean_storage_type_{PrimType::Int(8)};
 
   // whether enable fp16
   bool enable_fp16_{false};
diff --git a/src/backend/webgpu/codegen/intrin_rule_webgpu.cc b/src/backend/webgpu/codegen/intrin_rule_webgpu.cc
index 1c172fcd141b..7992fa9915c0 100644
--- a/src/backend/webgpu/codegen/intrin_rule_webgpu.cc
+++ b/src/backend/webgpu/codegen/intrin_rule_webgpu.cc
@@ -34,7 +34,7 @@ using tirx::FLowerIntrinsic;
 
 // warp-level primitives. Follows implementation in intrin_rule_metal.cc
 struct WebGPUWarpIntrinsic {
-  const Op operator()(DataType t, const Op& orig_op) const {
+  const Op operator()(PrimType t, const Op& orig_op) const {
     if (orig_op.same_as(builtin::tvm_warp_shuffle())) {
       static const Op& webgpu_subgroup_shuffle_op = Op::Get("tirx.webgpu.subgroup_shuffle");
       return webgpu_subgroup_shuffle_op;
@@ -55,9 +55,9 @@ static PrimExpr DispatchWebGPUShuffle(const PrimExpr& e) {
   const CallNode* call = e.as<CallNode>();
   TVM_FFI_ICHECK(call != nullptr);
   TVM_FFI_ICHECK_EQ(call->args.size(), 5);  // mask, value, warp_id, width, warp_size
-  PrimExpr lane_or_delta = Cast(DataType::UInt(32, call->args[2].dtype().lanes()), call->args[2]);
+  PrimExpr lane_or_delta = Cast(PrimType::UInt(32, call->args[2].ty().lanes()), call->args[2]);
   ffi::Array<PrimExpr> webgpu_args{{call->args[1], lane_or_delta}};
-  return Call(call->dtype, T()(call->dtype, call->op.as_or_throw<Op>()), webgpu_args);
+  return Call(e.ty(), T()(e.ty(), call->op.as_or_throw<Op>()), webgpu_args);
 }
 
 void RegisterWebGPUIntrinRules() {
@@ -69,7 +69,7 @@ void RegisterWebGPUIntrinRules() {
 // See full list of builtin: https://www.w3.org/TR/WGSL/#builtin-functions
 
 struct ReturnAbs {
-  std::string operator()(DataType t, std::string name) const { return "abs"; }
+  std::string operator()(PrimType t, std::string name) const { return "abs"; }
 };
 
 TVM_REGISTER_OP("tirx.fabs")
@@ -124,7 +124,7 @@ TVM_REGISTER_OP("tirx.pow")
     .set_attr<FLowerIntrinsic>("webgpu.FLowerIntrinsic", DispatchPureExtern<Direct>);
 
 struct ReturnRound {
-  std::string operator()(DataType t, std::string name) const { return "round"; }
+  std::string operator()(PrimType t, std::string name) const { return "round"; }
 };
 
 // WGSL round() uses ties-to-even (banker's rounding), matching IEEE 754 and ONNX Round spec.
diff --git a/src/ir/expr.cc b/src/ir/expr.cc
index ef6ea0ed6dca..f73cd6ae3913 100644
--- a/src/ir/expr.cc
+++ b/src/ir/expr.cc
@@ -26,6 +26,7 @@
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/expr.h>
 #include <tvm/ir/function.h>
+#include <tvm/ir/type.h>
 #include <tvm/te/tensor.h>
 #include <tvm/tirx/expr.h>
 
@@ -48,33 +49,39 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 PrimExpr::PrimExpr(int32_t value) : PrimExpr(IntImm::Int32(value)) {}
 
-PrimExpr::PrimExpr(float value) : PrimExpr(FloatImm(DataType::Float(32), value)) {}
+PrimExpr::PrimExpr(float value) : PrimExpr(FloatImm(PrimType::Float(32), value)) {}
 
 PrimExpr PrimExpr::ConvertFallbackValue(ffi::String value) { return tirx::StringImm(value); }
 
-IntImm::IntImm(DataType dtype, int64_t value, Span span) {
-  TVM_FFI_CHECK(dtype.is_scalar(), ValueError)
-      << "IntImm can only take scalar, but " << dtype << " was supplied.";
-  TVM_FFI_CHECK(dtype.is_int() || dtype.is_uint() || dtype.is_bool(), ValueError)
-      << "IntImm supports only int or uint or bool type, but " << dtype << " was supplied.";
-  if (dtype.is_uint()) {
+IntImm::IntImm(PrimType value_ty, int64_t value, Span span) {
+  DLDataType runtime_dtype = value_ty->dtype;
+  DLDataTypeCode code = value_ty.code();
+  int32_t bits = value_ty.bits();
+  TVM_FFI_CHECK(!value_ty.IsScalableVector() && !value_ty.IsFixedLengthVector(), ValueError)
+      << "IntImm can only take scalar, but " << runtime_dtype << " was supplied.";
+  TVM_FFI_CHECK(value_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                                     DLDataTypeCode::kDLBool),
+                ValueError)
+      << "IntImm supports only int or uint or bool type, but " << runtime_dtype << " was supplied.";
+  if (code == DLDataTypeCode::kDLUInt) {
     TVM_FFI_CHECK_GE(value, 0U, ValueError)
-        << "Literal value " << value << " is negative for unsigned integer type " << dtype;
-    if (dtype.bits() < 64) {
-      TVM_FFI_CHECK_LT(value, 1LL << dtype.bits(), ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
+        << "Literal value " << value << " is negative for unsigned integer type " << runtime_dtype;
+    if (bits < 64) {
+      TVM_FFI_CHECK_LT(value, 1LL << bits, ValueError)
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
     }
-  } else if (dtype.bits() == 1 || dtype.is_bool()) {
+  } else if (bits == 1 || code == DLDataTypeCode::kDLBool) {
     // int(1)
-    TVM_FFI_CHECK(value == 0 || value == 1, ValueError) << value << " exceeds range of " << dtype;
-  } else if (dtype.bits() < 64) {
-    TVM_FFI_CHECK_GE(value, -(1LL << (dtype.bits() - 1)), ValueError)
-        << "Literal value " << value << " exceeds minimum of " << dtype;
-    TVM_FFI_CHECK_LT(value, 1LL << (dtype.bits() - 1), ValueError)
-        << "Literal value " << value << " exceeds maximum of " << dtype;
+    TVM_FFI_CHECK(value == 0 || value == 1, ValueError)
+        << value << " exceeds range of " << runtime_dtype;
+  } else if (bits < 64) {
+    TVM_FFI_CHECK_GE(value, -(1LL << (bits - 1)), ValueError)
+        << "Literal value " << value << " exceeds minimum of " << runtime_dtype;
+    TVM_FFI_CHECK_LT(value, 1LL << (bits - 1), ValueError)
+        << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
   }
   ffi::ObjectPtr<IntImmNode> node = ffi::make_object<IntImmNode>();
-  node->dtype = dtype;
+  node->BaseExprNode::ty = std::move(value_ty);
   node->value = value;
   node->span = span;
   data_ = std::move(node);
@@ -82,103 +89,118 @@ IntImm::IntImm(DataType dtype, int64_t value, Span span) {
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("ir.IntImm", [](DataType dtype, int64_t value, Span span) {
-    return IntImm(dtype, value, span);
+  refl::GlobalDef().def("ir.IntImm", [](DLDataType dtype, int64_t value, Span span) {
+    return IntImm(PrimType(dtype), value, span);
   });
 }
 
-FloatImm::FloatImm(DataType dtype, double value, Span span) {
-  TVM_FFI_CHECK_EQ(dtype.lanes(), 1, ValueError) << "FloatImm can only take scalar.";
+FloatImm::FloatImm(PrimType value_ty, double value, Span span) {
+  DLDataType runtime_dtype = value_ty->dtype;
+  DLDataTypeCode code = value_ty.code();
+  int32_t bits = value_ty.bits();
+  TVM_FFI_CHECK(!value_ty.IsScalableVector() && !value_ty.IsFixedLengthVector(), ValueError)
+      << "FloatImm can only take scalar.";
 
-  TVM_FFI_CHECK(dtype.is_float() || dtype.is_bfloat16() || dtype.is_float8() || dtype.is_float6() ||
-                    dtype.is_float4() || dtype.code() >= DataType::kCustomBegin,
-                ValueError)
-      << "FloatImm supports only float, but " << dtype << " was supplied.";
+  TVM_FFI_CHECK(
+      value_ty.MatchesCode(DLDataTypeCode::kDLFloat, DLDataTypeCode::kDLFloat8_e3m4,
+                           DLDataTypeCode::kDLFloat8_e4m3, DLDataTypeCode::kDLFloat8_e4m3b11fnuz,
+                           DLDataTypeCode::kDLFloat8_e4m3fn, DLDataTypeCode::kDLFloat8_e4m3fnuz,
+                           DLDataTypeCode::kDLFloat8_e5m2, DLDataTypeCode::kDLFloat8_e5m2fnuz,
+                           DLDataTypeCode::kDLFloat8_e8m0fnu, DLDataTypeCode::kDLFloat6_e2m3fn,
+                           DLDataTypeCode::kDLFloat6_e3m2fn) ||
+          value_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16) ||
+          value_ty.MatchesElementType(DLDataTypeCode::kDLFloat4_e2m1fn, 4) ||
+          static_cast<int>(code) >= static_cast<int>(ffi::DLExtDataTypeCode::kDLExtCustomBegin),
+      ValueError)
+      << "FloatImm supports only float, but " << runtime_dtype << " was supplied.";
 
   // check range for float32 and float16 since they have specified range.
   if (!std::isinf(value) && !std::isnan(value)) {
-    if (dtype.bits() == 32) {
+    if (bits == 32) {
       TVM_FFI_CHECK_GE(value, std::numeric_limits<float>::lowest(), ValueError)
-          << "Literal value " << value << " exceeds minimum of " << dtype;
+          << "Literal value " << value << " exceeds minimum of " << runtime_dtype;
       TVM_FFI_CHECK_LE(value, std::numeric_limits<float>::max(), ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
-    } else if (dtype.is_float16()) {
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
+    } else if (value_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
       TVM_FFI_CHECK_GE(value, -support::kMaxFloat16, ValueError)
-          << "Literal value " << value << " exceeds minimum of " << dtype;
+          << "Literal value " << value << " exceeds minimum of " << runtime_dtype;
       TVM_FFI_CHECK_LE(value, support::kMaxFloat16, ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
-    } else if (dtype.is_bfloat16()) {
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
+    } else if (value_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
       TVM_FFI_CHECK_GE(value, -support::kMaxBFloat16, ValueError)
-          << "Literal value " << value << " exceeds minimum of " << dtype;
+          << "Literal value " << value << " exceeds minimum of " << runtime_dtype;
       TVM_FFI_CHECK_LE(value, support::kMaxBFloat16, ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
-    } else if (dtype.is_float8_e3m4() || dtype.is_float8_e4m3() || dtype.is_float8_e4m3b11fnuz() ||
-               dtype.is_float8_e4m3fn() || dtype.is_float8_e4m3fnuz() || dtype.is_float8_e5m2() ||
-               dtype.is_float8_e5m2fnuz() || dtype.is_float8_e8m0fnu()) {
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
+    } else if (value_ty.MatchesCode(
+                   DLDataTypeCode::kDLFloat8_e3m4, DLDataTypeCode::kDLFloat8_e4m3,
+                   DLDataTypeCode::kDLFloat8_e4m3b11fnuz, DLDataTypeCode::kDLFloat8_e4m3fn,
+                   DLDataTypeCode::kDLFloat8_e4m3fnuz, DLDataTypeCode::kDLFloat8_e5m2,
+                   DLDataTypeCode::kDLFloat8_e5m2fnuz, DLDataTypeCode::kDLFloat8_e8m0fnu)) {
       double bound = 0.0;
       bool nonneg = false;
 
-      switch (dtype.code()) {
-        case DataType::TypeCode::kFloat8_e3m4:
+      switch (code) {
+        case DLDataTypeCode::kDLFloat8_e3m4:
           bound = support::kMaxE3M4;
           break;
-        case DataType::TypeCode::kFloat8_e4m3:
+        case DLDataTypeCode::kDLFloat8_e4m3:
           bound = support::kMaxE4M3;
           break;
-        case DataType::TypeCode::kFloat8_e4m3b11fnuz:
+        case DLDataTypeCode::kDLFloat8_e4m3b11fnuz:
           bound = support::kMaxE4M3B11FNUZ;
           nonneg = true;
           break;
-        case DataType::TypeCode::kFloat8_e4m3fn:
+        case DLDataTypeCode::kDLFloat8_e4m3fn:
           bound = support::kMaxE4M3FN;
           break;
-        case DataType::TypeCode::kFloat8_e4m3fnuz:
+        case DLDataTypeCode::kDLFloat8_e4m3fnuz:
           bound = support::kMaxE4M3FNUZ;
           nonneg = true;
           break;
-        case DataType::TypeCode::kFloat8_e5m2:
+        case DLDataTypeCode::kDLFloat8_e5m2:
           bound = support::kMaxE5M2;
           break;
-        case DataType::TypeCode::kFloat8_e5m2fnuz:
+        case DLDataTypeCode::kDLFloat8_e5m2fnuz:
           bound = support::kMaxE5M2FNUZ;
           nonneg = true;
           break;
-        case DataType::TypeCode::kFloat8_e8m0fnu:
+        case DLDataTypeCode::kDLFloat8_e8m0fnu:
           bound = support::kMaxE8M0FNU;
           nonneg = true;
           break;
         default:
-          TVM_FFI_THROW(InternalError) << "Unhandled float8 type: " << dtype;
+          TVM_FFI_THROW(InternalError) << "Unhandled float8 type: " << runtime_dtype;
       }
 
       if (nonneg) {
         TVM_FFI_CHECK_GE(value, 0, ValueError)
-            << "Literal value " << value << " below zero for unsigned " << dtype;
+            << "Literal value " << value << " below zero for unsigned " << runtime_dtype;
       } else {
         TVM_FFI_CHECK_GE(value, -bound, ValueError)
-            << "Literal value " << value << " below minimum of " << dtype;
+            << "Literal value " << value << " below minimum of " << runtime_dtype;
       }
       TVM_FFI_CHECK_LE(value, bound, ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
 
-    } else if (dtype.is_float6_e2m3fn() || dtype.is_float6_e3m2fn()) {
-      double bound = (dtype.code() == DataType::TypeCode::kFloat6_e2m3fn) ? support::kMaxE2M3FN
-                                                                          : support::kMaxE3M2FN;
+    } else if (value_ty.MatchesCode(DLDataTypeCode::kDLFloat6_e2m3fn,
+                                    DLDataTypeCode::kDLFloat6_e3m2fn)) {
+      double bound =
+          (code == DLDataTypeCode::kDLFloat6_e2m3fn) ? support::kMaxE2M3FN : support::kMaxE3M2FN;
       TVM_FFI_CHECK_GE(value, -bound, ValueError)
-          << "Literal value " << value << " below minimum of " << dtype;
+          << "Literal value " << value << " below minimum of " << runtime_dtype;
       TVM_FFI_CHECK_LE(value, bound, ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
 
-    } else if (dtype.is_float4_e2m1fn()) {
+    } else if (code == DLDataTypeCode::kDLFloat4_e2m1fn) {
       double bound = support::kMaxE2M1FN;
       TVM_FFI_CHECK_GE(value, -bound, ValueError)
-          << "Literal value " << value << " below minimum of " << dtype;
+          << "Literal value " << value << " below minimum of " << runtime_dtype;
       TVM_FFI_CHECK_LE(value, bound, ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
     }
   }
   ffi::ObjectPtr<FloatImmNode> node = ffi::make_object<FloatImmNode>();
-  node->dtype = dtype;
+  node->BaseExprNode::ty = std::move(value_ty);
   node->value = value;
   node->span = span;
   data_ = std::move(node);
@@ -186,8 +208,8 @@ FloatImm::FloatImm(DataType dtype, double value, Span span) {
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("ir.FloatImm", [](DataType dtype, double value, Span span) {
-    return FloatImm(dtype, value, span);
+  refl::GlobalDef().def("ir.FloatImm", [](DLDataType dtype, double value, Span span) {
+    return FloatImm(PrimType(dtype), value, span);
   });
 }
 
@@ -206,7 +228,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         if (end.defined()) {
           return Range(begin, end.value(), span);
         } else {
-          return Range(IntImm(begin->dtype, 0), begin, span);
+          return Range(IntImm(begin.ty(), 0), begin, span);
         }
       });
 }
diff --git a/src/ir/type.cc b/src/ir/type.cc
index d6d059dba079..20bbe9c0e58a 100644
--- a/src/ir/type.cc
+++ b/src/ir/type.cc
@@ -21,30 +21,133 @@
  * \file src/ir/type.cc
  * \brief Common type system AST nodes throughout the IR.
  */
+#include <tvm/ffi/container/tensor.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/type.h>
+
+#include <cstdint>
+#include <unordered_map>
+
 namespace tvm {
 
+namespace {
+
+DLDataType ScalableVectorDType(DLDataTypeCode code, int bits, int lanes) {
+  TVM_FFI_ICHECK_GT(lanes, 1) << "Invalid value for vscale factor " << lanes;
+  TVM_FFI_ICHECK_LT(lanes, 32768);
+  const auto encoded_lanes = static_cast<uint16_t>(-lanes);  // negated vscale factor
+  return DLDataType{static_cast<uint8_t>(code), static_cast<uint8_t>(bits), encoded_lanes};
+}
+
+uint32_t PackDataTypeKey(DLDataType dtype) {
+  // Bit layout of the key: [31:24] type code, [23:16] bit width, [15:0] raw lanes field.
+  return (uint32_t{dtype.code} << 24) | (uint32_t{dtype.bits} << 16) | uint32_t{dtype.lanes};
+}
+
+int64_t PrimTypeAnyHash(const ffi::Any& src) {  // hash := packed (code, bits, lanes) key
+  return static_cast<int64_t>(PackDataTypeKey(src.cast<PrimType>()->dtype));
+}
+
+bool PrimTypeAnyEqual(const ffi::Any& lhs, const ffi::Any& rhs) {  // compares the raw dtypes
+  return lhs.cast<PrimType>()->dtype == rhs.cast<PrimType>()->dtype;
+}
+
+ffi::ObjectPtr<PrimTypeNode> GetCachedPrimTypeNode(DLDataType dtype) {
+  // One interning table per thread, keyed by the packed dtype; no locking is involved.
+  thread_local std::unordered_map<uint32_t, ffi::ObjectPtr<PrimTypeNode>> interned;
+  const uint32_t packed = PackDataTypeKey(dtype);
+  if (auto hit = interned.find(packed); hit != interned.end()) {
+    return hit->second;
+  }
+  // Miss: build the node first so a throwing allocation leaves the table untouched.
+  auto fresh = ffi::make_object<PrimTypeNode>();
+  fresh->dtype = dtype;
+  return interned.emplace(packed, std::move(fresh)).first->second;
+}
+
+}  // namespace
+
 TVM_FFI_STATIC_INIT_BLOCK() {
+  namespace refl = tvm::ffi::reflection;
   TypeNode::RegisterReflection();
   PrimTypeNode::RegisterReflection();
+  refl::TypeAttrDef<PrimTypeNode>()
+      .attr(refl::type_attr::kAnyHash, reinterpret_cast<void*>(&PrimTypeAnyHash))
+      .attr(refl::type_attr::kAnyEqual, reinterpret_cast<void*>(&PrimTypeAnyEqual));
   PointerTypeNode::RegisterReflection();
   TupleTypeNode::RegisterReflection();
   FuncTypeNode::RegisterReflection();
   TensorMapTypeNode::RegisterReflection();
 }
 
-PrimType::PrimType(runtime::DataType dtype, Span span) {
-  ffi::ObjectPtr<PrimTypeNode> n = ffi::make_object<PrimTypeNode>();
-  n->dtype = dtype;
-  n->span = std::move(span);
-  data_ = std::move(n);
+PrimType::PrimType(DLDataType dtype) { data_ = GetCachedPrimTypeNode(dtype); }
+// Overload taking separate fields: each is narrowed to its DLDataType field width first.
+PrimType::PrimType(DLDataTypeCode code, int bits, int lanes)
+    : PrimType(DLDataType{static_cast<uint8_t>(code), static_cast<uint8_t>(bits),
+                          static_cast<uint16_t>(lanes)}) {}
+
+PrimType PrimType::Int(int bits, int lanes) {
+  // Scalar int32/int64 are served from function-local statics built on first use.
+  if (lanes == 1 && bits == 32) {
+    static const PrimType kInt32(DLDataType{kDLInt, 32, 1});
+    return kInt32;
+  }
+  if (lanes == 1 && bits == 64) {
+    static const PrimType kInt64(DLDataType{kDLInt, 64, 1});
+    return kInt64;
+  }
+  // Every other width / lane count goes through the regular DLDataType constructor.
+  return PrimType(DLDataType{kDLInt, static_cast<uint8_t>(bits), static_cast<uint16_t>(lanes)});
+}
+
+PrimType PrimType::UInt(int bits, int lanes) {
+  return PrimType(kDLUInt, bits, lanes);  // the 3-arg ctor narrows bits/lanes identically
+}
+
+PrimType PrimType::Float(int bits, int lanes) {
+  if (lanes != 1 || bits != 32) {
+    return PrimType(kDLFloat, bits, lanes);  // same narrowing casts as the 3-arg constructor
+  }
+  static const PrimType kFloat32(DLDataType{kDLFloat, 32, 1});  // scalar f32 fast path
+  return kFloat32;
+}
+
+PrimType PrimType::BFloat(int bits, int lanes) {
+  return PrimType(kDLBfloat, bits, lanes);  // delegates to the (code, bits, lanes) constructor
+}
+
+PrimType PrimType::Bool(int lanes) {
+  if (lanes != 1) {  // vector form; elements keep the 8-bit width used for the scalar
+    return PrimType(DLDataType{kDLBool, 8, static_cast<uint16_t>(lanes)});
+  }
+  static const PrimType kScalarBool(DLDataType{kDLBool, 8, 1});
+  return kScalarBool;
+}
+
+PrimType PrimType::Handle(int bits, int lanes) {
+  // Opaque-handle type; bits/lanes are narrowed by the (code, bits, lanes) constructor.
+  return PrimType(kDLOpaqueHandle, bits, lanes);
+}
+
+PrimType PrimType::Void() { return PrimType(kDLOpaqueHandle, 0, 0); }  // handle, 0 bits/lanes
+
+PrimType PrimType::ScalableVector(DLDataTypeCode code, int bits, int lanes) {
+  return PrimType(ScalableVectorDType(code, bits, lanes));  // lanes is the vscale factor (> 1)
+}
+
+size_t PrimType::StorageBytes() const {
+  const auto& dt = get()->dtype;  // lanes < 0 (viewed as int16) marks a non-fixed vector
+  if (TVM_FFI_PREDICT_FALSE(static_cast<int16_t>(dt.lanes) < 0)) {
+    TVM_FFI_THROW(InternalError)
+        << "Cannot compute compile-time storage bytes for non-fixed vector type " << dt;
+  }
+  return ffi::GetDataSize(1, dt);
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("ir.PrimType", [](runtime::DataType dtype) { return PrimType(dtype); });
+  refl::GlobalDef().def("ir.PrimType", [](DLDataType dtype) { return PrimType(dtype); });
 }
 
 PointerType::PointerType(Type element_type, ffi::String storage_scope) {
diff --git a/src/relax/analysis/tir_op_pattern_kind.cc b/src/relax/analysis/tir_op_pattern_kind.cc
index 369f5793d9b5..0bfb48cca94c 100644
--- a/src/relax/analysis/tir_op_pattern_kind.cc
+++ b/src/relax/analysis/tir_op_pattern_kind.cc
@@ -478,8 +478,8 @@ bool HasReshapePattern(const PrimFunc& func) {
       }
 
       if (nontrivial_indices.defined()) {
-        DataType dtype =
-            !block->iter_vars.empty() ? block->iter_vars[0]->var->dtype : DataType::Int(64);
+        PrimType dtype =
+            !block->iter_vars.empty() ? block->iter_vars[0]->var.ty() : PrimType::Int(64);
         tirx::Var fused_var("fused", dtype);
         ffi::Map<tirx::Var, PrimExpr> inverse_indices_map;
         PrimExpr stride = IntImm(dtype, /*value=*/1);
@@ -494,7 +494,8 @@ bool HasReshapePattern(const PrimFunc& func) {
 
         ffi::Array<PrimExpr> simplify_res = arith::IterMapSimplify(
             /*indices=*/{flattened_idx},
-            /*input_iters=*/{{fused_var, Range(IntImm(dtype, /*value=*/0), stride)}},
+            /*input_iters=*/
+            ffi::Map<tirx::Var, Range>{{fused_var, Range(IntImm(dtype, /*value=*/0), stride)}},
             /*input_pred=*/IntImm::Bool(true),
             /*check_level=*/arith::IterMapLevel::Surjective,
             /*analyzer=*/this->ana_,
diff --git a/src/relax/analysis/type_analysis.cc b/src/relax/analysis/type_analysis.cc
index 33070051ae63..34f5a4de6216 100644
--- a/src/relax/analysis/type_analysis.cc
+++ b/src/relax/analysis/type_analysis.cc
@@ -43,7 +43,7 @@ class StaticTypeDeriver : public TypeFunctor<Type(const Type&)> {
  public:
   Type VisitType_(const ObjectTypeNode* op) final { return ObjectType(op->span); }
 
-  Type VisitType_(const PrimTypeNode* op) final { return PrimType(op->dtype, op->span); }
+  Type VisitType_(const PrimTypeNode* op) final { return tvm::PrimType(op->dtype); }
 
   Type VisitType_(const ShapeTypeNode* op) final { return ShapeType(op->ndim, op->span); }
 
@@ -86,7 +86,9 @@ Type TypeFromStaticType(const Type& type) {
   if (type.as<ObjectTypeNode>()) {
     return ObjectType(type->span);
   } else if (const PrimTypeNode* prim_type = type.as<PrimTypeNode>()) {
-    return PrimType(prim_type->dtype, prim_type->span);
+    return tvm::PrimType(prim_type->dtype);
+  } else if (const tvm::PrimTypeNode* prim_type = type.as<tvm::PrimTypeNode>()) {
+    return tvm::PrimType(prim_type->dtype);
   } else if (const ShapeTypeNode* shape_type = type.as<ShapeTypeNode>()) {
     return ShapeType(shape_type->ndim, type->span);
   } else if (const TensorTypeNode* tensor_type = type.as<TensorTypeNode>()) {
@@ -221,9 +223,9 @@ class WellDefinedEraser : public TypeMutator, public ExprMutatorBase, public tir
     if (ret.defined()) {
       PrimExpr value = ret.value();
       if (value->IsInstance<IntImmNode>()) {
-        return tvm::cast(DataType::Int(64), value);
+        return tvm::cast(PrimType::Int(64), value);
       }
-      TVM_FFI_ICHECK(value.dtype() == DataType::Int(64))
+      TVM_FFI_ICHECK(value.ty().MatchesElementType(DLDataTypeCode::kDLInt, 64))
           << "Can only provide i64 expressions in shape";
       return value;
     } else {
@@ -1015,7 +1017,9 @@ class TypeLCAFinder : public TypeFunctor<Type(const Type&, const Type&)> {
     if (rhs == nullptr) return ObjectType(lhs->span);
 
     // find the target dtype, ndim, and vdevice.
-    DataType dtype = lhs->dtype == rhs->dtype ? lhs->dtype : DataType::Void();
+    PrimType dtype = lhs->dtype->dtype == rhs->dtype->dtype
+                         ? PrimType(lhs->dtype->dtype)
+                         : PrimType(DLDataType{kDLOpaqueHandle, 0, 0});
     int ndim = lhs->ndim == rhs->ndim ? lhs->ndim : kUnknownNDim;
     VDevice vdev = VDevice();
     if (lhs->vdevice.defined() && rhs->vdevice.defined() &&
@@ -1028,7 +1032,7 @@ class TypeLCAFinder : public TypeFunctor<Type(const Type&, const Type&)> {
         !CanProveShapeEqual(lhs->shape.value(), rhs->shape.value(),
                             ffi::GetRef<arith::Analyzer>(analyzer_))) {
       // reuse lhs when possible
-      if (!lhs->shape.defined() && lhs->dtype == dtype && lhs->ndim == ndim &&
+      if (!lhs->shape.defined() && lhs->dtype->dtype == dtype->dtype && lhs->ndim == ndim &&
           (!lhs->vdevice.defined() || vdev.defined())) {
         return ffi::GetRef<Type>(lhs);
       } else {
@@ -1036,7 +1040,7 @@ class TypeLCAFinder : public TypeFunctor<Type(const Type&, const Type&)> {
       }
     }
     // symbolic shape and vdevice match but dtype mismatch
-    if (lhs->dtype != dtype || (lhs->vdevice.defined() && !vdev.defined())) {
+    if (lhs->dtype->dtype != dtype->dtype || (lhs->vdevice.defined() && !vdev.defined())) {
       return TensorType(lhs->shape.value(), dtype, vdev, lhs->span);
     } else {
       return ffi::GetRef<Type>(lhs);
diff --git a/src/relax/analysis/well_formed.cc b/src/relax/analysis/well_formed.cc
index 5c3547249c5e..52e974be75f0 100644
--- a/src/relax/analysis/well_formed.cc
+++ b/src/relax/analysis/well_formed.cc
@@ -457,9 +457,9 @@ class WellFormedChecker : public relax::ExprVisitor,
     for (PrimExpr expr : op->values) {
       // check if the symbolic vars in the expr are defined, e.g, 2 * m
       tirx::ExprVisitor::VisitExpr(expr);
-      if (!expr.dtype().is_int()) {
+      if (expr.ty().code() != DLDataTypeCode::kDLInt) {
         TVM_FFI_VISIT_THROW(TypeError, expr)
-            << "Shape expressions must be of integer type, but got " << expr.dtype();
+            << "Shape expressions must be of integer type, but got " << expr.ty()->dtype;
       }
     }
     CheckType(op);
diff --git a/src/relax/backend/contrib/codegen_c/codegen_c.h b/src/relax/backend/contrib/codegen_c/codegen_c.h
index 1a5fb1dd801e..0c36b04812c8 100644
--- a/src/relax/backend/contrib/codegen_c/codegen_c.h
+++ b/src/relax/backend/contrib/codegen_c/codegen_c.h
@@ -347,19 +347,20 @@ class CodegenCBase {
    */
   std::string GetDtypeString(const TensorTypeNode* tensor_ty) {
     std::string dtype;
-    if (runtime::TypeMatch(tensor_ty->dtype, kDLFloat, 32)) {
+    DLDataType raw_dtype = tensor_ty->dtype->dtype;
+    if (raw_dtype == DLDataType{kDLFloat, 32, 1}) {
       dtype = "float";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLFloat, 16)) {
+    } else if (raw_dtype == DLDataType{kDLFloat, 16, 1}) {
       dtype = "half";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLBfloat, 16)) {
+    } else if (raw_dtype == DLDataType{kDLBfloat, 16, 1}) {
       dtype = "bfloat";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLInt, 32)) {
+    } else if (raw_dtype == DLDataType{kDLInt, 32, 1}) {
       dtype = "int";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLInt, 64)) {
+    } else if (raw_dtype == DLDataType{kDLInt, 64, 1}) {
       dtype = "int64_t";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLInt, 8)) {
+    } else if (raw_dtype == DLDataType{kDLInt, 8, 1}) {
       dtype = "int8_t";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLUInt, 8)) {
+    } else if (raw_dtype == DLDataType{kDLUInt, 8, 1}) {
       dtype = "uint8_t";
     } else {
       TVM_FFI_THROW(InternalError) << "Unsupported dtype " << tensor_ty->dtype;
diff --git a/src/relax/backend/contrib/codegen_json/codegen_json.h b/src/relax/backend/contrib/codegen_json/codegen_json.h
index edebb7593fca..03133599a58a 100644
--- a/src/relax/backend/contrib/codegen_json/codegen_json.h
+++ b/src/relax/backend/contrib/codegen_json/codegen_json.h
@@ -89,8 +89,8 @@ class OpAttrExtractor {
     }
   }
 
-  void Visit(const char* key, DataType* value) {
-    if (!value->is_void()) {
+  void Visit(const char* key, DLDataType* value) {
+    if (!(value->code == kDLOpaqueHandle && value->bits == 0 && value->lanes == 0)) {
       SetNodeAttr(key, ffi::String(ffi::DLDataTypeToString(*value)));
     } else {
       SetNodeAttr(key, ffi::String(""));
@@ -201,7 +201,7 @@ class OpAttrExtractor {
           break;
         }
         case ffi::TypeIndex::kTVMFFIDataType: {
-          DataType value(field_value.cast<DLDataType>());
+          DLDataType value = field_value.cast<DLDataType>();
           this->Visit(field_info->name.data, &value);
           break;
         }
@@ -282,7 +282,7 @@ class JSONSerializer : public relax::MemoizedExprTranslator<NodeEntries> {
         ShapeExpr output_shape = tensor_ty->shape.value().as_or_throw<ShapeExpr>();
         ret.push_back(JSONGraphNodeEntry(node_id, i));
         shape.emplace_back(GetIntShape(output_shape->values));
-        dtype.emplace_back(DType2String(tensor_ty->dtype));
+        dtype.emplace_back(DType2String(tensor_ty->dtype->dtype));
       }
       node->SetNumOutput(tuple_ty->fields.size());
     } else {
@@ -292,7 +292,7 @@ class JSONSerializer : public relax::MemoizedExprTranslator<NodeEntries> {
       ShapeExpr output_shape = tensor_ty->shape.value().as_or_throw<ShapeExpr>();
 
       shape.emplace_back(GetIntShape(output_shape->values));
-      dtype.emplace_back(DType2String(tensor_ty->dtype));
+      dtype.emplace_back(DType2String(tensor_ty->dtype->dtype));
       ret.push_back(JSONGraphNodeEntry(node_id, 0));
     }
     node->SetShape(shape);
diff --git a/src/relax/backend/contrib/cublas/codegen.cc b/src/relax/backend/contrib/cublas/codegen.cc
index 5284de94f622..f2999b172136 100644
--- a/src/relax/backend/contrib/cublas/codegen.cc
+++ b/src/relax/backend/contrib/cublas/codegen.cc
@@ -86,11 +86,11 @@ class CublasJSONSerializer : public JSONSerializer {
         const auto* const_expr = dequantize_call->args[1].as<ConstantNode>();
         auto ty = const_expr->ty.as_or_throw<TensorType>();
         float alpha = 1.0;
-        if (ty->dtype == DataType::Float(16)) {
+        if (ty->dtype == PrimType::Float(16)) {
           alpha = __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(
               static_cast<uint16_t*>(const_expr->data->data)[0]);
         } else {
-          TVM_FFI_ICHECK(ty->dtype == DataType::Float(32));
+          TVM_FFI_ICHECK(ty->dtype == PrimType::Float(32));
           alpha = static_cast<float*>(const_expr->data->data)[0];
         }
 
diff --git a/src/relax/backend/contrib/cutlass/codegen.cc b/src/relax/backend/contrib/cutlass/codegen.cc
index 03621c400551..dfe4b24e4f12 100644
--- a/src/relax/backend/contrib/cutlass/codegen.cc
+++ b/src/relax/backend/contrib/cutlass/codegen.cc
@@ -167,9 +167,9 @@ class CodegenCutlass : public relax::MemoizedExprTranslator<OutputType>,
     for (const auto& arg : ext_func_args_) {
       auto ty = GetType(arg);
       if (const auto* tensor_ty = ty.as<TensorTypeNode>()) {
-        arg_types.emplace_back(backend::DType2String(tensor_ty->dtype));
+        arg_types.emplace_back(backend::DType2String(tensor_ty->dtype->dtype));
       } else if (const auto* shape_ty = ty.as<ShapeTypeNode>()) {
-        arg_types.emplace_back(backend::DType2String(shape_ty->values.value()[0]->dtype));
+        arg_types.emplace_back(backend::DType2String(shape_ty->values.value()[0].ty()->dtype));
       } else {
         TVM_FFI_THROW(InternalError) << "Unimplemented";
       }
@@ -302,7 +302,7 @@ class CodegenCutlass : public relax::MemoizedExprTranslator<OutputType>,
 
     std::vector<std::string> out_types;
     if (const auto* tensor_ty = ty.as<TensorTypeNode>()) {
-      out_types.emplace_back(backend::DType2String(tensor_ty->dtype));
+      out_types.emplace_back(backend::DType2String(tensor_ty->dtype->dtype));
     } else {
       TVM_FFI_THROW(InternalError) << "Unimplemented ty type: " << ty;
     }
diff --git a/src/relax/backend/contrib/utils.h b/src/relax/backend/contrib/utils.h
index 93916bf23236..6147a6eb2199 100644
--- a/src/relax/backend/contrib/utils.h
+++ b/src/relax/backend/contrib/utils.h
@@ -59,9 +59,7 @@ inline std::vector<int64_t> GetIntShape(const ffi::Array<PrimExpr>& shape) {
  * \param typ
  * \return std::string string format of type
  */
-inline std::string DType2String(const tvm::DataType dtype) {
-  return tvm::ffi::DLDataTypeToString(dtype);
-}
+inline std::string DType2String(DLDataType dtype) { return tvm::ffi::DLDataTypeToString(dtype); }
 
 /*!
  * \brief Check if a call node is calling an op with the given name
diff --git a/src/relax/backend/vm/codegen_vm_tir.cc b/src/relax/backend/vm/codegen_vm_tir.cc
index c1e9af85511c..3e2ac365d4fb 100644
--- a/src/relax/backend/vm/codegen_vm_tir.cc
+++ b/src/relax/backend/vm/codegen_vm_tir.cc
@@ -88,19 +88,19 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
 
   PrimExpr RegListGet(int64_t slot) const {
     // use 128 bits to represent any
-    return tirx::Call(DataType::Handle(), tirx::builtin::anylist_getitem(),
+    return tirx::Call(tvm::PrimType::Handle(), tirx::builtin::anylist_getitem(),
                       {reg_anylist_handle_, ConstInt32(slot)});
   }
 
   PrimExpr ConstListGet(int64_t slot) const {
     // use 128 bits to represent any
-    return tirx::Call(DataType::Handle(), tirx::builtin::anylist_getitem(),
+    return tirx::Call(tvm::PrimType::Handle(), tirx::builtin::anylist_getitem(),
                       {const_anylist_handle_, ConstInt32(slot)});
   }
 
   PrimExpr FuncListGet(int64_t slot) const {
     // use 128 bits to represent any
-    return tirx::Call(DataType::Handle(), tirx::builtin::anylist_getitem(),
+    return tirx::Call(tvm::PrimType::Handle(), tirx::builtin::anylist_getitem(),
                       {func_anylist_handle_, ConstInt32(slot)});
   }
 
@@ -121,11 +121,11 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
       all_args.push_back(arg);
     }
     if (dst_anylist_slot >= 0) {
-      this->EmitStmt(tirx::Evaluate(
-          tirx::Call(DataType::Int(32), tirx::builtin::anylist_setitem_call_packed(), all_args)));
+      this->EmitStmt(tirx::Evaluate(tirx::Call(
+          tvm::PrimType::Int(32), tirx::builtin::anylist_setitem_call_packed(), all_args)));
     } else {
       this->EmitStmt(tirx::Evaluate(
-          tirx::Call(DataType::Int(32), tirx::builtin::tvm_call_packed(), all_args)));
+          tirx::Call(tvm::PrimType::Int(32), tirx::builtin::tvm_call_packed(), all_args)));
     }
   }
 
@@ -143,11 +143,11 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
       all_args.push_back(arg);
     }
     if (dst_anylist_slot >= 0) {
-      this->EmitStmt(tirx::Evaluate(
-          tirx::Call(DataType::Int(32), tirx::builtin::anylist_setitem_call_cpacked(), all_args)));
+      this->EmitStmt(tirx::Evaluate(tirx::Call(
+          tvm::PrimType::Int(32), tirx::builtin::anylist_setitem_call_cpacked(), all_args)));
     } else {
       this->EmitStmt(tirx::Evaluate(
-          tirx::Call(DataType::Int(32), tirx::builtin::tvm_call_cpacked(), all_args)));
+          tirx::Call(tvm::PrimType::Int(32), tirx::builtin::tvm_call_cpacked(), all_args)));
     }
   }
 
@@ -160,10 +160,10 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
     stmt_stack_ = {};
     registers_num_ = 0;
     var_map_.clear();
-    ctx_ptr_ = tirx::Var("ctx_ptr", DataType::Handle());
-    reg_anylist_handle_ = tirx::Var("r", DataType::Handle());
-    func_anylist_handle_ = tirx::Var("f", DataType::Handle());
-    const_anylist_handle_ = tirx::Var("c", DataType::Handle());
+    ctx_ptr_ = tirx::Var("ctx_ptr", PrimType::Handle());
+    reg_anylist_handle_ = tirx::Var("r", PrimType::Handle());
+    func_anylist_handle_ = tirx::Var("f", PrimType::Handle());
+    const_anylist_handle_ = tirx::Var("c", PrimType::Handle());
 
     ffi::Array<ffi::String> param_names;
     for (Var param : func->params) {
@@ -231,7 +231,7 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
     Call call = ffi::GetRef<Call>(call_node);
 
     if (call_node->op == null_value_op_) {
-      return tirx::Call(DataType::Handle(), tirx::builtin::reinterpret(), {IntImm::Int64(0)});
+      return tirx::Call(tvm::PrimType::Handle(), tirx::builtin::reinterpret(), {IntImm::Int64(0)});
     }
     int64_t dst_reg = HasVoidType(call) ? -1 : NewRegister();
     if (call->op.as<OpNode>()) {
@@ -264,7 +264,7 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
     size_t merge_register = NewRegister();
     PrimExpr cond_value = this->VisitExpr(op->cond).value();
 
-    cond_value = tirx::Call(DataType::Bool(), tirx::builtin::tvm_call_packed(),
+    cond_value = tirx::Call(tvm::PrimType::Bool(), tirx::builtin::tvm_call_packed(),
                             {tirx::StringImm("vm.builtin.read_if_cond"), cond_value});
 
     tirx::Stmt true_branch = WithNewScope([&]() {
@@ -438,7 +438,7 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
     TVM_FFI_ICHECK(tir_call->args[0].same_as(reg_anylist_handle_));
     const auto* p_dst_reg = tir_call->args[1].as<tirx::IntImmNode>();
     TVM_FFI_ICHECK(p_dst_reg != nullptr);
-    TVM_FFI_ICHECK(p_dst_reg->dtype == DataType::Int(32));
+    TVM_FFI_ICHECK(p_dst_reg->ty().MatchesElementType(DLDataTypeCode::kDLInt, 32));
 
     int64_t dst_reg = p_dst_reg->value;
     this->EmitCallPacked("vm.builtin.null_value", {}, dst_reg);
diff --git a/src/relax/backend/vm/lower_runtime_builtin.cc b/src/relax/backend/vm/lower_runtime_builtin.cc
index 344fc6a67e65..4a32efd81e5a 100644
--- a/src/relax/backend/vm/lower_runtime_builtin.cc
+++ b/src/relax/backend/vm/lower_runtime_builtin.cc
@@ -21,6 +21,7 @@
  * \brief Lowers most builtin functions and packed calls.
  */
 #include <tvm/ffi/cast.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/relax/analysis.h>
 #include <tvm/relax/attrs/op.h>
@@ -29,7 +30,6 @@
 #include <tvm/relax/expr_functor.h>
 #include <tvm/relax/op_attr_types.h>
 #include <tvm/relax/type.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/tirx/op.h>
 
 namespace tvm {
@@ -85,7 +85,7 @@ class LowerRuntimeBuiltinMutator : public ExprMutator {
   Expr MakeMemAllocStorage(const Call& call) {
     PrimValue runtime_device_index = call->args[1].as_or_throw<PrimValue>();
     StringImm storage_scope = call->args[2].as_or_throw<StringImm>();
-    DataTypeImm output_dtype = DataTypeImm(DataType::UInt(8));
+    DataTypeImm output_dtype = DataTypeImm((DLDataType{kDLUInt, 8, 1}));
     return Call(vm_alloc_storage_op_,
                 {call->args[0], runtime_device_index, output_dtype, storage_scope}, Attrs());
   }
diff --git a/src/relax/backend/vm/vm_shape_lower.cc b/src/relax/backend/vm/vm_shape_lower.cc
index 3d895349bbc3..6784489c5b32 100644
--- a/src/relax/backend/vm/vm_shape_lower.cc
+++ b/src/relax/backend/vm/vm_shape_lower.cc
@@ -229,7 +229,7 @@ class VMShapeLowerMutator
     slot_map_.clear();
     current_gvar_ = gvar;
     PrimExprSlotCollector::Collect(func, &slot_vec_, &slot_map_);
-    heap_size_ = IntImm(ShapeDType(), static_cast<int64_t>(slot_vec_.size()));
+    heap_size_ = IntImm(tvm::PrimType(ShapeDType()), static_cast<int64_t>(slot_vec_.size()));
     VarBinding shape_heap_binding = this->AllocShapeHeapBinding(heap_size_);
     shape_heap_ = shape_heap_binding->var;
 
@@ -298,7 +298,7 @@ class VMShapeLowerMutator
   //-------------------------------------------------------
   // PrimExpr slot handling
   //-------------------------------------------------------
-  static DataType ShapeDType() { return DataType::Int(64); }
+  static DLDataType ShapeDType() { return DLDataType{kDLInt, 64, 1}; }
 
   /*! \brief populate additional information in the slot. */
   void PopulateSlotInfo() {
@@ -329,7 +329,7 @@ class VMShapeLowerMutator
 
   VarBinding AllocShapeHeapBinding(IntImm heap_size) {
     if (heap_size->value > 0) {
-      TensorType heap_ty(ShapeDType(), 1);
+      TensorType heap_ty(PrimType(ShapeDType()), 1);
       Var var("shape_heap", heap_ty);
       // set up the builtin func.
       Call call(call_builtin_with_ctx_op_,
@@ -566,7 +566,7 @@ class VMShapeLowerMutator
     if (to_compute.size() == 0) return 0;
     TVM_FFI_ICHECK_GT(heap_size_->value, 0);
     // construct a PrimFunc that compute the shape.
-    tirx::Var heap("heap", DataType::Handle());
+    tirx::Var heap("heap", PrimType::Handle());
     ffi::Array<PrimExpr> buffer_shape{heap_size_};
     tirx::Buffer buffer = tirx::decl_buffer(buffer_shape, ShapeDType(), "H", "global");
     ffi::Map<tirx::Var, tirx::Buffer> buffer_map;
@@ -575,7 +575,8 @@ class VMShapeLowerMutator
     auto var_map = [&](const tirx::Var& var) -> ffi::Optional<PrimExpr> {
       auto it = slot_map_.find(var);
       TVM_FFI_ICHECK(it != slot_map_.end());
-      return tirx::BufferLoad(buffer, {IntImm(ShapeDType(), it->second->index)});
+      return tirx::BufferLoad(
+          buffer, ffi::Array<PrimExpr>{IntImm(tvm::PrimType(ShapeDType()), it->second->index)});
     };
 
     ffi::Array<tirx::Stmt> seq;
@@ -583,7 +584,8 @@ class VMShapeLowerMutator
       TVM_FFI_ICHECK(!slot->value_computed);
       slot->value_computed = true;
       PrimExpr value = tirx::Substitute(slot->expr, var_map);
-      seq.push_back(tirx::BufferStore(buffer, value, {IntImm(ShapeDType(), slot->index)}));
+      seq.push_back(
+          tirx::BufferStore(buffer, value, {IntImm(tvm::PrimType(ShapeDType()), slot->index)}));
     }
 
     tirx::Stmt body = tirx::SeqStmt::Flatten(seq);
@@ -678,10 +680,11 @@ class VMShapeLowerMutator
       // if we only check dynamic shapes, and the shape is static, we can skip.
       return;
     }
-    if (always_check || !IsBaseOf(TensorType(op->dtype, op->ndim), GetType(value))) {
+    if (always_check || !IsBaseOf(TensorType(PrimType(op->dtype), op->ndim), GetType(value))) {
       // check_tensor_info(value, ndim, dtype, err_ctx)
       Call call(builtin_check_tensor_info_,
-                {value, PrimValue::Int64(op->ndim), DataTypeImm(op->dtype), GetErrContext(err_ctx)},
+                {value, PrimValue::Int64(op->ndim), DataTypeImm(op->dtype->dtype),
+                 GetErrContext(err_ctx)},
                 Attrs(), {void_ty_});
       builder_->Emit(call, "_");
     }
diff --git a/src/relax/ir/dataflow_expr_rewriter.cc b/src/relax/ir/dataflow_expr_rewriter.cc
index 7b14a1f7e7e9..10fd67de1740 100644
--- a/src/relax/ir/dataflow_expr_rewriter.cc
+++ b/src/relax/ir/dataflow_expr_rewriter.cc
@@ -736,7 +736,7 @@ PatternMatchingRewriter PatternMatchingRewriter::FromModule(IRModule mod) {
       return ExternFuncPattern(func->global_symbol);
 
     } else if (auto prim = expr.as<PrimValueNode>()) {
-      return TypePattern(WildcardPattern(), PrimType(prim->value.dtype()));
+      return TypePattern(WildcardPattern(), PrimType(prim->value.ty()));
 
     } else {
       TVM_FFI_THROW(TypeError) << "Cannot convert Relax expression of type " << expr->GetTypeKey()
diff --git a/src/relax/ir/dataflow_matcher.cc b/src/relax/ir/dataflow_matcher.cc
index 08689bd10f0b..f75c540a96cd 100644
--- a/src/relax/ir/dataflow_matcher.cc
+++ b/src/relax/ir/dataflow_matcher.cc
@@ -573,8 +573,7 @@ bool DFPatternMatcher::VisitDFPattern_(const DataTypePatternNode* op, const Expr
   // no need to jump, as var.dtype == value.dtype
   auto expr_ty = expr.as<ExprNode>()->ty;
   if (const TensorTypeNode* tensor_ty = expr_ty.as<TensorTypeNode>()) {
-    return (ffi::StructuralEqual()(op->dtype, tensor_ty->dtype)) &&
-           VisitDFPattern(op->pattern, expr);
+    return op->dtype == tensor_ty->dtype->dtype && VisitDFPattern(op->pattern, expr);
   }
   return false;
 }
diff --git a/src/relax/ir/dataflow_pattern.cc b/src/relax/ir/dataflow_pattern.cc
index 5cb5352ec6c2..6302ee85049a 100644
--- a/src/relax/ir/dataflow_pattern.cc
+++ b/src/relax/ir/dataflow_pattern.cc
@@ -369,15 +369,15 @@ RELAX_PATTERN_PRINTER_DEF(SameShapeConstraintNode, [](auto p, auto node) {
   p->stream << ")";
 });
 
-DataTypePattern::DataTypePattern(DFPattern pattern, DataType dtype) {
+DataTypePattern::DataTypePattern(DFPattern pattern, DLDataType dtype) {
   ffi::ObjectPtr<DataTypePatternNode> n = ffi::make_object<DataTypePatternNode>();
   n->pattern = std::move(pattern);
-  n->dtype = std::move(dtype);
+  n->dtype = dtype;
   data_ = std::move(n);
 }
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("relax.dpl.DataTypePattern", [](DFPattern pattern, DataType dtype) {
+  refl::GlobalDef().def("relax.dpl.DataTypePattern", [](DFPattern pattern, DLDataType dtype) {
     return DataTypePattern(pattern, dtype);
   });
 }
@@ -474,11 +474,11 @@ AttrPattern DFPattern::HasAttr(const ffi::Map<ffi::String, Any>& attrs) const {
   return AttrPattern(*this, DictAttrs(attrs));
 }
 TypePattern DFPattern::HasType(const Type& ty) const { return TypePattern(*this, ty); }
-DataTypePattern DFPattern::HasDtype(const DataType& dtype) const {
+DataTypePattern DFPattern::HasDtype(DLDataType dtype) const {
   return DataTypePattern(*this, dtype);
 }
 DataTypePattern DFPattern::HasDtype(const std::string& dtype) const {
-  return HasDtype(DataType(ffi::StringToDLDataType(dtype)));
+  return HasDtype(ffi::StringToDLDataType(dtype));
 }
 ShapePattern DFPattern::HasShape(const ffi::Array<PrimExpr>& shape) const {
   return ShapePattern(*this, shape);
diff --git a/src/relax/ir/dependent_type.cc b/src/relax/ir/dependent_type.cc
index 6a2034ccc2a8..d95ebb1534e7 100644
--- a/src/relax/ir/dependent_type.cc
+++ b/src/relax/ir/dependent_type.cc
@@ -54,9 +54,9 @@ ShapeType::ShapeType(ffi::Array<PrimExpr> values, Span span) {
   n->ndim = static_cast<int>(values.size());
   n->values = values.Map([](PrimExpr value) {
     if (value->IsInstance<IntImmNode>()) {
-      return tvm::cast(DataType::Int(64), value);
+      return tvm::cast(PrimType::Int(64), value);
     }
-    TVM_FFI_ICHECK(value.dtype() == DataType::Int(64))
+    TVM_FFI_ICHECK(value.ty().MatchesElementType(DLDataTypeCode::kDLInt, 64))
         << "the value in ShapeType can only have dtype of int64";
     return value;
   });
@@ -86,7 +86,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 }
 
 // Tensor
-TensorType::TensorType(Expr shape, DataType dtype, ffi::Optional<VDevice> vdevice, Span span) {
+TensorType::TensorType(Expr shape, PrimType dtype, ffi::Optional<VDevice> vdevice, Span span) {
   ffi::ObjectPtr<TensorTypeNode> n = ffi::make_object<TensorTypeNode>();
   // assign ndim before move
   TVM_FFI_ICHECK(shape.defined()) << "Must provide a shape in this constructor";
@@ -103,7 +103,7 @@ TensorType::TensorType(Expr shape, DataType dtype, ffi::Optional<VDevice> vdevic
   data_ = std::move(n);
 }
 
-TensorType::TensorType(DataType dtype, int ndim, ffi::Optional<VDevice> vdevice, Span span) {
+TensorType::TensorType(PrimType dtype, int ndim, ffi::Optional<VDevice> vdevice, Span span) {
   ffi::ObjectPtr<TensorTypeNode> n = ffi::make_object<TensorTypeNode>();
   TVM_FFI_ICHECK(ndim >= -1) << "ndim of TensorType must be >= -1, but got " << ndim;
   n->ndim = ndim;
@@ -116,13 +116,14 @@ TensorType::TensorType(DataType dtype, int ndim, ffi::Optional<VDevice> vdevice,
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def(
-      "relax.TensorType", [](ffi::Optional<Expr> shape, ffi::Optional<DataType> dtype, int ndim,
+      "relax.TensorType", [](ffi::Optional<Expr> shape, ffi::Optional<PrimType> dtype, int ndim,
                              VDevice vdevice, Span span) {
+        PrimType resolved_dtype = dtype.value_or(PrimType(DLDataType{kDLOpaqueHandle, 0, 0}));
         if (shape.defined()) {
           TVM_FFI_CHECK_EQ(ndim, kUnknownNDim, ValueError) << "Cannot both specify shape and ndim";
-          return TensorType(shape.value(), dtype.value_or(DataType::Void()), vdevice, span);
+          return TensorType(shape.value(), resolved_dtype, vdevice, span);
         } else {
-          return TensorType(dtype.value_or(DataType::Void()), ndim, vdevice, span);
+          return TensorType(resolved_dtype, ndim, vdevice, span);
         }
       });
 }
diff --git a/src/relax/ir/emit_te.cc b/src/relax/ir/emit_te.cc
index 304911c1dca2..68e48eaf93b6 100644
--- a/src/relax/ir/emit_te.cc
+++ b/src/relax/ir/emit_te.cc
@@ -42,7 +42,7 @@ te::Tensor TETensor(Expr value, ffi::Map<tirx::Var, PrimExpr> tir_var_map, std::
   // checked-type might not be properly set. In this case we set the shape and dtype of the returned
   // TE tensor.
   if (const auto* constant = value.as<ConstantNode>()) {
-    n->dtype = DataType(constant->data->dtype);
+    n->dtype = PrimType(constant->data->dtype);
 
     int ndim = constant->data->ndim;
     ffi::Shape shape_tuple = constant->data.Shape();
diff --git a/src/relax/ir/expr.cc b/src/relax/ir/expr.cc
index 11e80135500a..b4c4486f0dd4 100644
--- a/src/relax/ir/expr.cc
+++ b/src/relax/ir/expr.cc
@@ -257,9 +257,9 @@ ShapeExpr::ShapeExpr(ffi::Array<PrimExpr> values, Span span) {
 
   n->values = values.Map([](PrimExpr value) {
     if (value->IsInstance<IntImmNode>()) {
-      return tvm::cast(DataType::Int(64), value);
+      return tvm::cast(PrimType::Int(64), value);
     }
-    TVM_FFI_ICHECK(value.dtype() == DataType::Int(64))
+    TVM_FFI_ICHECK(value.ty().MatchesElementType(DLDataTypeCode::kDLInt, 64))
         << "the value in ShapeType can only have dtype of int64";
     return value;
   });
@@ -350,7 +350,7 @@ Constant::Constant(runtime::Tensor data, ffi::Optional<Type> ty_annotation, Span
   if (ty_annotation.defined()) {
     n->ty = ty_annotation.value();
   } else {
-    TensorType tinfo(ShapeExpr(values), n->data.DataType(), VDevice(), span);
+    TensorType tinfo(ShapeExpr(values), PrimType(n->data.DataType()), VDevice(), span);
     n->ty = tinfo;
   }
 
@@ -366,7 +366,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 PrimValue::PrimValue(PrimExpr value, Span span) {
   ffi::ObjectPtr<PrimValueNode> n = ffi::make_object<PrimValueNode>();
-  n->ty = PrimType(value.dtype());
+  n->ty = PrimType(value.ty());
   n->value = std::move(value);
   n->span = std::move(span);
   data_ = std::move(n);
@@ -396,9 +396,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                         [](ffi::String value, Span span) { return StringImm(value, span); });
 }
 
-DataTypeImm::DataTypeImm(DataType value, Span span) {
+DataTypeImm::DataTypeImm(DLDataType value, Span span) {
   ffi::ObjectPtr<DataTypeImmNode> n = ffi::make_object<DataTypeImmNode>();
-  n->value = std::move(value);
+  n->value = value;
   n->span = std::move(span);
   n->ty = ObjectType();
   data_ = std::move(n);
@@ -407,7 +407,7 @@ DataTypeImm::DataTypeImm(DataType value, Span span) {
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def("relax.DataTypeImm",
-                        [](DataType value, Span span) { return DataTypeImm(value, span); });
+                        [](DLDataType value, Span span) { return DataTypeImm(value, span); });
 }
 
 MatchCast::MatchCast(Var var, Expr value, Type ty, Span span) {
diff --git a/src/relax/op/ccl/ccl.cc b/src/relax/op/ccl/ccl.cc
index dd67f65dea09..15b8064d2b6f 100644
--- a/src/relax/op/ccl/ccl.cc
+++ b/src/relax/op/ccl/ccl.cc
@@ -85,7 +85,7 @@ Type InferTypeAllGather(const Call& call, const BlockBuilder& ctx) {
   const auto* attrs = call->attrs.as<AllGatherAttrs>();
   int num_workers = attrs->num_workers;
 
-  DataType output_dtype = input_ty->dtype;
+  PrimType output_dtype = input_ty->dtype;
   auto input_shape = input_ty->GetShape();
   if (!input_shape.defined()) {
     return input_ty;
@@ -143,7 +143,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 Type InferTypeScatter(const Call& call, const BlockBuilder& ctx) {
   TensorType input_ty = GetUnaryInputTensorType(call, ctx);
-  DataType output_dtype = input_ty->dtype;
+  PrimType output_dtype = input_ty->dtype;
 
   const auto* attrs = call->attrs.as<ScatterCollectiveAttrs>();
   int num_workers = attrs->num_workers;
diff --git a/src/relax/op/distributed/binary.cc b/src/relax/op/distributed/binary.cc
index 766d60edb86f..daaacff4121b 100644
--- a/src/relax/op/distributed/binary.cc
+++ b/src/relax/op/distributed/binary.cc
@@ -31,7 +31,7 @@ Type InferDistTypeBroadcastCMP(const Call& call, const BlockBuilder& ctx) {
   return InferDistTypeBroadcast(
       call, ctx,
       [](const Call& call, const BlockBuilder& ctx, const TensorType& x1_ty,
-         const TensorType& x2_ty) { return DataType::Bool(); });
+         const TensorType& x2_ty) { return DLDataType{kDLBool, 8, 1}; });
 }
 
 /***************** Arithmetic operators *****************/
diff --git a/src/relax/op/distributed/binary.h b/src/relax/op/distributed/binary.h
index 5fd39b50f364..a6d3fd9ba124 100644
--- a/src/relax/op/distributed/binary.h
+++ b/src/relax/op/distributed/binary.h
@@ -41,8 +41,8 @@ Type InferDistTypeBroadcast(const Call& call, const BlockBuilder& ctx, FType f_c
   TensorType x1_ty = input_dtensor_tys[0]->tensor_ty;
   TensorType x2_ty = input_dtensor_tys[1]->tensor_ty;
 
-  // DateType
-  DataType output_dtype = f_compute_out_dtype(call, ctx, x1_ty, x2_ty);
+  // Dtype
+  PrimType output_dtype(f_compute_out_dtype(call, ctx, x1_ty, x2_ty));
 
   // ndims
   TVM_FFI_ICHECK(!x1_ty->IsUnknownNdim() && !x2_ty->IsUnknownNdim())
diff --git a/src/relax/op/distributed/distributed.cc b/src/relax/op/distributed/distributed.cc
index b009630070cd..ff5bc986c0c7 100644
--- a/src/relax/op/distributed/distributed.cc
+++ b/src/relax/op/distributed/distributed.cc
@@ -154,7 +154,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 Type InferTypeRtoS(const Call& call, const BlockBuilder& ctx) {
   TensorType input_ty = GetUnaryInputTensorType(call, ctx);
-  DataType output_dtype = input_ty->dtype;
+  PrimType output_dtype = input_ty->dtype;
 
   const auto* attrs = call->attrs.as<ScatterCollectiveAttrs>();
   int num_workers = attrs->num_workers;
diff --git a/src/relax/op/distributed/linear_algebra.cc b/src/relax/op/distributed/linear_algebra.cc
index 80fccbe115a9..b498f1a4a953 100644
--- a/src/relax/op/distributed/linear_algebra.cc
+++ b/src/relax/op/distributed/linear_algebra.cc
@@ -32,9 +32,9 @@ Type InferDistTypeMatmul(const Call& call, const BlockBuilder& ctx) {
   TensorType x2_ty = input_dtensor_tys[1]->tensor_ty;
 
   const auto* attrs = call->attrs.as<MatmulAttrs>();
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty)
+                                    : attrs->out_dtype);
 
   if (x1_ty->IsUnknownNdim() || x2_ty->IsUnknownNdim()) {
     TVM_FFI_VISIT_THROW(ValueError, call)
diff --git a/src/relax/op/distributed/nn.cc b/src/relax/op/distributed/nn.cc
index 1339a18e72d0..fcdc37c54046 100644
--- a/src/relax/op/distributed/nn.cc
+++ b/src/relax/op/distributed/nn.cc
@@ -33,7 +33,9 @@ Type InferDistTypeSoftmax(const Call& call, const BlockBuilder& ctx) {
   if (input_tensor_ty->IsUnknownNdim()) {
     TVM_FFI_VISIT_THROW(ValueError, call) << "Input of distributed operator must have known ndim";
   }
-  if (!input_tensor_ty->IsUnknownDtype() && !input_tensor_ty->dtype.is_float()) {
+  PrimType input_dtype = input_tensor_ty->dtype;
+  // Softmax validation preserves the old float-kind check; lanes do not affect this policy.
+  if (!input_tensor_ty->IsUnknownDtype() && !input_dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     TVM_FFI_VISIT_THROW(TypeError, call) << "Softmax requires the input tensor to have float "
                                             "dtype. However, the given input dtype is "
                                          << input_tensor_ty->dtype;
diff --git a/src/relax/op/distributed/unary.cc b/src/relax/op/distributed/unary.cc
index 4356b403c6d9..8e4ccce23a9c 100644
--- a/src/relax/op/distributed/unary.cc
+++ b/src/relax/op/distributed/unary.cc
@@ -25,7 +25,7 @@ namespace distributed {
 
 Type InferDistTypeUnaryCheck(const Call& call, const BlockBuilder& ctx) {
   return InferDistTypeUnary<false>(call, ctx,
-                                   [](const TensorType& input_ty) { return DataType::Bool(); });
+                                   [](const TensorType& input_ty) { return PrimType::Bool(); });
 }
 
 RELAX_REGISTER_UNARY_ARITH_DIST_INFER_TYPE(abs, /*require_float_dtype=*/false);
diff --git a/src/relax/op/distributed/unary.h b/src/relax/op/distributed/unary.h
index 92c719ad0b98..58e0a41e27cb 100644
--- a/src/relax/op/distributed/unary.h
+++ b/src/relax/op/distributed/unary.h
@@ -40,15 +40,18 @@ Type InferDistTypeUnary(const Call& call, const BlockBuilder& ctx, FType f_compu
   distributed::DTensorType input_dtensor_ty = input_dtensor_tys[0];
   TensorType input_tensor_ty = input_dtensor_ty->tensor_ty;
 
+  PrimType input_dtype = input_tensor_ty->dtype;
+  // Unary op validation preserves the old float-kind check; lanes do not affect this policy.
   if (require_float_dtype && !input_tensor_ty->IsUnknownDtype() &&
-      !input_tensor_ty->dtype.is_float()) {
+      !input_dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << call->op
         << " requires the input tensor to have float dtype. However, the given input dtype is "
         << input_tensor_ty->dtype;
   }
   auto output_ty = ffi::make_object<TensorTypeNode>(*input_tensor_ty.get());
-  output_ty->dtype = f_compute_out_dtype(input_tensor_ty);
+  // PrimType is constructible from both PrimType and DLDataType results of the callback.
+  output_ty->dtype = PrimType(f_compute_out_dtype(input_tensor_ty));
   TensorType out_tensor_ty(output_ty);
   return distributed::DTensorType(out_tensor_ty, input_dtensor_ty->device_mesh,
                                   input_dtensor_ty->placement);
diff --git a/src/relax/op/image/resize.cc b/src/relax/op/image/resize.cc
index b92167e031f1..82b12c0fe26f 100644
--- a/src/relax/op/image/resize.cc
+++ b/src/relax/op/image/resize.cc
@@ -41,7 +41,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { Resize3DAttrs::RegisterReflection(); }
 Expr resize2d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout,
               ffi::String method, ffi::String coordinate_transformation_mode,
               ffi::String rounding_method, double cubic_alpha, int cubic_exclude,
-              double extrapolation_value, ffi::Optional<DataType> out_dtype) {
+              double extrapolation_value, ffi::Optional<DLDataType> out_dtype) {
   ffi::ObjectPtr<Resize2DAttrs> attrs = ffi::make_object<Resize2DAttrs>();
   attrs->roi = std::move(roi);
   attrs->layout = std::move(layout);
@@ -51,7 +51,7 @@ Expr resize2d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout
   attrs->cubic_alpha = cubic_alpha;
   attrs->cubic_exclude = cubic_exclude;
   attrs->extrapolation_value = extrapolation_value;
-  attrs->out_dtype = out_dtype.value_or(DataType::Void());
+  attrs->out_dtype = out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
 
   static const Op& op = Op::Get("relax.image.resize2d");
   return Call(op, {std::move(data), std::move(size)}, Attrs(attrs), {});
@@ -93,7 +93,9 @@ Type InferTypeResize2D(const Call& call, const BlockBuilder& ctx) {
                                                     /*tgt_layout=*/"NCHW",     //
                                                     /*tensor_name=*/"data");
 
-  DataType out_dtype = attrs->out_dtype.is_void() ? data_ty->dtype : attrs->out_dtype;
+  PrimType out_dtype = attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                           ? data_ty->dtype
+                           : PrimType(attrs->out_dtype);
 
   ffi::Optional<ShapeExpr> data_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, ffi::GetRef<TensorType>(data_ty), data_layout);
@@ -155,7 +157,7 @@ TVM_REGISTER_OP("relax.image.resize2d")
 Expr resize3d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout,
               ffi::String method, ffi::String coordinate_transformation_mode,
               ffi::String rounding_method, double cubic_alpha, int cubic_exclude,
-              double extrapolation_value, ffi::Optional<DataType> out_dtype) {
+              double extrapolation_value, ffi::Optional<DLDataType> out_dtype) {
   ffi::ObjectPtr<Resize3DAttrs> attrs = ffi::make_object<Resize3DAttrs>();
   attrs->roi = std::move(roi);
   attrs->layout = std::move(layout);
@@ -165,7 +167,7 @@ Expr resize3d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout
   attrs->cubic_alpha = cubic_alpha;
   attrs->cubic_exclude = cubic_exclude;
   attrs->extrapolation_value = extrapolation_value;
-  attrs->out_dtype = out_dtype.value_or(DataType::Void());
+  attrs->out_dtype = out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
 
   static const Op& op = Op::Get("relax.image.resize3d");
   return Call(op, {std::move(data), std::move(size)}, Attrs(attrs), {});
@@ -207,7 +209,9 @@ Type InferTypeResize3D(const Call& call, const BlockBuilder& ctx) {
                                                      /*tgt_layout=*/"NCDHW",    //
                                                      /*tensor_name=*/"data");
 
-  DataType out_dtype = attrs->out_dtype.is_void() ? data_ty->dtype : attrs->out_dtype;
+  PrimType out_dtype = attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                           ? data_ty->dtype
+                           : PrimType(attrs->out_dtype);
 
   ffi::Optional<ShapeExpr> data_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, ffi::GetRef<TensorType>(data_ty), data_layout);
@@ -315,7 +319,7 @@ Type InferTypeGridSample(const Call& call, const BlockBuilder& ctx) {
                                                    /*tgt_layout=*/is_ncdhw ? "NCDHW" : "NCHW",
                                                    /*tensor_name=*/"data");
 
-  DataType out_dtype = data_ty->dtype;
+  PrimType out_dtype = data_ty->dtype;
 
   ffi::Optional<ShapeExpr> data_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, ffi::GetRef<TensorType>(data_ty), data_layout);
@@ -422,7 +426,7 @@ Type InferTypeAffineGrid(const Call& call, const BlockBuilder& ctx) {
     }
   }
 
-  DataType out_dtype = data_ty->dtype;
+  PrimType out_dtype = data_ty->dtype;
 
   if (data_shape == nullptr || size_value == nullptr) {
     return TensorType(out_dtype, /*ndim=*/4, data_ty->vdevice);
diff --git a/src/relax/op/image/resize.h b/src/relax/op/image/resize.h
index 382a3a162be2..1aaed69f9146 100644
--- a/src/relax/op/image/resize.h
+++ b/src/relax/op/image/resize.h
@@ -36,13 +36,13 @@ namespace relax {
 Expr resize2d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout,
               ffi::String method, ffi::String coordinate_transformation_mode,
               ffi::String rounding_method, double cubic_alpha, int cubic_exclude,
-              double extrapolation_value, ffi::Optional<DataType> out_dtype);
+              double extrapolation_value, ffi::Optional<DLDataType> out_dtype);
 
 /*! \brief Image resize3d operator. */
 Expr resize3d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout,
               ffi::String method, ffi::String coordinate_transformation_mode,
               ffi::String rounding_method, double cubic_alpha, int cubic_exclude,
-              double extrapolation_value, ffi::Optional<DataType> out_dtype);
+              double extrapolation_value, ffi::Optional<DLDataType> out_dtype);
 
 /*! \brief Image grid_sample operator. */
 Expr grid_sample(Expr data, Expr grid, ffi::String method, ffi::String layout,
diff --git a/src/relax/op/memory/view.cc b/src/relax/op/memory/view.cc
index 25ad9aa66d8e..828eba4950f0 100644
--- a/src/relax/op/memory/view.cc
+++ b/src/relax/op/memory/view.cc
@@ -87,7 +87,7 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
     }
   }();
 
-  auto view_dtype = [&]() -> std::optional<DataType> {
+  auto view_dtype = [&]() -> std::optional<DLDataType> {
     Type ty = GetType(arg_dtype);
 
     if (HasVoidType(arg_dtype)) {
@@ -116,7 +116,7 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
     } else if (ty.as<ObjectTypeNode>()) {
       // The view changes the datatype, but we don't know what it is
       // being changed into.
-      return DataType::Void();
+      return DLDataType{kDLOpaqueHandle, 0, 0};
     } else {
       TVM_FFI_THROW(TypeError) << "Operator " << call->op
                                << " expects the dtype argument to be a relax::DataTypeImm, "
@@ -131,7 +131,7 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
       // No byte offset is specified, so no change is applied.
       return IntImm::Int64(0);
     } else if (auto prim_ty = ty.as<PrimTypeNode>()) {
-      TVM_FFI_CHECK_EQ(prim_ty->dtype, DataType::Int(64), TypeError)
+      TVM_FFI_CHECK_EQ(prim_ty->dtype, (DLDataType{kDLInt, 64, 1}), TypeError)
           << "Operator " << call->op
           << " expects the relative_byte_offset to be a 64-bit integer, but received "
           << arg_relative_byte_offset << ", which has type " << ty;
@@ -167,17 +167,16 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
     output_ndim = data_ty->ndim;
   }
 
-  DataType output_dtype = view_dtype.value_or(data_ty->dtype);
+  DLDataType output_raw_dtype = view_dtype.value_or(data_ty->dtype->dtype);
+  PrimType output_dtype(output_raw_dtype);
 
-  // Helper function, returns the number of bytes per vectorized
-  // element.  Cannot use `DataType::bytes`, as it returns the
-  // number of bytes per scalar element.
-  auto get_size_bytes = [](const DataType& dtype) -> ffi::Optional<IntImm> {
-    if (dtype.is_void()) {
+  // Returns bytes per vectorized element; std::nullopt for void or scalable-vector dtypes.
+  auto get_size_bytes = [](DLDataType dtype) -> ffi::Optional<IntImm> {
+    PrimType ty(dtype);
+    if (ty.IsVoid() || ty.IsScalableVector()) {
       return std::nullopt;
     } else {
-      auto size_bits = dtype.bits() * dtype.lanes();
-      return IntImm::Int64((size_bits + 7) / 8);
+      return IntImm::Int64(static_cast<int64_t>(ty.StorageBytes()));
     }
   };
 
@@ -199,8 +198,8 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<PrimExpr> input_nelements = get_num_elements(input_shape);
   ffi::Optional<PrimExpr> output_nelements = get_num_elements(output_shape);
 
-  ffi::Optional<IntImm> input_element_size = get_size_bytes(data_ty->dtype);
-  ffi::Optional<IntImm> output_element_size = get_size_bytes(output_dtype);
+  ffi::Optional<IntImm> input_element_size = get_size_bytes(data_ty->dtype->dtype);
+  ffi::Optional<IntImm> output_element_size = get_size_bytes(output_raw_dtype);
 
   if (input_nelements && output_nelements && input_element_size && output_element_size &&
       view_relative_byte_offset) {
@@ -329,8 +328,9 @@ Expr LowerBuiltinView(const BlockBuilder& bb, const Call& call) {
   }
 
   if (HasVoidType(dtype)) {
-    auto data_dtype = data->ty.as<TensorType>().value()->dtype;
-    TVM_FFI_ICHECK(!data_dtype.is_void())
+    DLDataType data_dtype = data->ty.as<TensorType>().value()->dtype->dtype;
+    // The dtype falls back to the input's, so the input dtype must be known (non-void).
+    TVM_FFI_ICHECK(!PrimType(data_dtype).IsVoid())
         << "Legalization of " << call->op
         << " requires that either the output dtype be explicitly specified, "
         << "or the input dtype is known.  "
diff --git a/src/relax/op/nn/attention.cc b/src/relax/op/nn/attention.cc
index 83080537c1d0..62e7d2959346 100644
--- a/src/relax/op/nn/attention.cc
+++ b/src/relax/op/nn/attention.cc
@@ -143,7 +143,7 @@ Type InferTypeAttention(const Call& call, const BlockBuilder& ctx) {
   return TensorType(ShapeExpr(output_shape), q_ty->dtype, q_ty->vdevice);
 }
 
-Call InferMixedPrecisionAttention(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionAttention(const Call& call, DLDataType out_dtype) {
   return attention(call->args[0], call->args[1], call->args[2], std::nullopt, std::nullopt,
                    std::nullopt, std::nullopt)
       .as_or_throw<Call>();
diff --git a/src/relax/op/nn/convolution.cc b/src/relax/op/nn/convolution.cc
index 1fa9b9b1ae94..90d58a9e662d 100644
--- a/src/relax/op/nn/convolution.cc
+++ b/src/relax/op/nn/convolution.cc
@@ -47,7 +47,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Expr conv1d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype) {
+            ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding1D(std::move(padding));
 
   TVM_FFI_ICHECK_GT(groups, 0)
@@ -62,7 +62,8 @@ Expr conv1d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int6
   return MakeConv<Conv1DAttrs>(std::move(data), std::move(weight), std::move(strides),
                                std::move(padding), std::move(dilation), groups, data_layout,
                                std::move(kernel_layout), out_layout.value_or(data_layout),
-                               out_dtype.value_or(DataType::Void()), /*op_name=*/"relax.nn.conv1d");
+                               out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0})),
+                               /*op_name=*/"relax.nn.conv1d");
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -91,9 +92,9 @@ Type InferTypeConv1d(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -186,7 +187,7 @@ InferLayoutOutput InferLayoutConv1d(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv1d(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv1d(const Call& call, DLDataType out_dtype) {
   const auto* conv1d_attrs = call->attrs.as<Conv1DAttrs>();
   return conv1d(call->args[0], call->args[1], conv1d_attrs->strides, conv1d_attrs->padding,
                 conv1d_attrs->dilation, conv1d_attrs->groups, conv1d_attrs->data_layout,
@@ -210,7 +211,7 @@ TVM_REGISTER_OP("relax.nn.conv1d")
 Expr conv2d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype) {
+            ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding2D(std::move(padding));
   if (strides.size() == 1) {
     strides.push_back(strides[0]);
@@ -231,7 +232,8 @@ Expr conv2d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int6
   return MakeConv<Conv2DAttrs>(std::move(data), std::move(weight), std::move(strides),
                                std::move(padding), std::move(dilation), groups, data_layout,
                                std::move(kernel_layout), out_layout.value_or(data_layout),
-                               out_dtype.value_or(DataType::Void()), /*op_name=*/"relax.nn.conv2d");
+                               out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0})),
+                               /*op_name=*/"relax.nn.conv2d");
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -260,9 +262,9 @@ Type InferTypeConv2d(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -336,9 +338,10 @@ InferLayoutOutput InferLayoutConv2d(
     SLayout desired_data_layout = (*it).second[0];
     SLayout desired_weight_layout = (*it).second[1];
     SLayout desired_output_layout = (*it).second.size() == 3 ? (*it).second[2] : (*it).second[0];
-    tirx::SLayout input_layout(attrs->data_layout, DataType::Int(64));
-    tirx::SLayout kernel_layout(attrs->kernel_layout, DataType::Int(64));
-    tirx::SLayout out_layout(attrs->out_layout, DataType::Int(64));
+    tvm::PrimType i64_ty = tvm::PrimType::Int(64);
+    tirx::SLayout input_layout(attrs->data_layout, i64_ty);
+    tirx::SLayout kernel_layout(attrs->kernel_layout, i64_ty);
+    tirx::SLayout out_layout(attrs->out_layout, i64_ty);
 
     if ((desired_data_layout.ndim() == input_layout.ndim()) &&
         (desired_weight_layout.ndim() == kernel_layout.ndim()) &&
@@ -396,7 +399,7 @@ InferLayoutOutput InferLayoutConv2d(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv2d(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv2d(const Call& call, DLDataType out_dtype) {
   const auto* conv2d_attrs = call->attrs.as<Conv2DAttrs>();
   return conv2d(call->args[0], call->args[1], conv2d_attrs->strides, conv2d_attrs->padding,
                 conv2d_attrs->dilation, conv2d_attrs->groups, conv2d_attrs->data_layout,
@@ -420,7 +423,7 @@ TVM_REGISTER_OP("relax.nn.conv2d")
 Expr conv3d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype) {
+            ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding3D(std::move(padding));
   if (strides.size() == 1) {
     strides.push_back(strides[0]);
@@ -443,7 +446,8 @@ Expr conv3d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int6
   return MakeConv<Conv3DAttrs>(std::move(data), std::move(weight), std::move(strides),
                                std::move(padding), std::move(dilation), groups, data_layout,
                                std::move(kernel_layout), out_layout.value_or(data_layout),
-                               out_dtype.value_or(DataType::Void()), /*op_name=*/"relax.nn.conv3d");
+                               out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0})),
+                               /*op_name=*/"relax.nn.conv3d");
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -472,9 +476,9 @@ Type InferTypeConv3d(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -581,7 +585,7 @@ InferLayoutOutput InferLayoutConv3d(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv3d(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv3d(const Call& call, DLDataType out_dtype) {
   const auto* conv3d_attrs = call->attrs.as<Conv3DAttrs>();
   return conv3d(call->args[0], call->args[1], conv3d_attrs->strides, conv3d_attrs->padding,
                 conv3d_attrs->dilation, conv3d_attrs->groups, conv3d_attrs->data_layout,
@@ -604,7 +608,7 @@ Expr conv1d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype) {
+                      ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding1D(std::move(padding));
 
   TVM_FFI_ICHECK_GT(groups, 0)
@@ -630,7 +634,7 @@ Expr conv1d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
   attrs->data_layout = data_layout;
   attrs->kernel_layout = std::move(kernel_layout);
   attrs->out_layout = out_layout.value_or(data_layout);
-  attrs->out_dtype = std::move(out_dtype.value_or(DataType::Void()));
+  attrs->out_dtype = out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   const Op& op = Op::Get("relax.nn.conv1d_transpose");
   return Call(op, {data, weight}, Attrs(attrs), {});
 }
@@ -660,9 +664,9 @@ Type InferTypeConv1dTranspose(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -758,7 +762,7 @@ InferLayoutOutput InferLayoutConv1dTranspose(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv1dTranspose(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv1dTranspose(const Call& call, DLDataType out_dtype) {
   const auto* conv1d_transpose_attrs = call->attrs.as<Conv1DTransposeAttrs>();
   return conv1d_transpose(call->args[0], call->args[1], conv1d_transpose_attrs->strides,
                           conv1d_transpose_attrs->padding, conv1d_transpose_attrs->output_padding,
@@ -786,7 +790,7 @@ Expr conv2d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype) {
+                      ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding2D(std::move(padding));
   if (output_padding.size() == 1) {
     output_padding.push_back(output_padding[0]);
@@ -821,7 +825,7 @@ Expr conv2d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
   attrs->data_layout = data_layout;
   attrs->kernel_layout = std::move(kernel_layout);
   attrs->out_layout = out_layout.value_or(data_layout);
-  attrs->out_dtype = std::move(out_dtype.value_or(DataType::Void()));
+  attrs->out_dtype = out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   const Op& op = Op::Get("relax.nn.conv2d_transpose");
   return Call(op, {data, weight}, Attrs(attrs), {});
 }
@@ -852,9 +856,9 @@ Type InferTypeConv2dTranspose(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -987,7 +991,7 @@ InferLayoutOutput InferLayoutConv2dTranspose(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv2dTranspose(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv2dTranspose(const Call& call, DLDataType out_dtype) {
   const auto* conv2d_transpose_attrs = call->attrs.as<Conv2DTransposeAttrs>();
   return conv2d_transpose(call->args[0], call->args[1], conv2d_transpose_attrs->strides,
                           conv2d_transpose_attrs->padding, conv2d_transpose_attrs->output_padding,
@@ -1015,7 +1019,7 @@ Expr conv3d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype) {
+                      ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding3D(std::move(padding));
   if (output_padding.size() == 1) {
     output_padding.push_back(output_padding[0]);
@@ -1053,7 +1057,7 @@ Expr conv3d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
   attrs->data_layout = data_layout;
   attrs->kernel_layout = std::move(kernel_layout);
   attrs->out_layout = out_layout.value_or(data_layout);
-  attrs->out_dtype = std::move(out_dtype.value_or(DataType::Void()));
+  attrs->out_dtype = out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   const Op& op = Op::Get("relax.nn.conv3d_transpose");
   return Call(op, {data, weight}, Attrs(attrs), {});
 }
@@ -1084,9 +1088,9 @@ Type InferTypeConv3dTranspose(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -1227,7 +1231,7 @@ InferLayoutOutput InferLayoutConv3dTranspose(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv3dTranspose(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv3dTranspose(const Call& call, DLDataType out_dtype) {
   const auto* conv3d_transpose_attrs = call->attrs.as<Conv3DTransposeAttrs>();
   return conv3d_transpose(call->args[0], call->args[1], conv3d_transpose_attrs->strides,
                           conv3d_transpose_attrs->padding, conv3d_transpose_attrs->output_padding,
diff --git a/src/relax/op/nn/convolution.h b/src/relax/op/nn/convolution.h
index b08eb8a83ff8..b33a19f07057 100644
--- a/src/relax/op/nn/convolution.h
+++ b/src/relax/op/nn/convolution.h
@@ -39,7 +39,7 @@ template <typename T>
 inline Expr MakeConv(Expr data, Expr weight, ffi::Array<int64_t> strides,
                      ffi::Array<int64_t> padding, ffi::Array<int64_t> dilation, int groups,
                      ffi::String data_layout, ffi::String kernel_layout, ffi::String out_layout,
-                     DataType out_dtype, std::string op_name) {
+                     DLDataType out_dtype, std::string op_name) {
   auto attrs = ffi::make_object<T>();
   attrs->strides = std::move(strides);
   attrs->padding = std::move(padding);
@@ -48,7 +48,7 @@ inline Expr MakeConv(Expr data, Expr weight, ffi::Array<int64_t> strides,
   attrs->data_layout = std::move(data_layout);
   attrs->kernel_layout = std::move(kernel_layout);
   attrs->out_layout = std::move(out_layout);
-  attrs->out_dtype = std::move(out_dtype);
+  attrs->out_dtype = out_dtype;
   const Op& op = Op::Get(op_name);
   return Call(op, {data, weight}, Attrs(attrs), {});
 }
@@ -57,19 +57,19 @@ inline Expr MakeConv(Expr data, Expr weight, ffi::Array<int64_t> strides,
 Expr conv1d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype);
+            ffi::Optional<DLDataType> out_dtype);
 
 /*! \brief 2D convolution */
 Expr conv2d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype);
+            ffi::Optional<DLDataType> out_dtype);
 
 /*! \brief 3D convolution */
 Expr conv3d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype);
+            ffi::Optional<DLDataType> out_dtype);
 
 /*!
  * \brief One dimensional transposed convolution operator.
@@ -81,7 +81,7 @@ Expr conv1d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype);
+                      ffi::Optional<DLDataType> out_dtype);
 
 /*!
  * \brief Two dimensional transposed convolution operator.
@@ -93,7 +93,7 @@ Expr conv2d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype);
+                      ffi::Optional<DLDataType> out_dtype);
 
 /*!
  * \brief Three dimensional transposed convolution operator.
@@ -105,7 +105,7 @@ Expr conv3d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype);
+                      ffi::Optional<DLDataType> out_dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/op/nn/nn.cc b/src/relax/op/nn/nn.cc
index b24f81c72d49..5deb6db937bb 100644
--- a/src/relax/op/nn/nn.cc
+++ b/src/relax/op/nn/nn.cc
@@ -122,7 +122,9 @@ Type InferTypePRelu(const Call& call, const BlockBuilder& ctx) {
   if (data_ty->IsUnknownNdim()) {
     return data_ty;
   }
-  if (!data_ty->IsUnknownDtype() && !data_ty->dtype.is_float()) {
+  PrimType data_dtype = data_ty->dtype;
+  // PRelu preserves the old float-kind check; vector lanes are irrelevant to this check.
+  if (!data_ty->IsUnknownDtype() && !data_dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     TVM_FFI_VISIT_THROW(TypeError, call) << "Prelu requires the input tensor to have float "
                                             "dtype. However, the given input dtype is "
                                          << data_ty->dtype;
@@ -186,10 +188,14 @@ Type InferTypeSoftmax(const Call& call, const BlockBuilder& ctx) {
   if (data_ty->IsUnknownNdim()) {
     return data_ty;
   }
-  if (!data_ty->IsUnknownDtype() && !data_ty->dtype.is_float() && !data_ty->dtype.is_bfloat()) {
-    TVM_FFI_VISIT_THROW(TypeError, call) << "Softmax requires the input tensor to have float "
-                                            "dtype. However, the given input dtype is "
-                                         << data_ty->dtype;
+  if (!data_ty->IsUnknownDtype()) {
+    PrimType data_dtype = data_ty->dtype;
+    // Softmax only requires a floating element kind; lane encoding is irrelevant to the check.
+    if (!data_dtype.MatchesCode(kDLFloat, kDLBfloat)) {
+      TVM_FFI_VISIT_THROW(TypeError, call) << "Softmax requires the input tensor to have float "
+                                              "dtype. However, the given input dtype is "
+                                           << data_ty->dtype;
+    }
   }
   const auto* attrs = call->attrs.as<SoftmaxAttrs>();
   NormalizeAxis(call, ctx, data_ty->ndim, attrs->axis);
@@ -380,10 +386,14 @@ bool NormCheckDtypeAndShape(const Call& call, const BlockBuilder& ctx,
     axes_non_neg = NormalizeAxes(call, ctx, data_ty->ndim, axes);
   }
   int n_axis = axes.size();
-  if (!data_ty->IsUnknownDtype() && (!data_ty->dtype.is_float() && !data_ty->dtype.is_bfloat())) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << op << " requires the input data to have float dtype. However, the given data dtype is "
-        << data_ty->dtype;
+  if (!data_ty->IsUnknownDtype()) {
+    PrimType data_dtype = data_ty->dtype;
+    // Norm ops only require a floating element kind; lane encoding is irrelevant to the check.
+    if (!data_dtype.MatchesCode(kDLFloat, kDLBfloat)) {
+      TVM_FFI_VISIT_THROW(TypeError, call)
+          << op << " requires the input data to have float dtype. However, the given data dtype is "
+          << data_ty->dtype;
+    }
   }
   for (int i = 1; i < n_input; ++i) {
     if (input_ty[i]->dtype != data_ty->dtype) {
@@ -462,7 +472,7 @@ Type InferTypeBatchNorm(const Call& call, const BlockBuilder& ctx) {
   const auto* attrs = call->attrs.as<BatchNormAttrs>();
   bool unknown_shape = NormCheckDtypeAndShape(call, ctx, input_ty, {attrs->axis});
 
-  DataType dtype = input_ty[0]->dtype;
+  PrimType dtype = input_ty[0]->dtype;
   if (unknown_shape) {
     auto vdev = input_ty[0]->vdevice;
     return TupleType({TensorType(dtype, input_ty[0]->ndim, vdev),
@@ -620,7 +630,9 @@ Type InferTypeGroupNorm(const Call& call, const BlockBuilder& ctx) {
           << channel_axis << ", axes: " << attrs->axes;
     }
   }
-  if (!data_ty->IsUnknownDtype() && !data_ty->dtype.is_float()) {
+  PrimType data_dtype = data_ty->dtype;
+  // GroupNorm preserves the old float-kind check; vector lanes are irrelevant to this check.
+  if (!data_ty->IsUnknownDtype() && !data_dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << op << " expects that data must be float, but got " << data_ty->dtype;
   }
@@ -890,7 +902,7 @@ Type InferTypeCrossEntropy(const Call& call, const BlockBuilder& ctx) {
   TensorType label_ty = input_ty[1];
 
   // infer dtype
-  DataType dtype = InferBinaryArithOpOutDtype(call, ctx, pred_ty, label_ty);
+  PrimType dtype(InferBinaryArithOpOutDtype(call, ctx, pred_ty, label_ty));
 
   // infer vdevice
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, pred_ty, label_ty);
@@ -1002,23 +1014,26 @@ Type InferTypeNLLLoss(const Call& call, const BlockBuilder& ctx) {
   }
 
   // infer dtype, vdevice
-  DataType output_dtype;
-  ffi::Optional<VDevice> vdevice;
-  if (wgt_ty != nullptr) {
-    output_dtype = InferBinaryArithOpOutDtype(call, ctx, ffi::GetRef<TensorType>(pred_ty),
-                                              ffi::GetRef<TensorType>(wgt_ty));
-    vdevice = InferBinaryArithOpOutVDevice(call, ctx, ffi::GetRef<TensorType>(pred_ty),
-                                           ffi::GetRef<TensorType>(wgt_ty));
-  } else {
-    output_dtype = pred_ty->dtype;
-    vdevice = pred_ty->vdevice;
-  }
+  PrimType output_dtype =
+      wgt_ty != nullptr
+          ? PrimType(InferBinaryArithOpOutDtype(call, ctx, ffi::GetRef<TensorType>(pred_ty),
+                                                ffi::GetRef<TensorType>(wgt_ty)))
+          : pred_ty->dtype;
+  ffi::Optional<VDevice> vdevice =
+      wgt_ty != nullptr ? InferBinaryArithOpOutVDevice(call, ctx, ffi::GetRef<TensorType>(pred_ty),
+                                                       ffi::GetRef<TensorType>(wgt_ty))
+                        : pred_ty->vdevice;
 
   // the type of targets must be int/uint.
-  if (!tgt_ty->IsUnknownDtype() && !tgt_ty->dtype.is_int() && !tgt_ty->dtype.is_uint()) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << "NLLLoss expects the dtype of targets to be int/uint. However, the dtype of targets is "
-        << tgt_ty->dtype;
+  if (!tgt_ty->IsUnknownDtype()) {
+    PrimType target_dtype = tgt_ty->dtype;
+    // NLLLoss only needs the target element kind; vector lanes do not affect target indexing.
+    if (!target_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+        !target_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+      TVM_FFI_VISIT_THROW(TypeError, call) << "NLLLoss expects the dtype of targets to be "
+                                              "int/uint. However, the dtype of targets is "
+                                           << tgt_ty->dtype;
+    }
   }
 
   // infer ndim
diff --git a/src/relax/op/nn/pooling.cc b/src/relax/op/nn/pooling.cc
index 856cd75c5902..84f994bc612f 100644
--- a/src/relax/op/nn/pooling.cc
+++ b/src/relax/op/nn/pooling.cc
@@ -275,7 +275,8 @@ InferLayoutOutput InferLayoutPool2d(
   ffi::ObjectPtr<Pool2DAttrs> new_attrs = ffi::make_object<Pool2DAttrs>(*attrs);
 
   if (layout->layout.ndim() != layout->layout.ndim_primal()) {
-    tirx::SLayout in_layout(attrs->layout, DataType::Int(64));
+    tvm::PrimType i64_ty = tvm::PrimType::Int(64);
+    tirx::SLayout in_layout(attrs->layout, i64_ty);
     auto desired_layout = TransposeSubLayoutLike(attrs->layout, InitialLayout(4), layout->layout);
     auto data_si = GetType(call->args[0]);
     TensorType data_ty = data_si.as<TensorType>().value();
@@ -675,7 +676,8 @@ InferLayoutOutput InferLayoutAdaptiveAvgPool2D(
   LayoutDecision layout = GetLayoutDecision(var_layout_map, call->args[0]);
   ffi::ObjectPtr<AdaptivePool2DAttrs> new_attrs = ffi::make_object<AdaptivePool2DAttrs>(*attrs);
   if (layout->layout.ndim() != layout->layout.ndim_primal()) {
-    tirx::SLayout in_layout(attrs->layout, DataType::Int(64));
+    tvm::PrimType i64_ty = tvm::PrimType::Int(64);
+    tirx::SLayout in_layout(attrs->layout, i64_ty);
     auto desired_layout = TransposeSubLayoutLike(attrs->layout, InitialLayout(4), layout->layout);
     auto data_si = GetType(call->args[0]);
     TensorType data_ty = data_si.as<TensorType>().value();
diff --git a/src/relax/op/op.cc b/src/relax/op/op.cc
index 9c58ab769950..16e5d5f20d0e 100644
--- a/src/relax/op/op.cc
+++ b/src/relax/op/op.cc
@@ -409,9 +409,9 @@ static ffi::Optional<Type> InferCallTIROutputTypeFromArguments(
       TVM_FFI_ICHECK(packed_tuple_ty);
       PrimType dummy_arg_ty = [&]() {
         if (packed_tuple_ty->values) {
-          return PrimType(packed_tuple_ty->values.value()[i].dtype());
+          return PrimType(packed_tuple_ty->values.value()[i].ty());
         } else {
-          return PrimType(DataType::Int(64));
+          return PrimType::Int(64);
         }
       }();
       dummy_args.push_back(Var("dummy_trailing_arg", dummy_arg_ty));
@@ -1119,7 +1119,7 @@ Type InferTypeSize(const Call& call, const BlockBuilder& ctx) {
   auto* tensor_ty = GetType(call->args[0]).as<TensorTypeNode>();
   TVM_FFI_ICHECK(tensor_ty) << "size expects a tensor input, but received " << arg_ty
                             << "; use MatchCast if necessary";
-  return TensorType(ShapeExpr(ffi::Array<PrimExpr>{}), DataType::Int(64));
+  return TensorType(ShapeExpr(ffi::Array<PrimExpr>{}), PrimType::Int(64));
 }
 
 TVM_REGISTER_OP("relax.size")
@@ -1182,7 +1182,7 @@ Type ReturnShapeToTensorType(const Call& call, const BlockBuilder& ctx) {
   const auto* ty = GetTypeAs<ShapeTypeNode>(call->args[0]);
   TVM_FFI_ICHECK(ty);
   int32_t ndim = ty->ndim;
-  return TensorType(ShapeExpr({PrimExpr(ndim)}), DataType::Int(64));
+  return TensorType(ShapeExpr({PrimExpr(ndim)}), PrimType::Int(64));
 }
 
 TVM_REGISTER_OP("relax.shape_to_tensor")
@@ -1209,10 +1209,10 @@ Type InferTypeAllocateTensor(const Call& call, const BlockBuilder& ctx) {
       << "must be ShapeExpr, but got " << call->args[0]->GetTypeKey();
   TVM_FFI_ICHECK(call->args[1].as<DataTypeImmNode>())
       << "must be DataTypeImm, but got " << call->args[1]->GetTypeKey();
-  DataType out_dtype;
+  PrimType out_dtype = PrimType::Void();
   if (const auto* dtype_node = call->args[1].as<DataTypeImmNode>()) {
     const DataTypeImm dtype_imm = ffi::GetRef<DataTypeImm>(dtype_node);
-    out_dtype = dtype_imm->value;
+    out_dtype = PrimType(dtype_imm->value);
   }
   int64_t vdevice_index = -1;
   if (auto* prim_value_node = call->args[2].as<PrimValueNode>()) {
@@ -1284,10 +1284,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Type InferTypeMemAllocTensor(const Call& call, const BlockBuilder& ctx) {
   TVM_FFI_ICHECK(GetTypeAs<ShapeTypeNode>(call->args[2]))
       << "must be a Expr of ShapeType, but got " << call->args[1]->GetTypeKey();
-  DataType out_dtype;
+  PrimType out_dtype = PrimType::Void();
   if (const auto* dtype_node = call->args[3].as<DataTypeImmNode>()) {
     const DataTypeImm dtype_imm = ffi::GetRef<DataTypeImm>(dtype_node);
-    out_dtype = dtype_imm->value;
+    out_dtype = PrimType(dtype_imm->value);
   }
 
   if (call->args.size() == 5) {
@@ -1408,10 +1408,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // vm alloc_tensor
 
 Type InferTypeVMAllocTensor(const Call& call, const BlockBuilder& ctx) {
-  DataType out_dtype;
+  PrimType out_dtype = PrimType::Void();
   if (const auto* dtype_node = call->args[3].as<DataTypeImmNode>()) {
     const DataTypeImm dtype_imm = ffi::GetRef<DataTypeImm>(dtype_node);
-    out_dtype = dtype_imm->value;
+    out_dtype = PrimType(dtype_imm->value);
   }
   int64_t vdevice_index = -1;
   if (auto* prim_value_node = call->args[4].as<PrimValueNode>()) {
diff --git a/src/relax/op/op_common.h b/src/relax/op/op_common.h
index cb0d6034e2d1..a19f59d4d56a 100644
--- a/src/relax/op/op_common.h
+++ b/src/relax/op/op_common.h
@@ -33,6 +33,7 @@
 
 #include <optional>
 #include <tuple>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -184,14 +185,12 @@ std::tuple<ArgTypes...> GetArgType(const Call& call, const BlockBuilder& ctx) {
     tvm::ffi::reflection::GlobalDef().def("relax.op." OpRegName, OpName); \
   }
 
-/************ Utilities ************/
-
 /*!
  * \brief Infer the type for unary elementwise ops.
  * \param call The context Call to the operator.
  * \param ctx The error reporting context.
  * \param f_compute_out_dtype The function to compute the output dtype, with
- * signature DataType f_compute_out_dtype(const TensorType& input_ty).
+ * signature `R f_compute_out_dtype(const TensorType& input_ty)`, R = DLDataType or PrimType.
  * \tparam require_float_dtype whether this op requires the input dtype to be float
  * \tparam Ftype the type of f_compute_out_dtype
  * \return The inferred type.
@@ -199,15 +198,21 @@ std::tuple<ArgTypes...> GetArgType(const Call& call, const BlockBuilder& ctx) {
 template <bool require_float_dtype, typename FType>
 inline Type InferTypeUnary(const Call& call, const BlockBuilder& ctx, FType f_compute_out_dtype) {
   TensorType input_ty = GetUnaryInputTensorType(call, ctx);
+  DLDataType input_dtype = input_ty->dtype->dtype;
   if (require_float_dtype && !input_ty->IsUnknownDtype() &&
-      (!input_ty->dtype.is_float() && !input_ty->dtype.is_bfloat())) {
+      (input_dtype.code != kDLFloat && input_dtype.code != kDLBfloat)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << call->op
         << " requires the input tensor to have float dtype. However, the given input dtype is "
         << input_ty->dtype;
   }
   auto output_ty = ffi::make_object<TensorTypeNode>(*input_ty.get());
-  output_ty->dtype = f_compute_out_dtype(input_ty);
+  auto computed_dtype = f_compute_out_dtype(input_ty);
+  if constexpr (std::is_same_v<std::decay_t<decltype(computed_dtype)>, PrimType>) {
+    output_ty->dtype = computed_dtype;
+  } else {
+    output_ty->dtype = PrimType(computed_dtype);
+  }
   if (call->ty_args.size() > 0) {
     auto defined_ty = call->ty_args[0].as<TensorTypeNode>();
     TVM_FFI_ICHECK(defined_ty);
@@ -274,9 +279,9 @@ InferLayoutOutput InferLayoutUnaryEwise(
  * \return The inferred element dtype.
  * \throw Throw exception if the Type doesn't have an element type.
  */
-inline std::optional<DataType> GetElementDType(const Type& ty) {
+inline std::optional<PrimType> GetElementDType(const Type& ty) {
   if (const auto* prim = ty.as<PrimTypeNode>()) {
-    return prim->dtype;
+    return ffi::GetRef<PrimType>(prim);
   } else if (const auto* tensor = ty.as<TensorTypeNode>()) {
     return tensor->dtype;
   } else {
@@ -296,8 +301,8 @@ inline std::optional<DataType> GetElementDType(const Type& ty) {
  * \return The inferred output dtype.
  * \throw Throw exception if the dtype of two input TensorType don’t match
  */
-inline DataType InferBinaryArithOpOutDtype(const Call& call, const BlockBuilder& ctx,
-                                           const Type& lhs_ty, const Type& rhs_ty) {
+inline DLDataType InferBinaryArithOpOutDtype(const Call& call, const BlockBuilder& ctx,
+                                             const Type& lhs_ty, const Type& rhs_ty) {
   auto opt_lhs_dtype = GetElementDType(lhs_ty);
   if (!opt_lhs_dtype) {
     TVM_FFI_VISIT_THROW(TypeError, call)
@@ -318,15 +323,17 @@ inline DataType InferBinaryArithOpOutDtype(const Call& call, const BlockBuilder&
   }
   auto rhs_dtype = opt_rhs_dtype.value();
 
-  if (lhs_dtype.is_void() || rhs_dtype.is_void()) {
-    return DataType::Void();
-  } else if (lhs_dtype != rhs_dtype && !lhs_dtype.is_bool() && !rhs_dtype.is_bool()) {
+  if (lhs_dtype.IsVoid() || rhs_dtype.IsVoid()) {
+    return DLDataType{kDLOpaqueHandle, 0, 0};
+  } else if (lhs_dtype->dtype != rhs_dtype->dtype &&
+             !lhs_dtype.MatchesCode(DLDataTypeCode::kDLBool) &&
+             !rhs_dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Binary operators must have the same datatype for both operands.  "
         << "However, " << call << " uses datatype " << lhs_dtype << " on the LHS (Type of "
         << lhs_ty << "), and datatype " << rhs_dtype << " on the RHS (Type of " << rhs_ty << ").";
   }
-  return lhs_dtype;
+  return lhs_dtype->dtype;
 }
 
 /*!
@@ -469,7 +476,7 @@ bool IsIdentityPermutation(const std::vector<int>& permutation);
  */
 inline ffi::Array<IntImm> ConvertIntImmToInt64(const ffi::Array<IntImm>& int_imms) {
   return int_imms.Map(
-      [](const IntImm& i) { return cast(DataType::Int(64), i).as_or_throw<IntImm>(); });
+      [](const IntImm& i) { return cast(PrimType::Int(64), i).as_or_throw<IntImm>(); });
 }
 
 /************ Utilities for NN operators ************/
@@ -560,8 +567,9 @@ inline ffi::Array<int64_t> GetCompletePadding3D(ffi::Array<int64_t> padding) {
 inline std::pair<tirx::SLayout, tirx::SBijectiveLayout> CheckTensorLayout(
     const Call& call, const BlockBuilder& ctx, const ffi::String& tensor_layout,
     const ffi::String& tgt_layout, const ffi::String& tensor_name) {
-  tirx::SLayout _tensor_layout(tensor_layout, DataType::Int(64));
-  tirx::SBijectiveLayout tensor2tgt(_tensor_layout, tirx::SLayout(tgt_layout, DataType::Int(64)));
+  tvm::PrimType i64_ty = tvm::PrimType::Int(64);
+  tirx::SLayout _tensor_layout(tensor_layout, i64_ty);
+  tirx::SBijectiveLayout tensor2tgt(_tensor_layout, tirx::SLayout(tgt_layout, i64_ty));
   if (!tensor2tgt.defined()) {
     TVM_FFI_VISIT_THROW(ValueError, call)
         << call->op << " requires the given " << tensor_name << " layout to be convertible from "
diff --git a/src/relax/op/tensor/binary.cc b/src/relax/op/tensor/binary.cc
index 84c411238473..cbc786de0f8e 100644
--- a/src/relax/op/tensor/binary.cc
+++ b/src/relax/op/tensor/binary.cc
@@ -51,11 +51,11 @@ Type InferTypeBroadcast(const Call& call, const BlockBuilder& ctx, FType f_compu
       << "Arguments to binary operators must be either R.Tensor or R.Prim types, "
       << "but expression " << call << " has RHS " << call->args[1] << ", which has Type " << rhs_ty;
 
-  // DateType
-  DataType output_dtype = f_compute_out_dtype(call, ctx, lhs_ty, rhs_ty);
+  // Dtype
+  PrimType output_dtype(f_compute_out_dtype(call, ctx, lhs_ty, rhs_ty));
 
   if (lhs_ty.as<PrimTypeNode>() && rhs_ty.as<PrimTypeNode>()) {
-    return PrimType(output_dtype);
+    return output_dtype;
   }
 
   // VDevice
@@ -136,7 +136,7 @@ Type InferTypeBroadcastArith(const Call& call, const BlockBuilder& ctx) {
 Type InferTypeBroadcastCMP(const Call& call, const BlockBuilder& ctx) {
   return InferTypeBroadcast(call, ctx,
                             [](const Call& call, const BlockBuilder& ctx, const Type& lhs_ty,
-                               const Type& rhs_ty) { return DataType::Bool(); });
+                               const Type& rhs_ty) { return DLDataType{kDLBool, 8, 1}; });
 }
 
 InferLayoutOutput InferLayoutBinaryEwise(
diff --git a/src/relax/op/tensor/create.cc b/src/relax/op/tensor/create.cc
index e7a972896569..fbe3a0b0c534 100644
--- a/src/relax/op/tensor/create.cc
+++ b/src/relax/op/tensor/create.cc
@@ -46,7 +46,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 /* relax.full */
 Expr full(ffi::Variant<Expr, ffi::Array<PrimExpr>> shape, Expr fill_value,
-          ffi::Optional<DataType> dtype) {
+          ffi::Optional<DLDataType> dtype) {
   Expr shape_in_expr{nullptr};
   if (const auto* expr = shape.as<ExprNode>()) {
     shape_in_expr = ffi::GetRef<Expr>(expr);
@@ -59,7 +59,7 @@ Expr full(ffi::Variant<Expr, ffi::Array<PrimExpr>> shape, Expr fill_value,
   }
 
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
-  attrs->dtype = dtype.value_or(DataType::Void());
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
 
   static const Op& op = Op::Get("relax.full");
   return Call(op, {std::move(shape_in_expr), std::move(fill_value)}, Attrs(attrs), {});
@@ -88,7 +88,8 @@ Type InferTypeFull(const Call& call, const BlockBuilder& ctx) {
   }
 
   const auto* attrs = call->attrs.as<InitAttrs>();
-  DataType out_dtype = attrs->dtype.is_void() ? fill_value_ty->dtype : attrs->dtype;
+  PrimType out_dtype = attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0} ? fill_value_ty->dtype
+                                                                         : PrimType(attrs->dtype);
   return TensorType(/*shape=*/call->args[0], out_dtype, fill_value_ty->vdevice);
 }
 
@@ -104,9 +105,9 @@ TVM_REGISTER_OP("relax.full")
     .set_attr<bool>("FPurity", true);
 
 /* relax.full_like */
-Expr full_like(Expr x, Expr fill_value, ffi::Optional<DataType> dtype) {
+Expr full_like(Expr x, Expr fill_value, ffi::Optional<DLDataType> dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
-  attrs->dtype = dtype.value_or(DataType::Void());
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   static const Op& op = Op::Get("relax.full_like");
   return Call(op, {std::move(x), std::move(fill_value)}, Attrs(attrs), {});
 }
@@ -127,11 +128,11 @@ Type InferTypeFullLike(const Call& call, const BlockBuilder& ctx) {
   }
 
   const auto* attrs = call->attrs.as<InitAttrs>();
-  if (attrs->dtype.is_void()) {
+  if (attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0}) {
     return data_ty;
   } else {
     auto output_ty = ffi::make_object<TensorTypeNode>(*data_ty.get());
-    output_ty->dtype = attrs->dtype;
+    output_ty->dtype = PrimType(attrs->dtype);
     return TensorType(output_ty);
   }
 }
@@ -158,25 +159,26 @@ Type InferTypeOnesZeros(const Call& call, const BlockBuilder& ctx) {
         << call->args[0]->ty->GetTypeKey();
   }
   const auto* attrs = call->attrs.as<InitAttrs>();
-  return TensorType(/*shape=*/call->args[0], attrs->dtype);
+  return TensorType(/*shape=*/call->args[0], PrimType(attrs->dtype));
 }
 
 // Structure info inference for ones_like and zeros_like
 Type InferTypeOnesLikeZerosLike(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = GetUnaryInputTensorType(call, ctx);
   const auto* attrs = call->attrs.as<InitAttrs>();
-  if (attrs->dtype.is_void()) {
+  if (attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0}) {
     return data_ty;
   } else {
     auto output_ty = ffi::make_object<TensorTypeNode>(*data_ty.get());
-    output_ty->dtype = attrs->dtype;
+    output_ty->dtype = PrimType(attrs->dtype);
     return TensorType(output_ty);
   }
 }
 
 /* relax.ones & relax.ones_like */
-Expr ones(Expr shape, DataType dtype) {
-  TVM_FFI_ICHECK(!dtype.is_void()) << "Ones op expects the input dtype not to be void";
+Expr ones(Expr shape, DLDataType dtype) {
+  TVM_FFI_ICHECK((dtype != DLDataType{kDLOpaqueHandle, 0, 0}))
+      << "Ones op expects the input dtype not to be void";
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
   attrs->dtype = dtype;
 
@@ -184,9 +186,9 @@ Expr ones(Expr shape, DataType dtype) {
   return Call(op, {std::move(shape)}, Attrs(attrs), {});
 }
 
-Expr ones_like(Expr x, ffi::Optional<DataType> dtype) {
+Expr ones_like(Expr x, ffi::Optional<DLDataType> dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
-  attrs->dtype = dtype.value_or(DataType::Void());
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   static const Op& op = Op::Get("relax.ones_like");
   return Call(op, {std::move(x)}, Attrs(attrs), {});
 }
@@ -212,8 +214,9 @@ TVM_REGISTER_OP("relax.ones_like")
     .set_attr<bool>("FPurity", true);
 
 /* relax.zeros & relax.zeros_like */
-Expr zeros(Expr shape, DataType dtype) {
-  TVM_FFI_ICHECK(!dtype.is_void()) << "Zeros op expects the input dtype not to be void";
+Expr zeros(Expr shape, DLDataType dtype) {
+  TVM_FFI_ICHECK((dtype != DLDataType{kDLOpaqueHandle, 0, 0}))
+      << "Zeros op expects the input dtype not to be void";
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
   attrs->dtype = dtype;
 
@@ -221,9 +224,9 @@ Expr zeros(Expr shape, DataType dtype) {
   return Call(op, {std::move(shape)}, Attrs(attrs), {});
 }
 
-Expr zeros_like(Expr x, ffi::Optional<DataType> dtype) {
+Expr zeros_like(Expr x, ffi::Optional<DLDataType> dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
-  attrs->dtype = dtype.value_or(DataType::Void());
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   static const Op& op = Op::Get("relax.zeros_like");
   return Call(op, {std::move(x)}, Attrs(attrs), {});
 }
@@ -249,16 +252,16 @@ TVM_REGISTER_OP("relax.zeros_like")
     .set_attr<bool>("FPurity", true);
 
 /* relax.eye & relax.eye_like */
-Expr eye(PrimValue n, PrimValue m, PrimValue k, DataType dtype) {
+Expr eye(PrimValue n, PrimValue m, PrimValue k, DLDataType dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
   attrs->dtype = dtype;
   static const Op& op = Op::Get("relax.eye");
   return Call(op, {std::move(n), std::move(m), std::move(k)}, Attrs(attrs), {});
 }
 
-Expr eye_like(Expr x, PrimValue k, ffi::Optional<DataType> dtype) {
+Expr eye_like(Expr x, PrimValue k, ffi::Optional<DLDataType> dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
-  attrs->dtype = dtype.value_or(DataType::Void());
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   static const Op& op = Op::Get("relax.eye_like");
   return Call(op, {std::move(x), std::move(k)}, Attrs(attrs), {});
 }
@@ -285,8 +288,8 @@ Type InferTypeEye(const Call& call, const BlockBuilder& ctx) {
   PrimExpr n = get_prim_value(call->args[0], "n");
   PrimExpr m = get_prim_value(call->args[1], "m");
 
-  DataType dtype = call->attrs.as<InitAttrs>()->dtype;
-  return TensorType(ShapeExpr({n, m}), dtype);
+  DLDataType dtype = call->attrs.as<InitAttrs>()->dtype;
+  return TensorType(ShapeExpr({n, m}), PrimType(dtype));
 }
 
 Type InferTypeEyeLike(const Call& call, const BlockBuilder& ctx) {
@@ -309,7 +312,8 @@ Type InferTypeEyeLike(const Call& call, const BlockBuilder& ctx) {
   }
 
   const auto* attrs = call->attrs.as<InitAttrs>();
-  DataType out_dtype = attrs->dtype.is_void() ? x_ty->dtype : attrs->dtype;
+  PrimType out_dtype =
+      attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0} ? x_ty->dtype : PrimType(attrs->dtype);
 
   return TensorType(x_ty->shape.value(), out_dtype, x_ty->vdevice);
 }
@@ -333,7 +337,7 @@ TVM_REGISTER_OP("relax.eye_like")
     .set_attr<bool>("FPurity", true);
 
 /* relax.arange */
-Expr arange(PrimValue start, PrimValue stop, PrimValue step, DataType dtype) {
+Expr arange(PrimValue start, PrimValue stop, PrimValue step, DLDataType dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
   attrs->dtype = dtype;
   static const Op& op = Op::Get("relax.arange");
@@ -362,17 +366,18 @@ Type InferTypeArange(const Call& call, const BlockBuilder& ctx) {
   PrimExpr start = get_prim_value(call->args[0], "start");
   PrimExpr end = get_prim_value(call->args[1], "end");
   PrimExpr step = get_prim_value(call->args[2], "step");
-  DataType dtype = call->attrs.as<InitAttrs>()->dtype;
+  DLDataType dtype = call->attrs.as<InitAttrs>()->dtype;
   PrimExpr num_elem;
-  if (start.dtype().is_int() && end.dtype().is_int() && step.dtype().is_int()) {
+  if (start.ty().code() == DLDataTypeCode::kDLInt && end.ty().code() == DLDataTypeCode::kDLInt &&
+      step.ty().code() == DLDataTypeCode::kDLInt) {
     num_elem = tvm::floordiv((end - start + step - 1), step);
   } else {
-    num_elem = tvm::cast(tvm::DataType::Int(64),
-                         tvm::ceil(tvm::cast(tvm::DataType::Float(32), end - start) / step));
+    num_elem = tvm::cast(tvm::PrimType::Int(64),
+                         tvm::ceil(tvm::cast(tvm::PrimType::Float(32), end - start) / step));
   }
   arith::Analyzer analyzer;
   num_elem = analyzer->Simplify(num_elem);
-  return TensorType(ShapeExpr({num_elem}), dtype);
+  return TensorType(ShapeExpr({num_elem}), PrimType(dtype));
 }
 
 TVM_REGISTER_OP("relax.arange")
@@ -387,7 +392,7 @@ TVM_REGISTER_OP("relax.arange")
 
 /* relax.hamming_window */
 Expr hamming_window(PrimValue window_size, PrimValue periodic, PrimValue alpha, PrimValue beta,
-                    DataType dtype) {
+                    DLDataType dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
   attrs->dtype = dtype;
   static const Op& op = Op::Get("relax.hamming_window");
@@ -401,8 +406,8 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 }
 
 Type InferTypeHammingWindow(const Call& call, const BlockBuilder& ctx) {
-  DataType dtype = call->attrs.as<InitAttrs>()->dtype;
-  if (dtype.is_int() || dtype.is_uint() || dtype.is_uint()) {
+  DLDataType dtype = call->attrs.as<InitAttrs>()->dtype;
+  if (dtype.code == DLDataTypeCode::kDLInt || dtype.code == DLDataTypeCode::kDLUInt) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Hamming Window expects the datatype to be float but got " << dtype;
   }
@@ -422,7 +427,7 @@ Type InferTypeHammingWindow(const Call& call, const BlockBuilder& ctx) {
         << window_size;
   }
   window_size = analyzer->Simplify(window_size);
-  return TensorType(ShapeExpr({window_size}), dtype);
+  return TensorType(ShapeExpr({window_size}), PrimType(dtype));
 }
 
 TVM_REGISTER_OP("relax.hamming_window")
diff --git a/src/relax/op/tensor/create.h b/src/relax/op/tensor/create.h
index 284448111739..497a535a4d0f 100644
--- a/src/relax/op/tensor/create.h
+++ b/src/relax/op/tensor/create.h
@@ -42,7 +42,7 @@ namespace relax {
  * \return The result tensor.
  */
 Expr full(ffi::Variant<Expr, ffi::Array<PrimExpr>> shape, Expr fill_value,
-          ffi::Optional<DataType> dtype);
+          ffi::Optional<DLDataType> dtype);
 
 /*!
  * \brief Construct a tensor such that
@@ -55,7 +55,7 @@ Expr full(ffi::Variant<Expr, ffi::Array<PrimExpr>> shape, Expr fill_value,
  * void, the input tensor's dtype will be used.
  * \return The result tensor.
  */
-Expr full_like(Expr x, Expr fill_value, ffi::Optional<DataType> dtype);
+Expr full_like(Expr x, Expr fill_value, ffi::Optional<DLDataType> dtype);
 
 /*!
  * \brief Construct a tensor of all ones, with the input shape and dtype.
@@ -63,7 +63,7 @@ Expr full_like(Expr x, Expr fill_value, ffi::Optional<DataType> dtype);
  * \param dtype The data type of the created tensor.
  * \return The result tensor.
  */
-Expr ones(Expr shape, DataType dtype);
+Expr ones(Expr shape, DLDataType dtype);
 
 /*!
  * \brief Construct a tensor with all ones, with shape of the input tensor shape.
@@ -73,7 +73,7 @@ Expr ones(Expr shape, DataType dtype);
  * void, the input tensor's dtype will be used.
  * \return The result tensor.
  */
-Expr ones_like(Expr x, ffi::Optional<DataType> dtype);
+Expr ones_like(Expr x, ffi::Optional<DLDataType> dtype);
 
 /*!
  * \brief Construct a tensor of all zeros, with the input shape and dtype.
@@ -81,7 +81,7 @@ Expr ones_like(Expr x, ffi::Optional<DataType> dtype);
  * \param dtype The data type of the created tensor.
  * \return The result tensor.
  */
-Expr zeros(Expr shape, DataType dtype);
+Expr zeros(Expr shape, DLDataType dtype);
 
 /*!
  * \brief Construct a tensor with all zeros, with shape of the input tensor shape.
@@ -91,7 +91,7 @@ Expr zeros(Expr shape, DataType dtype);
  * void, the input tensor's dtype will be used.
  * \return The result tensor.
  */
-Expr zeros_like(Expr x, ffi::Optional<DataType> dtype);
+Expr zeros_like(Expr x, ffi::Optional<DLDataType> dtype);
 
 /*!
  * \brief Construct a 2-D tensor with ones on the diagonal and zeros elsewhere.
@@ -102,7 +102,7 @@ Expr zeros_like(Expr x, ffi::Optional<DataType> dtype);
  * \param dtype The data type of the created tensor.
  * \return The result tensor.
  */
-Expr eye(PrimValue n, PrimValue m, PrimValue k, DataType dtype);
+Expr eye(PrimValue n, PrimValue m, PrimValue k, DLDataType dtype);
 
 /*!
  * \brief Construct a tensor with ones on the diagonal and zeros elsewhere,
@@ -115,10 +115,10 @@ Expr eye(PrimValue n, PrimValue m, PrimValue k, DataType dtype);
  * void, the input tensor's dtype will be used.
  * \return The result tensor.
  */
-Expr eye_like(Expr x, PrimValue k, ffi::Optional<DataType> dtype);
+Expr eye_like(Expr x, PrimValue k, ffi::Optional<DLDataType> dtype);
 
 /*! \brief Construct a tensor with evenly spaced elements. */
-Expr arange(PrimValue start, PrimValue stop, PrimValue step, DataType dtype);
+Expr arange(PrimValue start, PrimValue stop, PrimValue step, DLDataType dtype);
 
 /*!
  * \brief Hamming window function.
@@ -131,7 +131,7 @@ Expr arange(PrimValue start, PrimValue stop, PrimValue step, DataType dtype);
  * \return The result tensor.
  */
 Expr hamming_window(PrimValue window_size, PrimValue periodic, PrimValue alpha, PrimValue beta,
-                    DataType dtype);
+                    DLDataType dtype);
 
 /*! \brief Return the lower triangular part of a matrix or a batch of matrices. */
 Expr tril(Expr x, Expr k);
diff --git a/src/relax/op/tensor/datatype.cc b/src/relax/op/tensor/datatype.cc
index 907dffb0b3f3..ec1043a025e1 100644
--- a/src/relax/op/tensor/datatype.cc
+++ b/src/relax/op/tensor/datatype.cc
@@ -38,7 +38,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 /* relax.astype */
 
-Expr astype(Expr x, DataType dtype) {
+Expr astype(Expr x, DLDataType dtype) {
   ffi::ObjectPtr<AstypeAttrs> attrs = ffi::make_object<AstypeAttrs>();
   attrs->dtype = dtype;
 
@@ -55,7 +55,7 @@ Type InferTypeAstype(const Call& call, const BlockBuilder& ctx) {
   TensorType ty = GetUnaryInputTensorType(call, ctx);
   const auto* attrs = call->attrs.as<AstypeAttrs>();
   ffi::ObjectPtr<TensorTypeNode> new_ty = ffi::make_object<TensorTypeNode>(*ty.get());
-  new_ty->dtype = attrs->dtype;
+  new_ty->dtype = PrimType(attrs->dtype);
   return TensorType(new_ty);
 }
 
@@ -70,7 +70,7 @@ TVM_REGISTER_OP("relax.astype")
 
 /* relax.wrap_param */
 
-Expr MakeWrapParam(Expr data, DataType dtype) {
+Expr MakeWrapParam(Expr data, DLDataType dtype) {
   ffi::ObjectPtr<WrapParamAttrs> attrs = ffi::make_object<WrapParamAttrs>();
   attrs->dtype = dtype;
 
@@ -87,7 +87,7 @@ Type InferTypeWrapParam(const Call& call, const BlockBuilder& ctx) {
   TensorType ty = GetUnaryInputTensorType(call, ctx);
   const auto* attrs = call->attrs.as<WrapParamAttrs>();
   ffi::ObjectPtr<TensorTypeNode> new_ty = ffi::make_object<TensorTypeNode>(*ty.get());
-  new_ty->dtype = attrs->dtype;
+  new_ty->dtype = PrimType(attrs->dtype);
   return TensorType(new_ty);
 }
 
diff --git a/src/relax/op/tensor/datatype.h b/src/relax/op/tensor/datatype.h
index b612c45fc941..db2ee396c0d6 100644
--- a/src/relax/op/tensor/datatype.h
+++ b/src/relax/op/tensor/datatype.h
@@ -37,7 +37,7 @@ namespace relax {
  * \param dtype The target data type
  * \return The casted result.
  */
-Expr astype(Expr x, DataType dtype);
+Expr astype(Expr x, DLDataType dtype);
 
 /*!
  * \brief A wrapper to wrap the input const tensor to the given data type.
@@ -45,7 +45,7 @@ Expr astype(Expr x, DataType dtype);
  * \param dtype The target data type
  * \return The wrapped result.
  */
-Expr wrap_param(Expr x, DataType dtype);
+Expr wrap_param(Expr x, DLDataType dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/op/tensor/index.cc b/src/relax/op/tensor/index.cc
index 515f37126183..5321798b8e48 100644
--- a/src/relax/op/tensor/index.cc
+++ b/src/relax/op/tensor/index.cc
@@ -72,7 +72,7 @@ Type InferTypeTake(const Call& call, const BlockBuilder& ctx) {
     if (auto tensor_ty = ty.as<TensorType>()) {
       return tensor_ty.value();
     } else if (auto prim_ty = ty.as<PrimTypeNode>()) {
-      return TensorType(ShapeExpr(ffi::Array<PrimExpr>{}), prim_ty->dtype);
+      return TensorType(ShapeExpr(ffi::Array<PrimExpr>{}), ffi::GetRef<PrimType>(prim_ty));
     } else {
       TVM_FFI_VISIT_THROW(TypeError, call)
           << "Operator " << call->op << " requires the indices argument to be "
@@ -84,11 +84,14 @@ Type InferTypeTake(const Call& call, const BlockBuilder& ctx) {
 
   if (indices_ty->IsUnknownDtype()) {
     LOG(WARNING) << "Data type of indices has not been specified. Assume it has an integer type.";
-  } else if (!(indices_ty->dtype.is_int() || indices_ty->dtype.is_uint())) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << "Take op requires the input indices to have integer dtype. However, the "
-           "given indices dtype is "
-        << indices_ty->dtype;
+  } else {
+    PrimType indices_dtype = indices_ty->dtype;
+    if (!indices_dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
+      TVM_FFI_VISIT_THROW(TypeError, call)
+          << "Take op requires the input indices to have integer dtype. However, the "
+             "given indices dtype is "
+          << indices_ty->dtype;
+    }
   }
 
   const auto* attrs = call->attrs.as<TakeAttrs>();
@@ -309,7 +312,7 @@ Type InferTypeStridedSlice(const Call& call, const BlockBuilder& ctx) {
     }
   }();
 
-  TVM_FFI_ICHECK(IsBaseOf(relax::TensorType(DataType::Void(), kUnknownNDim), GetType(data)))
+  TVM_FFI_ICHECK(IsBaseOf(relax::TensorType(PrimType::Void(), kUnknownNDim), GetType(data)))
       << "Operator " << call->op << " requires the first argument to be a tensor.  "
       << "However, in expression " << call << ", the first argument " << data << " has type "
       << GetType(data);
@@ -325,9 +328,8 @@ Type InferTypeStridedSlice(const Call& call, const BlockBuilder& ctx) {
     const auto* tuple = ty.as<TupleTypeNode>();
     if (!tuple) return false;
 
-    return std::all_of(tuple->fields.begin(), tuple->fields.end(), [](const Type& field) {
-      return IsBaseOf(tvm::PrimType(DataType::Int(64)), field);
-    });
+    return std::all_of(tuple->fields.begin(), tuple->fields.end(),
+                       [](const Type& field) { return IsBaseOf(tvm::PrimType::Int(64), field); });
   };
   auto check_tuple = [&](const char* name, Expr expr) {
     auto ty = GetType(expr);
@@ -347,7 +349,7 @@ Type InferTypeStridedSlice(const Call& call, const BlockBuilder& ctx) {
 
   const auto* data_ty = data->ty.as<TensorTypeNode>();
 
-  DataType dtype = DataType::Void();
+  PrimType dtype = PrimType::Void();
   ffi::Optional<VDevice> vdevice = std::nullopt;
   int ndim = kUnknownNDim;
   if (data_ty) {
@@ -545,7 +547,7 @@ Type InferTypeDynStridedSlice(const Call& call, const BlockBuilder& ctx) {
       LOG(WARNING) << "Dynamic strided slice assumes " << name
                    << " to be int64 when it is not specified.";
     } else {
-      TVM_FFI_ICHECK(ty->dtype == DataType::Int(64))
+      TVM_FFI_ICHECK((ty->dtype->dtype == DLDataType{kDLInt, 64, 1}))
           << "Dynamic strided_slice expects the input " << name
           << "values to be all int64. However, " << name << " has dtype " << ty->dtype << ".";
     }
diff --git a/src/relax/op/tensor/inspect.cc b/src/relax/op/tensor/inspect.cc
index bf57670e7f2a..97955eb62455 100644
--- a/src/relax/op/tensor/inspect.cc
+++ b/src/relax/op/tensor/inspect.cc
@@ -88,24 +88,21 @@ std::tuple<TensorType, ffi::Optional<int64_t>> GetTensorArgInfoWithIndex(const C
   return {ffi::GetRef<TensorType>(tensor_ty), int_imm_axis};
 }
 
-DataType GetTensorDataType(const Call& call) { return GetTensorArgInfo(call)->dtype; }
+tirx::PrimFunc GetDLTensorField(tirx::builtin::TVMStructFieldKind field, PrimType field_ty) {
+  tirx::Var dlpack_handle("dlpack_handle", PrimType::Handle());
 
-tirx::PrimFunc GetDLTensorField(tirx::builtin::TVMStructFieldKind field, DataType field_dtype) {
-  tirx::Var dlpack_handle("dlpack_handle", DataType::Handle());
-
-  tirx::Var value("value", field_dtype);
+  tirx::Var value("value", field_ty);
 
   tirx::Stmt body = tirx::SeqStmt(
-      {tirx::Bind(value, tirx::Call(field_dtype, tirx::builtin::tvm_struct_get(),
+      {tirx::Bind(value, tirx::Call(field_ty, tirx::builtin::tvm_struct_get(),
                                     {dlpack_handle, IntImm::Int32(0), IntImm::Int32(field)})),
        tirx::Evaluate(tvm::ret(value))});
 
   DictAttrs attrs({{"tirx.is_scheduled", true}, {"tirx.is_host_func", true}});
 
-  tirx::PrimFunc func(ffi::Array<tirx::Var>{dlpack_handle}, body, tvm::PrimType(field_dtype), {},
-                      attrs);
+  tirx::PrimFunc func(ffi::Array<tirx::Var>{dlpack_handle}, body, field_ty, {}, attrs);
 
-  FuncType ty({TensorType(DataType::Void(), kUnknownNDim)}, PrimType(field_dtype));
+  FuncType ty({TensorType(PrimType::Void(), kUnknownNDim)}, field_ty);
   func->ty = ty;
 
   return func;
@@ -120,23 +117,18 @@ Expr tensor_dtype_code(Expr expr) {
   return Call(op, {expr});
 }
 
-Type InferTypeTensorDtypeCode(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::UInt(8);
-
-  DataType dtype = GetTensorDataType(call);
-  if (dtype.is_void()) {
-    return PrimType(dlpack_type);
-  } else {
-    return PrimType(dlpack_type);
-  }
-}
+Type InferTypeTensorDtypeCode(const Call& call, const BlockBuilder&) {
+  // Result unused; kept so any argument checks in GetTensorArgInfo still run.
+  GetTensorArgInfo(call);
+  return PrimType::UInt(8);
+}
 
 Expr LegalizeTensorDtypeCode(const BlockBuilder& bb, const Call& call) {
-  auto field_dtype = call->ty.as_or_throw<PrimType>()->dtype;
+  PrimType field_ty = call->ty.as_or_throw<tvm::PrimType>();
 
   Expr arg = call->args[0];
   tirx::PrimFunc getter =
-      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeCode, field_dtype);
+      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeCode, field_ty);
 
   GlobalVar gvar_getter = bb->AddFunction(getter, "_get_tensor_dtype_code");
   return Call(gvar_getter, {arg});
@@ -158,23 +150,18 @@ Expr tensor_dtype_bits(Expr expr) {
   return Call(op, {expr});
 }
 
-Type InferTypeTensorDtypeBits(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::UInt(8);
-
-  DataType dtype = GetTensorDataType(call);
-  if (dtype.is_void()) {
-    return PrimType(dlpack_type);
-  } else {
-    return PrimType(dlpack_type);
-  }
-}
+Type InferTypeTensorDtypeBits(const Call& call, const BlockBuilder&) {
+  // Result unused; kept so any argument checks in GetTensorArgInfo still run.
+  GetTensorArgInfo(call);
+  return PrimType::UInt(8);
+}
 
 Expr LegalizeTensorDtypeBits(const BlockBuilder& bb, const Call& call) {
-  auto field_dtype = call->ty.as_or_throw<PrimType>()->dtype;
+  PrimType field_ty = call->ty.as_or_throw<tvm::PrimType>();
 
   Expr arg = call->args[0];
   tirx::PrimFunc getter =
-      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeBits, field_dtype);
+      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeBits, field_ty);
 
   GlobalVar gvar_getter = bb->AddFunction(getter, "_get_tensor_dtype_bits");
   return Call(gvar_getter, {arg});
@@ -196,23 +183,18 @@ Expr tensor_dtype_lanes(Expr expr) {
   return Call(op, {expr});
 }
 
-Type InferTypeTensorDtypeLanes(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::UInt(16);
-
-  DataType dtype = GetTensorDataType(call);
-  if (dtype.is_void()) {
-    return PrimType(dlpack_type);
-  } else {
-    return PrimType(dlpack_type);
-  }
-}
+Type InferTypeTensorDtypeLanes(const Call& call, const BlockBuilder&) {
+  // Result unused; kept so any argument checks in GetTensorArgInfo still run.
+  GetTensorArgInfo(call);
+  return PrimType::UInt(16);
+}
 
 Expr LegalizeTensorDtypeLanes(const BlockBuilder& bb, const Call& call) {
-  auto field_dtype = call->ty.as_or_throw<PrimType>()->dtype;
+  PrimType field_ty = call->ty.as_or_throw<tvm::PrimType>();
 
   Expr arg = call->args[0];
   tirx::PrimFunc getter =
-      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeLanes, field_dtype);
+      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeLanes, field_ty);
 
   GlobalVar gvar_getter = bb->AddFunction(getter, "_get_tensor_dtype_lanes");
   return Call(gvar_getter, {arg});
@@ -234,23 +216,18 @@ Expr tensor_ndim(Expr expr) {
   return Call(op, {expr});
 }
 
-Type InferTypeTensorNDim(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::Int(32);
-
-  auto ty = GetTensorArgInfo(call);
-  if (ty->IsUnknownNdim()) {
-    return PrimType(dlpack_type);
-  } else {
-    return PrimType(dlpack_type);
-  }
-}
+Type InferTypeTensorNDim(const Call& call, const BlockBuilder&) {
+  // Result unused; kept so any argument checks in GetTensorArgInfo still run.
+  GetTensorArgInfo(call);
+  return PrimType::Int(32);
+}
 
 Expr LegalizeTensorNDim(const BlockBuilder& bb, const Call& call) {
-  auto field_dtype = call->ty.as_or_throw<PrimType>()->dtype;
+  PrimType field_ty = call->ty.as_or_throw<tvm::PrimType>();
 
   Expr arg = call->args[0];
   tirx::PrimFunc getter =
-      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorNDim, field_dtype);
+      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorNDim, field_ty);
 
   GlobalVar gvar_getter = bb->AddFunction(getter, "_get_tensor_ndim");
   return Call(gvar_getter, {arg});
@@ -273,45 +250,45 @@ Expr tensor_shape_i(Expr expr) {
 }
 
 Type InferTypeTensorShape(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::Int(64);
+  auto dlpack_type = PrimType::Int(64);
 
   auto [tensor_ty, int_imm_axis] = GetTensorArgInfoWithIndex(call);
 
   auto tensor_shape = tensor_ty->GetShape();
 
   if (int_imm_axis && tensor_shape.defined()) {
-    return PrimType(tensor_shape.value()[int_imm_axis.value()].dtype());
+    return tensor_shape.value()[int_imm_axis.value()].ty();
   } else {
-    return PrimType(dlpack_type);
+    return dlpack_type;
   }
 }
 
 Expr LegalizeTensorShape(const BlockBuilder& bb, const Call& call) {
-  auto field_dtype = call->ty.as_or_throw<PrimType>()->dtype;
+  PrimType field_ty = call->ty.as_or_throw<tvm::PrimType>();
 
   tirx::PrimFunc getter = [&]() -> tirx::PrimFunc {
-    tirx::Var dlpack_handle("dlpack_handle", DataType::Handle());
-    tirx::Var axis("axis", DataType::Int(64));
+    tirx::Var dlpack_handle("dlpack_handle", PrimType::Handle());
+    tirx::Var axis("axis", PrimType::Int(64));
 
-    tirx::Var ndim("ndim", DataType::Int(32));
+    tirx::Var ndim("ndim", PrimType::Int(32));
 
-    tirx::Buffer shape_buffer = tirx::decl_buffer({ndim}, field_dtype, "shape");
+    tirx::Buffer shape_buffer = tirx::decl_buffer({ndim}, field_ty, "shape");
 
-    tirx::Var extent("extent", field_dtype);
+    tirx::Var extent("extent", field_ty);
 
     tirx::Stmt body = tirx::SeqStmt(
         {tirx::AssertStmt(0 <= axis, tirx::StringImm("RuntimeError"),
                           {tirx::StringImm("Specified axis may not be negative")}),
          tirx::Bind(ndim,
-                    tirx::Call(ndim->dtype, tirx::builtin::tvm_struct_get(),
+                    tirx::Call(ndim.ty(), tirx::builtin::tvm_struct_get(),
                                {dlpack_handle, IntImm::Int32(0),
                                 IntImm::Int32(tirx::builtin::TVMStructFieldKind::kDLTensorNDim)})),
          tirx::AssertStmt(
-             axis < tvm::cast(axis->dtype, ndim), tirx::StringImm("RuntimeError"),
+             axis < tvm::cast(axis.ty(), ndim), tirx::StringImm("RuntimeError"),
              {tirx::StringImm(
                  "Specified axis may not be larger than the tensor's dimensionality")}),
          tirx::Bind(shape_buffer->data,
-                    tirx::Call(DataType::Handle(), tirx::builtin::tvm_struct_get(),
+                    tirx::Call(tvm::PrimType::Handle(), tirx::builtin::tvm_struct_get(),
                                {dlpack_handle, IntImm::Int32(0),
                                 IntImm::Int32(tirx::builtin::TVMStructFieldKind::kDLTensorShape)})),
          tirx::DeclBuffer(shape_buffer), tirx::Bind(extent, tirx::BufferLoad(shape_buffer, {axis})),
@@ -319,10 +280,9 @@ Expr LegalizeTensorShape(const BlockBuilder& bb, const Call& call) {
 
     DictAttrs attrs({{"tirx.is_scheduled", true}, {"tirx.is_host_func", true}});
 
-    tirx::PrimFunc func({dlpack_handle, axis}, body, tvm::PrimType(field_dtype), {}, attrs);
+    tirx::PrimFunc func({dlpack_handle, axis}, body, field_ty, {}, attrs);
 
-    FuncType ty({TensorType(DataType::Void(), kUnknownNDim), PrimType(axis->dtype)},
-                PrimType(field_dtype));
+    FuncType ty({TensorType(PrimType::Void(), kUnknownNDim), axis.ty()}, field_ty);
     func->ty = ty;
     return func;
   }();
@@ -349,7 +309,7 @@ Expr tensor_stride_i(Expr expr) {
 }
 
 Type InferTypeTensorStride(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::Int(64);
+  auto dlpack_type = PrimType::Int(64);
 
   auto [tensor_ty, int_imm_axis] = GetTensorArgInfoWithIndex(call);
 
@@ -373,9 +333,9 @@ Type InferTypeTensorStride(const Call& call, const BlockBuilder&) {
     for (size_t axis = int_imm_axis.value() + 1; axis < tensor_shape.size(); axis++) {
       stride = stride * tensor_shape[axis];
     }
-    return PrimType(stride.dtype());
+    return stride.ty();
   } else {
-    return PrimType(dlpack_type);
+    return dlpack_type;
   }
 }
 
@@ -396,7 +356,7 @@ Expr tensor_byte_offset(Expr expr) {
 }
 
 Type InferTypeTensorByteOffset(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::UInt(64);
+  auto dlpack_type = PrimType::UInt(64);
 
   auto tensor_ty = GetTensorArgInfo(call);
 
@@ -405,9 +365,9 @@ Type InferTypeTensorByteOffset(const Call& call, const BlockBuilder&) {
     // Relax implicitly requires that the byte offset is zero for any
     // legalizable tensor.  See InferTypeTensorStride for full
     // explanation.
-    return PrimType(dlpack_type);
+    return dlpack_type;
   } else {
-    return PrimType(dlpack_type);
+    return dlpack_type;
   }
 }
 
@@ -427,7 +387,7 @@ Expr tensor_elem_offset(Expr expr) {
 }
 
 Type InferTypeTensorElemOffset(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::UInt(64);
+  auto dlpack_type = PrimType::UInt(64);
 
   auto tensor_ty = GetTensorArgInfo(call);
 
@@ -436,9 +396,9 @@ Type InferTypeTensorElemOffset(const Call& call, const BlockBuilder&) {
     // Relax implicitly requires that the element offset is zero for
     // any legalizable tensor.  See InferTypeTensorStride for
     // full explanation.
-    return PrimType(dlpack_type);
+    return dlpack_type;
   } else {
-    return PrimType(dlpack_type);
+    return dlpack_type;
   }
 }
 
diff --git a/src/relax/op/tensor/inspect.h b/src/relax/op/tensor/inspect.h
index 3f820ab58a83..92cc4c256c79 100644
--- a/src/relax/op/tensor/inspect.h
+++ b/src/relax/op/tensor/inspect.h
@@ -36,7 +36,7 @@ namespace inspect {
  * `TensorType`.
  *
  * \returns The uint8_t value of the type_code, with
- * `PrimType(DataType::UInt(8))`
+ * `PrimType::UInt(8)`
  */
 Expr tensor_dtype_code(Expr expr);
 
@@ -46,7 +46,7 @@ Expr tensor_dtype_code(Expr expr);
  * `TensorType`.
  *
  * \returns The uint8_t value of the number of bits, with
- * `PrimType(DataType::UInt(8))`.  For vectorized types, returns
+ * `PrimType::UInt(8)`.  For vectorized types, returns
  * the bit width of the underlying scalar type (e.g. 32 for
  * "float32x4", not 128).
  */
@@ -58,7 +58,7 @@ Expr tensor_dtype_bits(Expr expr);
  * `TensorType`.
  *
  * \returns The uint16_t value of the number of lanes, with
- * `PrimType(DataType::UInt(16))`
+ * `PrimType::UInt(16)`
  */
 Expr tensor_dtype_lanes(Expr expr);
 
@@ -68,7 +68,7 @@ Expr tensor_dtype_lanes(Expr expr);
  * `TensorType`.
  *
  * \returns The int32_t value of the dimensionality, with
- * `PrimType(DataType::Int(32))`.
+ * `PrimType::Int(32)`.
  */
 Expr tensor_ndim(Expr expr);
 
@@ -81,7 +81,7 @@ Expr tensor_ndim(Expr expr);
  *     axis < tensor_ndim(expr)`, or else the results are undefined.
  *
  * \returns The int64_t extent of the specified tensor axis, with
- * `PrimType(DataType::Int(64))`.
+ * `PrimType::Int(64)`.
  */
 Expr tensor_shape_i(Expr expr, Expr axis);
 
@@ -98,7 +98,7 @@ Expr tensor_shape_i(Expr expr, Expr axis);
  *     axis < tensor_ndim(expr)`, or else the results are undefined.
  *
- * \returns The int64_t extent of the specified tensor axis, with
- * `PrimType(DataType::Int(64))`.
+ * \returns The int64_t stride of the specified tensor axis, with
+ * `PrimType::Int(64)`.
  */
 Expr tensor_stride_i(Expr expr, Expr axis);
 
@@ -107,7 +107,7 @@ Expr tensor_stride_i(Expr expr, Expr axis);
  * \param expr The relax expression to be inspected.  Must have
  * `TensorType`.
  *
- * \returns The uint64_t byte offset, with `PrimType(DataType::UInt(64))`.
+ * \returns The uint64_t byte offset, with `PrimType::UInt(64)`.
  */
 Expr tensor_byte_offset(Expr expr);
 
@@ -120,7 +120,7 @@ Expr tensor_byte_offset(Expr expr);
  * \param expr The relax expression to be inspected.  Must have
  * `TensorType`.
  *
- * \returns The uint64_t element offset, with `PrimType(DataType::UInt(64))`.
+ * \returns The uint64_t element offset, with `PrimType::UInt(64)`.
  */
 Expr tensor_elem_offset(Expr expr);
 
diff --git a/src/relax/op/tensor/linear_algebra.cc b/src/relax/op/tensor/linear_algebra.cc
index a1693c6563f2..6ea68b422378 100644
--- a/src/relax/op/tensor/linear_algebra.cc
+++ b/src/relax/op/tensor/linear_algebra.cc
@@ -42,9 +42,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 /* relax.matmul */
 
-Expr matmul(Expr x1, Expr x2, ffi::Optional<DataType> out_dtype) {
+Expr matmul(Expr x1, Expr x2, ffi::Optional<DLDataType> out_dtype) {
   ffi::ObjectPtr<MatmulAttrs> attrs = ffi::make_object<MatmulAttrs>();
-  attrs->out_dtype = out_dtype.value_or(DataType::Void());
+  attrs->out_dtype = out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
 
   static const Op& op = Op::Get("relax.matmul");
   return Call(op, {std::move(x1), std::move(x2)}, Attrs(attrs), {});
@@ -74,9 +74,9 @@ Type InferTypeMatmul(const Call& call, const BlockBuilder& ctx) {
   }
 
   const auto* attrs = call->attrs.as<MatmulAttrs>();
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty)
+                                    : attrs->out_dtype);
 
   if (x1_ty->IsUnknownNdim() || x2_ty->IsUnknownNdim()) {
     if (vdev.defined()) {
@@ -158,7 +158,7 @@ Type InferTypeMatmul(const Call& call, const BlockBuilder& ctx) {
   return TensorType(ShapeExpr(output_shape), out_dtype);
 }
 
-Call InferMixedPrecisionMatmul(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionMatmul(const Call& call, DLDataType out_dtype) {
   return matmul(call->args[0], call->args[1], out_dtype).as_or_throw<Call>();
 }
 
@@ -218,17 +218,17 @@ Type InferTypeEinsum(const Call& call, const BlockBuilder& ctx) {
 
   ffi::String subscripts = attrs->subscripts;
 
-  DataType operand_dtype = operands_tensor_ty[0]->dtype;
+  PrimType operand_ty = operands_tensor_ty[0]->dtype;
   std::vector<ffi::Array<PrimExpr>> input_shapes;
   input_shapes.reserve(operands_tensor_ty.size());
 
   for (TensorType tensor_ty : operands_tensor_ty) {
     // Check the input tuple consists of tensors with same dtype
-    if (tensor_ty->dtype != operand_dtype) {
+    if (tensor_ty->dtype != operand_ty) {
       TVM_FFI_VISIT_THROW(TypeError, call)
           << "Einsum expects all input tensors to have the same dtype. However, the "
              "input contains tensors with dtype "
-          << operand_dtype << " and " << tensor_ty->dtype;
+          << operand_ty << " and " << tensor_ty->dtype;
     }
 
     // Get input shapes
@@ -237,18 +237,18 @@ Type InferTypeEinsum(const Call& call, const BlockBuilder& ctx) {
       input_shapes.push_back(shape_expr->values);
     } else {
       if (!vdevice_unknown) {
-        return TensorType(operand_dtype, tensor_ty->ndim, vdev);
+        return TensorType(operand_ty, tensor_ty->ndim, vdev);
       }
-      return TensorType(operand_dtype, tensor_ty->ndim);
+      return TensorType(operand_ty, tensor_ty->ndim);
     }
   }
   // Calculate output shape using InferEinsumShape in topi
   ffi::Array<PrimExpr> oshape = topi::InferEinsumShape(subscripts, input_shapes);
 
   if (!vdevice_unknown) {
-    return TensorType(ShapeExpr(oshape), operand_dtype, vdev);
+    return TensorType(ShapeExpr(oshape), operand_ty, vdev);
   }
-  return TensorType(ShapeExpr(oshape), operand_dtype);
+  return TensorType(ShapeExpr(oshape), operand_ty);
 }
 
 TVM_REGISTER_OP("relax.einsum")
diff --git a/src/relax/op/tensor/linear_algebra.h b/src/relax/op/tensor/linear_algebra.h
index ddfceae4dc35..481193f897b8 100644
--- a/src/relax/op/tensor/linear_algebra.h
+++ b/src/relax/op/tensor/linear_algebra.h
@@ -41,7 +41,7 @@ namespace relax {
  * When it is not specified, the output dtype will be the same as input dtype.
  * \return The computed result.
  */
-Expr matmul(Expr x1, Expr x2, ffi::Optional<DataType> out_dtype);
+Expr matmul(Expr x1, Expr x2, ffi::Optional<DLDataType> out_dtype);
 
 /*!
  * \brief Einstein summation on the operands.
diff --git a/src/relax/op/tensor/manipulate.cc b/src/relax/op/tensor/manipulate.cc
index caa730091383..f0c7947b5ba2 100644
--- a/src/relax/op/tensor/manipulate.cc
+++ b/src/relax/op/tensor/manipulate.cc
@@ -35,7 +35,7 @@
 #include <utility>
 #include <vector>
 
-#include "tvm/runtime/data_type.h"
+#include "tvm/ffi/dtype.h"
 
 namespace tvm {
 namespace relax {
@@ -219,7 +219,7 @@ Type InferTypeConcat(const Call& call, const BlockBuilder& ctx) {
 
   const auto* attrs = call->attrs.as<ConcatAttrs>();
   int output_ndim = attrs->axis.has_value() ? kUnknownNDim : 1;
-  DataType output_dtype = DataType::Void();
+  PrimType output_dtype = PrimType::Void();
   ffi::Optional<VDevice> vdev = std::nullopt;
   bool shape_unknown = false;
   bool is_void_dtype = false;
@@ -229,9 +229,9 @@ Type InferTypeConcat(const Call& call, const BlockBuilder& ctx) {
 
   for (TensorType ty : tensor_ty) {
     // Update the output dtype.
-    if (ty->dtype.is_void()) {
+    if (ty->IsUnknownDtype()) {
       is_void_dtype = true;
-    } else if (output_dtype.is_void()) {
+    } else if (output_dtype.IsVoid()) {
       output_dtype = ty->dtype;
     } else if (ty->dtype != output_dtype) {
       TVM_FFI_VISIT_THROW(TypeError, call)
@@ -285,7 +285,7 @@ Type InferTypeConcat(const Call& call, const BlockBuilder& ctx) {
   }
 
   if (is_void_dtype) {
-    output_dtype = DataType::Void();
+    output_dtype = PrimType::Void();
   }
   if (vdevice_unknown) {
     vdev = std::nullopt;
@@ -573,14 +573,16 @@ Type InferTypeIndexTensor(const Call& call, const BlockBuilder& ctx) {
         << "index_tensor expects a non‑empty tuple of index tensors";
   }
 
-  DataType output_dtype = data_ty->dtype;
+  PrimType output_dtype = data_ty->dtype;
   int n_indices = static_cast<int>(indices_ty.size());
   ffi::Optional<VDevice> vdev = data_ty->vdevice;
 
   // Indices must be integers
   for (int i = 0; i < n_indices; ++i) {
     const auto& s = indices_ty[i];
-    if (!s->IsUnknownDtype() && !s->dtype.is_int()) {
+    PrimType index_dtype = s->dtype;
+    // Indexing only requires integer element kind; vector lanes do not affect shape inference.
+    if (!s->IsUnknownDtype() && index_dtype.code() != DLDataTypeCode::kDLInt) {
       TVM_FFI_VISIT_THROW(TypeError, call)
           << "index_tensor requires every index tensor to have an integer dtype; "
           << "index " << i << " has dtype " << s->dtype;
@@ -725,9 +727,10 @@ Type InferTypeLayoutTransform(const Call& call, const BlockBuilder& ctx) {
   // Check pad_value has same dtype as input.
   if (optional_pad_value.defined()) {
     PrimExpr padded_value = optional_pad_value.value()->value;
-    if (padded_value->dtype != data_ty->dtype) {
+    PrimType padded_dtype = padded_value.ty();
+    if (padded_dtype != data_ty->dtype) {
       TVM_FFI_VISIT_THROW(TypeError, call)
-          << "layout_transform pad_value dtype (" << padded_value->dtype << ") and input dtype ("
+          << "layout_transform pad_value dtype (" << padded_dtype << ") and input dtype ("
           << data_ty->dtype << ") must be the same";
     }
   }
@@ -916,9 +919,10 @@ Expr ConvertNewShapeToExpr(const Expr& data,
            "Array of PrimExprs. However, the given new shape is "
         << shape;
     PrimExpr len = ffi::GetRef<PrimExpr>(_len);
-    TVM_FFI_ICHECK(len->dtype.is_int()) << "Reshape requires the new shape values to be all "
-                                           "integers. However, the give new shape is "
-                                        << shape;
+    TVM_FFI_ICHECK(len.ty().code() == DLDataTypeCode::kDLInt)
+        << "Reshape requires the new shape values to be all "
+           "integers. However, the give new shape is "
+        << shape;
     const auto* int_len = len.as<IntImmNode>();
     if (int_len != nullptr && int_len->value == 0) {
       // Note that this dimension should be copied from the original shape.
@@ -1108,7 +1112,7 @@ Type InferTypeSplit(const Call& call, const BlockBuilder& ctx) {
 
     TVM_FFI_ICHECK_NE(axis, -1);
 
-    IntImm zero(DataType::Int(64), /*value=*/0);
+    IntImm zero(tvm::PrimType::Int(64), /*value=*/0);
 
     std::vector<Type> output_ty;
     for (size_t i = 0; i < p_indices.size() + 1; i++) {
@@ -1489,7 +1493,7 @@ Type InferTypeStack(const Call& call, const BlockBuilder& ctx) {
 
   // Default axis is 0 if not specified
   int output_ndim = tensor_ty[0]->ndim + 1;  // Stack adds one dimension
-  DataType output_dtype = DataType::Void();
+  PrimType output_dtype = PrimType::Void();
   ffi::Optional<VDevice> vdev = std::nullopt;
   bool shape_unknown = false;
   bool is_void_dtype = false;
@@ -1499,9 +1503,9 @@ Type InferTypeStack(const Call& call, const BlockBuilder& ctx) {
 
   for (TensorType ty : tensor_ty) {
     // Check dtype consistency
-    if (ty->dtype.is_void()) {
+    if (ty->IsUnknownDtype()) {
       is_void_dtype = true;
-    } else if (output_dtype.is_void()) {
+    } else if (output_dtype.IsVoid()) {
       output_dtype = ty->dtype;
     } else if (ty->dtype != output_dtype) {
       TVM_FFI_VISIT_THROW(TypeError, call)
@@ -1542,7 +1546,7 @@ Type InferTypeStack(const Call& call, const BlockBuilder& ctx) {
     }
   }
 
-  if (is_void_dtype) output_dtype = DataType::Void();
+  if (is_void_dtype) output_dtype = PrimType::Void();
   if (vdevice_unknown) vdev = std::nullopt;
 
   // Normalize axis (default to 0 if not specified)
@@ -1650,7 +1654,7 @@ Type InferTypeCollapseSumLike(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = input_ty[0];
   TensorType collapse_target_ty = input_ty[1];
 
-  DataType output_dtype = data_ty->dtype;
+  PrimType output_dtype = data_ty->dtype;
 
   ffi::Optional<ffi::Array<PrimExpr>> data_shape_value;
   if (data_ty->shape.defined()) {
@@ -1711,7 +1715,7 @@ Type InferTypeCollapseSumTo(const Call& call, const BlockBuilder& ctx) {
         << call->args[1]->ty->GetTypeKey();
   }
 
-  DataType output_dtype = data_ty->dtype;
+  PrimType output_dtype = data_ty->dtype;
 
   ffi::Optional<ffi::Array<PrimExpr>> data_shape_value;
   if (data_ty->shape.defined()) {
@@ -2099,14 +2103,15 @@ Type InferTypeReverseSequence(const Call& call, const BlockBuilder& ctx) {
         << "ReverseSequence requires seq_lengths to be 1-D. However, seq_lengths has ndim "
         << seq_lengths_ty->ndim;
   }
-  if (!seq_lengths_ty->dtype.is_void() && !seq_lengths_ty->dtype.is_int()) {
+  PrimType seq_lengths_dtype = seq_lengths_ty->dtype;
+  if (!seq_lengths_ty->IsUnknownDtype() && !seq_lengths_dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     TVM_FFI_VISIT_THROW(ValueError, call)
         << "ReverseSequence requires seq_lengths to have dtype int32 or int64. However, "
            "seq_lengths has dtype "
         << seq_lengths_ty->dtype;
   }
-  if (seq_lengths_ty->dtype.is_int() && seq_lengths_ty->dtype.bits() != 32 &&
-      seq_lengths_ty->dtype.bits() != 64) {
+  if (seq_lengths_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+      seq_lengths_dtype->dtype.bits != 32 && seq_lengths_dtype->dtype.bits != 64) {
     TVM_FFI_VISIT_THROW(ValueError, call)
         << "ReverseSequence requires seq_lengths to have dtype int32 or int64. However, "
            "seq_lengths has dtype "
@@ -2192,7 +2197,9 @@ Type InferTypeGatherElements(const Call& call, const BlockBuilder& ctx) {
         << call->args[1]->ty->GetTypeKey();
   }
 
-  if (!indices_ty->IsUnknownDtype() && !indices_ty->dtype.is_int()) {
+  PrimType indices_dtype = indices_ty->dtype;
+  // Gather indices only require integer element kind; vector lanes do not affect shape inference.
+  if (!indices_ty->IsUnknownDtype() && indices_dtype.code() != DLDataTypeCode::kDLInt) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "GatherElements requires the input indices to have int64 dtype. However, the "
         << "given indices dtype is " << indices_ty->dtype;
@@ -2295,7 +2302,7 @@ Type InferTypeGatherND(const Call& call, const BlockBuilder& ctx) {
   TVM_FFI_ICHECK_GE(attrs->batch_dims, 0);
   int batch_dims = static_cast<int>(attrs->batch_dims);
   int input_dims = data_ty->ndim;
-  if (!indices_ty->IsUnknownDtype() && indices_ty->dtype != DataType::Int(64)) {
+  if (!indices_ty->IsUnknownDtype() && indices_ty->dtype != PrimType::Int(64)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "GatherND requires the input indices to have int64 dtype. However, the "
         << "given indices dtype is " << indices_ty->dtype;
@@ -2430,10 +2437,14 @@ Type InferTypeIndexPut(const Call& call, const BlockBuilder& ctx) {
     if (tensor_ty->IsUnknownDtype()) {
       LOG(WARNING) << "Data type of index tensor " << i
                    << " has not been specified. Assume it has an integer type.";
-    } else if (!(tensor_ty->dtype.is_int() || tensor_ty->dtype.is_uint())) {
-      TVM_FFI_VISIT_THROW(TypeError, call)
-          << "IndexPut requires each index tensor to have integer dtype. "
-          << "However, index tensor " << i << " has dtype=" << tensor_ty->dtype;
+    } else {
+      PrimType index_dtype = tensor_ty->dtype;
+      if (!index_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+          !index_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+        TVM_FFI_VISIT_THROW(TypeError, call)
+            << "IndexPut requires each index tensor to have integer dtype. "
+            << "However, index tensor " << i << " has dtype=" << tensor_ty->dtype;
+      }
     }
   }
 
@@ -2531,7 +2542,7 @@ Type InferTypeMeshgrid(const Call& call, const BlockBuilder& ctx) {
   }
 
   std::vector<PrimExpr> lengths;
-  DataType common_dtype = DataType::Void();
+  PrimType common_dtype = PrimType::Void();
   bool shape_unknown = false;
   ffi::Optional<VDevice> vdev = std::nullopt;
   bool vdevice_unknown = false;
@@ -2545,9 +2556,9 @@ Type InferTypeMeshgrid(const Call& call, const BlockBuilder& ctx) {
           << i;
     }
 
-    if (ty->dtype.is_void()) {
+    if (ty->IsUnknownDtype()) {
       continue;
-    } else if (common_dtype.is_void()) {
+    } else if (common_dtype.IsVoid()) {
       common_dtype = ty->dtype;
     } else if (ty->dtype != common_dtype) {
       TVM_FFI_VISIT_THROW(TypeError, call)
@@ -2683,11 +2694,15 @@ Type InferTypeScatterElements(const Call& call, const BlockBuilder& ctx) {
 
   if (indices_ty->IsUnknownDtype()) {
     LOG(WARNING) << "Data type of indices has not been specified. Assume it has an integer type.";
-  } else if (!(indices_ty->dtype.is_int() || indices_ty->dtype.is_uint())) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << "ScatterElements op requires the input indices to have integer dtype. However, the "
-           "given indices dtype is "
-        << indices_ty->dtype;
+  } else {
+    PrimType indices_dtype = indices_ty->dtype;
+    if (!indices_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+        !indices_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+      TVM_FFI_VISIT_THROW(TypeError, call)
+          << "ScatterElements op requires the input indices to have integer dtype. However, the "
+             "given indices dtype is "
+          << indices_ty->dtype;
+    }
   }
 
   const auto* indices_shape = indices_ty->shape.as<ShapeExprNode>();
@@ -2803,11 +2818,15 @@ Type InferTypeScatterND(const Call& call, const BlockBuilder& ctx) {
 
   if (indices_ty->IsUnknownDtype()) {
     LOG(WARNING) << "Data type of indices has not been specified. Assume it has an integer type.";
-  } else if (!(indices_ty->dtype.is_int() || indices_ty->dtype.is_uint())) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << "ScatterND op requires the input indices to have integer dtype. However, "
-           "the given indices dtype is "
-        << indices_ty->dtype;
+  } else {
+    PrimType indices_dtype = indices_ty->dtype;
+    if (!indices_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+        !indices_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+      TVM_FFI_VISIT_THROW(TypeError, call)
+          << "ScatterND op requires the input indices to have integer dtype. However, "
+             "the given indices dtype is "
+          << indices_ty->dtype;
+    }
   }
 
   const auto* data_shape = data_ty->shape.as<ShapeExprNode>();
@@ -3003,10 +3022,11 @@ Type InferTypeSliceScatter(const Call& call, const BlockBuilder& ctx) {
           << ") to be a PrimValue, but got " << arg_expr->GetTypeKey();
     }
     const PrimExpr& prim_expr = prim_value_node->value;
-    if (!prim_expr.dtype().is_int() && !prim_expr.dtype().is_uint()) {
+    tvm::PrimType prim_ty = prim_expr.ty();
+    if (prim_ty.code() != DLDataTypeCode::kDLInt && prim_ty.code() != DLDataTypeCode::kDLUInt) {
       TVM_FFI_VISIT_THROW(TypeError, call)
           << "SliceScatter expects `" << key << "` (" << prim_expr
-          << ") to be an integer PrimValue, but got dtype " << prim_expr.dtype();
+          << ") to be an integer PrimValue, but got dtype " << prim_ty;
     }
     return prim_expr;
   };
@@ -3085,8 +3105,8 @@ Expr one_hot(Expr indices, PrimValue on_value, PrimValue off_value, int depth, i
   attrs->axis = axis;
 
   // Check if on_value and off_value have the same dtype
-  DataType on_dtype = on_value->value->dtype;
-  DataType off_dtype = off_value->value->dtype;
+  PrimType on_dtype = on_value->value.ty();
+  PrimType off_dtype = off_value->value.ty();
   TVM_FFI_ICHECK(on_dtype == off_dtype)
       << "one_hot: on_value and off_value must have the same dtype, "
       << "but got " << on_dtype << " and " << off_dtype;
@@ -3108,19 +3128,25 @@ Type InferTypeOneHot(const Call& call, const BlockBuilder& ctx) {
   PrimValue on_value = call->args[1].as_or_throw<PrimValue>();
   PrimValue off_value = call->args[2].as_or_throw<PrimValue>();
   // Check if on_value and off_value have the same dtype
-  TVM_FFI_ICHECK(on_value->value->dtype == off_value->value->dtype)
+  PrimType on_dtype = on_value->value.ty();
+  PrimType off_dtype = off_value->value.ty();
+  TVM_FFI_ICHECK(on_dtype == off_dtype)
       << "one_hot: on_value and off_value must have the same dtype, "
-      << "but got " << on_value->value->dtype << " and " << off_value->value->dtype;
-  DataType dtype = on_value->value->dtype;
+      << "but got " << on_dtype << " and " << off_dtype;
+  PrimType dtype = on_dtype;
 
   // Check if indices has an integer dtype
   if (indices_ty->IsUnknownDtype()) {
     LOG(WARNING) << "Data type of indices has not been specified. Assume it has an integer type.";
-  } else if (!(indices_ty->dtype.is_int() || indices_ty->dtype.is_uint())) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << "one_hot op requires the input indices to have integer dtype. However, the "
-           "given indices dtype is "
-        << indices_ty->dtype;
+  } else {
+    PrimType indices_dtype = indices_ty->dtype;
+    if (!indices_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+        !indices_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+      TVM_FFI_VISIT_THROW(TypeError, call)
+          << "one_hot op requires the input indices to have integer dtype. However, the "
+             "given indices dtype is "
+          << indices_ty->dtype;
+    }
   }
   // Check if indices has unknown dimension
   if (indices_ty->IsUnknownNdim()) {
diff --git a/src/relax/op/tensor/qdq.cc b/src/relax/op/tensor/qdq.cc
index 974d70e7300a..8940594abc51 100644
--- a/src/relax/op/tensor/qdq.cc
+++ b/src/relax/op/tensor/qdq.cc
@@ -39,7 +39,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { QuantizeAttrs::RegisterReflection(); }
 
 /* relax.quantize */
 
-Expr quantize(Expr data, Expr scale, Expr zero_point, int axis, DataType out_dtype) {
+Expr quantize(Expr data, Expr scale, Expr zero_point, int axis, DLDataType out_dtype) {
   ffi::ObjectPtr<QuantizeAttrs> attrs = ffi::make_object<QuantizeAttrs>();
   attrs->axis = axis;
   attrs->out_dtype = out_dtype;
@@ -54,9 +54,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 Type InferTypeQuantize(const Call& call, const BlockBuilder& ctx) {
   const auto* attrs = call->attrs.as<QuantizeAttrs>();
-  if (attrs->out_dtype != DataType::Int(8) && attrs->out_dtype != DataType::UInt(8) &&
-      attrs->out_dtype != DataType::Int(16) && attrs->out_dtype != DataType::UInt(16) &&
-      attrs->out_dtype != DataType::Float8E4M3FN() && attrs->out_dtype != DataType::Float8E5M2()) {
+  if (attrs->out_dtype != DLDataType{kDLInt, 8, 1} &&
+      attrs->out_dtype != DLDataType{kDLUInt, 8, 1} &&
+      attrs->out_dtype != DLDataType{kDLInt, 16, 1} &&
+      attrs->out_dtype != DLDataType{kDLUInt, 16, 1} &&
+      attrs->out_dtype != DLDataType{static_cast<uint8_t>(kDLFloat8_e4m3fn),
+                                     static_cast<uint8_t>(8), static_cast<uint16_t>(1)} &&
+      attrs->out_dtype != DLDataType{static_cast<uint8_t>(kDLFloat8_e5m2), static_cast<uint8_t>(8),
+                                     static_cast<uint16_t>(1)}) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Unsupported output datatype attribute for operation: '" << attrs->out_dtype;
   }
@@ -64,24 +69,27 @@ Type InferTypeQuantize(const Call& call, const BlockBuilder& ctx) {
   TensorType input_ty = GetInputTensorType(call, ctx)[0];
   TensorType scale_ty = GetInputTensorType(call, ctx)[1];
   TensorType zp_ty = GetInputTensorType(call, ctx)[2];
+  PrimType input_dtype = input_ty->dtype;
+  PrimType scale_dtype = scale_ty->dtype;
+  PrimType zp_dtype = zp_ty->dtype;
 
   // Check input datatype:
-  if (input_ty->dtype != DataType::Float(16) && input_ty->dtype != DataType::Float(32)) {
+  if (input_dtype != PrimType::Float(16) && input_dtype != PrimType::Float(32)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Unsupported input datatype for operation: " << input_ty->dtype;
   }
 
   // Check datatype of scale param:
-  if (scale_ty->dtype != DataType::Float(32) && scale_ty->dtype != DataType::Float(16)) {
+  if (scale_dtype != PrimType::Float(32) && scale_dtype != PrimType::Float(16)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "scale param datatype should be one of [float16, float32], but got " << scale_ty->dtype;
   }
 
   // Check datatype of zero_point param:
-  if (zp_ty->dtype != DataType::Int(8) && zp_ty->dtype != DataType::UInt(8) &&
-      zp_ty->dtype != DataType::Int(16) && zp_ty->dtype != DataType::UInt(16) &&
-      zp_ty->dtype != DataType::Int(32) && zp_ty->dtype != DataType::UInt(32) &&
-      zp_ty->dtype != DataType::Float(16)) {
+  if (zp_dtype != PrimType::Int(8) && zp_dtype != PrimType::UInt(8) &&
+      zp_dtype != PrimType::Int(16) && zp_dtype != PrimType::UInt(16) &&
+      zp_dtype != PrimType::Int(32) && zp_dtype != PrimType::UInt(32) &&
+      zp_dtype != PrimType::Float(16)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "zero_point param datatype should be one of "
         << "['int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'float16'], "
@@ -124,7 +132,7 @@ Type InferTypeQuantize(const Call& call, const BlockBuilder& ctx) {
   if (!is_scalar_or_singleton_vector(zp_ty)) check_param_size(zp_ty, input_ty, "zero_point");
 
   auto output_ty = ffi::make_object<TensorTypeNode>(*input_ty.get());
-  output_ty->dtype = attrs->out_dtype;
+  output_ty->dtype = PrimType(attrs->out_dtype);
   return TensorType(output_ty);
 }
 
@@ -139,7 +147,7 @@ TVM_REGISTER_OP("relax.quantize")
 
 /* relax.dequantize */
 
-Expr dequantize(Expr data, Expr scale, Expr zero_point, int axis, DataType out_dtype) {
+Expr dequantize(Expr data, Expr scale, Expr zero_point, int axis, DLDataType out_dtype) {
   ffi::ObjectPtr<QuantizeAttrs> attrs = ffi::make_object<QuantizeAttrs>();
   attrs->axis = axis;
   attrs->out_dtype = out_dtype;
@@ -154,7 +162,8 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 Type InferTypeDequantize(const Call& call, const BlockBuilder& ctx) {
   const auto* attrs = call->attrs.as<QuantizeAttrs>();
-  if (attrs->out_dtype != DataType::Float(16) && attrs->out_dtype != DataType::Float(32)) {
+  if (attrs->out_dtype != DLDataType{kDLFloat, 16, 1} &&
+      attrs->out_dtype != DLDataType{kDLFloat, 32, 1}) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Unsupported output datatype attribute for operation: " << attrs->out_dtype;
   }
@@ -162,28 +171,34 @@ Type InferTypeDequantize(const Call& call, const BlockBuilder& ctx) {
   TensorType input_ty = GetInputTensorType(call, ctx)[0];
   TensorType scale_ty = GetInputTensorType(call, ctx)[1];
   TensorType zp_ty = GetInputTensorType(call, ctx)[2];
+  PrimType input_dtype = input_ty->dtype;
+  PrimType scale_dtype = scale_ty->dtype;
+  PrimType zp_dtype = zp_ty->dtype;
 
   // Check input datatype:
-  if (input_ty->dtype != DataType::Int(8) && input_ty->dtype != DataType::UInt(8) &&
-      input_ty->dtype != DataType::Int(16) && input_ty->dtype != DataType::UInt(16) &&
-      input_ty->dtype != DataType::Int(32) && input_ty->dtype != DataType::Float8E4M3FN() &&
-      input_ty->dtype != DataType::Float8E5M2() && input_ty->dtype != DataType::Float(16) &&
-      input_ty->dtype != DataType::Float(32)) {
+  if (input_dtype != PrimType::Int(8) && input_dtype != PrimType::UInt(8) &&
+      input_dtype != PrimType::Int(16) && input_dtype != PrimType::UInt(16) &&
+      input_dtype != PrimType::Int(32) &&
+      input_dtype != PrimType(DLDataType{static_cast<uint8_t>(kDLFloat8_e4m3fn),
+                                         static_cast<uint8_t>(8), static_cast<uint16_t>(1)}) &&
+      input_dtype != PrimType(DLDataType{static_cast<uint8_t>(kDLFloat8_e5m2),
+                                         static_cast<uint8_t>(8), static_cast<uint16_t>(1)}) &&
+      input_dtype != PrimType::Float(16) && input_dtype != PrimType::Float(32)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
-        << "Unsupported input datatype for operation: " << attrs->out_dtype;
+        << "Unsupported input datatype for operation: " << input_ty->dtype;
   }
 
   // Check datatype of scale param:
-  if (scale_ty->dtype != DataType::Float(32) && scale_ty->dtype != DataType::Float(16)) {
+  if (scale_dtype != PrimType::Float(32) && scale_dtype != PrimType::Float(16)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "scale param datatype should be one of [float16, float32], but got " << scale_ty->dtype;
   }
 
   // Check datatype of zero_point param:
-  if (zp_ty->dtype != DataType::Int(8) && zp_ty->dtype != DataType::UInt(8) &&
-      zp_ty->dtype != DataType::Int(16) && zp_ty->dtype != DataType::UInt(16) &&
-      zp_ty->dtype != DataType::Int(32) && zp_ty->dtype != DataType::UInt(32) &&
-      zp_ty->dtype != DataType::Float(16)) {
+  if (zp_dtype != PrimType::Int(8) && zp_dtype != PrimType::UInt(8) &&
+      zp_dtype != PrimType::Int(16) && zp_dtype != PrimType::UInt(16) &&
+      zp_dtype != PrimType::Int(32) && zp_dtype != PrimType::UInt(32) &&
+      zp_dtype != PrimType::Float(16)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "zero_point param datatype should be one of "
         << "['int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'float16'], "
@@ -226,7 +241,7 @@ Type InferTypeDequantize(const Call& call, const BlockBuilder& ctx) {
   if (!is_scalar_or_singleton_vector(zp_ty)) check_param_size(zp_ty, input_ty, "zero_point");
 
   auto output_ty = ffi::make_object<TensorTypeNode>(*input_ty.get());
-  output_ty->dtype = attrs->out_dtype;
+  output_ty->dtype = PrimType(attrs->out_dtype);
   return TensorType(output_ty);
 }
 
diff --git a/src/relax/op/tensor/qdq.h b/src/relax/op/tensor/qdq.h
index 9d13dcde277f..bdb31f87e61e 100644
--- a/src/relax/op/tensor/qdq.h
+++ b/src/relax/op/tensor/qdq.h
@@ -40,7 +40,7 @@ namespace relax {
  * \param out_dtype The data type of the output tensor.
  * \return The computed result.
  */
-Expr quantize(Expr data, Expr scale, Expr zero_point, int axis, DataType out_dtype);
+Expr quantize(Expr data, Expr scale, Expr zero_point, int axis, DLDataType out_dtype);
 
 /*!
  * \brief Dequantize op.
@@ -53,7 +53,7 @@ Expr quantize(Expr data, Expr scale, Expr zero_point, int axis, DataType out_dty
  * \param out_dtype The data type of the output tensor.
  * \return The computed result.
  */
-Expr dequantize(Expr data, Expr scale, Expr zero_point, int axis, DataType out_dtype);
+Expr dequantize(Expr data, Expr scale, Expr zero_point, int axis, DLDataType out_dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/op/tensor/sampling.cc b/src/relax/op/tensor/sampling.cc
index 27f9241e2c29..196e6f887649 100644
--- a/src/relax/op/tensor/sampling.cc
+++ b/src/relax/op/tensor/sampling.cc
@@ -37,7 +37,8 @@ TVM_FFI_STATIC_INIT_BLOCK() { MultinomialFromUniformAttrs::RegisterReflection();
 
 /* relax.multinomial_from_uniform */
 
-Expr multinomial_from_uniform(Expr prob, Expr uniform_sample, Expr sample_indices, DataType dtype) {
+Expr multinomial_from_uniform(Expr prob, Expr uniform_sample, Expr sample_indices,
+                              DLDataType dtype) {
   ffi::ObjectPtr<MultinomialFromUniformAttrs> attrs =
       ffi::make_object<MultinomialFromUniformAttrs>();
   attrs->dtype = dtype;
@@ -59,19 +60,24 @@ Type InferTypeMultinomialFromUniform(const Call& call, const BlockBuilder& ctx)
   TensorType sample_indices_ty = GetInputTensorType(call, 2, ctx);
   const auto* attrs = call->attrs.as<MultinomialFromUniformAttrs>();
 
-  if (!prob_ty->dtype.is_float()) {
+  // Only the element kind matters here; shape inference does not depend on vector lanes.
+  if (prob_ty->dtype.code() != DLDataTypeCode::kDLFloat &&
+      prob_ty->dtype.code() != DLDataTypeCode::kDLBfloat) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Multinomial_from_uniform op requires the input prob to have float dtype. "
            "However, the given prob dtype is "
         << prob_ty->dtype;
   }
-  if (!uniform_sample_ty->dtype.is_float()) {
+  // Only the element kind matters here; shape inference does not depend on vector lanes.
+  if (uniform_sample_ty->dtype.code() != DLDataTypeCode::kDLFloat &&
+      uniform_sample_ty->dtype.code() != DLDataTypeCode::kDLBfloat) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Multinomial_from_uniform op requires the input uniform_sample to have float "
            "dtype. However, the given uniform_sample dtype is "
         << uniform_sample_ty->dtype;
   }
-  if (!sample_indices_ty->dtype.is_int()) {
+  // Only the element kind matters here; shape inference does not depend on vector lanes.
+  if (sample_indices_ty->dtype.code() != DLDataTypeCode::kDLInt) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Multinomial from uniform op requires the input sample_indices to have int "
            "dtype. However, the given sample_indices dtype is "
@@ -79,7 +85,7 @@ Type InferTypeMultinomialFromUniform(const Call& call, const BlockBuilder& ctx)
   }
   if (prob_ty->IsUnknownNdim() || uniform_sample_ty->IsUnknownNdim() ||
       sample_indices_ty->IsUnknownNdim()) {
-    return TensorType(attrs->dtype, kUnknownNDim, prob_ty->vdevice);
+    return TensorType(PrimType(attrs->dtype), kUnknownNDim, prob_ty->vdevice);
   }
   if (prob_ty->ndim != 2) {
     TVM_FFI_VISIT_THROW(ValueError, call)
@@ -109,7 +115,7 @@ Type InferTypeMultinomialFromUniform(const Call& call, const BlockBuilder& ctx)
   // The output shape is expected to be `(n, 1)`
 
   if (prob_shape == nullptr || uniform_sample_shape == nullptr || sample_indices_shape == nullptr) {
-    return TensorType(attrs->dtype, 2, prob_ty->vdevice);
+    return TensorType(PrimType(attrs->dtype), 2, prob_ty->vdevice);
   }
 
   PrimExpr batch = prob_shape->values[0];
@@ -132,7 +138,7 @@ Type InferTypeMultinomialFromUniform(const Call& call, const BlockBuilder& ctx)
         << uniform_sample_ty->shape << " and the given sample_indices tensor has shape "
         << sample_indices_ty->shape;
   }
-  return TensorType(ShapeExpr({n, 1}), attrs->dtype, prob_ty->vdevice);
+  return TensorType(ShapeExpr({n, 1}), PrimType(attrs->dtype), prob_ty->vdevice);
 }
 
 TVM_REGISTER_OP("relax.multinomial_from_uniform")
diff --git a/src/relax/op/tensor/sampling.h b/src/relax/op/tensor/sampling.h
index d13aa835d68d..077ef4313669 100644
--- a/src/relax/op/tensor/sampling.h
+++ b/src/relax/op/tensor/sampling.h
@@ -49,7 +49,8 @@ namespace relax {
  * \param dtype The data type of the output tensor.
  * \return The sampled result.
  */
-Expr multinomial_from_uniform(Expr prob, Expr uniform_sample, Expr sample_indices, DataType dtype);
+Expr multinomial_from_uniform(Expr prob, Expr uniform_sample, Expr sample_indices,
+                              DLDataType dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/op/tensor/search.cc b/src/relax/op/tensor/search.cc
index d80f484ebcf5..c5021f6f5aef 100644
--- a/src/relax/op/tensor/search.cc
+++ b/src/relax/op/tensor/search.cc
@@ -64,10 +64,9 @@ Type InferTypeBucketize(const Call& call, const BlockBuilder& ctx) {
   }
 
   auto attrs = call->attrs.as<BucketizeAttrs>();
-  DataType out_dtype;
-  out_dtype = DataType::Int(64);
+  PrimType out_dtype = PrimType::Int(64);
   if (attrs->out_int32) {
-    out_dtype = DataType::Int(32);
+    out_dtype = PrimType::Int(32);
   }
 
   const auto* data_shape = input_tensor_info->shape.as<ShapeExprNode>();
@@ -119,13 +118,15 @@ Type InferTypeWhere(const Call& call, const BlockBuilder& ctx) {
     }
   }
 
-  if (!cond_ty->dtype.is_bool()) {
+  PrimType cond_dtype = cond_ty->dtype;
+  // Where condition validation only checks the boolean element kind; lanes are irrelevant here.
+  if (!cond_dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Where requires the input condition tensor to have boolean dtype. However, "
            "the given condition dtype is "
         << cond_ty->dtype;
   }
-  DataType output_dtype = InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty);
+  PrimType output_dtype(InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty));
 
   int output_ndim;
   if (cond_ty->IsUnknownNdim() || x1_ty->IsUnknownNdim() || x2_ty->IsUnknownNdim()) {
@@ -209,7 +210,7 @@ Type InferTypeArgmaxArgmin(const Call& call, const BlockBuilder& ctx) {
     TVM_FFI_ICHECK_GE(out_ndim, 0);
   }
 
-  DataType out_dtype = DataType::Int(64);
+  PrimType out_dtype = PrimType::Int(64);
   // The inference rule for reduction operator output shapes:
   // - axes is None, keepdims is false -> return the zero-rank shape;
   // - axes is None, keepdims is true -> return the shape whose ndim is the same as input and every
@@ -230,7 +231,7 @@ Type InferTypeArgmaxArgmin(const Call& call, const BlockBuilder& ctx) {
   }
 
   if (data_ty->ndim > 0) {
-    out_dtype = data_shape->values[0]->dtype;
+    out_dtype = data_shape->values[0].ty();
   }
 
   ffi::Array<PrimExpr> out_shape;
diff --git a/src/relax/op/tensor/set.cc b/src/relax/op/tensor/set.cc
index 57999a3356b7..a92cbee4a001 100644
--- a/src/relax/op/tensor/set.cc
+++ b/src/relax/op/tensor/set.cc
@@ -106,9 +106,9 @@ Type InferTypeUnique(const Call& call, const BlockBuilder& ctx) {
   if (f_convert_to_int64(return_index->value)) {
     if (data_ty->ndim == 0) {
       output_ty.push_back(
-          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), DataType::Int(64), data_ty->vdevice));
+          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), PrimType::Int(64), data_ty->vdevice));
     } else {
-      output_ty.push_back(TensorType(DataType::Int(64), /*ndim=*/1, data_ty->vdevice));
+      output_ty.push_back(TensorType(PrimType::Int(64), /*ndim=*/1, data_ty->vdevice));
     }
   }
 
@@ -116,9 +116,9 @@ Type InferTypeUnique(const Call& call, const BlockBuilder& ctx) {
   if (f_convert_to_int64(return_inverse->value)) {
     if (data_ty->ndim == 0) {
       output_ty.push_back(
-          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), DataType::Int(64), data_ty->vdevice));
+          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), PrimType::Int(64), data_ty->vdevice));
     } else {
-      output_ty.push_back(TensorType(DataType::Int(64), /*ndim=*/1, data_ty->vdevice));
+      output_ty.push_back(TensorType(PrimType::Int(64), /*ndim=*/1, data_ty->vdevice));
     }
   }
 
@@ -126,9 +126,9 @@ Type InferTypeUnique(const Call& call, const BlockBuilder& ctx) {
   if (f_convert_to_int64(return_counts->value)) {
     if (data_ty->ndim == 0) {
       output_ty.push_back(
-          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), DataType::Int(64), data_ty->vdevice));
+          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), PrimType::Int(64), data_ty->vdevice));
     } else {
-      output_ty.push_back(TensorType(DataType::Int(64), /*ndim=*/1, data_ty->vdevice));
+      output_ty.push_back(TensorType(PrimType::Int(64), /*ndim=*/1, data_ty->vdevice));
     }
   }
 
@@ -175,7 +175,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 Type InferTypeNonzero(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = GetInputTensorType(call, 0, ctx);
-  return TensorType(DataType::Int(64), 2, data_ty->vdevice);
+  return TensorType(PrimType::Int(64), 2, data_ty->vdevice);
 }
 
 TVM_REGISTER_OP("relax.nonzero")
diff --git a/src/relax/op/tensor/sorting.cc b/src/relax/op/tensor/sorting.cc
index 2d014cded4ec..c470fa0d4f6e 100644
--- a/src/relax/op/tensor/sorting.cc
+++ b/src/relax/op/tensor/sorting.cc
@@ -66,7 +66,7 @@ TVM_REGISTER_OP("relax.sort")
 
 /* relax.argsort */
 
-Expr argsort(Expr data, int axis, bool descending, DataType dtype) {
+Expr argsort(Expr data, int axis, bool descending, DLDataType dtype) {
   auto attrs = ffi::make_object<ArgsortAttrs>();
   attrs->axis = std::move(axis);
   attrs->descending = std::move(descending);
@@ -84,7 +84,8 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Type InferTypeArgsort(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = GetUnaryInputTensorType(call, ctx);
   const auto* attrs = call->attrs.as<ArgsortAttrs>();
-  DataType out_type = attrs->dtype.is_void() ? data_ty->dtype : attrs->dtype;
+  PrimType out_type =
+      attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0} ? data_ty->dtype : PrimType(attrs->dtype);
   if (data_ty->shape.defined()) {
     return TensorType(data_ty->shape.value(), out_type, data_ty->vdevice);
   }
@@ -100,7 +101,7 @@ TVM_REGISTER_OP("relax.argsort")
 
 /* relax.topk */
 
-Expr topk(Expr data, int k, int axis, ffi::String ret_type, bool largest, DataType dtype) {
+Expr topk(Expr data, int k, int axis, ffi::String ret_type, bool largest, DLDataType dtype) {
   auto attrs = ffi::make_object<TopKAttrs>();
   attrs->k = std::move(k);
   attrs->axis = std::move(axis);
@@ -121,7 +122,8 @@ Type InferTypeTopK(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = GetUnaryInputTensorType(call, ctx);
   const auto* data_shape = data_ty->shape.as<ShapeExprNode>();
   const auto* attrs = call->attrs.as<TopKAttrs>();
-  DataType indices_type = attrs->dtype.is_void() ? data_ty->dtype : attrs->dtype;
+  PrimType indices_type =
+      attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0} ? data_ty->dtype : PrimType(attrs->dtype);
   int ndim = data_ty->ndim;
   int k = attrs->k;
   ffi::String ret_type = attrs->ret_type;
diff --git a/src/relax/op/tensor/sorting.h b/src/relax/op/tensor/sorting.h
index a4154ce416ad..8a2ec98388df 100644
--- a/src/relax/op/tensor/sorting.h
+++ b/src/relax/op/tensor/sorting.h
@@ -51,7 +51,7 @@ Expr sort(Expr data, int axis, bool descending);
  * \param dtype The data type of the output indices.
  * \return The computed result.
  */
-Expr argsort(Expr data, int axis, bool descending, DataType dtype);
+Expr argsort(Expr data, int axis, bool descending, DLDataType dtype);
 
 /*!
  * \brief Get the top k elements in an input tensor along the given axis.
@@ -63,7 +63,7 @@ Expr argsort(Expr data, int axis, bool descending, DataType dtype);
  * \param dtype The data type of the indices output.
  * \return The computed result.
  */
-Expr topk(Expr data, int k, int axis, ffi::String ret_type, bool largest, DataType dtype);
+Expr topk(Expr data, int k, int axis, ffi::String ret_type, bool largest, DLDataType dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/op/tensor/statistical.cc b/src/relax/op/tensor/statistical.cc
index 9fe68afe2901..15bbd701e67f 100644
--- a/src/relax/op/tensor/statistical.cc
+++ b/src/relax/op/tensor/statistical.cc
@@ -155,7 +155,8 @@ Type InferTypeScan(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = GetUnaryInputTensorType(call, ctx);
   const auto* attrs = call->attrs.as<ScanopAttrs>();
 
-  DataType out_type = attrs->dtype.is_void() ? data_ty->dtype : attrs->dtype;
+  PrimType out_type =
+      attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0} ? data_ty->dtype : PrimType(attrs->dtype);
 
   if (!attrs->axis.has_value()) {
     // flattened
@@ -216,7 +217,7 @@ Type InferTypeStatisticalExtension(const Call& call, const BlockBuilder& ctx) {
       return TensorType(ShapeExpr(ffi::Array<PrimExpr>()), data_ty->dtype, data_ty->vdevice);
     }
     return TupleType({TensorType(data_ty->dtype, out_ndim, data_ty->vdevice),
-                      TensorType(DataType::Int(64), out_ndim, data_ty->vdevice)});
+                      TensorType(PrimType::Int(64), out_ndim, data_ty->vdevice)});
   }
 
   ffi::Array<PrimExpr> out_shape;
@@ -234,15 +235,15 @@ Type InferTypeStatisticalExtension(const Call& call, const BlockBuilder& ctx) {
     return TensorType(ShapeExpr(out_shape), data_ty->dtype, data_ty->vdevice);
   else
     return TupleType({TensorType(ShapeExpr(out_shape), data_ty->dtype, data_ty->vdevice),
-                      TensorType(ShapeExpr(out_shape), DataType::Int(64), data_ty->vdevice)});
+                      TensorType(ShapeExpr(out_shape), PrimType::Int(64), data_ty->vdevice)});
 }
 
 /* relax.cumprod */
-Expr cumprod(Expr data, ffi::Optional<int64_t> axis, ffi::Optional<DataType> dtype,
+Expr cumprod(Expr data, ffi::Optional<int64_t> axis, ffi::Optional<DLDataType> dtype,
              bool exclusive) {
   auto attrs = ffi::make_object<ScanopAttrs>();
   attrs->axis = std::move(axis);
-  attrs->dtype = std::move(dtype.value_or(DataType::Void()));
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   attrs->exclusive = exclusive;
 
   static const Op& op = Op::Get("relax.cumprod");
@@ -262,10 +263,11 @@ TVM_REGISTER_OP("relax.cumprod")
     .set_attr<bool>("FPurity", true);
 
 /* relax.cumsum */
-Expr cumsum(Expr data, ffi::Optional<int64_t> axis, ffi::Optional<DataType> dtype, bool exclusive) {
+Expr cumsum(Expr data, ffi::Optional<int64_t> axis, ffi::Optional<DLDataType> dtype,
+            bool exclusive) {
   auto attrs = ffi::make_object<ScanopAttrs>();
   attrs->axis = std::move(axis);
-  attrs->dtype = std::move(dtype.value_or(DataType::Void()));
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   attrs->exclusive = exclusive;
 
   static const Op& op = Op::Get("relax.cumsum");
diff --git a/src/relax/op/tensor/statistical.h b/src/relax/op/tensor/statistical.h
index 2d80790926ed..3ab998110603 100644
--- a/src/relax/op/tensor/statistical.h
+++ b/src/relax/op/tensor/statistical.h
@@ -99,7 +99,7 @@ Expr sum(Expr x, ffi::Optional<ffi::Array<int64_t>> axis, bool keepdims);
  * result.
  */
 Expr cumprod(Expr data, ffi::Optional<int64_t> axis = std::nullopt,
-             ffi::Optional<DataType> dtype = std::nullopt, bool exclusive = false);
+             ffi::Optional<DLDataType> dtype = std::nullopt, bool exclusive = false);
 
 /*!
  * \brief Numpy style cumsum op. Return the cumulative inclusive sum of the elements along
@@ -114,7 +114,7 @@ Expr cumprod(Expr data, ffi::Optional<int64_t> axis = std::nullopt,
  * \return The computed result.
  */
 Expr cumsum(Expr data, ffi::Optional<int64_t> axis = std::nullopt,
-            ffi::Optional<DataType> dtype = std::nullopt, bool exclusive = false);
+            ffi::Optional<DLDataType> dtype = std::nullopt, bool exclusive = false);
 
 /*! \brief Computes the variance of tensor elements over given axes. */
 Expr variance(Expr x, ffi::Optional<ffi::Array<int64_t>> axis, bool keepdims);
diff --git a/src/relax/op/tensor/ternary.cc b/src/relax/op/tensor/ternary.cc
index 6daacfe16578..1e21e7dbdcc7 100644
--- a/src/relax/op/tensor/ternary.cc
+++ b/src/relax/op/tensor/ternary.cc
@@ -57,9 +57,9 @@ Type InferTypeEwiseFMA(const Call& call, const BlockBuilder& ctx) {
     }
   }
 
-  DataType output_dtype;
+  PrimType output_dtype = PrimType::Void();
   if (t1->IsUnknownDtype() || t2->IsUnknownDtype() || t3->IsUnknownDtype()) {
-    output_dtype = DataType::Void();
+    output_dtype = PrimType::Void();
   } else if (t1->dtype != t2->dtype || t2->dtype != t3->dtype) {
     TVM_FFI_VISIT_THROW(TypeError, call) << "Data types " << t1->dtype << ", " << t2->dtype
                                          << ", and " << t3->dtype << " must be equal for EwiseFMA";
diff --git a/src/relax/op/tensor/unary.cc b/src/relax/op/tensor/unary.cc
index 598ec78aacda..bd15223df878 100644
--- a/src/relax/op/tensor/unary.cc
+++ b/src/relax/op/tensor/unary.cc
@@ -33,7 +33,7 @@ namespace relax {
 
 Type InferTypeUnaryCheck(const Call& call, const BlockBuilder& ctx) {
   return InferTypeUnary<false>(call, ctx,
-                               [](const TensorType& input_ty) { return DataType::Bool(); });
+                               [](const TensorType& input_ty) { return PrimType::Bool(); });
 }
 
 /***************** Arithmetic operators *****************/
diff --git a/src/relax/op/vision/nms.cc b/src/relax/op/vision/nms.cc
index bde579f0ed5a..6f289d6b8755 100644
--- a/src/relax/op/vision/nms.cc
+++ b/src/relax/op/vision/nms.cc
@@ -84,8 +84,8 @@ Type InferTypeAllClassNMS(const Call& call, const BlockBuilder& ctx) {
     ShapeExpr oshape(oshape_values);
     tvm::ffi::Array<PrimExpr> counts_values = {1};
     ShapeExpr counts_shape(counts_values);
-    tvm::ffi::Array<Type> fields = {TensorType(oshape, DataType::Int(64), vdev),
-                                    TensorType(counts_shape, DataType::Int(64), vdev)};
+    tvm::ffi::Array<Type> fields = {TensorType(oshape, PrimType::Int(64), vdev),
+                                    TensorType(counts_shape, PrimType::Int(64), vdev)};
     return TupleType(fields);
   }
 
@@ -96,9 +96,9 @@ Type InferTypeAllClassNMS(const Call& call, const BlockBuilder& ctx) {
   ShapeExpr scores_shape(scores_values);
   tvm::ffi::Array<PrimExpr> counts_values = {batch};
   ShapeExpr counts_shape(counts_values);
-  tvm::ffi::Array<Type> fields = {TensorType(indices_shape, DataType::Int(64), vdev),
-                                  TensorType(scores_shape, DataType::Float(32), vdev),
-                                  TensorType(counts_shape, DataType::Int(64), vdev)};
+  tvm::ffi::Array<Type> fields = {TensorType(indices_shape, PrimType::Int(64), vdev),
+                                  TensorType(scores_shape, PrimType::Float(32), vdev),
+                                  TensorType(counts_shape, PrimType::Int(64), vdev)};
   return TupleType(fields);
 }
 
@@ -153,9 +153,9 @@ Type InferTypeGetValidCounts(const Call& call, const BlockBuilder& ctx) {
   auto vdev = data_ty->vdevice;
   const auto* data_shape = data_ty->shape.as<ShapeExprNode>();
   if (data_shape == nullptr) {
-    tvm::ffi::Array<Type> fields = {TensorType(DataType::Int(32), /*ndim=*/1, vdev),
+    tvm::ffi::Array<Type> fields = {TensorType(PrimType::Int(32), /*ndim=*/1, vdev),
                                     TensorType(data_ty->dtype, /*ndim=*/3, vdev),
-                                    TensorType(DataType::Int(32), /*ndim=*/2, vdev)};
+                                    TensorType(PrimType::Int(32), /*ndim=*/2, vdev)};
     return TupleType(fields);
   }
 
@@ -177,9 +177,9 @@ Type InferTypeGetValidCounts(const Call& call, const BlockBuilder& ctx) {
   }
 
   tvm::ffi::Array<Type> fields = {
-      TensorType(ShapeExpr({batch}), DataType::Int(32), vdev),
+      TensorType(ShapeExpr({batch}), PrimType::Int(32), vdev),
       TensorType(ShapeExpr({batch, num_anchors, elem_length}), data_ty->dtype, vdev),
-      TensorType(ShapeExpr({batch, num_anchors}), DataType::Int(32), vdev)};
+      TensorType(ShapeExpr({batch, num_anchors}), PrimType::Int(32), vdev)};
   return TupleType(fields);
 }
 
@@ -251,12 +251,12 @@ Type InferTypeNMS(const Call& call, const BlockBuilder& ctx) {
     TVM_FFI_VISIT_THROW(ValueError, call)
         << "non_max_suppression expects indices to be 2-D, got ndim " << indices_ty->ndim;
   }
-  if (!valid_count_ty->IsUnknownDtype() && valid_count_ty->dtype != DataType::Int(32)) {
+  if (!valid_count_ty->IsUnknownDtype() && valid_count_ty->dtype != PrimType::Int(32)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "non_max_suppression expects valid_count to have dtype int32, got "
         << valid_count_ty->dtype;
   }
-  if (!indices_ty->IsUnknownDtype() && indices_ty->dtype != DataType::Int(32)) {
+  if (!indices_ty->IsUnknownDtype() && indices_ty->dtype != PrimType::Int(32)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "non_max_suppression expects indices to have dtype int32, got " << indices_ty->dtype;
   }
@@ -319,30 +319,30 @@ Type InferTypeNMS(const Call& call, const BlockBuilder& ctx) {
       //                   valid_box_count[batch, 1])
       if (data_shape == nullptr) {
         tvm::ffi::Array<Type> fields = {TensorType(data_ty->dtype, /*ndim=*/3, vdev),
-                                        TensorType(DataType::Int(32), /*ndim=*/2, vdev),
-                                        TensorType(DataType::Int(32), /*ndim=*/2, vdev)};
+                                        TensorType(PrimType::Int(32), /*ndim=*/2, vdev),
+                                        TensorType(PrimType::Int(32), /*ndim=*/2, vdev)};
         return TupleType(fields);
       }
       auto batch = data_shape->values[0];
       auto num_anchors = data_shape->values[1];
       tvm::ffi::Array<Type> fields = {
           TensorType(ffi::GetRef<ShapeExpr>(data_shape), data_ty->dtype, vdev),
-          TensorType(ShapeExpr({batch, num_anchors}), DataType::Int(32), vdev),
-          TensorType(ShapeExpr({batch, IntImm::Int64(1)}), DataType::Int(32), vdev)};
+          TensorType(ShapeExpr({batch, num_anchors}), PrimType::Int(32), vdev),
+          TensorType(ShapeExpr({batch, IntImm::Int64(1)}), PrimType::Int(32), vdev)};
       return TupleType(fields);
     }
 
     // Hard NMS returns (box_indices[batch, num_anchors], valid_box_count[batch, 1])
     if (data_shape == nullptr) {
-      tvm::ffi::Array<Type> fields = {TensorType(DataType::Int(32), /*ndim=*/2, vdev),
-                                      TensorType(DataType::Int(32), /*ndim=*/2, vdev)};
+      tvm::ffi::Array<Type> fields = {TensorType(PrimType::Int(32), /*ndim=*/2, vdev),
+                                      TensorType(PrimType::Int(32), /*ndim=*/2, vdev)};
       return TupleType(fields);
     }
     auto batch = data_shape->values[0];
     auto num_anchors = data_shape->values[1];
     tvm::ffi::Array<Type> fields = {
-        TensorType(ShapeExpr({batch, num_anchors}), DataType::Int(32), vdev),
-        TensorType(ShapeExpr({batch, IntImm::Int64(1)}), DataType::Int(32), vdev)};
+        TensorType(ShapeExpr({batch, num_anchors}), PrimType::Int(32), vdev),
+        TensorType(ShapeExpr({batch, IntImm::Int64(1)}), PrimType::Int(32), vdev)};
     return TupleType(fields);
   }
 
diff --git a/src/relax/script/printer/dependent_type.cc b/src/relax/script/printer/dependent_type.cc
index a37c21406fac..e3a14c0cdafe 100644
--- a/src/relax/script/printer/dependent_type.cc
+++ b/src/relax/script/printer/dependent_type.cc
@@ -100,7 +100,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
           }
           if (!n->IsUnknownDtype()) {
             kwargs_keys.push_back("dtype");
-            kwargs_values.push_back(LiteralDoc::DataType(n->dtype, n_p->Attr("dtype")));
+            kwargs_values.push_back(LiteralDoc::DataType(n->dtype->dtype, n_p->Attr("dtype")));
           }
           if (!n->shape.defined() && !n->IsUnknownNdim()) {
             kwargs_keys.push_back("ndim");
diff --git a/src/relax/script/printer/distributed.cc b/src/relax/script/printer/distributed.cc
index f05ec8fe714a..97d800d5d139 100644
--- a/src/relax/script/printer/distributed.cc
+++ b/src/relax/script/printer/distributed.cc
@@ -61,11 +61,11 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
           }
           if (!n->tensor_ty->IsUnknownDtype()) {
             if (!require_kwargs) {
-              args.push_back(LiteralDoc::DataType(n->tensor_ty->dtype, n_p->Attr("dtype")));
+              args.push_back(LiteralDoc::DataType(n->tensor_ty->dtype->dtype, n_p->Attr("dtype")));
             } else {
               kwargs_keys.push_back("dtype");
               kwargs_values.push_back(
-                  LiteralDoc::DataType(n->tensor_ty->dtype, n_p->Attr("dtype")));
+                  LiteralDoc::DataType(n->tensor_ty->dtype->dtype, n_p->Attr("dtype")));
             }
           } else {
             require_kwargs = true;
diff --git a/src/relax/script/printer/expr.cc b/src/relax/script/printer/expr.cc
index dfce2b40b1f9..7b2f39ecf335 100644
--- a/src/relax/script/printer/expr.cc
+++ b/src/relax/script/printer/expr.cc
@@ -81,21 +81,21 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
         });
 
 ffi::Optional<ExprDoc> SpecialScalar(const runtime::Tensor& n, const AccessPath& p) {
-  DataType dtype = n.DataType();
+  DLDataType dtype = n.DataType();
   const void* data = n->data;
   if (n->ndim != 0 || n->device.device_type != kDLCPU) {
     return std::nullopt;
   }
 
-  if (dtype == DataType::Int(8)) {
+  if (dtype == DLDataType{kDLInt, 8, 1}) {
     return LiteralDoc::Int(*reinterpret_cast<const int8_t*>(data), p);
-  } else if (dtype == DataType::Int(16)) {
+  } else if (dtype == DLDataType{kDLInt, 16, 1}) {
     return LiteralDoc::Int(*reinterpret_cast<const int16_t*>(data), p);
-  } else if (dtype == DataType::Int(32)) {
+  } else if (dtype == DLDataType{kDLInt, 32, 1}) {
     return LiteralDoc::Int(*reinterpret_cast<const int32_t*>(data), p);
-  } else if (dtype == DataType::Int(64)) {
+  } else if (dtype == DLDataType{kDLInt, 64, 1}) {
     return LiteralDoc::Int(*reinterpret_cast<const int64_t*>(data), p);
-  } else if (dtype == DataType::Float(16)) {
+  } else if (dtype == DLDataType{kDLFloat, 16, 1}) {
     // From IEEE-754 float16 definition
     //
     // Ref: https://en.wikipedia.org/wiki/Half-precision_floating-point_format
@@ -122,11 +122,11 @@ ffi::Optional<ExprDoc> SpecialScalar(const runtime::Tensor& n, const AccessPath&
     }
 
     return LiteralDoc::Float(value, p);
-  } else if (dtype == DataType::Float(32)) {
+  } else if (dtype == DLDataType{kDLFloat, 32, 1}) {
     return LiteralDoc::Float(*reinterpret_cast<const float*>(data), p);
-  } else if (dtype == DataType::Float(64)) {
+  } else if (dtype == DLDataType{kDLFloat, 64, 1}) {
     return LiteralDoc::Float(*reinterpret_cast<const double*>(data), p);
-  } else if (dtype == DataType::Bool()) {
+  } else if (dtype == DLDataType{kDLBool, 8, 1}) {
     return LiteralDoc::Boolean(*reinterpret_cast<const uint8_t*>(data), p);
   } else {
     return std::nullopt;
diff --git a/src/relax/script/printer/tir.cc b/src/relax/script/printer/tir.cc
index e0742f8edd44..06bce7c1ff8c 100644
--- a/src/relax/script/printer/tir.cc
+++ b/src/relax/script/printer/tir.cc
@@ -43,9 +43,10 @@ RelaxFrameNode* GetRelaxFrame(IRDocsifier d) {
 }
 
 Doc PrintTIRVar(tirx::Var n, AccessPath n_p, IRDocsifier d) {
-  TVM_FFI_CHECK(n->dtype.is_scalar(), TypeError)
+  PrimType n_ty = n.ty();
+  TVM_FFI_CHECK(!n_ty.IsScalableVector() && !n_ty.IsFixedLengthVector(), TypeError)
       << "Relax only uses scalar TIR variables,"
-      << "but received TIR variable " << n << " with dtype " << n->dtype;
+      << " but received TIR variable " << n << " with dtype " << n_ty->dtype;
 
   if (!d->IsVarDefined(n)) {
     RelaxFrameNode* f = GetRelaxFrame(d);
@@ -77,7 +78,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tvm::IntImm>(                                             //
         "relax", [](tvm::IntImm n, AccessPath n_p, IRDocsifier d) -> Doc {  //
           // TODO(@junrushao): support non-int64 cases
-          if (n->dtype.is_bool()) {
+          if (n->ty().MatchesElementType(DLDataTypeCode::kDLBool, 8)) {
             return LiteralDoc::Boolean(n->value, n_p);
           } else {
             return LiteralDoc::Int(n->value, n_p);
diff --git a/src/relax/transform/adjust_matmul_order.cc b/src/relax/transform/adjust_matmul_order.cc
index 4cf8831514dc..2d6e6fcc5e33 100644
--- a/src/relax/transform/adjust_matmul_order.cc
+++ b/src/relax/transform/adjust_matmul_order.cc
@@ -208,22 +208,24 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
     // If two of the three are compile-time, group those two values
     // together, to allow them to be lifted out and pre-computed.
     if (is_compile_time(expr_a) && is_compile_time(expr_b)) {
-      return matmul(matmul(expr_a, expr_b, DataType::Void()), expr_c, DataType::Void());
+      return matmul(matmul(expr_a, expr_b, (DLDataType{kDLOpaqueHandle, 0, 0})), expr_c,
+                    (DLDataType{kDLOpaqueHandle, 0, 0}));
     } else if (is_compile_time(expr_b) && is_compile_time(expr_c)) {
-      return matmul(expr_a, matmul(expr_b, expr_c, DataType::Void()), DataType::Void());
+      return matmul(expr_a, matmul(expr_b, expr_c, (DLDataType{kDLOpaqueHandle, 0, 0})),
+                    (DLDataType{kDLOpaqueHandle, 0, 0}));
     }
 
     // Otherwise, select the order that reduces the total number of
     // operations required, assuming a naive matmul (see below).
 
     if (shape_a.size() == 1) {
-      shape_a = {IntImm(shape_a[0].dtype(), 1), shape_a[0]};
+      shape_a = {IntImm(shape_a[0].ty(), 1), shape_a[0]};
     }
     if (shape_b.size() == 1) {
       if (matches.count(pat_matmul_on_lhs)) {
-        shape_b = {shape_b[0], IntImm(shape_b[0].dtype(), 1)};
+        shape_b = {shape_b[0], IntImm(shape_b[0].ty(), 1)};
       } else if (matches.count(pat_matmul_on_rhs)) {
-        shape_b = {IntImm(shape_b[0].dtype(), 1), shape_b[0]};
+        shape_b = {IntImm(shape_b[0].ty(), 1), shape_b[0]};
       } else {
         TVM_FFI_THROW(InternalError)
             << "OrPattern " << pat << " matched, but neither " << pat_matmul_on_lhs << " nor "
@@ -231,7 +233,7 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
       }
     }
     if (shape_c.size() == 1) {
-      shape_c = {shape_c[0], IntImm(shape_c[0].dtype(), 1)};
+      shape_c = {shape_c[0], IntImm(shape_c[0].ty(), 1)};
     }
 
     PrimExpr size_N = shape_a[shape_a.size() - 2];  // row of A
@@ -285,9 +287,11 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
                       size_N > 0 && size_R > 0 && size_M > 0 && size_B > 0);
 
     if (analyzer->CanProve(ops_with_lhs_first < ops_with_rhs_first)) {
-      return matmul(matmul(expr_a, expr_b, DataType::Void()), expr_c, DataType::Void());
+      return matmul(matmul(expr_a, expr_b, (DLDataType{kDLOpaqueHandle, 0, 0})), expr_c,
+                    (DLDataType{kDLOpaqueHandle, 0, 0}));
     } else if (analyzer->CanProve(ops_with_rhs_first < ops_with_lhs_first)) {
-      return matmul(expr_a, matmul(expr_b, expr_c, DataType::Void()), DataType::Void());
+      return matmul(expr_a, matmul(expr_b, expr_c, (DLDataType{kDLOpaqueHandle, 0, 0})),
+                    (DLDataType{kDLOpaqueHandle, 0, 0}));
     }
 
     // If we cannot determine which order is best, keep the existing order.
diff --git a/src/relax/transform/allocate_workspace.cc b/src/relax/transform/allocate_workspace.cc
index 4dfc84b822da..a593cb7ffee7 100644
--- a/src/relax/transform/allocate_workspace.cc
+++ b/src/relax/transform/allocate_workspace.cc
@@ -61,7 +61,7 @@ class ExternFunctionRewriter : ExprMutator {
       // Append the workspace parameter to this function.
       ffi::Array<Var> new_params = func_node->params;
 
-      auto ty = TensorType(ShapeExpr({IntImm::Int32(max_workspace_size_)}), DataType::UInt(8));
+      auto ty = TensorType(ShapeExpr({IntImm::Int32(max_workspace_size_)}), PrimType::UInt(8));
       Var workspace_param(name_sup_->FreshName("workspace"), ty);
 
       if (func_node->GetAttr<ffi::String>(attr::kCodegen)) {
@@ -149,7 +149,7 @@ class WorkspaceProvider : ExprMutator {
     builder_->BeginDataflowBlock();
     if (!workspace_var_main_.defined()) {
       auto shape = ShapeExpr({IntImm::Int32(max_workspace_size_)});
-      auto ty = DataTypeImm(DataType::UInt(8));
+      auto ty = DataTypeImm((DLDataType{kDLUInt, 8, 1}));
       auto workspace = MakeAllocTensor(shape, ty, PrimValue::Int64(0));
       workspace_var_main_ = builder_->Emit(workspace, "workspace_main");
     }
diff --git a/src/relax/transform/alter_op_impl.cc b/src/relax/transform/alter_op_impl.cc
index a938b946d20c..7a3b5743f423 100644
--- a/src/relax/transform/alter_op_impl.cc
+++ b/src/relax/transform/alter_op_impl.cc
@@ -45,7 +45,7 @@ static constexpr const char* kOperatorName = "operator_name";
 
 /*! \brief Construct ranges from shape dimensions */
 static ffi::Array<Range> ConstructRangeFromShape(const ffi::Array<PrimExpr>& shape) {
-  return shape.Map([](const PrimExpr& dim) { return Range(IntImm(dim.dtype(), 0), dim); });
+  return shape.Map([](const PrimExpr& dim) { return Range(IntImm(dim.ty(), 0), dim); });
 }
 
 static ffi::Array<PrimExpr> GetShapeFromTensorType(const TensorType& tensor_ty) {
@@ -206,7 +206,7 @@ class AlterOpImplMutator : public ExprMutator {
    * \brief Adds the \p remove_pad op to the module if it has not already been added before.
    * \returns The global var associated with the remove_pad PrimFunc.
    */
-  GlobalVar GetOrCreateRemovePadOp(const ffi::Array<PrimExpr>& old_shape, const DataType& dtype) {
+  GlobalVar GetOrCreateRemovePadOp(const ffi::Array<PrimExpr>& old_shape, DLDataType dtype) {
     int t_shape = old_shape.size();
     if (remove_pad_map_.count(t_shape) != 0) {
       return remove_pad_map_[t_shape];
@@ -214,8 +214,8 @@ class AlterOpImplMutator : public ExprMutator {
     // Create dynamic shapes for input and output tensors
     ffi::Array<PrimExpr> dyn_padded_shape, dyn_old_shape;
     for (int i = 0; i < t_shape; i++) {
-      tirx::Var var1("p" + std::to_string(i), old_shape[i].dtype());
-      tirx::Var var2("i" + std::to_string(i), old_shape[i].dtype());
+      tirx::Var var1("p" + std::to_string(i), old_shape[i].ty());
+      tirx::Var var2("i" + std::to_string(i), old_shape[i].ty());
       dyn_padded_shape.push_back(var1);
       dyn_old_shape.push_back(var2);
     }
@@ -264,7 +264,7 @@ class AlterOpImplMutator : public ExprMutator {
           TransformLayout(expr, inverse_index_map, axis_separator, input_axis_separator));
       const auto& tensor_ty = padded_expr->ty.as_or_throw<TensorType>();
 
-      GlobalVar gv_remove_pad = GetOrCreateRemovePadOp(old_shape, tensor_ty->dtype);
+      GlobalVar gv_remove_pad = GetOrCreateRemovePadOp(old_shape, tensor_ty->dtype->dtype);
       return Call(call_tir_op_, {gv_remove_pad, Tuple({padded_expr})}, {}, {old_tensor_ty});
     }
   }
diff --git a/src/relax/transform/call_tir_rewrite.cc b/src/relax/transform/call_tir_rewrite.cc
index 61fee5be7f8d..5a1bbcaa0040 100644
--- a/src/relax/transform/call_tir_rewrite.cc
+++ b/src/relax/transform/call_tir_rewrite.cc
@@ -90,12 +90,12 @@ class CallTIRMutator : public ExprMutator {
         }
 
         if (!is_inplace) {
-          outs.push_back(builder_->Emit(
-              Call(alloc_tensor_op,
-                   {output_ty->shape.value().as_or_throw<ShapeExpr>(),
-                    DataTypeImm(output_ty->dtype), PrimValue::Int64(dev_index), StringImm(scope)},
-                   Attrs(), {output_ty}),
-              "alloc"));
+          outs.push_back(builder_->Emit(Call(alloc_tensor_op,
+                                             {output_ty->shape.value().as_or_throw<ShapeExpr>(),
+                                              DataTypeImm(output_ty->dtype->dtype),
+                                              PrimValue::Int64(dev_index), StringImm(scope)},
+                                             Attrs(), {output_ty}),
+                                        "alloc"));
         } else {
           // if there is only one output, it must be an in-place argument, but check anyway
           TVM_FFI_ICHECK(inplace_attrs->inplace_indices[0] != -1)
@@ -129,8 +129,8 @@ class CallTIRMutator : public ExprMutator {
             outs.push_back(
                 builder_->Emit(Call(alloc_tensor_op,
                                     {field_tensor->shape.value().as_or_throw<ShapeExpr>(),
-                                     DataTypeImm(field_tensor->dtype), PrimValue::Int64(dev_index),
-                                     StringImm(scope)},
+                                     DataTypeImm(field_tensor->dtype->dtype),
+                                     PrimValue::Int64(dev_index), StringImm(scope)},
                                     Attrs(), {field_tensor}),
                                "alloc"));
           } else {
diff --git a/src/relax/transform/combine_parallel_matmul.cc b/src/relax/transform/combine_parallel_matmul.cc
index 1319356ee169..128202063695 100644
--- a/src/relax/transform/combine_parallel_matmul.cc
+++ b/src/relax/transform/combine_parallel_matmul.cc
@@ -202,7 +202,7 @@ ffi::TypedFunction<ffi::Map<Var, Expr>(ffi::Map<DFPattern, Var>, ffi::Map<Var, E
       }
 
       auto concat_rhs = concat(Tuple(rhs), rhs_dim - 1);
-      auto out_dtype = GetTensorType(matchings[patterns.matmul[indices[0]]])->dtype;
+      DLDataType out_dtype = GetTensorType(matchings[patterns.matmul[indices[0]]])->dtype->dtype;
       auto matmul_combined = matmul(lhs, concat_rhs, out_dtype);
 
       if (branch_info.bias_dim) {
diff --git a/src/relax/transform/compute_prim_value.cc b/src/relax/transform/compute_prim_value.cc
index 4ad34d04367d..4c937fe135dc 100644
--- a/src/relax/transform/compute_prim_value.cc
+++ b/src/relax/transform/compute_prim_value.cc
@@ -43,11 +43,11 @@ class PrimValueComputeInjector : public ExprMutator {
       return node;
     }
 
-    auto ret_dtype = node->value->dtype;
+    tvm::PrimType ret_ty = node->value.ty();
     auto param_vars = tirx::UndefinedVars(node->value);
-    tirx::Stmt body = tirx::Evaluate(tirx::Call(ret_dtype, tirx::builtin::ret(), {node->value}));
+    tirx::Stmt body = tirx::Evaluate(tirx::Call(ret_ty, tirx::builtin::ret(), {node->value}));
 
-    tirx::PrimFunc func(param_vars, body, tvm::PrimType(ret_dtype), {},
+    tirx::PrimFunc func(param_vars, body, ret_ty, {},
                         DictAttrs({{tirx::attr::kIsHostFunc, true}, {tvm::attr::kSTir, true}}));
     func = s_tir::RenewDefs(func);
 
diff --git a/src/relax/transform/convert_layout.cc b/src/relax/transform/convert_layout.cc
index ed2a9b1c8a8a..bd4631bb4cf8 100644
--- a/src/relax/transform/convert_layout.cc
+++ b/src/relax/transform/convert_layout.cc
@@ -102,7 +102,7 @@ class LayoutConvertMutator : public ExprMutator {
     ffi::Array<PrimExpr> initial_indices_expr;
     initial_indices.reserve(ndim);
     for (int i = 0; i < ndim; ++i) {
-      auto var = tvm::tirx::Var("i" + std::to_string(i), DataType::Int(32));
+      auto var = tvm::tirx::Var("i" + std::to_string(i), PrimType::Int(32));
       initial_indices.push_back(var);
       initial_indices_expr.push_back(var);
     }
diff --git a/src/relax/transform/dataflow_inplace.cc b/src/relax/transform/dataflow_inplace.cc
index fcedd3119599..289c1c3c3b40 100644
--- a/src/relax/transform/dataflow_inplace.cc
+++ b/src/relax/transform/dataflow_inplace.cc
@@ -383,7 +383,7 @@ std::unordered_set<Type, ffi::ObjectPtrHash, ffi::ObjectPtrEqual> GatherCandidat
     const Type& result_ty) {
   if (auto* tensor_info = result_ty.as<TensorTypeNode>()) {
     // don't consider void dtype (don't know the size at compile time)
-    if (tensor_info->dtype.is_void()) {
+    if (tensor_info->dtype.IsVoid()) {
       return {};
     }
     // don't consider cases where we don't know the shape at compile time
diff --git a/src/relax/transform/decompose_ops.cc b/src/relax/transform/decompose_ops.cc
index 494e4a67a4a4..156d3c278c46 100644
--- a/src/relax/transform/decompose_ops.cc
+++ b/src/relax/transform/decompose_ops.cc
@@ -66,7 +66,7 @@ Tuple DecomposeBatchNorm(const Call& call) {
   Expr moving_var = ExpandToMatchInput(call->args[4], ty->ndim, {attrs->axis});
 
   // output = (x - mean) / sqrt(var + epsilon) * gamma + beta
-  Expr epsilon = MakeConstantScalar(attrs->epsilon, ty->dtype);
+  Expr epsilon = MakeConstantScalar(attrs->epsilon, ty->dtype->dtype);
   Expr sqrt_var = sqrt(add(moving_var, epsilon));
   Expr out = divide(subtract(data, moving_mean), sqrt_var);
 
@@ -103,8 +103,8 @@ Expr MutateBatchNormForTraining(Call call) {
   Expr data_mean = mean(data, reduce_axes, false);
   Expr data_var = variance(data, reduce_axes, false);
 
-  Expr momentum = MakeConstantScalar(attrs->momentum, ty->dtype);
-  Expr one_minus_mom = MakeConstantScalar(1 - attrs->momentum, ty->dtype);
+  Expr momentum = MakeConstantScalar(attrs->momentum, ty->dtype->dtype);
+  Expr one_minus_mom = MakeConstantScalar(1 - attrs->momentum, ty->dtype->dtype);
 
   Expr new_moving_mean = add(multiply(one_minus_mom, moving_mean), multiply(momentum, data_mean));
   Expr new_moving_var = add(multiply(one_minus_mom, moving_var), multiply(momentum, data_var));
@@ -128,7 +128,7 @@ Expr DecomposeLayerNorm(const Call& call) {
   Expr data_var = variance(data, attrs->axes, true);
 
   // output = (x - mean) / sqrt(var + epsilon) * gamma + beta
-  Expr epsilon = MakeConstantScalar(attrs->epsilon, ty->dtype);
+  Expr epsilon = MakeConstantScalar(attrs->epsilon, ty->dtype->dtype);
   Expr sqrt_var = sqrt(add(data_var, epsilon));
   Expr out = divide(subtract(data, data_mean), sqrt_var);
 
@@ -159,7 +159,7 @@ Expr TensorToShape(const Call& call_node, const BlockBuilder& builder) {
   // ffi::Array<PrimExpr>), we define symbolic variables and returns them as a ShapeExpr.
   ffi::Array<PrimExpr> shape_var;
   for (int i = 0; i < ty->ndim; i++) {
-    shape_var.push_back(tirx::Var("x", DataType::Int(64)));
+    shape_var.push_back(tirx::Var("x", PrimType::Int(64)));
   }
   // bind symbolic variables to the shape tuple
   relax::Var var("y", ShapeType(shape_var));
diff --git a/src/relax/transform/expand_matmul_of_sum.cc b/src/relax/transform/expand_matmul_of_sum.cc
index 1e768478fd95..9bf5fbd53b2d 100644
--- a/src/relax/transform/expand_matmul_of_sum.cc
+++ b/src/relax/transform/expand_matmul_of_sum.cc
@@ -88,7 +88,8 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
       rhs_b = permute_dims(rhs_b, axes);
     }
 
-    return add(matmul(lhs, rhs_a, DataType::Void()), matmul(lhs, rhs_b, DataType::Void()));
+    return add(matmul(lhs, rhs_a, (DLDataType{kDLOpaqueHandle, 0, 0})),
+               matmul(lhs, rhs_b, (DLDataType{kDLOpaqueHandle, 0, 0})));
   };
 
   return {pat_matmul, rewriter};
diff --git a/src/relax/transform/fold_constant.cc b/src/relax/transform/fold_constant.cc
index d615c014709b..7c92ae49c578 100644
--- a/src/relax/transform/fold_constant.cc
+++ b/src/relax/transform/fold_constant.cc
@@ -197,7 +197,7 @@ class ConstantFolder : public ExprMutator {
   // Returns std::nullopt on failure.
   ffi::Optional<Expr> ConstEvaluateCallTIR(tirx::PrimFunc tir_func,
                                            ffi::Array<runtime::Tensor> arr_args, ffi::Shape shape,
-                                           DataType ret_type) {
+                                           DLDataType ret_type) {
     // obtain function from the cache.
     ffi::Optional<ffi::Function> func = GetCachedBuild(tir_func);
     if (!func) return std::nullopt;
@@ -243,7 +243,8 @@ class ConstantFolder : public ExprMutator {
       if (!shape) return std::nullopt;
       auto tensor_ty = tuple_ty->fields[i].as_or_throw<TensorType>();
       if (tensor_ty->IsUnknownDtype()) return std::nullopt;
-      ret_tensors.push_back(runtime::Tensor::Empty(shape.value(), tensor_ty->dtype, cpu_dev));
+      ret_tensors.push_back(
+          runtime::Tensor::Empty(shape.value(), tensor_ty->dtype->dtype, cpu_dev));
     }
 
     // Pack input args + all output tensors.
@@ -288,7 +289,8 @@ class ConstantFolder : public ExprMutator {
     ffi::Optional<ffi::Shape> shape = MatchConstShape(call->ty_args[0]);
     if (shape) {
       TensorType ret_ty = call->ty.as_or_throw<TensorType>();
-      return ConstEvaluateCallTIR(func.value(), arr_args.value(), shape.value(), ret_ty->dtype)
+      return ConstEvaluateCallTIR(func.value(), arr_args.value(), shape.value(),
+                                  ret_ty->dtype->dtype)
           .value_or({});
     }
     return {};
@@ -391,7 +393,7 @@ class ConstantFolder : public ExprMutator {
         for (size_t i = 0; i < values.size(); i++) {
           PrimExpr val = values[i];
           arr.push_back(val.as<IntImmNode>()->value);
-          is_known &= (val.dtype() == DataType::Int(64));
+          is_known &= val.ty().MatchesElementType(DLDataTypeCode::kDLInt, 64);
         }
         if (is_known) {
           const auto func = tvm::ffi::Function::GetGlobalRequired("relax.run.shape_to_tensor");
diff --git a/src/relax/transform/fuse_tir.cc b/src/relax/transform/fuse_tir.cc
index d5e656d15256..00c1029a98d1 100644
--- a/src/relax/transform/fuse_tir.cc
+++ b/src/relax/transform/fuse_tir.cc
@@ -60,10 +60,10 @@ class SymbolicMatcher : ExprFunctor<void(const PrimExpr& n, const PrimExpr& othe
   void VisitExpr(const PrimExpr& node, const PrimExpr& other) {
     if (node.same_as(other)) {
       return;
-    } else if (node.dtype().code() != other.dtype().code()) {
+    } else if (node.ty().code() != other.ty().code()) {
       TVM_FFI_THROW(InternalError)
-          << "Parameter expression " << node << " with dtype " << node.dtype()
-          << " cannot match to argument " << other << " with dtype " << other.dtype();
+          << "Parameter expression " << node << " with dtype " << node.ty()->dtype
+          << " cannot match to argument " << other << " with dtype " << other.ty()->dtype;
     } else {
       ExprFunctor::VisitExpr(node, other);
     }
@@ -120,9 +120,10 @@ class SymbolicMatcher : ExprFunctor<void(const PrimExpr& n, const PrimExpr& othe
   void VisitExpr_(const CastNode* op, const PrimExpr& other) {
     const auto* rhs = other.as<CastNode>();
     if (!rhs) {
-      TVM_FFI_THROW(InternalError) << "Parameter expression " << ffi::GetRef<PrimExpr>(op)
-                                   << " expected an cast to " << op->dtype << " as the argument, "
-                                   << "but was provided with the argument " << other;
+      TVM_FFI_THROW(InternalError)
+          << "Parameter expression " << ffi::GetRef<PrimExpr>(op) << " expected an cast to "
+          << op->ty()->dtype << " as the argument, "
+          << "but was provided with the argument " << other;
     }
     VisitExpr(op->value, rhs->value);
   }
@@ -132,10 +133,11 @@ class SymbolicMatcher : ExprFunctor<void(const PrimExpr& n, const PrimExpr& othe
 
     if (lhs.same_as(rhs)) {
       // Reference identity, no further checks needed.
-    } else if (op->dtype.code() != rhs->dtype.code()) {
+    } else if (op->ty().code() != rhs.ty().code()) {
       TVM_FFI_THROW(InternalError)
-          << "Parameter expression " << ffi::GetRef<PrimExpr>(op) << " with dtype " << op->dtype
-          << " cannot match to argument " << rhs << " with dtype " << rhs.dtype();
+          << "Parameter expression " << ffi::GetRef<PrimExpr>(op) << " with dtype "
+          << op->ty()->dtype << " cannot match to argument " << rhs << " with dtype "
+          << rhs.ty()->dtype;
     } else if (auto it = var_remap_->find(lhs); it != var_remap_->end()) {
       VisitExpr((*it).second, rhs);
     } else {
@@ -592,7 +594,7 @@ class FusedTIRConstructor : public ExprVisitor {
         // printed, it's more readable when done explicitly.  Since
         // Buffer is used more than param it gets the name with better
         // readability.
-        tirx::Var param = tirx::Var("p_" + buffer->name, tvm::PrimType(DataType::Handle()));
+        tirx::Var param = tirx::Var("p_" + buffer->name, tvm::PrimType::Handle());
         func_info_.params.push_back(param);
         func_info_.buffer_map.Set(param, buffer);
       }
@@ -636,8 +638,7 @@ class FusedTIRConstructor : public ExprVisitor {
         continue;
       }
 
-      tirx::Var param =
-          tirx::Var("p_output" + std::to_string(out_idx), tvm::PrimType(DataType::Handle()));
+      tirx::Var param = tirx::Var("p_output" + std::to_string(out_idx), tvm::PrimType::Handle());
       out_idx++;
       func_info_.buffer_map.Set(param, buffers[i]);
       func_info_.params.push_back(param);
@@ -855,9 +856,10 @@ class FusedTIRConstructor : public ExprVisitor {
     for (int64_t idx : output_indices) {
       int i = static_cast<int>(idx);
       const tirx::Var& param = func->params[static_cast<size_t>(i)];
-      if (param->dtype.is_int() || param->dtype.is_uint()) {
+      tvm::PrimType param_ty = param.ty();
+      if (param_ty.code() == DLDataTypeCode::kDLInt || param_ty.code() == DLDataTypeCode::kDLUInt) {
         if (symbolic_var_index == -1) symbolic_var_index = i;
-      } else if (param->dtype.is_handle()) {
+      } else if (param_ty.IsHandle()) {
         TVM_FFI_ICHECK(symbolic_var_index == -1)
             << "The scalar input should be at the ending of the "
                "parameter list.";
@@ -865,7 +867,7 @@ class FusedTIRConstructor : public ExprVisitor {
       } else {
         TVM_FFI_THROW(InternalError)
             << "The params of PrimFunc are expected to be Buffer handle or scalar, but got: "
-            << param->dtype;
+            << param_ty->dtype;
       }
     }
 
@@ -967,7 +969,7 @@ class FusedTIRConstructor : public ExprVisitor {
       // Case 1. The relax param is a Tensor, we directly create a tirx var and buffer
       const auto* shape_expr = tensor->shape.as<ShapeExprNode>();
       TVM_FFI_ICHECK(shape_expr) << "FuseTIR expects all Tensor parameters have a known shape.";
-      DataType dtype = tensor->dtype;
+      DLDataType dtype = tensor->dtype->dtype;
       tirx::Buffer buffer;
       if (tir_buffer_param.defined()) {
         buffer = tirx::decl_buffer(shape_expr->values, dtype, name_hint,
@@ -980,7 +982,7 @@ class FusedTIRConstructor : public ExprVisitor {
 
     } else if (const auto* prim_value = ty.as<PrimTypeNode>()) {
       // Case 2. The relax param is a scalar, we directly create a tirx var
-      out->push_back(tirx::Var(name_hint, prim_value->dtype));
+      out->push_back(tirx::Var(name_hint, tvm::PrimType(prim_value->dtype)));
 
     } else if (const auto* shape_expr = ty.as<ShapeTypeNode>()) {
       // Case 3. The relax param is a tuple of scalars, each represented as a tirx var
@@ -1257,7 +1259,7 @@ class TIRFuseMutator : public ExprMutator {
         if (const auto* literal = arg.as<PrimValueNode>()) {
           tir_vars.push_back(literal->value);
         } else if (const auto* var = arg.as<VarNode>()) {
-          tir_vars.push_back(tirx::Var(var->name_hint(), prim_value->dtype));
+          tir_vars.push_back(tirx::Var(var->name_hint(), tvm::PrimType(prim_value->dtype)));
         } else {
           TVM_FFI_THROW(TypeError) << "FuseTIR expects scalar arguments to be PrimValue or Var, "
                                    << "but received " << arg;
diff --git a/src/relax/transform/gradient.cc b/src/relax/transform/gradient.cc
index df22650e036d..992103de7d91 100644
--- a/src/relax/transform/gradient.cc
+++ b/src/relax/transform/gradient.cc
@@ -304,7 +304,7 @@ class BackwardBindingGenerator : private ExprVisitor {
 
     // Initialize the adjoint of target_var as ones op. We have already checked the target.
     auto* target_ty = GetTypeAs<TensorTypeNode>(target_var);
-    generator.UpdateAdjoint(target_var, ones(target_ty->shape.value(), target_ty->dtype));
+    generator.UpdateAdjoint(target_var, ones(target_ty->shape.value(), target_ty->dtype->dtype));
 
     // Do reverse-mode ad, so visit bindings backwards
     for (auto it = forward_block->bindings.rbegin(); it != forward_block->bindings.rend(); ++it) {
@@ -546,7 +546,7 @@ class BackwardBindingGenerator : private ExprVisitor {
       auto* tensor_ty = ty.as<TensorTypeNode>();
       TVM_FFI_ICHECK(tensor_ty) << "The leaf of adjoint should be a Tensor.";
       TVM_FFI_ICHECK(tensor_ty->shape.defined()) << "Missing shape when building zeros tuple.";
-      const Expr& init = zeros(tensor_ty->shape.value(), tensor_ty->dtype);
+      const Expr& init = zeros(tensor_ty->shape.value(), tensor_ty->dtype->dtype);
       return init;
     });
     return AdjointMsgToExpr(msg);
@@ -707,7 +707,8 @@ class GradientMutator : private ExprMutator {
 
   static bool IsFloatTensorType(const Type& ty) {
     auto* tensor_ty = ty.as<TensorTypeNode>();
-    return tensor_ty && tensor_ty->dtype.is_float();
+    // Gradient eligibility preserves the old float-kind check; lanes do not affect this policy.
+    return tensor_ty && tensor_ty->dtype.MatchesCode(DLDataTypeCode::kDLFloat);
   }
 
   // When the return value is a Var, it is the target;
diff --git a/src/relax/transform/infer_amp_utils.cc b/src/relax/transform/infer_amp_utils.cc
index 41c6cfe5ae42..4952aeea8fa2 100644
--- a/src/relax/transform/infer_amp_utils.cc
+++ b/src/relax/transform/infer_amp_utils.cc
@@ -22,19 +22,19 @@
 namespace tvm {
 namespace relax {
 
-NType NTypeFrom(const Type& ty, DataType dtype) {
+NType NTypeFrom(const Type& ty, DLDataType dtype) {
   auto fmapleaf = [&](const Type& ty) -> NType {
     const auto* tensor = ty.as<TensorTypeNode>();
     TVM_FFI_ICHECK(tensor) << "Expected TensorType, but got " << ty;
-    if (dtype == DataType::Void())
-      return NType(DLDataTypeToString(tensor->dtype));
+    if (dtype == DLDataType{kDLOpaqueHandle, 0, 0})
+      return NType(DLDataTypeToString(tensor->dtype->dtype));
     else
       return NType(DLDataTypeToString(dtype));
   };
   return MapToNestedMsg<ffi::String>(ty, fmapleaf);
 }
 
-NType NTypeFrom(const Expr& expr, DataType dtype) { return NTypeFrom(GetType(expr), dtype); }
+NType NTypeFrom(const Expr& expr, DLDataType dtype) { return NTypeFrom(GetType(expr), dtype); }
 
 NType NTypeMerge(const NType& a, const NType& b) {
   auto fcombine = [&](const ffi::String& a_str, const ffi::String& b_str) -> ffi::String {
@@ -44,20 +44,20 @@ NType NTypeMerge(const NType& a, const NType& b) {
       return a_str;
     }
 
-    DataType a = DataType(ffi::StringToDLDataType(a_str));
-    DataType b = DataType(ffi::StringToDLDataType(b_str));
-    TVM_FFI_ICHECK_EQ(a.code(), b.code());
-    TVM_FFI_ICHECK_EQ(a.lanes(), b.lanes());
-    return a.bits() > b.bits() ? a_str : b_str;
+    DLDataType a = ffi::StringToDLDataType(a_str);
+    DLDataType b = ffi::StringToDLDataType(b_str);
+    TVM_FFI_ICHECK_EQ(static_cast<int>(a.code), static_cast<int>(b.code));
+    TVM_FFI_ICHECK_EQ(a.lanes, b.lanes);
+    return a.bits > b.bits ? a_str : b_str;
   };
   return CombineNestedMsg<ffi::String>(a, b, fcombine);
 }
 
-ffi::Array<ffi::ObjectRef> InferMixedPrecisionFollow(const Call& call, const DataType& out_dtype) {
+ffi::Array<ffi::ObjectRef> InferMixedPrecisionFollow(const Call& call, DLDataType out_dtype) {
   return {IntImm::Int32(MixedPrecisionPolicyKind::kFollow), call};
 }
 
-ffi::Array<ffi::ObjectRef> InferMixedPrecisionNever(const Call& call, const DataType& out_dtype) {
+ffi::Array<ffi::ObjectRef> InferMixedPrecisionNever(const Call& call, DLDataType out_dtype) {
   return {IntImm::Int32(MixedPrecisionPolicyKind::kNever), call};
 }
 
 
diff --git a/src/relax/transform/infer_amp_utils.h b/src/relax/transform/infer_amp_utils.h
index faa33edd4a18..7f9f884a29d0 100644
--- a/src/relax/transform/infer_amp_utils.h
+++ b/src/relax/transform/infer_amp_utils.h
@@ -58,10 +58,10 @@ struct NTypeEqual {
 };
 
 // Construct a NType from an Type
-NType NTypeFrom(const Type& ty, DataType dtype = DataType::Void());
+NType NTypeFrom(const Type& ty, DLDataType dtype = DLDataType{kDLOpaqueHandle, 0, 0});
 
 // Construct a NType from an Expr
-NType NTypeFrom(const Expr& expr, DataType dtype = DataType::Void());
+NType NTypeFrom(const Expr& expr, DLDataType dtype = DLDataType{kDLOpaqueHandle, 0, 0});
 
 // Merge two messages, we keep the higher precision type for each leaf tensor
 NType NTypeMerge(const NType& a, const NType& b);
@@ -70,12 +70,11 @@ NType NTypeMerge(const NType& a, const NType& b);
 using VarDTypeMap = std::unordered_map<Var, NType>;
 
 // Call is a call node, out_dtype is the expected output_dtype
-using FInferMixedPrecision =
-    ffi::TypedFunction<Call(const Call& call_node, const DataType& out_dtype)>;
+using FInferMixedPrecision = ffi::TypedFunction<Call(const Call& call_node, DLDataType out_dtype)>;
 
-ffi::Array<ffi::ObjectRef> InferMixedPrecisionFollow(const Call& call, const DataType& out_dtype);
+ffi::Array<ffi::ObjectRef> InferMixedPrecisionFollow(const Call& call, DLDataType out_dtype);
 
-ffi::Array<ffi::ObjectRef> InferMixedPrecisionNever(const Call& call, const DataType& out_dtype);
+ffi::Array<ffi::ObjectRef> InferMixedPrecisionNever(const Call& call, DLDataType out_dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/transform/lazy_transform_params.cc b/src/relax/transform/lazy_transform_params.cc
index 7c42928d7d87..b800199610b8 100644
--- a/src/relax/transform/lazy_transform_params.cc
+++ b/src/relax/transform/lazy_transform_params.cc
@@ -65,8 +65,7 @@ class LazyInputMutator : public ExprMutator {
       param_lookup.insert({func->params[i], i - num_input_params});
     }
 
-    Var fget_param("fget_param",
-                   FuncType({PrimType(DataType::Int(64)), ObjectType()}, ObjectType()));
+    Var fget_param("fget_param", FuncType({PrimType::Int(64), ObjectType()}, ObjectType()));
 
     ffi::Array<Var> new_params(func->params.begin(), func->params.begin() + num_input_params);
     new_params.push_back(fget_param);
@@ -145,7 +144,7 @@ class LazyOutputMutator : public ExprMutator {
       define_lookup(0, func_body->body);
     }
 
-    Var fset_output("fset_output", FuncType({PrimType(DataType::Int(64)), ObjectType()},
+    Var fset_output("fset_output", FuncType({PrimType::Int(64), ObjectType()},
                                             TupleType(ffi::Array<Type>{}), /* purity = */ false));
     plan_ = FunctionPlan{std::move(output_lookup), fset_output};
 
diff --git a/src/relax/transform/legalize_ops.cc b/src/relax/transform/legalize_ops.cc
index 00bd8e859ac3..2c518cfbbeae 100644
--- a/src/relax/transform/legalize_ops.cc
+++ b/src/relax/transform/legalize_ops.cc
@@ -282,7 +282,7 @@ class LegalizeMutator : public ExprMutator {
         //     This fallback would only be applicable for cases where
         //     both the dtype and the dimensionality are known.  While
         //     Relax can express a tensor with unknown dtype and
-        //     dimensionality as `TensorType(DataType::Void(),
+        //     dimensionality as `TensorType(DLDataType{kDLOpaqueHandle, 0, 0},
         //     kUnknownNDim)`, TIR cannot express unknown dtype or
         //     unknown dimensionality.
         return false;
diff --git a/src/relax/transform/lower_alloc_tensor.cc b/src/relax/transform/lower_alloc_tensor.cc
index 66c2c95b89c2..52bca3e707eb 100644
--- a/src/relax/transform/lower_alloc_tensor.cc
+++ b/src/relax/transform/lower_alloc_tensor.cc
@@ -72,7 +72,10 @@ class Mutator : public ExprMutator {
       }();
 
       PrimExpr nbytes = [&]() -> PrimExpr {
-        PrimExpr nbytes = IntImm::Int64(dtype->value.bytes());
+        PrimType dtype_ty(dtype->value);
+        TVM_FFI_ICHECK(!dtype_ty.IsScalableVector())
+            << "Cannot statically compute allocation size for scalable vector dtype " << dtype_ty;
+        PrimExpr nbytes = IntImm::Int64(static_cast<int64_t>(dtype_ty.StorageBytes()));
         for (const auto& dim : shape) {
           nbytes *= dim;
         }
@@ -112,7 +115,7 @@ class Mutator : public ExprMutator {
       auto offset = PrimValue::Int64(0);
 
       Expr storage = relax::Call(mem_alloc_storage_op, {size, runtime_device_index, storage_scope,
-                                                        DataTypeImm(DataType::UInt(8))});
+                                                        DataTypeImm((DLDataType{kDLUInt, 8, 1}))});
       storage = builder_->Emit(storage, "storage");
       Expr tensor =
           relax::Call(mem_alloc_tensor_op, {storage, offset, shape_arg, dtype, op->args[2]});
diff --git a/src/relax/transform/remove_unused_outputs.cc b/src/relax/transform/remove_unused_outputs.cc
index 995fe019be04..f8a9e8cde70b 100644
--- a/src/relax/transform/remove_unused_outputs.cc
+++ b/src/relax/transform/remove_unused_outputs.cc
@@ -289,7 +289,7 @@ Pass RemoveUnusedOutputs() {
                   // into the old tuple, but it's simpler to just let
                   // CanonicalizeBindings and DCE handle it.
                   new_results.push_back(
-                      relax::PrimValue(FloatImm(DataType::Float(64), std::nan(""))));
+                      relax::PrimValue(FloatImm(tvm::PrimType::Float(64), std::nan(""))));
                 }
               }
 
diff --git a/src/relax/transform/remove_unused_parameters.cc b/src/relax/transform/remove_unused_parameters.cc
index ebe9fa000f77..4f28f9d13132 100644
--- a/src/relax/transform/remove_unused_parameters.cc
+++ b/src/relax/transform/remove_unused_parameters.cc
@@ -100,7 +100,7 @@ std::optional<CalleeAnalysis> AnalyzeCallee(Function func) {
   }
 
   for (const auto& tir_var : free_tir_vars) {
-    Var relax_var("param_" + tir_var->name_hint, PrimType(tir_var.dtype()));
+    Var relax_var("param_" + tir_var->name_hint, PrimType(tir_var.ty()));
     params.push_back(relax_var);
   }
 
diff --git a/src/relax/transform/reorder_take_after_matmul.cc b/src/relax/transform/reorder_take_after_matmul.cc
index bd36c5cb89c5..7fd0fb7eecaa 100644
--- a/src/relax/transform/reorder_take_after_matmul.cc
+++ b/src/relax/transform/reorder_take_after_matmul.cc
@@ -92,7 +92,7 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
       // indices.shape = [outfeatures]
 
       // out_table.shape = [*batch, table_size]
-      auto out_table = matmul(lhs, weights, DataType::Void());
+      auto out_table = matmul(lhs, weights, (DLDataType{kDLOpaqueHandle, 0, 0}));
       // new_output.shape = [*batch, outfeatures]
       auto new_output = take(out_table, indices, matmul_ty->ndim - 1);
 
@@ -116,7 +116,7 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
       auto fused_weight = reshape(reordered_weight,
                                   ShapeExpr({weight_shape[1], weight_shape[0] * weight_shape[2]}));
       // fused_output.shape = [batch1, batch2, table_size * outfeatures]
-      auto fused_output = matmul(lhs, fused_weight, DataType::Void());
+      auto fused_output = matmul(lhs, fused_weight, (DLDataType{kDLOpaqueHandle, 0, 0}));
       // indexed_output.shape = [batch1, batch2, table_size, outfeatures]
       auto indexed_output = reshape(
           fused_output, ShapeExpr({lhs_shape[0], lhs_shape[1], weight_shape[0], weight_shape[2]}));
diff --git a/src/relax/transform/split_call_tir_by_pattern.cc b/src/relax/transform/split_call_tir_by_pattern.cc
index 4d15c0fd88f5..19e0dfdf8f00 100644
--- a/src/relax/transform/split_call_tir_by_pattern.cc
+++ b/src/relax/transform/split_call_tir_by_pattern.cc
@@ -129,7 +129,7 @@ class ForMatcher : public TensorizeComparator {
         if (match) {
           evaluated_symbols.back().insert(symbol_map.begin(), symbol_map.end());
           evaluated_symbols.back()[ffi::GetRef<Var>(operand_a)] =
-              MakeConstScalar(rhs_ptr->b.dtype(), 1);
+              MakeConstScalar(rhs_ptr->b.ty(), 1);
           return true;
         }
       }
@@ -142,7 +142,7 @@ class ForMatcher : public TensorizeComparator {
         if (match) {
           evaluated_symbols.back().insert(symbol_map.begin(), symbol_map.end());
           evaluated_symbols.back()[ffi::GetRef<Var>(operand_b)] =
-              MakeConstScalar(rhs_ptr->a.dtype(), 1);
+              MakeConstScalar(rhs_ptr->a.ty(), 1);
           return true;
         }
       }
@@ -160,7 +160,7 @@ class ForMatcher : public TensorizeComparator {
         if (match) {
           evaluated_symbols.back().insert(symbol_map.begin(), symbol_map.end());
           evaluated_symbols.back()[ffi::GetRef<Var>(operand_a)] =
-              MakeConstScalar(rhs_ptr->b.dtype(), 0);
+              MakeConstScalar(rhs_ptr->b.ty(), 0);
           return true;
         }
       }
@@ -173,7 +173,7 @@ class ForMatcher : public TensorizeComparator {
         if (match) {
           evaluated_symbols.back().insert(symbol_map.begin(), symbol_map.end());
           evaluated_symbols.back()[ffi::GetRef<Var>(operand_b)] =
-              MakeConstScalar(rhs_ptr->a.dtype(), 0);
+              MakeConstScalar(rhs_ptr->a.ty(), 0);
           return true;
         }
       }
@@ -622,7 +622,7 @@ std::pair<PrimFunc, ffi::Optional<PrimFunc>> SplitFunctions(
     }
   }
   arg_partition->push_back(arg_partition1);
-  new_params1.push_back(Var("output", DataType::Handle()));
+  new_params1.push_back(Var("output", PrimType::Handle()));
   ffi::Map<Var, Buffer> new_buffer_map1;
   for (const auto& kv : func->buffer_map) {
     if (partitioner.input1.count(kv.second)) {
@@ -635,7 +635,7 @@ std::pair<PrimFunc, ffi::Optional<PrimFunc>> SplitFunctions(
   // Step 4. Craft the second function.
   ffi::Array<Var> new_params2;
   std::vector<int> arg_partition2;
-  new_params2.push_back(Var("input", DataType::Handle()));
+  new_params2.push_back(Var("input", PrimType::Handle()));
   for (int i = 0; i < static_cast<int>(func->params.size()); i++) {
     Var param = func->params[i];
     if (partitioner.input2.count(func->buffer_map[param])) {
@@ -752,7 +752,7 @@ class SplitMutator : public ExprMutator {
     TVM_FFI_ICHECK(lib_func->IsInstance<ExternFuncNode>());
     builder_->UpdateFunction(gv, lib_func);
     tirx::Buffer intermediate_buffer = func1->buffer_map.at(func1->params.back());
-    DataType dtype = intermediate_buffer->dtype;
+    PrimType dtype = intermediate_buffer->dtype;
     Call call1(call_dps_packed_, {lib_func, Tuple(args1)}, call->attrs,
                {TensorType(ShapeExpr(intermediate_buffer->shape), dtype)});
     Var call_var1 = builder_->Emit(call1);
diff --git a/src/relax/transform/split_layout_rewrite_preproc.cc b/src/relax/transform/split_layout_rewrite_preproc.cc
index 0560582fac59..e09e377e8a70 100644
--- a/src/relax/transform/split_layout_rewrite_preproc.cc
+++ b/src/relax/transform/split_layout_rewrite_preproc.cc
@@ -65,11 +65,11 @@ class SplitPrimFuncLayoutRewrite : public StmtMutator {
     ffi::Map<Var, Buffer> buffer_map;
 
     for (const auto& info : rewrite_infos_) {
-      params.push_back(Var(info.pre_rewrite_buffer->name, DataType::Handle()));
+      params.push_back(Var(info.pre_rewrite_buffer->name, PrimType::Handle()));
       buffer_map.Set(params.back(), info.pre_rewrite_buffer);
     }
     for (const auto& info : rewrite_infos_) {
-      params.push_back(Var(info.post_rewrite_buffer->name, DataType::Handle()));
+      params.push_back(Var(info.post_rewrite_buffer->name, PrimType::Handle()));
       buffer_map.Set(params.back(), info.post_rewrite_buffer);
     }
 
diff --git a/src/relax/transform/static_plan_block_memory.cc b/src/relax/transform/static_plan_block_memory.cc
index 651b70961090..2a04461555d0 100644
--- a/src/relax/transform/static_plan_block_memory.cc
+++ b/src/relax/transform/static_plan_block_memory.cc
@@ -106,7 +106,7 @@ class StorageTokenNode : public ffi::Object {
   /*! \brief Number of bytes that this token requires. */
   PrimExpr bytes;
   /*! \brief The dtype of this token. */
-  DataType dtype;
+  DLDataType dtype;
   /*! \brief The memory scope of the token. */
   std::string storage_scope;
   /*! \brief The VDevice information. */
@@ -135,10 +135,13 @@ class StorageTokenNode : public ffi::Object {
  */
 class StorageToken : public ffi::ObjectRef {
  public:
-  explicit StorageToken(ffi::Array<PrimExpr> shape, DataType dtype, std::string storage_scope,
+  explicit StorageToken(ffi::Array<PrimExpr> shape, DLDataType dtype, std::string storage_scope,
                         ffi::Optional<VDevice> vdevice = std::nullopt) {
     // Compute the tensor size from the shape.
-    int64_t const_coeff = dtype.bytes() * dtype.lanes();
+    PrimType dtype_ty(dtype);
+    TVM_FFI_ICHECK(!dtype_ty.IsScalableVector())
+        << "Cannot statically plan storage size for scalable vector dtype " << dtype_ty;
+    int64_t const_coeff = static_cast<int64_t>(dtype_ty.StorageBytes());
     PrimExpr size = IntImm::Int64(1);
     bool size_computed = false;
 
@@ -303,13 +306,16 @@ class TokenAllocatorMixed {
   }
 
  private:
-  /*! \brief The hash class to enable std::pair as map key class. */
-  struct PairHash {
-    template <class T1, class T2>
-    std::size_t operator()(const std::pair<T1, T2>& p) const {
-      auto h1 = std::hash<T1>{}(p.first);
-      auto h2 = std::hash<T2>{}(p.second);
-      return h1 ^ h2;
+  using PoolKey = std::pair<std::string, DLDataType>;
+
+  /*! \brief Hashes a PoolKey: the scope string hash combined with dtype code, bits, lanes. */
+  struct PoolKeyHash {
+    std::size_t operator()(const PoolKey& p) const {
+      std::size_t h = std::hash<std::string>{}(p.first);
+      h ^= static_cast<std::size_t>(p.second.code) + 0x9e3779b9 + (h << 6) + (h >> 2);
+      h ^= static_cast<std::size_t>(p.second.bits) + 0x9e3779b9 + (h << 6) + (h >> 2);
+      h ^= static_cast<std::size_t>(p.second.lanes) + 0x9e3779b9 + (h << 6) + (h >> 2);
+      return h;
     }
   };
 
@@ -318,9 +324,7 @@ class TokenAllocatorMixed {
   /*! \brief A constant scale representing the token search range. */
   const int match_range_{16};
   /*! \brief The pool of available storage tokens for each storage scope and dtype. */
-  std::unordered_map<std::pair<std::string, DataType>, std::multimap<int64_t, StorageToken>,
-                     PairHash>
-      available_pool_;
+  std::unordered_map<PoolKey, std::multimap<int64_t, StorageToken>, PoolKeyHash> available_pool_;
   /*! \brief All the storage tokens that have been allocated with actual storage. */
   std::vector<StorageToken> full_pool_;
 };
@@ -636,7 +640,7 @@ class StorageAllocatorInit : public StorageAllocatorBaseVisitor {
     const auto* shape = ty->shape.as<ShapeExprNode>();
     TVM_FFI_ICHECK_NOTNULL(shape);
     TVM_FFI_ICHECK(!ty->IsUnknownDtype());
-    TVM_FFI_ICHECK(ty->dtype == call->args[1].as_or_throw<DataTypeImm>()->value);
+    TVM_FFI_ICHECK(ty->dtype->dtype == call->args[1].as_or_throw<DataTypeImm>()->value);
     TVM_FFI_ICHECK(!token_map_.count(call));
 
     // Use the upper bounds of TIR vars as their values. The upper bound shape can still be dynamic
@@ -653,7 +657,7 @@ class StorageAllocatorInit : public StorageAllocatorBaseVisitor {
     }
     ffi::Optional<VDevice> vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index);
 
-    StorageToken token(upper_bounded_shape, ty->dtype, storage_scope->value, vdevice);
+    StorageToken token(upper_bounded_shape, ty->dtype->dtype, storage_scope->value, vdevice);
 
     Tokens tokens(token);
     SetTokens(call, tokens);
@@ -938,7 +942,7 @@ class StorageAllocationRewriter : public ExprMutator {
       if (it_token == token2storage_var_.end()) {
         ShapeExpr size({token->bytes});
         PrimValue virtual_device_index = runtime_device_index;
-        DataType dtype = token->dtype;
+        DLDataType dtype = token->dtype;
         Call alloc_storage(mem_alloc_storage,
                            {std::move(size), virtual_device_index, StringImm(token->storage_scope),
                             DataTypeImm(dtype)},
@@ -951,7 +955,7 @@ class StorageAllocationRewriter : public ExprMutator {
 
       // And always create a `memory.alloc_tensor` for the old `builtin.alloc_tensor`.
       PrimValue offset = PrimValue::Int64(0);
-      DataType dtype = ty->dtype;
+      DLDataType dtype = ty->dtype->dtype;
       return Call(mem_alloc_tensor,
                   {storage_var, offset, ty->shape.value(), DataTypeImm(dtype), call->args[2]},
                   Attrs());
@@ -970,22 +974,26 @@ class StorageAllocationRewriter : public ExprMutator {
           GetUpperBoundShape(shape->values, ana_.get(), dom_map_);
       if (!IsStaticShape(shape->values)) {
         TVM_FFI_ICHECK(!ty->IsUnknownDtype());
-        TVM_FFI_ICHECK_EQ(ty->dtype, call->args[1].as_or_throw<DataTypeImm>()->value);
+        TVM_FFI_ICHECK_EQ(ty->dtype->dtype, call->args[1].as_or_throw<DataTypeImm>()->value);
         PrimExpr bytes = upper_bounded_shape[0];
         for (int i = 1; i < static_cast<int>(upper_bounded_shape.size()); ++i) {
           bytes *= upper_bounded_shape[i];
         }
-        bytes *= ty->dtype.bytes() * ty->dtype.lanes();
+        DLDataType dtype = ty->dtype->dtype;
+        PrimType dtype_ty(dtype);
+        TVM_FFI_ICHECK(!dtype_ty.IsScalableVector())
+            << "Cannot statically plan storage size for scalable vector dtype " << dtype_ty;
+        bytes *= IntImm::Int64(static_cast<int64_t>(dtype_ty.StorageBytes()));
         Call alloc_storage(mem_alloc_storage,
                            {/*size=*/ShapeExpr({bytes}),
                             /*virtual_device_index=*/call->args[2].as_or_throw<PrimValue>(),
                             /*storage_scope=*/call->args[3].as_or_throw<StringImm>(),  //
-                            /*dtype=*/DataTypeImm(ty->dtype)});
+                            /*dtype=*/DataTypeImm(dtype)});
         Var storage = builder_->Emit(alloc_storage, "storage");
         return Call(mem_alloc_tensor, {storage,  //
                                        /*offset=*/PrimValue::Int64(0),
                                        /*shape=*/ffi::GetRef<ShapeExpr>(shape),  //
-                                       /*dtype=*/DataTypeImm(ty->dtype),
+                                       /*dtype=*/DataTypeImm(dtype),
                                        /*vdevice_index=*/call->args[2]});
       }
     }
@@ -1040,7 +1048,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   refl::GlobalDef().def("relax.transform.StaticPlanBlockMemory", StaticPlanBlockMemory);
 }
 
-PrimExpr GetTextureMemorySizeFromVDevice(ffi::Array<PrimExpr> pshape, DataType dtype,
+PrimExpr GetTextureMemorySizeFromVDevice(ffi::Array<PrimExpr> pshape, DLDataType dtype,
                                          VDevice vdevice) {
   int image_row_align = static_cast<int>(
       vdevice->target->GetAttr<int64_t>("image_base_address_alignment").value_or(64));
@@ -1056,7 +1064,9 @@ PrimExpr GetTextureMemorySizeFromVDevice(ffi::Array<PrimExpr> pshape, DataType d
   };
   auto shape = Shape{pshape};
 
-  size_t size = runtime::GetTextureMemorySize<Shape>(shape, dtype.bytes() * 8, dtype.lanes(),
+  int lanes = static_cast<int16_t>(dtype.lanes);
+  TVM_FFI_ICHECK_GE(lanes, 0) << "Can't fetch the bytes of a scalable vector at a compile time.";
+  size_t size = runtime::GetTextureMemorySize<Shape>(shape, dtype.bits, lanes,
                                                      vdevice->memory_scope, image_row_align);
   return IntImm::Int64(size);
 }
diff --git a/src/relax/transform/to_mixed_precision.cc b/src/relax/transform/to_mixed_precision.cc
index ddd23ce2ea7b..45d2af9b8579 100644
--- a/src/relax/transform/to_mixed_precision.cc
+++ b/src/relax/transform/to_mixed_precision.cc
@@ -116,9 +116,9 @@ int GetMixedPrecisionInfo(const CallNode* call_node) {
  */
 class DTypeDecisionCollector : public ExprVisitor {
  public:
-  explicit DTypeDecisionCollector(DataType output_dtype) : output_dtype_(output_dtype) {}
+  explicit DTypeDecisionCollector(DLDataType output_dtype) : output_dtype_(output_dtype) {}
 
-  static VarDTypeMap Collect(Function func, DataType output_dtype) {
+  static VarDTypeMap Collect(Function func, DLDataType output_dtype) {
     DTypeDecisionCollector collector(output_dtype);
     collector.VisitExpr(func);
     return std::move(collector.only_fp16_map_);
@@ -165,7 +165,7 @@ class DTypeDecisionCollector : public ExprVisitor {
   }
 
   // merge the message for all vars in the expr list
-  void RequireArgsToType(ffi::Array<Expr> args, DataType to) {
+  void RequireArgsToType(ffi::Array<Expr> args, DLDataType to) {
     std::vector<Expr> arg_arr;
     std::vector<NType> to_arr;
     for (const Expr& arg : args) {
@@ -262,16 +262,16 @@ class DTypeDecisionCollector : public ExprVisitor {
     }
   }
 
-  DataType unknown_ = DataType(DataType::TypeCode::kFloat, 0, 1);
-  DataType fp16_ = DataType(DataType::TypeCode::kFloat, 16, 1);
-  DataType fp32_ = DataType(DataType::TypeCode::kFloat, 32, 1);
-  DataType output_dtype_;
+  DLDataType unknown_ = DLDataType{kDLFloat, 0, 1};
+  DLDataType fp16_ = DLDataType{kDLFloat, 16, 1};
+  DLDataType fp32_ = DLDataType{kDLFloat, 32, 1};
+  DLDataType output_dtype_;
   VarDTypeMap only_fp16_map_;
 };
 
 class ToMixedPrecisionRewriter : public ExprMutator {
  public:
-  explicit ToMixedPrecisionRewriter(const VarDTypeMap* only_fp16_map, DataType output_dtype,
+  explicit ToMixedPrecisionRewriter(const VarDTypeMap* only_fp16_map, DLDataType output_dtype,
                                     const std::unordered_set<std::string>& fp16_input_names)
       : only_fp16_map_(only_fp16_map),
         output_dtype_(output_dtype),
@@ -290,7 +290,7 @@ class ToMixedPrecisionRewriter : public ExprMutator {
           if (tensor_ty->vdevice.defined()) {
             vdev = tensor_ty->vdevice.value();
           }
-          TensorType fp16_ty(tensor_ty->shape.value(), DataType::Float(16), vdev, tensor_ty->span);
+          TensorType fp16_ty(tensor_ty->shape.value(), PrimType::Float(16), vdev, tensor_ty->span);
           Var fp16_var(var->vid, fp16_ty, var->span);
           var_remap_[var->vid] = fp16_var;
           return fp16_var;
@@ -315,13 +315,14 @@ class ToMixedPrecisionRewriter : public ExprMutator {
       if (NTypeEqual()(to[0], NTypeFrom(expr))) return expr;
       // We only rewrite the expr if the dtype is fp16 or fp32, dtypes such as int32, float64 is not
       // supported to be rewritten
-      if (tensor->dtype != fp16_ && tensor->dtype != fp32_) return expr;
-      return astype(expr, DataType(ffi::StringToDLDataType(to[0].LeafValue())));
+      DLDataType tensor_dtype = tensor->dtype->dtype;
+      if (tensor_dtype != fp16_ && tensor_dtype != fp32_) return expr;
+      return astype(expr, ffi::StringToDLDataType(to[0].LeafValue()));
     };
     return TransformTupleLeaf<ffi::String>(expr, std::array<NType, 1>({to}), fvisitleaf);
   }
 
-  ffi::Array<Expr> RewriteArgs(const ffi::Array<Expr>& args, DataType to) {
+  ffi::Array<Expr> RewriteArgs(const ffi::Array<Expr>& args, DLDataType to) {
     ffi::Array<Expr> new_args;
     for (const Expr& arg : args) {
       if (IsNestedTensor(arg)) {
@@ -346,7 +347,7 @@ class ToMixedPrecisionRewriter : public ExprMutator {
   bool AllFP16Castable(const ffi::Array<Expr>& args) {
     auto is_fp16 = [](Type ty) {
       if (auto tensor_ty = ty.as<TensorTypeNode>();
-          tensor_ty && tensor_ty->dtype == DataType::Float(16)) {
+          tensor_ty && tensor_ty->dtype->dtype == DLDataType{kDLFloat, 16, 1}) {  // by value
         return true;
       }
       return false;
@@ -359,7 +360,7 @@ class ToMixedPrecisionRewriter : public ExprMutator {
         return false;
       }
 
-      if (data.DataType() == DataType::Float(16)) {
+      if (data.DataType() == DLDataType{kDLFloat, 16, 1}) {
         return true;
       }
 
@@ -372,17 +373,17 @@ class ToMixedPrecisionRewriter : public ExprMutator {
       std::vector<uint8_t> bytes(size_1d * elem_bytes);
       data.CopyToBytes(bytes.data(), bytes.size());
 
-      if (data.DataType() == DataType::Float(32)) {
+      if (data.DataType() == DLDataType{kDLFloat, 32, 1}) {
         return CheckInFP16Range<float>(bytes, size_1d);
-      } else if (data.DataType() == DataType::Float(64)) {
+      } else if (data.DataType() == DLDataType{kDLFloat, 64, 1}) {
         return CheckInFP16Range<double>(bytes, size_1d);
-      } else if (data.DataType() == DataType::Int(8)) {
+      } else if (data.DataType() == DLDataType{kDLInt, 8, 1}) {
         return CheckInFP16Range<std::int8_t>(bytes, size_1d);
-      } else if (data.DataType() == DataType::Int(16)) {
+      } else if (data.DataType() == DLDataType{kDLInt, 16, 1}) {
         return CheckInFP16Range<std::int16_t>(bytes, size_1d);
-      } else if (data.DataType() == DataType::Int(32)) {
+      } else if (data.DataType() == DLDataType{kDLInt, 32, 1}) {
         return CheckInFP16Range<std::int32_t>(bytes, size_1d);
-      } else if (data.DataType() == DataType::Int(64)) {
+      } else if (data.DataType() == DLDataType{kDLInt, 64, 1}) {
         return CheckInFP16Range<std::int64_t>(bytes, size_1d);
       }
       return false;
@@ -476,7 +477,7 @@ class ToMixedPrecisionRewriter : public ExprMutator {
     new_call.CopyOnWrite()->args = RemapArgs(new_call->args);
 
     // Then we rewrite the args according to the policy
-    std::optional<DataType> opt_new_dtype = std::nullopt;
+    std::optional<DLDataType> opt_new_dtype = std::nullopt;
 
     if (policy == kAlways) {
       opt_new_dtype = fp16_;
@@ -589,16 +590,16 @@ class ToMixedPrecisionRewriter : public ExprMutator {
 
   const VarDTypeMap* only_fp16_map_;
 
-  DataType fp16_ = DataType(DataType::TypeCode::kFloat, 16, 1);
-  DataType fp32_ = DataType(DataType::TypeCode::kFloat, 32, 1);
-  DataType output_dtype_;
+  DLDataType fp16_ = DLDataType{kDLFloat, 16, 1};
+  DLDataType fp32_ = DLDataType{kDLFloat, 32, 1};
+  DLDataType output_dtype_;
   ffi::Array<Var> params_;
   std::unordered_set<std::string> fp16_input_names_;
 
   const Op& wrap_param_op = Op::Get("relax.wrap_param");
 };
 
-Expr ToMixedPrecision(const Function& f, const DataType& out_dtype,
+Expr ToMixedPrecision(const Function& f, DLDataType out_dtype,
                       ffi::Optional<ffi::Array<ffi::String>> fp16_input_names) {
   VarDTypeMap only_fp16_map = DTypeDecisionCollector::Collect(f, out_dtype);
   std::unordered_set<std::string> fp16_input_names_set;
@@ -611,7 +612,7 @@ Expr ToMixedPrecision(const Function& f, const DataType& out_dtype,
 
 namespace transform {
 
-Pass ToMixedPrecision(const DataType& out_dtype,
+Pass ToMixedPrecision(DLDataType out_dtype,
                       ffi::Optional<ffi::Array<ffi::String>> fp16_input_names) {
   auto pass_func = [=](Function f, IRModule m, PassContext pc) {
     return ToMixedPrecision(f, out_dtype, fp16_input_names).as_or_throw<Function>();
diff --git a/src/relax/transform/utils.h b/src/relax/transform/utils.h
index 275c7ca94f8d..d4607459c74f 100644
--- a/src/relax/transform/utils.h
+++ b/src/relax/transform/utils.h
@@ -319,39 +319,39 @@ class FunctionCopier : public SymbolicVarRenewMutator {
  * \return A Constant.
  */
 template <typename T>
-inline Constant MakeConstantScalar(T value, DataType dtype) {
+inline Constant MakeConstantScalar(T value, DLDataType dtype) {
   runtime::Tensor arr = runtime::Tensor::Empty({}, dtype, {kDLCPU, 0});
-  if (dtype == DataType::Float(32)) {
+  if (dtype == DLDataType{kDLFloat, 32, 1}) {
     *static_cast<float*>(arr->data) = static_cast<float>(value);
-  } else if (dtype == DataType::Float(64)) {
+  } else if (dtype == DLDataType{kDLFloat, 64, 1}) {
     *static_cast<double*>(arr->data) = static_cast<double>(value);
-  } else if (dtype == DataType::Int(32)) {
+  } else if (dtype == DLDataType{kDLInt, 32, 1}) {
     *static_cast<int32_t*>(arr->data) = static_cast<int32_t>(value);
-  } else if (dtype == DataType::Int(64)) {
+  } else if (dtype == DLDataType{kDLInt, 64, 1}) {
     *static_cast<int64_t*>(arr->data) = static_cast<int64_t>(value);
-  } else if (dtype == DataType::Bool()) {
+  } else if (dtype == DLDataType{kDLBool, 8, 1}) {
     *static_cast<bool*>(arr->data) = static_cast<bool>(value);
-  } else if (dtype == DataType::UInt(8)) {
+  } else if (dtype == DLDataType{kDLUInt, 8, 1}) {
     *static_cast<uint8_t*>(arr->data) = static_cast<uint8_t>(value);
-  } else if (dtype == DataType::UInt(16)) {
+  } else if (dtype == DLDataType{kDLUInt, 16, 1}) {
     *static_cast<uint16_t*>(arr->data) = static_cast<uint16_t>(value);
-  } else if (dtype == DataType::UInt(32)) {
+  } else if (dtype == DLDataType{kDLUInt, 32, 1}) {
     *static_cast<uint32_t*>(arr->data) = static_cast<uint32_t>(value);
-  } else if (dtype == DataType::UInt(64)) {
+  } else if (dtype == DLDataType{kDLUInt, 64, 1}) {
     *static_cast<uint64_t*>(arr->data) = static_cast<uint64_t>(value);
-  } else if (dtype == DataType::Int(8)) {
+  } else if (dtype == DLDataType{kDLInt, 8, 1}) {
     *static_cast<int8_t*>(arr->data) = static_cast<int8_t>(value);
-  } else if (dtype == DataType::Int(16)) {
+  } else if (dtype == DLDataType{kDLInt, 16, 1}) {
     *static_cast<int16_t*>(arr->data) = static_cast<int16_t>(value);
-  } else if (dtype == DataType::Int(32)) {
+  } else if (dtype == DLDataType{kDLInt, 32, 1}) {
     *static_cast<int32_t*>(arr->data) = static_cast<int32_t>(value);
-  } else if (dtype == DataType::Int(64)) {
+  } else if (dtype == DLDataType{kDLInt, 64, 1}) {
     *static_cast<int64_t*>(arr->data) = static_cast<int64_t>(value);
-  } else if (dtype == DataType::Float(16)) {
+  } else if (dtype == DLDataType{kDLFloat, 16, 1}) {
     // convert to float16 storage is uint16_t
     *static_cast<uint16_t*>(arr->data) =
         __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(static_cast<float>(value));
-  } else if (dtype == DataType::BFloat(16)) {
+  } else if (dtype == DLDataType{kDLBfloat, 16, 1}) {
     // convert to bfloat16 storage is uint16_t
     *static_cast<uint16_t*>(arr->data) =
         __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 7>(static_cast<float>(value));
diff --git a/src/relax/utils.cc b/src/relax/utils.cc
index 370947e4b01f..2f5cc6d9dea8 100644
--- a/src/relax/utils.cc
+++ b/src/relax/utils.cc
@@ -179,11 +179,11 @@ tvm::ffi::Map<tirx::Var, PrimExpr> InferSymbolicVarMap(
 }
 
 bool IsBoolType(const Type& ty, bool permit_unknown_rank, bool permit_unknown_dtype) {
-  DataType dtype;
+  DLDataType dtype;
   int ndim;
 
   if (const auto* tensor = ty.as<TensorTypeNode>()) {
-    dtype = tensor->dtype;
+    dtype = tensor->dtype->dtype;
     ndim = tensor->ndim;
   } else if (const auto* prim = ty.as<PrimTypeNode>()) {
     dtype = prim->dtype;
@@ -192,7 +192,9 @@ bool IsBoolType(const Type& ty, bool permit_unknown_rank, bool permit_unknown_dt
     return false;
   }
 
-  bool correct_dtype = dtype.is_bool() || (permit_unknown_dtype && dtype.is_void());
+  // Bool-type matching preserves the old element-code-only behavior; rank is checked separately.
+  bool correct_dtype = dtype.code == DLDataTypeCode::kDLBool ||
+                       (permit_unknown_dtype && dtype == DLDataType{kDLOpaqueHandle, 0, 0});
   bool correct_rank = ndim == 0 || (permit_unknown_rank && ndim == -1);
   return correct_dtype && correct_rank;
 }
diff --git a/src/runtime/extra/contrib/cblas/cblas.cc b/src/runtime/extra/contrib/cblas/cblas.cc
index d71eaeb17672..a19ccc99bb3f 100644
--- a/src/runtime/extra/contrib/cblas/cblas.cc
+++ b/src/runtime/extra/contrib/cblas/cblas.cc
@@ -21,10 +21,10 @@
  * \file Use external cblas library call.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 extern "C" {
 #include <cblas.h>
@@ -35,7 +35,6 @@ extern "C" {
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
 inline CBLAS_TRANSPOSE CBLASBooleanToTranspose(bool trans) {
   return trans ? CblasTrans : CblasNoTrans;
 }
@@ -128,38 +127,39 @@ struct CblasDgemmBatchIterativeOp {
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef()
-      .def_packed(
-          "tvm.contrib.cblas.matmul",
-          [](ffi::PackedArgs args, ffi::Any* ret) {
-            auto A = args[0].cast<DLTensor*>();
-            TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
+      .def_packed("tvm.contrib.cblas.matmul",
+                  [](ffi::PackedArgs args, ffi::Any* ret) {
+                    auto A = args[0].cast<DLTensor*>();
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                    A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-            if (TypeMatch(A->dtype, kDLFloat, 32))
-              CallGemm(args, ret, CblasSgemmOp());
-            else
-              CallGemm(args, ret, CblasDgemmOp());
-          })
-      .def_packed(
-          "tvm.contrib.cblas.batch_matmul",
-          [](ffi::PackedArgs args, ffi::Any* ret) {
-            auto A = args[0].cast<DLTensor*>();
-            TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
-            if (TypeMatch(A->dtype, kDLFloat, 32)) {
-              CallBatchGemm(args, ret, CblasSgemmBatchOp());
-            } else {
-              CallBatchGemm(args, ret, CblasDgemmBatchOp());
-            }
-          })
-      .def_packed(
-          "tvm.contrib.cblas.batch_matmul_iterative", [](ffi::PackedArgs args, ffi::Any* ret) {
-            auto A = args[0].cast<DLTensor*>();
-            TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
-            if (TypeMatch(A->dtype, kDLFloat, 32)) {
-              CallBatchGemm(args, ret, CblasSgemmBatchIterativeOp());
-            } else {
-              CallBatchGemm(args, ret, CblasDgemmBatchIterativeOp());
-            }
-          });
+                    if (A->dtype == DLDataType{kDLFloat, 32, 1})
+                      CallGemm(args, ret, CblasSgemmOp());
+                    else
+                      CallGemm(args, ret, CblasDgemmOp());
+                  })
+      .def_packed("tvm.contrib.cblas.batch_matmul",
+                  [](ffi::PackedArgs args, ffi::Any* ret) {
+                    auto A = args[0].cast<DLTensor*>();
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                    A->dtype == DLDataType{kDLFloat, 64, 1}));
+                    if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
+                      CallBatchGemm(args, ret, CblasSgemmBatchOp());
+                    } else {
+                      CallBatchGemm(args, ret, CblasDgemmBatchOp());
+                    }
+                  })
+      .def_packed("tvm.contrib.cblas.batch_matmul_iterative",
+                  [](ffi::PackedArgs args, ffi::Any* ret) {
+                    auto A = args[0].cast<DLTensor*>();
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                    A->dtype == DLDataType{kDLFloat, 64, 1}));
+                    if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
+                      CallBatchGemm(args, ret, CblasSgemmBatchIterativeOp());
+                    } else {
+                      CallBatchGemm(args, ret, CblasDgemmBatchIterativeOp());
+                    }
+                  });
 }
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/runtime/extra/contrib/cblas/dnnl_blas.cc b/src/runtime/extra/contrib/cblas/dnnl_blas.cc
index 08d72e57b7ad..c0828c12e8b6 100644
--- a/src/runtime/extra/contrib/cblas/dnnl_blas.cc
+++ b/src/runtime/extra/contrib/cblas/dnnl_blas.cc
@@ -21,10 +21,10 @@
  * \file Use external cblas library call.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 extern "C" {
 #include <dnnl.h>
@@ -35,7 +35,6 @@ extern "C" {
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
 inline char DNNLBooleanToTransposeChar(bool trans) { return trans ? 'T' : 'N'; }
 
 struct DNNLSgemmOp {
@@ -52,7 +51,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def_packed("tvm.contrib.dnnl.matmul", [](ffi::PackedArgs args, ffi::Any* ret) {
     auto A = args[0].cast<DLTensor*>();
-    TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32));
+    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1}));
     CallGemm(args, ret, DNNLSgemmOp());
   });
 }
diff --git a/src/runtime/extra/contrib/cblas/gemm_common.h b/src/runtime/extra/contrib/cblas/gemm_common.h
index 52f306e86238..65b13aa4c728 100644
--- a/src/runtime/extra/contrib/cblas/gemm_common.h
+++ b/src/runtime/extra/contrib/cblas/gemm_common.h
@@ -26,8 +26,8 @@
 #define TVM_RUNTIME_CONTRIB_CBLAS_GEMM_COMMON_H_
 
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
-#include <tvm/runtime/data_type.h>
 
 #include <algorithm>
 #include <string>
@@ -37,7 +37,6 @@ namespace contrib {
 
 using ffi::Any;
 using ffi::PackedArgs;
-using runtime::TypeMatch;
 
 inline int ColumnStride(const DLTensor* tensor) {
   // If the tensor itself is transposed then it will have strides
@@ -96,8 +95,8 @@ inline void CallGemm(ffi::PackedArgs args, ffi::Any* ret, TGemmOp op) {
   transa = IsInPlaceTransposed(A) ? !transa : transa;
   transb = IsInPlaceTransposed(B) ? !transb : transb;
 
-  TVM_FFI_ICHECK(TypeMatch(B->dtype, kDLFloat, bit_depth));
-  TVM_FFI_ICHECK(TypeMatch(C->dtype, kDLFloat, bit_depth));
+  TVM_FFI_ICHECK((B->dtype == DLDataType{kDLFloat, static_cast<uint8_t>(bit_depth), 1}));
+  TVM_FFI_ICHECK((C->dtype == DLDataType{kDLFloat, static_cast<uint8_t>(bit_depth), 1}));
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
   op(transb, transa, ColumnCount(B, transb), RowCount(A, transa), ColumnCount(A, transa),
@@ -143,9 +142,9 @@ inline void CallU8S8S32Gemm(ffi::PackedArgs args, ffi::Any* ret, TGemmOp op) {
   transa = IsInPlaceTransposed(A) ? !transa : transa;
   transb = IsInPlaceTransposed(B) ? !transb : transb;
 
-  TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLUInt, 8));
-  TVM_FFI_ICHECK(TypeMatch(B->dtype, kDLInt, 8));
-  TVM_FFI_ICHECK(TypeMatch(C->dtype, kDLInt, 32));
+  TVM_FFI_ICHECK((A->dtype == DLDataType{kDLUInt, 8, 1}));
+  TVM_FFI_ICHECK((B->dtype == DLDataType{kDLInt, 8, 1}));
+  TVM_FFI_ICHECK((C->dtype == DLDataType{kDLInt, 32, 1}));
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
   op(transb, transa, ColumnCount(B, transb), RowCount(A, transa), ColumnCount(A, transa),
@@ -207,8 +206,8 @@ inline void CallBatchGemm(ffi::PackedArgs args, ffi::Any* ret, TBatchGemmOp op)
   transa = IsInPlaceTransposed3D(A) ? !transa : transa;
   transb = IsInPlaceTransposed3D(B) ? !transb : transb;
 
-  TVM_FFI_ICHECK(TypeMatch(B->dtype, kDLFloat, bit_depth));
-  TVM_FFI_ICHECK(TypeMatch(C->dtype, kDLFloat, bit_depth));
+  TVM_FFI_ICHECK((B->dtype == DLDataType{kDLFloat, static_cast<uint8_t>(bit_depth), 1}));
+  TVM_FFI_ICHECK((C->dtype == DLDataType{kDLFloat, static_cast<uint8_t>(bit_depth), 1}));
 
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
diff --git a/src/runtime/extra/contrib/cblas/mkl.cc b/src/runtime/extra/contrib/cblas/mkl.cc
index 20f0c539076b..366ada41d2f1 100644
--- a/src/runtime/extra/contrib/cblas/mkl.cc
+++ b/src/runtime/extra/contrib/cblas/mkl.cc
@@ -21,10 +21,10 @@
  * \file Use external mkl library call.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 extern "C" {
 #include <mkl_cblas.h>
@@ -35,7 +35,6 @@ extern "C" {
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
 inline CBLAS_TRANSPOSE MKLBooleanToTranspose(bool trans) {
   return trans ? CblasTrans : CblasNoTrans;
 }
@@ -160,9 +159,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def_packed("tvm.contrib.mkl.matmul", [](ffi::PackedArgs args, ffi::Any* ret) {
     auto A = args[0].cast<DLTensor*>();
-    TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
+    TVM_FFI_ICHECK(
+        (A->dtype == DLDataType{kDLFloat, 32, 1} || A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-    if (TypeMatch(A->dtype, kDLFloat, 32))
+    if (A->dtype == DLDataType{kDLFloat, 32, 1})
       CallGemm(args, ret, MKLSgemmOp());
     else
       CallGemm(args, ret, MKLDgemmOp());
@@ -178,33 +178,34 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                     auto A = args[0].cast<DLTensor*>();
                     auto B = args[1].cast<DLTensor*>();
                     auto C = args[2].cast<DLTensor*>();
-                    TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLUInt, 8) &&
-                                   TypeMatch(B->dtype, kDLInt, 8) &&
-                                   TypeMatch(C->dtype, kDLInt, 32));
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLUInt, 8, 1} &&
+                                    B->dtype == DLDataType{kDLInt, 8, 1} &&
+                                    C->dtype == DLDataType{kDLInt, 32, 1}));
 
                     CallU8S8S32Gemm(args, ret, MKLGemmU8S8S32Op());
                   })
-      .def_packed(
-          "tvm.contrib.mkl.batch_matmul",
-          [](ffi::PackedArgs args, ffi::Any* ret) {
-            auto A = args[0].cast<DLTensor*>();
-            TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
-            if (TypeMatch(A->dtype, kDLFloat, 32)) {
-              CallBatchGemm(args, ret, MKLSgemmBatchOp());
-            } else {
-              CallBatchGemm(args, ret, MKLDgemmBatchOp());
-            }
-          })
-      .def_packed(
-          "tvm.contrib.mkl.batch_matmul_iterative", [](ffi::PackedArgs args, ffi::Any* ret) {
-            auto A = args[0].cast<DLTensor*>();
-            TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
-            if (TypeMatch(A->dtype, kDLFloat, 32)) {
-              CallBatchGemm(args, ret, MKLSgemmBatchIterativeOp());
-            } else {
-              CallBatchGemm(args, ret, MKLDgemmBatchIterativeOp());
-            }
-          });
+      .def_packed("tvm.contrib.mkl.batch_matmul",
+                  [](ffi::PackedArgs args, ffi::Any* ret) {
+                    auto A = args[0].cast<DLTensor*>();
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                    A->dtype == DLDataType{kDLFloat, 64, 1}));
+                    if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
+                      CallBatchGemm(args, ret, MKLSgemmBatchOp());
+                    } else {
+                      CallBatchGemm(args, ret, MKLDgemmBatchOp());
+                    }
+                  })
+      .def_packed("tvm.contrib.mkl.batch_matmul_iterative",
+                  [](ffi::PackedArgs args, ffi::Any* ret) {
+                    auto A = args[0].cast<DLTensor*>();
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                    A->dtype == DLDataType{kDLFloat, 64, 1}));
+                    if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
+                      CallBatchGemm(args, ret, MKLSgemmBatchIterativeOp());
+                    } else {
+                      CallBatchGemm(args, ret, MKLDgemmBatchIterativeOp());
+                    }
+                  });
 }
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/runtime/extra/contrib/coreml/coreml_runtime.mm b/src/runtime/extra/contrib/coreml/coreml_runtime.mm
index a72948b250a7..d9823407fb0a 100644
--- a/src/runtime/extra/contrib/coreml/coreml_runtime.mm
+++ b/src/runtime/extra/contrib/coreml/coreml_runtime.mm
@@ -44,15 +44,15 @@
     [shape addObject:[NSNumber numberWithInteger:data_in->shape[i]]];
   }
 
-  DataType dtype(data_in->dtype);
+  DLDataType dtype = data_in->dtype;
   MLMultiArrayDataType dataType;
-  if (dtype == DataType::Float(64)) {
+  if (dtype == DLDataType{kDLFloat, 64, 1}) {
     dataType = MLMultiArrayDataTypeDouble;
     size *= sizeof(double);
-  } else if (dtype == DataType::Float(32)) {
+  } else if (dtype == DLDataType{kDLFloat, 32, 1}) {
     dataType = MLMultiArrayDataTypeFloat32;
     size *= sizeof(float);
-  } else if (dtype == DataType::Int(32)) {
+  } else if (dtype == DLDataType{kDLInt, 32, 1}) {
     dataType = MLMultiArrayDataTypeInt32;
     size *= sizeof(int);
   } else {
@@ -87,15 +87,15 @@
     shape.push_back(n);
   }
 
-  DataType dtype;
+  DLDataType dtype = DLDataType{kDLOpaqueHandle, 0, 0};
   if (data_desc.dataType == MLMultiArrayDataTypeDouble) {
-    dtype = DataType::Float(64);
+    dtype = DLDataType{kDLFloat, 64, 1};
     size *= sizeof(double);
   } else if (data_desc.dataType == MLMultiArrayDataTypeFloat32) {
-    dtype = DataType::Float(32);
+    dtype = DLDataType{kDLFloat, 32, 1};
     size *= sizeof(float);
   } else if (data_desc.dataType == MLMultiArrayDataTypeInt32) {
-    dtype = DataType::Int(32);
+    dtype = DLDataType{kDLInt, 32, 1};
     size *= sizeof(int);
   } else {
     LOG(FATAL) << "unexpected data type " << data_desc.dataType;
diff --git a/src/runtime/extra/contrib/cublas/cublas.cc b/src/runtime/extra/contrib/cublas/cublas.cc
index 4ef1b702c16c..461bbee1f86c 100644
--- a/src/runtime/extra/contrib/cublas/cublas.cc
+++ b/src/runtime/extra/contrib/cublas/cublas.cc
@@ -21,11 +21,11 @@
  * \file Use external cblas library call.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/extra/c_env_api.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include "../../../../../3rdparty/compiler-rt/builtin_fp16.h"
 #include "../cblas/gemm_common.h"
@@ -34,7 +34,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
 inline cublasOperation_t CUBLASBooleanToTranspose(bool item) {
   return item ? CUBLAS_OP_T : CUBLAS_OP_N;
 }
@@ -125,11 +124,11 @@ struct CublasDgemmBatchOp {
 
 // Check cublas supported mix-precision computation type and return computeType
 bool CheckMixPrecisionType(DLDataType in_dtype, DLDataType out_dtype, bool int_support = true) {
-  if (int_support && TypeMatch(out_dtype, kDLInt, 32)) {
-    return TypeMatch(in_dtype, kDLInt, 8);
-  } else if (TypeMatch(out_dtype, kDLFloat, 32)) {
-    return TypeMatch(in_dtype, kDLInt, 8) || TypeMatch(in_dtype, kDLFloat, 16) ||
-           TypeMatch(in_dtype, kDLBfloat, 16);
+  if (int_support && out_dtype == DLDataType{kDLInt, 32, 1}) {
+    return in_dtype == DLDataType{kDLInt, 8, 1};
+  } else if (out_dtype == DLDataType{kDLFloat, 32, 1}) {
+    return in_dtype == DLDataType{kDLInt, 8, 1} || in_dtype == DLDataType{kDLFloat, 16, 1} ||
+           in_dtype == DLDataType{kDLBfloat, 16, 1};
   } else {
     return false;
   }
@@ -145,7 +144,7 @@ void CallCublasLt(cublasLtHandle_t hdl, cudaStream_t stream,
                   const DLTensor* C, bool transa, bool transb, void* workspace_ptr,
                   size_t workspace_size, cublasLtEpilogue_t epilogue,
                   std::optional<float> dq_scale) {
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
   // Reversed strides indicates an in-place transpose operation.
   transa = IsInPlaceTransposed(A) ? !transa : transa;
   transb = IsInPlaceTransposed(B) ? !transb : transb;
@@ -164,26 +163,26 @@ void CallCublasLt(cublasLtHandle_t hdl, cudaStream_t stream,
   void* alpha = &alpha_value;
   void* beta = &zero_fp32;
 
-  if (TypeMatch(A->dtype, kDLFloat, 16)) {
+  if (A->dtype == DLDataType{kDLFloat, 16, 1}) {
     ab_type = CUDA_R_16F;
-  } else if (TypeMatch(A->dtype, kDLBfloat, 16)) {
+  } else if (A->dtype == DLDataType{kDLBfloat, 16, 1}) {
     ab_type = CUDA_R_16BF;
-  } else if (TypeMatch(A->dtype, kDLInt, 8)) {
+  } else if (A->dtype == DLDataType{kDLInt, 8, 1}) {
     ab_type = CUDA_R_8I;
-  } else if (TypeMatch(A->dtype, DataType::TypeCode::kFloat8_e4m3fn, 8)) {
+  } else if (A->dtype == DLDataType{kDLFloat8_e4m3fn, 8, 1}) {
 #if CUDART_VERSION >= 11080
-    TVM_FFI_ICHECK(TypeMatch(B->dtype, DataType::TypeCode::kFloat8_e4m3fn, 8));
+    TVM_FFI_ICHECK((B->dtype == DLDataType{kDLFloat8_e4m3fn, 8, 1}));
     ab_type = CUDA_R_8F_E4M3;
 #else
     TVM_FFI_THROW(InternalError) << "Float8 (E4M3) is only supported in CUDA 11.8 and above.";
 #endif
   }
 
-  if (TypeMatch(C->dtype, kDLFloat, 16)) {
+  if (C->dtype == DLDataType{kDLFloat, 16, 1}) {
     c_type = CUDA_R_16F;
-  } else if (TypeMatch(C->dtype, kDLBfloat, 16)) {
+  } else if (C->dtype == DLDataType{kDLBfloat, 16, 1}) {
     c_type = CUDA_R_16BF;
-  } else if (TypeMatch(C->dtype, kDLInt, 32)) {
+  } else if (C->dtype == DLDataType{kDLInt, 32, 1}) {
     c_type = CUDA_R_32I;
     compute_type = CUBLAS_COMPUTE_32I;
     scale_type = CUDA_R_32I;
@@ -346,9 +345,9 @@ inline void CallLtIgemm(ffi::PackedArgs args, ffi::Any* ret, cublasLtHandle_t hd
   TVM_FFI_ICHECK_EQ(ElementStride(B), 1);
   TVM_FFI_ICHECK_EQ(ElementStride(C), 1);
 
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
-  TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLInt, 8));
-  TVM_FFI_ICHECK(TypeMatch(C->dtype, kDLInt, 32));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
+  TVM_FFI_ICHECK((A->dtype == DLDataType{kDLInt, 8, 1}));
+  TVM_FFI_ICHECK((C->dtype == DLDataType{kDLInt, 32, 1}));
 
   TVM_FFI_ICHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type";
   int32_t alpha = args.size() > 5 ? args[5].cast<int32_t>() : 1;
@@ -405,7 +404,7 @@ inline void CallGemmEx(ffi::PackedArgs args, ffi::Any* ret, cublasHandle_t hdl)
   TVM_FFI_ICHECK_EQ(ElementStride(B), 1);
   TVM_FFI_ICHECK_EQ(ElementStride(C), 1);
 
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
 
   // C can never be transposed.
   TVM_FFI_ICHECK(!IsInPlaceTransposed(C));
@@ -415,9 +414,9 @@ inline void CallGemmEx(ffi::PackedArgs args, ffi::Any* ret, cublasHandle_t hdl)
   transb = IsInPlaceTransposed(B) ? !transb : transb;
 
   TVM_FFI_ICHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type";
-  TVM_FFI_ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0)
+  TVM_FFI_ICHECK((!(A->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride(A) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
-  TVM_FFI_ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0)
+  TVM_FFI_ICHECK((!(B->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride(B) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
@@ -464,7 +463,7 @@ inline void CallBatchGemmEx(ffi::PackedArgs args, ffi::Any* ret, cublasHandle_t
   TVM_FFI_ICHECK_EQ(ElementStride3D(B), 1);
   TVM_FFI_ICHECK_EQ(ElementStride3D(C), 1);
 
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
 
   // C can never be transposed.
   TVM_FFI_ICHECK(!IsInPlaceTransposed3D(C));
@@ -474,9 +473,9 @@ inline void CallBatchGemmEx(ffi::PackedArgs args, ffi::Any* ret, cublasHandle_t
   transb = IsInPlaceTransposed3D(B) ? !transb : transb;
 
   TVM_FFI_ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, true)) << "Unsupported data type";
-  TVM_FFI_ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride3D(A) % 4 == 0)
+  TVM_FFI_ICHECK((!(A->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride3D(A) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
-  TVM_FFI_ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride3D(B) % 4 == 0)
+  TVM_FFI_ICHECK((!(B->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride3D(B) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
@@ -538,13 +537,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
         CUBLASTryEnableTensorCore(entry_ptr->handle);
 
-        if (TypeEqual(A->dtype, C->dtype)) {
-          TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 16) || TypeMatch(A->dtype, kDLFloat, 32) ||
-                         TypeMatch(A->dtype, kDLFloat, 64));
+        if (A->dtype == C->dtype) {
+          TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 16, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-          if (TypeMatch(A->dtype, kDLFloat, 16))
+          if (A->dtype == DLDataType{kDLFloat, 16, 1})
             CallGemm(args, ret, CublasHgemmOp(entry_ptr->handle));
-          else if (TypeMatch(A->dtype, kDLFloat, 32))
+          else if (A->dtype == DLDataType{kDLFloat, 32, 1})
             CallGemm(args, ret, CublasSgemmOp(entry_ptr->handle));
           else
             CallGemm(args, ret, CublasDgemmOp(entry_ptr->handle));
@@ -565,7 +565,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
         CUBLASTryEnableTensorCore(entry_ptr->handle);
 
-        TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLInt, 8)) << "Expects dtype to be int8\n";
+        TVM_FFI_ICHECK((A->dtype == DLDataType{kDLInt, 8, 1})) << "Expects dtype to be int8\n";
         cublasLtHandle_t ltHandle;
         CHECK_CUBLAS_ERROR(cublasLtCreate(&ltHandle));
         cudaStream_t stream =
@@ -586,13 +586,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         CuBlasThreadEntry* entry_ptr = CuBlasThreadEntry::ThreadLocal(A->device);
 
         CUBLASTryEnableTensorCore(entry_ptr->handle);
-        if (TypeEqual(A->dtype, C->dtype)) {
-          TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 16) || TypeMatch(A->dtype, kDLFloat, 32) ||
-                         TypeMatch(A->dtype, kDLFloat, 64));
+        if (A->dtype == C->dtype) {
+          TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 16, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-          if (TypeMatch(A->dtype, kDLFloat, 16))
+          if (A->dtype == DLDataType{kDLFloat, 16, 1})
             CallBatchGemm(args, ret, CublasHgemmBatchOp(entry_ptr->handle));
-          else if (TypeMatch(A->dtype, kDLFloat, 32))
+          else if (A->dtype == DLDataType{kDLFloat, 32, 1})
             CallBatchGemm(args, ret, CublasSgemmBatchOp(entry_ptr->handle));
           else
             CallBatchGemm(args, ret, CublasDgemmBatchOp(entry_ptr->handle));
diff --git a/src/runtime/extra/contrib/cudnn/conv_backward.cc b/src/runtime/extra/contrib/cudnn/conv_backward.cc
index df3d7c8e6ff7..47b8ab50cdbf 100644
--- a/src/runtime/extra/contrib/cudnn/conv_backward.cc
+++ b/src/runtime/extra/contrib/cudnn/conv_backward.cc
@@ -21,9 +21,9 @@
  * \file cuDNN kernel calls for backward algorithms.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/logging.h>
 
@@ -32,8 +32,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
-
 void ConvolutionBackwardData(int mode, int format, int algo, int dims, int groups, const int pad[],
                              const int stride[], const int dilation[], DLTensor* dy, DLTensor* w,
                              DLTensor* dx, const std::string& conv_dtype) {
diff --git a/src/runtime/extra/contrib/cudnn/conv_forward.cc b/src/runtime/extra/contrib/cudnn/conv_forward.cc
index 3a573297f29e..aba57b7a9de7 100644
--- a/src/runtime/extra/contrib/cudnn/conv_forward.cc
+++ b/src/runtime/extra/contrib/cudnn/conv_forward.cc
@@ -21,9 +21,9 @@
  * \file cuDNN kernel calls for the forward algorithm.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/logging.h>
 
@@ -32,8 +32,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
-
 void ConvolutionForward(int mode, int format, int algo, int dims, int groups, const int pad[],
                         const int stride[], const int dilation[], const DLTensor* x,
                         const DLTensor* w, const DLTensor* y, const std::string& conv_dtype) {
diff --git a/src/runtime/extra/contrib/cudnn/cudnn_utils.cc b/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
index 5c34d4a2b0a6..3edb20dbacbc 100644
--- a/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
+++ b/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
@@ -23,10 +23,10 @@
 
 #include "cudnn_utils.h"
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/c_env_api.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include <string>
 #include <vector>
diff --git a/src/runtime/extra/contrib/cudnn/softmax.cc b/src/runtime/extra/contrib/cudnn/softmax.cc
index fde7d5e4e182..50b4f69f7383 100644
--- a/src/runtime/extra/contrib/cudnn/softmax.cc
+++ b/src/runtime/extra/contrib/cudnn/softmax.cc
@@ -31,8 +31,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
-
 void softmax_impl(cudnnSoftmaxAlgorithm_t alg, ffi::PackedArgs args, ffi::Any* ret) {
   auto x = args[0].cast<DLTensor*>();
   auto y = args[1].cast<DLTensor*>();
diff --git a/src/runtime/extra/contrib/cutlass/fp16_group_gemm.cuh b/src/runtime/extra/contrib/cutlass/fp16_group_gemm.cuh
index 35c4a5767236..85653222169b 100644
--- a/src/runtime/extra/contrib/cutlass/fp16_group_gemm.cuh
+++ b/src/runtime/extra/contrib/cutlass/fp16_group_gemm.cuh
@@ -49,17 +49,17 @@ void tvm_cutlass_group_gemm_impl(Tensor x, Tensor weight, Tensor indptr, Tensor
   float alpha = 1.0f;
   float beta = 0.0f;
 
-  if (DataType(x->dtype) == DataType::Float(16)) {
-    TVM_FFI_ICHECK(DataType(weight->dtype) == DataType::Float(16));
-    TVM_FFI_ICHECK(DataType(out->dtype) == DataType::Float(16));
+  if (x->dtype == DLDataType{kDLFloat, 16, 1}) {
+    TVM_FFI_ICHECK((weight->dtype == DLDataType{kDLFloat, 16, 1}));
+    TVM_FFI_ICHECK((out->dtype == DLDataType{kDLFloat, 16, 1}));
     using Dtype = cutlass::half_t;
     CutlassGroupGemm<Arch, Dtype, Dtype, Dtype>::run(
         static_cast<Dtype*>(x->data), static_cast<Dtype*>(weight->data),
         static_cast<int64_t*>(indptr->data), static_cast<uint8_t*>(workspace->data),
         workspace->shape[0], n, k, num_groups, alpha, beta, static_cast<Dtype*>(out->data), stream);
-  } else if (DataType(x->dtype) == DataType::BFloat(16)) {
-    TVM_FFI_ICHECK(DataType(weight->dtype) == DataType::BFloat(16));
-    TVM_FFI_ICHECK(DataType(out->dtype) == DataType::BFloat(16));
+  } else if (x->dtype == DLDataType{kDLBfloat, 16, 1}) {
+    TVM_FFI_ICHECK((weight->dtype == DLDataType{kDLBfloat, 16, 1}));
+    TVM_FFI_ICHECK((out->dtype == DLDataType{kDLBfloat, 16, 1}));
     using Dtype = cutlass::bfloat16_t;
     CutlassGroupGemm<Arch, Dtype, Dtype, Dtype>::run(
         static_cast<Dtype*>(x->data), static_cast<Dtype*>(weight->data),
diff --git a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh
index db88ec0faaed..1af60af4da3a 100644
--- a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh
+++ b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh
@@ -66,14 +66,15 @@ void tvm_cutlass_fp8_groupwise_scaled_gemm_impl(Tensor a, Tensor b, Tensor scale
   TVM_FFI_ICHECK_EQ((n + block_size_0 - 1) / block_size_0, scales_b->shape[0]);
   TVM_FFI_ICHECK_EQ(scales_b->shape[1] * block_size_1, k);
 
-  using tvm::runtime::DataType;
-  TVM_FFI_ICHECK_EQ(DataType(a->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(b->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(scales_a->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(scales_b->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(workspace->dtype), DataType::UInt(8));
+  TVM_FFI_ICHECK_EQ(a->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));  // parens shield macro commas
+  TVM_FFI_ICHECK_EQ(b->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));
+  TVM_FFI_ICHECK_EQ(scales_a->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(scales_b->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(workspace->dtype, (DLDataType{kDLUInt, 8, 1}));
+  int64_t workspace_nbytes =
+      workspace->shape[0] * ((workspace->dtype.bits * workspace->dtype.lanes + 7) / 8);
 
-  if (DataType(out->dtype) == DataType::Float(16)) {
+  if (out->dtype == DLDataType{kDLFloat, 16, 1}) {
     CutlassFP8GroupwiseGemm<Arch, TileShape, ClusterShape, cutlass::float_e4m3_t,
                             cutlass::float_e4m3_t, cutlass::half_t,
                             float>::run(static_cast<cutlass::float_e4m3_t*>(a->data),
@@ -81,10 +82,9 @@ void tvm_cutlass_fp8_groupwise_scaled_gemm_impl(Tensor a, Tensor b, Tensor scale
                                         static_cast<float*>(scales_a->data),
                                         static_cast<float*>(scales_b->data),
                                         static_cast<cutlass::half_t*>(out->data),
-                                        static_cast<uint8_t*>(workspace->data),
-                                        workspace->shape[0] * DataType(workspace->dtype).bytes(), m,
+                                        static_cast<uint8_t*>(workspace->data), workspace_nbytes, m,
                                         n, k, 1, stream);
-  } else if (DataType(out->dtype) == DataType::BFloat(16)) {
+  } else if (out->dtype == DLDataType{kDLBfloat, 16, 1}) {
     CutlassFP8GroupwiseGemm<Arch, TileShape, ClusterShape, cutlass::float_e4m3_t,
                             cutlass::float_e4m3_t, cutlass::bfloat16_t,
                             float>::run(static_cast<cutlass::float_e4m3_t*>(a->data),
@@ -92,11 +92,10 @@ void tvm_cutlass_fp8_groupwise_scaled_gemm_impl(Tensor a, Tensor b, Tensor scale
                                         static_cast<float*>(scales_a->data),
                                         static_cast<float*>(scales_b->data),
                                         static_cast<cutlass::bfloat16_t*>(out->data),
-                                        static_cast<uint8_t*>(workspace->data),
-                                        workspace->shape[0] * DataType(workspace->dtype).bytes(), m,
+                                        static_cast<uint8_t*>(workspace->data), workspace_nbytes, m,
                                         n, k, 1, stream);
   } else {
-    LOG(FATAL) << "Unsupported output dtype: " << DataType(out->dtype);
+    LOG(FATAL) << "Unsupported output dtype: " << out->dtype;
   }
 }
 
@@ -131,14 +130,15 @@ void tvm_cutlass_fp8_groupwise_scaled_bmm_impl(Tensor a, Tensor b, Tensor scales
   TVM_FFI_ICHECK_EQ(scales_b->shape[1] * block_size_0, n);
   TVM_FFI_ICHECK_EQ(scales_b->shape[2] * block_size_1, k);
 
-  using tvm::runtime::DataType;
-  TVM_FFI_ICHECK_EQ(DataType(a->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(b->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(scales_a->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(scales_b->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(workspace->dtype), DataType::UInt(8));
+  TVM_FFI_ICHECK_EQ(a->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));  // parens shield macro commas
+  TVM_FFI_ICHECK_EQ(b->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));
+  TVM_FFI_ICHECK_EQ(scales_a->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(scales_b->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(workspace->dtype, (DLDataType{kDLUInt, 8, 1}));
+  int64_t workspace_nbytes =
+      workspace->shape[0] * ((workspace->dtype.bits * workspace->dtype.lanes + 7) / 8);
 
-  if (DataType(out->dtype) == DataType::Float(16)) {
+  if (out->dtype == DLDataType{kDLFloat, 16, 1}) {
     CutlassFP8GroupwiseGemm<Arch, TileShape, ClusterShape, cutlass::float_e4m3_t,
                             cutlass::float_e4m3_t, cutlass::half_t,
                             float>::run(static_cast<cutlass::float_e4m3_t*>(a->data),
@@ -146,10 +146,9 @@ void tvm_cutlass_fp8_groupwise_scaled_bmm_impl(Tensor a, Tensor b, Tensor scales
                                         static_cast<float*>(scales_a->data),
                                         static_cast<float*>(scales_b->data),
                                         static_cast<cutlass::half_t*>(out->data),
-                                        static_cast<uint8_t*>(workspace->data),
-                                        workspace->shape[0] * DataType(workspace->dtype).bytes(), m,
+                                        static_cast<uint8_t*>(workspace->data), workspace_nbytes, m,
                                         n, k, batch_size, stream);
-  } else if (DataType(out->dtype) == DataType::BFloat(16)) {
+  } else if (out->dtype == DLDataType{kDLBfloat, 16, 1}) {
     CutlassFP8GroupwiseGemm<Arch, TileShape, ClusterShape, cutlass::float_e4m3_t,
                             cutlass::float_e4m3_t, cutlass::bfloat16_t,
                             float>::run(static_cast<cutlass::float_e4m3_t*>(a->data),
@@ -157,11 +156,10 @@ void tvm_cutlass_fp8_groupwise_scaled_bmm_impl(Tensor a, Tensor b, Tensor scales
                                         static_cast<float*>(scales_a->data),
                                         static_cast<float*>(scales_b->data),
                                         static_cast<cutlass::bfloat16_t*>(out->data),
-                                        static_cast<uint8_t*>(workspace->data),
-                                        workspace->shape[0] * DataType(workspace->dtype).bytes(), m,
+                                        static_cast<uint8_t*>(workspace->data), workspace_nbytes, m,
                                         n, k, batch_size, stream);
   } else {
-    LOG(FATAL) << "Unsupported output dtype: " << DataType(out->dtype);
+    LOG(FATAL) << "Unsupported output dtype: " << out->dtype;
   }
 }
 
diff --git a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu
index ea70eee38650..6bd9f45ab25e 100644
--- a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu
+++ b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu
@@ -57,15 +57,14 @@ void tvm_fp8_groupwise_scaled_group_gemm_sm100(Tensor a, Tensor b, Tensor scales
   TVM_FFI_ICHECK_EQ((n + block_size_0 - 1) / block_size_0, scales_b->shape[1]);
   TVM_FFI_ICHECK_EQ((k + block_size_1 - 1) / block_size_1, scales_b->shape[2]);
 
-  using tvm::runtime::DataType;
-  TVM_FFI_ICHECK_EQ(DataType(a->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(b->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(scales_a->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(scales_b->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(indptr->dtype), DataType::Int(64));
-  TVM_FFI_ICHECK_EQ(DataType(workspace->dtype), DataType::UInt(8));
+  TVM_FFI_ICHECK_EQ(a->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));  // parens shield macro commas
+  TVM_FFI_ICHECK_EQ(b->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));
+  TVM_FFI_ICHECK_EQ(scales_a->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(scales_b->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(indptr->dtype, (DLDataType{kDLInt, 64, 1}));
+  TVM_FFI_ICHECK_EQ(workspace->dtype, (DLDataType{kDLUInt, 8, 1}));
 
-  if (DataType(out->dtype) == DataType::Float(16)) {
+  if (out->dtype == DLDataType{kDLFloat, 16, 1}) {
     using Dtype = cutlass::half_t;
     cutlass_fp8_groupwise_scaled_group_gemm_sm100<cutlass::float_e4m3_t, cutlass::float_e4m3_t,
                                                   Dtype, float>(
@@ -73,7 +72,7 @@ void tvm_fp8_groupwise_scaled_group_gemm_sm100(Tensor a, Tensor b, Tensor scales
         static_cast<float*>(scales_a->data), static_cast<float*>(scales_b->data),
         static_cast<int64_t*>(indptr->data), static_cast<uint8_t*>(workspace->data),
         workspace->shape[0], n, k, num_groups, static_cast<Dtype*>(out->data), stream);
-  } else if (DataType(out->dtype) == DataType::BFloat(16)) {
+  } else if (out->dtype == DLDataType{kDLBfloat, 16, 1}) {
     using Dtype = cutlass::bfloat16_t;
     cutlass_fp8_groupwise_scaled_group_gemm_sm100<cutlass::float_e4m3_t, cutlass::float_e4m3_t,
                                                   Dtype, float>(
diff --git a/src/runtime/extra/contrib/dnnl/dnnl_utils.cc b/src/runtime/extra/contrib/dnnl/dnnl_utils.cc
index 23992209f2ad..e41d378b3d30 100644
--- a/src/runtime/extra/contrib/dnnl/dnnl_utils.cc
+++ b/src/runtime/extra/contrib/dnnl/dnnl_utils.cc
@@ -32,21 +32,21 @@ namespace contrib {
 dnnl::memory::data_type dtype_dl2dnnl(DLDataType dltype) {
   using dt = dnnl::memory::data_type;
   dt dnnl_type = dt::undef;
-  if (dltype.code == DataType::TypeCode::kFloat) {
+  if (dltype.code == DLDataTypeCode::kDLFloat) {
     if (dltype.bits == 16) {
       dnnl_type = dt::f16;
     } else if (dltype.bits == 32) {
       dnnl_type = dt::f32;
     }
-  } else if (dltype.code == DataType::TypeCode::kBFloat && dltype.bits == 16) {
+  } else if (dltype.code == DLDataTypeCode::kDLBfloat && dltype.bits == 16) {
     dnnl_type = dt::bf16;
-  } else if (dltype.code == DataType::TypeCode::kInt) {
+  } else if (dltype.code == DLDataTypeCode::kDLInt) {
     if (dltype.bits == 8) {
       dnnl_type = dt::s8;
     } else if (dltype.bits == 32) {
       dnnl_type = dt::s32;
     }
-  } else if (dltype.code == DataType::TypeCode::kUInt && dltype.bits == 8) {
+  } else if (dltype.code == DLDataTypeCode::kDLUInt && dltype.bits == 8) {
     dnnl_type = dt::u8;
   }
   if (dnnl_type == dt::undef) {
diff --git a/src/runtime/extra/contrib/dnnl/dnnl_utils.h b/src/runtime/extra/contrib/dnnl/dnnl_utils.h
index a598b6704450..6f36ed4d8fbe 100644
--- a/src/runtime/extra/contrib/dnnl/dnnl_utils.h
+++ b/src/runtime/extra/contrib/dnnl/dnnl_utils.h
@@ -34,7 +34,7 @@
 //  -Wzero-as-null-pointer-constant and -Wdocumentation-unknown-command
 #include <dnnl.hpp>
 
-#include "tvm/runtime/data_type.h"
+#include "tvm/ffi/dtype.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/extra/contrib/hipblas/hipblas.cc b/src/runtime/extra/contrib/hipblas/hipblas.cc
index 5276b4f7956d..18e136b0fdec 100644
--- a/src/runtime/extra/contrib/hipblas/hipblas.cc
+++ b/src/runtime/extra/contrib/hipblas/hipblas.cc
@@ -21,10 +21,10 @@
  * \file Use external hipblas library call.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include "../../../../../3rdparty/compiler-rt/builtin_fp16.h"
 #include "../cblas/gemm_common.h"
@@ -33,7 +33,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
 inline hipblasOperation_t HIPBLASBooleanToTranspose(bool item) {
   return item ? HIPBLAS_OP_T : HIPBLAS_OP_N;
 }
@@ -117,10 +116,10 @@ struct HipblasDgemmBatchOp {
 
 // Check supported mix-precision computation type and return computeType
 bool CheckMixPrecisionType(DLDataType in_dtype, DLDataType out_dtype, bool int_support = true) {
-  if (int_support && TypeMatch(out_dtype, kDLInt, 32)) {
-    return TypeMatch(in_dtype, kDLInt, 8);
-  } else if (TypeMatch(out_dtype, kDLFloat, 32)) {
-    return TypeMatch(in_dtype, kDLInt, 8) || TypeMatch(in_dtype, kDLFloat, 16);
+  if (int_support && out_dtype == DLDataType{kDLInt, 32, 1}) {
+    return in_dtype == DLDataType{kDLInt, 8, 1};
+  } else if (out_dtype == DLDataType{kDLFloat, 32, 1}) {
+    return in_dtype == DLDataType{kDLInt, 8, 1} || in_dtype == DLDataType{kDLFloat, 16, 1};
   } else {
     return false;
   }
@@ -131,7 +130,7 @@ void CallHipblasLt(hipblasLtHandle_t hdl, hipStream_t stream,
                    const DLTensor* B, const DLTensor* bias, const DLTensor* C, bool transa,
                    bool transb, void* workspace_ptr, size_t workspace_size,
                    hipblasLtEpilogue_t epilogue) {
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
   // Reversed strides indicates an in-place transpose operation.
   transa = IsInPlaceTransposed(A) ? !transa : transa;
   transb = IsInPlaceTransposed(B) ? !transb : transb;
@@ -147,15 +146,15 @@ void CallHipblasLt(hipblasLtHandle_t hdl, hipStream_t stream,
   void* alpha = &one_fp32;
   void* beta = &zero_fp32;
 
-  if (TypeMatch(A->dtype, kDLFloat, 16)) {
+  if (A->dtype == DLDataType{kDLFloat, 16, 1}) {
     ab_type = HIP_R_16F;
-  } else if (TypeMatch(A->dtype, kDLInt, 8)) {
+  } else if (A->dtype == DLDataType{kDLInt, 8, 1}) {
     ab_type = HIP_R_8I;
   }
 
-  if (TypeMatch(C->dtype, kDLFloat, 16)) {
+  if (C->dtype == DLDataType{kDLFloat, 16, 1}) {
     c_type = HIP_R_16F;
-  } else if (TypeMatch(C->dtype, kDLInt, 32)) {
+  } else if (C->dtype == DLDataType{kDLInt, 32, 1}) {
     c_type = HIP_R_32I;
     compute_type = HIPBLAS_COMPUTE_32I;
     scale_type = HIP_R_32I;
@@ -288,7 +287,7 @@ inline void CallGemmEx(ffi::PackedArgs args, ffi::Any* ret, hipblasHandle_t hdl)
   TVM_FFI_ICHECK_EQ(ElementStride(B), 1);
   TVM_FFI_ICHECK_EQ(ElementStride(C), 1);
 
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
 
   // C can never be transposed.
   TVM_FFI_ICHECK(!IsInPlaceTransposed(C));
@@ -298,9 +297,9 @@ inline void CallGemmEx(ffi::PackedArgs args, ffi::Any* ret, hipblasHandle_t hdl)
   transb = IsInPlaceTransposed(B) ? !transb : transb;
 
   TVM_FFI_ICHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type";
-  TVM_FFI_ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0)
+  TVM_FFI_ICHECK((!(A->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride(A) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
-  TVM_FFI_ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0)
+  TVM_FFI_ICHECK((!(B->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride(B) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
@@ -347,7 +346,7 @@ inline void CallBatchGemmEx(ffi::PackedArgs args, ffi::Any* ret, hipblasHandle_t
   TVM_FFI_ICHECK_EQ(ElementStride3D(B), 1);
   TVM_FFI_ICHECK_EQ(ElementStride3D(C), 1);
 
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
 
   // C can never be transposed.
   TVM_FFI_ICHECK(!IsInPlaceTransposed3D(C));
@@ -357,9 +356,9 @@ inline void CallBatchGemmEx(ffi::PackedArgs args, ffi::Any* ret, hipblasHandle_t
   transb = IsInPlaceTransposed3D(B) ? !transb : transb;
 
   TVM_FFI_ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, true)) << "Unsupported data type";
-  TVM_FFI_ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride3D(A) % 4 == 0)
+  TVM_FFI_ICHECK((!(A->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride3D(A) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
-  TVM_FFI_ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride3D(B) % 4 == 0)
+  TVM_FFI_ICHECK((!(B->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride3D(B) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
@@ -419,14 +418,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
                     HipBlasThreadEntry* entry_ptr = HipBlasThreadEntry::ThreadLocal(A->device);
 
-                    if (TypeEqual(A->dtype, C->dtype)) {
-                      TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 16) ||
-                                     TypeMatch(A->dtype, kDLFloat, 32) ||
-                                     TypeMatch(A->dtype, kDLFloat, 64));
+                    if (A->dtype == C->dtype) {
+                      TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 16, 1} ||
+                                      A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                      A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-                      if (TypeMatch(A->dtype, kDLFloat, 16)) {
+                      if (A->dtype == DLDataType{kDLFloat, 16, 1}) {
                         CallGemm(args, ret, HipblasHgemmOp(entry_ptr->handle));
-                      } else if (TypeMatch(A->dtype, kDLFloat, 32)) {
+                      } else if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
                         CallGemm(args, ret, HipblasSgemmOp(entry_ptr->handle));
                       } else {
                         CallGemm(args, ret, HipblasDgemmOp(entry_ptr->handle));
@@ -441,13 +440,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
         HipBlasThreadEntry* entry_ptr = HipBlasThreadEntry::ThreadLocal(A->device);
 
-        if (TypeEqual(A->dtype, C->dtype)) {
-          TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 16) || TypeMatch(A->dtype, kDLFloat, 32) ||
-                         TypeMatch(A->dtype, kDLFloat, 64));
+        if (A->dtype == C->dtype) {
+          TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 16, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-          if (TypeMatch(A->dtype, kDLFloat, 16)) {
+          if (A->dtype == DLDataType{kDLFloat, 16, 1}) {
             CallBatchGemm(args, ret, HipblasHgemmBatchOp(entry_ptr->handle));
-          } else if (TypeMatch(A->dtype, kDLFloat, 32)) {
+          } else if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
             CallBatchGemm(args, ret, HipblasSgemmBatchOp(entry_ptr->handle));
           } else {
             CallBatchGemm(args, ret, HipblasDgemmBatchOp(entry_ptr->handle));
diff --git a/src/runtime/extra/contrib/json/json_node.h b/src/runtime/extra/contrib/json/json_node.h
index c165f6b05cf3..40c96d826914 100644
--- a/src/runtime/extra/contrib/json/json_node.h
+++ b/src/runtime/extra/contrib/json/json_node.h
@@ -29,9 +29,9 @@
 #include <tvm/ffi/any.h>
 #include <tvm/ffi/container/array.h>
 #include <tvm/ffi/container/map.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/json.h>
 #include <tvm/ffi/string.h>
-#include <tvm/runtime/data_type.h>
 
 #include <cstdint>
 #include <cstdio>
diff --git a/src/runtime/extra/contrib/nvshmem/memory_allocator.cc b/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
index cb6e3520c8c1..1483563b6200 100644
--- a/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
+++ b/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
@@ -57,7 +57,7 @@ class NVSHMEMAllocator final : public PooledAllocator {
     return allocator;
   }
 
-  Tensor Empty(ffi::Shape shape, DataType dtype, Device device) {
+  Tensor Empty(ffi::Shape shape, DLDataType dtype, Device device) {
     class NVSHMEMAlloc {
      public:
       explicit NVSHMEMAlloc(Buffer buffer) : buffer_(buffer) {}
@@ -87,7 +87,7 @@ class NVSHMEMAllocator final : public PooledAllocator {
   void DeviceFreeDataSpace(Device dev, void* ptr) final { nvshmem_free(ptr); }
 };
 
-Tensor NVSHMEMEmpty(ffi::Shape shape, DataType dtype, ffi::Optional<Device> device) {
+Tensor NVSHMEMEmpty(ffi::Shape shape, DLDataType dtype, ffi::Optional<Device> device) {
   return NVSHMEMAllocator::Global()->Empty(shape, dtype, UseDefaultDeviceIfNone(device));
 }
 
diff --git a/src/runtime/extra/contrib/random/random.cc b/src/runtime/extra/contrib/random/random.cc
index a3d0cd8b85a8..0a96185933e3 100644
--- a/src/runtime/extra/contrib/random/random.cc
+++ b/src/runtime/extra/contrib/random/random.cc
@@ -21,10 +21,10 @@
  * \file External random functions for tensor.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include <algorithm>
 #include <cstdint>
@@ -69,8 +69,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
-
 struct RandomThreadLocalEntry {
   RandomEngine random_engine;
   static RandomThreadLocalEntry* ThreadLocal();
diff --git a/src/runtime/extra/contrib/sort/sort.cc b/src/runtime/extra/contrib/sort/sort.cc
index 51a94111b6e6..6e3a99f93522 100644
--- a/src/runtime/extra/contrib/sort/sort.cc
+++ b/src/runtime/extra/contrib/sort/sort.cc
@@ -23,10 +23,10 @@
 
 #include <dlpack/dlpack.h>
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include <algorithm>
 #include <vector>
@@ -36,8 +36,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
-
 template <typename DType, bool stable_comparison = false>
 bool CompareAscend(const std::pair<int64_t, DType>& lhs, const std::pair<int64_t, DType>& rhs) {
   if constexpr (stable_comparison) {
diff --git a/src/runtime/extra/contrib/vllm/cache_alloc.cc b/src/runtime/extra/contrib/vllm/cache_alloc.cc
index 266138406cb9..42601d7a5e69 100644
--- a/src/runtime/extra/contrib/vllm/cache_alloc.cc
+++ b/src/runtime/extra/contrib/vllm/cache_alloc.cc
@@ -39,9 +39,9 @@ ffi::Array<Tensor> AllocateKVCache(int head_size, int num_layers, int num_heads,
   for (int i = 0; i < num_layers; ++i) {
     Tensor key_blocks =
         Tensor::Empty({num_blocks, num_heads, head_size / vec_size, block_size, vec_size},
-                      runtime::DataType::Float(16), dev);
+                      DLDataType{kDLFloat, 16, 1}, dev);
     Tensor value_blocks = Tensor::Empty({num_blocks, num_heads, head_size, block_size},
-                                        runtime::DataType::Float(16), dev);
+                                        DLDataType{kDLFloat, 16, 1}, dev);
     cache.push_back(key_blocks);
     cache.push_back(value_blocks);
   }
diff --git a/src/runtime/extra/contrib/vllm/cache_kernels.cu b/src/runtime/extra/contrib/vllm/cache_kernels.cu
index 5af93a1fd904..6a09497a8d12 100644
--- a/src/runtime/extra/contrib/vllm/cache_kernels.cu
+++ b/src/runtime/extra/contrib/vllm/cache_kernels.cu
@@ -206,16 +206,16 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         DLDevice dev = key_cache->device;
 
         Tensor key_cache_ptrs_gpu =
-            Tensor::Empty({static_cast<int>(num_layers)}, runtime::DataType::Int(64), dev);
+            Tensor::Empty({static_cast<int>(num_layers)}, DLDataType{kDLInt, 64, 1}, dev);
         Tensor value_cache_ptrs_gpu =
-            Tensor::Empty({static_cast<int>(num_layers)}, runtime::DataType::Int(64), dev);
+            Tensor::Empty({static_cast<int>(num_layers)}, DLDataType{kDLInt, 64, 1}, dev);
         key_cache_ptrs_gpu.CopyFromBytes(key_cache_ptrs.data(),
                                          sizeof(int64_t) * key_cache_ptrs.size());
         value_cache_ptrs_gpu.CopyFromBytes(value_cache_ptrs.data(),
                                            sizeof(int64_t) * value_cache_ptrs.size());
 
         Tensor block_mapping_gpu =
-            Tensor::Empty(block_mapping.Shape(), runtime::DataType::Int(64), dev);
+            Tensor::Empty(block_mapping.Shape(), DLDataType{kDLInt, 64, 1}, dev);
         block_mapping_gpu.CopyFromBytes(block_mapping->data,
                                         sizeof(int64_t) * block_mapping->shape[0]);
 
diff --git a/src/runtime/extra/disco/builtin.cc b/src/runtime/extra/disco/builtin.cc
index da9f472b3e76..d9d5fc132768 100644
--- a/src/runtime/extra/disco/builtin.cc
+++ b/src/runtime/extra/disco/builtin.cc
@@ -71,7 +71,7 @@ ffi::Module LoadVMModule(std::string path, ffi::Optional<Device> device) {
   return mod;
 }
 
-Tensor DiscoEmptyTensor(ffi::Shape shape, DataType dtype, ffi::Optional<Device> device) {
+Tensor DiscoEmptyTensor(ffi::Shape shape, DLDataType dtype, ffi::Optional<Device> device) {
   return Tensor::Empty(shape, dtype, UseDefaultDeviceIfNone(device));
 }
 
@@ -131,7 +131,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   refl::GlobalDef()
       .def("runtime.disco.load_vm_module", LoadVMModule)
       .def("runtime.disco.empty",
-           [](ffi::Shape shape, DataType dtype, ffi::Optional<Device> device, bool worker0_only,
+           [](ffi::Shape shape, DLDataType dtype, ffi::Optional<Device> device, bool worker0_only,
               bool in_group) -> ffi::Optional<Tensor> {
              int worker_id = WorkerId();
              int group_size =
diff --git a/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc b/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
index 426557b7b7ad..a8a8030f0169 100644
--- a/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
+++ b/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
@@ -97,10 +97,12 @@ class CUDAIPCMemoryAllocator final : public memory::PooledAllocator {
     auto [data_ptr, data_comm_ptrs] =
         AllocIPCMemory(dev, size, alignment, type_hint, /*reset_memory_to_zero=*/false);
     int barrier_ptr_size = sizeof(uint32_t) * (MAX_ALL_REDUCE_BLOCKS + 2) * MAX_RANKS_PER_NODE;
-    auto [barrier_in_ptr, barrier_in_comm_ptrs] = AllocIPCMemory(
-        dev, barrier_ptr_size, alignment, DataType::UInt(32), /*reset_memory_to_zero=*/true);
-    auto [barrier_out_ptr, barrier_out_comm_ptrs] = AllocIPCMemory(
-        dev, barrier_ptr_size, alignment, DataType::UInt(32), /*reset_memory_to_zero=*/true);
+    auto [barrier_in_ptr, barrier_in_comm_ptrs] =
+        AllocIPCMemory(dev, barrier_ptr_size, alignment, DLDataType{kDLUInt, 32, 1},
+                       /*reset_memory_to_zero=*/true);
+    auto [barrier_out_ptr, barrier_out_comm_ptrs] =
+        AllocIPCMemory(dev, barrier_ptr_size, alignment, DLDataType{kDLUInt, 32, 1},
+                       /*reset_memory_to_zero=*/true);
 
     // Create the CUDAIPCMemory object.
     ffi::ObjectPtr<CUDAIPCMemoryObj> ipc_memory = ffi::make_object<CUDAIPCMemoryObj>();
diff --git a/src/runtime/extra/disco/cuda_ipc/custom_allreduce.cc b/src/runtime/extra/disco/cuda_ipc/custom_allreduce.cc
index ffe00d5feef9..3eaca5ba98d4 100644
--- a/src/runtime/extra/disco/cuda_ipc/custom_allreduce.cc
+++ b/src/runtime/extra/disco/cuda_ipc/custom_allreduce.cc
@@ -81,7 +81,7 @@ void CustomAllReduce(DLTensor* send, int strategy, DLTensor* recv) {
     // Dispatch to nccl AllReduce if the customized all-reduce cannot apply.
     deviceStream_t stream = ctx->GetDefaultStream();
     NCCL_CALL(ncclAllReduce(send->data, recv->data, num_elements,
-                            /*datatype=*/nccl::AsNCCLDataType(DataType(send->dtype)),
+                            /*datatype=*/nccl::AsNCCLDataType(send->dtype),
                             /*op=*/ncclSum, ctx->global_comm, stream));
     return;
   }
diff --git a/src/runtime/extra/disco/loader.cc b/src/runtime/extra/disco/loader.cc
index 86caac6573ed..f714112aecf3 100644
--- a/src/runtime/extra/disco/loader.cc
+++ b/src/runtime/extra/disco/loader.cc
@@ -17,10 +17,10 @@
  * under the License.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/json.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/disco/builtin.h>
 #include <tvm/runtime/vm/tensor_cache_support.h>
 
@@ -45,7 +45,7 @@ using ParamRecord = TensorCacheMetadata::FileRecord::ParamRecord;
 struct ShardInfo {
   struct TensorInfo {
     ffi::Shape shape;
-    DataType dtype;
+    DLDataType dtype;
   };
   struct ShardFunc {
     std::string name;
@@ -67,8 +67,7 @@ ShardInfo::TensorInfo LoadTensorInfoFromJSON(const json::Array& json_tensor_info
     shape.push_back(shape_json[i].cast<int64_t>());
   }
   std::string dtype = json_tensor_info[1].cast<ffi::String>();
-  return ShardInfo::TensorInfo{ffi::Shape(std::move(shape)),
-                               DataType(ffi::StringToDLDataType(dtype))};
+  return ShardInfo::TensorInfo{ffi::Shape(std::move(shape)), ffi::StringToDLDataType(dtype)};
 }
 
 ShardInfo::ShardFunc LoadShardFuncFromJSON(const json::Array& json_shard_func) {
@@ -301,7 +300,7 @@ Tensor ShardLoaderObj::Load(int weight_index) const {
   bool needs_sharding = !param_info.shard_info.funcs.empty();
   if (needs_sharding) {
     ffi::Shape shape = param_info.shard_info.funcs.back().output_info.shape;
-    DataType dtype = param_info.shard_info.funcs.back().output_info.dtype;
+    DLDataType dtype = param_info.shard_info.funcs.back().output_info.dtype;
     TVM_FFI_CHECK(shape.size() >= 1 && shape[0] == num_shards, ValueError)
         << "The first dimension of the "
         << "output shape must be equal to the "
diff --git a/src/runtime/extra/disco/nccl/nccl.cc b/src/runtime/extra/disco/nccl/nccl.cc
index 887f440b1b4f..cd00a1ac3d6b 100644
--- a/src/runtime/extra/disco/nccl/nccl.cc
+++ b/src/runtime/extra/disco/nccl/nccl.cc
@@ -122,8 +122,8 @@ void AllReduce(Tensor send, ReduceKind reduce_kind, bool in_group, Tensor recv)
   ffi::Shape shape = send.Shape();
   int64_t numel = shape->Product();
   deviceStream_t stream = ctx->GetDefaultStream();
-  DataType dtype = DataType(send->dtype);
-  if (dtype == DataType::Float8E4M3FN() || dtype == DataType::Float8E5M2()) {
+  DLDataType dtype = send->dtype;
+  if (dtype == DLDataType{kDLFloat8_e4m3fn, 8, 1} || dtype == DLDataType{kDLFloat8_e5m2, 8, 1}) {
     TVM_FFI_THROW(InternalError)
         << "Float8 data type cannot be allreduced, as nccl does not support this data type.";
   }
@@ -139,7 +139,7 @@ void AllGather(Tensor send, bool in_group, Tensor recv) {
   int64_t numel = shape->Product();
   deviceStream_t stream = ctx->GetDefaultStream();
   NCCL_CALL(ncclAllGather(send->data, recv->data, numel,
-                          /*datatype=*/AsNCCLDataType(DataType(send->dtype)),
+                          /*datatype=*/AsNCCLDataType(send->dtype),
                           in_group ? ctx->group_comm : ctx->global_comm, stream));
 }
 
@@ -162,7 +162,7 @@ void BroadcastFromWorker0(ffi::Optional<Tensor> send, bool in_group, Tensor recv
 
   deviceStream_t stream = ctx->GetDefaultStream();
   NCCL_CALL(ncclBroadcast(send_data, recv->data, numel,
-                          /*datatype=*/AsNCCLDataType(DataType(recv->dtype)),
+                          /*datatype=*/AsNCCLDataType(recv->dtype),
                           /*root=*/0, in_group ? ctx->group_comm : ctx->global_comm, stream));
 }
 
@@ -185,9 +185,9 @@ void ScatterFromWorker0(ffi::Optional<Tensor> send, bool in_group, Tensor recv)
            "of elements in the buffer to be "
            "divisible by the number of workers, but got numel = "
         << numel << " and " << num_receiver << " workers.";
-    DataType dtype(buffer->dtype);
+    DLDataType dtype = buffer->dtype;
     int64_t numel_per_shard = numel / num_receiver;
-    int64_t bytes_per_shard = numel_per_shard * dtype.bytes();
+    int64_t bytes_per_shard = numel_per_shard * ((dtype.bits * dtype.lanes + 7) / 8);
     TVM_FFI_CHECK_EQ(numel_per_shard, recv.Shape().Product(), ValueError)
         << "The number of elements in buffer `recv` must be the same as each shard "
            "of "
@@ -209,7 +209,7 @@ void ScatterFromWorker0(ffi::Optional<Tensor> send, bool in_group, Tensor recv)
     NCCL_CALL(ncclGroupStart());
   }
   int64_t numel = recv.Shape().Product();
-  DataType dtype(recv->dtype);
+  DLDataType dtype = recv->dtype;
   NCCL_CALL(ncclRecv(recv->data, numel, AsNCCLDataType(dtype), 0,
                      in_group ? ctx->group_comm : ctx->global_comm, stream));
   NCCL_CALL(ncclGroupEnd());
@@ -234,9 +234,9 @@ void GatherToWorker0(Tensor send, bool in_group, ffi::Optional<Tensor> recv) {
            "of elements in the buffer to be "
            "divisible by the number of workers, but got numel = "
         << numel << " and " << num_receiver << " workers.";
-    DataType dtype(buffer->dtype);
+    DLDataType dtype = buffer->dtype;
     int64_t numel_per_shard = numel / num_receiver;
-    int64_t bytes_per_shard = numel_per_shard * dtype.bytes();
+    int64_t bytes_per_shard = numel_per_shard * ((dtype.bits * dtype.lanes + 7) / 8);
     TVM_FFI_CHECK_EQ(numel_per_shard, send.Shape().Product(), ValueError)
         << "The number of elements in buffer `send` must be the same as each shard "
            "of "
@@ -258,7 +258,7 @@ void GatherToWorker0(Tensor send, bool in_group, ffi::Optional<Tensor> recv) {
     NCCL_CALL(ncclGroupStart());
   }
   int64_t numel = send.Shape().Product();
-  DataType dtype(send->dtype);
+  DLDataType dtype = send->dtype;
   NCCL_CALL(ncclSend(send->data, numel, AsNCCLDataType(dtype), 0,
                      in_group ? ctx->group_comm : ctx->global_comm, stream));
   NCCL_CALL(ncclGroupEnd());
diff --git a/src/runtime/extra/disco/nccl/nccl_context.h b/src/runtime/extra/disco/nccl/nccl_context.h
index 7a99be0897c0..d529ab441d11 100644
--- a/src/runtime/extra/disco/nccl/nccl_context.h
+++ b/src/runtime/extra/disco/nccl/nccl_context.h
@@ -86,39 +86,39 @@ inline void StreamDestroy(deviceStream_t stream) { ROCM_CALL(hipStreamDestroy(st
 
 #endif
 
-/*! \brief Convert DataType to ncclDataType. */
-inline ncclDataType_t AsNCCLDataType(runtime::DataType dtype) {
-  if (dtype == DataType::Int(8)) {
+/*! \brief Convert DLPack dtype to ncclDataType. */
+inline ncclDataType_t AsNCCLDataType(DLDataType dtype) {
+  if (dtype == DLDataType{kDLInt, 8, 1}) {
     return ncclInt8;
   }
-  if (dtype == DataType::UInt(8) || dtype == DataType::Float8E4M3FN() ||
-      dtype == DataType::Float8E5M2()) {
+  if (dtype == DLDataType{kDLUInt, 8, 1} || dtype == DLDataType{kDLFloat8_e4m3fn, 8, 1} ||
+      dtype == DLDataType{kDLFloat8_e5m2, 8, 1}) {
     // For float8 data type, pretend to be uint8 in nccl.
     // And will throw error when allreduce, as it makes no sense in this case.
     return ncclUint8;
   }
-  if (dtype == DataType::Int(32)) {
+  if (dtype == DLDataType{kDLInt, 32, 1}) {
     return ncclInt32;
   }
-  if (dtype == DataType::UInt(32)) {
+  if (dtype == DLDataType{kDLUInt, 32, 1}) {
     return ncclUint32;
   }
-  if (dtype == DataType::Int(64)) {
+  if (dtype == DLDataType{kDLInt, 64, 1}) {
     return ncclInt64;
   }
-  if (dtype == DataType::UInt(64)) {
+  if (dtype == DLDataType{kDLUInt, 64, 1}) {
     return ncclUint64;
   }
-  if (dtype == DataType::Float(16)) {
+  if (dtype == DLDataType{kDLFloat, 16, 1}) {
     return ncclFloat16;
   }
-  if (dtype == DataType::Float(32)) {
+  if (dtype == DLDataType{kDLFloat, 32, 1}) {
     return ncclFloat32;
   }
-  if (dtype == DataType::Float(64)) {
+  if (dtype == DLDataType{kDLFloat, 64, 1}) {
     return ncclFloat64;
   }
-  if (dtype == DataType::BFloat(16)) {
+  if (dtype == DLDataType{kDLBfloat, 16, 1}) {
     return ncclBfloat16;
   }
   TVM_FFI_THROW(ValueError) << "Unsupported data type " << dtype;
diff --git a/src/runtime/tensor.cc b/src/runtime/tensor.cc
index 887d576537f2..ed12d0b4885a 100644
--- a/src/runtime/tensor.cc
+++ b/src/runtime/tensor.cc
@@ -33,7 +33,7 @@
 
 #include "../support/base64.h"
 #include "../support/bytes_io.h"
-#include "tvm/runtime/data_type.h"
+#include "tvm/ffi/dtype.h"
 
 namespace tvm {
 namespace runtime {
@@ -52,11 +52,11 @@ inline void VerifyDataType(DLDataType dtype) {
       return;
     else if (dtype.bits == 4 && dtype.code == kDLInt)
       return;
-    else if (dtype.bits == 6 && dtype.code == DataType::kFloat6_e2m3fn)
+    else if (dtype.bits == 6 && dtype.code == kDLFloat6_e2m3fn)
       return;
-    else if (dtype.bits == 6 && dtype.code == DataType::kFloat6_e3m2fn)
+    else if (dtype.bits == 6 && dtype.code == kDLFloat6_e3m2fn)
       return;
-    else if (dtype.bits == 4 && dtype.code == DataType::kFloat4_e2m1fn)
+    else if (dtype.bits == 4 && dtype.code == kDLFloat4_e2m1fn)
       return;
     else
       TVM_FFI_ICHECK_EQ(dtype.bits % 8, 0);
diff --git a/src/runtime/vm/attn_backend.h b/src/runtime/vm/attn_backend.h
index 067fa8d10dc1..6aececc755ea 100644
--- a/src/runtime/vm/attn_backend.h
+++ b/src/runtime/vm/attn_backend.h
@@ -321,7 +321,7 @@ class PagedDecodeFunc : public AttnBackendFunc {
                             Tensor page_locked_int_workspace_buffer, HostMemoryVector* page_indptr,
                             int64_t batch_size, int64_t page_size, int64_t num_qo_heads,
                             int64_t num_kv_heads, int64_t qk_head_dim, int64_t v_head_dim,
-                            RoPEMode rope_mode, DataType q_dtype, DataType kv_dtype,
+                            RoPEMode rope_mode, DLDataType q_dtype, DLDataType kv_dtype,
                             TVMStreamHandle copy_stream) {
     // Do nothing. Subclasses can override to customize behavior.
   }
@@ -377,7 +377,7 @@ class FlashInferPagedDecodeFunc : public PagedDecodeFunc {
                     Tensor page_locked_int_workspace_buffer, HostMemoryVector* page_indptr,
                     int64_t batch_size, int64_t page_size, int64_t num_qo_heads,
                     int64_t num_kv_heads, int64_t qk_head_dim, int64_t v_head_dim,
-                    RoPEMode rope_mode, DataType q_dtype, DataType kv_dtype,
+                    RoPEMode rope_mode, DLDataType q_dtype, DLDataType kv_dtype,
                     TVMStreamHandle copy_stream) final {
     // Todo(tvm-team): enable cuda graph
     ffi::Shape plan_info_vec =
diff --git a/src/runtime/vm/attn_utils.h b/src/runtime/vm/attn_utils.h
index 7a2c93414c0f..4f9cd648e9d7 100644
--- a/src/runtime/vm/attn_utils.h
+++ b/src/runtime/vm/attn_utils.h
@@ -359,7 +359,7 @@ class HostMemoryVector {
 
   explicit HostMemoryVector(int64_t reserved_size, DLDataType dtype, Device device)
       : reserved_size_(reserved_size) {
-    TVM_FFI_ICHECK(DataType(dtype) == DataType::Int(32));
+    TVM_FFI_ICHECK((dtype == DLDataType{kDLInt, 32, 1}));
     data_ = Tensor::Empty({reserved_size}, dtype, device);
   }
 
@@ -368,7 +368,7 @@ class HostMemoryVector {
     if (current_size_ == reserved_size_) {
       reserved_size_ *= 2;
       Tensor new_data = Tensor::Empty({reserved_size_}, data_->dtype, data_->device);
-      std::memcpy(new_data->data, data_->data, current_size_ * DataType(data_->dtype).bytes());
+      std::memcpy(new_data->data, data_->data, current_size_ * (((data_->dtype).bits + 7) / 8));
       data_ = new_data;
     }
     static_cast<int32_t*>(data_->data)[current_size_++] = value;
@@ -382,7 +382,7 @@ class HostMemoryVector {
         reserved_size_ *= 2;
       }
       Tensor new_data = Tensor::Empty({reserved_size_}, data_->dtype, data_->device);
-      std::memcpy(new_data->data, data_->data, current_size_ * DataType(data_->dtype).bytes());
+      std::memcpy(new_data->data, data_->data, current_size_ * (((data_->dtype).bits + 7) / 8));
       data_ = new_data;
     }
     std::memcpy(static_cast<int32_t*>(data_->data) + current_size_, values.data(),
@@ -466,7 +466,7 @@ class PagedKVCacheAuxDataManager {
         device_(device),
         preferred_host_device_(preferred_host_device),
         copy_stream_(copy_stream) {
-    TVM_FFI_ICHECK(DataType(dtype_aux) == DataType::Int(32));
+    TVM_FFI_ICHECK((dtype_aux == DLDataType{kDLInt, 32, 1}));
   }
 
   virtual ~PagedKVCacheAuxDataManager() = default;
diff --git a/src/runtime/vm/builtin.cc b/src/runtime/vm/builtin.cc
index 8fc18c5c0722..30fbf77b9c7f 100644
--- a/src/runtime/vm/builtin.cc
+++ b/src/runtime/vm/builtin.cc
@@ -22,11 +22,11 @@
 #include <tvm/ffi/any.h>
 #include <tvm/ffi/container/array.h>
 #include <tvm/ffi/container/shape.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/memory.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/memory/memory_manager.h>
 #include <tvm/runtime/tensor.h>
@@ -243,14 +243,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 void CheckTensorInfo(ffi::PackedArgs args, ffi::Any* rv) {
   ffi::AnyView arg = args[0];
   int ndim = args[1].cast<int>();
-  DataType dtype;
+  DLDataType dtype;
   ffi::Optional<ffi::String> err_ctx;
 
   if (args.size() == 3) {
-    dtype = DataType::Void();
+    dtype = DLDataType{kDLOpaqueHandle, 0, 0};
     err_ctx = args[2].cast<ffi::Optional<ffi::String>>();
   } else {
-    dtype = args[2].cast<DataType>();
+    dtype = args[2].cast<DLDataType>();
     err_ctx = args[3].cast<ffi::Optional<ffi::String>>();
   }
 
@@ -264,10 +264,10 @@ void CheckTensorInfo(ffi::PackedArgs args, ffi::Any* rv) {
         << err_ctx.value_or("") << " expect Tensor with ndim " << ndim << " but get " << ptr->ndim;
   }
 
-  if (dtype != DataType::Void()) {
-    TVM_FFI_CHECK(DataType(ptr->dtype) == dtype, ValueError)
+  if (dtype != DLDataType{kDLOpaqueHandle, 0, 0}) {
+    TVM_FFI_CHECK(ptr->dtype == dtype, ValueError)
         << err_ctx.value_or("") << " expect Tensor with dtype " << dtype << " but get "
-        << DataType(ptr->dtype);
+        << ptr->dtype;
   }
 }
 
@@ -301,23 +301,24 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 /*!
  * \brief Builtin function to check if arg is PrimValue(dtype)
  * \param arg The input argument.
- * \param dtype Expected dtype of the PrimValue.  Can be DataType::Void() for unknown dtype.
+ * \param dtype Expected dtype of the PrimValue.  Can be DLDataType{kDLOpaqueHandle, 0, 0} for
+ * unknown dtype.
  * \param err_ctx Additional context if error occurs.
  */
-void CheckPrimValueInfo(ffi::AnyView arg, DataType dtype, ffi::Optional<ffi::String> err_ctx) {
+void CheckPrimValueInfo(ffi::AnyView arg, DLDataType dtype, ffi::Optional<ffi::String> err_ctx) {
   if (auto opt_obj = arg.as<ffi::ObjectRef>()) {
     TVM_FFI_THROW(TypeError) << err_ctx.value_or("") << ", expected dtype " << dtype
                              << ", but received ObjectRef of type "
                              << opt_obj.value()->GetTypeKey();
-  } else if (dtype.is_bool()) {
+  } else if (dtype.code == kDLBool) {
     arg.cast<bool>();
-  } else if (dtype.is_int()) {
+  } else if (dtype.code == kDLInt) {
     arg.cast<int64_t>();
-  } else if (dtype.is_uint()) {
+  } else if (dtype.code == kDLUInt) {
     arg.cast<uint64_t>();
-  } else if (dtype.is_float()) {
+  } else if (dtype.code == kDLFloat) {
     arg.cast<double>();
-  } else if (dtype.is_handle()) {
+  } else if (dtype.code == kDLOpaqueHandle && !(dtype.bits == 0 && dtype.lanes == 0)) {
     arg.cast<void*>();
   } else {
     TVM_FFI_THROW(TypeError) << err_ctx.value_or("") << ", unsupported dtype " << dtype;
@@ -398,7 +399,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         Storage sobj = args[0].cast<Storage>();
         int64_t offset = args[1].cast<int64_t>();
         ffi::Shape shape = args[2].cast<ffi::Shape>();
-        DataType dtype = args[3].cast<DataType>();
+        DLDataType dtype = args[3].cast<DLDataType>();
         if (args.size() == 5) {
           ffi::String scope = args[4].cast<ffi::String>();
           *rv = sobj->AllocTensorScoped(offset, shape, dtype, scope);
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index 33ff1503f823..9e3a5f932309 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -101,8 +101,7 @@ std::string VMExecutable::Stats() const {
       oss << opt_int.value();
       oss << ", ";
     } else if (auto opt_dtype = it.as<DLDataType>()) {
-      DataType dtype(opt_dtype.value());
-      oss << dtype;
+      oss << opt_dtype.value();
       oss << ", ";
     } else {
       TVM_FFI_THROW(InternalError) << "Unsupported constant pool type " << it.GetTypeKey();
diff --git a/src/runtime/vm/lm_support.cc b/src/runtime/vm/lm_support.cc
index 51b271441a27..2516e0d8a1af 100644
--- a/src/runtime/vm/lm_support.cc
+++ b/src/runtime/vm/lm_support.cc
@@ -362,7 +362,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // NOTE this is a built-in highly related to LM so we put it here.
 int SampleTopPFromLogits(Tensor logits, double temperature, double top_p, double uniform_sample) {
   TVM_FFI_ICHECK(logits.IsContiguous());
-  TVM_FFI_ICHECK(logits.DataType() == DataType::Float(32));
+  TVM_FFI_ICHECK((logits.DataType() == DLDataType{kDLFloat, 32, 1}));
 
   if (logits->device.device_type != kDLCPU) {
     logits = logits.CopyTo(DLDevice{kDLCPU, 0});
@@ -428,7 +428,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 int SampleTopPFromProb(Tensor prob, double top_p, double uniform_sample) {
   TVM_FFI_ICHECK(prob.IsContiguous());
-  TVM_FFI_ICHECK(prob.DataType() == DataType::Float(32));
+  TVM_FFI_ICHECK((prob.DataType() == DLDataType{kDLFloat, 32, 1}));
 
   if (prob->device.device_type != kDLCPU) {
     prob = prob.CopyTo(DLDevice{kDLCPU, 0});
@@ -543,7 +543,8 @@ Tensor MultinomialFromUniform(Tensor prob, Tensor uniform_sample) {
   int64_t vocab_size = prob->shape[prob->ndim - 1];
   const float* pprob = static_cast<float*>(prob->data);
   const float* psample = static_cast<float*>(uniform_sample->data);
-  Tensor new_array = Tensor::Empty({batch_size, 1}, DataType::Int(64), uniform_sample->device);
+  Tensor new_array =
+      Tensor::Empty({batch_size, 1}, DLDataType{kDLInt, 64, 1}, uniform_sample->device);
   int64_t* parray = static_cast<int64_t*>(new_array->data);
   for (int64_t i = 0; i < batch_size; ++i) {
     float cum_sum_prob = 0.0f;
@@ -569,8 +570,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 void ApplyRepetitionPenalty(Tensor logits, Tensor token_ids, double penalty) {
   TVM_FFI_ICHECK(logits.IsContiguous());
   TVM_FFI_ICHECK(token_ids.IsContiguous());
-  TVM_FFI_ICHECK(logits.DataType() == DataType::Float(32)) << "Logits data type is not float32!";
-  TVM_FFI_ICHECK(token_ids.DataType() == DataType::Int(32)) << "token ids must be int32!";
+  TVM_FFI_ICHECK((logits.DataType() == DLDataType{kDLFloat, 32, 1}))
+      << "Logits data type is not float32!";
+  TVM_FFI_ICHECK((token_ids.DataType() == DLDataType{kDLInt, 32, 1})) << "token ids must be int32!";
   TVM_FFI_ICHECK(logits->device.device_type == kDLCPU) << "logits device must be CPU!";
   TVM_FFI_ICHECK(token_ids->device.device_type == kDLCPU) << "token_ids device must be CPU!";
   float* logits_raw_data = static_cast<float*>(logits->data);
@@ -606,9 +608,11 @@ void ApplyPresenceAndFrequencyPenalty(Tensor logits, Tensor token_ids, Tensor to
   TVM_FFI_ICHECK(logits.IsContiguous());
   TVM_FFI_ICHECK(token_ids.IsContiguous());
   TVM_FFI_ICHECK(token_freqs.IsContiguous());
-  TVM_FFI_ICHECK(logits.DataType() == DataType::Float(32)) << "Logits data type is not float32!";
-  TVM_FFI_ICHECK(token_ids.DataType() == DataType::Int(32)) << "token ids must be int32!";
-  TVM_FFI_ICHECK(token_freqs.DataType() == DataType::Int(32)) << "token freqs must be int32!";
+  TVM_FFI_ICHECK((logits.DataType() == DLDataType{kDLFloat, 32, 1}))
+      << "Logits data type is not float32!";
+  TVM_FFI_ICHECK((token_ids.DataType() == DLDataType{kDLInt, 32, 1})) << "token ids must be int32!";
+  TVM_FFI_ICHECK((token_freqs.DataType() == DLDataType{kDLInt, 32, 1}))
+      << "token freqs must be int32!";
   TVM_FFI_ICHECK(logits->device.device_type == kDLCPU) << "logits device must be CPU!";
   TVM_FFI_ICHECK(token_ids->device.device_type == kDLCPU) << "token_ids device must be CPU!";
   TVM_FFI_ICHECK(token_freqs->device.device_type == kDLCPU) << "token_ids device must be CPU!";
@@ -633,7 +637,8 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // This is an inplace operation.
 void ApplySoftmaxWithTemperature(Tensor logits, double temperature) {
   TVM_FFI_ICHECK(logits.IsContiguous());
-  TVM_FFI_ICHECK(logits.DataType() == DataType::Float(32)) << "Logits data type is not float32!";
+  TVM_FFI_ICHECK((logits.DataType() == DLDataType{kDLFloat, 32, 1}))
+      << "Logits data type is not float32!";
   TVM_FFI_ICHECK(logits->device.device_type == kDLCPU) << "logits device must be CPU!";
   int vocab_size = logits->shape[logits->ndim - 1];
   float* logits_raw_data = static_cast<float*>(logits->data);
diff --git a/src/runtime/vm/paged_kv_cache.cc b/src/runtime/vm/paged_kv_cache.cc
index e5c4576e01c1..cd7920d6eef0 100644
--- a/src/runtime/vm/paged_kv_cache.cc
+++ b/src/runtime/vm/paged_kv_cache.cc
@@ -116,9 +116,9 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
   const ffi::Optional<Tensor> rope_ext_factors_;
 
   /*! \brief The KV cache dtype. */
-  const DataType kv_dtype_;
+  const DLDataType kv_dtype_;
   /*! \brief We fix int32 to be the index dtype of auxiliary data. */
-  const DLDataType dtype_aux_ = DLDataType(DataType::Int(32, 1));
+  const DLDataType dtype_aux_ = DLDataType{kDLInt, 32, 1};
 
   /********************* Page Structures *********************/
 
@@ -326,7 +326,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
         rotary_scale_(rotary_scale),
         rotary_theta_(rotary_theta),
         rope_ext_factors_(std::move(rope_ext_factors)),
-        kv_dtype_(DataType(dtype)),
+        kv_dtype_(dtype),
         reserved_num_seqs_(reserved_num_seqs),
         f_transpose_append_mha_(std::move(f_transpose_append_mha)),
         f_transpose_append_mla_(std::move(f_transpose_append_mla)),
@@ -372,7 +372,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
         pages_.push_back(nvshmem_pages_.CreateView(
             {num_total_pages_, 2, num_kv_heads_, page_size_, qk_head_dim_}, nvshmem_pages_->dtype,
             i * num_total_pages_ * 2 * num_kv_heads_ * page_size_ * qk_head_dim_ *
-                nvshmem_pages_.DataType().bytes()));
+                ((nvshmem_pages_.DataType().bits + 7) / 8)));
       }
 
       const auto f_transfer_kv_ptr = tvm::ffi::Function::GetGlobal("nvshmem.KVTransfer");
@@ -450,9 +450,9 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
     for (int d = 0; d < kPagedKVCacheMaxBlockDepth; ++d) {
       if (NeedKernelBeginForward()) {
         temp_int_attn_workspace_.push_back(
-            Tensor::Empty({kIntAttnWorkspaceByte}, DataType::UInt(8), device));
+            Tensor::Empty({kIntAttnWorkspaceByte}, DLDataType{kDLUInt, 8, 1}, device));
         temp_int_pinned_attn_workspace_.push_back(Tensor::Empty(
-            {kIntAttnWorkspaceByte}, DataType::UInt(8), GetPreferredHostDevice(device)));
+            {kIntAttnWorkspaceByte}, DLDataType{kDLUInt, 8, 1}, GetPreferredHostDevice(device)));
       }
       qo_indptr_on_depths_view_.push_back(Tensor());
       page_indptr_on_depths_view_.push_back(Tensor());
@@ -470,11 +470,11 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
     // Additional workspace for the "prefill with ragged kv" kernel.
     if (NeedKernelBeginForward()) {
       temp_int_attn_workspace_.push_back(
-          Tensor::Empty({kIntAttnWorkspaceByte}, DataType::UInt(8), device));
+          Tensor::Empty({kIntAttnWorkspaceByte}, DLDataType{kDLUInt, 8, 1}, device));
       temp_int_pinned_attn_workspace_.push_back(Tensor::Empty(
-          {kIntAttnWorkspaceByte}, DataType::UInt(8), GetPreferredHostDevice(device)));
+          {kIntAttnWorkspaceByte}, DLDataType{kDLUInt, 8, 1}, GetPreferredHostDevice(device)));
       temp_float_attn_workspace_ =
-          Tensor::Empty({kFloatAttnWorkspaceByte}, DataType::UInt(8), device);
+          Tensor::Empty({kFloatAttnWorkspaceByte}, DLDataType{kDLUInt, 8, 1}, device);
     }
 
     if (std::find(attn_kinds_.begin(), attn_kinds_.end(), AttnKind::kMHA) != attn_kinds_.end()) {
@@ -488,9 +488,9 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
     temp_attn_output_device_ =
         Tensor::Empty({prefill_chunk_size_, num_qo_heads, v_head_dim}, dtype, device);
     temp_attn_lse_device_ =
-        Tensor::Empty({prefill_chunk_size_, num_qo_heads}, DataType::Float(32), device);
+        Tensor::Empty({prefill_chunk_size_, num_qo_heads}, DLDataType{kDLFloat, 32, 1}, device);
     merged_attn_lse_device_ =
-        Tensor::Empty({prefill_chunk_size_, num_qo_heads}, DataType::Float(32), device);
+        Tensor::Empty({prefill_chunk_size_, num_qo_heads}, DLDataType{kDLFloat, 32, 1}, device);
     for (int64_t page_id = num_total_pages - 1; page_id >= 0; --page_id) {
       free_page_ids_.push_back(page_id);
     }
diff --git a/src/runtime/vm/rnn_state.cc b/src/runtime/vm/rnn_state.cc
index 9926b3d235e8..a38acf6e1cdf 100644
--- a/src/runtime/vm/rnn_state.cc
+++ b/src/runtime/vm/rnn_state.cc
@@ -83,7 +83,7 @@ class RNNStateImpObj : public RNNStateObj {
   const ffi::Array<Tensor> init_layer_value_;
 
   /*! \brief We fix int32 to be the index dtype of auxiliary data. */
-  const DLDataType dtype_aux_ = DLDataType(DataType::Int(32, 1));
+  const DLDataType dtype_aux_ = DLDataType{kDLInt, 32, 1};
 
   /******************* Storage Structures *******************/
 
diff --git a/src/runtime/vm/tensor_cache_support.cc b/src/runtime/vm/tensor_cache_support.cc
index ee77c5ddd8f0..62fd1a34c62f 100644
--- a/src/runtime/vm/tensor_cache_support.cc
+++ b/src/runtime/vm/tensor_cache_support.cc
@@ -64,7 +64,7 @@ TensorCacheMetadata::FileRecord::ParamRecord JSONAsParamRecord(const json::Objec
   TensorCacheMetadata::FileRecord::ParamRecord result;
   std::string dtype = json["dtype"].cast<ffi::String>();
   result.name = json["name"].cast<ffi::String>();
-  result.dtype = DataType(ffi::StringToDLDataType(dtype));
+  result.dtype = ffi::StringToDLDataType(dtype);
   result.format = json["format"].cast<ffi::String>();
   result.nbytes = json["nbytes"].cast<int64_t>();
   result.byte_offset = json["byteOffset"].cast<int64_t>();
@@ -154,7 +154,7 @@ void CopyTensorFromBytes(Tensor param, const void* data, size_t nbytes,
 Tensor TensorCacheMetadata::FileRecord::ParamRecord::Load(
     Device device, const std::string* raw_data, ffi::Optional<Tensor>* staging_buffer) const {
   Tensor arr = Tensor::Empty(shape, dtype, device);
-  if (dtype == DataType::Float(32) && format == "f32-to-bf16") {
+  if (dtype == DLDataType{kDLFloat, 32, 1} && format == "f32-to-bf16") {
     // decode bf16 to f32
     std::vector<uint16_t> buffer(nbytes / 2);
     std::vector<uint32_t> decoded(nbytes / 2);
diff --git a/src/s_tir/analysis/calculate_allocated_memory.cc b/src/s_tir/analysis/calculate_allocated_memory.cc
index 51330a63e88b..41df4ee4bb8a 100644
--- a/src/s_tir/analysis/calculate_allocated_memory.cc
+++ b/src/s_tir/analysis/calculate_allocated_memory.cc
@@ -76,7 +76,7 @@ class AllocBufferCalculator : public StmtExprVisitor {
         break;
       }
     }
-    size *= op->buffer->dtype.bytes() * op->buffer->dtype.lanes();
+    size *= ((op->buffer->dtype.bits() + 7) / 8) * op->buffer->dtype.lanes();
     _current_size[storage_scope] += size;
     _max_size[storage_scope] = std::max(_current_size[storage_scope], _max_size[storage_scope]);
     StmtExprVisitor::VisitStmt_(op);
diff --git a/src/s_tir/analysis/estimate_flops.cc b/src/s_tir/analysis/estimate_flops.cc
index d77e715db1b6..bcde2d4b70bd 100644
--- a/src/s_tir/analysis/estimate_flops.cc
+++ b/src/s_tir/analysis/estimate_flops.cc
@@ -26,15 +26,13 @@ namespace tvm {
 namespace s_tir {
 using namespace tvm::tirx;
 
-int32_t DataType2Int(const tvm::DataType& dtype) {
+int32_t DataType2Int(DLDataType dtype) {
   static_assert(sizeof(DLDataType) == sizeof(int32_t), "Incorrect size of DLDataType");
   union {
     DLDataType src;
     int32_t dst;
   } converter;
-  converter.src.code = dtype.code();
-  converter.src.bits = dtype.bits();
-  converter.src.lanes = dtype.lanes();
+  converter.src = dtype;
   return converter.dst;
 }
 
@@ -57,7 +55,7 @@ ffi::String Int2DataTypeStr(int32_t dtype) {
 struct TResult {
   TResult() = default;
 
-  void Add(const tvm::DataType& dtype) { data_[DataType2Int(dtype)] += 1; }
+  void Add(DLDataType dtype) { data_[DataType2Int(dtype)] += 1; }
 
   TResult operator+=(const TResult& rhs) {
     for (const auto& kv : rhs.data_) {
@@ -98,7 +96,7 @@ class FlopEstimator : private ExprFunctor<TResult(const PrimExpr& n)>,
   TResult VisitExpr_(const Node* op) final {     \
     TResult result = VisitExpr(op->a);           \
     result += VisitExpr(op->b);                  \
-    result.Add(op->dtype);                       \
+    result.Add(op->ty()->dtype);                 \
     return result;                               \
   }
   TVM_TIR_ESTIMATE_FLOP_VISIT_BINARY(AddNode);
diff --git a/src/s_tir/analysis/sblock_access_region_detector.cc b/src/s_tir/analysis/sblock_access_region_detector.cc
index 18eef8e2fe01..9fa0a7b0b325 100644
--- a/src/s_tir/analysis/sblock_access_region_detector.cc
+++ b/src/s_tir/analysis/sblock_access_region_detector.cc
@@ -348,7 +348,7 @@ ffi::Array<BufferRegion> BlockReadWriteDetector::CollectRegions(
       const tvm::arith::IntSet& range = regions[i][j];
       if (range.CanProveSinglePoint(ana_)) {
         PrimExpr min = range.min();
-        region.push_back(Range::FromMinExtent(min, MakeConst(min.dtype(), 1)));
+        region.push_back(Range::FromMinExtent(min, MakeConst(min.ty(), 1)));
       } else {
         region.push_back(range.CoverRange(Range::FromMinExtent(0, buffers[i]->shape[j])));
       }
diff --git a/src/s_tir/analysis/verify_gpu_code.cc b/src/s_tir/analysis/verify_gpu_code.cc
index bd7b7c92ba7c..8155fd791e4b 100644
--- a/src/s_tir/analysis/verify_gpu_code.cc
+++ b/src/s_tir/analysis/verify_gpu_code.cc
@@ -76,19 +76,19 @@ class GPUCodeVerifier : public StmtExprVisitor {
         break;
       }
     }
+    PrimType dtype_ty = op->buffer->dtype;
+    TVM_FFI_ICHECK(!dtype_ty.IsScalableVector())
+        << "Cannot verify GPU memory usage for scalable vector dtype " << dtype_ty;
     if (storage_scope.rank == runtime::StorageRank::kLocal) {
-      local_memory_per_block_ +=
-          static_cast<size_t>(const_size) * op->buffer->dtype.bytes() * op->buffer->dtype.lanes();
+      local_memory_per_block_ += static_cast<size_t>(const_size) * ElementBytes(dtype_ty);
     } else if (storage_scope.rank == runtime::StorageRank::kShared) {
-      shared_memory_per_block_ +=
-          static_cast<size_t>(const_size) * op->buffer->dtype.bytes() * op->buffer->dtype.lanes();
+      shared_memory_per_block_ += static_cast<size_t>(const_size) * ElementBytes(dtype_ty);
     }
-    if (op->buffer->dtype.is_vector()) {
-      if (static_cast<size_t>(op->buffer->dtype.lanes() * op->buffer->dtype.bytes()) >
-          max_vector_bytes_) {
+    if (dtype_ty.IsFixedLengthVector()) {
+      if (ElementBytes(dtype_ty) > max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << op->buffer->dtype.lanes() << ") times number of bytes ("
-          << op->buffer->dtype.bytes() << ") for dtype " << op->buffer->dtype
+        s << "Number of lanes (" << dtype_ty.lanes() << ") times number of bytes ("
+          << ((dtype_ty.bits() + 7) / 8) << ") for dtype " << dtype_ty
           << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
@@ -202,11 +202,12 @@ class GPUCodeVerifier : public StmtExprVisitor {
   void CheckBufferIndicesVectorizable(const ffi::Array<PrimExpr> indices) {
     for (const auto index : indices) {
       if (const auto* ramp = index.as<RampNode>()) {
-        if (!is_one(ramp->stride) &&
-            static_cast<size_t>(ramp->dtype.lanes() * ramp->dtype.bytes()) > max_vector_bytes_) {
+        PrimType ramp_ty = ramp->ty();
+        if (!is_one(ramp->stride) && ramp_ty.IsFixedLengthVector() &&
+            ElementBytes(ramp_ty) > max_vector_bytes_) {
           std::stringstream s;
-          s << "Number of lanes (" << ramp->dtype.lanes() << ") times number of bytes ("
-            << ramp->dtype.bytes() << ") for dtype " << ramp->dtype
+          s << "Number of lanes (" << ramp_ty.lanes() << ") times number of bytes ("
+            << ((ramp_ty.bits() + 7) / 8) << ") for dtype " << ramp_ty
             << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
           errors_.push_back(s.str());
         }
@@ -215,11 +216,12 @@ class GPUCodeVerifier : public StmtExprVisitor {
   }
 
   void VisitExpr_(const CastNode* op) {
-    if (op->dtype.is_vector()) {
-      if (static_cast<size_t>(op->dtype.lanes() * op->dtype.bytes()) > max_vector_bytes_) {
+    PrimType op_ty = op->ty();
+    if (op_ty.IsFixedLengthVector()) {
+      if (ElementBytes(op_ty) > max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << op->dtype.lanes() << ") times number of bytes ("
-          << op->dtype.bytes() << ") for dtype " << op->dtype
+        s << "Number of lanes (" << op_ty.lanes() << ") times number of bytes ("
+          << ((op_ty.bits() + 7) / 8) << ") for dtype " << op_ty
           << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
@@ -228,11 +230,12 @@ class GPUCodeVerifier : public StmtExprVisitor {
   }
 
   void VisitExpr_(const BufferLoadNode* op) {
-    if (op->dtype.is_vector()) {
-      if (static_cast<size_t>(op->dtype.lanes() * op->dtype.bytes()) > max_vector_bytes_) {
+    PrimType op_ty = op->ty();
+    if (op_ty.IsFixedLengthVector()) {
+      if (ElementBytes(op_ty) > max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << op->dtype.lanes() << ") times number of bytes ("
-          << op->dtype.bytes() << ") for dtype " << op->dtype
+        s << "Number of lanes (" << op_ty.lanes() << ") times number of bytes ("
+          << ((op_ty.bits() + 7) / 8) << ") for dtype " << op_ty
           << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
@@ -242,12 +245,12 @@ class GPUCodeVerifier : public StmtExprVisitor {
   }
 
   void VisitStmt_(const BufferStoreNode* op) {
-    if (op->value->dtype.is_vector()) {
-      if (static_cast<size_t>(op->value->dtype.lanes() * op->value->dtype.bytes()) >
-          max_vector_bytes_) {
+    PrimType value_ty = op->value.ty();
+    if (value_ty.IsFixedLengthVector()) {
+      if (ElementBytes(value_ty) > max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << op->value->dtype.lanes() << ") times number of bytes ("
-          << op->value->dtype.bytes() << ") for dtype " << op->value->dtype
+        s << "Number of lanes (" << value_ty.lanes() << ") times number of bytes ("
+          << ((value_ty.bits() + 7) / 8) << ") for dtype " << value_ty
           << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
@@ -277,6 +280,8 @@ class GPUCodeVerifier : public StmtExprVisitor {
 
   std::vector<ffi::String> errors_;
 
+  static size_t ElementBytes(const PrimType& ty) { return ty.StorageBytes(); }
+
   void Reset_() {
     local_memory_per_block_ = 0;
     shared_memory_per_block_ = 0;
diff --git a/src/s_tir/backend/adreno/inject_texture_alloc.cc b/src/s_tir/backend/adreno/inject_texture_alloc.cc
index e4e7c322ef55..5b6aeda19362 100644
--- a/src/s_tir/backend/adreno/inject_texture_alloc.cc
+++ b/src/s_tir/backend/adreno/inject_texture_alloc.cc
@@ -79,11 +79,11 @@ class TextureAllocInjector : public arith::IRMutatorWithAnalyzer {
       ffi::Array<PrimExpr> args;
       args.push_back(StringImm(storage_scope));
       args.push_back(IntImm::Int64(3));
-      args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(),
+      args.push_back(Call(PrimType::Handle(), builtin::tvm_stack_make_shape(),
                           {texture.width, texture.height, texture.depth}));
       args.push_back(IntImm::Int64(channel_size));
       stmt = Bind(op->buffer->data,
-                  Call(op->buffer->data.dtype(), builtin::nd_mem_alloc_with_scope(), args));
+                  Call(op->buffer->data.ty(), builtin::nd_mem_alloc_with_scope(), args));
     }
     return stmt;
   }
diff --git a/src/s_tir/backend/adreno/texture_flatten.cc b/src/s_tir/backend/adreno/texture_flatten.cc
index 0dd939ad817a..d4297e42e4d2 100644
--- a/src/s_tir/backend/adreno/texture_flatten.cc
+++ b/src/s_tir/backend/adreno/texture_flatten.cc
@@ -100,7 +100,7 @@ class TextureFlattener : public TextureLoweringBase {
     if (IsTextureStorage(storage_scope)) {
       ffi::Array<PrimExpr> args = GetTextureAccessArgs(op, op->buffer);
       args.push_back(op->value);
-      stmt = Evaluate(Call(args[0]->dtype, builtin::texture2d_store(), args));
+      stmt = Evaluate(Call(args[0].ty(), builtin::texture2d_store(), args));
     }
 
     return stmt;
@@ -147,7 +147,7 @@ class TextureFlattener : public TextureLoweringBase {
     PrimExpr col_offset = SimplifyOffset(col_dims, col_indices);
     PrimExpr depth_offset = SimplifyOffset(depth_dims, depth_indices);
     PrimExpr channel_size = IntImm(
-        DataType::Int(32, 1), *tirx::as_const_int(buffer->shape.back()) * buffer->dtype.bits());
+        PrimType::Int(32, 1), *tirx::as_const_int(buffer->shape.back()) * buffer->dtype.bits());
     args.push_back(row_offset);
     args.push_back(col_offset);
     args.push_back(depth_offset);
diff --git a/src/s_tir/data_layout.cc b/src/s_tir/data_layout.cc
index 787386c8ccb9..6fa2db0206e4 100644
--- a/src/s_tir/data_layout.cc
+++ b/src/s_tir/data_layout.cc
@@ -22,10 +22,10 @@
  * \brief Data SLayout expression.
  */
 #include <tvm/arith/analyzer.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/expr.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/logging.h>
 #include <tvm/s_tir/data_layout.h>
 #include <tvm/tirx/analysis.h>
@@ -113,8 +113,9 @@ SLayout::SLayout(const ffi::Array<IterVar>& axes) {
   data_ = std::move(node);
 }
 
-SLayout::SLayout(const std::string& name, DataType dtype) {  // NOLINT(*)
-  TVM_FFI_CHECK(dtype.is_int(), TypeError) << "The input dtype should be integer type";
+SLayout::SLayout(const std::string& name, PrimType index_ty) {  // NOLINT(*)
+  TVM_FFI_CHECK(index_ty.code() == DLDataTypeCode::kDLInt, TypeError)
+      << "The input dtype should be integer type";
   if (name == "__undef__") return;
 
   auto node = ffi::make_object<SLayoutNode>();
@@ -131,8 +132,8 @@ SLayout::SLayout(const std::string& name, DataType dtype) {  // NOLINT(*)
     if (c >= 'A' && c <= 'Z') {
       TVM_FFI_ICHECK_EQ(factor, 0) << "Invalid layout " << name << ": invalid factor size "
                                    << factor << " before dimension " << c;
-      IterVar axis(Range(IntImm(dtype, 0), Var(std::string(1, c), dtype)),
-                   Var(std::string(1, c), dtype), tirx::kDataPar);
+      IterVar axis(Range(IntImm(index_ty, 0), Var(std::string(1, c), index_ty)),
+                   Var(std::string(1, c), index_ty), tirx::kDataPar);
       if (!in_packing) {
         node->axes.push_back(axis);
       } else {
@@ -143,7 +144,7 @@ SLayout::SLayout(const std::string& name, DataType dtype) {  // NOLINT(*)
                                    << factor << " for dimension " << c;
       std::stringstream name;
       name << factor << c;
-      IterVar axis(Range(IntImm(dtype, 0), IntImm(dtype, factor)), Var(name.str(), dtype),
+      IterVar axis(Range(IntImm(index_ty, 0), IntImm(index_ty, factor)), Var(name.str(), index_ty),
                    tirx::kDataPar);
       if (!in_packing) {
         node->axes.push_back(axis);
@@ -174,8 +175,8 @@ SLayout::SLayout(const std::string& name, DataType dtype) {  // NOLINT(*)
         extent = extent * factor->value;
       }
       std::string grouped_name = ss.str();
-      IterVar grouped_axis(Range(IntImm(dtype, 0), IntImm(dtype, extent)), Var(grouped_name, dtype),
-                           tirx::kDataPar);
+      IterVar grouped_axis(Range(IntImm(index_ty, 0), IntImm(index_ty, extent)),
+                           Var(grouped_name, index_ty), tirx::kDataPar);
       node->axes.push_back(grouped_axis);
 
       in_packing = false;
@@ -231,21 +232,21 @@ ffi::Array<IterVar> SLayout::UnpackIterVar(IterVar packed_iter) {
   int64_t factor = 0, final_factor = 1;
 
   std::string name(packed_iter->var->name_hint.c_str());
-  DataType dtype = packed_iter->var.dtype();
+  PrimType index_ty = packed_iter->var.ty();
 
   for (auto ch : name) {
     if (ch >= '0' && ch <= '9') {
       factor = factor * 10 + (ch - '0');
     } else if (ch >= 'a' && ch <= 'z') {
       TVM_FFI_ICHECK(factor != 0) << "Invalid Factor Size";
-      result.push_back(IterVar(Range(IntImm(dtype, 0), IntImm(dtype, factor)),
-                               Var(std::string(1, ch), dtype), tirx::kDataPar));
+      result.push_back(IterVar(Range(IntImm(index_ty, 0), IntImm(index_ty, factor)),
+                               Var(std::string(1, ch), index_ty), tirx::kDataPar));
       final_factor *= factor;
       factor = 0;
     } else if (ch >= 'A' && ch <= 'Z') {
       TVM_FFI_ICHECK(factor == 0) << "Can't have non-zero factors for primal axis";
-      result.push_back(IterVar(Range(IntImm(dtype, 0), Var(std::string(1, ch), dtype)),
-                               Var(std::string(1, ch), dtype), tirx::kDataPar));
+      result.push_back(IterVar(Range(IntImm(index_ty, 0), Var(std::string(1, ch), index_ty)),
+                               Var(std::string(1, ch), index_ty), tirx::kDataPar));
     }
   }
 
@@ -256,7 +257,7 @@ IterVar SLayout::PackIterVar(ffi::Array<IterVar> iter_vars) {
   std::stringstream name;
   size_t extent = 1;
 
-  DataType dtype = iter_vars[0]->dom->extent.as<PrimExpr>().value()->dtype;
+  PrimType index_ty = iter_vars[0]->dom->extent.as<PrimExpr>().value().ty();
   for (auto itvar : iter_vars) {
     TVM_FFI_ICHECK(itvar->dom->extent.as<IntImm>())
         << "Packed Axis can contain only Subordinate Axes";
@@ -264,7 +265,7 @@ IterVar SLayout::PackIterVar(ffi::Array<IterVar> iter_vars) {
     extent = extent * itvar->dom->extent.as<IntImm>().value()->value;
   }
 
-  return IterVar(Range(IntImm(dtype, 0), IntImm(dtype, extent)), Var(name.str(), dtype),
+  return IterVar(Range(IntImm(index_ty, 0), IntImm(index_ty, extent)), Var(name.str(), index_ty),
                  tirx::kDataPar);
 }
 
@@ -357,7 +358,8 @@ inline bool GetStoreRule(ffi::Array<PrimExpr>* index_rule, ffi::Array<PrimExpr>*
             if (axis == sub_axis) {
               const auto* sub_extent = inter_unpacked_axes[l]->dom->extent.as<IntImmNode>();
               TVM_FFI_ICHECK(sub_extent) << "Expected Integer Extents for Offset Calculation";
-              factor_ij = factor_ij * IntImm(sub_extent->dtype, sub_extent->value);
+              factor_ij =
+                  factor_ij * IntImm(ffi::GetRef<PrimExpr>(sub_extent).ty(), sub_extent->value);
             }
           }
         }
@@ -498,11 +500,11 @@ inline ffi::Array<PrimExpr> TransformShape(const ffi::Array<PrimExpr>& src_shape
               << ", get " << orig_shape;
         }
       }
-      bind_map[orig_axis->var.get()] = IntImm(orig_axis->var->dtype, 0);
+      bind_map[orig_axis->var.get()] = IntImm(orig_axis->var.ty(), 0);
     } else {
-      bind_map[orig_axis->var.get()] = orig_axis->var->dtype == orig_shape->dtype
+      bind_map[orig_axis->var.get()] = orig_axis->var.ty()->dtype == orig_shape.ty()->dtype
                                            ? orig_shape
-                                           : cast(orig_axis->var->dtype, orig_shape);
+                                           : cast(orig_axis->var.ty(), orig_shape);
     }
   }
   // infer the target shape,
@@ -583,7 +585,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef()
-      .def("s_tir.SLayout", [](std::string name, DataType dtype) { return SLayout(name, dtype); })
+      .def("s_tir.SLayout", [](std::string name, PrimType dtype) { return SLayout(name, dtype); })
       .def("s_tir.SLayoutIndexOf",
            [](SLayout layout, std::string axis) -> int { return layout.IndexOf(axis); })
       .def("s_tir.SLayoutFactorOf",
diff --git a/src/s_tir/meta_schedule/arg_info.cc b/src/s_tir/meta_schedule/arg_info.cc
index dc452b370037..73fa41773883 100644
--- a/src/s_tir/meta_schedule/arg_info.cc
+++ b/src/s_tir/meta_schedule/arg_info.cc
@@ -98,7 +98,7 @@ ffi::Array<ArgInfo> ArgInfo::FromPrimFunc(const tirx::PrimFunc& func) {
   for (const tirx::Var& arg : func->params) {
     if (ffi::Optional<tirx::Buffer> _buffer = func->buffer_map.Get(arg)) {
       tirx::Buffer buffer = _buffer.value();
-      result.push_back(TensorInfo(/*dtype=*/buffer->dtype,
+      result.push_back(TensorInfo(/*dtype=*/buffer->dtype->dtype,
                                   /*shape=*/AsVector<PrimExpr, int64_t>(buffer->shape)));
     } else {
       TVM_FFI_THROW(ValueError) << "Unsupported argument type: " << arg;
@@ -117,7 +117,7 @@ ffi::Array<ArgInfo> ArgInfo::FromEntryFunc(const IRModule& mod, bool remove_prep
 
 /******** TensorInfo ********/
 
-TensorInfo::TensorInfo(runtime::DataType dtype, ffi::Shape shape) {
+TensorInfo::TensorInfo(DLDataType dtype, ffi::Shape shape) {
   ffi::ObjectPtr<TensorInfoNode> n = ffi::make_object<TensorInfoNode>();
   n->dtype = dtype;
   n->shape = shape;
@@ -150,7 +150,7 @@ TensorInfo TensorInfo::FromJSON(const ffi::ObjectRef& json_obj) {
   }
   std::vector<int64_t> s;
   std::transform(shape.begin(), shape.end(), std::back_inserter(s), [](int64_t i) { return i; });
-  return TensorInfo(DataType(dtype), ffi::Shape(s.begin(), s.end()));
+  return TensorInfo(dtype, ffi::Shape(s.begin(), s.end()));
 }
 
 /******** Repr ********/
@@ -182,10 +182,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
       .def("s_tir.meta_schedule.ArgInfoFromPrimFunc", ArgInfo::FromPrimFunc)
       .def("s_tir.meta_schedule.ArgInfoFromEntryFunc", ArgInfo::FromEntryFunc)
       .def("s_tir.meta_schedule.ArgInfoFromJSON", ArgInfo::FromJSON)
-      .def("s_tir.meta_schedule.TensorInfo",
-           [](runtime::DataType dtype, ffi::Shape shape) -> TensorInfo {
-             return TensorInfo(dtype, shape);
-           });
+      .def("s_tir.meta_schedule.TensorInfo", [](DLDataType dtype, ffi::Shape shape) -> TensorInfo {
+        return TensorInfo(dtype, shape);
+      });
 }
 
 }  // namespace meta_schedule
diff --git a/src/s_tir/meta_schedule/database/database_utils.cc b/src/s_tir/meta_schedule/database/database_utils.cc
index ea1473ae6500..826c38c8d1b0 100644
--- a/src/s_tir/meta_schedule/database/database_utils.cc
+++ b/src/s_tir/meta_schedule/database/database_utils.cc
@@ -32,7 +32,9 @@ void JSONDumps(Any json_obj, std::ostringstream& os) {
     os << "null";
   } else if (auto opt_int_imm = json_obj.try_cast<IntImm>()) {
     IntImm int_imm = *std::move(opt_int_imm);
-    if (int_imm->dtype == DataType::Bool()) {
+    PrimType int_ty = int_imm.ty();
+    if (int_ty.MatchesElementType(DLDataTypeCode::kDLBool, 8) && !int_ty.IsScalableVector() &&
+        !int_ty.IsFixedLengthVector()) {
       if (int_imm->value) {
         os << "true";
       } else {
@@ -154,7 +156,6 @@ class JSONTokenizer {
   bool NextFalse() { return NextLiteral("false", 5); }
 
   bool NextNumber(Token* token) {
-    using runtime::DataType;
     bool is_float = false;
     const char* st = cur_;
     for (; cur_ != end_; ++cur_) {
diff --git a/src/s_tir/meta_schedule/feature_extractor/per_store_feature.cc b/src/s_tir/meta_schedule/feature_extractor/per_store_feature.cc
index f0e3aa897cdd..2f87217db065 100644
--- a/src/s_tir/meta_schedule/feature_extractor/per_store_feature.cc
+++ b/src/s_tir/meta_schedule/feature_extractor/per_store_feature.cc
@@ -273,12 +273,12 @@ Pass SimplifyForFeatureExtraction() {
           HasBufferLoad(node->condition)) {
         return ffi::GetRef<Select>(node);
       }
-      return MakeConst(node->dtype, 1.0);
+      return MakeConst(node->ty(), 1.0);
     }
 
     PrimExpr VisitExpr_(const VarNode* var) final {
       if (unit_vars_.count(ffi::GetRef<Var>(var))) {
-        return MakeConst(var->dtype, 0.0);
+        return MakeConst(var->ty(), 0.0);
       }
       return ffi::GetRef<Var>(var);
     }
@@ -553,7 +553,7 @@ Feature::ArithOps::ArithOps(const BufferStoreNode* store, int64_t prod_loop_exte
   }
 #define TVM_FEATURE_BINARY(Type, FloatCounter, IntCounter) \
   void VisitExpr_(const Type* op) final {                  \
-    if (op->dtype.is_float()) {                            \
+    if (op->ty().code() == DLDataTypeCode::kDLFloat) {     \
       result_.FloatCounter += this->prod_loop_extent_;     \
     } else {                                               \
       result_.IntCounter += this->prod_loop_extent_;       \
@@ -589,13 +589,13 @@ Feature::ArithOps::ArithOps(const BufferStoreNode* store, int64_t prod_loop_exte
       bool is_pure =
           effect_kind == CallEffectKind::kPure || effect_kind == CallEffectKind::kExprAnnotation;
       if (is_pure) {
-        if (op->dtype.is_float()) {
+        if (op->ty().code() == DLDataTypeCode::kDLFloat) {
           result_.float_math_func += prod_loop_extent_;
         } else {
           result_.int_math_func += prod_loop_extent_;
         }
       } else {
-        if (op->dtype.is_float()) {
+        if (op->ty().code() == DLDataTypeCode::kDLFloat) {
           result_.float_other_func += prod_loop_extent_;
         } else {
           result_.int_other_func += prod_loop_extent_;
@@ -852,7 +852,7 @@ void Feature::SetRegion(const LoopNest& loop_nest, IntVec* for_touched_bytes,
       feature.access_shape = utils::RelaxAndUnion(feature.multi_indices, &numel, analyzer);
       numel = std::max<int64_t>(0, numel);
       feature.loop_accessed_numel[i][buffer] = numel;
-      touched_bytes += numel * buffer->dtype.bytes();
+      touched_bytes += numel * ((buffer->dtype.bits() + 7) / 8);
       (*buffer_touched_under_loop)[loop][buffer].push_back(numel);
     }
   }
@@ -880,7 +880,7 @@ void Feature::SubFeature::SetStride(const LoopNest& loop_nest, arith::AnalyzerOb
     TVM_FFI_ICHECK_EQ(access_shape.size(), buffer_shape.size());
     for (int i = ndim - 1; i >= 0; --i) {
       if (access_shape[i] == buffer_shape[i]) {
-        num_continuous_bytes = buffer_shape[i] * buffer->dtype.bytes();
+        num_continuous_bytes = buffer_shape[i] * ((buffer->dtype.bits() + 7) / 8);
         break;
       }
     }
@@ -953,7 +953,7 @@ void Feature::SubFeature::SetReuse(const LoopNest& loop_nest, int64_t top_loop_t
           const BufferNode* buffer = iter.first;
           const IntVec& numels = iter.second;
           int64_t numel = std::accumulate(numels.begin(), numels.end(), int64_t(0));
-          reuse_dis_bytes += numel * buffer->dtype.bytes();
+          reuse_dis_bytes += numel * ((buffer->dtype.bits() + 7) / 8);
         }
       }
       break;
@@ -973,7 +973,7 @@ void Feature::SubFeature::SetReuse(const LoopNest& loop_nest, int64_t top_loop_t
         const BufferNode* buffer = iter.first;
         const IntVec& numels = iter.second;
         int64_t numel = std::accumulate(numels.begin(), numels.end(), int64_t(0));
-        reuse_dis_bytes += numel * buffer->dtype.bytes();
+        reuse_dis_bytes += numel * ((buffer->dtype.bits() + 7) / 8);
       }
       reuse_dis_iter /= extent;
       reuse_dis_bytes /= extent;
@@ -983,7 +983,7 @@ void Feature::SubFeature::SetReuse(const LoopNest& loop_nest, int64_t top_loop_t
 }
 
 void Feature::SubFeature::SetFeature(const LoopNest& loop_nest, int64_t cache_line_bytes) {
-  int64_t dtype_bytes = this->buffer->dtype.bytes();
+  int64_t dtype_bytes = (this->buffer->dtype.bits() + 7) / 8;
   this->stride = this->innermost_stride;
   this->bytes = dtype_bytes * loop_nest.prod;
   if (loop_nest.loops.empty()) {
@@ -1023,7 +1023,7 @@ Feature::Feature(const BufferStoreNode* store, const LoopNest& loop_nest, int64_
   int64_t top_loop_touch_bytes = 0.0;
   if (n_loops > 0) {
     for (const SubFeature& feature : sub_features) {
-      int64_t bytes = feature.buffer->dtype.bytes();
+      int64_t bytes = (feature.buffer->dtype.bits() + 7) / 8;
       int64_t n_buffer = feature.loop_accessed_numel[0].size();
       top_loop_touch_bytes += bytes * n_buffer;
     }
@@ -1161,7 +1161,7 @@ struct Feature {
     for (int64_t x : shape) {
       numel *= x;
     }
-    alloc_size = numel * buffer->dtype.bytes();
+    alloc_size = numel * ((buffer->dtype.bits() + 7) / 8);
     alloc_prod = numel * loop_nest.prod;
     alloc_outer_prod = loop_nest.prod;
   }
diff --git a/src/s_tir/meta_schedule/measure_callback/add_to_database.cc b/src/s_tir/meta_schedule/measure_callback/add_to_database.cc
index 0c74e66d2af3..57008c7d953a 100644
--- a/src/s_tir/meta_schedule/measure_callback/add_to_database.cc
+++ b/src/s_tir/meta_schedule/measure_callback/add_to_database.cc
@@ -47,7 +47,7 @@ class AddToDatabaseNode : public MeasureCallbackNode {
       if (result->run_secs.defined()) {
         run_secs = result->run_secs.value();
       } else {
-        run_secs = ffi::Array<FloatImm>{FloatImm(DataType::Float(32), 1e10)};
+        run_secs = ffi::Array<FloatImm>{FloatImm(PrimType::Float(32), 1e10)};
       }
       database->CommitTuningRecord(TuningRecord(
           /*trace=*/candidate->sch->trace().value(),
diff --git a/src/s_tir/meta_schedule/mutator/mutator.cc b/src/s_tir/meta_schedule/mutator/mutator.cc
index d4060f5bf6b6..33d2b41b4aa7 100644
--- a/src/s_tir/meta_schedule/mutator/mutator.cc
+++ b/src/s_tir/meta_schedule/mutator/mutator.cc
@@ -54,27 +54,27 @@ Mutator Mutator::PyMutator(
 
 ffi::Map<Mutator, FloatImm> Mutator::DefaultLLVM() {
   return ffi::Map<Mutator, FloatImm>{
-      {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)},
-      {Mutator::MutateComputeLocation(), FloatImm(DataType::Float(64), 0.05)},
-      {Mutator::MutateUnroll(), FloatImm(DataType::Float(64), 0.03)},
-      {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(DataType::Float(64), 0.02)}};
+      {Mutator::MutateTileSize(), FloatImm(PrimType::Float(64), 0.9)},
+      {Mutator::MutateComputeLocation(), FloatImm(PrimType::Float(64), 0.05)},
+      {Mutator::MutateUnroll(), FloatImm(PrimType::Float(64), 0.03)},
+      {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(PrimType::Float(64), 0.02)}};
 }
 
 ffi::Map<Mutator, FloatImm> Mutator::DefaultCUDA() {
   return ffi::Map<Mutator, FloatImm>{
-      {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)},
-      {Mutator::MutateUnroll(), FloatImm(DataType::Float(64), 0.08)},
-      {Mutator::MutateThreadBinding(), FloatImm(DataType::Float(64), 0.02)}};
+      {Mutator::MutateTileSize(), FloatImm(PrimType::Float(64), 0.9)},
+      {Mutator::MutateUnroll(), FloatImm(PrimType::Float(64), 0.08)},
+      {Mutator::MutateThreadBinding(), FloatImm(PrimType::Float(64), 0.02)}};
 }
 
 ffi::Map<Mutator, FloatImm> Mutator::DefaultCUDATensorCore() { return Mutator::DefaultCUDA(); }
 
 ffi::Map<Mutator, FloatImm> Mutator::DefaultHexagon() {
   return ffi::Map<Mutator, FloatImm>{
-      {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)},
-      {Mutator::MutateComputeLocation(), FloatImm(DataType::Float(64), 0.05)},
-      {Mutator::MutateUnroll(), FloatImm(DataType::Float(64), 0.03)},
-      {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(DataType::Float(64), 0.02)}};
+      {Mutator::MutateTileSize(), FloatImm(PrimType::Float(64), 0.9)},
+      {Mutator::MutateComputeLocation(), FloatImm(PrimType::Float(64), 0.05)},
+      {Mutator::MutateUnroll(), FloatImm(PrimType::Float(64), 0.03)},
+      {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(PrimType::Float(64), 0.02)}};
 }
 
 // Pattern A (RM): auto-default repr from reflection.
diff --git a/src/s_tir/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/s_tir/meta_schedule/postproc/rewrite_cooperative_fetch.cc
index a2b57e3f5c5f..9221e8c3bfdd 100644
--- a/src/s_tir/meta_schedule/postproc/rewrite_cooperative_fetch.cc
+++ b/src/s_tir/meta_schedule/postproc/rewrite_cooperative_fetch.cc
@@ -93,9 +93,9 @@ size_t GetMaxUsedDtypeBytes(SBlock block) {
 
   tirx::PostOrderVisit(block->body, [&](const ffi::ObjectRef& obj) {
     if (const auto* store = obj.as<tirx::BufferStoreNode>()) {
-      max_bytes = std::max(max_bytes, static_cast<size_t>(store->value->dtype.bytes()));
+      max_bytes = std::max(max_bytes, static_cast<size_t>((store->value.ty().bits() + 7) / 8));
     } else if (const auto* load = obj.as<tirx::BufferLoadNode>()) {
-      max_bytes = std::max(max_bytes, static_cast<size_t>(load->dtype.bytes()));
+      max_bytes = std::max(max_bytes, static_cast<size_t>((load->ty().bits() + 7) / 8));
     } else if (const auto* call = obj.as<tirx::CallNode>()) {
       static const Op& q_multiply_shift_per_axis_op = Op::Get("tirx.q_multiply_shift_per_axis");
       static const Op& q_multiply_shift_op = Op::Get("tirx.q_multiply_shift");
@@ -104,7 +104,7 @@ size_t GetMaxUsedDtypeBytes(SBlock block) {
         max_bytes = std::max<size_t>(max_bytes, 8);
       }
     } else if (const auto* cast = obj.as<tirx::CastNode>()) {
-      max_bytes = std::max<size_t>(max_bytes, cast->dtype.bytes());
+      max_bytes = std::max<size_t>(max_bytes, (cast->ty().bits() + 7) / 8);
     }
   });
 
diff --git a/src/s_tir/meta_schedule/profiler.cc b/src/s_tir/meta_schedule/profiler.cc
index 91415447a48c..05580bcdee10 100644
--- a/src/s_tir/meta_schedule/profiler.cc
+++ b/src/s_tir/meta_schedule/profiler.cc
@@ -32,7 +32,7 @@ namespace meta_schedule {
 ffi::Map<ffi::String, FloatImm> ProfilerNode::Get() const {
   ffi::Map<ffi::String, FloatImm> ret;
   for (const auto& kv : stats_sec) {
-    ret.Set(kv.first, FloatImm(DataType::Float(64), kv.second));
+    ret.Set(kv.first, FloatImm(PrimType::Float(64), kv.second));
   }
   return ret;
 }
diff --git a/src/s_tir/meta_schedule/schedule/cuda/thread_bind.cc b/src/s_tir/meta_schedule/schedule/cuda/thread_bind.cc
index f9e8b14d8679..3aec6e51c364 100644
--- a/src/s_tir/meta_schedule/schedule/cuda/thread_bind.cc
+++ b/src/s_tir/meta_schedule/schedule/cuda/thread_bind.cc
@@ -60,7 +60,7 @@ std::function<ExprRV(int64_t)> MakeFactorSampler(Schedule sch, ffi::Array<int64_
     if (n == 1) {
       return IntImm::Int32(extents[0]);
     }
-    ffi::Array<FloatImm> probs(n, FloatImm(DataType::Float(32), 1.0 / n));
+    ffi::Array<FloatImm> probs(n, FloatImm(PrimType::Float(32), 1.0 / n));
     return sch->SampleCategorical(extents, probs);
   };
 }
diff --git a/src/s_tir/meta_schedule/schedule_rule/cross_thread_reduction.cc b/src/s_tir/meta_schedule/schedule_rule/cross_thread_reduction.cc
index 1c7506e83068..5f075ea1c210 100644
--- a/src/s_tir/meta_schedule/schedule_rule/cross_thread_reduction.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/cross_thread_reduction.cc
@@ -78,7 +78,7 @@ class CrossThreadReductionNode : public ScheduleRuleNode {
 
     // Step 3. Try block fusion.
     int n_candidate = static_cast<int>(thread_extents.size());
-    ffi::Array<FloatImm> probs(n_candidate, FloatImm(DataType::Float(32), 1.0 / n_candidate));
+    ffi::Array<FloatImm> probs(n_candidate, FloatImm(PrimType::Float(32), 1.0 / n_candidate));
     s_tir::ExprRV thread_extent = tmp_sch->SampleCategorical(thread_extents, probs);
     if (fusible) {
       TVM_FFI_ICHECK(target_sblock.defined());
diff --git a/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling.cc
index 2da29cc8e983..1cd504dfee68 100644
--- a/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -369,15 +369,15 @@ void MultiLevelTilingNode::AnnotateCooperativeFetching(Schedule* sch,
   // Filter out invalid vector lanes according to the data type.
   const tirx::SBlockNode* block_node = (*sch)->GetSRef(block)->StmtAs<tirx::SBlockNode>();
   TVM_FFI_ICHECK_EQ(block_node->writes.size(), 1);
-  const runtime::DataType dtype = block_node->writes[0]->buffer->dtype;
+  const DLDataType dtype = block_node->writes[0]->buffer->dtype->dtype;
   std::function<bool(int)> f_filter = nullptr;
-  if (dtype == runtime::DataType::Float(32)) {
+  if (dtype == DLDataType{kDLFloat, 32, 1}) {
     f_filter = [&](int vector_len) { return vector_len <= 4; };
-  } else if (dtype == runtime::DataType::Float(16)) {
+  } else if (dtype == DLDataType{kDLFloat, 16, 1}) {
     f_filter = [&](int vector_len) {
       return (vector_len == 1 || vector_len % 2 == 0) && vector_len <= 8;
     };
-  } else if (dtype == runtime::DataType::Int(8)) {
+  } else if (dtype == DLDataType{kDLInt, 8, 1}) {
     f_filter = [&](int vector_len) { return vector_len <= 16; };
   }
   std::vector<int> valid_vector_lens;
@@ -396,7 +396,7 @@ void MultiLevelTilingNode::AnnotateCooperativeFetching(Schedule* sch,
     valid_vector_lens_arr.reserve(valid_vector_lens.size());
     for (int v : valid_vector_lens) valid_vector_lens_arr.push_back(static_cast<int64_t>(v));
     s_tir::ExprRV vector_load_len = (*sch)->SampleCategorical(
-        valid_vector_lens_arr, ffi::Array<FloatImm>(n, FloatImm(DataType::Float(32), prob)));
+        valid_vector_lens_arr, ffi::Array<FloatImm>(n, FloatImm(PrimType::Float(32), prob)));
     (*sch)->Annotate(block, s_tir::attr::meta_schedule_cooperative_fetch, vector_load_len);
   }
 }
diff --git a/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index 1eb00bf8e6de..6a97fe642178 100644
--- a/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -626,10 +626,12 @@ std::vector<State> MultiLevelTilingTensorCoreNode::AddReadReuseTensorCore(
     tirx::Buffer cache_read_buffer =
         s_tir::GetNthAccessBuffer(sch->state(), ffi::GetRef<tirx::SBlock>(cache_read_block), 0,
                                   s_tir::BufferIndexType::kWrite);
-    const DataType& dtype = cache_read_buffer->dtype;
-    if (dtype.is_float16()) {
+    const DLDataType dtype = cache_read_buffer->dtype->dtype;
+    // Storage alignment is chosen from element storage width; this schedule rule uses scalar
+    // cache-read buffers, so the old element-type-only test is preserved.
+    if (dtype.code == kDLFloat && dtype.bits == 16) {
       sch->StorageAlign(cache_read, 0, -2, 32, 8);
-    } else if (dtype.is_int() && dtype.bits() == 8) {
+    } else if (dtype.code == kDLInt && dtype.bits == 8) {
       sch->StorageAlign(cache_read, 0, -2, 32, 16);
     } else {
       TVM_PY_LOG(WARNING, logger) << "StorageAlign is not applied for data type " << dtype
diff --git a/src/s_tir/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc b/src/s_tir/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc
index a0c5f0a1f344..5ce5a1a8cc0e 100644
--- a/src/s_tir/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc
@@ -75,7 +75,7 @@ class ParallelizeVectorizeUnrollNode : public ScheduleRuleNode {
     if (!unroll_max_steps.empty() && !s_tir::CheckSpatialPrimFunc(sch, root_rv)) {
       int n = unroll_max_steps.size();
       double prob = 1.0 / n;
-      ffi::Array<FloatImm> probs(n, FloatImm(DataType::Float(32), prob));
+      ffi::Array<FloatImm> probs(n, FloatImm(PrimType::Float(32), prob));
       PrimExpr max_step = sch->SampleCategorical(unroll_max_steps, probs);
       if (unroll_explicit) {
         sch->Annotate(root_rv, s_tir::attr::meta_schedule_unroll_explicit, max_step);
diff --git a/src/s_tir/meta_schedule/schedule_rule/schedule_rule.cc b/src/s_tir/meta_schedule/schedule_rule/schedule_rule.cc
index 6c421bd671be..eee9ef2685b8 100644
--- a/src/s_tir/meta_schedule/schedule_rule/schedule_rule.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/schedule_rule.cc
@@ -16,8 +16,8 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include "../utils.h"
 
diff --git a/src/s_tir/meta_schedule/utils.h b/src/s_tir/meta_schedule/utils.h
index ed6e11e24384..946f7e58ebc4 100644
--- a/src/s_tir/meta_schedule/utils.h
+++ b/src/s_tir/meta_schedule/utils.h
@@ -463,7 +463,7 @@ inline ffi::Array<FloatImm> AsFloatArray(const ffi::ObjectRef& obj) {
   for (Any val : *arr) {
     auto float_value = [&]() -> FloatImm {
       if (auto opt_int_imm = val.try_cast<IntImm>()) {
-        return FloatImm(DataType::Float(32), (*opt_int_imm)->value);
+        return FloatImm(PrimType::Float(32), (*opt_int_imm)->value);
       } else if (auto opt_float_imm = val.try_cast<FloatImm>()) {
         return *std::move(opt_float_imm);
       } else {
diff --git a/src/s_tir/schedule/analysis/layout.cc b/src/s_tir/schedule/analysis/layout.cc
index 35e04cbced6c..223bd4683270 100644
--- a/src/s_tir/schedule/analysis/layout.cc
+++ b/src/s_tir/schedule/analysis/layout.cc
@@ -40,7 +40,7 @@ ffi::Array<PrimExpr> GetStrides(const Buffer& buffer) {
     return {};
   }
   ffi::Array<PrimExpr> strides(ndim, PrimExpr{nullptr});
-  PrimExpr stride = MakeConst(buffer->DefaultIndexType(), 1);
+  PrimExpr stride = MakeConst(PrimType(buffer->DefaultIndexType()), 1);
   for (int i = ndim - 1; i >= 0; --i) {
     strides.Set(i, stride);
     stride = stride * buffer->shape[i];
@@ -146,7 +146,7 @@ ffi::Optional<IndexMap> SuggestIndexMap(const Buffer& buffer, const ffi::Array<P
   // Step 2. Calculate a functor that flattens a multi-dimensional index
   auto f_flatten_index = [ndim, strides = GetStrides(buffer), dtype = buffer->DefaultIndexType()](
                              const ffi::Array<PrimExpr>& indices) -> PrimExpr {
-    PrimExpr flatten_index = IntImm(dtype, 0);
+    PrimExpr flatten_index = IntImm(PrimType(dtype), 0);
     for (int i = 0; i < ndim; ++i) {
       flatten_index = flatten_index + strides[i] * indices[i];
     }
@@ -223,7 +223,7 @@ ffi::Optional<IndexMap> SuggestIndexMap(const Buffer& buffer, const ffi::Array<P
     }
 
     // Step 6.2: Fuse all the indices. This is the inverse of Step 5.2.
-    PrimExpr flattened_index = IntImm(indices[0]->dtype, 0);
+    PrimExpr flattened_index = IntImm(indices[0].ty(), 0);
     int64_t stride = 1;
     for (int i = static_cast<int>(split_exprs.size()) - 1; i >= 0; --i) {
       flattened_index = inv_permuted_indices[i] * IntImm::Int32(stride) + flattened_index;
diff --git a/src/s_tir/schedule/analysis/reducer.cc b/src/s_tir/schedule/analysis/reducer.cc
index d6bb5c903492..f79afdacd16c 100644
--- a/src/s_tir/schedule/analysis/reducer.cc
+++ b/src/s_tir/schedule/analysis/reducer.cc
@@ -137,7 +137,7 @@ class PatternMatcher : public ExprVisitor {
     if (ptr == nullptr) {
       match_success_ = false;
     } else {
-      if (!runtime::TypeEqual(op->dtype, ptr->dtype)) {
+      if (op->ty()->dtype != ptr->ty()->dtype) {
         match_success_ = false;
       } else {
         PrimExpr tmp = expr_to_match_;
diff --git a/src/s_tir/schedule/concrete_schedule.cc b/src/s_tir/schedule/concrete_schedule.cc
index b891f6cb81be..7dd4f1da71bf 100644
--- a/src/s_tir/schedule/concrete_schedule.cc
+++ b/src/s_tir/schedule/concrete_schedule.cc
@@ -498,8 +498,8 @@ ffi::Array<LoopRV> ConcreteScheduleNode::Split(const LoopRV& loop_rv,
       if (is_const_int(factor) && !is_positive_const(factor)) {
         throw NonPositiveFactorError(state_->mod, factor.as<IntImmNode>()->value, i);
       }
-      if (factor.dtype().bits() > loop->extent.dtype().bits()) {
-        factor = cast(loop->extent.dtype(), factor);
+      if (factor.ty().bits() > loop->extent.ty().bits()) {
+        factor = cast(loop->extent.ty(), factor);
       }
       factors.push_back(factor);
       tot_length *= factor;
@@ -565,8 +565,8 @@ ffi::Array<LoopRV> ConcreteScheduleNode::LoopPartition(
       if (is_const_int(factor) && !is_positive_const(factor)) {
         throw NonPositiveFactorError(state_->mod, factor.as<IntImmNode>()->value, i);
       }
-      if (factor.dtype().bits() > loop->extent.dtype().bits()) {
-        factor = cast(loop->extent.dtype(), factor);
+      if (factor.ty().bits() > loop->extent.ty().bits()) {
+        factor = cast(loop->extent.ty(), factor);
       }
       factors.push_back(factor);
       tot_length += factor;
diff --git a/src/s_tir/schedule/concrete_schedule.h b/src/s_tir/schedule/concrete_schedule.h
index 5dd094dc388c..13bdaef6a224 100644
--- a/src/s_tir/schedule/concrete_schedule.h
+++ b/src/s_tir/schedule/concrete_schedule.h
@@ -369,7 +369,7 @@ inline T ConcreteScheduleNode::CreateRV(const StmtSRef& sref) {
 }
 
 inline ExprRV ConcreteScheduleNode::CreateRV(int64_t value) {
-  Var rv("v" + std::to_string(this->symbol_table_.size() + 1), DataType::Int(32));
+  Var rv("v" + std::to_string(this->symbol_table_.size() + 1), PrimType::Int(32));
   this->symbol_table_.Set(rv, IntImm::Int32(static_cast<int32_t>(value)));
   return rv;
 }
diff --git a/src/s_tir/schedule/ir_comparator.cc b/src/s_tir/schedule/ir_comparator.cc
index 1529923ca5fe..8b5ed55ed74d 100644
--- a/src/s_tir/schedule/ir_comparator.cc
+++ b/src/s_tir/schedule/ir_comparator.cc
@@ -94,8 +94,8 @@ bool TensorizeComparator::VisitStmt(const Stmt& n, const Stmt& other) {
 
 bool TensorizeComparator::VisitExpr(const PrimExpr& n, const PrimExpr& other) {
   bool equal = n.same_as(other) ||
-               ((n->type_index() == other->type_index()) &&
-                n.dtype().code() == other.dtype().code() && ExprComparator::VisitExpr(n, other)) ||
+               ((n->type_index() == other->type_index()) && n.ty().code() == other.ty().code() &&
+                ExprComparator::VisitExpr(n, other)) ||
                (ContainsVscaleCall(n) && analyzer_->CanProveEqual(n, other));
 
   if (!equal && assert_mode_) {
@@ -109,11 +109,11 @@ bool TensorizeComparator::VisitExpr(const PrimExpr& n, const PrimExpr& other) {
 bool TensorizeComparator::VisitExpr_(const CallNode* op, const PrimExpr& other) {
   const auto* rhs = other.as<CallNode>();
   if (!rhs->op.same_as(op->op)) return false;
-  if (op->dtype.code() != rhs->dtype.code()) {
+  if (op->ty().code() != rhs->ty().code()) {
     if (assert_mode_) {
       std::ostringstream os;
-      os << "CallNode data type codes do not match: op->dtype.code()=" << op->dtype.code()
-         << " vs rhs->dtype.code()=" << rhs->dtype.code();
+      os << "CallNode data type codes do not match: op->dtype.code()=" << op->ty().code()
+         << " vs rhs->dtype.code()=" << rhs->ty().code();
       EmitError(os.str());
     }
     return false;
@@ -330,11 +330,11 @@ bool TensorizeComparator::VisitExpr_(const VarNode* op, const PrimExpr& other) {
   const auto* rhs = other.as<VarNode>();
   auto lhs = ffi::GetRef<Var>(op);
   if (lhs.same_as(other)) return true;
-  if (op->dtype.code() != rhs->dtype.code()) {
+  if (op->ty().code() != rhs->ty().code()) {
     if (assert_mode_) {
       std::ostringstream os;
-      os << "VarNode data type codes do not match: op->dtype.code()=" << op->dtype.code()
-         << " vs rhs->dtype.code()=" << rhs->dtype.code();
+      os << "VarNode data type codes do not match: op->dtype.code()=" << op->ty().code()
+         << " vs rhs->dtype.code()=" << rhs->ty().code();
       EmitError(os.str());
     }
     return false;
@@ -363,7 +363,7 @@ bool TensorizeComparator::DefEqual(const Var& lhs, const Var& rhs) {
   equal_map_[lhs] = rhs;
   // Cast if necessary. This allows the workload and the tensor intrin to have different dtypes in
   // the indices.
-  analyzer_->Bind(lhs, cast(lhs.dtype(), rhs));
+  analyzer_->Bind(lhs, cast(lhs.ty(), rhs));
   return true;
 }
 
diff --git a/src/s_tir/schedule/primitive/block_annotate.cc b/src/s_tir/schedule/primitive/block_annotate.cc
index cbdfae481d14..5081f5e8aff4 100644
--- a/src/s_tir/schedule/primitive/block_annotate.cc
+++ b/src/s_tir/schedule/primitive/block_annotate.cc
@@ -298,7 +298,7 @@ class DTypeMutator : private ReplaceBufferMutator {
    * \param block_sref_reuse The block sref reuse map to be updated
    * \return The new block after the mutation
    */
-  static SBlock Mutate(const SBlock& allocate_site, const Buffer& old_buffer, const DataType& dtype,
+  static SBlock Mutate(const SBlock& allocate_site, const Buffer& old_buffer, DLDataType dtype,
                        ffi::Map<SBlock, SBlock>* block_sref_reuse) {
     Buffer new_buffer = WithDType(old_buffer, dtype);
     DTypeMutator mutator(old_buffer, new_buffer, dtype, block_sref_reuse);
@@ -307,16 +307,16 @@ class DTypeMutator : private ReplaceBufferMutator {
   }
 
  private:
-  DTypeMutator(const Buffer& old_buffer, Buffer new_buffer, const DataType& dtype,
+  DTypeMutator(const Buffer& old_buffer, Buffer new_buffer, DLDataType dtype,
                ffi::Map<SBlock, SBlock>* block_sref_reuse)
       : ReplaceBufferMutator(old_buffer, std::move(new_buffer), block_sref_reuse),
-        src_dtype_(old_buffer->dtype),
+        src_dtype_(old_buffer->dtype->dtype),
         tgt_dtype_(dtype) {}
 
   MatchBufferRegion VisitMatchBufferRegion(const MatchBufferRegion& match_buffer) final {
     auto it = buffer_var_map_.find(match_buffer->source->buffer->data.get());
     if (it != buffer_var_map_.end()) {
-      Buffer new_target_buffer = WithDType(match_buffer->buffer, it->second->dtype);
+      Buffer new_target_buffer = WithDType(match_buffer->buffer, it->second->dtype->dtype);
       buffer_var_map_[match_buffer->buffer->data.get()] = new_target_buffer;
       return MatchBufferRegion(new_target_buffer,
                                BufferRegion(it->second, match_buffer->source->region));
@@ -330,7 +330,7 @@ class DTypeMutator : private ReplaceBufferMutator {
     auto it = buffer_var_map_.find(node->buffer->data.get());
     if (it != buffer_var_map_.end()) {
       node.CopyOnWrite()->buffer = it->second;
-      node.CopyOnWrite()->value = Cast(tgt_dtype_, node->value);
+      node.CopyOnWrite()->value = Cast(PrimType(tgt_dtype_), node->value);
     }
     return node;
   }
@@ -339,12 +339,12 @@ class DTypeMutator : private ReplaceBufferMutator {
     BufferLoad node = StmtExprMutator::VisitExpr_(op).as_or_throw<BufferLoad>();
     auto it = buffer_var_map_.find(node->buffer->data.get());
     if (it != buffer_var_map_.end()) {
-      return Cast(src_dtype_, BufferLoad(it->second, node->indices));
+      return Cast(PrimType(src_dtype_), BufferLoad(it->second, node->indices));
     }
     return node;
   }
 
-  DataType src_dtype_, tgt_dtype_;
+  DLDataType src_dtype_, tgt_dtype_;
 };
 
 void UnsafeSetDType(ScheduleState self, const StmtSRef& block_sref, int buffer_index,
@@ -352,10 +352,10 @@ void UnsafeSetDType(ScheduleState self, const StmtSRef& block_sref, int buffer_i
   const SBlockNode* block = TVM_SREF_TO_SBLOCK(block_sref);
   Buffer buffer =
       GetNthAccessBuffer(self, ffi::GetRef<SBlock>(block), buffer_index, BufferIndexType::kWrite);
-  DataType target_dtype(ffi::StringToDLDataType(dtype));
+  DLDataType target_dtype = ffi::StringToDLDataType(dtype);
 
   // Step 1. If `dtype` equals the original data type, just return.
-  if (buffer->dtype == target_dtype) {
+  if (buffer->dtype->dtype == target_dtype) {
     return;
   }
 
diff --git a/src/s_tir/schedule/primitive/blockize_tensorize.cc b/src/s_tir/schedule/primitive/blockize_tensorize.cc
index c5fa57e835ca..fae81d233b48 100644
--- a/src/s_tir/schedule/primitive/blockize_tensorize.cc
+++ b/src/s_tir/schedule/primitive/blockize_tensorize.cc
@@ -37,7 +37,7 @@ bool UsesVar(const T& x, const Var& var) {
 }
 
 Range RangeFromExtent(const PrimExpr& extent) {
-  return Range::FromMinExtent(IntImm(extent->dtype, 0), extent);
+  return Range::FromMinExtent(IntImm(extent.ty(), 0), extent);
 }
 
 template <class T>
@@ -256,7 +256,7 @@ ffi::Map<Var, PrimExpr> DeriveBlockBinding(
       // substitution
       if (is_one(outer_mark->extent) && !preserve_unit_iters) {
         // Simplify outer if not preserve_unit_iters
-        sub = IntImm(outer_mark->extent.dtype(), 0);
+        sub = IntImm(outer_mark->extent.ty(), 0);
       } else {
         sub = outer_iter;
       }
@@ -776,14 +776,14 @@ void Tensorize(ScheduleState self, const StmtSRef& sref, const TensorIntrin& int
   auto f_update_max_dtype_bits_from_region = [&](const ffi::Array<BufferRegion>& buffer_regions) {
     for (const BufferRegion& buffer_region : buffer_regions) {
       for (const auto& range : buffer_region->region) {
-        index_dtype_bits = std::max(index_dtype_bits, range->min.dtype().bits());
+        index_dtype_bits = std::max(index_dtype_bits, range->min.ty().bits());
       }
     }
   };
   f_update_max_dtype_bits_from_region(block_realize->block->reads);
   f_update_max_dtype_bits_from_region(block_realize->block->writes);
   TVM_FFI_ICHECK(index_dtype_bits > 0);
-  intrin_impl = IndexDataTypeNormalizer(DataType::Int(index_dtype_bits)).Rewrite(intrin_impl);
+  intrin_impl = IndexDataTypeNormalizer(PrimType::Int(index_dtype_bits)).Rewrite(intrin_impl);
   // Step 2: Structural pattern matching
   TensorizeComparator comparator(self->mod, /*assert_mode=*/true);
   comparator.VisitStmt(block_realize, intrin_desc->body);
@@ -829,12 +829,12 @@ void Tensorize(ScheduleState self, const StmtSRef& sref, const TensorIntrin& int
     new_region.reserve(cur->shape.size());
     for (int i = 0; i < offset; i++) {
       PrimExpr min = indices_base[i];
-      PrimExpr extent = MakeConst(min.dtype(), 1);
+      PrimExpr extent = MakeConst(min.ty(), 1);
       new_region.push_back(Range::FromMinExtent(min, extent));
     }
     for (int i = 0; i < static_cast<int>(old_region.size()); i++) {
       PrimExpr min = indices_base[i + offset];
-      PrimExpr extent = cast(min.dtype(), old_region[i]->extent);
+      PrimExpr extent = cast(min.ty(), old_region[i]->extent);
       new_region.push_back(Range::FromMinExtent(min, extent));
     }
     match_buffer_regions.push_back(MatchBufferRegion(impl, BufferRegion(cur, new_region)));
diff --git a/src/s_tir/schedule/primitive/cache_index.cc b/src/s_tir/schedule/primitive/cache_index.cc
index 1fc7dbdc263e..1ef05eed66d1 100644
--- a/src/s_tir/schedule/primitive/cache_index.cc
+++ b/src/s_tir/schedule/primitive/cache_index.cc
@@ -58,14 +58,14 @@ struct IndexInfo {
  * \param range The range of the integer.
  * \returns A data type that covers the input range.
  */
-DataType DetermineDatatype(const arith::IntSet& range) {
+PrimType DeterminePrimType(const arith::IntSet& range) {
   arith::Analyzer ana;
   if (ana->CanProve(range.min() >= INT32_MIN && range.max() <= INT32_MAX)) {
-    return DataType::Int(32);
+    return PrimType::Int(32);
   } else {
     TVM_FFI_ICHECK(ana->CanProve(range.min() >= IntImm::Int64(INT64_MIN) &&
                                  range.max() <= IntImm::Int64(INT64_MAX)));
-    return DataType::Int(64);
+    return PrimType::Int(64);
   }
 }
 
@@ -261,23 +261,23 @@ ffi::Array<SBlock> MakeIndexCacheStage(IndexInfo* info, const ffi::String& stora
       });
     }
 
-    DataType data_type = index_expr.dtype();
+    PrimType data_ty = index_expr.ty();
     Var index_buffer_var("index_var_" + std::to_string(expr_index),
-                         PointerType(PrimType(data_type), storage_scope));
+                         PointerType(data_ty, storage_scope));
     ffi::Array<PrimExpr> buffer_shape;
     for (const Var& it : info->origin_block_vars[expr_index]) {
       buffer_shape.push_back(
           arith::EvalSet(info->var_binding.at(it), arith::AsIntSet(info->range_map)).max() + 1);
     }
-    info->cache_buffer.push_back(Buffer(index_buffer_var, data_type, buffer_shape, {1}, {0},
+    info->cache_buffer.push_back(Buffer(index_buffer_var, data_ty->dtype, buffer_shape, {1}, {0},
                                         index_buffer_var->name_hint, 0, 0, kDefault));
 
     // Create loop vars and block vars' binding_value
     std::vector<Var> loop_vars;
     ffi::Map<Var, Var> replace_table;
     for (const Var& it : iter_vars) {
-      DataType data_type = DetermineDatatype(arith::IntSet::FromRange(info->range_map.at(it)));
-      Var loop_var("ax" + std::to_string(replace_table.size()), data_type);
+      PrimType data_ty = DeterminePrimType(arith::IntSet::FromRange(info->range_map.at(it)));
+      Var loop_var("ax" + std::to_string(replace_table.size()), data_ty);
       loop_vars.push_back(loop_var);
       replace_table.Set(it, loop_var);
     }
@@ -296,15 +296,15 @@ ffi::Array<SBlock> MakeIndexCacheStage(IndexInfo* info, const ffi::String& stora
     // Create block vars, block's accessed region and accessing indices
     for (size_t i = 0; i < info->origin_block_vars[expr_index].size(); i++) {
       const Var& block_var = info->origin_block_vars[expr_index][i];
-      Var var("v" + std::to_string(access_indices.size()), block_var.dtype());
-      Range range = Range::FromMinExtent(IntImm(block_var.dtype(), 0),
-                                         info->range_map.at(iter_vars[i])->extent);
+      Var var("v" + std::to_string(access_indices.size()), block_var.ty());
+      Range range =
+          Range::FromMinExtent(IntImm(block_var.ty(), 0), info->range_map.at(iter_vars[i])->extent);
       block_vars.push_back(IterVar(/*dom=*/range,
                                    /*var=*/var,
                                    /*IterVarType=*/kDataPar));
 
       access_indices.push_back(var);
-      access_region.push_back(Range::FromMinExtent(var, MakeConst(var.dtype(), 1)));
+      access_region.push_back(Range::FromMinExtent(var, MakeConst(var.ty(), 1)));
       block_var_map.Set(block_var, var);
     }
 
diff --git a/src/s_tir/schedule/primitive/cache_read_write.cc b/src/s_tir/schedule/primitive/cache_read_write.cc
index 46b8842a88b1..f5426f93cbae 100644
--- a/src/s_tir/schedule/primitive/cache_read_write.cc
+++ b/src/s_tir/schedule/primitive/cache_read_write.cc
@@ -165,7 +165,7 @@ SBlock MakeReindexCacheStage(const BufferRegion& cache_region, ReindexCacheStage
   ffi::Map<Var, Var> var_map;
   for (size_t i = 0; i < info->loop_vars.size(); ++i) {
     Var original_var = info->loop_vars[i];
-    Var loop_var(original_var->name_hint, original_var.dtype());
+    Var loop_var(original_var->name_hint, original_var.ty());
     var_map.Set(original_var, loop_var);
     loop_vars.push_back(loop_var);
   }
@@ -174,7 +174,7 @@ SBlock MakeReindexCacheStage(const BufferRegion& cache_region, ReindexCacheStage
     PrimExpr original_iter_value = info->block_iter_values[i];
     IterVar block_var = IterVar(
         /*dom=*/original_block_var->dom,
-        /*var=*/Var(original_block_var->var->name_hint, original_block_var->var.dtype()),
+        /*var=*/Var(original_block_var->var->name_hint, original_block_var->var.ty()),
         /*IterVarType=*/kDataPar);
     var_map.Set(original_block_var->var, block_var->var);
     block_vars.push_back(block_var);
@@ -247,7 +247,7 @@ SBlock MakeCacheStage(const BufferRegion& cache_region, CacheStageInfo* info,
   std::vector<PrimExpr> iter_values;
   // Create loop vars and block vars' binding_value
   for (const Range& axis_range : cache_region->region) {
-    Var loop_var("ax" + std::to_string(loop_vars.size()), axis_range->extent.dtype());
+    Var loop_var("ax" + std::to_string(loop_vars.size()), axis_range->extent.ty());
     loop_vars.push_back(loop_var);
     iter_values.push_back(cache_full_region ? (axis_range->min + loop_var) : loop_var);
   }
@@ -262,35 +262,35 @@ SBlock MakeCacheStage(const BufferRegion& cache_region, CacheStageInfo* info,
   // Create block vars, block's accessed region and accessing indices
   for (int i = 0; i < static_cast<int>(cache_region->buffer->shape.size()); ++i) {
     Range axis_range = cache_region->region[i];
-    Var var("v" + std::to_string(read_access_indices.size()), axis_range->extent.dtype());
+    Var var("v" + std::to_string(read_access_indices.size()), axis_range->extent.ty());
     if (cache_full_region) {
       PrimExpr dim = cache_region->buffer->shape[i];
-      block_vars.push_back(IterVar(/*dom=*/Range::FromMinExtent(IntImm(dim->dtype, 0), dim),
+      block_vars.push_back(IterVar(/*dom=*/Range::FromMinExtent(IntImm(dim.ty(), 0), dim),
                                    /*var=*/var,
                                    /*IterVarType=*/kDataPar));
       read_access_indices.push_back(var);
       write_access_indices.push_back(var);
-      read_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.dtype(), 1)));
-      write_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.dtype(), 1)));
+      read_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.ty(), 1)));
+      write_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.ty(), 1)));
     } else {
       block_vars.push_back(IterVar(
-          /*dom=*/Range::FromMinExtent(IntImm(axis_range->extent.dtype(), 0), axis_range->extent),
+          /*dom=*/Range::FromMinExtent(IntImm(axis_range->extent.ty(), 0), axis_range->extent),
           /*var=*/var,
           /*IterVarType=*/kDataPar));
       if (cache_region->buffer.same_as(info->read_buffer)) {
         // cache_read
         read_access_indices.push_back(axis_range->min + var);
         read_access_region.push_back(
-            Range::FromMinExtent(axis_range->min + var, MakeConst(var.dtype(), 1)));
+            Range::FromMinExtent(axis_range->min + var, MakeConst(var.ty(), 1)));
         write_access_indices.push_back(var);
-        write_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.dtype(), 1)));
+        write_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.ty(), 1)));
       } else {
         // cache_write
         write_access_indices.push_back(axis_range->min + var);
         write_access_region.push_back(
-            Range::FromMinExtent(axis_range->min + var, MakeConst(var.dtype(), 1)));
+            Range::FromMinExtent(axis_range->min + var, MakeConst(var.ty(), 1)));
         read_access_indices.push_back(var);
-        read_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.dtype(), 1)));
+        read_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.ty(), 1)));
       }
     }
   }
@@ -361,7 +361,7 @@ SBlock MakeReIndexStage(const SBlock& block, CacheStageInfo* info,
   std::unordered_set<int> skipped_block_iters;
   for (int i = 0, n = block->iter_vars.size(); i < n; ++i) {
     const IterVar& iter = block->iter_vars[i];
-    Var var("v" + std::to_string(new_block_iters.size()), iter->var->dtype);
+    Var var("v" + std::to_string(new_block_iters.size()), iter->var.ty());
     bool used = covered.count(iter->var);
     if (used) {
       new_block_iters.push_back(IterVar(/*dom=*/iter->dom,
@@ -415,7 +415,7 @@ SBlock MakeReIndexStage(const SBlock& block, CacheStageInfo* info,
     if (skipped_block_iters.count(i)) {
       continue;
     }
-    Var loop_var("ax" + std::to_string(loop_vars.size()), block->iter_vars[i]->var->dtype);
+    Var loop_var("ax" + std::to_string(loop_vars.size()), block->iter_vars[i]->var.ty());
     loop_vars.push_back(loop_var);
     iter_values.push_back(loop_var);
   }
@@ -1620,7 +1620,7 @@ class ReIndexRewriter : public StmtExprMutator {
       for (const IterVar& iter : block->iter_vars) {
         if (covered_.count(iter->var)) {
           indices_.push_back(iter->var);
-          region_.push_back(Range::FromMinExtent(iter->var, IntImm(iter->var->dtype, 1)));
+          region_.push_back(Range::FromMinExtent(iter->var, IntImm(iter->var.ty(), 1)));
         }
       }
       SBlock stmt = StmtExprMutator::VisitStmt_(block).as_or_throw<SBlock>();
diff --git a/src/s_tir/schedule/primitive/compute_at.cc b/src/s_tir/schedule/primitive/compute_at.cc
index 8e1050709173..9dcdb1bb04b1 100644
--- a/src/s_tir/schedule/primitive/compute_at.cc
+++ b/src/s_tir/schedule/primitive/compute_at.cc
@@ -267,12 +267,12 @@ class ScopeReconstructor : private StmtMutator {
     for (int i = 0; i < n_iters; ++i) {
       Range iter_dom = iter_doms[i].dom.CoverRange(block_->iter_vars[i]->dom);
       if (preserve_unit_loops || !is_one(iter_dom->extent)) {
-        int bits = std::max(iter_dom->min.dtype().bits(), iter_dom->extent.dtype().bits());
-        Var var("ax" + std::to_string(loop_vars.size()), DataType::Int(bits));
+        int bits = std::max(iter_dom->min.ty().bits(), iter_dom->extent.ty().bits());
+        Var var("ax" + std::to_string(loop_vars.size()), PrimType::Int(bits));
         loop_vars.push_back(var);
         loop_extents.push_back(analyzer->Simplify(iter_dom->extent));
         iter_values.push_back(iter_dom->min + var);
-        analyzer->Bind(var, Range::FromMinExtent(IntImm(var.dtype(), 0), iter_dom->extent));
+        analyzer->Bind(var, Range::FromMinExtent(IntImm(var.ty(), 0), iter_dom->extent));
       } else {
         iter_values.push_back(iter_dom->min);
       }
@@ -578,7 +578,7 @@ bool UpdateBlockVarDomainAffine(const BufferNode* buffer, const ffi::Array<IterV
   NDIntSet required_bound;
   for (size_t i = 0; i < ndim; ++i) {
     required_bound.push_back(
-        arith::IntSet::Interval(IntImm(buffer->shape[i]->dtype, 0), max(buffer->shape[i] - 1, 0)));
+        arith::IntSet::Interval(IntImm(buffer->shape[i].ty(), 0), max(buffer->shape[i] - 1, 0)));
   }
   ffi::Map<Var, arith::IntSet> var_dom =
       InverseAffineIterMap(res->indices, required_region, analyzer);
diff --git a/src/s_tir/schedule/primitive/compute_inline.cc b/src/s_tir/schedule/primitive/compute_inline.cc
index 3f177d52f81a..e295941dbdb1 100644
--- a/src/s_tir/schedule/primitive/compute_inline.cc
+++ b/src/s_tir/schedule/primitive/compute_inline.cc
@@ -513,7 +513,7 @@ class ComputeInliner : public BaseInliner {
     }
     idx_vars_.resize(buffer_ndim);
     for (size_t i = 0; i < idx_vars_.size(); ++i) {
-      idx_vars_[i] = Var("ph_" + std::to_string(i), inlined_store_->indices[i].dtype());
+      idx_vars_[i] = Var("ph_" + std::to_string(i), inlined_store_->indices[i].ty());
     }
     auto inverse_iter_map = arith::InverseAffineIterMap(
         res->indices, ffi::Array<PrimExpr>(idx_vars_.begin(), idx_vars_.end()));
@@ -726,7 +726,7 @@ class ReverseComputeInliner : public BaseInliner {
     if (producer_block->annotations.count(s_tir::attr::auto_copy) != 0) {
       auto bind = [&](const ForNode* loop) {
         analyzer_->Bind(loop->loop_var,
-                        Range::FromMinExtent(IntImm(loop->extent->dtype, 0), loop->extent));
+                        Range::FromMinExtent(IntImm(loop->extent.ty(), 0), loop->extent));
       };
       const ForNode* producer_inner_loop = producer_block->body.as<ForNode>();
       while (producer_inner_loop->body.as<ForNode>()) {
diff --git a/src/s_tir/schedule/primitive/decompose_padding.cc b/src/s_tir/schedule/primitive/decompose_padding.cc
index 0a62222a4a34..98e38d259b0c 100644
--- a/src/s_tir/schedule/primitive/decompose_padding.cc
+++ b/src/s_tir/schedule/primitive/decompose_padding.cc
@@ -173,7 +173,7 @@ class PaddingInfoAnalyzer {
     }
     for (const arith::IterSumExpr& sum : res->indices) {
       if (sum->args.empty()) {
-        region.push_back(Range::FromMinExtent(sum->base, IntImm(sum->base.dtype(), /* value */ 1)));
+        region.push_back(Range::FromMinExtent(sum->base, IntImm(sum->base.ty(), /* value */ 1)));
       } else {
         TVM_FFI_ICHECK_EQ(sum->args.size(), 1U);
         if (!analyzer_->CanProveEqual(sum->args[0]->scale, 1)) {
@@ -291,7 +291,7 @@ static std::pair<Stmt, SBlockRealize> CreateInBoundBlock(const SBlockRealizeNode
     const IterVar& origin_itervar = block->iter_vars[i];
     Var new_var = origin_itervar->var.copy_with_suffix("");
     Range new_range =
-        Range::FromMinExtent(IntImm(new_var->dtype, 0), info.in_bound_region[i]->extent);
+        Range::FromMinExtent(IntImm(new_var.ty(), 0), info.in_bound_region[i]->extent);
     new_iter_vars.push_back(IterVar(new_range, new_var, IterVarType::kDataPar));
     repl_dict.Set(origin_itervar->var, new_var + info.in_bound_region[i]->min);
 
diff --git a/src/s_tir/schedule/primitive/for_kind.cc b/src/s_tir/schedule/primitive/for_kind.cc
index cbb7437e54dd..121205b5500d 100644
--- a/src/s_tir/schedule/primitive/for_kind.cc
+++ b/src/s_tir/schedule/primitive/for_kind.cc
@@ -174,9 +174,9 @@ void ParallelizeComputation(const ScheduleState& self, const StmtSRef& loop_sref
   ffi::ObjectPtr<ForNode> new_loop = ffi::make_object<ForNode>(*loop);
   new_loop->kind = for_kind;
   if (thread_axis.has_value()) {
-    new_loop->thread_binding = IterVar(/*dom=*/Range(nullptr),                                    //
-                                       /*var=*/Var(thread_axis.value(), loop->loop_var.dtype()),  //
-                                       /*iter_type=*/kThreadIndex,                                //
+    new_loop->thread_binding = IterVar(/*dom=*/Range(nullptr),                                 //
+                                       /*var=*/Var(thread_axis.value(), loop->loop_var.ty()),  //
+                                       /*iter_type=*/kThreadIndex,                             //
                                        /*thread_tag=*/thread_axis.value());
   } else {
     new_loop->thread_binding = std::nullopt;
diff --git a/src/s_tir/schedule/primitive/layout_transformation.cc b/src/s_tir/schedule/primitive/layout_transformation.cc
index 91c2e5276f26..e9cbf4f75a2d 100644
--- a/src/s_tir/schedule/primitive/layout_transformation.cc
+++ b/src/s_tir/schedule/primitive/layout_transformation.cc
@@ -294,7 +294,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
       new_indices = inverse->initial_indices.Map([](Var var) {
         std::stringstream ss;
         ss << "v_" << var->name_hint;
-        return Var(ss.str(), var.dtype());
+        return Var(ss.str(), var.ty());
       });
 
       ffi::Map<Var, Var>
@@ -314,7 +314,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
         PrimExpr dim = new_buffer->shape[i];
         new_iter_values.push_back(var);
         new_iter_vars.push_back(
-            IterVar(Range::FromMinExtent(IntImm(dim.dtype(), 0), dim), virtual_var, kDataPar));
+            IterVar(Range::FromMinExtent(IntImm(dim.ty(), 0), dim), virtual_var, kDataPar));
         loop_var_to_virtual_var.Set(var, virtual_var);
       }
 
@@ -476,7 +476,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
     for (size_t i = 0; i < inverse->initial_indices.size(); i++) {
       const auto& loop_var = inverse->initial_indices[i];
       const auto& dim = new_buffer->shape[i];
-      Var block_var("v_" + loop_var->name_hint, loop_var->dtype);
+      Var block_var("v_" + loop_var->name_hint, loop_var.ty());
       IterVar iter_var(Range(0, dim), block_var, kDataPar);
       loop_indices_to_block_indices.Set(loop_var, block_var);
       indices.push_back(iter_var->var);
@@ -488,7 +488,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
     PrimExpr pad_value_at_index =
         pad_value.value()->MapIndices(indices, ffi::GetRef<arith::Analyzer>(analyzer))[0];
     PrimExpr expr = (!padding_predicate) || (BufferLoad(new_buffer, indices) == pad_value_at_index);
-    Stmt stmt = Evaluate(Call(DataType::Bool(), builtin::assume(), {expr}));
+    Stmt stmt = Evaluate(Call(PrimType::Bool(), builtin::assume(), {expr}));
 
     std::stringstream block_name;
     block_name << "buffer_" << new_buffer->name << "_assumptions";
@@ -571,7 +571,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
     for (size_t i = 0; i < inverse->initial_indices.size(); i++) {
       const auto& loop_var = inverse->initial_indices[i];
       const auto& dim = new_buffer->shape[i];
-      Var block_var("v_" + loop_var->name_hint, loop_var->dtype);
+      Var block_var("v_" + loop_var->name_hint, loop_var.ty());
       IterVar iter_var(Range(0, dim), block_var, kDataPar);
       indices.push_back(iter_var->var);
       iter_vars.push_back(iter_var);
@@ -991,7 +991,7 @@ class TransformationPaddingTypeError : public ScheduleError {
   TransformationPaddingTypeError(IRModule mod, Buffer buffer, IndexMap pad_value)
       : mod_(mod), buffer_(buffer), pad_value_(pad_value) {
     TVM_FFI_ICHECK_EQ(pad_value_->final_indices.size(), 1);
-    pad_value_dtype_ = pad_value_->final_indices[0].dtype();
+    pad_value_dtype_ = pad_value_->final_indices[0].ty()->dtype;
   }
 
   ffi::String FastErrorString() const final {
@@ -1015,7 +1015,7 @@ class TransformationPaddingTypeError : public ScheduleError {
   IRModule mod_;
   Buffer buffer_;
   IndexMap pad_value_;
-  DataType pad_value_dtype_;
+  DLDataType pad_value_dtype_;
 };
 
 class TransformationPaddingExpressionError : public ScheduleError {
@@ -1116,19 +1116,21 @@ IndexMap LegalizeIndexMapDType(const IndexMap& index_map, const ffi::Array<PrimE
 
   ffi::Array<Var> initial_indices;
   ffi::Map<Var, PrimExpr> var_map;
-  std::optional<DataType> index_dtype = std::nullopt;
+  std::optional<DLDataType> index_dtype = std::nullopt;
 
   for (size_t i = 0; i < args.size(); ++i) {
+    DLDataType arg_dtype = args[i].ty()->dtype;
     if (index_dtype.has_value()) {
-      TVM_FFI_ICHECK_EQ(*index_dtype, args[i]->dtype)
-          << "Buffer index " << args[i] << " has dtype " << args[i]->dtype
+      TVM_FFI_ICHECK_EQ(*index_dtype, arg_dtype)
+          << "Buffer index " << args[i] << " has dtype " << arg_dtype
           << ", but previous index for the same buffer access used index type " << *index_dtype;
     } else {
-      index_dtype = args[i]->dtype;
+      index_dtype = arg_dtype;
     }
 
-    if (args[i]->dtype != initial_indices_orig[i].dtype()) {
-      auto new_idx = Var(initial_indices_orig[i]->name_hint, args[i]->dtype);
+    DLDataType initial_dtype = initial_indices_orig[i].ty()->dtype;
+    if (arg_dtype != initial_dtype) {
+      auto new_idx = Var(initial_indices_orig[i]->name_hint, args[i].ty());
       initial_indices.push_back(new_idx);
       var_map.Set(initial_indices_orig[i], new_idx);
     } else {
@@ -1140,7 +1142,7 @@ IndexMap LegalizeIndexMapDType(const IndexMap& index_map, const ffi::Array<PrimE
     auto final_indices = index_map->final_indices.Map([&](PrimExpr index) {
       if (auto* ptr = index.as<IntImmNode>()) {
         TVM_FFI_ICHECK(index_dtype.has_value());
-        return tirx::MakeConst(*index_dtype, ptr->value);
+        return tirx::MakeConst(PrimType(*index_dtype), ptr->value);
       } else {
         return SubstituteWithDataTypeLegalization(index,
                                                   [&](const Var& var) { return var_map.Get(var); });
@@ -1176,7 +1178,7 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_
     if (pad_value.value()->final_indices.size() != 1) {
       throw TransformationPaddingIndexMapError(self->mod, pad_value.value());
     }
-    if (pad_value.value()->final_indices[0]->dtype != old_buffer->dtype) {
+    if (pad_value.value()->final_indices[0].ty() != old_buffer->dtype) {
       throw TransformationPaddingTypeError(self->mod, old_buffer, pad_value.value());
     }
 
@@ -1194,7 +1196,7 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_
     std::tie(opt_inverse, padding_predicate) = [&]() {
       ffi::Array<Range> region;
       for (const auto& dim : old_buffer->shape) {
-        region.push_back(Range::FromMinExtent(IntImm(dim.dtype(), 0), dim));
+        region.push_back(Range::FromMinExtent(IntImm(dim.ty(), 0), dim));
       }
       return index_map.NonSurjectiveInverse(region, analyzer);
     }();
@@ -1412,7 +1414,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
   ffi::Array<IterVar> new_block_iters;  // new block iters
   ffi::Array<PrimExpr> new_block_vars;  // iter_var->var of new block iters
   for (size_t i = 0; i < transformed_block_iters.size(); ++i) {
-    Var new_block_var{"v" + std::to_string(i), transformed_block_iters[i]->dtype};
+    Var new_block_var{"v" + std::to_string(i), transformed_block_iters[i].ty()};
     new_block_vars.push_back(new_block_var);
     IterVarType iter_type;
     if (is_one(new_block_iter_range[i])) {
@@ -1424,7 +1426,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
       throw OpaqueNewIterTypeError(self->mod, ffi::GetRef<SBlock>(block_ptr),
                                    transformed_block_iters[i]);
     }
-    auto dtype = new_block_var.dtype();
+    PrimType dtype = new_block_var.ty();
     new_block_iters.push_back(IterVar(
         /*dom=*/Range::FromMinExtent(IntImm(dtype, 0), cast(dtype, new_block_iter_range[i])),
         /*var=*/std::move(new_block_var), /*iter_type=*/iter_type));
@@ -1437,7 +1439,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
   {
     ffi::Array<Range> initial_ranges;
     for (const PrimExpr& extent : block_iter_range_array) {
-      initial_ranges.push_back(Range::FromMinExtent(IntImm(extent.dtype(), 0), extent));
+      initial_ranges.push_back(Range::FromMinExtent(IntImm(extent.ty(), 0), extent));
     }
     IndexMap inverse_index_map{nullptr};
     try {
@@ -1462,7 +1464,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
   // Make new loop vars
   ffi::Array<PrimExpr> new_loop_vars;
   for (int i = 0; i < static_cast<int>(new_block_iters.size()); ++i) {
-    new_loop_vars.push_back(Var("ax" + std::to_string(i), new_block_iters[i]->var.dtype()));
+    new_loop_vars.push_back(Var("ax" + std::to_string(i), new_block_iters[i]->var.ty()));
   }
 
   // Make new block realize
diff --git a/src/s_tir/schedule/primitive/loop_transformation.cc b/src/s_tir/schedule/primitive/loop_transformation.cc
index 18996f555d18..2864b190ea30 100644
--- a/src/s_tir/schedule/primitive/loop_transformation.cc
+++ b/src/s_tir/schedule/primitive/loop_transformation.cc
@@ -58,7 +58,7 @@ class SubstituteVarAndCollectOpaqueBlock : public StmtExprMutator {
   PrimExpr VisitExpr_(const VarNode* op) final {
     Var var = ffi::GetRef<Var>(op);
     if (ffi::Optional<PrimExpr> ret = vmap_(var)) {
-      return tvm::cast(var.dtype(), ret.value());
+      return tvm::cast(var.ty(), ret.value());
     } else {
       return var;
     }
@@ -411,13 +411,13 @@ ffi::Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref,
   CheckLoopStartsWithZero(self, loop_sref, analyzer.get());
 
   // Find the most common dtype
-  DataType dtype;
+  PrimType dtype = PrimType::Int(32);
   {
-    int bits = loop->loop_var.dtype().bits();
+    int bits = loop->loop_var.ty().bits();
     for (const PrimExpr& factor : factors) {
-      bits = std::max(bits, factor.dtype().bits());
+      bits = std::max(bits, factor.ty().bits());
     }
-    dtype = DataType::Int(bits);
+    dtype = PrimType::Int(bits);
   }
   int n = factors.size();
   PrimExpr substitute_value = IntImm(dtype, 0);
@@ -556,9 +556,9 @@ class BlockMutator : public StmtExprMutator {
     // As we are working on cloned block, we need to create new instances of iter_var
     ffi::Array<IterVar> new_iter_vars =
         MutateArray(new_block->iter_vars, [this, &iter_var_](const IterVar& iter) {
-          auto dtype = iter->var.dtype();
+          auto dtype = iter->var.ty();
           // Create new Var instance for each IterVar
-          Var new_var = Var(iter->var->name_hint, iter->var.dtype());
+          Var new_var = Var(iter->var->name_hint, iter->var.ty());
           IterVar new_iter = iter;
           new_iter.CopyOnWrite()->var = new_var;
           // Change the domain of IterVar corresponding to partitioned loop_var
@@ -623,7 +623,7 @@ class BlockMutator : public StmtExprMutator {
 
   Stmt VisitStmt_(const ForNode* op) final {
     For res = StmtMutator::VisitStmt_(op).as_or_throw<For>();
-    Var new_var = Var(op->loop_var->name_hint, op->loop_var.dtype());
+    Var new_var = Var(op->loop_var->name_hint, op->loop_var.ty());
 
     if (!op->loop_var.same_as(new_var)) {
       // If the partioned loop contains nested for loop, then create new iteration variable instance
@@ -655,13 +655,13 @@ ffi::Array<StmtSRef> LoopPartition(ScheduleState self, const StmtSRef& loop_sref
 
   arith::Analyzer analyzer;
   // Find the most common dtype
-  DataType dtype;
+  PrimType dtype = PrimType::Int(32);
   {
-    int bits = loop->loop_var.dtype().bits();
+    int bits = loop->loop_var.ty().bits();
     for (const PrimExpr& factor : factors) {
-      bits = std::max(bits, factor.dtype().bits());
+      bits = std::max(bits, factor.ty().bits());
     }
-    dtype = DataType::Int(bits);
+    dtype = PrimType::Int(bits);
   }
 
   ffi::String block_name = get_sblock_name(loop->body) + "_" + loop->loop_var->name_hint;
@@ -921,14 +921,14 @@ StmtSRef Fuse(ScheduleState self, const ffi::Array<StmtSRef>& loop_srefs,
   // Step 2. Create fused loop var and replace the original loop vars
   std::string suffix;
   int n = loops.size();
-  int bits = loops[0]->loop_var.dtype().bits();
+  int bits = loops[0]->loop_var.ty().bits();
   for (int i = 1; i < n; i++) {
     suffix += "_" + loops[i]->loop_var->name_hint;
-    bits = std::max(bits, loops[i]->loop_var.dtype().bits());
+    bits = std::max(bits, loops[i]->loop_var.ty().bits());
   }
   suffix += "_fused";
 
-  Var fused_var = loops[0]->loop_var.copy_with_suffix(suffix).copy_with_dtype(DataType::Int(bits));
+  Var fused_var = loops[0]->loop_var.copy_with_suffix(suffix).copy_with_dtype(PrimType::Int(bits));
   ffi::Array<PrimExpr> substitute_value;
   substitute_value.resize(loops.size());
   PrimExpr lower = 1;
@@ -1144,7 +1144,7 @@ void Reorder(ScheduleState self, const ffi::Array<StmtSRef>& ordered_loop_srefs)
 StmtSRef AddUnitLoop(ScheduleState self, StmtSRef sref) {
   if (sref->stmt->IsInstance<ForNode>()) {
     For new_loop =
-        For(Var("u", DataType::Int(32)), 0, 1, ForKind::kSerial, ffi::GetRef<Stmt>(sref->stmt));
+        For(Var("u", PrimType::Int(32)), 0, 1, ForKind::kSerial, ffi::GetRef<Stmt>(sref->stmt));
     self->Replace(sref, new_loop, {});
     return self->stmt2ref.at(new_loop.get());
   }
@@ -1154,7 +1154,7 @@ StmtSRef AddUnitLoop(ScheduleState self, StmtSRef sref) {
 
     Stmt VisitStmt_(const SBlockRealizeNode* realize) final {
       if (realize->block.get() == src_block_) {
-        new_loop_ = For(Var("u", DataType::Int(32)), 0, 1, ForKind::kSerial,
+        new_loop_ = For(Var("u", PrimType::Int(32)), 0, 1, ForKind::kSerial,
                         ffi::GetRef<SBlockRealize>(realize));
         return new_loop_;
       }
diff --git a/src/s_tir/schedule/primitive/pad_einsum.cc b/src/s_tir/schedule/primitive/pad_einsum.cc
index c03638b09b98..a0c65698d182 100644
--- a/src/s_tir/schedule/primitive/pad_einsum.cc
+++ b/src/s_tir/schedule/primitive/pad_einsum.cc
@@ -147,7 +147,7 @@ struct BufferPadding {
       PrimExpr pos = buffer_region->region[i]->min;
       TVM_FFI_ICHECK(pos->IsInstance<IntImmNode>() || pos->IsInstance<VarNode>());
       if (pos->IsInstance<IntImmNode>()) {
-        shape.push_back(IntImm(pos->dtype, 1));
+        shape.push_back(IntImm(pos.ty(), 1));
       } else if (ffi::Optional<PrimExpr> extent = iter_extents.Get(pos.as_or_throw<Var>())) {
         shape.push_back(extent.value());
       } else {
@@ -173,11 +173,11 @@ struct BufferPadding {
       } else {
         dim = buffer->shape[i];
       }
-      Range dom = Range::FromMinExtent(IntImm(dim->dtype, 0), dim);
-      loop_vars.push_back(Var("i" + std::to_string(i), dim->dtype));
+      Range dom = Range::FromMinExtent(IntImm(dim.ty(), 0), dim);
+      loop_vars.push_back(Var("i" + std::to_string(i), dim.ty()));
       loop_doms.push_back(dom);
-      IterVar iter_var(dom, Var("v" + std::to_string(i), dim->dtype), kDataPar);
-      instance_dom.push_back(Range::FromMinExtent(iter_var->var, IntImm(dim->dtype, 1)));
+      IterVar iter_var(dom, Var("v" + std::to_string(i), dim.ty()), kDataPar);
+      instance_dom.push_back(Range::FromMinExtent(iter_var->var, IntImm(dim.ty(), 1)));
       iter_vars.push_back(iter_var);
       indices.push_back(iter_var->var);
     }
@@ -190,8 +190,8 @@ struct BufferPadding {
         }
       }
       PrimExpr rhs = BufferLoad(buffer, indices);
-      body = BufferStore(padded_buffer, if_then_else(predicate, rhs, MakeConst(rhs->dtype, 0)),
-                         indices);
+      body =
+          BufferStore(padded_buffer, if_then_else(predicate, rhs, MakeConst(rhs.ty(), 0)), indices);
     } else {
       body = BufferStore(buffer, BufferLoad(padded_buffer, indices), indices);
     }
@@ -389,7 +389,7 @@ void PadEinsum(ScheduleState self, const StmtSRef& block_sref, const ffi::Array<
   for (int i = 0, n = padding.size(); i < n; ++i) {
     const IterVar& iter = block->iter_vars[i];
     PrimExpr dom = iter->dom->extent;
-    PrimExpr pad_imm = IntImm(dom->dtype, padding[i]);
+    PrimExpr pad_imm = IntImm(dom.ty(), padding[i]);
     PrimExpr new_dom = analyzer->Simplify(ceildiv(dom, pad_imm) * pad_imm);
     if (!analyzer->CanProveEqual(new_dom, dom)) {
       replacer.iter2padded_extents.Set(iter->var, new_dom);
diff --git a/src/s_tir/schedule/primitive/reduction.cc b/src/s_tir/schedule/primitive/reduction.cc
index 51fe3afde4e1..169508943b2d 100644
--- a/src/s_tir/schedule/primitive/reduction.cc
+++ b/src/s_tir/schedule/primitive/reduction.cc
@@ -318,7 +318,7 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{x[0] + y[0]};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, 0)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), 0)};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/1,
@@ -326,7 +326,7 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{x[0] * y[0]};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, 1)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), 1)};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/1,
@@ -334,7 +334,7 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{min(x[0], y[0])};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{max_value(values[0]->dtype)};
+                  return ffi::Array<PrimExpr>{max_value(values[0].ty())};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/1,
@@ -342,7 +342,7 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{max(x[0], y[0])};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{min_value(values[0]->dtype)};
+                  return ffi::Array<PrimExpr>{min_value(values[0].ty())};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/2,
@@ -350,8 +350,8 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{x[0] + y[0], x[1] + y[1]};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, 0),
-                                              MakeConst(values[1]->dtype, 0)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), 0),
+                                              MakeConst(values[1].ty(), 0)};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/2,
@@ -361,8 +361,8 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{idx, val};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, -1),
-                                              min_value(values[1]->dtype)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), -1),
+                                              min_value(values[1].ty())};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/2,
@@ -374,8 +374,8 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{idx, val};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, -1),
-                                              min_value(values[1]->dtype)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), -1),
+                                              min_value(values[1].ty())};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/2,
@@ -385,8 +385,8 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{idx, val};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, -1),
-                                              max_value(values[1]->dtype)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), -1),
+                                              max_value(values[1].ty())};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/2,
@@ -397,8 +397,8 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{idx, val};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, -1),
-                                              max_value(values[1]->dtype)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), -1),
+                                              max_value(values[1].ty())};
                 })} {}
 
   static void RegisterReducer(
@@ -423,8 +423,8 @@ struct ReducerRegistry {
       ffi::Array<Var> lhs;
       ffi::Array<Var> rhs;
       for (int i = 0; i < n_buffers; ++i) {
-        lhs.push_back(Var("x" + std::to_string(i), values[i]->dtype));
-        rhs.push_back(Var("y" + std::to_string(i), values[i]->dtype));
+        lhs.push_back(Var("x" + std::to_string(i), values[i].ty()));
+        rhs.push_back(Var("y" + std::to_string(i), values[i].ty()));
       }
       return CommReducer(lhs, rhs, combiner_getter(lhs, rhs), identity_getter(values));
     };
@@ -741,7 +741,7 @@ class BaseBlockCreator {
     ffi::Array<Var> let_vars;
     let_vars.reserve(n_buffers_);
     for (int i = 0; i < n_buffers_; ++i) {
-      Var var("v_" + update_buffers_[i]->name, PrimType(stored_values[i]->dtype));
+      Var var("v_" + update_buffers_[i]->name, stored_values[i].ty());
       let_vars.push_back(var);
       buf_stores.push_back(BufferStore(update_buffers_[i], var, update_indices_[i]));
     }
@@ -932,7 +932,7 @@ class RFactorBlockCreator : public BaseBlockCreator {
       ffi::Array<Range> region = write_region->region;
       region.insert(
           region.begin() + factor_axis_,
-          Range::FromMinExtent(additional_iter_->var, MakeConst(additional_iter_->var.dtype(), 1)));
+          Range::FromMinExtent(additional_iter_->var, MakeConst(additional_iter_->var.ty(), 1)));
       ffi::Optional<Buffer> rf_buffer = buffer_map.Get(write_region->buffer);
       TVM_FFI_ICHECK(rf_buffer.defined());
       write_regions_.push_back(BufferRegion(rf_buffer.value(), Substitute(region, var_map_)));
@@ -1025,7 +1025,7 @@ class WriteBackBlockCreator : public BaseBlockCreator {
       ffi::Array<Range> region;
       region.reserve(buf_load->indices.size());
       for (const PrimExpr& index : buf_load->indices) {
-        region.push_back(Range::FromMinExtent(index, MakeConst(index.dtype(), 1)));
+        region.push_back(Range::FromMinExtent(index, MakeConst(index.ty(), 1)));
       }
       buf_regions.push_back(BufferRegion(buf_load->buffer, std::move(region)));
     }
diff --git a/src/s_tir/schedule/transform.cc b/src/s_tir/schedule/transform.cc
index 3e29d1b6691d..8df7f7df9da9 100644
--- a/src/s_tir/schedule/transform.cc
+++ b/src/s_tir/schedule/transform.cc
@@ -49,9 +49,9 @@ Buffer WithScope(const Buffer& buffer, const ffi::String& scope) {
   return Buffer(new_buffer);
 }
 
-Buffer WithDType(const Buffer& buffer, const DataType& dtype) {
+Buffer WithDType(const Buffer& buffer, DLDataType dtype) {
   ffi::ObjectPtr<BufferNode> new_buffer = ffi::make_object<BufferNode>(*buffer.get());
-  new_buffer->dtype = dtype;
+  new_buffer->dtype = PrimType(dtype);
   const auto* ptr_type = TVM_TYPE_AS(buffer->data->type_annotation, PointerTypeNode);
   new_buffer->data =
       Var(buffer->data->name_hint, PointerType(PrimType(dtype), ptr_type->storage_scope));
diff --git a/src/s_tir/schedule/transform.h b/src/s_tir/schedule/transform.h
index da6d54a96629..9a408845e8e5 100644
--- a/src/s_tir/schedule/transform.h
+++ b/src/s_tir/schedule/transform.h
@@ -61,7 +61,7 @@ Buffer WithScope(const Buffer& buffer, const ffi::String& scope);
- * \param scope The target data type.
+ * \param dtype The target data type.
  * \return The new buffer with target data type.
  */
-Buffer WithDType(const Buffer& buffer, const DataType& dtype);
+Buffer WithDType(const Buffer& buffer, DLDataType dtype);
 
 /*!
  * \brief Replaces the buffer within the specific sequence of regions
diff --git a/src/s_tir/schedule/utils.h b/src/s_tir/schedule/utils.h
index b50416c2e198..7509dad5bdbe 100644
--- a/src/s_tir/schedule/utils.h
+++ b/src/s_tir/schedule/utils.h
@@ -166,7 +166,7 @@ inline bool IsSingleStmt(const Stmt& stmt) {
  */
 inline IterVar IterVarFromLoop(const For& loop, ffi::String name, IterVarType iter_var_type) {
   return IterVar(Range::FromMinExtent(loop->min, loop->extent),
-                 Var(std::move(name), loop->loop_var.dtype()), iter_var_type);
+                 Var(std::move(name), loop->loop_var.ty()), iter_var_type);
 }
 
 /*!
@@ -241,7 +241,7 @@ inline ffi::Optional<Var> AnalyzeVarWithShift(const PrimExpr& expr,
   // match: "var - shift"
   if ((var - shift).Match(expr)) {
     IntImm result = shift.Eval();
-    *constant = IntImm(result->dtype, -result->value);
+    *constant = IntImm(result.ty(), -result->value);
     return var.Eval();
   }
   return std::nullopt;
diff --git a/src/s_tir/transform/bound_checker.cc b/src/s_tir/transform/bound_checker.cc
index ea0364c12823..86086da945a8 100644
--- a/src/s_tir/transform/bound_checker.cc
+++ b/src/s_tir/transform/bound_checker.cc
@@ -71,7 +71,7 @@ class BoundChecker : public StmtExprMutator {
 
   Stmt VisitStmt_(const AllocBufferNode* op) final {
     if (UpdateIsNeeded(op->buffer->data)) {
-      Update(op->buffer->data, op->buffer->shape, op->buffer->dtype);
+      Update(op->buffer->data, op->buffer->shape, op->buffer->dtype->dtype);
     }
     return StmtExprMutator::VisitStmt_(op);
   }
@@ -118,15 +118,17 @@ class BoundChecker : public StmtExprMutator {
     return (buffer_var.defined() && mem_to_shape_.count(buffer_var.get()));
   }
 
-  void Update(const Var& buffer_var, ffi::Array<PrimExpr> new_shape, const DataType& type) {
+  void Update(const Var& buffer_var, ffi::Array<PrimExpr> new_shape, DLDataType dtype) {
     // Sanity check at first.
     if (!ShapeIsValid(new_shape)) {
       return;
     }
 
+    int16_t lanes = static_cast<int16_t>(dtype.lanes);
+    TVM_FFI_ICHECK_GE(lanes, 0);
     new_shape.MutateByApply([&](const PrimExpr& dim) {
       // Cast to uint64 to avoid potential overflow.
-      return IntImm(DataType::UInt(64), type.lanes()) * dim;
+      return IntImm(PrimType::UInt(64), lanes) * dim;
     });
     mem_to_shape_[buffer_var.get()] = new_shape;
   }
@@ -175,7 +177,8 @@ class BoundChecker : public StmtExprMutator {
   }
 
   bool IsValidScalar(const PrimExpr& expr) const {
-    return expr.defined() && expr.dtype().is_scalar();
+    if (!expr.defined()) return false;
+    return expr.ty().IsScalar();
   }
 
   bool CanInstrument(const ffi::Array<PrimExpr>& indices, const Var& buffer_var) const {
@@ -210,8 +213,8 @@ class BoundChecker : public StmtExprMutator {
         upper_bound = analyzer_->Simplify(upper_bound);
 
         // Cast to the same type - signed, to be able to check lower bound.
-        index = Cast(DataType::Int(64), index);
-        upper_bound = Cast(DataType::Int(64), upper_bound);
+        index = Cast(PrimType::Int(64), index);
+        upper_bound = Cast(PrimType::Int(64), upper_bound);
 
         // Looks like a lower bound should always be zero after normalization.
         PrimExpr lower_bound = IntImm::Int64(0);
diff --git a/src/s_tir/transform/canonicalize_loop.cc b/src/s_tir/transform/canonicalize_loop.cc
index 9c10280eb6ef..9c18cb9c88d1 100644
--- a/src/s_tir/transform/canonicalize_loop.cc
+++ b/src/s_tir/transform/canonicalize_loop.cc
@@ -47,7 +47,7 @@ class LoopCanonicalizer : public StmtExprMutator {
       return StmtExprMutator::VisitStmt_(op);
     }
     const auto* loop_var = op->loop_var.get();
-    PrimExpr step = op->step.value_or(MakeConst(loop_var->dtype, 1));
+    PrimExpr step = op->step.value_or(MakeConst(loop_var->ty(), 1));
 
     // report warning for negative step, since it would be a forever loop
     if (!analyzer_->CanProveGreaterEqual(step, 1)) {
@@ -59,7 +59,7 @@ class LoopCanonicalizer : public StmtExprMutator {
     new_iter_info_[loop_var] = std::make_pair(step, op->min);
     auto n = CopyOnWrite(op);
     n->body = VisitStmt(op->body);
-    n->min = IntImm(loop_var->dtype, 0);
+    n->min = IntImm(ffi::GetRef<PrimExpr>(loop_var).ty(), 0);
     n->extent = analyzer_->Simplify(ceildiv(op->extent, step));
     n->step = std::nullopt;
     new_iter_info_.erase(loop_var);
diff --git a/src/s_tir/transform/compact_buffer_region.cc b/src/s_tir/transform/compact_buffer_region.cc
index 4ea7b63bfe89..c7a6e0fd1fef 100644
--- a/src/s_tir/transform/compact_buffer_region.cc
+++ b/src/s_tir/transform/compact_buffer_region.cc
@@ -181,7 +181,7 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
 
   void VisitStmt_(const BindNode* op) final {
     StmtExprVisitor::VisitExpr(op->value);
-    if (arith::IsIndexType(op->value->dtype)) {
+    if (arith::IsIndexTypedExpr(op->value)) {
       dom_analyzer_->Bind(op->var, op->value);
       dom_map_.emplace(op->var.get(), arith::IntSet::SinglePoint(op->value));
     }
@@ -189,12 +189,12 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
 
   void VisitExpr_(const LetNode* op) final {
     StmtExprVisitor::VisitExpr(op->value);
-    if (arith::IsIndexType(op->value->dtype)) {
+    if (arith::IsIndexTypedExpr(op->value)) {
       dom_analyzer_->Bind(op->var, op->value);
       dom_map_.emplace(op->var.get(), arith::IntSet::SinglePoint(op->value));
     }
     StmtExprVisitor::VisitExpr(op->body);
-    if (arith::IsIndexType(op->value->dtype)) {
+    if (arith::IsIndexTypedExpr(op->value)) {
       dom_map_.erase(op->var.get());
     }
   }
@@ -322,7 +322,7 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
       ancestor_iters_.push_back(iter);
       Range dom = iter->dom;
       if (!dom.defined()) {  // dom is empty for legacy te schedule
-        dom = Range::FromMinExtent(IntImm(op->value->dtype, 0), op->value);
+        dom = Range::FromMinExtent(IntImm(op->value.ty(), 0), op->value);
       }
       dom_analyzer_->Bind(iter->var, dom);
       dom_map_.emplace(iter->var.get(), arith::IntSet::FromRange(dom));
@@ -367,8 +367,9 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
       }
       // Step 2. Relax the access region
       auto normalize_pred = [](const PrimExpr& pred) {
-        if (pred->dtype.is_bool()) return pred;
-        return pred != IntImm(pred->dtype, 0);
+        PrimType pred_ty = pred.ty();  // type of pred, reused by both statements below
+        if (pred_ty.MatchesCode(DLDataTypeCode::kDLBool)) return pred;
+        return pred != IntImm(pred_ty, 0);  // non-bool predicate: compare against zero
       };
       PrimExpr predicate = dom_analyzer_->Simplify(std::accumulate(
           pending_conditions_.begin(), pending_conditions_.end(), PrimExpr(IntImm::Bool(true)),
@@ -439,7 +440,7 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
     for (size_t i = 0; i < nd_int_set.size(); ++i) {
       const arith::IntSet& int_set = nd_int_set[i];
       Range original =
-          Range(/*begin=*/IntImm(original_shape[i]->dtype, 0), /*end=*/original_shape[i]);
+          Range(/*begin=*/IntImm(original_shape[i].ty(), 0), /*end=*/original_shape[i]);
       Range range = int_set.CoverRange(original);
       PrimExpr min, extent;
       if (collect_inbound_) {
@@ -470,7 +471,7 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
         // try estimate a constant upperbound on region's extent
         int64_t upperbound = dom_analyzer_->const_int_bound(extent)->max_value;
         if (upperbound != arith::ConstIntBound::kPosInf) {
-          extent = MakeConst(extent->dtype, upperbound);
+          extent = MakeConst(extent.ty(), upperbound);
         } else {
           result_region.Set(i, original);
           continue;
@@ -699,15 +700,15 @@ ffi::Array<PrimExpr> CalcStrides(const BufferAllocInfo& alloc_info,
   if (alloc_info.dim_aligns.size()) {
     TVM_FFI_ICHECK(alloc_info.dim_aligns.size() == shape.size());
     strides.resize(shape.size());
-    PrimExpr stride = MakeConst(shape[0].dtype(), 1);
+    PrimExpr stride = MakeConst(shape[0].ty(), 1);
     for (size_t i = shape.size(); i != 0; --i) {
       size_t dim = i - 1;
       DimAlignInfo info = alloc_info.dim_aligns[dim];
       int align_factor = info.align_factor;
       int align_offset = info.align_offset;
       if (align_factor != 0) {
-        PrimExpr factor = MakeConst(stride.dtype(), align_factor);
-        PrimExpr offset = MakeConst(stride.dtype(), align_offset);
+        PrimExpr factor = MakeConst(stride.ty(), align_factor);
+        PrimExpr offset = MakeConst(stride.ty(), align_offset);
         stride = stride + indexmod(factor + offset - indexmod(stride, factor), factor);
       }
       strides[dim] = stride;
diff --git a/src/s_tir/transform/default_gpu_schedule.cc b/src/s_tir/transform/default_gpu_schedule.cc
index 970d4c10d399..70f701668f51 100644
--- a/src/s_tir/transform/default_gpu_schedule.cc
+++ b/src/s_tir/transform/default_gpu_schedule.cc
@@ -131,10 +131,10 @@ tirx::PrimFunc WrapBareSBlockBody(const tirx::PrimFunc& func) {
   if (inner->IsInstance<tirx::ForNode>() || inner->IsInstance<tirx::SBlockRealizeNode>()) {
     return func;
   }
-  tvm::IntImm zero(tvm::DataType::Int(32), 0);
-  tvm::IntImm one(tvm::DataType::Int(32), 1);
-  tirx::Var loop_var("u", tvm::DataType::Int(32));
-  tirx::Var iter_var_var("vu", tvm::DataType::Int(32));
+  tvm::IntImm zero(tvm::PrimType::Int(32), 0);
+  tvm::IntImm one(tvm::PrimType::Int(32), 1);
+  tirx::Var loop_var("u", tvm::PrimType::Int(32));
+  tirx::Var iter_var_var("vu", tvm::PrimType::Int(32));
   tirx::IterVar new_iter(tvm::Range::FromMinExtent(zero, one), iter_var_var,
                          tirx::IterVarType::kDataPar);
   tirx::SBlock inner_block = realize->block;
diff --git a/src/s_tir/transform/inject_double_buffer.cc b/src/s_tir/transform/inject_double_buffer.cc
index 874cf6ca677a..63339096d65c 100644
--- a/src/s_tir/transform/inject_double_buffer.cc
+++ b/src/s_tir/transform/inject_double_buffer.cc
@@ -164,15 +164,15 @@ class DoubleBufferInjector : public StmtExprMutator {
             << "It is better to split with multiple of 2";
         TVM_FFI_ICHECK(is_zero(old_loop->min));
         PrimExpr zero = old_loop->min;
-        PrimExpr new_ext = old_loop->extent - MakeConst(old_loop->loop_var.dtype(), 1);
-        PrimExpr factor = MakeConst(new_ext.dtype(), split_loop_);
+        PrimExpr new_ext = old_loop->extent - MakeConst(old_loop->loop_var.ty(), 1);
+        PrimExpr factor = MakeConst(new_ext.ty(), split_loop_);
         PrimExpr outer_ext = new_ext / factor;
         PrimExpr tail_base = outer_ext * factor;
-        Var outer_var(old_loop->loop_var->name_hint + ".outer", old_loop->loop_var.dtype());
+        Var outer_var(old_loop->loop_var->name_hint + ".outer", old_loop->loop_var.ty());
         std::unordered_map<const VarNode*, PrimExpr> vmap;
         std::vector<Stmt> loop_seq;
         for (int32_t i = 0; i < split_loop_; ++i) {
-          vmap[old_loop->loop_var.get()] = outer_var * factor + MakeConst(factor.dtype(), i);
+          vmap[old_loop->loop_var.get()] = outer_var * factor + MakeConst(factor.ty(), i);
           loop_seq.emplace_back(Substitute(old_loop->body, vmap));
         }
         Stmt loop = For(outer_var, zero, outer_ext, old_loop->kind, SeqStmt::Flatten(loop_seq));
@@ -180,7 +180,7 @@ class DoubleBufferInjector : public StmtExprMutator {
         std::vector<Stmt> tail_seq;
         Stmt tail_body = StripDoubleBufferWrite()(old_loop->body);
         for (int32_t i = 0; i < split_loop_; ++i) {
-          PrimExpr idx = tail_base + MakeConst(tail_base.dtype(), i);
+          PrimExpr idx = tail_base + MakeConst(tail_base.ty(), i);
           vmap[old_loop->loop_var.get()] = idx;
           tail_seq.emplace_back(IfThenElse(idx < old_loop->extent, Substitute(tail_body, vmap)));
         }
@@ -274,11 +274,11 @@ class DoubleBufferInjector : public StmtExprMutator {
     }
     StorageEntry& e = it->second;
     e.loop = loop_nest_.back();
-    PrimExpr zero = IntImm(e.loop->loop_var.dtype(), 0);
-    PrimExpr one = IntImm(e.loop->loop_var.dtype(), 1);
-    PrimExpr two = IntImm(e.loop->loop_var.dtype(), 2);
+    PrimExpr zero = IntImm(e.loop->loop_var.ty(), 0);
+    PrimExpr one = IntImm(e.loop->loop_var.ty(), 1);
+    PrimExpr two = IntImm(e.loop->loop_var.ty(), 2);
     PrimExpr loop_shift = e.loop->loop_var + one;
-    e.switch_write_var = Var(e.loop->loop_var->name_hint + ".db", e.loop->loop_var.dtype());
+    e.switch_write_var = Var(e.loop->loop_var->name_hint + ".db", e.loop->loop_var.ty());
     e.switch_read_var = indexmod(e.loop->loop_var, two);
     in_double_buffer_scope_ = true;
     Stmt body = this->VisitStmt(op->body);
diff --git a/src/s_tir/transform/inject_permuted_layout.cc b/src/s_tir/transform/inject_permuted_layout.cc
index a816c43e32a0..a0e896f0dc6a 100644
--- a/src/s_tir/transform/inject_permuted_layout.cc
+++ b/src/s_tir/transform/inject_permuted_layout.cc
@@ -269,7 +269,7 @@ class PermutedLayoutInjector : private IRMutatorWithAnalyzer {
       auto new_access_ptr = HandleAccessPtrAndOffset(access_ptr, smem_offset);
       auto new_call = call.CopyOnWrite();
       new_call->args.Set(5, new_access_ptr);
-      new_call->args.Set(6, IntImm(smem_offset->dtype, 0));
+      new_call->args.Set(6, IntImm(smem_offset.ty(), 0));
       return call;
     } else if (call->op.same_as(mma_store_op)) {
       // TODO(yixin): mma_store is not fully tested yet
diff --git a/src/s_tir/transform/inject_ptx_async_copy.cc b/src/s_tir/transform/inject_ptx_async_copy.cc
index 500c2623be41..514439c9f8fb 100644
--- a/src/s_tir/transform/inject_ptx_async_copy.cc
+++ b/src/s_tir/transform/inject_ptx_async_copy.cc
@@ -56,10 +56,10 @@ class PTXAsyncCopyInjector : public StmtMutator {
                  PrimExpr predicate_value = PrimExpr()) {
     if (load->buffer.scope() == "global") {
       TVM_FFI_ICHECK(load->indices.size() == 1 && store->indices.size() == 1);
-      TVM_FFI_ICHECK(load->indices[0]->dtype.lanes() == store->indices[0]->dtype.lanes());
+      TVM_FFI_ICHECK(load->indices[0].ty().lanes() == store->indices[0].ty().lanes());
 
-      const int indices_lanes = load->indices[0]->dtype.lanes();
-      const int bytes = indices_lanes * load->buffer->dtype.bytes();
+      const int indices_lanes = load->indices[0].ty().lanes();
+      const int bytes = indices_lanes * ((load->buffer->dtype.bits() + 7) / 8);
 
       if (bytes == 4 || bytes == 8 || bytes == 16) {
         auto dst_elem_type = GetPointerType(store->buffer->data->type_annotation);
@@ -72,13 +72,13 @@ class PTXAsyncCopyInjector : public StmtMutator {
           // The only case where src and dst have different dtypes is when the dst shared memory
           // is a byte buffer generated by merging dynamic shared memory.
           TVM_FFI_ICHECK(store->buffer.scope() == "shared.dyn");
-          TVM_FFI_ICHECK(dst_elem_type.value() == DataType::UInt(8));
+          TVM_FFI_ICHECK((dst_elem_type.value() == DLDataType{kDLUInt, 8, 1}));
           // BufferStore/Load have the "pointer reinterpret" semantics according to their
           // "value" dtype. Their "indices" are supposed to be applied after such pointer cast,
           // for example: ((*float16)(byte_buffer))[buffer->indices] = fp16_value;
           // To replace BufferStore/Load with cp.async, we need to multiply the store index by
           // the byte size of the "value" dtype, to get the correct offset into the byte buffer.
-          index_factor = src_elem_type->bytes();
+          index_factor = (src_elem_type.value().bits + 7) / 8;
         }
 
         if (indices_lanes == 1) {
diff --git a/src/s_tir/transform/inject_ptx_ldg32.cc b/src/s_tir/transform/inject_ptx_ldg32.cc
index 7b63b22f6965..2d07aafc5446 100644
--- a/src/s_tir/transform/inject_ptx_ldg32.cc
+++ b/src/s_tir/transform/inject_ptx_ldg32.cc
@@ -115,8 +115,9 @@ class PTXRewriter : public StmtMutator {
     }
     has_buffer_1 = true;
     // addr[0] -> global_addr /  addr[1] -> local_addr
-    addr_buffer = decl_buffer({IntImm::Int32(2)}, DataType::Int(32), "addr", "local");
-    predicate_buffer = decl_buffer({IntImm::Int32(1)}, DataType::Bool(), "predicate", "local");
+    addr_buffer = decl_buffer({IntImm::Int32(2)}, DLDataType{kDLInt, 32, 1}, "addr", "local");
+    predicate_buffer =
+        decl_buffer({IntImm::Int32(1)}, DLDataType{kDLBool, 8, 1}, "predicate", "local");
   }
 
   bool has_buffer_1 = false, has_buffer_2 = false;
diff --git a/src/s_tir/transform/inject_software_pipeline.cc b/src/s_tir/transform/inject_software_pipeline.cc
index 4e4307ef1f18..7269c41f7a4c 100644
--- a/src/s_tir/transform/inject_software_pipeline.cc
+++ b/src/s_tir/transform/inject_software_pipeline.cc
@@ -120,7 +120,7 @@ class PipelineOpaqueAccessRewriter {
         ffi::Array<PrimExpr> new_args = call->args;
         const Buffer& new_buffer = (*it).second;
         new_args.Set(4, RewriteWmmaFragmentIndex(buffer, new_buffer, call->args[4]));
-        return Call(call->dtype, call->op, new_args, call->attrs, call->span);
+        return Call(call.ty(), call->op, new_args, call->attrs, call->span);
       }
     } else if (call->op.same_as(mma_sync)) {
       ffi::Array<PrimExpr> new_args = call->args;
@@ -134,7 +134,7 @@ class PipelineOpaqueAccessRewriter {
           new_args.Set(i * 2 + 1, new_index);
         }
       }
-      return Call(call->dtype, call->op, new_args, call->attrs, call->span);
+      return Call(call.ty(), call->op, new_args, call->attrs, call->span);
     } else if (call->op.same_as(access_ptr)) {
       return RewriteBufferAccess(call, {1});
     } else if (call->op.same_as(ptx_mma_legacy)) {
@@ -197,7 +197,7 @@ class PipelineOpaqueAccessRewriter {
         new_args.Set(i + 1, new_index);
       }
     }
-    return Call(call->dtype, call->op, new_args, call->attrs, call->span);
+    return Call(call.ty(), call->op, new_args, call->attrs, call->span);
   }
 
   const ffi::Map<Var, Buffer>& buffer_data_to_buffer_;
@@ -767,7 +767,7 @@ class PipelineRewriter : public StmtExprMutator {
           // If the async operation that this wait_queue is waiting on is predicated, and we cannot
           // prove that the predicate is always true, the precise wait count is only valid
           // at iterations where the predicate is true;
-          auto wait_count = Call(DataType::Int(32), builtin::if_then_else(),
+          auto wait_count = Call(PrimType::Int(32), builtin::if_then_else(),
                                  {state.predicate.value(), state.pending_wait.wait_count, 0});
           attach_wait_scope(state.pending_wait.insert_before, stage_id, wait_count);
         } else {
diff --git a/src/s_tir/transform/inject_virtual_thread.cc b/src/s_tir/transform/inject_virtual_thread.cc
index 035236e8af38..58133bc4999b 100644
--- a/src/s_tir/transform/inject_virtual_thread.cc
+++ b/src/s_tir/transform/inject_virtual_thread.cc
@@ -218,17 +218,18 @@ class VTInjector : public arith::IRMutatorWithAnalyzer {
   PrimExpr VisitExpr_(const CallNode* op) final {
     if (op->op.same_as(builtin::tvm_access_ptr())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 5U);
-      DataType dtype = op->args[0].dtype();
+      DLDataType dtype = op->args[0].ty()->dtype;
       const VarNode* buffer = op->args[1].as<VarNode>();
       auto it = alloc_remap_.find(buffer);
       if (it == alloc_remap_.end()) return StmtExprMutator::VisitExpr_(op);
       visit_touched_var_ = true;
       PrimExpr offset = this->VisitExpr(op->args[2]);
       PrimExpr extent = this->VisitExpr(op->args[3]);
-      PrimExpr stride = it->second / MakeConst(offset.dtype(), dtype.lanes());
+      PrimExpr stride = it->second / MakeConst(offset.ty(), static_cast<int16_t>((dtype).lanes));
       offset = RewriteIndex(offset, stride);
 
-      return Call(op->dtype, op->op, {op->args[0], op->args[1], offset, extent, op->args[4]});
+      return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op,
+                  {op->args[0], op->args[1], offset, extent, op->args[4]});
     } else if (op->op.same_as(builtin::tvm_context_id())) {
       return allow_share_ ? ffi::GetRef<PrimExpr>(op) : var_;
     } else {
@@ -465,15 +466,15 @@ class VTInjector : public arith::IRMutatorWithAnalyzer {
       // do unrolling if it is inside innermost content.
       ffi::Array<Stmt> seq;
       for (int i = 0; i < num_threads_; ++i) {
-        seq.push_back(Substitute(stmt, {{var_, MakeConst(var_.dtype(), i)}}));
+        seq.push_back(Substitute(stmt, {{var_, MakeConst(var_.ty(), i)}}));
       }
       return SeqStmt::Flatten(seq);
     } else {
       // insert a for loop
-      Var idx(var_->name_hint + ".s", var_->dtype);
+      Var idx(var_->name_hint + ".s", var_.ty());
       stmt = Substitute(stmt, {{var_, idx}});
-      return For(idx, IntImm(idx.dtype(), 0), MakeConst(idx.dtype(), num_threads_),
-                 ForKind::kSerial, stmt);
+      return For(idx, IntImm(idx.ty(), 0), MakeConst(idx.ty(), num_threads_), ForKind::kSerial,
+                 stmt);
     }
   }
 
diff --git a/src/s_tir/transform/lift_thread_binding.cc b/src/s_tir/transform/lift_thread_binding.cc
index 9aebcfe4b0ac..cd8be790c886 100644
--- a/src/s_tir/transform/lift_thread_binding.cc
+++ b/src/s_tir/transform/lift_thread_binding.cc
@@ -133,7 +133,7 @@ class ThreadBindingLifter : public StmtExprMutator {
       for (const auto& [iter_var, annotation] : it->second) {
         body = For(iter_var->var, iter_var->dom->min, iter_var->dom->extent,
                    ForKind::kThreadBinding, std::move(body),
-                   IterVar(Range(nullptr), Var(iter_var->thread_tag, iter_var->var->dtype),
+                   IterVar(Range(nullptr), Var(iter_var->thread_tag, iter_var->var.ty()),
                            kThreadIndex, iter_var->thread_tag),
                    annotation, std::nullopt);
       }
diff --git a/src/s_tir/transform/loop_partition.cc b/src/s_tir/transform/loop_partition.cc
index 87d31bae91d0..e755b8265b95 100644
--- a/src/s_tir/transform/loop_partition.cc
+++ b/src/s_tir/transform/loop_partition.cc
@@ -261,7 +261,7 @@ class PartitionFinder : public StmtExprVisitor {
       const IterVarNode* thread_axis = op->node.as<IterVarNode>();
       TVM_FFI_ICHECK(thread_axis);
       const VarNode* var = thread_axis->var.get();
-      IntSet dom = IntSet::FromRange(Range(IntImm(op->value.dtype(), 0), op->value));
+      IntSet dom = IntSet::FromRange(Range(IntImm(op->value.ty(), 0), op->value));
       hint_map_.insert({var, dom});
       relax_map_.insert({var, dom});
       StmtExprVisitor::VisitStmt_(op);
@@ -458,11 +458,11 @@ class LoopPartitioner : public StmtMutator {
     Stmt res;
     if (scope.rank == 1) {
       // threadIdx should be put into relax map, in case of divergence.
-      relax_map_.insert({var.get(), IntSet::Interval(IntImm(var.dtype(), 0), op->value - 1)});
+      relax_map_.insert({var.get(), IntSet::Interval(IntImm(var.ty(), 0), op->value - 1)});
       res = StmtMutator::VisitStmt_(op);
       relax_map_.erase(var.get());
     } else {
-      hint_map_.insert({var.get(), IntSet::Interval(IntImm(var.dtype(), 0), op->value - 1)});
+      hint_map_.insert({var.get(), IntSet::Interval(IntImm(var.ty(), 0), op->value - 1)});
       res = StmtMutator::VisitStmt_(op);
       hint_map_.erase(var.get());
     }
@@ -774,7 +774,7 @@ inline Stmt LoopPartitioner::MakeFor(const ffi::Object* node, PrimExpr extent, S
   } else {
     TVM_FFI_ICHECK(for_node->kind != ForKind::kThreadBinding);
     auto new_loop = ffi::make_object<ForNode>(*for_node);
-    new_loop->min = IntImm(for_node->min.dtype(), 0);
+    new_loop->min = IntImm(for_node->min.ty(), 0);
     new_loop->extent = extent;
     new_loop->body = body;
     return For(new_loop);
diff --git a/src/s_tir/transform/lower_async_dma.cc b/src/s_tir/transform/lower_async_dma.cc
index 89660d4fefd2..72e16a7ed039 100644
--- a/src/s_tir/transform/lower_async_dma.cc
+++ b/src/s_tir/transform/lower_async_dma.cc
@@ -76,11 +76,17 @@ class AsyncDMALowerer : public arith::IRMutatorWithAnalyzer {
 
     auto src = BufferLoad(mem_copy->source->buffer, {src_min});
     auto dst = BufferLoad(mem_copy->dest->buffer, {dst_min});
+    DLDataType src_dtype = src.ty()->dtype;
+    int src_bytes = (src_dtype.bits * static_cast<int16_t>(src_dtype.lanes) + 7) / 8;
+    PrimExpr dst_nbytes = dst_extent * src_bytes;
     return Evaluate(
-        Call(DataType::Int(32), builtin::dma_copy(),
-             {async_queue_id_.value(), Call(DataType::Handle(), builtin::address_of(), {dst}),
-              Call(DataType::Handle(), builtin::address_of(), {src}),
-              dst_extent * src->dtype.bytes(), dma_bypass_cache_}));
+        Call(PrimType::Int(32), builtin::dma_copy(),
+             ffi::Array<PrimExpr>{
+                 async_queue_id_.value(),
+                 Call(PrimType::Handle(), builtin::address_of(), ffi::Array<PrimExpr>{dst}, Span()),
+                 Call(PrimType::Handle(), builtin::address_of(), ffi::Array<PrimExpr>{src}, Span()),
+                 dst_nbytes, dma_bypass_cache_},
+             Span()));
   }
 
   Stmt VisitStmt_(const AttrStmtNode* op) final {
@@ -119,7 +125,7 @@ class AsyncDMALowerer : public arith::IRMutatorWithAnalyzer {
         return previsit;
       }
       auto call_dma_wait =
-          Evaluate(Call(DataType::Int(32), builtin::dma_wait(), {queue_id, async_wait->value}));
+          Evaluate(Call(PrimType::Int(32), builtin::dma_wait(), {queue_id, async_wait->value}));
 
       // concatenate the call with the body and return
       return SeqStmt({call_dma_wait, arith::IRMutatorWithAnalyzer::VisitStmt(async_wait->body)});
@@ -147,9 +153,9 @@ class AsyncDMALowerer : public arith::IRMutatorWithAnalyzer {
       auto result = arith::IRMutatorWithAnalyzer::VisitStmt_(op);
       if (dmas_in_group_ > 1) {
         auto call_dma_start_group = Evaluate(
-            Call(DataType::Int(32), builtin::dma_start_group(), {async_queue_id_.value()}));
+            Call(PrimType::Int(32), builtin::dma_start_group(), {async_queue_id_.value()}));
         auto call_dma_end_group =
-            Evaluate(Call(DataType::Int(32), builtin::dma_end_group(), {async_queue_id_.value()}));
+            Evaluate(Call(PrimType::Int(32), builtin::dma_end_group(), {async_queue_id_.value()}));
         result = SeqStmt({call_dma_start_group, result, call_dma_end_group});
       }
 
diff --git a/src/s_tir/transform/lower_cross_thread_reduction.cc b/src/s_tir/transform/lower_cross_thread_reduction.cc
index 626158203d2c..56c938e689e9 100644
--- a/src/s_tir/transform/lower_cross_thread_reduction.cc
+++ b/src/s_tir/transform/lower_cross_thread_reduction.cc
@@ -147,7 +147,7 @@ ffi::Array<Buffer> MakeScratchpads(const ffi::Array<Buffer>& reduction_buffers,
   for (const Buffer& buffer : reduction_buffers) {
     ffi::String name = is_cross_thread_buffer ? "cross" : "in";
     name = name + "_thread_" + buffer->name;
-    new_buffers.push_back(Buffer(/*ptr=*/Var(name, PointerType(PrimType(buffer->dtype), "local")),
+    new_buffers.push_back(Buffer(/*ptr=*/Var(name, PointerType(buffer->dtype, "local")),
                                  /*dtype=*/buffer->dtype,
                                  /*shape=*/{IntImm::Int32(1)},
                                  /*strides=*/{IntImm::Int32(1)},
@@ -377,7 +377,7 @@ Stmt TransformReductionBlock(const SBlockRealizeNode* realize,
     ffi::Array<PrimExpr> parameters;
     parameters.reserve(reduction_loops.size() + 4);
     // 1-st argument: number of buffers
-    parameters.push_back(IntImm(DataType::UInt(32), n_buffers));
+    parameters.push_back(IntImm(PrimType::UInt(32), n_buffers));
     // Next `n_buffers` arguments: sources
     if (it_buffers.defined()) {
       for (int i = 0; i < n_buffers; ++i) {
@@ -424,7 +424,7 @@ Stmt TransformReductionBlock(const SBlockRealizeNode* realize,
                         /*attr_key=*/s_tir::attr::reduce_scope,
                         /*value=*/ConstHandle(0),
                         /*body=*/
-                        Evaluate(Call(/*dtype=*/DataType::Handle(),
+                        Evaluate(Call(/*dtype=*/PrimType::Handle(),
                                       /*op=*/tirx::builtin::tvm_thread_allreduce(),
                                       /*args=*/std::move(parameters)))))));
   }
@@ -507,7 +507,7 @@ Stmt TransformReductionBlock(const SBlockRealizeNode* realize,
     if (wb_buffers[0].scope() != "local") {
       for (const ForNode* loop : reduction_loops) {
         if (loop->thread_binding.defined()) {
-          wb_predicate = wb_predicate && (loop->loop_var == IntImm(loop->loop_var->dtype, 0));
+          wb_predicate = wb_predicate && (loop->loop_var == IntImm(loop->loop_var.ty(), 0));
         }
       }
     }
@@ -862,7 +862,7 @@ class CrossThreadReductionTransformer : public StmtMutator {
     loop_vars.reserve(unbound_thread2range.size());
     for (auto [scope, range] : unbound_thread2range) {
       std::string dim_index(1, static_cast<char>(scope.dim_index + 'x'));
-      Var loop_var("t" + dim_index, range->min->dtype);
+      Var loop_var("t" + dim_index, range->min.ty());
       loop_vars.push_back(loop_var);
       predicate = (loop_var == range->min) && predicate;
     }
@@ -882,7 +882,7 @@ class CrossThreadReductionTransformer : public StmtMutator {
           /*kind=*/ForKind::kThreadBinding,                   //
           /*body=*/body,                                      //
           /*thread_binding=*/
-          IterVar(Range(), Var("", loop_vars[i]->dtype), IterVarType::kThreadIndex,
+          IterVar(Range(), Var("", loop_vars[i].ty()), IterVarType::kThreadIndex,
                   "threadIdx." + dim_index),
           /*annotations=*/{},
           /*step=*/std::nullopt);
diff --git a/src/s_tir/transform/lower_match_buffer.cc b/src/s_tir/transform/lower_match_buffer.cc
index f8e30e643494..2bedda4b4491 100644
--- a/src/s_tir/transform/lower_match_buffer.cc
+++ b/src/s_tir/transform/lower_match_buffer.cc
@@ -42,7 +42,7 @@ class MatchBufferLower : public StmtExprMutator {
   explicit MatchBufferLower(const PrimFunc& func) {
     for (const Var& param : func->params) {
       // Mark input var as const variable.
-      if (!param.dtype().is_handle()) var_map_.Set(param, param);
+      if (!param.ty().IsHandle()) var_map_.Set(param, param);
     }
   }
 
@@ -212,7 +212,7 @@ class MatchBufferLower : public StmtExprMutator {
         // Non-zero elem_offset is ill-defined for non-flat memory.
         // If needed in the future, will require `ffi::Array<PrimExpr>
         // elem_offsets`, with one offset for each flattened index.
-        Bind(buffer->elem_offset, IntImm(buffer->elem_offset.dtype(), 0));
+        Bind(buffer->elem_offset, IntImm(buffer->elem_offset.ty(), 0));
       }
     }
 
@@ -223,7 +223,7 @@ class MatchBufferLower : public StmtExprMutator {
     if (!buffer->strides.empty()) {
       TVM_FFI_ICHECK_EQ(buffer->strides.size(), buffer->shape.size());
       if (source_buffer->strides.empty()) {
-        PrimExpr stride = MakeConst(buffer->strides.back().dtype(), 1);
+        PrimExpr stride = MakeConst(buffer->strides.back().ty(), 1);
         for (size_t i = buffer->shape.size(); i > 0; --i) {
           const PrimExpr& shape = source_buffer->shape[i - 1 + offset];
           Bind(buffer->strides[i - 1], stride, buffer->name + ".strides_" + std::to_string(i - 1));
@@ -246,13 +246,16 @@ class MatchBufferLower : public StmtExprMutator {
   }
 
   void Bind(const PrimExpr& arg, PrimExpr value, const std::string& arg_name = "argument") {
-    if (arg.dtype() != value.dtype()) {
-      if (arg.dtype().is_int() && value.dtype().is_int() &&
-          arg.dtype().lanes() == value.dtype().lanes()) {
-        value = cast(arg.dtype(), value);
+    PrimType arg_ty = arg.ty();
+    PrimType value_ty = value.ty();
+    if (arg_ty->dtype != value_ty->dtype) {
+      bool same_lanes = arg_ty.lanes() == value_ty.lanes();
+      if (arg_ty.MatchesCode(DLDataTypeCode::kDLInt) &&
+          value_ty.MatchesCode(DLDataTypeCode::kDLInt) && same_lanes) {
+        value = cast(arg_ty, value);
       } else {
-        TVM_FFI_ICHECK_EQ(arg.dtype(), value.dtype())
-            << "The data type mismatched: " << arg->dtype << " vs. " << value->dtype;
+        TVM_FFI_ICHECK_EQ(arg_ty->dtype, value_ty->dtype)
+            << "The data type mismatched: " << arg_ty->dtype << " vs. " << value_ty->dtype;
       }
     }
     // Handle recursive case
diff --git a/src/s_tir/transform/lower_opaque_block.cc b/src/s_tir/transform/lower_opaque_block.cc
index 7560b3f33bb1..0f1c810b67c4 100644
--- a/src/s_tir/transform/lower_opaque_block.cc
+++ b/src/s_tir/transform/lower_opaque_block.cc
@@ -131,8 +131,8 @@ class OpaqueBlockLower : public StmtExprMutator {
 
     } else {
       PrimExpr expr = it->second;
-      if (expr.dtype() != var.dtype()) {
-        expr = tvm::cast(var.dtype(), std::move(expr));
+      if (expr.ty() != var.ty()) {
+        expr = tvm::cast(var.ty(), std::move(expr));
       }
       return expr;
     }
diff --git a/src/s_tir/transform/lower_thread_allreduce.cc b/src/s_tir/transform/lower_thread_allreduce.cc
index 0473690b7afa..ca3ff8699b48 100644
--- a/src/s_tir/transform/lower_thread_allreduce.cc
+++ b/src/s_tir/transform/lower_thread_allreduce.cc
@@ -180,14 +180,14 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
     TVM_FFI_ICHECK_EQ(size, size_of_args->value);
     ffi::Array<PrimExpr> inits = combiner->identity_element;
     std::vector<PrimExpr> values(size);
-    std::vector<DataType> types(size);
+    std::vector<DLDataType> dtypes(size);
     PrimExpr cond = call->args[size + 1];
     for (size_t idx = 0; idx < size; ++idx) {
       values[idx] = call->args[1 + idx];
       if (!is_one(cond)) {
         values[idx] = Select(cond, values[idx], inits[idx]);
       }
-      types[idx] = values[idx].dtype();
+      dtypes[idx] = values[idx].ty()->dtype;
     }
     std::vector<Buffer> buffers(size);
     for (size_t idx = 0; idx < size; ++idx) {
@@ -305,15 +305,14 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
     // In the second stage we use the first 16 lanes of the first warp to reduce
     // the remaining elements, and this reduction can also be optimized by
     // shuffle_down warp-level primitives.
-    PrimExpr zero_index = IntImm(reduce_index->dtype, 0);
-    if (IsWarpReduction(types, group_extent, reduce_extent, contiguous_reduce_extent)) {
+    PrimExpr zero_index = IntImm(reduce_index.ty(), 0);
+    if (IsWarpReduction(dtypes, group_extent, reduce_extent, contiguous_reduce_extent)) {
       std::vector<PrimExpr> reduce_results;
-      DataType mask_dtype = DataType::UInt(32);
-      PrimExpr mask = Call(mask_dtype, builtin::tvm_warp_activemask(), {});
+      PrimExpr mask = Call(PrimType::UInt(32), builtin::tvm_warp_activemask(), {});
 
       if (reduce_extent <= warp_size_) {
         std::tie(reduce_results, new_alloc_bufs) =
-            MakeWarpAllreduce(values, types, combiner, reduce_index, reduce_extent, group_index,
+            MakeWarpAllreduce(values, dtypes, combiner, reduce_index, reduce_extent, group_index,
                               mask, std::nullopt, &seq);
 
         // Broadcast the reduction result from lane 0 to all other lanes.
@@ -322,7 +321,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
         for (size_t i = 0; i < size; ++i) {
           Buffer buf = reduce_results[i].as_or_throw<BufferLoad>()->buffer;
           PrimExpr val = BufferLoad(buf, {zero_index});
-          TVM_FFI_ICHECK_EQ(val->dtype, types[i]);
+          TVM_FFI_ICHECK_EQ(val->ty()->dtype, dtypes[i]);
           PrimExpr splat = WarpShuffle(builtin::tvm_warp_shuffle(), new_alloc_bufs.back(), val,
                                        reduce_extent * group_index);
           seq.push_back(BufferStore(buf, splat, {zero_index}));
@@ -336,7 +335,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
         staging_shared_bufs.reserve(size);
         for (size_t i = 0; i < size; ++i) {
           Buffer staging_shared_buf = decl_buffer(
-              /*shape=*/{MakeConst(reduce_index->dtype, n_warps * group_extent)},
+              /*shape=*/{MakeConst(reduce_index.ty(), n_warps * group_extent)},
               /*dtype=*/buffers[i]->dtype, /*name=*/"red_buf_staging", /*storage_scope=*/"shared");
           staging_shared_bufs.push_back(staging_shared_buf);
           new_alloc_bufs.push_back(staging_shared_buf);
@@ -344,7 +343,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
 
         // 2. First round of allreduce.
         std::tie(reduce_results, local_bufs) =
-            MakeWarpAllreduce(values, types, combiner, reduce_index, warp_size_, group_index, mask,
+            MakeWarpAllreduce(values, dtypes, combiner, reduce_index, warp_size_, group_index, mask,
                               std::nullopt, &seq);
         new_alloc_bufs.insert(new_alloc_bufs.end(), local_bufs.begin(), local_bufs.end());
 
@@ -369,8 +368,8 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
                                  /*indices=*/{group_index * n_warps + reduce_index});
         }
         std::tie(reduce_results, local_bufs) = MakeWarpAllreduce(
-            values, types, combiner, reduce_index, n_warps, group_index, mask,
-            /*predicate=*/reduce_index < MakeConst(reduce_index->dtype, n_warps), &seq);
+            values, dtypes, combiner, reduce_index, n_warps, group_index, mask,
+            /*predicate=*/reduce_index < MakeConst(reduce_index.ty(), n_warps), &seq);
         new_alloc_bufs.insert(new_alloc_bufs.end(), local_bufs.begin(), local_bufs.end());
 
         // 5. Create shared memory buffer(s) of `group_extent` elements, storing
@@ -380,7 +379,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
         for (size_t i = 0; i < size; ++i) {
           new_alloc_bufs.push_back(reduce_results[i].as_or_throw<BufferLoad>()->buffer);
           Buffer broadcast_shared_buf = decl_buffer(
-              /*shape=*/{MakeConst(reduce_index->dtype, group_extent)},
+              /*shape=*/{MakeConst(reduce_index.ty(), group_extent)},
               /*dtype=*/buffers[i]->dtype, /*name=*/"red_result", /*storage_scope=*/"shared");
           write_result.push_back(
               BufferStore(broadcast_shared_buf, reduce_results[i], {group_index}));
@@ -395,7 +394,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
       for (size_t i = 0; i < size; ++i) {
         TVM_FFI_ICHECK(!load_remap_.count(buffers[i]->data.get()));
         Buffer buf = reduce_results[i].as_or_throw<BufferLoad>()->buffer;
-        TVM_FFI_ICHECK_EQ(reduce_results[i]->dtype, types[i]);
+        TVM_FFI_ICHECK_EQ(reduce_results[i].ty()->dtype, dtypes[i]);
         load_remap_[buffers[i]->data.get()] = reduce_results[i];
 
         // The AllocBuffer doesn't need to be emitted here since alloc_remap_
@@ -418,20 +417,20 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
       // previous iteration on the same buffer.
       seq.emplace_back(SyncThread("shared"));
       for (size_t idx = 0; idx < size; ++idx) {
-        shared_bufs[idx] = decl_buffer({IntImm(group_index->dtype, group_extent * reduce_extent)},
-                                       types[idx], "red_buf" + std::to_string(idx), "shared");
+        shared_bufs[idx] = decl_buffer({IntImm(group_index.ty(), group_extent * reduce_extent)},
+                                       dtypes[idx], "red_buf" + std::to_string(idx), "shared");
         seq.emplace_back(BufferStore(shared_bufs[idx], values[idx],
                                      {BufIndex(reduce_index, group_index, reduce_extent)}));
       }
       seq.emplace_back(SyncThread("shared"));
-      seq.emplace_back(MakeBufAllreduce(combiner, types, shared_bufs, reduce_index, group_index,
+      seq.emplace_back(MakeBufAllreduce(combiner, dtypes, shared_bufs, reduce_index, group_index,
                                         reduce_extent, group_extent, contiguous_reduce_extent));
       for (size_t idx = 0; idx < size; ++idx) {
         TVM_FFI_ICHECK(!load_remap_.count(buffers[idx]->data.get()));
-        PrimExpr pred = MakeConst(DataType::Bool(types[idx].lanes()), true);
+        PrimExpr pred = MakeConst(PrimType::Bool(static_cast<int16_t>(dtypes[idx].lanes)), true);
         BufferLoad load(shared_bufs[idx],
-                        {BufIndex(IntImm(reduce_index.dtype(), 0), group_index, reduce_extent)});
-        TVM_FFI_ICHECK_EQ(load->dtype, types[idx]);
+                        {BufIndex(IntImm(reduce_index.ty(), 0), group_index, reduce_extent)});
+        TVM_FFI_ICHECK_EQ(load->ty()->dtype, dtypes[idx]);
         load_remap_[buffers[idx]->data.get()] = load;
         alloc_remap_[buffers[idx]->data.get()] = shared_bufs[idx];
         var_remap_[buffers[idx]->data.get()] = shared_bufs[idx]->data;
@@ -455,7 +454,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
 
   std::pair<std::vector<PrimExpr>, std::vector<Buffer>> MakeWarpAllreduce(
       std::vector<PrimExpr> src_values,                  //
-      std::vector<DataType> dtypes,                      //
+      std::vector<DLDataType> dtypes,                    //
       const CommReducerNode* combiner,                   //
       PrimExpr reduce_index, int reduce_extent,          //
       PrimExpr group_index,                              //
@@ -496,7 +495,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
     // active channels.
     ffi::Optional<Buffer> mask_buffer;
     if (need_warp_shuffle_mask_) {
-      mask_buffer = decl_buffer(shape, mask->dtype, "mask", "local");
+      mask_buffer = decl_buffer(shape, mask.ty()->dtype, "mask", "local");
       seq->emplace_back(BufferStore(mask_buffer.value(), mask, zero_indices));
       // Push the buffer description.  Later this will have an
       // allocation built for it.
@@ -514,7 +513,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
       for (int i = 0; i < n_buffers; ++i) {
         Buffer shared_buf = shared_bufs[i];
         BufferLoad val(shared_buf, zero_indices);
-        TVM_FFI_ICHECK_EQ(val->dtype, dtypes[i]);
+        TVM_FFI_ICHECK_EQ(val->ty()->dtype, dtypes[i]);
         a.push_back(val);
 
         // __shfl_*sync calls shall not appear in if_then_else expressions
@@ -535,7 +534,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
         seq->push_back(s);
 
         BufferLoad load = BufferLoad(local_buf, zero_indices);
-        TVM_FFI_ICHECK_EQ(load->dtype, dtypes[i]);
+        TVM_FFI_ICHECK_EQ(load->ty()->dtype, dtypes[i]);
         b.push_back(load);
       }
 
@@ -574,7 +573,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
   }
 
   // make allreduce.
-  Stmt MakeBufAllreduce(const CommReducerNode* combiner, const std::vector<DataType>& types,
+  Stmt MakeBufAllreduce(const CommReducerNode* combiner, const std::vector<DLDataType>& dtypes,
                         const ffi::Array<Buffer>& shared_bufs, PrimExpr reduce_index,
                         PrimExpr group_index, int reduce_extent, int group_extent,
                         int contiguous_reduce_extent) {
@@ -594,11 +593,11 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
       for (size_t i = 0; i < size; ++i) {
         BufferLoad b_load(shared_bufs[i],
                           {BufIndex(reduce_index + offset, group_index, reduce_extent)});
-        TVM_FFI_ICHECK_EQ(b_load->dtype, types[i]);
+        TVM_FFI_ICHECK_EQ(b_load->ty()->dtype, dtypes[i]);
         b.push_back(b_load);
 
         BufferLoad a_load(shared_bufs[i], {buf_index});
-        TVM_FFI_ICHECK_EQ(a_load->dtype, types[i]);
+        TVM_FFI_ICHECK_EQ(a_load->ty()->dtype, dtypes[i]);
         a.push_back(a_load);
       }
       ffi::Array<PrimExpr> ret = (*combiner)(a, b);
@@ -658,7 +657,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
         for (auto expr : loads) {
           Var var(
               "w_" + std::to_string(reduce_align) + "_" + std::to_string(in_warp_local_vars.size()),
-              expr->dtype);
+              expr.ty());
           in_warp_local_vars.push_back(var);
         }
 
@@ -717,7 +716,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
   }
   // sync thread op.
   static Stmt SyncThread(const std::string& sync) {
-    return Evaluate(Call(DataType::Int(32), builtin::tvm_storage_sync(), {StringImm(sync)}));
+    return Evaluate(Call(PrimType::Int(32), builtin::tvm_storage_sync(), {StringImm(sync)}));
   }
 
   // Emit warp shuffle  calls.
@@ -732,14 +731,14 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
     }
     PrimExpr width = IntImm::Int32(warp_size_);
     ffi::Array<PrimExpr> args{mask, val, delta_or_lane, width, width};
-    return Call(val.dtype(), op, args);
+    return Call(val.ty(), op, args);
   }
 
   // Check if we can use warp level reduction.
   //
   // Note: The ROCm backend will only have warp reductions for now.
   // Also, the warp/wavefront size differs (64 on rocm, 32 on cuda and metal).
-  bool IsWarpReduction(const std::vector<DataType>& types, int group_extent, int reduce_extent,
+  bool IsWarpReduction(const std::vector<DLDataType>& dtypes, int group_extent, int reduce_extent,
                        int contiguous_reduce_extent) {
     if ((target_->kind->name != "cuda") && (target_->kind->name != "rocm") &&
         (target_->kind->name != "metal") && (target_->kind->name != "webgpu")) {
@@ -750,19 +749,22 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
 
     // rocm only supports 32 bit operands for shuffling at the moment
     if ((target_->kind->name == "rocm") &&
-        (std::any_of(types.begin(), types.end(), [](DataType ty) {
-          if (ty.is_fixed_length_vector()) return ty.bits() * ty.lanes() != 32;
-          return ty.bits() != 32;
+        (std::any_of(dtypes.begin(), dtypes.end(), [](DLDataType dtype) {
+          int16_t lanes = static_cast<int16_t>(dtype.lanes);
+          if (lanes > 1) return dtype.bits * lanes != 32;
+          return dtype.bits != 32;
         }))) {
       return false;
     }
 
     // Supported types:
     // {u}int, {u}long, {u}long long, float, double, half/half2
-    if (std::any_of(types.begin(), types.end(), [](DataType ty) {
-          if (ty.is_float16()) return ty.lanes() > 2;
-          if (ty.is_fixed_length_vector()) return true;
-          return ty.bytes() < 4 || ty.bytes() > 8;
+    if (std::any_of(dtypes.begin(), dtypes.end(), [](DLDataType dtype) {
+          int16_t lanes = static_cast<int16_t>(dtype.lanes);
+          if (dtype.code == kDLFloat && dtype.bits == 16) return lanes > 2;
+          if (lanes > 1) return true;
+          int bytes = (dtype.bits * lanes + 7) / 8;
+          return bytes < 4 || bytes > 8;
         })) {
       return false;
     }
diff --git a/src/s_tir/transform/lower_vtcm_alloc.cc b/src/s_tir/transform/lower_vtcm_alloc.cc
index cd33d870628f..eb9ecefe7e49 100644
--- a/src/s_tir/transform/lower_vtcm_alloc.cc
+++ b/src/s_tir/transform/lower_vtcm_alloc.cc
@@ -43,9 +43,9 @@ class VtcmAllocator : public StmtExprMutator {
       ffi::Array<PrimExpr> args;
       args.push_back(StringImm(storage_scope));
       args.push_back(IntImm::Int64(op->buffer->shape.size()));
-      args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(), op->buffer->shape));
+      args.push_back(Call(PrimType::Handle(), builtin::tvm_stack_make_shape(), op->buffer->shape));
       return Bind(op->buffer->data,
-                  Call(op->buffer->data.dtype(), builtin::nd_mem_alloc_with_scope(), args));
+                  Call(op->buffer->data.ty(), builtin::nd_mem_alloc_with_scope(), args));
     }
     return StmtExprMutator::VisitStmt_(op);
   }
diff --git a/src/s_tir/transform/memhammer_tensorcore_rewrite.cc b/src/s_tir/transform/memhammer_tensorcore_rewrite.cc
index 0e74dc1d0b17..25988c2637a5 100644
--- a/src/s_tir/transform/memhammer_tensorcore_rewrite.cc
+++ b/src/s_tir/transform/memhammer_tensorcore_rewrite.cc
@@ -105,8 +105,9 @@ ffi::Array<Range> RelaxIndices(const ffi::Array<PrimExpr>& indices,
  */
 Stmt RewriteWmmaLoad(Stmt stmt) {
   using arith::IntSet;
-  const DataType dtype = DataType::Float(16);
-  const DataType int32 = DataType::Int(32);
+  const PrimType dtype_ty = PrimType::Float(16);
+  const DLDataType dtype = dtype_ty->dtype;
+  const PrimType int32_ty = PrimType::Int(32);
 
   Stmt body = stmt;
   std::vector<const ForNode*> loops;
@@ -128,21 +129,21 @@ Stmt RewriteWmmaLoad(Stmt stmt) {
   Buffer tgt_buffer = buf_store->buffer;
   std::string layout = tgt_buffer.scope() == "wmma.matrix_a" ? "row_major" : "col_major";
   Buffer new_src_buffer(
-      /*data=*/Var("src", PointerType(PrimType(dtype), src_buffer.scope())),
+      /*data=*/Var("src", PointerType(dtype_ty, src_buffer.scope())),
       /*dtype=*/dtype,
       /*shape=*/{IntImm::Int32(16), IntImm::Int32(16)},
-      /*strides=*/{Var("s1", int32), Var("s0", int32)},
-      /*elem_offset=*/Var("src_elem_offset", int32),
+      /*strides=*/{Var("s1", int32_ty), Var("s0", int32_ty)},
+      /*elem_offset=*/Var("src_elem_offset", int32_ty),
       /*name=*/"src",
       /*data_alignment=*/64,
       /*offset_factor=*/16,
       /*buffer_type=*/kDefault);
   Buffer new_tgt_buffer(
-      /*data=*/Var("tgt", PointerType(PrimType(dtype), tgt_buffer.scope())),
+      /*data=*/Var("tgt", PointerType(dtype_ty, tgt_buffer.scope())),
       /*dtype=*/dtype,
       /*shape=*/{IntImm::Int32(16), IntImm::Int32(16)},
       /*strides=*/{},
-      /*elem_offset=*/Var("tgt_elem_offset", int32),
+      /*elem_offset=*/Var("tgt_elem_offset", int32_ty),
       /*name=*/"tgt",
       /*data_alignment=*/64,
       /*offset_factor=*/16,
@@ -160,7 +161,7 @@ Stmt RewriteWmmaLoad(Stmt stmt) {
           /*name_hint=*/"wmma_load",
           /*body=*/
           Evaluate(Call(
-              /*data=*/runtime::DataType::Handle(),
+              /*dtype=*/PrimType::Handle(),
               /*op=*/tvm_load_matrix_sync_op,
               {
                   /*0:*/ new_tgt_buffer->data,
@@ -171,7 +172,7 @@ Stmt RewriteWmmaLoad(Stmt stmt) {
                       floordiv(floormod(new_tgt_buffer->elem_offset, 256), 16),
                   /*5:*/
                   Call(
-                      /*dtype=*/runtime::DataType::Handle(),
+                      /*dtype=*/PrimType::Handle(),
                       /*op=*/builtin::tvm_access_ptr(),
                       /*args=*/
                       {
@@ -207,7 +208,7 @@ Stmt RewriteWmmaLoad(Stmt stmt) {
  */
 Stmt RewriteWmmaStore(Stmt stmt) {
   using arith::IntSet;
-  const DataType int32 = DataType::Int(32);
+  const PrimType int32_ty = PrimType::Int(32);
 
   Stmt body = stmt;
   std::vector<const ForNode*> loops;
@@ -236,22 +237,23 @@ Stmt RewriteWmmaStore(Stmt stmt) {
   Buffer src_buffer = buf_load->buffer;
   Buffer tgt_buffer = buf_store->buffer;
 
-  const DataType dtype = src_buffer->dtype;
+  PrimType dtype_ty = src_buffer->dtype;
+  const DLDataType dtype = dtype_ty->dtype;
 
-  Buffer new_src_buffer(/*data=*/Var("src", PointerType(PrimType(dtype), src_buffer.scope())),
+  Buffer new_src_buffer(/*data=*/Var("src", PointerType(dtype_ty, src_buffer.scope())),
                         /*dtype=*/dtype,
                         /*shape=*/{IntImm::Int32(16), IntImm::Int32(16)},
                         /*strides=*/{},
-                        /*elem_offset=*/Var("src_elem_offset", int32),
+                        /*elem_offset=*/Var("src_elem_offset", int32_ty),
                         /*name=*/"src",
                         /*data_alignment=*/64,
                         /*offset_factor=*/16,
                         /*buffer_type=*/kDefault);
-  Buffer new_tgt_buffer(/*data=*/Var("tgt", PointerType(PrimType(dtype), tgt_buffer.scope())),
+  Buffer new_tgt_buffer(/*data=*/Var("tgt", PointerType(dtype_ty, tgt_buffer.scope())),
                         /*dtype=*/dtype,
                         /*shape=*/{IntImm::Int32(16), IntImm::Int32(16)},
-                        /*strides=*/{Var("s1", int32), Var("s0", int32)},
-                        /*elem_offset=*/Var("tgt_elem_offset", int32),
+                        /*strides=*/{Var("s1", int32_ty), Var("s0", int32_ty)},
+                        /*elem_offset=*/Var("tgt_elem_offset", int32_ty),
                         /*name=*/"tgt",
                         /*data_alignment=*/64,
                         /*offset_factor=*/16,
@@ -268,7 +270,7 @@ Stmt RewriteWmmaStore(Stmt stmt) {
              /*writes=*/{BufferRegion(tgt_buffer, write_region)},
              /*name_hint=*/"wmma_store",
              Evaluate(Call(
-                 /*data=*/runtime::DataType::Handle(),
+                 /*dtype=*/PrimType::Handle(),
                  /*op=*/tvm_store_matrix_sync_op,
                  {/*0:*/ new_src_buffer->data,
                   /*1:*/ 16,
@@ -278,7 +280,7 @@ Stmt RewriteWmmaStore(Stmt stmt) {
                       floordiv(floormod(new_src_buffer->elem_offset, 256), 16),
                   /*5:*/
                   Call(
-                      /*data=*/runtime::DataType::Handle(),
+                      /*dtype=*/PrimType::Handle(),
                       /*op=*/builtin::tvm_access_ptr(),
                       {
                           /*0:*/ TypeAnnotation(new_tgt_buffer->dtype),
@@ -418,7 +420,7 @@ std::pair<Stmt, ffi::Optional<For>> TileMmaToGlobalBlock(Stmt stmt) {
  */
 Stmt RewriteMmaStore(Stmt stmt) {
   using arith::IntSet;
-  const DataType int32 = DataType::Int(32);
+  const PrimType int32_ty = PrimType::Int(32);
 
   // Step 1. Get inner loop body
   Stmt body = stmt;
@@ -458,21 +460,22 @@ Stmt RewriteMmaStore(Stmt stmt) {
   // Step 3.1. Generate new buffer
   Buffer src_buffer = buf_load->buffer;
   Buffer tgt_buffer = buf_store->buffer;
-  const DataType dtype = src_buffer->dtype;
-  Buffer new_src_buffer(/*data=*/Var("src", PointerType(PrimType(dtype), src_buffer.scope())),
+  PrimType dtype_ty = src_buffer->dtype;
+  const DLDataType dtype = dtype_ty->dtype;
+  Buffer new_src_buffer(/*data=*/Var("src", PointerType(dtype_ty, src_buffer.scope())),
                         /*dtype=*/dtype,
                         /*shape=*/{IntImm::Int32(8), IntImm::Int32(8)},
                         /*strides=*/{},
-                        /*elem_offset=*/Var("src_elem_offset", int32),
+                        /*elem_offset=*/Var("src_elem_offset", int32_ty),
                         /*name=*/"src",
                         /*data_alignment=*/64,
                         /*offset_factor=*/8,
                         /*buffer_type=*/kDefault);
-  Buffer new_tgt_buffer(/*data=*/Var("tgt", PointerType(PrimType(dtype), tgt_buffer.scope())),
+  Buffer new_tgt_buffer(/*data=*/Var("tgt", PointerType(dtype_ty, tgt_buffer.scope())),
                         /*dtype=*/dtype,
                         /*shape=*/{IntImm::Int32(8), IntImm::Int32(8)},
-                        /*strides=*/{Var("s1", int32), Var("s0", int32)},
-                        /*elem_offset=*/Var("tgt_elem_offset", int32),
+                        /*strides=*/{Var("s1", int32_ty), Var("s0", int32_ty)},
+                        /*elem_offset=*/Var("tgt_elem_offset", int32_ty),
                         /*name=*/"tgt",
                         /*data_alignment=*/64,
                         /*offset_factor=*/8,
diff --git a/src/s_tir/transform/merge_shared_memory_allocations.cc b/src/s_tir/transform/merge_shared_memory_allocations.cc
index c28f6b01c801..89d472087331 100644
--- a/src/s_tir/transform/merge_shared_memory_allocations.cc
+++ b/src/s_tir/transform/merge_shared_memory_allocations.cc
@@ -338,9 +338,9 @@ class SharedMemoryRewriter : public StmtExprMutator {
    */
   Var MakeMergedBufferVar() {
     if (is_dynamic_) {
-      return Var("buf_dyn_shmem", PointerType(PrimType(DataType::UInt(8)), "shared.dyn"));
+      return Var("buf_dyn_shmem", PointerType(PrimType::UInt(8), "shared.dyn"));
     } else {
-      return Var("buf_shmem", PointerType(PrimType(DataType::UInt(8)), "shared"));
+      return Var("buf_shmem", PointerType(PrimType::UInt(8), "shared"));
     }
   }
 
@@ -390,8 +390,9 @@ class SharedMemoryRewriter : public StmtExprMutator {
       }
 
       // 7. Wrap with the merged-buffer AllocBuffer.
-      Buffer merged_buf(scope.merged_buf_var, DataType::UInt(8), {scope.merged_alloc_size}, {},
-                        PrimExpr(), scope.merged_buf_var->name_hint, 0, 0, BufferType::kDefault);
+      Buffer merged_buf(scope.merged_buf_var, DLDataType{kDLUInt, 8, 1}, {scope.merged_alloc_size},
+                        {}, PrimExpr(), scope.merged_buf_var->name_hint, 0, 0,
+                        BufferType::kDefault);
       ffi::Map<ffi::String, ffi::Any> annotations;
       if (scope.has_volatile_alloc) {
         annotations.Set(tirx::attr::kVolatile, true);
@@ -451,7 +452,7 @@ class SharedMemoryRewriter : public StmtExprMutator {
           << "and is to be run after "
           << "FlattenBuffer";
       ffi::Array<PrimExpr> indices = {
-          node->indices[0] + this->GetBufferOffset(node->buffer->data, node->buffer->dtype)};
+          node->indices[0] + this->GetBufferOffset(node->buffer->data, node->buffer->dtype->dtype)};
 
       auto writer = node.CopyOnWrite();
       writer->buffer = GetUpdatedBuffer(node->buffer);
@@ -490,7 +491,7 @@ class SharedMemoryRewriter : public StmtExprMutator {
     static const Op& ptx_cp_async_op = Op::Get("tirx.ptx.cp_async_raw");
     if (op->op.same_as(builtin::tvm_access_ptr())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 5U);
-      DataType dtype = op->args[0].dtype();
+      DLDataType dtype = op->args[0].ty()->dtype;
       Var buffer = op->args[1].as_or_throw<Var>();
       if (!IsAppropriateSharedMemory(buffer) || scope_stack_.empty() ||
           !scope_stack_.back().shmem_allocs.count(buffer.get())) {
@@ -500,7 +501,7 @@ class SharedMemoryRewriter : public StmtExprMutator {
 
       PrimExpr offset = this->VisitExpr(op->args[2]);
       PrimExpr extent = this->VisitExpr(op->args[3]);
-      return Call(op->dtype, op->op,
+      return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op,
                   {op->args[0], scope_stack_.back().merged_buf_var, extra_offset + offset, extent,
                    op->args[4]});
     } else if (op->op.same_as(ptx_cp_async_op)) {
@@ -510,7 +511,7 @@ class SharedMemoryRewriter : public StmtExprMutator {
       TVM_FFI_ICHECK(ptr_type) << "The buffer should be a pointer type.";
       const auto* prim_type = ptr_type->element_type.as<PrimTypeNode>();
       TVM_FFI_ICHECK(prim_type) << "The buffer should be a pointer to a primitive type.";
-      DataType dtype = DataType(prim_type->dtype);
+      DLDataType dtype = prim_type->dtype;
       if (!IsAppropriateSharedMemory(buffer) || scope_stack_.empty() ||
           !scope_stack_.back().shmem_allocs.count(buffer.get())) {
         return StmtExprMutator::VisitExpr_(op);
@@ -520,15 +521,15 @@ class SharedMemoryRewriter : public StmtExprMutator {
       // the dst shared memory is a byte buffer generated by merging shared memory.
       // we need to multiply the offset index by the byte size of the original value dtype, to get
       // the correct offset of merged shared buffer.
-      int index_factor = dtype.bytes();
+      int index_factor = (static_cast<int>(dtype.bits) * static_cast<int>(dtype.lanes) + 7) / 8;
       if (op->args.size() == 5)
         return Call(
-            dtype, op->op,
+            ffi::GetRef<PrimExpr>(op).ty(), op->op,
             {scope_stack_.back().merged_buf_var, mul(extra_offset + offset, PrimExpr(index_factor)),
              op->args[2], op->args[3], op->args[4]});
       else
         return Call(
-            dtype, op->op,
+            ffi::GetRef<PrimExpr>(op).ty(), op->op,
             {scope_stack_.back().merged_buf_var, mul(extra_offset + offset, PrimExpr(index_factor)),
              op->args[2], op->args[3], op->args[4], op->args[5]});
     } else {
@@ -536,12 +537,13 @@ class SharedMemoryRewriter : public StmtExprMutator {
     }
   }
 
-  PrimExpr GetBufferOffset(Var buffer_var, DataType dtype) {
+  PrimExpr GetBufferOffset(Var buffer_var, DLDataType dtype) {
     TVM_FFI_ICHECK(!scope_stack_.empty());
     KernelScope& scope = scope_stack_.back();
     auto it = scope.buffer_byte_offsets.find(buffer_var.get());
     TVM_FFI_ICHECK(it != scope.buffer_byte_offsets.end());
-    return indexdiv(it->second, dtype.bytes());
+    const int elem_bits = static_cast<int>(dtype.bits) * static_cast<int>(dtype.lanes);
+    return indexdiv(it->second, (elem_bits + 7) / 8);  // element size, rounded up to whole bytes
   }
 
   // Wrapper function to determine if the shared memory allocation for a variable is appropriate.
@@ -646,7 +648,8 @@ class SharedMemoryRewriter : public StmtExprMutator {
       for (int i = 0; i < static_cast<int>(e->allocs.size()); i++) {
         for (const VarNode* buffer : e->allocs[i]) {
           const Buffer& buf = scope.shmem_allocs.at(buffer);
-          align[i] = std::max(align[i], buf->dtype.bytes());
+          int elem_bytes = static_cast<int>(buf->dtype.StorageBytes());
+          align[i] = std::max(align[i], elem_bytes);
         }
       }
     }
@@ -658,13 +661,14 @@ class SharedMemoryRewriter : public StmtExprMutator {
         for (const VarNode* buffer : e->allocs[i]) {
           const Buffer& buf = scope.shmem_allocs.at(buffer);
           ffi::Array<PrimExpr> alloc_shape = GetBufferAllocationShape(buf);
-          int align_bytes = std::max(align[i], buf->dtype.bytes());
+          int elem_bytes = static_cast<int>(buf->dtype.StorageBytes());
+          int align_bytes = std::max(align[i], elem_bytes);
           if (buf->data_alignment > 0) {
             TVM_FFI_ICHECK(buf->data_alignment % align_bytes == 0)
                 << "The alignment of the buffer is not a multiple of the data type size.";
             align_bytes = buf->data_alignment;
           }
-          PrimExpr buffer_bytes = alloc_shape[0] * buf->dtype.bytes();
+          PrimExpr buffer_bytes = alloc_shape[0] * elem_bytes;
           inner_offset +=
               indexmod(align_bytes - indexmod(scope.merged_alloc_size + inner_offset, align_bytes),
                        align_bytes);
@@ -702,7 +706,8 @@ class SharedMemoryRewriter : public StmtExprMutator {
     // compiler can do a better job with register allocation.
     const uint64_t match_range = 16;
     ffi::Array<PrimExpr> alloc_shape = GetBufferAllocationShape(buf);
-    uint64_t op_elem_bits = buf->dtype.bits() * buf->dtype.lanes();
+    DLDataType dtype = buf->dtype->dtype;
+    uint64_t op_elem_bits = static_cast<uint64_t>(dtype.bits) * dtype.lanes;
     uint64_t const_nbits =
         static_cast<uint64_t>(ConstantAllocationSize(alloc_shape) * op_elem_bits);
     // disable reuse of small arrays, they will be lowered to registers in LLVM
diff --git a/src/s_tir/transform/profile_instrumentation.cc b/src/s_tir/transform/profile_instrumentation.cc
index 28b325ca9c60..c3af852e46a3 100644
--- a/src/s_tir/transform/profile_instrumentation.cc
+++ b/src/s_tir/transform/profile_instrumentation.cc
@@ -203,8 +203,8 @@ class InstrumentIntrin : public StmtMutator {
       return stmt;
     }
     PrimExpr id = static_cast<int32_t>(loop_info.id);
-    PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {id});
-    PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {id});
+    PrimExpr start_call = Call(PrimType::Handle(), builtin::start_profile_intrinsic(), {id});
+    PrimExpr end_call = Call(PrimType::Handle(), builtin::end_profile_intrinsic(), {id});
     const Stmt start_profile = Evaluate(start_call);
     const Stmt end_profile = Evaluate(end_call);
     Stmt new_stmt = SeqStmt({start_profile, stmt, end_profile});
@@ -243,8 +243,8 @@ PrimFunc AddProfileBuiltins(PrimFunc func, int32_t max_instr_depth, int32_t min_
 
   PrimExpr e = start_id++;
   if (!disable_func_instrumentation) {
-    PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {e});
-    PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {e});
+    PrimExpr start_call = Call(PrimType::Handle(), builtin::start_profile_intrinsic(), {e});
+    PrimExpr end_call = Call(PrimType::Handle(), builtin::end_profile_intrinsic(), {e});
     const Stmt start_profile = Evaluate(start_call);
     const Stmt end_profile = Evaluate(end_call);
     func_ptr->body = SeqStmt({start_profile, std::move(func_ptr->body), end_profile});
diff --git a/src/s_tir/transform/renew_defs.cc b/src/s_tir/transform/renew_defs.cc
index f192d6a416a9..499124756542 100644
--- a/src/s_tir/transform/renew_defs.cc
+++ b/src/s_tir/transform/renew_defs.cc
@@ -54,7 +54,7 @@ class RenewDefMutator : public StmtExprMutator {
       params.push_back(generator.ReDefineVar(param));
     }
     for (const auto& param : func->params) {
-      if (param->dtype.is_handle()) {
+      if (param->ty().IsHandle()) {
         const Buffer& buffer = func->buffer_map.at(param);
         for (const PrimExpr& e : buffer->shape) {
           if (const auto* v = e.as<VarNode>()) {
@@ -69,7 +69,7 @@ class RenewDefMutator : public StmtExprMutator {
     // TODO(Siyuan Feng): checking var is used after define
     ffi::Map<tirx::Var, Buffer> buffer_map;
     for (const auto& param : func->params) {
-      if (param->dtype.is_handle()) {
+      if (param->ty().IsHandle()) {
         const Buffer& buffer = func->buffer_map.at(param);
         Var new_param = generator.VisitExpr(param).as_or_throw<Var>();
         Buffer new_buffer = generator.DefineBuffer(buffer);
diff --git a/src/s_tir/transform/renormalize_split_pattern.cc b/src/s_tir/transform/renormalize_split_pattern.cc
index 2fbadfabd4c9..83fcb62e8ccf 100644
--- a/src/s_tir/transform/renormalize_split_pattern.cc
+++ b/src/s_tir/transform/renormalize_split_pattern.cc
@@ -83,8 +83,8 @@ class SplitPatternReNormalizer : public IRMutatorWithAnalyzer {
       if (c1_val > 0 && c2_val > 0) {
         int64_t c3 = ZeroAwareGCD(c1_val, c2_val);
         if (c3 > 1) {
-          IntImm c1_div = IntImm(c1.Eval().dtype(), c1_val / c3);
-          IntImm c2_div = IntImm(c2.Eval().dtype(), c2_val / c3);
+          IntImm c1_div = IntImm(c1.Eval().ty(), c1_val / c3);
+          IntImm c2_div = IntImm(c2.Eval().ty(), c2_val / c3);
           return RecursiveRewrite(floordiv(x.Eval() * c1_div + floordiv(y.Eval(), c3), c2_div));
         }
       }
@@ -95,12 +95,12 @@ class SplitPatternReNormalizer : public IRMutatorWithAnalyzer {
       if (c1_val > 0 && c2_val > 0) {
         int64_t c3 = ZeroAwareGCD(c1_val, c2_val);
         if (c3 > 1) {
-          IntImm c1_div = IntImm(c1.Eval().dtype(), c1_val / c3);
-          IntImm c2_div = IntImm(c2.Eval().dtype(), c2_val / c3);
-          return RecursiveRewrite(floordiv(
-              x.Eval() * Broadcast(c1_div, lanes.Eval()) +
-                  floordiv(y.Eval(), Broadcast(IntImm(c1.Eval().dtype(), c3), lanes.Eval())),
-              Broadcast(c2_div, lanes.Eval())));
+          IntImm c1_div = IntImm(c1.Eval().ty(), c1_val / c3);
+          IntImm c2_div = IntImm(c2.Eval().ty(), c2_val / c3);
+          return RecursiveRewrite(
+              floordiv(x.Eval() * Broadcast(c1_div, lanes.Eval()) +
+                           floordiv(y.Eval(), Broadcast(IntImm(c1.Eval().ty(), c3), lanes.Eval())),
+                       Broadcast(c2_div, lanes.Eval())));
         }
       }
     }
@@ -112,8 +112,8 @@ class SplitPatternReNormalizer : public IRMutatorWithAnalyzer {
       if (c1_val > 0 && c2_val > 0) {
         int64_t c3 = ZeroAwareGCD(c1_val, c2_val);
         if (c3 > 1) {
-          IntImm c1_div = IntImm(c1.Eval().dtype(), c1_val / c3);
-          IntImm c2_div = IntImm(c2.Eval().dtype(), c2_val / c3);
+          IntImm c1_div = IntImm(c1.Eval().ty(), c1_val / c3);
+          IntImm c2_div = IntImm(c2.Eval().ty(), c2_val / c3);
           return RecursiveRewrite(
               floordiv(x.Eval() * c1_div + floordiv(y.Eval() + z.Eval(), c3), c2_div));
         }
@@ -125,12 +125,12 @@ class SplitPatternReNormalizer : public IRMutatorWithAnalyzer {
       if (c1_val > 0 && c2_val > 0) {
         int64_t c3 = ZeroAwareGCD(c1_val, c2_val);
         if (c3 > 1) {
-          IntImm c1_div = IntImm(c1.Eval().dtype(), c1_val / c3);
-          IntImm c2_div = IntImm(c2.Eval().dtype(), c2_val / c3);
+          IntImm c1_div = IntImm(c1.Eval().ty(), c1_val / c3);
+          IntImm c2_div = IntImm(c2.Eval().ty(), c2_val / c3);
           return RecursiveRewrite(
               floordiv(x.Eval() * Broadcast(c1_div, lanes.Eval()) +
                            floordiv(y.Eval() + z.Eval(),
-                                    Broadcast(IntImm(c1.Eval().dtype(), c3), lanes.Eval())),
+                                    Broadcast(IntImm(c1.Eval().ty(), c3), lanes.Eval())),
                        Broadcast(c2_div, lanes.Eval())));
         }
       }
diff --git a/src/s_tir/transform/rewrite_unsafe_select.cc b/src/s_tir/transform/rewrite_unsafe_select.cc
index 8a0c3f1b4bd3..38a60ae81933 100644
--- a/src/s_tir/transform/rewrite_unsafe_select.cc
+++ b/src/s_tir/transform/rewrite_unsafe_select.cc
@@ -117,10 +117,11 @@ class UnsafeSelectRewriter : public StmtExprMutator {
     PrimExpr expr = StmtExprMutator::VisitExpr_(op);
     op = expr.as<SelectNode>();
     UnsafeExprDetector unsafe;
-    bool cond_is_scalar_bool = op->condition.dtype().is_bool() && op->condition.dtype().is_scalar();
+    PrimType cond_ty = op->condition.ty();
+    bool cond_is_scalar_bool = cond_ty.MatchesCode(DLDataTypeCode::kDLBool) && cond_ty.IsScalar();
     if ((unsafe.VisitExpr(op->true_value) || unsafe.VisitExpr(op->false_value)) &&
         cond_is_scalar_bool) {
-      return Call(op->dtype, builtin::if_then_else(),
+      return Call(ffi::GetRef<PrimExpr>(op).ty(), builtin::if_then_else(),
                   {op->condition, op->true_value, op->false_value});
     } else {
       return expr;
diff --git a/src/s_tir/transform/storage_access.cc b/src/s_tir/transform/storage_access.cc
index 0a347abb71c7..d4dddbde6243 100644
--- a/src/s_tir/transform/storage_access.cc
+++ b/src/s_tir/transform/storage_access.cc
@@ -43,7 +43,8 @@ void StorageAccessVisitor::VisitExpr_(const BufferLoadNode* op) {
     AccessEntry e;
     e.threads = env_threads();
     e.buffer = buf;
-    e.dtype = op->dtype.element_of();
+    e.dtype = op->ty()->dtype;
+    e.dtype.lanes = 1;
     for (const auto& index : op->indices) {
       e.touched.push_back(arith::IntSet::Vector(index));
     }
@@ -66,7 +67,8 @@ void StorageAccessVisitor::VisitStmt_(const BufferStoreNode* op) {
     AccessEntry e;
     e.threads = env_threads();
     e.buffer = buf;
-    e.dtype = op->value.dtype().element_of();
+    e.dtype = op->value.ty()->dtype;
+    e.dtype.lanes = 1;
     for (const auto& index : op->indices) {
       e.touched.push_back(arith::IntSet::Vector(index));
     }
@@ -240,7 +242,7 @@ void StorageAccessVisitor::VisitExpr_(const CallNode* op) {
     StmtExprVisitor::VisitExpr_(load);
   } else if (op->op.same_as(builtin::tvm_access_ptr())) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 5U);
-    DataType dtype = op->args[0].dtype();
+    DLDataType dtype = op->args[0].ty()->dtype;
     const VarNode* buffer = op->args[1].as<VarNode>();
     if (buffer == nullptr) {
       // args[1] is not a raw Var — e.g. a nested tvm_access_ptr or some
diff --git a/src/s_tir/transform/storage_access.h b/src/s_tir/transform/storage_access.h
index d85dc5a3c3ae..b3bb8df1801b 100644
--- a/src/s_tir/transform/storage_access.h
+++ b/src/s_tir/transform/storage_access.h
@@ -61,7 +61,7 @@ class StorageAccessVisitor : public StmtExprVisitor {
     /*! \brief The buffer variable, if any */
     Var buffer = Var(ffi::ObjectPtr<VarNode>(nullptr));
     /*! \brief The access data type */
-    DataType dtype;
+    DLDataType dtype;
     /*! \brief The touched access range
      *
      * Has one IntSet for each index in the buffer being accessed.
diff --git a/src/s_tir/transform/thread_storage_sync.cc b/src/s_tir/transform/thread_storage_sync.cc
index 254a2d72e36e..3d7122fc821f 100644
--- a/src/s_tir/transform/thread_storage_sync.cc
+++ b/src/s_tir/transform/thread_storage_sync.cc
@@ -293,7 +293,7 @@ class ThreadSyncAfterWaitQueueInserter : public StmtExprMutator {
 
   Stmt VisitStmt_(const AttrStmtNode* op) final {
     if (op->attr_key == s_tir::attr::async_wait_queue_scope) {
-      auto sync = Evaluate(Call(DataType::Int(32), builtin::tvm_storage_sync(),
+      auto sync = Evaluate(Call(PrimType::Int(32), builtin::tvm_storage_sync(),
                                 {StringImm(sync_scope_.to_string())}));
       auto inner = op->body.as<AttrStmtNode>();
       TVM_FFI_ICHECK(inner && inner->attr_key == s_tir::attr::async_wait_inflight_count);
@@ -318,7 +318,7 @@ class ThreadSyncInserter : public StmtExprMutator {
   Stmt VisitStmt(const Stmt& stmt) final {
     if (syncs_.size() == 0) return stmt;
     if (syncs_.count(stmt.get())) {
-      Stmt barrier = Evaluate(Call(DataType::Int(32), builtin::tvm_storage_sync(),
+      Stmt barrier = Evaluate(Call(PrimType::Int(32), builtin::tvm_storage_sync(),
                                    {StringImm(sync_scope_.to_string())}));
       // Mutate after query, to avoid stmt change.
       auto ret = StmtExprMutator::VisitStmt(stmt);
diff --git a/src/s_tir/transform/unify_thread_binding.cc b/src/s_tir/transform/unify_thread_binding.cc
index c3c0b5a170c9..d3a32d8fd17f 100644
--- a/src/s_tir/transform/unify_thread_binding.cc
+++ b/src/s_tir/transform/unify_thread_binding.cc
@@ -55,7 +55,7 @@ class ThreadBindingUnifier : public StmtExprMutator {
     }
     IterVar old_iter_var = op->node.as_or_throw<IterVar>();
     return UnifyThreadBindingImpl(op, old_iter_var->var, old_iter_var,
-                                  Range::FromMinExtent(IntImm(op->value->dtype, 0), op->value));
+                                  Range::FromMinExtent(IntImm(op->value.ty(), 0), op->value));
   }
 
   Stmt VisitStmt_(const ForNode* op) final {
@@ -76,12 +76,12 @@ class ThreadBindingUnifier : public StmtExprMutator {
 
     } else {
       // Create a new unit loop with the annotation.
-      DataType dtype = op->loop_var->dtype;
-      return For(/*loop_var=*/Var("var", dtype),   //
-                 /*min=*/IntImm(dtype, 0),         //
-                 /*extent=*/IntImm(dtype, 1),      //
-                 /*kind=*/ForKind::kSerial, stmt,  //
-                 /*thread_binding=*/std::nullopt,  //
+      PrimType loop_ty = op->loop_var.ty();
+      return For(/*loop_var=*/Var("var", loop_ty),  //
+                 /*min=*/IntImm(loop_ty, 0),        //
+                 /*extent=*/IntImm(loop_ty, 1),     //
+                 /*kind=*/ForKind::kSerial, stmt,   //
+                 /*thread_binding=*/std::nullopt,   //
                  /*annotation=*/std::move(annotations),
                  /*step=*/std::nullopt);
     }
@@ -121,7 +121,7 @@ class ThreadBindingUnifier : public StmtExprMutator {
           << "` should have the same extent. However, there are two loops with extent "
           << new_iter_var->dom->extent << " and " << dom->extent << ", which are not equal";
     } else {
-      new_iter_var = IterVar(dom, Var(thread_tag, dom->extent.dtype()), old_iter_var->iter_type,
+      new_iter_var = IterVar(dom, Var(thread_tag, dom->extent.ty()), old_iter_var->iter_type,
                              old_iter_var->thread_tag);
       thread_tag2iter_var_map_.Set(thread_tag, new_iter_var);
       launch_threads_.push_back(new_iter_var);
@@ -130,7 +130,7 @@ class ThreadBindingUnifier : public StmtExprMutator {
     // Step 4. We will substitute the occurrences of the old variable in the old IterVar with the
     // new variable in further mutation. Thus, we store the mapping entry. Cast to old dtype if
     // needed (we assume both old and new dtype are valid for the range of the thread extent).
-    var_substitution_map_.Set(old_var, cast(old_var.dtype(), new_iter_var->var));
+    var_substitution_map_.Set(old_var, cast(old_var.ty(), new_iter_var->var));
 
     // Step 5. Mutate recursively, update the body with the new IterVar, and restore the depth
     // counter. Emit for-loops to launch threads if current statement is the outermost thread
diff --git a/src/script/printer/doc_printer/python_doc_printer.cc b/src/script/printer/doc_printer/python_doc_printer.cc
index 295b3e20e4e3..55da056f407a 100644
--- a/src/script/printer/doc_printer/python_doc_printer.cc
+++ b/src/script/printer/doc_printer/python_doc_printer.cc
@@ -323,7 +323,8 @@ void PythonDocPrinter::PrintTypedDoc(const LiteralDoc& doc) {
   if (value == nullptr) {
     output_ << "None";
   } else if (const auto* int_imm = value.as<IntImmNode>()) {
-    if (int_imm->dtype.is_bool()) {
+    PrimType int_ty = int_imm->ty();
+    if (int_ty.MatchesCode(DLDataTypeCode::kDLBool)) {
       output_ << (int_imm->value ? "True" : "False");
     } else {
       output_ << int_imm->value;
diff --git a/src/script/printer/ir/distributed.cc b/src/script/printer/ir/distributed.cc
index f748f4e9bd6b..a2840d60e4e9 100644
--- a/src/script/printer/ir/distributed.cc
+++ b/src/script/printer/ir/distributed.cc
@@ -16,6 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include <tvm/ffi/container/shape.h>
 #include <tvm/ir/expr.h>
 
 #include "./utils.h"
diff --git a/src/script/printer/script_printer.cc b/src/script/printer/script_printer.cc
index c0d4b88b3107..d46b061401c2 100644
--- a/src/script/printer/script_printer.cc
+++ b/src/script/printer/script_printer.cc
@@ -70,13 +70,13 @@ PrinterConfig::PrinterConfig(ffi::Map<ffi::String, Any> config_dict) {
     n->module_alias = v.value().as_or_throw<ffi::String>();
   }
   if (auto v = config_dict.Get("buffer_dtype")) {
-    n->buffer_dtype = DataType(ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>()));
+    n->buffer_dtype = ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>());
   }
   if (auto v = config_dict.Get("int_dtype")) {
-    n->int_dtype = DataType(ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>()));
+    n->int_dtype = ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>());
   }
   if (auto v = config_dict.Get("float_dtype")) {
-    n->float_dtype = DataType(ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>()));
+    n->float_dtype = ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>());
   }
   if (auto v = config_dict.Get("verbose_expr")) {
     n->verbose_expr = v.value().cast<bool>();
diff --git a/src/script/printer/utils.h b/src/script/printer/utils.h
index b30a26e15686..99e98556b6c6 100644
--- a/src/script/printer/utils.h
+++ b/src/script/printer/utils.h
@@ -114,8 +114,10 @@ inline ExprDoc Relax(const IRDocsifier& d, const ffi::String& attr) {
   return IdDoc(d->cfg->GetExtraConfig<ffi::String>("relax.prefix", "R"))->Attr(attr);
 }
 
-inline std::string DType2Str(const runtime::DataType& dtype) {
-  return dtype.is_void() ? "void" : ffi::DLDataTypeToString(dtype);
+inline std::string DType2Str(DLDataType dtype) {
+  // Void is spelled as an opaque handle with zero bits and zero lanes.
+  const bool is_void = dtype.code == kDLOpaqueHandle && dtype.bits == 0 && dtype.lanes == 0;
+  return is_void ? "void" : ffi::DLDataTypeToString(dtype);
 }
 
 /*! \brief Add headers as comments to doc if needed */
diff --git a/src/target/build_common.h b/src/target/build_common.h
index 4ad5e9434449..e2b24bf1174d 100644
--- a/src/target/build_common.h
+++ b/src/target/build_common.h
@@ -50,7 +50,7 @@ inline ffi::Map<ffi::String, runtime::FunctionInfo> ExtractFuncInfo(const IRModu
     ffi::Array<DLDataType> arg_types;
     ffi::Array<runtime::ArgExtraTags> arg_extra_tags;
     for (size_t i = 0; i < f->params.size(); ++i) {
-      arg_types.push_back(f->params[i].dtype());
+      arg_types.push_back(f->params[i].ty()->dtype);
       auto is_tensormap = [](const tirx::Var& var) -> bool {
         const auto* type = var->type_annotation.as<PointerTypeNode>();
         if (type == nullptr) {
diff --git a/src/target/intrin_rule.cc b/src/target/intrin_rule.cc
index 300d9c00544e..1729ec1c95f9 100644
--- a/src/target/intrin_rule.cc
+++ b/src/target/intrin_rule.cc
@@ -128,18 +128,19 @@ TVM_REGISTER_OP("tirx.tvm_access_ptr")
       const CallNode* call = e.as<CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       TVM_FFI_ICHECK_EQ(call->args.size(), 5U);
-      DataType dtype = call->args[0].dtype();
+      PrimType dtype = call->args[0].ty();
       Var buffer_var = call->args[1].as_or_throw<Var>();
       PrimExpr offset = call->args[2];
-      TVM_FFI_ICHECK(call->dtype.is_handle());
+      TVM_FFI_ICHECK(call->ty().IsHandle());
       if (dtype.lanes() != 1) {
-        offset = offset * MakeConst(offset.dtype(), dtype.lanes());
-        offset = Ramp(offset, MakeConst(offset.dtype(), 1), dtype.lanes());
+        PrimType offset_ty = offset.ty();
+        offset = offset * MakeConst(offset_ty, dtype.lanes());
+        offset = Ramp(offset, MakeConst(offset_ty, 1), dtype.lanes());
       }
-      Buffer dummy_buf(buffer_var, dtype.element_of(), {offset + 1}, {}, 0, buffer_var->name_hint,
+      Buffer dummy_buf(buffer_var, dtype.WithLanes(1), {offset + 1}, {}, 0, buffer_var->name_hint,
                        0, 0, kDefault);
       BufferLoad buf_load(dummy_buf, {offset});
-      return Call(DataType::Handle(), builtin::address_of(), {buf_load});
+      return Call(PrimType::Handle(), builtin::address_of(), {buf_load});
     });
 
 PrimExpr DispatchFastErf(const PrimExpr& e) {
@@ -148,9 +149,10 @@ PrimExpr DispatchFastErf(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   TVM_FFI_ICHECK_EQ(call->args.size(), 1);
   PrimExpr arg = call->args[0];
-  int bits = arg.dtype().bits();
+  PrimType arg_ty = arg.ty();
+  int bits = arg_ty.bits();
   PrimExpr res;
-  if (arg.dtype().is_float() && (bits == 16 || bits == 32)) {
+  if (arg_ty.code() == DLDataTypeCode::kDLFloat && (bits == 16 || bits == 32)) {
     res = fast_erf_float_expr(arg, bits);
   } else {
     TVM_FFI_THROW(InternalError) << "Unsupported type in Metal fast_erf";
@@ -163,9 +165,10 @@ PrimExpr DispatchNumericalStableTanh(const PrimExpr& e) {
   const tirx::CallNode* call = e.as<tirx::CallNode>();
   TVM_FFI_ICHECK(call != nullptr);
   const PrimExpr& x = call->args[0];
-  PrimExpr one = MakeConst(x.dtype(), 1);
-  PrimExpr two = MakeConst(x.dtype(), 2);
-  PrimExpr neg_two = MakeConst(x.dtype(), -2);
+  PrimType x_ty = x.ty();
+  PrimExpr one = MakeConst(x_ty, 1);
+  PrimExpr two = MakeConst(x_ty, 2);
+  PrimExpr neg_two = MakeConst(x_ty, -2);
 
   PrimExpr exp_neg2x = exp(neg_two * x);
   PrimExpr exp_pos2x = exp(two * x);
@@ -173,7 +176,7 @@ PrimExpr DispatchNumericalStableTanh(const PrimExpr& e) {
   PrimExpr tanh_pos = (one - exp_neg2x) / (one + exp_neg2x);
   PrimExpr tanh_neg = (exp_pos2x - one) / (exp_pos2x + one);
   // MakeConst can handle both vector and scalar types.
-  return tirx::Select(x >= MakeConst(x.dtype(), 0), tanh_pos, tanh_neg);
+  return tirx::Select(x >= MakeConst(x_ty, 0), tanh_pos, tanh_neg);
 }
 
 }  // namespace intrin
@@ -186,7 +189,7 @@ TVM_REGISTER_OP("tirx.rsqrt")
     .set_attr<FLegalize>("default.FLegalize", [](const PrimExpr& e) -> PrimExpr {
       const CallNode* call = e.as<CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
-      auto one = MakeConst(call->args[0].dtype(), 1);
+      auto one = MakeConst(call->args[0].ty(), 1);
       return one / sqrt(call->args[0]);
     });
 
@@ -194,7 +197,7 @@ TVM_REGISTER_OP("tirx.sigmoid")
     .set_attr<FLegalize>("default.FLegalize", [](const PrimExpr& e) -> PrimExpr {
       const CallNode* call = e.as<CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
-      auto one = MakeConst(call->args[0].dtype(), 1);
+      auto one = MakeConst(call->args[0].ty(), 1);
       return one / (one + exp(-call->args[0]));
     });
 
@@ -226,14 +229,19 @@ TVM_REGISTER_OP("tirx.isinf")
 static PrimExpr QMultiplyShift(PrimExpr x, PrimExpr y, PrimExpr q, PrimExpr left_shift,
                                PrimExpr right_shift, PrimExpr is_left_shift_required) {
   // Only int32 types are supported (any number of lanes is allowed)
-  TVM_FFI_ICHECK(y.dtype().code() == DLDataTypeCode::kDLInt && y.dtype().bits() == 32);
-  TVM_FFI_ICHECK(left_shift.dtype().code() == DLDataTypeCode::kDLInt &&
-                 left_shift.dtype().bits() == 32);
-  TVM_FFI_ICHECK(right_shift.dtype().code() == DLDataTypeCode::kDLInt &&
-                 right_shift.dtype().bits() == 32);
-
-  DataType hp_dtype = DataType::Int(64, x.dtype().lanes());
-  DataType lp_dtype = DataType::Int(32, x.dtype().lanes());
+  TVM_FFI_ICHECK(y.ty().MatchesElementType(DLDataTypeCode::kDLInt, 32));
+  TVM_FFI_ICHECK(left_shift.ty().MatchesElementType(DLDataTypeCode::kDLInt, 32));
+  TVM_FFI_ICHECK(right_shift.ty().MatchesElementType(DLDataTypeCode::kDLInt, 32));
+
+  PrimType x_ty = x.ty();
+  auto signed_int_ty = [](int bits, const PrimType& source_ty) {
+    if (source_ty.IsScalableVector()) {
+      return PrimType::ScalableVector(DLDataTypeCode::kDLInt, bits, source_ty.VScaleFactor());
+    }
+    return PrimType::Int(bits, source_ty.lanes());
+  };
+  PrimType hp_dtype = signed_int_ty(64, x_ty);
+  PrimType lp_dtype = signed_int_ty(32, x_ty);
 
   // 1) Cast and Multiply the integer multiplier
   PrimExpr one = MakeConst(hp_dtype, 1);
@@ -290,7 +298,11 @@ TVM_REGISTER_OP("tirx.q_multiply_shift")
           return x << exp;
         } else {
           // power of 2 is less than 0, round and then apply right shift.
-          DataType lp_dtype = DataType::Int(32, x.dtype().lanes());
+          PrimType x_ty = x.ty();
+          PrimType lp_dtype =
+              x_ty.IsScalableVector()
+                  ? PrimType::ScalableVector(DLDataTypeCode::kDLInt, 32, x_ty.VScaleFactor())
+                  : PrimType::Int(32, x_ty.lanes());
           PrimExpr one = MakeConst(lp_dtype, 1);
           exp = -exp;
           PrimExpr rounding_factor = one << (exp - 1);
@@ -299,10 +311,11 @@ TVM_REGISTER_OP("tirx.q_multiply_shift")
         }
       } else {
         // Only int32 types are supported (any number of lanes is allowed)
-        TVM_FFI_ICHECK(s.dtype().code() == DLDataTypeCode::kDLInt && s.dtype().bits() == 32);
+        TVM_FFI_ICHECK(s.ty().MatchesElementType(DLDataTypeCode::kDLInt, 32));
 
         // Calculating integer shifts. MakeConst can handle both vector and scalar types.
-        PrimExpr zero = MakeConst(s.dtype(), 0);
+        PrimType s_ty = s.ty();
+        PrimExpr zero = MakeConst(s_ty, 0);
         PrimExpr left_shift = tirx::Select(s > zero, s, zero);
         PrimExpr right_shift = tirx::Select(s > zero, zero, -s);
         PrimExpr is_left_shift_required = (left_shift != zero);
diff --git a/src/target/intrin_rule.h b/src/target/intrin_rule.h
index a5f5a8931283..cf72a291ada6 100644
--- a/src/target/intrin_rule.h
+++ b/src/target/intrin_rule.h
@@ -25,6 +25,7 @@
 #define TVM_TARGET_INTRIN_RULE_H_
 
 #include <tvm/ffi/function.h>
+#include <tvm/ir/type.h>
 #include <tvm/tirx/builtin.h>
 #include <tvm/tirx/expr.h>
 
@@ -37,10 +38,10 @@ using namespace tirx;
 
 // Add float suffix to the intrinsics
 struct FloatSuffix {
-  std::string operator()(DataType t, std::string name) const {
-    if (t == DataType::Float(32)) {
+  std::string operator()(PrimType t, std::string name) const {
+    if (t->dtype == DLDataType{kDLFloat, 32, 1}) {
       return name + 'f';
-    } else if (t == DataType::Float(64)) {
+    } else if (t->dtype == DLDataType{kDLFloat, 64, 1}) {
       return name;
     } else {
       return "";
@@ -50,7 +51,7 @@ struct FloatSuffix {
 
 // Return the intrinsic name
 struct Direct {
-  std::string operator()(DataType t, std::string name) const { return name; }
+  std::string operator()(PrimType t, std::string name) const { return name; }
 };
 
 /*!
@@ -69,13 +70,10 @@ inline PrimExpr DispatchPureExtern(const PrimExpr& e) {
   TVM_FFI_ICHECK(op != nullptr);
   std::string name = op->name;
   TVM_FFI_ICHECK_EQ(name.substr(0, 5), "tirx.");
-  DataType dtype;
   if (dtype_from_arg) {
     TVM_FFI_ICHECK_EQ(call->args.size(), 1U);
-    dtype = call->args[0].dtype();
-  } else {
-    dtype = call->dtype;
   }
+  PrimType dtype = dtype_from_arg ? call->args[0].ty() : call->ty();
   name = T()(dtype, name.substr(5));
 
   if (name.length() != 0) {
@@ -83,7 +81,7 @@ inline PrimExpr DispatchPureExtern(const PrimExpr& e) {
     for (auto arg : call->args) {
       new_args.push_back(arg);
     }
-    return Call(call->dtype, builtin::call_pure_extern(), new_args);
+    return Call(e.ty(), builtin::call_pure_extern(), new_args);
   } else {
     return e;
   }
diff --git a/src/target/llvm/codegen_arm.cc b/src/target/llvm/codegen_arm.cc
index a9a0acb41213..149e3ee43f4f 100644
--- a/src/target/llvm/codegen_arm.cc
+++ b/src/target/llvm/codegen_arm.cc
@@ -67,17 +67,18 @@ llvm::Value* CodeGenARM::CreateIntrinsic(const CallNode* op) {
 PrimExpr CodeGenARM::ARMPopcount(const CallNode* call) {
   using namespace tirx;
   const PrimExpr& e = call->args[1];
+  PrimType call_ty = call->ty();
   llvm::Intrinsic::ID ctpop_id = llvm::Intrinsic::ctpop;
   llvm::Intrinsic::ID vpaddlu_id = llvm::Intrinsic::arm_neon_vpaddlu;
 
   // Fallback to default llvm lowering rule if input type not a full vector or half vector length
-  int total_size = call->dtype.bits() * call->dtype.lanes();
-  if (!call->dtype.is_fixed_length_vector() || call->dtype.bits() == 8 ||
+  int total_size = call_ty.bits() * call_ty.lanes();
+  if (!call_ty.IsFixedLengthVector() || call_ty.bits() == 8 ||
       (total_size != 128 && total_size != 64)) {
     ffi::Array<PrimExpr> vcnt_args;
-    vcnt_args.push_back(IntImm(DataType::UInt(32), ctpop_id));
+    vcnt_args.push_back(IntImm(PrimType::UInt(32), ctpop_id));
     vcnt_args.push_back(e);
-    return tirx::Call(call->dtype, builtin_call_llvm_pure_intrin_, vcnt_args);
+    return tirx::Call(call->ty(), builtin_call_llvm_pure_intrin_, vcnt_args);
   }
 
   // Popcount lowering rule:
@@ -86,11 +87,12 @@ PrimExpr CodeGenARM::ARMPopcount(const CallNode* call) {
   // to return back to original input type
 
   // Dvisions are always divisible (number of bits = 64 or 128)
-  DataType uint8_type = DataType(e.dtype().code(), 8, e.dtype().bits() * e.dtype().lanes() / 8);
-  DataType uint16_type =
-      DataType(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16);
-  DataType uint32_type =
-      DataType(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32);
+  PrimType e_dtype(e.ty()->dtype);
+  PrimType uint8_type = PrimType(e_dtype.code(), 8, e_dtype.bits() * e_dtype.lanes() / 8);
+  PrimType uint16_type =
+      PrimType(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16);
+  PrimType uint32_type =
+      PrimType(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32);
 
   // Interpret input as vector of 8bit values
   PrimExpr input8 = reinterpret(uint8_type, e);
@@ -98,33 +100,33 @@ PrimExpr CodeGenARM::ARMPopcount(const CallNode* call) {
   const CallNode* c0 = input8.as<CallNode>();
   TVM_FFI_ICHECK(c0 != nullptr);
   ffi::Array<PrimExpr> vcnt8_args;
-  vcnt8_args.push_back(IntImm(DataType::UInt(32), ctpop_id));
+  vcnt8_args.push_back(IntImm(PrimType::UInt(32), ctpop_id));
   vcnt8_args.push_back(input8);
   PrimExpr vcnt8 = tirx::Call(uint8_type, builtin_call_llvm_pure_intrin_, vcnt8_args);
 
   // Accumulation 8->16bit
   ffi::Array<PrimExpr> vcnt16_args;
-  vcnt16_args.push_back(IntImm(DataType::UInt(32), vpaddlu_id));
+  vcnt16_args.push_back(IntImm(PrimType::UInt(32), vpaddlu_id));
   vcnt16_args.push_back(vcnt8);
   PrimExpr vcnt16 = tirx::Call(uint16_type, builtin_call_llvm_pure_intrin_, vcnt16_args);
-  if (call->dtype.bits() == 16) {
+  if (call_ty.bits() == 16) {
     return vcnt16;
   }
 
   // Accumulation 16->32bit
   ffi::Array<PrimExpr> vcnt32_args;
-  vcnt32_args.push_back(IntImm(DataType::UInt(32), vpaddlu_id));
+  vcnt32_args.push_back(IntImm(PrimType::UInt(32), vpaddlu_id));
   vcnt32_args.push_back(vcnt16);
   PrimExpr vcnt32 = tirx::Call(uint32_type, builtin_call_llvm_pure_intrin_, vcnt32_args);
-  if (call->dtype.bits() == 32) {
+  if (call_ty.bits() == 32) {
     return vcnt32;
   }
 
   // Accumulation 32->64bit
   ffi::Array<PrimExpr> vcnt64_args;
-  vcnt64_args.push_back(IntImm(DataType::UInt(32), vpaddlu_id));
+  vcnt64_args.push_back(IntImm(PrimType::UInt(32), vpaddlu_id));
   vcnt64_args.push_back(vcnt32);
-  return tirx::Call(call->dtype, builtin_call_llvm_pure_intrin_, vcnt64_args);
+  return tirx::Call(call->ty(), builtin_call_llvm_pure_intrin_, vcnt64_args);
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc
index b43afb021454..fa73cf51c6fd 100644
--- a/src/target/llvm/codegen_cpu.cc
+++ b/src/target/llvm/codegen_cpu.cc
@@ -80,7 +80,7 @@ void CodeGenCPU::Init(const std::string& module_name, LLVMTarget* llvm_target,
 
   // Runtime types.
   t_tvm_shape_index_ =
-      llvm::Type::getIntNTy(*llvm_target_->GetContext(), DataType::ShapeIndex().bits());
+      llvm::Type::getIntNTy(*llvm_target_->GetContext(), DefaultIndexPrimType().bits());
   // Defined in 3rdparty/dlpack/include/dlpack/dlpack.h:
   // typedef struct { DLDeviceType device_type; int device_id; } DLDevice;
   t_tvm_device_ = llvm::StructType::create({t_int_, t_int_});
@@ -278,7 +278,7 @@ std::unique_ptr<llvm::Module> CodeGenCPU::Finish() {
   return CodeGenLLVM::Finish();
 }
 
-CodeGenLLVM::TypedPointer CodeGenCPU::CreateStructRefPtr(DataType t, llvm::Value* buf,
+CodeGenLLVM::TypedPointer CodeGenCPU::CreateStructRefPtr(PrimType t, llvm::Value* buf,
                                                          llvm::Value* index, int kind) {
   if (kind < builtin::kDLTensorKindBound_) {
     if (buf->getType() == t_void_p_) {
@@ -366,21 +366,21 @@ CodeGenLLVM::TypedPointer CodeGenCPU::CreateStructRefPtr(DataType t, llvm::Value
       buf = builder_->CreatePointerCast(buf, llvmGetPointerTo(t_tvm_ffi_any_, 0));
       // field 2 is the union value
       buf = builder_->CreateInBoundsGEP(t_tvm_ffi_any_, buf, {index, ConstInt32(2)});
-      if (t.is_bool()) {
+      if (t.MatchesCode(DLDataTypeCode::kDLBool)) {
         // it should be safe to set the pointer to the first byte of the union value
         buf = builder_->CreatePointerCast(buf, llvmGetPointerTo(DTypeToLLVMType(t), 0));
         return TypedPointer(t_int8_, buf);
-      } else if (t.is_int() && t.bits() == 64) {
+      } else if (t.MatchesCode(DLDataTypeCode::kDLInt) && t.bits() == 64) {
         buf = builder_->CreatePointerCast(buf, llvmGetPointerTo(t_int64_, 0));
         return TypedPointer(t_int64_, buf);
-      } else if (t.is_float() && t.bits() == 64) {
+      } else if (t.MatchesCode(DLDataTypeCode::kDLFloat) && t.bits() == 64) {
         buf = builder_->CreatePointerCast(buf, llvmGetPointerTo(t_float64_, 0));
         return TypedPointer(t_float64_, buf);
-      } else if (t.is_handle()) {
+      } else if (t.IsHandle()) {
         buf = builder_->CreatePointerCast(buf, llvmGetPointerTo(t_void_p_, 0));
         return TypedPointer(t_void_p_, buf);
       } else {
-        LOG(DEBUG) << "DataType " << t << " cannot be stored into a TVMFFIAny's value field";
+        LOG(DEBUG) << "PrimType " << t << " cannot be stored into a TVMFFIAny's value field";
       }
     }
     case builtin::kInt64ArrayElem: {
@@ -559,7 +559,7 @@ void CodeGenCPU::CreateComputeScope(const AttrStmtNode* op) {
     llvm::Argument* v = &(*it);
     const Var& var = vargs[idx];
     var_map_[var.get()] = v;
-    if (var.dtype().is_handle() && !alias_var_set_.count(var.get())) {
+    if (var.ty().IsHandle() && !alias_var_set_.count(var.get())) {
       // set non alias.
       fcompute->addParamAttr(idx, llvm::Attribute::NoAlias);
       // always not inline compute function to make the code structure clean
@@ -577,8 +577,8 @@ void CodeGenCPU::CreateComputeScope(const AttrStmtNode* op) {
   }
 
   function_ = fcompute;
-  di_subprogram_ = CreateDebugFunction(MakeStringRef(value->value), vargs.Map(GetType),
-                                       PrimType(DataType::Int(32)));
+  di_subprogram_ =
+      CreateDebugFunction(MakeStringRef(value->value), vargs.Map(GetType), PrimType::Int(32));
   auto* compute_entry = llvm::BasicBlock::Create(*ctx, "entry", function_);
   builder_->SetInsertPoint(compute_entry);
   this->VisitStmt(op->body);
@@ -655,8 +655,8 @@ void CodeGenCPU::CreateParallelLaunch(const Stmt& body, int num_task, std::strin
   UnpackClosureData(cdata, vfields, &new_vmap);
   // setup parallel env
   ParallelEnv par_env;
-  par_env.task_id = Var("task_id", DataType::Int(32));
-  par_env.num_task = Var("num_task", DataType::Int(32));
+  par_env.task_id = Var("task_id", PrimType::Int(32));
+  par_env.num_task = Var("num_task", PrimType::Int(32));
   new_vmap[par_env.task_id.get()] = task_id;
   new_vmap[par_env.num_task.get()] = builder_->CreateLoad(
       t_int32_,
@@ -787,7 +787,7 @@ llvm::Value* CodeGenCPU::GetPackedFuncHandle(const std::string& fname) {
 }
 
 CodeGenCPU::PackedCall CodeGenCPU::MakeCallPackedLowered(const ffi::Array<PrimExpr>& args,
-                                                         const DataType& r_type,
+                                                         const PrimType& r_type,
                                                          const int64_t begin, const int64_t end,
                                                          bool use_env_lookup) {
   std::string func_name = [&]() {
@@ -835,9 +835,9 @@ CodeGenCPU::PackedCall CodeGenCPU::MakeCallPackedLowered(const ffi::Array<PrimEx
 
   PackedCall pc = {nullptr};
 
-  if (!r_type.is_void()) {
+  if (!r_type.IsVoid()) {
     // Load the return value and cast it to the designated type (r_type).
-    DataType r_api_type = tirx::APIType(r_type);
+    PrimType r_api_type = tirx::APIType(r_type);
     llvm::Type* llvm_r_api_type = DTypeToLLVMType(r_api_type);
     llvm::Value* result_value =
         builder_->CreateInBoundsGEP(t_tvm_ffi_any_, result, {ConstInt32(0), ConstInt32(2)});
@@ -860,14 +860,16 @@ CodeGenCPU::PackedCall CodeGenCPU::MakeCallPackedLowered(const ffi::Array<PrimEx
 llvm::Value* CodeGenCPU::CreateCallPacked(const CallNode* op) {
   TVM_FFI_ICHECK_EQ(op->args.size(), 4U);
   bool use_string_lookup = op->op.same_as(builtin::tvm_call_packed_lowered());
-  PackedCall pc = MakeCallPackedLowered(op->args, op->dtype, op->args[2].as<IntImmNode>()->value,
+  PackedCall pc = MakeCallPackedLowered(op->args, PrimType(op->ty()->dtype),
+                                        op->args[2].as<IntImmNode>()->value,
                                         op->args[3].as<IntImmNode>()->value, use_string_lookup);
   return pc.ret_value;
 }
 
 llvm::Value* CodeGenCPU::CreateCallTracePacked(const CallNode* op) {
   TVM_FFI_ICHECK_EQ(op->args.size(), 5U);
-  PackedCall pc = MakeCallPackedLowered(op->args, op->dtype, op->args[2].as<IntImmNode>()->value,
+  PackedCall pc = MakeCallPackedLowered(op->args, PrimType(op->ty()->dtype),
+                                        op->args[2].as<IntImmNode>()->value,
                                         op->args[3].as<IntImmNode>()->value, true);
   llvm::LLVMContext* ctx = llvm_target_->GetContext();
   // Get traced value.
@@ -1029,16 +1031,17 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const CallNode* op) {
   } else if (op->op.same_as(builtin::tvm_struct_get())) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 3U);
     int kind = op->args[2].as<IntImm>().value()->value;
+    PrimType op_dtype(op->ty()->dtype);
     TypedPointer ref =
-        CreateStructRefPtr(op->dtype, MakeValue(op->args[0]), MakeValue(op->args[1]), kind);
+        CreateStructRefPtr(op_dtype, MakeValue(op->args[0]), MakeValue(op->args[1]), kind);
     if (kind == builtin::kDLTensorAddr) {
       return builder_->CreatePointerCast(ref.addr, t_void_p_);
     }
 
     llvm::Value* struct_value = builder_->CreateLoad(ref.type, ref.addr);
 
-    if (op->dtype == DataType::Bool()) {
-      struct_value = CreateCast(DataType::Int(64), op->dtype, struct_value);
+    if (op_dtype == PrimType::Bool()) {
+      struct_value = CreateCast(PrimType::Int(64), op_dtype, struct_value);
     }
 
     return struct_value;
@@ -1046,7 +1049,7 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const CallNode* op) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 4U);
     int kind = op->args[2].as<IntImm>().value()->value;
     llvm::Value* value = MakeValue(op->args[3]);
-    TypedPointer ref = CreateStructRefPtr(op->args[3].dtype(), MakeValue(op->args[0]),
+    TypedPointer ref = CreateStructRefPtr(PrimType(op->args[3].ty()->dtype), MakeValue(op->args[0]),
                                           MakeValue(op->args[1]), kind);
     TVM_FFI_ICHECK(kind != builtin::kDLTensorAddr);
     if (value->getType()->isPointerTy()) {
@@ -1180,7 +1183,7 @@ void CodeGenCPU::VisitStmt_(const ForNode* op) {
       TVM_FFI_ICHECK(parallel_env_.task_id.defined());
       TVM_FFI_ICHECK(parallel_env_.num_task.defined());
       TVM_FFI_ICHECK(parallel_env_.penv != nullptr);
-      DataType t = op->extent.dtype();
+      PrimType t(op->extent.ty()->dtype);
       PrimExpr num_task = cast(t, parallel_env_.num_task);
       PrimExpr task_id = cast(t, parallel_env_.task_id);
       TVM_FFI_ICHECK(!parallel_env_.in_parallel_loop)
diff --git a/src/target/llvm/codegen_cpu.h b/src/target/llvm/codegen_cpu.h
index 6096cc140517..d090687b7e31 100644
--- a/src/target/llvm/codegen_cpu.h
+++ b/src/target/llvm/codegen_cpu.h
@@ -128,7 +128,7 @@ class CodeGenCPU : public CodeGenLLVM {
   llvm::Value* GetPackedFuncHandle(const std::string& str);
   TypedPointer PackClosureData(const ffi::Array<Var>& fields, uint64_t* num_bytes,
                                std::string struct_name = "");
-  TypedPointer CreateStructRefPtr(DataType t, llvm::Value* buffer, llvm::Value* index, int kind);
+  TypedPointer CreateStructRefPtr(PrimType t, llvm::Value* buffer, llvm::Value* index, int kind);
   void UnpackClosureData(TypedPointer cdata, const ffi::Array<Var>& fields,
                          std::unordered_map<const VarNode*, llvm::Value*>* vmap);
   // Make packed call.
@@ -137,7 +137,7 @@ class CodeGenCPU : public CodeGenLLVM {
     llvm::Value* ret_type_index;
     llvm::BasicBlock* end_block;
   };
-  PackedCall MakeCallPackedLowered(const ffi::Array<PrimExpr>& args, const DataType& r_type,
+  PackedCall MakeCallPackedLowered(const ffi::Array<PrimExpr>& args, const PrimType& r_type,
                                    const int64_t begin, const int64_t end, bool use_string_lookup);
   // create call into tvm packed function.
   llvm::Value* CreateCallPacked(const CallNode* op);
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 4eb0a503f09b..0a5acd348a6c 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -99,6 +99,18 @@
 namespace tvm {
 namespace codegen {
 
+namespace {
+
+int GetLanesOrVScaleFactor(const PrimType& dtype) {
+  return dtype.IsScalableVector() ? dtype.VScaleFactor() : dtype.lanes();
+}
+
+PrimType WithScalableVScaleFactor(const PrimType& dtype, int vscale_factor) {
+  return PrimType::ScalableVector(dtype.code(), dtype.bits(), vscale_factor);
+}
+
+}  // namespace
+
 // CodeGenLLVM has members of type std::unique_ptr<T>. These members will be
 // instantiated in the constructor, which will requre that the type T is
 // complete at that point. Put the constructor (and destructor) here, since
@@ -253,7 +265,7 @@ llvm::Function* CodeGenLLVM::DeclareFunctionInternal(const GlobalVar& gvar, cons
   is_restricted_ = func->HasNonzeroAttr(tirx::attr::kNoAlias);
   for (Var param : func->params) {
     param_types.push_back(GetLLVMType(param));
-    if (!is_restricted_ && param.dtype().is_handle()) {
+    if (!is_restricted_ && PrimType(param.ty()->dtype).IsHandle()) {
       alias_var_set_.insert(param.get());
     }
   }
@@ -304,7 +316,7 @@ void CodeGenLLVM::AddFunctionInternal(const GlobalVar& gvar, const PrimFunc& f)
     var_map_[var.get()] = v;
     v->setName(std::string(var->name_hint));
     if (is_restricted_) {
-      if (var.dtype().is_handle() && !alias_var_set_.count(var.get())) {
+      if (PrimType(var.ty()->dtype).IsHandle() && !alias_var_set_.count(var.get())) {
         // set non alias.
         function_->addParamAttr(i, llvm::Attribute::NoAlias);
       }
@@ -558,21 +570,21 @@ int CodeGenLLVM::NativeVectorBits(const runtime::StorageScope& storage_scope) co
 
 unsigned CodeGenLLVM::GetGlobalAddressSpace() const { return 0; }
 
-llvm::Type* CodeGenLLVM::DTypeToLLVMType(const DataType& dtype) const {
-  if (dtype.is_handle()) {
+llvm::Type* CodeGenLLVM::DTypeToLLVMType(const PrimType& dtype) const {
+  if (dtype.IsHandle()) {
     TVM_FFI_ICHECK_EQ(dtype.lanes(), 1);
     return t_void_p_;
   }
-  if (dtype.is_void()) {
+  if (dtype.IsVoid()) {
     return t_void_;
   }
   llvm::Type* etype = nullptr;
   llvm::LLVMContext* ctx = llvm_target_->GetContext();
-  if (dtype.is_int() || dtype.is_uint()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     etype = llvm::Type::getIntNTy(*ctx, dtype.bits());
-  } else if (dtype.is_bool()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
     etype = t_int1_;
-  } else if (dtype.is_float()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     switch (dtype.bits()) {
       case 16:
         etype = llvm::Type::getHalfTy(*ctx);
@@ -586,21 +598,24 @@ llvm::Type* CodeGenLLVM::DTypeToLLVMType(const DataType& dtype) const {
       default:
         TVM_FFI_THROW(InternalError) << "do not support " << dtype;
     }
-  } else if (dtype.code() == DataType::kFloat8_e3m4 || dtype.code() == DataType::kFloat8_e4m3 ||
-             dtype.code() == DataType::kFloat8_e4m3b11fnuz ||
-             dtype.code() == DataType::kFloat8_e4m3fn ||
-             dtype.code() == DataType::kFloat8_e4m3fnuz || dtype.code() == DataType::kFloat8_e5m2 ||
-             dtype.code() == DataType::kFloat8_e5m2fnuz ||
-             dtype.code() == DataType::kFloat8_e8m0fnu) {
+  } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e3m4 ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e4m3 ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e4m3b11fnuz ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fn ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e5m2 ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e5m2fnuz ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e8m0fnu) {
     etype = llvm::Type::getInt8Ty(*ctx);
-  } else if (dtype.code() == DataType::kFloat6_e2m3fn || dtype.code() == DataType::kFloat6_e3m2fn) {
+  } else if (dtype.code() == DLDataTypeCode::kDLFloat6_e2m3fn ||
+             dtype.code() == DLDataTypeCode::kDLFloat6_e3m2fn) {
     etype = llvm::Type::getIntNTy(*ctx, 6);
-  } else if (dtype.code() == DataType::kFloat4_e2m1fn) {
+  } else if (dtype.code() == DLDataTypeCode::kDLFloat4_e2m1fn) {
     etype = llvm::Type::getIntNTy(*ctx, 4);
   }
-  if (!dtype.is_scalar()) {
-    if (dtype.is_scalable_vector()) {
-      return llvm::VectorType::get(etype, dtype.vscale_factor(), true);
+  if (!dtype.IsScalar()) {
+    if (dtype.IsScalableVector()) {
+      return llvm::VectorType::get(etype, dtype.VScaleFactor(), true);
     } else {
       return llvm::FixedVectorType::get(etype, dtype.lanes());
     }
@@ -611,12 +626,12 @@ llvm::Type* CodeGenLLVM::DTypeToLLVMType(const DataType& dtype) const {
 
 llvm::Type* CodeGenLLVM::GetLLVMType(const Type& type) const {
   if (auto* ptr = type.as<PrimTypeNode>()) {
-    return DTypeToLLVMType(ptr->dtype);
+    return DTypeToLLVMType(PrimType(ptr->dtype));
   } else if (auto* ptr = type.as<PointerTypeNode>()) {
     // LLVM IR doesn't allow void*, so pointer element types that do not
     // have an LLVM scalar equivalent need explicit handling.
     if (auto* primtype = ptr->element_type.as<PrimTypeNode>()) {
-      if (primtype->dtype.is_void()) {
+      if (PrimType(primtype->dtype).IsVoid()) {
         return t_void_p_;
       }
     } else if (ptr->element_type->IsInstance<TensorMapTypeNode>()) {
@@ -645,7 +660,7 @@ llvm::Type* CodeGenLLVM::GetLLVMType(const PrimExpr& expr) const {
 // This trick comes from Halide's CodeGen_LLVM
 //
 void CodeGenLLVM::AddAliasInfo(llvm::Instruction* inst, const VarNode* buffer_var, PrimExpr index,
-                               DataType access_dtype) {
+                               PrimType access_dtype) {
   if (alias_var_set_.count(buffer_var) != 0) {
     // Mark all possibly aliased pointer as same type.
     llvm::MDNode* meta = md_tbaa_alias_set_;
@@ -666,7 +681,7 @@ void CodeGenLLVM::AddAliasInfo(llvm::Instruction* inst, const VarNode* buffer_va
     base = ptr->value;
     xwith = 1;
   }
-  if (access_dtype.is_scalable_vector()) {
+  if (access_dtype.IsScalableVector()) {
     llvm::MDNode* meta = md_tbaa_root_;
     std::ostringstream buffer_addr;
     buffer_addr << buffer_var;
@@ -707,7 +722,7 @@ void CodeGenLLVM::AddAliasInfo(llvm::Instruction* inst, const VarNode* buffer_va
   inst->setMetadata("tbaa", md_builder_->createTBAAStructTagNode(meta, meta, 0));
 }
 
-void CodeGenLLVM::GetAlignment(DataType t, const VarNode* buf_var, const PrimExpr& index,
+void CodeGenLLVM::GetAlignment(PrimType t, const VarNode* buf_var, const PrimExpr& index,
                                int* p_alignment, int* p_native_bits) {
   int max_align_bits = t.bits();
   auto it = alloc_storage_info_.find(buf_var);
@@ -736,7 +751,7 @@ void CodeGenLLVM::GetAlignment(DataType t, const VarNode* buf_var, const PrimExp
   *p_alignment = align_bits / 8;
 }
 
-llvm::GlobalVariable* CodeGenLLVM::AllocateSharedMemory(DataType dtype, size_t size,
+llvm::GlobalVariable* CodeGenLLVM::AllocateSharedMemory(PrimType dtype, size_t size,
                                                         unsigned int shared_address_space,
                                                         int alignment,
                                                         llvm::GlobalValue::LinkageTypes linkage) {
@@ -794,7 +809,7 @@ llvm::Value* CodeGenLLVM::CreateVecFlip(llvm::Value* vec) {
 }
 
 llvm::Value* CodeGenLLVM::CreateVecPad(llvm::Value* vec, int target_lanes) {
-  llvm::Value* mask = llvm::UndefValue::get(DTypeToLLVMType(DataType::Int(32, target_lanes)));
+  llvm::Value* mask = llvm::UndefValue::get(DTypeToLLVMType(PrimType::Int(32, target_lanes)));
   int num_elems = GetVectorNumElements(vec);
   if (num_elems == target_lanes) return vec;
   TVM_FFI_ICHECK_LT(num_elems, target_lanes);
@@ -869,7 +884,7 @@ void CodeGenLLVM::CreateSerialFor(llvm::Value* begin, llvm::Value* end, llvm::Va
   TVM_FFI_ICHECK(!var_map_.count(loop_var.get()));
   var_map_[loop_var.get()] = loop_value;
 
-  auto lt = CreateLT(loop_var.dtype(), loop_value, end);
+  auto lt = CreateLT(PrimType(loop_var.ty()->dtype), loop_value, end);
   builder_->CreateCondBr(lt, for_body, for_end, md_very_likely_branch_);
   builder_->SetInsertPoint(for_body);
   EmitDebugLocation(body->span);
@@ -881,47 +896,56 @@ void CodeGenLLVM::CreateSerialFor(llvm::Value* begin, llvm::Value* end, llvm::Va
 
   builder_->CreateBr(for_next);
   builder_->SetInsertPoint(for_next);
-  llvm::Value* loop_next = CreateAdd(loop_var.dtype(), loop_value, stride);
+  llvm::Value* loop_next = CreateAdd(PrimType(loop_var.ty()->dtype), loop_value, stride);
   loop_value->addIncoming(loop_next, builder_->GetInsertBlock());
   builder_->CreateBr(for_begin);
   builder_->SetInsertPoint(for_end);
 }
 
 // cast operatpr
-llvm::Value* CodeGenLLVM::CreateCast(DataType from, DataType to, llvm::Value* value) {
+llvm::Value* CodeGenLLVM::CreateCast(PrimType from, PrimType to, llvm::Value* value) {
   llvm::Type* target = DTypeToLLVMType(to);
   if (value->getType() == target) return value;
   // TODO(tvm-team): consider add native support
-  TVM_FFI_ICHECK(!from.is_bfloat16()) << "BF16 needs to be storaged lowered first";
-  TVM_FFI_ICHECK(!to.is_bfloat16()) << "BF16 needs to be storaged lowered first";
-
-  if (to.is_handle()) {
+  // Storage lowering depends on scalar element type; LLVM vector type construction
+  // preserves the lane information separately.
+  TVM_FFI_ICHECK(!from.MatchesElementType(DLDataTypeCode::kDLBfloat, 16))
+      << "BF16 needs to be storaged lowered first";
+  TVM_FFI_ICHECK(!to.MatchesElementType(DLDataTypeCode::kDLBfloat, 16))
+      << "BF16 needs to be storaged lowered first";
+
+  if (to.IsHandle()) {
     return builder_->CreateBitCast(value, target);
-  } else if (to.is_bool()) {
-    if (from.is_float()) {
+  } else if (to.MatchesCode(DLDataTypeCode::kDLBool)) {
+    if (from.MatchesCode(DLDataTypeCode::kDLFloat)) {
       llvm::Constant* zero = llvm::ConstantFP::get(DTypeToLLVMType(from), 0.);
       return builder_->CreateFCmpUNE(value, zero);
     } else {
       llvm::Constant* zero = llvm::ConstantInt::get(DTypeToLLVMType(from), 0);
       return builder_->CreateICmpNE(value, zero);
     }
-  } else if (!from.is_float() && !to.is_float()) {
-    return builder_->CreateIntCast(value, target, from.is_int());
-  } else if (from.is_float() && to.is_int()) {
+  } else if (!from.MatchesCode(DLDataTypeCode::kDLFloat) &&
+             !to.MatchesCode(DLDataTypeCode::kDLFloat)) {
+    return builder_->CreateIntCast(value, target, from.MatchesCode(DLDataTypeCode::kDLInt));
+  } else if (from.MatchesCode(DLDataTypeCode::kDLFloat) && to.MatchesCode(DLDataTypeCode::kDLInt)) {
     return builder_->CreateFPToSI(value, target);
-  } else if (from.is_float() && to.is_uint()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLFloat) &&
+             to.MatchesCode(DLDataTypeCode::kDLUInt)) {
     if (to.bits() < 8) {
-      value = builder_->CreateFPToUI(value, DTypeToLLVMType(to.with_bits(8)));
+      value = builder_->CreateFPToUI(value, DTypeToLLVMType(to.WithBits(8)));
       return builder_->CreateIntCast(value, target, false);
     } else {
       return builder_->CreateFPToUI(value, target);
     }
-  } else if (from.is_int() && to.is_float()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLInt) && to.MatchesCode(DLDataTypeCode::kDLFloat)) {
     return builder_->CreateSIToFP(value, target);
-  } else if ((from.is_uint() || from.is_bool()) && to.is_float()) {
+  } else if ((from.MatchesCode(DLDataTypeCode::kDLUInt) ||
+              from.MatchesCode(DLDataTypeCode::kDLBool)) &&
+             to.MatchesCode(DLDataTypeCode::kDLFloat)) {
     return builder_->CreateUIToFP(value, target);
   } else {
-    TVM_FFI_ICHECK(from.is_float() && to.is_float());
+    TVM_FFI_ICHECK(from.MatchesCode(DLDataTypeCode::kDLFloat) &&
+                   to.MatchesCode(DLDataTypeCode::kDLFloat));
     return builder_->CreateFPCast(value, target);
   }
 }
@@ -951,9 +975,9 @@ llvm::Constant* CodeGenLLVM::GetConstString(const std::string& str) {
 }
 
 CodeGenLLVM::TypedPointer CodeGenLLVM::CreateBufferPtr(llvm::Value* buffer_ptr,
-                                                       DataType buffer_element_dtype,
+                                                       PrimType buffer_element_dtype,
                                                        llvm::ArrayRef<llvm::Value*> indices,
-                                                       DataType value_dtype) {
+                                                       PrimType value_dtype) {
   TVM_FFI_ICHECK_EQ(indices.size(), 1)
       << "CodeGenLLVM requires all buffers to be flat 1-d buffers.";
   llvm::Value* index = indices[0];
@@ -1360,7 +1384,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
   } else if (op->op.same_as(builtin::shift_left())) {
     return builder_->CreateShl(MakeValue(op->args[0]), MakeValue(op->args[1]));
   } else if (op->op.same_as(builtin::shift_right())) {
-    if (op->args[0].dtype().is_int()) {
+    if (PrimType(op->args[0].ty()->dtype).MatchesCode(DLDataTypeCode::kDLInt)) {
       return builder_->CreateAShr(MakeValue(op->args[0]), MakeValue(op->args[1]));
     } else {
       return builder_->CreateLShr(MakeValue(op->args[0]), MakeValue(op->args[1]));
@@ -1382,7 +1406,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     }
 
     TypedPointer buffer_ptr = CreateBufferPtr(MakeValue(load->buffer->data), load->buffer->dtype,
-                                              indices_val, load->dtype);
+                                              indices_val, PrimType(load->ty()->dtype));
     return buffer_ptr.addr;
   } else if (op->op.same_as(builtin::reinterpret()) && is_zero(op->args[0])) {
     return llvm::Constant::getNullValue(t_void_p_);
@@ -1397,9 +1421,9 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     uint64_t low = static_cast<uint64_t>(op->args[0].as_or_throw<IntImm>()->value);
     uint64_t high = static_cast<uint64_t>(op->args[1].as_or_throw<IntImm>()->value);
     uint64_t val = (high << 32U) | low;
-    return llvm::ConstantInt::get(DTypeToLLVMType(op->dtype), val);
+    return llvm::ConstantInt::get(DTypeToLLVMType(PrimType(op->ty()->dtype)), val);
   } else if (op->op.same_as(builtin::if_then_else())) {
-    TVM_FFI_ICHECK_EQ(op->args[0].dtype().lanes(), 1)
+    TVM_FFI_ICHECK_EQ(PrimType(op->args[0].ty()->dtype).lanes(), 1)
         << "if_then_else can only take scalar condition";
     llvm::LLVMContext* ctx = llvm_target_->GetContext();
     auto* then_block = llvm::BasicBlock::Create(*ctx, "if_then", function_);
@@ -1453,7 +1477,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     builder_->SetInsertPoint(post_dummy);
     return post_dummy;
   } else if (op->op.same_as(builtin::reinterpret())) {
-    llvm::Type* target = DTypeToLLVMType(op->dtype);
+    llvm::Type* target = DTypeToLLVMType(PrimType(op->ty()->dtype));
     llvm::Value* value = MakeValue(op->args[0]);
     if (value->getType()->isPointerTy() && target->isIntegerTy()) {
       return builder_->CreatePtrToInt(value, target);
@@ -1500,7 +1524,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     return builder_->CreateCall(f);
   } else if (op->op.same_as(builtin::get_active_lane_mask())) {
     llvm::Intrinsic::ID id = llvm::Intrinsic::get_active_lane_mask;
-    llvm::Function* f = GetIntrinsicDecl(id, DTypeToLLVMType(op->dtype),
+    llvm::Function* f = GetIntrinsicDecl(id, DTypeToLLVMType(PrimType(op->ty()->dtype)),
                                          {builder_->getInt32Ty(), builder_->getInt32Ty()});
     return builder_->CreateCall(f, {MakeValue(op->args[0]), MakeValue(op->args[1])});
   } else {
@@ -1510,13 +1534,13 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
 
 void CodeGenLLVM::Scalarize(const PrimExpr& e, std::function<void(int i, llvm::Value* v)> f) {
   if (const RampNode* ramp = e.as<RampNode>()) {
-    for (int i = 0; i < ramp->dtype.lanes(); ++i) {
+    for (int i = 0; i < PrimType(ramp->ty()->dtype).lanes(); ++i) {
       PrimExpr offset = ramp->base + (ramp->stride * i);
       f(i, MakeValue(offset));
     }
   } else {
     llvm::Value* value = MakeValue(e);
-    for (int i = 0; i < e.dtype().lanes(); ++i) {
+    for (int i = 0; i < PrimType(e.ty()->dtype).lanes(); ++i) {
       f(i, builder_->CreateExtractElement(value, i));
     }
   }
@@ -1526,58 +1550,59 @@ void CodeGenLLVM::Scalarize(const PrimExpr& e, std::function<void(int i, llvm::V
 llvm::Value* CodeGenLLVM::VisitExpr_(const VarNode* op) { return GetVarValue(op); }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const CastNode* op) {
-  return CreateCast(op->value.dtype(), op->dtype, MakeValue(op->value));
+  return CreateCast(PrimType(op->value.ty()->dtype), PrimType(op->ty()->dtype),
+                    MakeValue(op->value));
 }
 llvm::Value* CodeGenLLVM::VisitExpr_(const IntImmNode* op) {
-  return llvm::ConstantInt::getSigned(DTypeToLLVMType(op->dtype), op->value);
+  return llvm::ConstantInt::getSigned(DTypeToLLVMType(PrimType(op->ty()->dtype)), op->value);
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const FloatImmNode* op) {
-  return llvm::ConstantFP::get(DTypeToLLVMType(op->dtype), op->value);
+  return llvm::ConstantFP::get(DTypeToLLVMType(PrimType(op->ty()->dtype)), op->value);
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const StringImmNode* op) { return GetConstString(op->value); }
 
-#define DEFINE_CODEGEN_BINARY_OP(Op)                                                 \
-  llvm::Value* CodeGenLLVM::Create##Op(DataType t, llvm::Value* a, llvm::Value* b) { \
-    if (t.is_int()) {                                                                \
-      if (t.bits() >= 32) {                                                          \
-        return builder_->CreateNSW##Op(a, b);                                        \
-      } else {                                                                       \
-        return builder_->Create##Op(a, b);                                           \
-      }                                                                              \
-    } else if (t.is_uint()) {                                                        \
-      if (t.bits() >= 32) {                                                          \
-        return builder_->CreateNUW##Op(a, b);                                        \
-      } else {                                                                       \
-        return builder_->Create##Op(a, b);                                           \
-      }                                                                              \
-    } else {                                                                         \
-      TVM_FFI_ICHECK(t.is_float());                                                  \
-      return builder_->CreateF##Op(a, b);                                            \
-    }                                                                                \
-  }                                                                                  \
-  llvm::Value* CodeGenLLVM::VisitExpr_(const Op##Node* op) {                         \
-    return Create##Op(op->dtype, MakeValue(op->a), MakeValue(op->b));                \
+#define DEFINE_CODEGEN_BINARY_OP(Op)                                                  \
+  llvm::Value* CodeGenLLVM::Create##Op(PrimType t, llvm::Value* a, llvm::Value* b) {  \
+    if (t.MatchesCode(DLDataTypeCode::kDLInt)) {                                      \
+      if (t.bits() >= 32) {                                                           \
+        return builder_->CreateNSW##Op(a, b);                                         \
+      } else {                                                                        \
+        return builder_->Create##Op(a, b);                                            \
+      }                                                                               \
+    } else if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {                              \
+      if (t.bits() >= 32) {                                                           \
+        return builder_->CreateNUW##Op(a, b);                                         \
+      } else {                                                                        \
+        return builder_->Create##Op(a, b);                                            \
+      }                                                                               \
+    } else {                                                                          \
+      TVM_FFI_ICHECK(t.MatchesCode(DLDataTypeCode::kDLFloat));                        \
+      return builder_->CreateF##Op(a, b);                                             \
+    }                                                                                 \
+  }                                                                                   \
+  llvm::Value* CodeGenLLVM::VisitExpr_(const Op##Node* op) {                          \
+    return Create##Op(PrimType(op->ty()->dtype), MakeValue(op->a), MakeValue(op->b)); \
   }
 
 DEFINE_CODEGEN_BINARY_OP(Add);
 DEFINE_CODEGEN_BINARY_OP(Sub);
 DEFINE_CODEGEN_BINARY_OP(Mul);
 
-#define DEFINE_CODEGEN_CMP_OP(Op)                                                    \
-  llvm::Value* CodeGenLLVM::Create##Op(DataType t, llvm::Value* a, llvm::Value* b) { \
-    if (t.is_int()) {                                                                \
-      return builder_->CreateICmpS##Op(a, b);                                        \
-    } else if (t.is_uint()) {                                                        \
-      return builder_->CreateICmpU##Op(a, b);                                        \
-    } else {                                                                         \
-      TVM_FFI_ICHECK(t.is_float());                                                  \
-      return builder_->CreateFCmpO##Op(a, b);                                        \
-    }                                                                                \
-  }                                                                                  \
-  llvm::Value* CodeGenLLVM::VisitExpr_(const Op##Node* op) {                         \
-    return Create##Op(op->a.dtype(), MakeValue(op->a), MakeValue(op->b));            \
+#define DEFINE_CODEGEN_CMP_OP(Op)                                                       \
+  llvm::Value* CodeGenLLVM::Create##Op(PrimType t, llvm::Value* a, llvm::Value* b) {    \
+    if (t.MatchesCode(DLDataTypeCode::kDLInt)) {                                        \
+      return builder_->CreateICmpS##Op(a, b);                                           \
+    } else if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {                                \
+      return builder_->CreateICmpU##Op(a, b);                                           \
+    } else {                                                                            \
+      TVM_FFI_ICHECK(t.MatchesCode(DLDataTypeCode::kDLFloat));                          \
+      return builder_->CreateFCmpO##Op(a, b);                                           \
+    }                                                                                   \
+  }                                                                                     \
+  llvm::Value* CodeGenLLVM::VisitExpr_(const Op##Node* op) {                            \
+    return Create##Op(PrimType(op->a.ty()->dtype), MakeValue(op->a), MakeValue(op->b)); \
   }
 
 DEFINE_CODEGEN_CMP_OP(LT);
@@ -1588,12 +1613,13 @@ DEFINE_CODEGEN_CMP_OP(GE);
 llvm::Value* CodeGenLLVM::VisitExpr_(const DivNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  if (op->dtype.is_int()) {
+  PrimType dtype(op->ty()->dtype);
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     return builder_->CreateSDiv(a, b);
-  } else if (op->dtype.is_uint()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
     return builder_->CreateUDiv(a, b);
   } else {
-    TVM_FFI_ICHECK(op->dtype.is_float());
+    TVM_FFI_ICHECK(dtype.MatchesCode(DLDataTypeCode::kDLFloat));
     return builder_->CreateFDiv(a, b);
   }
 }
@@ -1601,12 +1627,13 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const DivNode* op) {
 llvm::Value* CodeGenLLVM::VisitExpr_(const ModNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  if (op->dtype.is_int()) {
+  PrimType dtype(op->ty()->dtype);
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     return builder_->CreateSRem(a, b);
-  } else if (op->dtype.is_uint()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
     return builder_->CreateURem(a, b);
   } else {
-    TVM_FFI_ICHECK(op->dtype.is_float());
+    TVM_FFI_ICHECK(dtype.MatchesCode(DLDataTypeCode::kDLFloat));
     return builder_->CreateFRem(a, b);
   }
 }
@@ -1614,19 +1641,20 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const ModNode* op) {
 llvm::Value* CodeGenLLVM::VisitExpr_(const MinNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  return builder_->CreateSelect(CreateLT(op->a.dtype(), a, b), a, b);
+  return builder_->CreateSelect(CreateLT(PrimType(op->a.ty()->dtype), a, b), a, b);
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const MaxNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  return builder_->CreateSelect(CreateGT(op->a.dtype(), a, b), a, b);
+  return builder_->CreateSelect(CreateGT(PrimType(op->a.ty()->dtype), a, b), a, b);
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const EQNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  if (op->a.dtype().is_int() || op->a.dtype().is_uint()) {
+  PrimType dtype(op->a.ty()->dtype);
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     return builder_->CreateICmpEQ(a, b);
   } else {
     return builder_->CreateFCmpOEQ(a, b);
@@ -1636,7 +1664,8 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const EQNode* op) {
 llvm::Value* CodeGenLLVM::VisitExpr_(const NENode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  if (op->a.dtype().is_int() || op->a.dtype().is_uint()) {
+  PrimType dtype(op->a.ty()->dtype);
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     return builder_->CreateICmpNE(a, b);
   } else {
     return builder_->CreateFCmpONE(a, b);
@@ -1675,23 +1704,23 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const LetNode* op) {
   return MakeValue(op->body);
 }
 
-bool CodeGenLLVM::HasAlignmentPadding(DataType dtype) {
-  if (dtype.is_scalable_vector()) {
+bool CodeGenLLVM::HasAlignmentPadding(PrimType dtype) {
+  if (dtype.IsScalableVector()) {
     return false;
   }
   const llvm::DataLayout& data_layout = module_->getDataLayout();
   int bytes = data_layout.getTypeAllocSize(DTypeToLLVMType(dtype));
-  int bytes_scalar = data_layout.getTypeAllocSize(DTypeToLLVMType(dtype.element_of()));
+  int bytes_scalar = data_layout.getTypeAllocSize(DTypeToLLVMType(dtype.WithLanes(1)));
   return bytes != bytes_scalar * dtype.lanes();
 }
 
 void CodeGenLLVM::BufferAccessHelper(
     Buffer buffer, ffi::Array<PrimExpr> indices, ffi::Optional<PrimExpr> predicate,
-    DataType value_dtype,
+    PrimType value_dtype,
     std::function<llvm::Instruction*(TypedPointer buffer_ptr, int subelement_i,
                                      llvm::Value* predicate, int alignment, bool is_volatile)>
         make_instruction) {
-  DataType buffer_element_dtype = buffer->dtype;
+  PrimType buffer_element_dtype = buffer->dtype;
 
   TVM_FFI_ICHECK_GE(indices.size(), 1)
       << "Buffer " << buffer->name << " is accessed with no indices.  "
@@ -1703,21 +1732,20 @@ void CodeGenLLVM::BufferAccessHelper(
   // requires 1-d indices.
   std::vector<llvm::Value*> earlier_index_values;
   for (size_t i = 0; i < indices.size() - 1; i++) {
-    TVM_FFI_ICHECK_EQ(indices[i].dtype().lanes(), 1)
+    TVM_FFI_ICHECK_EQ(PrimType(indices[i].ty()->dtype).lanes(), 1)
         << "Buffer " << buffer->name << " is accessed with a multi-lane index at position " << i
         << ".  Multi-lane indices are only supported as the last index.";
     earlier_index_values.push_back(MakeValue(indices[i]));
   }
 
   PrimExpr last_index = indices[indices.size() - 1];
-  int last_index_lanes = last_index.dtype().get_lanes_or_vscale_factor();
-  int buffer_element_lanes = buffer_element_dtype.get_lanes_or_vscale_factor();
-  TVM_FFI_ICHECK_EQ(value_dtype.get_lanes_or_vscale_factor(),
-                    last_index_lanes * buffer_element_lanes);
+  int last_index_lanes = GetLanesOrVScaleFactor(PrimType(last_index.ty()->dtype));
+  int buffer_element_lanes = GetLanesOrVScaleFactor(buffer_element_dtype);
+  TVM_FFI_ICHECK_EQ(GetLanesOrVScaleFactor(value_dtype), last_index_lanes * buffer_element_lanes);
 
   // Record index and elemtype in original form used for alias info
   PrimExpr last_index_origin = last_index;
-  DataType buffer_element_dtype_origin = buffer_element_dtype;
+  PrimType buffer_element_dtype_origin = buffer_element_dtype;
 
   bool is_volatile = volatile_buf_.count(buffer->data.get());
 
@@ -1726,17 +1754,18 @@ void CodeGenLLVM::BufferAccessHelper(
   if (const RampNode* ramp_index = last_index.as<RampNode>()) {
     if (is_one(ramp_index->stride)) {
       last_index = ramp_index->base;
-      last_index_lanes = last_index.dtype().get_lanes_or_vscale_factor();
+      last_index_lanes = GetLanesOrVScaleFactor(PrimType(last_index.ty()->dtype));
     }
   }
 
   // All TVM arrays are densely packed.  If the vectorized LLVM type
   // contains padding for alignment, we need to index based on the
   // size of the scalar type to avoid introducing that padding.
-  bool last_index_is_scalar = !last_index.dtype().is_scalable_vector() && last_index_lanes == 1;
+  bool last_index_is_scalar =
+      !PrimType(last_index.ty()->dtype).IsScalableVector() && last_index_lanes == 1;
   if (last_index_is_scalar && HasAlignmentPadding(buffer_element_dtype)) {
     last_index = buffer_element_lanes * last_index;
-    buffer_element_dtype = buffer_element_dtype.element_of();
+    buffer_element_dtype = buffer_element_dtype.WithLanes(1);
     buffer_element_lanes = 1;
   }
 
@@ -1754,7 +1783,7 @@ void CodeGenLLVM::BufferAccessHelper(
     alignment = value_dtype.bits() / 8;
   }
 
-  TVM_FFI_ICHECK(!last_index.dtype().is_scalable_vector())
+  TVM_FFI_ICHECK(!PrimType(last_index.ty()->dtype).IsScalableVector())
       << "Scalable vector indices are not supported in LLVM buffer access codegen";
   llvm::Value* cached_vector_index = nullptr;
   for (int i = 0; i < last_index_lanes; ++i) {
@@ -1763,7 +1792,7 @@ void CodeGenLLVM::BufferAccessHelper(
     if (const RampNode* ramp = last_index.as<RampNode>()) {
       PrimExpr offset = ramp->base + (ramp->stride * i);
       last_index_value = MakeValue(offset);
-    } else if (last_index.dtype().is_vector()) {
+    } else if (!PrimType(last_index.ty()->dtype).IsScalar()) {
       if (i == 0) {
         cached_vector_index = MakeValue(last_index);
       }
@@ -1782,12 +1811,12 @@ void CodeGenLLVM::BufferAccessHelper(
     }
 
     TypedPointer buffer_ptr =
-        value_dtype.is_scalable_vector()
+        value_dtype.IsScalableVector()
             ? CreateBufferPtr(MakeValue(buffer->data), buffer_element_dtype, all_index_values,
-                              value_dtype.with_scalable_vscale_factor(value_dtype.vscale_factor() /
-                                                                      last_index_lanes))
+                              WithScalableVScaleFactor(
+                                  value_dtype, value_dtype.VScaleFactor() / last_index_lanes))
             : CreateBufferPtr(MakeValue(buffer->data), buffer_element_dtype, all_index_values,
-                              value_dtype.with_lanes(value_dtype.lanes() / last_index_lanes));
+                              value_dtype.WithLanes(value_dtype.lanes() / last_index_lanes));
     auto instruction =
         make_instruction(buffer_ptr, subelement_i, predicate_value, alignment, is_volatile);
     AddAliasInfo(instruction, buffer->data.get(), last_index_origin, buffer_element_dtype_origin);
@@ -1795,7 +1824,7 @@ void CodeGenLLVM::BufferAccessHelper(
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const BufferLoadNode* op) {
-  DataType value_dtype = op->dtype;
+  PrimType value_dtype(op->ty()->dtype);
 
   std::vector<llvm::Value*> loads;
 
@@ -1868,13 +1897,14 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const CallNode* op) {
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const RampNode* op) {
-  llvm::Value* vec = llvm::UndefValue::get(DTypeToLLVMType(op->dtype));
+  PrimType dtype(op->ty()->dtype);
+  llvm::Value* vec = llvm::UndefValue::get(DTypeToLLVMType(dtype));
   // TODO(ekalda): P4 in https://github.com/apache/tvm/issues/16455
-  TVM_FFI_ICHECK(!op->dtype.is_scalable_vector());
-  int lanes = op->dtype.lanes();
+  TVM_FFI_ICHECK(!dtype.IsScalableVector());
+  int lanes = dtype.lanes();
   for (int i = 0; i < lanes; ++i) {
     vec = builder_->CreateInsertElement(
-        vec, MakeValue(op->base + op->stride * MakeConst(op->stride.dtype(), i)), ConstInt32(i));
+        vec, MakeValue(op->base + op->stride * MakeConst(op->stride.ty(), i)), ConstInt32(i));
   }
   return vec;
 }
@@ -1884,7 +1914,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const ShuffleNode* op) {
   int total_lanes = 0;
   for (int i = 0, e = op->vectors.size(); i < e; ++i) {
     vecs[i] = VisitExpr(op->vectors[i]);
-    total_lanes += op->vectors[i].dtype().lanes();
+    total_lanes += PrimType(op->vectors[i].ty()->dtype).lanes();
   }
   llvm::Value* v0 = CreateVecConcat(vecs);
   std::vector<uint32_t> idx(op->indices.size());
@@ -1905,21 +1935,21 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const ShuffleNode* op) {
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const BroadcastNode* op) {
-  DataType dtype = op->dtype;
+  PrimType dtype(op->ty()->dtype);
   llvm::Value* value = MakeValue(op->value);
   llvm::Type* type = DTypeToLLVMType(dtype);
   llvm::Constant* undef = llvm::UndefValue::get(type);
   llvm::Constant* zero = ConstInt32(0);
   value = builder_->CreateInsertElement(undef, value, zero);
   llvm::ElementCount ec =
-      llvm::ElementCount::get(dtype.get_lanes_or_vscale_factor(), dtype.is_scalable_vector());
+      llvm::ElementCount::get(GetLanesOrVScaleFactor(dtype), dtype.IsScalableVector());
   llvm::Constant* mask = llvm::ConstantVector::getSplat(ec, zero);
   return builder_->CreateShuffleVector(value, undef, mask);
 }
 
 void CodeGenLLVM::VisitStmt_(const BufferStoreNode* op) {
   EmitDebugLocation(op);
-  DataType value_dtype = op->value.dtype();
+  PrimType value_dtype = PrimType(op->value.ty()->dtype);
   Var buffer_var = op->buffer->data;
 
   llvm::Value* value = MakeValue(op->value);
@@ -1960,7 +1990,7 @@ void CodeGenLLVM::VisitStmt_(const ForNode* op) {
   } else {
     TVM_FFI_ICHECK(op->kind == ForKind::kSerial);
   }
-  PrimExpr step = op->step.value_or(MakeConst(op->extent->dtype, 1));
+  PrimExpr step = op->step.value_or(MakeConst(op->extent.ty(), 1));
   PrimExpr end = is_zero(op->min) ? op->extent : analyzer_->Simplify(op->min + op->extent);
   llvm::Value* begin_value = MakeValue(op->min);
   llvm::Value* end_value = MakeValue(end);
@@ -2087,7 +2117,7 @@ void CodeGenLLVM::VisitStmt_(const BindNode* op) {
   EmitDebugLocation(op);
   const VarNode* v = op->var.get();
   TVM_FFI_ICHECK(!var_map_.count(v));
-  if (v->dtype.is_handle()) {
+  if (v->ty().IsHandle()) {
     if (!is_restricted_) {
       alias_var_set_.insert(v);
     }
@@ -2098,10 +2128,10 @@ void CodeGenLLVM::VisitStmt_(const BindNode* op) {
   // Therefore, to have the correct LLVM type for pointers, we may
   // need to introduce a pointer-cast, even though pointer-to-pointer
   // casts are not expressible with the `tirx::CastNode`.
-  if (v->dtype.is_handle() && v->type_annotation.defined()) {
-    TVM_FFI_ICHECK(op->value->dtype.is_handle())
+  if (v->ty().IsHandle() && v->type_annotation.defined()) {
+    TVM_FFI_ICHECK(op->value.ty().IsHandle())
         << "Variable " << op->var << " is a pointer with type " << op->value
-        << ", but is being bound to expression with type " << op->value->dtype;
+        << ", but is being bound to expression with type " << op->value.ty();
     auto* llvm_type = GetLLVMType(v->type_annotation);
     if (llvm_type != value->getType()) {
       value->setName((v->name_hint + "_void_ptr").c_str());
@@ -2274,7 +2304,10 @@ llvm::DIType* CodeGenLLVM::GetDebugType(const Type& ty_tir, llvm::Type* ty_llvm)
 
   } else if (ty_llvm->isPointerTy()) {
     auto* ptr_type = ty_tir.as<PointerTypeNode>();
-    TVM_FFI_ICHECK(ptr_type != nullptr || GetRuntimeDataType(ty_tir).is_handle())
+    DLDataType runtime_dtype = GetRuntimeDataType(ty_tir);
+    TVM_FFI_ICHECK(ptr_type != nullptr ||
+                   (runtime_dtype.code == static_cast<uint8_t>(DLDataTypeCode::kDLOpaqueHandle) &&
+                    !(runtime_dtype.bits == 0 && static_cast<int16_t>(runtime_dtype.lanes) == 0)))
         << "Got LLVM pointer type from non-pointer IR type: " << ty_tir;
     auto* pointee_type = ptr_type != nullptr ? GetDebugType(ptr_type->element_type,
                                                             GetLLVMType(ptr_type->element_type))
@@ -2283,24 +2316,24 @@ llvm::DIType* CodeGenLLVM::GetDebugType(const Type& ty_tir, llvm::Type* ty_llvm)
                                                      ty_llvm->getPrimitiveSizeInBits());
 
   } else if (auto* prim_type = ty_tir.as<PrimTypeNode>()) {
-    DataType dtype = prim_type->dtype;
+    PrimType dtype(prim_type->dtype);
     llvm::dwarf::TypeKind dwarf_type;
-    if (dtype.is_bool()) {
+    if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
       dwarf_type = llvm::dwarf::DW_ATE_boolean;
-    } else if (dtype.is_float()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
       dwarf_type = llvm::dwarf::DW_ATE_float;
-    } else if (dtype.is_int()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
       dwarf_type = llvm::dwarf::DW_ATE_signed;
-    } else if (dtype.is_uint()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
       dwarf_type = llvm::dwarf::DW_ATE_unsigned;
     } else {
       return nullptr;
     }
 
-    if (dtype.is_scalable_vector()) return nullptr;
+    if (dtype.IsScalableVector()) return nullptr;
 
     return dbg_info_->di_builder_->createBasicType(
-        ffi::DLDataTypeToString(dtype).operator std::string(), dtype.bits() * dtype.lanes(),
+        ffi::DLDataTypeToString(dtype->dtype).operator std::string(), dtype.bits() * dtype.lanes(),
         dwarf_type);
 
   } else {
diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h
index a2c3b6e4ff48..777eebe8097b 100644
--- a/src/target/llvm/codegen_llvm.h
+++ b/src/target/llvm/codegen_llvm.h
@@ -351,7 +351,7 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
    */
   void BufferAccessHelper(
       Buffer buffer, ffi::Array<PrimExpr> indices, ffi::Optional<PrimExpr> predicate,
-      DataType value_dtype,
+      PrimType value_dtype,
       std::function<llvm::Instruction*(TypedPointer buffer_ptr, int subelement_i,
                                        llvm::Value* predicate, int alignment, bool is_volatile)>
           make_instruction);
@@ -400,7 +400,7 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
    *
    * \return LLVM type of dtype
    */
-  llvm::Type* DTypeToLLVMType(const DataType& dtype) const;
+  llvm::Type* DTypeToLLVMType(const PrimType& dtype) const;
   /*!
    * \brief Get the LLVM Type for a given type.
    * \param dtype The runtime dtype.
@@ -450,28 +450,28 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
   // initialize the function state.
   void InitFuncState();
   // Get alignment given index.
-  void GetAlignment(DataType t, const VarNode* buf_var, const PrimExpr& index, int* p_alignment,
+  void GetAlignment(PrimType t, const VarNode* buf_var, const PrimExpr& index, int* p_alignment,
                     int* p_native_bits);
   // Returns whether the LLVM type has padding for alignment
-  bool HasAlignmentPadding(DataType dtype);
+  bool HasAlignmentPadding(PrimType dtype);
   // do a scalarize call with f
   llvm::Value* CreateScalarizedCall(const CallNode* op, llvm::Function* f,
                                     const std::vector<llvm::Value*>& args);
   // handle module import
   void HandleImport(const std::string& code);
   // cast operatpr
-  llvm::Value* CreateCast(DataType from, DataType to, llvm::Value* value);
+  llvm::Value* CreateCast(PrimType from, PrimType to, llvm::Value* value);
   // comparison op
   llvm::Value* GetVarValue(const VarNode* v) const;
-  llvm::Value* CreateLT(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateLE(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateGT(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateGE(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateAdd(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateSub(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateMul(DataType t, llvm::Value* a, llvm::Value* b);
-  virtual TypedPointer CreateBufferPtr(llvm::Value* buffer_ptr, DataType buffer_element_dtype,
-                                       llvm::ArrayRef<llvm::Value*> indices, DataType value_dtype);
+  llvm::Value* CreateLT(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateLE(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateGT(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateGE(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateAdd(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateSub(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateMul(PrimType t, llvm::Value* a, llvm::Value* b);
+  virtual TypedPointer CreateBufferPtr(llvm::Value* buffer_ptr, PrimType buffer_element_dtype,
+                                       llvm::ArrayRef<llvm::Value*> indices, PrimType value_dtype);
   // Vector concatenation.
   llvm::Value* CreateVecSlice(llvm::Value* vec, int begin, int extent);
   llvm::Value* CreateVecFlip(llvm::Value* vec);
@@ -482,9 +482,9 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
                        const Var& loop_var, const Stmt& body);
   // add alias information.
   void AddAliasInfo(llvm::Instruction* inst, const VarNode* buffer_var, PrimExpr index,
-                    DataType access_dtype);
+                    PrimType access_dtype);
 
-  llvm::GlobalVariable* AllocateSharedMemory(DataType dtype, size_t size,
+  llvm::GlobalVariable* AllocateSharedMemory(PrimType dtype, size_t size,
                                              unsigned int shared_address_space, int alignment,
                                              llvm::GlobalValue::LinkageTypes linkage);
 
diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc
index 0633c4fcb3b6..70a407b75984 100644
--- a/src/target/llvm/codegen_params.cc
+++ b/src/target/llvm/codegen_params.cc
@@ -78,8 +78,8 @@ llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::T
   TVM_FFI_ICHECK(arr.IsContiguous()) << "CodegenParams: only support contiguous arrays";
   TVM_FFI_ICHECK_EQ(arr->device.device_type, kDLCPU)
       << "CodegenParams: only support contiguous arrays";
-  TVM_FFI_ICHECK_EQ(arr_type.lanes(), 1)
-      << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes();
+  TVM_FFI_ICHECK_EQ(arr_type.lanes, 1)
+      << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes;
 
   auto shape = arr.Shape();
   int num_elements = 1;
@@ -89,15 +89,15 @@ llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::T
 
   std::vector<llvm::Constant*> elements;
 
-  switch (arr_type.code()) {
-    case runtime::DataType::kInt:
-      TVM_FFI_ICHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
-                     arr_type.bits() == 64)
+  switch (arr_type.code) {
+    case kDLInt:
+      TVM_FFI_ICHECK(arr_type.bits == 8 || arr_type.bits == 16 || arr_type.bits == 32 ||
+                     arr_type.bits == 64)
           << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
-          << arr_type.bits() << "-bit array";
-      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+          << arr_type.bits << "-bit array";
+      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits);
 
-      switch (arr_type.bits()) {
+      switch (arr_type.bits) {
         case 8:
           BuildLLVMVector<int8_t>(element_type, arr->data, num_elements, &elements);
           break;
@@ -116,14 +116,14 @@ llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::T
       }
       break;
 
-    case runtime::DataType::TypeCode::kUInt:
-      TVM_FFI_ICHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
-                     arr_type.bits() == 64)
+    case kDLUInt:
+      TVM_FFI_ICHECK(arr_type.bits == 8 || arr_type.bits == 16 || arr_type.bits == 32 ||
+                     arr_type.bits == 64)
           << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
-          << arr_type.bits() << "-bit array";
-      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+          << arr_type.bits << "-bit array";
+      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits);
 
-      switch (arr_type.bits()) {
+      switch (arr_type.bits) {
         case 8:
           BuildLLVMVector<uint8_t>(element_type, arr->data, num_elements, &elements);
           break;
@@ -142,11 +142,11 @@ llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::T
       }
       break;
 
-    case runtime::DataType::TypeCode::kFloat:
-      switch (arr_type.bits()) {
+    case kDLFloat:
+      switch (arr_type.bits) {
         case 16:
           // NOTE: float16 is treated as uint16_t.
-          element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+          element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits);
           BuildLLVMVector<uint16_t>(element_type, arr->data, num_elements, &elements);
           break;
         case 32:
@@ -159,15 +159,15 @@ llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::T
           break;
         default:
           TVM_FFI_ICHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw "
-                                << arr_type.bits() << "-bit array";
+                                << arr_type.bits << "-bit array";
           break;
       }
       break;
 
-    case runtime::DataType::TypeCode::kBFloat:
-      TVM_FFI_ICHECK(arr_type.bits() == 16)
-          << "CodegenParams: only support 16-bit bfloat; saw " << arr_type.bits() << "-bit array";
-      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+    case kDLBfloat:
+      TVM_FFI_ICHECK(arr_type.bits == 16)
+          << "CodegenParams: only support 16-bit bfloat; saw " << arr_type.bits << "-bit array";
+      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits);
       BuildLLVMVector<uint16_t>(element_type, arr->data, num_elements, &elements);
 
     default:
diff --git a/src/target/llvm/codegen_x86_64.cc b/src/target/llvm/codegen_x86_64.cc
index 292b20caa6ae..0ae7cd146c33 100644
--- a/src/target/llvm/codegen_x86_64.cc
+++ b/src/target/llvm/codegen_x86_64.cc
@@ -53,9 +53,10 @@ llvm::Value* CodeGenX86_64::VisitExpr_(const CastNode* op) {
   // LLVM does not automatically generate the correct instruction sequences for
   // half -> float conversion (i.e. using AVX2/AVX-512 vectorized variants of
   // vcvtph2ps), so we explicitly generate them ourselves.
-  const auto from = op->value.dtype();
-  const auto to = op->dtype;
-  if (from.is_float() && to.is_float() && from.bits() == 16 && to.bits() == 32) {
+  const auto from = PrimType(op->value.ty()->dtype);
+  const auto to = PrimType(op->ty()->dtype);
+  if (from.MatchesCode(DLDataTypeCode::kDLFloat) && to.MatchesCode(DLDataTypeCode::kDLFloat) &&
+      from.bits() == 16 && to.bits() == 32) {
     TVM_FFI_ICHECK_EQ(from.lanes(), to.lanes());
 
     const auto has_avx512 = llvm_target_->TargetHasCPUFeature("avx512f");
@@ -63,12 +64,12 @@ llvm::Value* CodeGenX86_64::VisitExpr_(const CastNode* op) {
     if (from.lanes() >= 16 && has_avx512) {
       return CallVectorIntrin(
           llvm::Intrinsic::x86_avx512_mask_vcvtph2ps_512, 16,
-          DTypeToLLVMType(DataType::Float(32, from.lanes())),
+          DTypeToLLVMType(PrimType::Float(32, from.lanes())),
           {
-              MakeValue(tirx::Call(DataType::Int(16, from.lanes()), tirx::builtin::reinterpret(),
+              MakeValue(tirx::Call(PrimType::Int(16, from.lanes()), tirx::builtin::reinterpret(),
                                    {op->value})),
-              MakeValue(tirx::Broadcast(FloatImm(DataType::Float(32), 0), from.lanes())),
-              /*mask=*/MakeValue(IntImm(DataType::Int(16), -1)),
+              MakeValue(tirx::Broadcast(FloatImm(PrimType::Float(32), 0), from.lanes())),
+              /*mask=*/MakeValue(IntImm(PrimType::Int(16), -1)),
               /*rounding-mode=*/MakeValue(IntImm::Int32(4)),
           });
     }
diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc
index 33c74d90ddca..4ade49d44fdd 100644
--- a/src/target/llvm/intrin_rule_llvm.cc
+++ b/src/target/llvm/intrin_rule_llvm.cc
@@ -126,7 +126,7 @@ TVM_REGISTER_OP("tirx.exp10")
       const tirx::CallNode* call = e.as<tirx::CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       const PrimExpr& x = call->args[0];
-      PrimExpr ln10 = MakeConst(x.dtype(), 2.302585093);
+      PrimExpr ln10 = MakeConst(x.ty(), 2.302585093);
       PrimExpr ret = exp(x * ln10);
       return ret;
     });
@@ -162,8 +162,9 @@ TVM_REGISTER_OP("tirx.atanh")
       const tirx::CallNode* call = e.as<tirx::CallNode>();
       TVM_FFI_ICHECK(call != nullptr) << "Invalid call node in atanh legalization";
       const PrimExpr& x = call->args[0];
-      PrimExpr one = MakeConst(x.dtype(), 1.0);
-      return (log(one + x) - log(one - x)) * MakeConst(x.dtype(), 0.5);
+      PrimType x_ty = x.ty();
+      PrimExpr one = MakeConst(x_ty, 1.0);
+      return (log(one + x) - log(one - x)) * MakeConst(x_ty, 0.5);
     });
 
 TVM_REGISTER_OP("tirx.clz")
@@ -172,12 +173,12 @@ TVM_REGISTER_OP("tirx.clz")
       TVM_FFI_ICHECK(call != nullptr);
       TVM_FFI_ICHECK_EQ(call->args.size(), 1);
       ffi::Array<PrimExpr> cargs;
-      cargs.push_back(IntImm(DataType::UInt(32), ::llvm::Intrinsic::ctlz));
+      cargs.push_back(IntImm(PrimType::UInt(32), ::llvm::Intrinsic::ctlz));
       cargs.push_back(call->args[0]);
-      cargs.push_back(IntImm(DataType::Int(1), 1));  // is_zero_undef
+      cargs.push_back(IntImm(PrimType::Int(1), 1));  // is_zero_undef
       // LLVM requires that the return type must match the first argument type
-      auto clz = tirx::Call(call->args[0]->dtype, tirx::builtin::call_llvm_intrin(), cargs);
-      return cast(call->dtype, clz);
+      auto clz = tirx::Call(call->args[0].ty(), tirx::builtin::call_llvm_intrin(), cargs);
+      return cast(call->ty(), clz);
     });
 
 }  // namespace legalize
diff --git a/src/target/llvm/intrin_rule_llvm.h b/src/target/llvm/intrin_rule_llvm.h
index b70d2b8001e0..5fb8801386d1 100644
--- a/src/target/llvm/intrin_rule_llvm.h
+++ b/src/target/llvm/intrin_rule_llvm.h
@@ -43,7 +43,7 @@ inline PrimExpr DispatchLLVMPureIntrin(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   ffi::Array<PrimExpr> cargs;
   // intrin id.
-  cargs.push_back(IntImm(DataType::UInt(32), id));
+  cargs.push_back(IntImm(PrimType::UInt(32), id));
   TVM_FFI_ICHECK_EQ(call->args.size(), num_signature)
       << "llvm.call_llvm_intrin" << llvmGetIntrinName(id) << "expects " << num_signature
       << " arguments, but got " << call->args.size();
@@ -51,7 +51,7 @@ inline PrimExpr DispatchLLVMPureIntrin(const PrimExpr& e) {
   for (PrimExpr arg : call->args) {
     cargs.push_back(arg);
   }
-  return tirx::Call(call->dtype, tirx::builtin::call_llvm_pure_intrin(), cargs);
+  return tirx::Call(call->ty(), tirx::builtin::call_llvm_pure_intrin(), cargs);
 }
 
 template <unsigned id, int num_signature>
@@ -60,14 +60,14 @@ inline PrimExpr DispatchLLVMIntrin(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   ffi::Array<PrimExpr> cargs;
   // intrin id.
-  cargs.push_back(IntImm(DataType::UInt(32), id));
+  cargs.push_back(IntImm(PrimType::UInt(32), id));
   TVM_FFI_ICHECK_EQ(call->args.size(), num_signature)
       << "llvm.call_llvm_intrin" << llvmGetIntrinName(id) << "expects " << num_signature
       << " arguments, but got " << call->args.size();
   for (PrimExpr arg : call->args) {
     cargs.push_back(arg);
   }
-  return tirx::Call(call->dtype, tirx::builtin::call_llvm_intrin(), cargs);
+  return tirx::Call(call->ty(), tirx::builtin::call_llvm_intrin(), cargs);
 }
 
 }  // namespace codegen
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
index 3ada4404b0be..c0924b799099 100644
--- a/src/target/source/codegen_c.cc
+++ b/src/target/source/codegen_c.cc
@@ -109,7 +109,7 @@ void CodeGenC::PrintFunctionSignature(const ffi::String& function_name, const Pr
     }
 
     bool no_alias = func->HasNonzeroAttr(tirx::attr::kNoAlias);
-    bool is_handle = v.dtype().is_handle();
+    bool is_handle = v.ty().IsHandle();
     auto* ptr = v->type_annotation.as<PointerTypeNode>();
     if (ptr && ptr->element_type.as<TensorMapTypeNode>()) {
       is_handle = false;
@@ -205,7 +205,7 @@ void CodeGenC::PrintExpr(const PrimExpr& n, std::ostream& os) {  // NOLINT(*)
   if (print_ssa_form_) {
     std::ostringstream temp;
     VisitExpr(n, temp);
-    os << SSAGetID(temp.str(), n.dtype());
+    os << SSAGetID(temp.str(), n.ty()->dtype);
   } else {
     VisitExpr(n, os);
   }
@@ -213,8 +213,8 @@ void CodeGenC::PrintExpr(const PrimExpr& n, std::ostream& os) {  // NOLINT(*)
 
 static bool CheckOutermostBracketMatch(const std::string& s);
 
-void CodeGenC::PrintSSAAssign(const std::string& target, const std::string& src, DataType t) {
-  PrintType(t, stream);
+void CodeGenC::PrintSSAAssign(const std::string& target, const std::string& src, PrimType t) {
+  PrintType(t->dtype, stream);
   stream << ' ' << target << " = ";
   if (CheckOutermostBracketMatch(src)) {
     stream << src.substr(1, src.length() - 2);
@@ -225,7 +225,8 @@ void CodeGenC::PrintSSAAssign(const std::string& target, const std::string& src,
 }
 
 // Print a reference expression to a buffer.
-std::string CodeGenC::GetBufferRef(DataType t, const BufferNode* buffer, PrimExpr index) {
+std::string CodeGenC::GetBufferRef(DLDataType t, const BufferNode* buffer, PrimExpr index) {
+  PrimType t_ty(t);
   const VarNode* buffer_var = buffer->data.get();
   std::ostringstream os;
   std::string vid = GetVarID(buffer_var);
@@ -235,7 +236,7 @@ std::string CodeGenC::GetBufferRef(DataType t, const BufferNode* buffer, PrimExp
   }
   bool is_vol = IsVolatile(buffer_var);
 
-  auto ptr_cast = [this, is_vol, scope](DataType pointed_to) {
+  auto ptr_cast = [this, is_vol, scope](DLDataType pointed_to) {
     std::ostringstream ptr_os;
     ptr_os << "(";
     if (is_vol) {
@@ -249,7 +250,7 @@ std::string CodeGenC::GetBufferRef(DataType t, const BufferNode* buffer, PrimExp
     return ptr_os.str();
   };
 
-  DataType buffer_element_dtype = buffer->dtype;
+  DLDataType buffer_element_dtype = buffer->dtype->dtype;
 
   std::string buffer_str = vid;
   if (!HandleTypeMatch(buffer_var, buffer_element_dtype) || is_vol) {
@@ -259,19 +260,20 @@ std::string CodeGenC::GetBufferRef(DataType t, const BufferNode* buffer, PrimExp
   }
 
   std::string index_str = PrintExpr(index);
-  if ((t.bits() == 4 && !t.is_float4()) || (t.bits() == 1 && t.is_int())) {
+  if ((t.bits == 4 && t_ty.code() != DLDataTypeCode::kDLFloat4_e2m1fn) ||
+      (t.bits == 1 && t_ty.MatchesCode(DLDataTypeCode::kDLInt))) {
     // This is a special case, because CodegenCUDA::PrintType()
     // returns "int" for bool and for 4-bit integers. In most cases,
     // we divide by the number of lanes to determine the index.
     // However, the backing type for scalar int4 and scalar bool is
     // int32.  Therefore, we need to divide by the ratio of their
     // sizes in that case.
-    int div_factor = (t.lanes() == 1) ? (32 / t.bits()) : t.lanes();
+    int div_factor = (t_ty.lanes() == 1) ? (32 / t.bits) : t_ty.lanes();
 
     os << "*("
        << "(" << ptr_cast(t) << vid << ")"
        << " + " << index_str << " / " << div_factor << ")";
-  } else if (t.is_float4_e2m1fn() && t.lanes() == 1) {
+  } else if (t_ty.code() == DLDataTypeCode::kDLFloat4_e2m1fn && t_ty.lanes() == 1) {
     // float4_e2m1fn: sizeof(__nv_fp4_e2m1) = 1 byte, but data is packed
     // 2 elements per byte.  Divide element index by 2 to get byte offset.
     // This returns an lvalue so it works for address_of() and stores.
@@ -287,8 +289,9 @@ std::string CodeGenC::GetBufferRef(DataType t, const BufferNode* buffer, PrimExp
 }
 
 // Print a reference expression to a buffer.
-std::string CodeGenC::GetStructRef(DataType t, const PrimExpr& buffer, const PrimExpr& index,
+std::string CodeGenC::GetStructRef(DLDataType t, const PrimExpr& buffer, const PrimExpr& index,
                                    int kind) {
+  PrimType t_ty(t);
   if (kind < builtin::kDLTensorKindBound_) {
     std::ostringstream os;
     os << "(((DLTensor*)";
@@ -357,11 +360,11 @@ std::string CodeGenC::GetStructRef(DataType t, const PrimExpr& buffer, const Pri
     os << "(((TVMFFIAny*)";
     this->PrintExpr(buffer, os);
     os << ")[" << index << "].";
-    if (t.is_handle()) {
+    if (t_ty.IsHandle()) {
       os << "v_ptr";
-    } else if (t.is_float()) {
+    } else if (t_ty.code() == DLDataTypeCode::kDLFloat) {
       os << "v_float64";
-    } else if (t.is_int()) {
+    } else if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
       os << "v_int64";
     } else {
       TVM_FFI_THROW(InternalError) << "Do not know how to handle type" << t;
@@ -382,13 +385,13 @@ std::string CodeGenC::GetStructRef(DataType t, const PrimExpr& buffer, const Pri
   }
 }
 
-bool CodeGenC::HandleTypeMatch(const VarNode* buf_var, DataType t) const {
+bool CodeGenC::HandleTypeMatch(const VarNode* buf_var, DLDataType t) const {
   auto it = handle_data_type_.find(buf_var);
   if (it == handle_data_type_.end()) return false;
   return it->second == t;
 }
 
-void CodeGenC::RegisterHandleType(const VarNode* buf_var, DataType t) {
+void CodeGenC::RegisterHandleType(const VarNode* buf_var, DLDataType t) {
   auto it = handle_data_type_.find(buf_var);
   if (it == handle_data_type_.end()) {
     handle_data_type_[buf_var] = t;
@@ -401,39 +404,39 @@ void CodeGenC::RegisterHandleTypeFromPointer(const tirx::Var& var, const PrimExp
   if (value == nullptr) return;
   auto* call = value->as<tirx::CallNode>();
   if (call == nullptr || !call->op.same_as(builtin::ptr_byte_offset())) return;
-  std::optional<DataType> value_dtype = tirx::GetPointerType(GetType(*value));
+  std::optional<DLDataType> value_dtype = tirx::GetPointerType(GetType(*value));
   if (!value_dtype.has_value()) return;
   RegisterHandleType(var.get(), value_dtype.value());
   pointer_offset_vars_.insert(var.get());
 }
 
-void CodeGenC::PrintVecElemLoad(const std::string& vec, DataType t, int i,
+void CodeGenC::PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                                 std::ostream& os) {  // NOLINT(*)
   os << vec << ".s" << std::hex << i << std::dec;
 }
 
-void CodeGenC::PrintVecElemStore(const std::string& vec, DataType t, int i,
+void CodeGenC::PrintVecElemStore(const std::string& vec, DLDataType t, int i,
                                  const std::string& value) {
   this->PrintIndent();
   stream << vec << ".s" << std::hex << i << " = " << value << ";\n" << std::dec;
 }
 
-std::string CodeGenC::GetVecLoad(DataType t, const BufferNode* buffer, PrimExpr base) {
+std::string CodeGenC::GetVecLoad(DLDataType t, const BufferNode* buffer, PrimExpr base) {
   return GetBufferRef(t, buffer, base);
 }
 
-void CodeGenC::PrintVecStore(const BufferNode* buffer, DataType t, PrimExpr base,
+void CodeGenC::PrintVecStore(const BufferNode* buffer, DLDataType t, PrimExpr base,
                              const std::string& value) {
   std::string ref = GetBufferRef(t, buffer, base);
   this->PrintIndent();
   stream << ref << " = " << value << ";\n";
 }
 
-void CodeGenC::PrintVecConstructor(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenC::PrintVecConstructor(DLDataType t, std::ostream& os) {  // NOLINT(*)
   PrintType(t, os);
 }
 
-std::string CodeGenC::CastFromTo(std::string value, DataType from, DataType target) {
+std::string CodeGenC::CastFromTo(std::string value, DLDataType from, DLDataType target) {
   if (from == target) return value;
   std::ostringstream os;
   os << "((";
@@ -454,21 +457,21 @@ void CodeGenC::PrintStorageScope(const std::string& scope, std::ostream& os) {
 }
 
 inline void PrintConst(const IntImmNode* op, std::ostream& os, CodeGenC* p) {  // NOLINT(*)
-  if (op->dtype == DataType::Int(32)) {
+  if (op->ty() == PrimType::Int(32)) {
     std::ostringstream temp;
     temp << op->value;
     p->MarkConst(temp.str());
     os << temp.str();
   } else {
     os << "(";
-    p->PrintType(op->dtype, os);
+    p->PrintType(op->ty()->dtype, os);
     os << ")" << op->value;
   }
 }
 
-inline void PrintUIntConst(DataType dtype, uint64_t val, std::ostream& os,
+inline void PrintUIntConst(DLDataType dtype, uint64_t val, std::ostream& os,
                            CodeGenC* p) {  // NOLINT(*)
-  if (dtype == DataType::UInt(32)) {
+  if (dtype == DLDataType{kDLUInt, 32, 1}) {
     std::ostringstream temp;
     temp << val << "U";
     p->MarkConst(temp.str());
@@ -481,24 +484,24 @@ inline void PrintUIntConst(DataType dtype, uint64_t val, std::ostream& os,
 }
 
 inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenC* p) {  // NOLINT(*)
-  switch (op->dtype.bits()) {
+  switch (op->ty().bits()) {
     case 64:
     case 32: {
       std::ostringstream temp;
       temp << std::scientific << op->value;
-      if (op->dtype.bits() == 32) temp << 'f';
+      if (op->ty().bits() == 32) temp << 'f';
       p->MarkConst(temp.str());
       os << temp.str();
       break;
     }
     case 16: {
       os << '(';
-      p->PrintType(op->dtype, os);
+      p->PrintType(op->ty()->dtype, os);
       os << ')' << std::scientific << op->value << 'f';
       break;
     }
     default:
-      TVM_FFI_THROW(InternalError) << "Bad bit-width for float: " << op->dtype << "\n";
+      TVM_FFI_THROW(InternalError) << "Bad bit-width for float: " << op->ty()->dtype << "\n";
   }
 }
 
@@ -517,7 +520,7 @@ template <typename T>
 inline void PrintBinaryExpr(const T* op, const char* opstr,
                             std::ostream& os,  // NOLINT(*)
                             CodeGenC* p) {
-  if (op->dtype.lanes() == 1) {
+  if (op->ty().lanes() == 1) {
     if (isalpha(opstr[0])) {
       os << opstr << '(';
       p->PrintExpr(op->a, os);
@@ -532,14 +535,14 @@ inline void PrintBinaryExpr(const T* op, const char* opstr,
       os << ')';
     }
   } else {
-    p->PrintVecBinaryOp(opstr, op->dtype, op->a, op->b, os);
+    p->PrintVecBinaryOp(opstr, op->ty()->dtype, op->a, op->b, os);
   }
 }
 
 inline void PrintBinaryIntrinsic(const CallNode* op, const char* opstr,
                                  std::ostream& os,  // NOLINT(*)
                                  CodeGenC* p) {
-  if (op->dtype.lanes() == 1) {
+  if (op->ty().lanes() == 1) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 2U);
     os << '(';
     p->PrintExpr(op->args[0], os);
@@ -547,13 +550,13 @@ inline void PrintBinaryIntrinsic(const CallNode* op, const char* opstr,
     p->PrintExpr(op->args[1], os);
     os << ')';
   } else {
-    p->PrintVecBinaryOp(opstr, op->dtype, op->args[0], op->args[1], os);
+    p->PrintVecBinaryOp(opstr, op->ty()->dtype, op->args[0], op->args[1], os);
   }
 }
 void CodeGenC::VisitExpr_(const CastNode* op, std::ostream& os) {  // NOLINT(*)
   std::stringstream value;
   this->PrintExpr(op->value, value);
-  os << CastFromTo(value.str(), op->value.dtype(), op->dtype);
+  os << CastFromTo(value.str(), op->value.ty()->dtype, op->ty()->dtype);
 }
 void CodeGenC::VisitExpr_(const VarNode* op, std::ostream& os) {  // NOLINT(*)
   os << GetVarID(op);
@@ -571,19 +574,20 @@ void CodeGenC::VisitExpr_(const DivNode* op, std::ostream& os) {  // NOLINT(*)
   PrintBinaryExpr(op, "/", os, this);
 }
 void CodeGenC::VisitExpr_(const ModNode* op, std::ostream& os) {  // NOLINT(*)
-  if (op->dtype.is_int() || op->dtype.is_uint()) {
+  const PrimType op_ty = op->ty();  // result type, queried several times below
+  if (op_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     PrintBinaryExpr(op, "%", os, this);
   } else {
-    TVM_FFI_ICHECK(op->dtype.is_float())
-        << "Expected floating point or integer dtype in Mod, but got " << op->dtype;
-    if (op->dtype.bits() == 32) {
+    TVM_FFI_ICHECK(op_ty.code() == DLDataTypeCode::kDLFloat)
+        << "Expected floating point or integer dtype in Mod, but got " << op_ty->dtype;
+    if (op_ty.bits() == 32) {
       PrintBinaryExpr(op, "fmodf", os, this);
-    } else if (op->dtype.bits() == 64) {
+    } else if (op_ty.bits() == 64) {
       PrintBinaryExpr(op, "fmod", os, this);
     } else {
       TVM_FFI_ICHECK(false)
           << "Non single or double precision floating point in Mod, expected 32 or 64 bits but got "
-          << op->dtype.bits() << " bits.";
+          << op_ty.bits() << " bits.";
     }
   }
 }
@@ -658,7 +662,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
         for (size_t i = 1; i < op->args.size(); i++) {
           arg_types.push_back(GetType(op->args[i]));
         }
-        Type ret_type = GetTypeFromRuntimeDataType(op->dtype);
+        Type ret_type = GetType(ffi::GetRef<PrimExpr>(op));
         this->GenerateForwardFunctionDeclarations(func->value, arg_types, ret_type);
       }
     } else if (op_attr_global_symbol_.count(call_op)) {
@@ -672,7 +676,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
       uint64_t low = static_cast<uint64_t>(op->args[0].as_or_throw<IntImm>()->value);
       uint64_t high = static_cast<uint64_t>(op->args[1].as_or_throw<IntImm>()->value);
       uint64_t val = (high << 32U) | low;
-      PrintUIntConst(op->dtype, val, os, this);
+      PrintUIntConst(op->ty()->dtype, val, os, this);
     } else if (op->op.same_as(builtin::bitwise_xor())) {
       PrintBinaryIntrinsic(op, " ^ ", os, this);
     } else if (op->op.same_as(builtin::bitwise_or())) {
@@ -691,7 +695,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
       std::string result = name_supply_->FreshName("condval");
       std::string cond = PrintExpr(op->args[0]);
       this->PrintIndent();
-      PrintType(op->dtype, this->stream);
+      PrintType(op->ty()->dtype, this->stream);
       this->stream << " " << result << ";\n";
       this->PrintIndent();
       this->stream << "if (" << cond << ") {\n";
@@ -721,13 +725,14 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
         TVM_FFI_ICHECK_EQ(load->indices.size(), 1)
             << "CodeGenC only supports flat memory allocations.";
         const VarNode* data = load->buffer->data.get();
-        if (pointer_offset_vars_.count(data) && HandleTypeMatch(data, load->buffer->dtype) &&
+        if (pointer_offset_vars_.count(data) && HandleTypeMatch(data, load->buffer->dtype->dtype) &&
             !IsVolatile(data)) {
           os << "(" << GetVarID(data) << " + ";
           this->PrintExpr(load->indices[0], os);
           os << ")";
         } else {
-          os << "(&(" << GetBufferRef(load->dtype, load->buffer.get(), load->indices[0]) << "))";
+          os << "(&(" << GetBufferRef(load->ty()->dtype, load->buffer.get(), load->indices[0])
+             << "))";
         }
       } else {
         auto* var = op->args[0].as<tirx::VarNode>();
@@ -752,7 +757,8 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
       }
     } else if (op->op.same_as(builtin::tvm_struct_get())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 3U);
-      os << GetStructRef(op->dtype, op->args[0], op->args[1], op->args[2].as<IntImmNode>()->value);
+      os << GetStructRef(op->ty()->dtype, op->args[0], op->args[1],
+                         op->args[2].as<IntImmNode>()->value);
     } else if (op->op.same_as(builtin::isnullptr())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 1U);
       os << "(";
@@ -761,7 +767,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
     } else if (op->op.same_as(builtin::ptr_byte_offset())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 3U);
       os << "((";
-      PrintType(op->args[2].dtype(), os);
+      PrintType(op->args[2].ty()->dtype, os);
       os << "*)(((char*)";
       this->PrintExpr(op->args[0], os);
       os << ") + ";
@@ -775,10 +781,10 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
       this->PrintExpr(op->args[1], os);
       os << "))";
     } else if (op->op.same_as(builtin::reinterpret())) {
-      auto target_dtype = op->dtype;
-      auto source_dtype = op->args[0]->dtype;
-      TVM_FFI_ICHECK_EQ(target_dtype.lanes() * target_dtype.bits(),
-                        source_dtype.lanes() * source_dtype.bits())
+      auto target_dtype = op->ty()->dtype;
+      auto source_dtype = op->args[0].ty()->dtype;
+      TVM_FFI_ICHECK_EQ(PrimType(target_dtype).lanes() * target_dtype.bits,
+                        PrimType(source_dtype).lanes() * source_dtype.bits)
           << "reinterpret expects source and target to have the same number of bits";
       int ssa_scope = BeginScope();
       std::string rhs = SSAGetID(PrintExpr(op->args[0]), source_dtype);
@@ -815,7 +821,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
   }
 }
 
-void CodeGenC::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, PrimExpr rhs,
+void CodeGenC::PrintVecBinaryOp(const std::string& op, DLDataType t, PrimExpr lhs, PrimExpr rhs,
                                 std::ostream& os) {  // NOLINT(*)
   if (isalpha(op[0])) {
     os << op << "(";
@@ -840,16 +846,18 @@ void CodeGenC::VisitExpr_(const BufferLoadNode* op, std::ostream& os) {  // NOLI
   TVM_FFI_ICHECK_EQ(op->indices.size(), 1) << "Load from non-flat memory not supported.";
   TVM_FFI_ICHECK(!op->predicate.defined()) << "Predicated buffer load is not supported.";
 
-  DataType value_dtype = op->dtype;
+  PrimType value_ty = op->ty();
+  DLDataType value_dtype = value_ty->dtype;
   PrimExpr index = op->indices[0];
   Var buffer_var = op->buffer->data;
-  DataType element_dtype = op->buffer->dtype;
+  DLDataType element_dtype = op->buffer->dtype->dtype;
+  PrimType element_ty(element_dtype);
 
-  int lanes = op->dtype.lanes();
-  // delcare type.
-  if (value_dtype.lanes() == element_dtype.lanes()) {
-    std::string ref = GetBufferRef(op->dtype, op->buffer.get(), index);
-    if (value_dtype.is_float4_e2m1fn() && value_dtype.lanes() == 1) {
+  int lanes = value_ty.lanes();
+  // Value and buffer element have the same lane count: build a direct buffer reference.
+  if (lanes == element_ty.lanes()) {
+    std::string ref = GetBufferRef(value_dtype, op->buffer.get(), index);
+    if (value_ty.code() == DLDataTypeCode::kDLFloat4_e2m1fn && lanes == 1) {
       // GetBufferRef returns an lvalue: *(ptr + index/2), which reads the
       // full byte.  Extract the correct nibble (low for even, high for odd).
       std::string index_str = PrintExpr(index);
@@ -863,34 +871,34 @@ void CodeGenC::VisitExpr_(const BufferLoadNode* op, std::ostream& os) {  // NOLI
   } else {
     bool can_vector_load = false;
     arith::PVar<PrimExpr> base;
-    if (arith::ramp(base, 1, op->dtype.lanes()).Match(index)) {
+    if (arith::ramp(base, 1, value_ty.lanes()).Match(index)) {
       const RampNode* ramp = index.as<RampNode>();
       TVM_FFI_ICHECK(ramp);
       arith::ModularSet me = arith::Analyzer()->modular_set(ramp->base);
       // The condition: {k * coeff + base} divisible by the alignment for any k
-      if (me->coeff % op->dtype.lanes() == 0 && me->base % op->dtype.lanes() == 0) {
+      if (me->coeff % value_ty.lanes() == 0 && me->base % value_ty.lanes() == 0) {
         can_vector_load = true;
       }
     }
 
-    if (value_dtype.is_float4_e2m1fn() && lanes != 1) {
+    if (value_ty.code() == DLDataTypeCode::kDLFloat4_e2m1fn && lanes != 1) {
       // A float4_e2m1fn element has 4 bits, which is an incomplete byte.
       // So we cannot vector load it.
       can_vector_load = false;
     }
     if (can_vector_load) {
-      std::string ref = GetVecLoad(op->dtype, op->buffer.get(), base.Eval());
+      std::string ref = GetVecLoad(op->ty()->dtype, op->buffer.get(), base.Eval());
       HandleVolatileLoads(ref, op, os);
     } else {
       std::ostringstream svalue_expr;
-      std::string sindex = SSAGetID(PrintExpr(index), index.dtype());
+      std::string sindex = SSAGetID(PrintExpr(index), index.ty()->dtype);
       std::string vid = GetVarID(buffer_var.get());
-      DataType elem_type = op->dtype.element_of();
+      DLDataType elem_type{value_dtype.code, value_dtype.bits, 1};
       for (int i = 0; i < lanes; ++i) {
         std::ostringstream value_temp;
         if (!HandleTypeMatch(buffer_var.get(), elem_type)) {
           value_temp << "((";
-          if (buffer_var.get()->dtype.is_handle()) {
+          if (buffer_var.get()->ty().IsHandle()) {
             auto it = alloc_storage_scope_.find(buffer_var.get());
             if (it != alloc_storage_scope_.end()) {
               PrintStorageScope(it->second, value_temp);
@@ -902,9 +910,9 @@ void CodeGenC::VisitExpr_(const BufferLoadNode* op, std::ostream& os) {  // NOLI
           value_temp << vid;
         }
         value_temp << '[';
-        PrintVecElemLoad(sindex, index.dtype(), i, value_temp);
+        PrintVecElemLoad(sindex, index.ty()->dtype, i, value_temp);
         value_temp << ']';
-        PrintVecElemLoadExpr(op->dtype, i, value_temp.str(), svalue_expr);
+        PrintVecElemLoadExpr(op->ty()->dtype, i, value_temp.str(), svalue_expr);
       }
       os << svalue_expr.str();
     }
@@ -915,12 +923,14 @@ void CodeGenC::VisitStmt_(const BufferStoreNode* op) {
   TVM_FFI_ICHECK_EQ(op->indices.size(), 1) << "Store to non-flat memory not supported.";
   TVM_FFI_ICHECK(!op->predicate.defined()) << "Predicated buffer store is not supported.";
 
-  DataType value_dtype = op->value.dtype();
-  DataType element_dtype = op->buffer->dtype;
+  PrimType value_ty = op->value.ty();  // type of the stored value
+  DLDataType value_dtype = value_ty->dtype;
+  DLDataType element_dtype = op->buffer->dtype->dtype;
+  PrimType element_ty(element_dtype);
   PrimExpr index_expr = op->indices[0];
   Var buffer_var = op->buffer->data;
 
-  if (value_dtype.lanes() == element_dtype.lanes()) {
+  if (value_ty.lanes() == element_ty.lanes()) {
     std::string value = this->PrintExpr(op->value);
     std::string ref = this->GetBufferRef(value_dtype, op->buffer.get(), index_expr);
     this->PrintIndent();
@@ -928,8 +938,8 @@ void CodeGenC::VisitStmt_(const BufferStoreNode* op) {
   } else {
     arith::PVar<PrimExpr> base;
 
-    if (arith::ramp(base, 1, value_dtype.lanes()).Match(index_expr) &&
-        !value_dtype.is_float4_e2m1fn()) {
+    if (arith::ramp(base, 1, value_ty.lanes()).Match(index_expr) &&
+        value_ty.code() != DLDataTypeCode::kDLFloat4_e2m1fn) {
       std::string value = this->PrintExpr(op->value);
       this->PrintVecStore(op->buffer.get(), value_dtype, base.Eval(), value);
     } else {
@@ -938,15 +948,15 @@ void CodeGenC::VisitStmt_(const BufferStoreNode* op) {
       int vec_scope = BeginScope();
 
       // store elements separately
-      std::string index = SSAGetID(PrintExpr(index_expr), index_expr.dtype());
-      std::string value = SSAGetID(PrintExpr(op->value), op->value.dtype());
+      std::string index = SSAGetID(PrintExpr(index_expr), index_expr.ty()->dtype);
+      std::string value = SSAGetID(PrintExpr(op->value), op->value.ty()->dtype);
       std::string vid = GetVarID(buffer_var.get());
-      for (int i = 0; i < value_dtype.lanes(); ++i) {
+      for (int i = 0; i < value_ty.lanes(); ++i) {
         this->PrintIndent();
-        DataType elem_type = value_dtype.element_of();
+        DLDataType elem_type{value_dtype.code, value_dtype.bits, 1};
         if (!HandleTypeMatch(buffer_var.get(), elem_type)) {
           stream << "((";
-          if (buffer_var.get()->dtype.is_handle()) {
+          if (buffer_var.get()->ty().IsHandle()) {
             auto it = alloc_storage_scope_.find(buffer_var.get());
             if (it != alloc_storage_scope_.end()) {
               PrintStorageScope(it->second, stream);
@@ -958,9 +968,9 @@ void CodeGenC::VisitStmt_(const BufferStoreNode* op) {
           stream << vid;
         }
         stream << '[';
-        PrintVecElemLoad(index, index_expr.dtype(), i, stream);
+        PrintVecElemLoad(index, index_expr.ty()->dtype, i, stream);
         stream << "] = ";
-        PrintVecElemLoad(value, op->value.dtype(), i, stream);
+        PrintVecElemLoad(value, op->value.ty()->dtype, i, stream);
         stream << ";\n";
       }
       EndScope(vec_scope);
@@ -983,13 +993,13 @@ void CodeGenC::VisitExpr_(const LetNode* op, std::ostream& os) {  // NOLINT(*)
     var_idmap_[op->var.get()] = value;
   } else {
     PrintIndent();
-    if (op->var.dtype() == DataType::Handle() && handle_data_type_.count(op->var.get())) {
+    if (op->var.ty().IsHandle() && handle_data_type_.count(op->var.get())) {
       PrintType(handle_data_type_.at(op->var.get()), this->stream);
       this->stream << "* " << AllocVarID(op->var.get()) << " = (";
       PrintType(handle_data_type_.at(op->var.get()), this->stream);
       this->stream << "*)" << value << ";\n";
     } else {
-      PrintType(op->var.dtype(), this->stream);
+      PrintType(op->var.ty()->dtype, this->stream);
       this->stream << ' ' << AllocVarID(op->var.get()) << " = " << value << ";\n";
     }
   }
@@ -1004,8 +1014,8 @@ void CodeGenC::VisitExpr_(const LetNode* op, std::ostream& os) {  // NOLINT(*)
 void CodeGenC::VisitExpr_(const RampNode* op, std::ostream& os) {  // NOLINT(*)
   // NOTE: C have comma expression so cannot use (int2)(v0, v1)
   // instead should use int2(v0, v1)
-  PrintType(op->dtype, os);
-  int lanes = op->dtype.lanes();
+  PrintType(op->ty()->dtype, os);
+  int lanes = op->ty().lanes();
   os << "(";
   for (int i = 0; i < lanes; i++) {
     os << "(" << PrintExpr(op->base) << ")"
@@ -1031,11 +1041,11 @@ void CodeGenC::VisitExpr_(const ShuffleNode* op, std::ostream& os) {  // NOLINT(
   if (op->vectors.size() > 1) {
     for (const PrimExpr& vec : op->vectors) {
       std::string vec_value = this->PrintExpr(vec);
-      if (vec.dtype().lanes() == 1) {
+      if (vec.ty().lanes() == 1) {
         concat_vec.push_back(vec_value);
       } else {
         // print out each element
-        for (int i = 0; i < vec.dtype().lanes(); ++i) {
+        for (int i = 0; i < vec.ty().lanes(); ++i) {
           // access i-th element of each vector
           std::ostringstream vec_elem_strm;
           vec_elem_strm << vec_value << "[" << i << "]";
@@ -1046,14 +1056,14 @@ void CodeGenC::VisitExpr_(const ShuffleNode* op, std::ostream& os) {  // NOLINT(
   } else {
     // Extract elements from a single vector-type value.
     std::string vec_value = "(" + this->PrintExpr(op->vectors[0]) + ")";
-    if (op->vectors[0].dtype().lanes() == 1) {
+    if (op->vectors[0].ty().lanes() == 1) {
       concat_vec.push_back(vec_value);
     } else {
       // print out each element
-      for (int i = 0; i < op->vectors[0].dtype().lanes(); ++i) {
+      for (int i = 0; i < op->vectors[0].ty().lanes(); ++i) {
         // access i-th element of each vector
         std::ostringstream vec_elem_strm;
-        PrintVecElemLoad(vec_value, op->vectors[0].dtype(), i, vec_elem_strm);
+        PrintVecElemLoad(vec_value, op->vectors[0].ty()->dtype, i, vec_elem_strm);
         concat_vec.push_back(vec_elem_strm.str());
       }
     }
@@ -1071,7 +1081,7 @@ void CodeGenC::VisitExpr_(const ShuffleNode* op, std::ostream& os) {  // NOLINT(
   } else {
     // Print the shuffle as vector constructor
     // vec(e0, e1, e2, .. en)
-    PrintVecConstructor(op->dtype, os);
+    PrintVecConstructor(op->ty()->dtype, os);
     os << '(';
     for (size_t i = 0; i < op->indices.size(); ++i) {
       if (i != 0) os << ", ";
@@ -1108,13 +1118,13 @@ void CodeGenC::VisitStmt_(const BindNode* op) {
     var_idmap_[op->var.get()] = value;
   } else {
     PrintIndent();
-    if (op->var.dtype() == DataType::Handle() && handle_data_type_.count(op->var.get())) {
+    if (op->var.ty().IsHandle() && handle_data_type_.count(op->var.get())) {
       PrintType(handle_data_type_.at(op->var.get()), stream);
       stream << "* " << AllocVarID(op->var.get()) << " = (";
       PrintType(handle_data_type_.at(op->var.get()), stream);
       stream << "*)" << value << ";\n";
     } else {
-      PrintType(op->var.dtype(), this->stream);
+      PrintType(op->var.ty()->dtype, this->stream);
       this->stream << ' ' << AllocVarID(op->var.get()) << " = " << value << ";\n";
     }
   }
@@ -1138,10 +1148,10 @@ void CodeGenC::VisitStmt_(const AllocBufferNode* op) {
   alloc_storage_scope_[op->buffer->data.get()] = scope;
   PrintStorageScope(scope, stream);
 
-  PrintType(op->buffer->dtype, stream);
+  PrintType(op->buffer->dtype->dtype, stream);
   stream << ' ' << vid << '[' << constant_size << "];\n";
 
-  RegisterHandleType(op->buffer->data.get(), op->buffer->dtype);
+  RegisterHandleType(op->buffer->data.get(), op->buffer->dtype->dtype);
   if (op->annotations.count(tirx::attr::kVolatile)) {
     MarkVolatile(op->buffer->data.get());
   }
@@ -1247,7 +1257,7 @@ void CodeGenC::VisitStmt_(const ForNode* op) {
   PrintIndent();
   std::string vid = AllocVarID(op->loop_var.get());
   stream << "for (";
-  PrintType(op->loop_var.dtype(), stream);
+  PrintType(op->loop_var.ty()->dtype, stream);
   stream << ' ' << vid << " = " << begin_str << "; " << vid << " < " << end_str << "; ";
   if (step_str.empty()) {
     stream << "++" << vid;
@@ -1326,23 +1336,23 @@ void CodeGenC::VisitStmt_(const EvaluateNode* op) {
     } else if (call->op.same_as(builtin::tvm_struct_set())) {
       TVM_FFI_ICHECK_EQ(call->args.size(), 4);
       int kind = call->args[2].as<IntImmNode>()->value;
-      DataType store_dtype = call->args[3].dtype();
+      DLDataType store_dtype = call->args[3].ty()->dtype;
+      PrimType store_ty(store_dtype);
       std::string ref = GetStructRef(store_dtype, call->args[0], call->args[1], kind);
       std::string value = PrintExpr(call->args[3]);
       std::string cast;
 
-      if (kind == builtin::kTVMFFIAnyUnionValue &&
-          (store_dtype.bits() < 64 || store_dtype.is_handle())) {
+      if (kind == builtin::kTVMFFIAnyUnionValue && (store_dtype.bits < 64 || store_ty.IsHandle())) {
         this->PrintIndent();
         // when we set any union value, we need to be careful to
         // clear off the union value to zero if the set size is less than 64 bits
-        this->stream << GetStructRef(DataType::Int(64), call->args[0], call->args[1], kind)
+        this->stream << GetStructRef(DLDataType{kDLInt, 64, 1}, call->args[0], call->args[1], kind)
                      << " = 0;\n";
       }
 
       if (kind == builtin::kDLTensorStrides) {
         // cast void* to int64_t*
-        cast = call->args[3]->dtype.is_handle() ? "(int64_t*)" : "";
+        cast = call->args[3].ty().IsHandle() ? "(int64_t*)" : "";
       } else if (kind == builtin::kDLTensorDeviceType) {
         // cast int to enum
         cast = "(DLDeviceType)";
@@ -1359,9 +1369,12 @@ void CodeGenC::VisitStmt_(const EvaluateNode* op) {
   }
 }
 
-void CodeGenC::PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os) {
-  TVM_FFI_ICHECK_GT(t.lanes(), 1);
-  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
+void CodeGenC::PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value,
+                                    std::ostream& os) {
+  PrimType t_ty(t);
+  int lanes = t_ty.lanes();
+  TVM_FFI_ICHECK_GT(lanes, 1);
+  if (t.bits == 8 && (t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))) {
     if (i != 0) {
       os << "|";
     }
@@ -1377,7 +1390,7 @@ void CodeGenC::PrintVecElemLoadExpr(DataType t, int i, const std::string& value,
     os << "(";
   }
   os << value;
-  if (i != t.lanes() - 1) {
+  if (i != lanes - 1) {
     os << ",";
   } else {
     os << "))";
diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h
index a023277ed19c..61d640b66947 100644
--- a/src/target/source/codegen_c.h
+++ b/src/target/source/codegen_c.h
@@ -209,25 +209,27 @@ class CodeGenC : public ExprFunctor<void(const PrimExpr&, std::ostream&)>,
   virtual void PrintStorageScope(const std::string& scope, std::ostream& os);  // NOLINT(*)
   virtual void PrintStorageSync(const CallNode* op);                           // NOLINT(*)
   // Binary vector op.
-  virtual void PrintVecBinaryOp(const std::string& op, DataType op_type, PrimExpr lhs, PrimExpr rhs,
+  virtual void PrintVecBinaryOp(const std::string& op, DLDataType op_type, PrimExpr lhs,
+                                PrimExpr rhs,
                                 std::ostream& os);  // NOLINT(*)
   // print vector load
-  virtual std::string GetVecLoad(DataType t, const BufferNode* buffer, PrimExpr base);
+  virtual std::string GetVecLoad(DLDataType t, const BufferNode* buffer, PrimExpr base);
   // print vector store
-  virtual void PrintVecStore(const BufferNode* buffer, DataType t, PrimExpr base,
+  virtual void PrintVecStore(const BufferNode* buffer, DLDataType t, PrimExpr base,
                              const std::string& value);  // NOLINT(*)
   // print load of single element
-  virtual void PrintVecElemLoad(const std::string& vec, DataType t, int i,
+  virtual void PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                                 std::ostream& os);  // NOLINT(*)
   // print store of single element.
-  virtual void PrintVecElemStore(const std::string& vec, DataType t, int i,
+  virtual void PrintVecElemStore(const std::string& vec, DLDataType t, int i,
                                  const std::string& value);
   // print vector constructor
-  virtual void PrintVecConstructor(DataType t, std::ostream& os);
+  virtual void PrintVecConstructor(DLDataType t, std::ostream& os);
   // Get a cast type from to
-  virtual std::string CastFromTo(std::string value, DataType from, DataType target);
+  virtual std::string CastFromTo(std::string value, DLDataType from, DLDataType target);
   // Get load of single element with expression
-  virtual void PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os);
+  virtual void PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value,
+                                    std::ostream& os);
   // Print restrict keyword for a given Var if applicable
   virtual void PrintRestrict(const Var& v, std::ostream& os);
 
@@ -239,9 +241,9 @@ class CodeGenC : public ExprFunctor<void(const PrimExpr&, std::ostream&)>,
   /*! \brief Print a C string literal with proper escaping of special chars. */
   void PrintEscapedCString(const std::string& str, std::ostream& os);
   // Print reference to struct location
-  std::string GetStructRef(DataType t, const PrimExpr& buffer, const PrimExpr& index, int kind);
+  std::string GetStructRef(DLDataType t, const PrimExpr& buffer, const PrimExpr& index, int kind);
   // Print reference to a buffer as type t in index.
-  virtual std::string GetBufferRef(DataType t, const BufferNode* buffer, PrimExpr index);
+  virtual std::string GetBufferRef(DLDataType t, const BufferNode* buffer, PrimExpr index);
 
   /*!
    * \brief Handle volatile loads.
@@ -294,13 +296,13 @@ class CodeGenC : public ExprFunctor<void(const PrimExpr&, std::ostream&)>,
    * \param buf_var The buffer variable.
    * \param t The type to be checked.
    */
-  bool HandleTypeMatch(const VarNode* buf_var, DataType t) const;
+  bool HandleTypeMatch(const VarNode* buf_var, DLDataType t) const;
   /*!
    * \brief Register the data type of buf_var
    * \param buf_var The buffer variable.
    * \param t The type to be checked.
    */
-  void RegisterHandleType(const VarNode* buf_var, DataType t);
+  void RegisterHandleType(const VarNode* buf_var, DLDataType t);
   /*!
    * \brief Register a typed pointer produced by explicit pointer-offset intrinsics.
    *
@@ -310,7 +312,7 @@ class CodeGenC : public ExprFunctor<void(const PrimExpr&, std::ostream&)>,
    */
   void RegisterHandleTypeFromPointer(const tirx::Var& var, const PrimExpr* value);
   // override
-  void PrintSSAAssign(const std::string& target, const std::string& src, DataType t) override;
+  void PrintSSAAssign(const std::string& target, const std::string& src, PrimType t) override;
   /*! \brief reserves common C keywords */
   void ReserveKeywordsAsUnique();
 
@@ -324,7 +326,7 @@ class CodeGenC : public ExprFunctor<void(const PrimExpr&, std::ostream&)>,
   /*! \brief the storage scope of allocation */
   std::unordered_map<const VarNode*, std::string> alloc_storage_scope_;
   /*! \brief the data type of allocated buffers */
-  std::unordered_map<const VarNode*, DataType> handle_data_type_;
+  std::unordered_map<const VarNode*, DLDataType> handle_data_type_;
   /*! \brief Handle vars whose address_of(buffer[index]) should print as ptr + index. */
   std::unordered_set<const VarNode*> pointer_offset_vars_;
   /*! \brief Record of ops that have pre-defined global symbol. */
diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc
index 1319ae4a4b57..0071dc15c7e3 100644
--- a/src/target/source/codegen_c_host.cc
+++ b/src/target/source/codegen_c_host.cc
@@ -120,24 +120,24 @@ void CodeGenCHost::PrintFuncPrefix(std::ostream& os) {  // NOLINT(*)
      << "TVM_DLL ";
 }
 
-void CodeGenCHost::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
-  int lanes = t.lanes();
-  if (t.is_handle()) {
+void CodeGenCHost::PrintType(DLDataType t, std::ostream& os) {  // NOLINT(*)
+  int lanes = static_cast<int16_t>(t.lanes);
+  if (t.code == kDLOpaqueHandle && !(t.bits == 0 && lanes == 0)) {
     TVM_FFI_ICHECK_EQ(lanes, 1) << "does not support vector types";
     os << "void*";
     return;
   }
-  if (t.is_void()) {
+  if (t.code == kDLOpaqueHandle && t.bits == 0 && lanes == 0) {
     os << "void";
     return;
   }
-  if (t == DataType::Bool()) {
+  if (t == DLDataType{kDLBool, 1, 1}) {
     os << "bool";
     return;
   }
   bool fail = false;
-  if (t.is_float()) {
-    switch (t.bits()) {
+  if (t.code == kDLFloat) {
+    switch (t.bits) {
       case 16:
         os << "half";
         break;
@@ -156,11 +156,11 @@ void CodeGenCHost::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       os << lanes;
       return;
     }
-  } else if (t.is_uint() || t.is_int()) {
-    if (t.is_uint()) {
+  } else if (t.code == kDLUInt || t.code == kDLInt) {
+    if (t.code == kDLUInt) {
       os << 'u';
     }
-    switch (t.bits()) {
+    switch (t.bits) {
       case 8:
         os << "int8_t";
         break;
@@ -191,9 +191,9 @@ void CodeGenCHost::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
 
 void CodeGenCHost::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NOLINT(*)
   std::string v = PrintExpr(op->value);
-  int lanes = op->dtype.lanes();
+  int lanes = static_cast<int16_t>(op->ty()->dtype.lanes);
   os << "((";
-  PrintType(op->dtype, os);
+  PrintType(op->ty()->dtype, os);
   os << ")(";
   for (int i = 0; i < lanes; ++i) {
     if (i != 0) os << ", ";
@@ -356,10 +356,10 @@ inline void CodeGenCHost::PrintTernaryCondExpr(const T* op, const char* compare,
                                                std::ostream& os) {  // NOLINT(*)
   std::ostringstream temp_a;
   VisitExpr(op->a, temp_a);
-  std::string a_id = SSAGetID(temp_a.str(), op->a.dtype());
+  std::string a_id = SSAGetID(temp_a.str(), op->a.ty()->dtype);
   std::ostringstream temp_b;
   VisitExpr(op->b, temp_b);
-  std::string b_id = SSAGetID(temp_b.str(), op->b.dtype());
+  std::string b_id = SSAGetID(temp_b.str(), op->b.ty()->dtype);
 
   os << "((" << a_id << ") " << compare << " (" << b_id << ") "
      << "? (" << a_id << ") : (" << b_id << "))";
diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h
index edeebe7da1cc..a384dc957a1e 100644
--- a/src/target/source/codegen_c_host.h
+++ b/src/target/source/codegen_c_host.h
@@ -57,8 +57,8 @@ class CodeGenCHost : public CodeGenC {
   void DefineModuleName();
 
   using CodeGenC::PrintType;
-  void PrintType(DataType t, std::ostream& os) final;  // NOLINT(*)
-  void PrintFuncPrefix(std::ostream& os) final;        // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;  // NOLINT(*)
+  void PrintFuncPrefix(std::ostream& os) final;          // NOLINT(*)
 
   // overload visitor functions
   void VisitExpr_(const BroadcastNode* op, std::ostream& os) final;  // NOLINT(*)
diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc
index ae915f278f57..6f4cd1a12094 100644
--- a/src/target/source/codegen_params.cc
+++ b/src/target/source/codegen_params.cc
@@ -163,8 +163,8 @@ void PrintFloatingPointArray(void* data, size_t num_elements, int indent_chars,
 void TensorDataToC(::tvm::runtime::Tensor arr, int indent_chars, std::ostream& os,
                    const std::string& eol) {
   auto arr_type = arr.DataType();
-  TVM_FFI_ICHECK_EQ(arr_type.lanes(), 1)
-      << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes();
+  TVM_FFI_ICHECK_EQ(arr_type.lanes, 1)
+      << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes;
 
   auto shape = arr.Shape();
   int num_elements = 1;
@@ -176,72 +176,73 @@ void TensorDataToC(::tvm::runtime::Tensor arr, int indent_chars, std::ostream& o
   os.setf(std::ios::internal | std::ios::hex,
           std::ios::adjustfield | std::ios::basefield | std::ios::showbase);
   os.fill('0');
-  switch (arr_type.code()) {
-    case runtime::DataType::kInt:
-      TVM_FFI_ICHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
-                     arr_type.bits() == 64)
+  switch (static_cast<DLDataTypeCode>(arr_type.code)) {
+    case DLDataTypeCode::kDLInt:
+      TVM_FFI_ICHECK(arr_type.bits == 8 || arr_type.bits == 16 || arr_type.bits == 32 ||
+                     arr_type.bits == 64)
           << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
-          << arr_type.bits() << "-bit array";
-      if (arr_type.bits() == 8) {
+          << static_cast<int>(arr_type.bits) << "-bit array";  // uint8_t would stream as a char
+      if (arr_type.bits == 8) {
         PrintIntegralArray<int8_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 16) {
+      } else if (arr_type.bits == 16) {
         PrintIntegralArray<int16_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 32) {
+      } else if (arr_type.bits == 32) {
         PrintIntegralArray<int32_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 64) {
+      } else if (arr_type.bits == 64) {
         PrintIntegralArray<int64_t>(arr->data, num_elements, indent_chars, os, eol);
       } else {
         TVM_FFI_ICHECK(false) << "should not get here";
       }
       break;
 
-    case runtime::DataType::TypeCode::kUInt:
-      TVM_FFI_ICHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
-                     arr_type.bits() == 64)
+    case DLDataTypeCode::kDLUInt:
+      TVM_FFI_ICHECK(arr_type.bits == 8 || arr_type.bits == 16 || arr_type.bits == 32 ||
+                     arr_type.bits == 64)
           << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
-          << arr_type.bits() << "-bit array";
+          << static_cast<int>(arr_type.bits) << "-bit array";  // uint8_t would stream as a char
 
-      if (arr_type.bits() == 8) {
+      if (arr_type.bits == 8) {
         PrintIntegralArray<uint8_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 16) {
+      } else if (arr_type.bits == 16) {
         PrintIntegralArray<uint16_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 32) {
+      } else if (arr_type.bits == 32) {
         PrintIntegralArray<uint32_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 64) {
+      } else if (arr_type.bits == 64) {
         PrintIntegralArray<uint64_t>(arr->data, num_elements, indent_chars, os, eol);
       } else {
         TVM_FFI_ICHECK(false) << "should not get here";
       }
       break;
 
-    case runtime::DataType::TypeCode::kFloat: {
+    case DLDataTypeCode::kDLFloat: {
       os.fill(' ');
       os.setf(std::ios::left, std::ios::adjustfield);
-      if (arr_type.bits() == 16) {
+      if (arr_type.bits == 16) {
         // NOTE: print types not widely supported by C as uint16_t.
         PrintIntegralArray<uint16_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 32) {
+      } else if (arr_type.bits == 32) {
         PrintFloatingPointArray<float>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 64) {
+      } else if (arr_type.bits == 64) {
         PrintFloatingPointArray<double>(arr->data, num_elements, indent_chars, os, eol);
       } else {
         TVM_FFI_ICHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw "
-                              << arr_type.bits() << "-bit array";
+                              << static_cast<int>(arr_type.bits) << "-bit array";  // not a char
       }
       break;
     }
 
-    case runtime::DataType::TypeCode::kBFloat: {
+    case DLDataTypeCode::kDLBfloat: {
       // NOTE: print types not widely supported by C as uint16_t.
-      TVM_FFI_ICHECK(arr_type.bits() == 16)
-          << "CodegenParams: only support generating 16-bit bfloat params; saw " << arr_type.bits()
-          << "-bit array";
+      TVM_FFI_ICHECK(arr_type.bits == 16)
+          << "CodegenParams: only support generating 16-bit bfloat params; saw "
+          << static_cast<int>(arr_type.bits) << "-bit array";  // uint8_t would stream as a char
       PrintIntegralArray<uint16_t>(arr->data, num_elements, indent_chars, os, eol);
       break;
     }
 
     default:
-      TVM_FFI_ICHECK(false) << "Data type '" << arr_type << "' not supported";
+      TVM_FFI_ICHECK(false) << "Data type '" << ffi::DLDataTypeToString(arr_type)
+                            << "' not supported";
   }
 
   os.flags(old_fmtflags);
diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc
index 2646a6597ef4..6318fbe514e2 100644
--- a/src/target/source/codegen_source_base.cc
+++ b/src/target/source/codegen_source_base.cc
@@ -34,7 +34,7 @@ void CodeGenSourceBase::ClearFuncState() {
   scope_mark_.clear();
 }
 
-std::string CodeGenSourceBase::SSAGetID(std::string src, DataType t) {
+std::string CodeGenSourceBase::SSAGetID(std::string src, PrimType t) {
   if (name_supply_->ContainsName(src)) return src;
   auto it = ssa_assign_map_.find(src);
   if (it != ssa_assign_map_.end()) {
@@ -99,50 +99,51 @@ void CodeGenSourceBase::EndScope(int scope_id) {
   indent_ -= 2;
 }
 
-void CodeGenSourceBase::PrintType(DataType type, std::ostream& os) {  // NOLINT(*)
-  TVM_FFI_ICHECK_EQ(type.lanes(), 1) << "do not yet support vector types";
-  if (type.is_handle()) {
+void CodeGenSourceBase::PrintType(DLDataType type, std::ostream& os) {  // NOLINT(*)
+  int lanes = static_cast<int16_t>(type.lanes);
+  TVM_FFI_ICHECK_EQ(lanes, 1) << "do not yet support vector types";
+  if (type.code == kDLOpaqueHandle && !(type.bits == 0 && lanes == 0)) {
     os << "void*";
     return;
   }
-  if (type.is_void()) {
+  if (type.code == kDLOpaqueHandle && type.bits == 0 && lanes == 0) {
     os << "void";
     return;
   }
   // default c may be have bool type, can be handled in subclass
-  if (type.is_bool()) {
+  if (type.code == kDLBool) {
     os << "int";
     return;
   }
-  if (type.is_float()) {
-    if (type.bits() == 32) {
+  if (type.code == kDLFloat) {
+    if (type.bits == 32) {
       os << "float";
       return;
     }
-    if (type.bits() == 64) {
+    if (type.bits == 64) {
       os << "double";
       return;
     }
-  } else if (type.is_uint()) {
-    switch (type.bits()) {
+  } else if (type.code == kDLUInt) {
+    switch (type.bits) {
       case 8:
       case 16:
       case 32:
       case 64: {
-        os << "uint" << type.bits() << "_t";
+        os << "uint" << static_cast<int>(type.bits) << "_t";
         return;
       }
       case 1:
         os << "int";
         return;
     }
-  } else if (type.is_int()) {
-    switch (type.bits()) {
+  } else if (type.code == kDLInt) {
+    switch (type.bits) {
       case 8:
       case 16:
       case 32:
       case 64: {
-        os << "int" << type.bits() << "_t";
+        os << "int" << static_cast<int>(type.bits) << "_t";
         return;
       }
     }
diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h
index f6e58cc9efba..d869a811fe5e 100644
--- a/src/target/source/codegen_source_base.h
+++ b/src/target/source/codegen_source_base.h
@@ -58,7 +58,7 @@ class CodeGenSourceBase {
    * \param t The type representation.
    * \param os The stream to print the ctype into
    */
-  virtual void PrintType(DataType type, std::ostream& os);  // NOLINT(*)
+  virtual void PrintType(DLDataType type, std::ostream& os);  // NOLINT(*)
   /*!
    * Print Type representation of type type.
    * \param type The type representation.
@@ -96,7 +96,10 @@ class CodeGenSourceBase {
    * \param src The source expression
    * \param t The type of the expression.
    */
-  std::string SSAGetID(std::string src, DataType t);
+  std::string SSAGetID(std::string src, PrimType t);
+  std::string SSAGetID(std::string src, DLDataType t) {
+    return SSAGetID(std::move(src), PrimType(t));  // delegate to the PrimType overload
+  }
   /*!
    * \brief mark the beginning of a new scope
    * \return The scope id.
@@ -113,7 +116,7 @@ class CodeGenSourceBase {
    * \param src The source expression.
    * \param t The type of target.
    */
-  virtual void PrintSSAAssign(const std::string& target, const std::string& src, DataType t) = 0;
+  virtual void PrintSSAAssign(const std::string& target, const std::string& src, PrimType t) = 0;
 
   /*! \brief the declaration stream */
   std::ostringstream decl_stream;
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index 57b82491c03d..972c85c3806e 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -198,7 +198,7 @@ class ConcreteCodegenSourceBase : public CodeGenSourceBase {
   /*!
    * \brief Do nothing as this class exist to get access to methods of CodeGenSourceBase
    */
-  void PrintSSAAssign(const std::string& target, const std::string& src, DataType t) final {
+  void PrintSSAAssign(const std::string& target, const std::string& src, PrimType t) final {
     return;
   }
 };
diff --git a/src/te/operation/compute_op.cc b/src/te/operation/compute_op.cc
index 0fa57d0f4617..a407fafa18fa 100644
--- a/src/te/operation/compute_op.cc
+++ b/src/te/operation/compute_op.cc
@@ -75,9 +75,9 @@ static inline void AssertReduceEqual(const tirx::ReduceNode* a, const tirx::Redu
 
 int ComputeOpNode::num_outputs() const { return body.size(); }
 
-DataType ComputeOpNode::output_dtype(size_t idx) const {
+PrimType ComputeOpNode::output_dtype(size_t idx) const {
   TVM_FFI_ICHECK_LT(idx, num_outputs());
-  return body[idx].dtype();
+  return body[idx].ty();
 }
 
 ffi::Array<PrimExpr> BaseComputeOpNode::output_shape(size_t idx) const {
@@ -100,8 +100,8 @@ Tensor compute(ffi::Array<PrimExpr> shape, FCompute fcompute, std::string name,
   for (size_t i = 0; i < ndim; ++i) {
     std::ostringstream os;
     os << "ax" << i;
-    axis.emplace_back(IterVar(Range(IntImm(shape[i]->dtype, 0), shape[i]),
-                              Var(os.str(), shape[i].dtype()), kDataPar));
+    axis.emplace_back(
+        IterVar(Range(IntImm(shape[i].ty(), 0), shape[i]), Var(os.str(), shape[i].ty()), kDataPar));
     args.push_back(axis.back()->var);
   }
 
@@ -117,8 +117,8 @@ ffi::Array<Tensor> compute(ffi::Array<PrimExpr> shape, FBatchCompute fcompute, s
   for (size_t i = 0; i < ndim; ++i) {
     std::ostringstream os;
     os << "ax" << i;
-    axis.emplace_back(IterVar(Range(IntImm(shape[i]->dtype, 0), shape[i]),
-                              Var(os.str(), shape[i].dtype()), kDataPar));
+    axis.emplace_back(
+        IterVar(Range(IntImm(shape[i].ty(), 0), shape[i]), Var(os.str(), shape[i].ty()), kDataPar));
     args.push_back(axis.back()->var);
   }
 
diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index 79ba7d8a3918..6127336296e7 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -387,7 +387,7 @@ Stmt GenerateBodyStmt(const ffi::Array<PrimExpr>& indices, const ffi::Array<Buff
       const PrimExpr& right = analyzer->Simplify(f_transform_and_remap(reduce->source[i]));
       lhs.push_back(left);
       rhs.push_back(right);
-      TVM_FFI_ICHECK_EQ(left->dtype, right->dtype);
+      TVM_FFI_ICHECK_EQ(left.ty()->dtype, right.ty()->dtype);
     }
 
     ffi::Array<Var> temp_vars;
@@ -404,7 +404,7 @@ Stmt GenerateBodyStmt(const ffi::Array<PrimExpr>& indices, const ffi::Array<Buff
       const Buffer& buffer = buffers[i];
       PrimExpr value{nullptr};
       if (n_buffers > 1) {
-        temp_vars.push_back(Var("v_" + buffer->name, PrimType(lhs[i].dtype())));
+        temp_vars.push_back(Var("v_" + buffer->name, lhs[i].ty()));
         value = temp_vars.back();
       } else {
         PrimExpr combined = reduce->combiner.get()->operator()(lhs, rhs)[i];
@@ -493,8 +493,8 @@ Stmt GenerateStmtFromCompute(const te::ComputeOp& compute_op, CreateFuncInfo* in
     NestedScopeInfo cur_scope;
     for (size_t j = 0; j < axes.size(); ++j) {
       const IterVar& axis = axes[j];
-      DataType index_type =
-          DataType::Int(std::max(axis->dom->min.dtype().bits(), axis->dom->extent.dtype().bits()));
+      PrimType index_type =
+          PrimType::Int(std::max(axis->dom->min.ty().bits(), axis->dom->extent.ty().bits()));
       bool first_times_define =
           std::find(axes_levels[i].begin(), axes_levels[i].end(), axis) != axes_levels[i].end();
       if (first_times_define) {
@@ -524,7 +524,7 @@ Stmt GenerateStmtFromCompute(const te::ComputeOp& compute_op, CreateFuncInfo* in
     }
     if (i == axes_levels.size() - 1 && cur_scope.block_iters.empty()) {
       // for the leaf scope, we ensure at least one block var exists
-      IterVar dummy(Range::FromMinExtent(0, 1), Var("vi", DataType::Int(32)),
+      IterVar dummy(Range::FromMinExtent(0, 1), Var("vi", PrimType::Int(32)),
                     IterVarType::kDataPar);
       cur_scope.AddBlockIter(std::nullopt, dummy, 0);
     }
@@ -740,8 +740,9 @@ PrimFunc GenerateAndCompletePrimFunc(const ffi::Array<te::Tensor>& arg_list,
                                      const ffi::Array<Stmt>& root_stmts, CreateFuncInfo* info) {
   ffi::Array<Var> parameters;
   ffi::Map<Var, Buffer> buffer_map;
+  PrimType handle_ty = PrimType::Handle();
   for (const te::Tensor& tensor : arg_list) {
-    Var arg("var_" + tensor->GetNameHint(), PrimType(DataType::Handle()));
+    Var arg("var_" + tensor->GetNameHint(), handle_ty);
     parameters.push_back(arg);
     auto it = info->tensor2buffers.find(tensor);
     TVM_FFI_ICHECK(it != info->tensor2buffers.end());
@@ -760,7 +761,7 @@ PrimFunc GenerateAndCompletePrimFunc(const ffi::Array<te::Tensor>& arg_list,
 }
 
 PrimFunc CreatePrimFunc(const ffi::Array<te::Tensor>& arg_list,
-                        std::optional<DataType> index_dtype_override) {
+                        std::optional<PrimType> index_dtype_override) {
   // Information used in CreatePrimFunc and its sub-functions.
   CreateFuncInfo info(arg_list);
   // Root body stmts.
@@ -792,10 +793,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def_packed("te.CreatePrimFunc", [](ffi::PackedArgs args, ffi::Any* ret) {
     ffi::Array<ffi::ObjectRef> arg_list = args[0].cast<ffi::Array<ffi::ObjectRef>>();
-    std::optional<DataType> index_dtype_override{std::nullopt};
+    std::optional<PrimType> index_dtype_override{std::nullopt};
     // Add conversion to make std::optional compatible with FFI.
     if (args[1] != nullptr) {
-      index_dtype_override = args[1].cast<DataType>();
+      index_dtype_override = args[1].cast<PrimType>();
     }
     *ret = CreatePrimFunc(arg_list, index_dtype_override);
   });
@@ -806,10 +807,11 @@ PrimFunc GenerateAndCompletePrimFunc(const ffi::Array<ffi::ObjectRef>& arg_tir_v
                                      const ffi::Array<Stmt>& root_stmts, CreateFuncInfo* info) {
   ffi::Array<Var> parameters;
   ffi::Map<Var, Buffer> buffer_map;
+  PrimType handle_ty = PrimType::Handle();
   for (const ffi::ObjectRef& arg : arg_tir_var_list) {
     if (auto opt_tensor = arg.as<te::Tensor>()) {
       te::Tensor tensor = opt_tensor.value();
-      Var arg("var_" + tensor->GetNameHint(), PrimType(DataType::Handle()));
+      Var arg("var_" + tensor->GetNameHint(), handle_ty);
       parameters.push_back(arg);
       auto it = info->tensor2buffers.find(tensor);
       TVM_FFI_ICHECK(it != info->tensor2buffers.end());
@@ -831,7 +833,7 @@ PrimFunc GenerateAndCompletePrimFunc(const ffi::Array<ffi::ObjectRef>& arg_tir_v
 }
 
 PrimFunc CreatePrimFunc(const ffi::Array<ffi::ObjectRef>& arg_list,
-                        std::optional<DataType> index_dtype_override) {
+                        std::optional<PrimType> index_dtype_override) {
   ffi::Array<te::Tensor> tensor_arg_list;
   for (const ffi::ObjectRef& x : arg_list) {
     if (auto tensor_node = x.as<te::TensorNode>()) {
diff --git a/src/te/operation/create_primfunc.h b/src/te/operation/create_primfunc.h
index 107a22d33fe5..9b17dd135bda 100644
--- a/src/te/operation/create_primfunc.h
+++ b/src/te/operation/create_primfunc.h
@@ -31,11 +31,11 @@ namespace tirx {
 
 /*! \brief Use Tensor Expression to create a schedulable TensorIR func. */
 PrimFunc CreatePrimFunc(const ffi::Array<te::Tensor>& arg_list,
-                        std::optional<DataType> index_dtype_override = std::nullopt);
+                        std::optional<PrimType> index_dtype_override = std::nullopt);
 
 /*! \brief Use Tensor Expression to create a schedulable TensorIR func. */
 PrimFunc CreatePrimFunc(const ffi::Array<ffi::ObjectRef>& arg_list,
-                        std::optional<DataType> index_dtype_override);
+                        std::optional<PrimType> index_dtype_override);
 
 }  // namespace tirx
 }  // namespace tvm
diff --git a/src/te/operation/extern_op.cc b/src/te/operation/extern_op.cc
index b6b7c17691b9..6fbaf4482b5c 100644
--- a/src/te/operation/extern_op.cc
+++ b/src/te/operation/extern_op.cc
@@ -37,7 +37,9 @@ TVM_FFI_STATIC_INIT_BLOCK() { ExternOpNode::RegisterReflection(); }
 
 int ExternOpNode::num_outputs() const { return static_cast<int>(output_placeholders.size()); }
 
-DataType ExternOpNode::output_dtype(size_t i) const { return output_placeholders[i]->dtype; }
+PrimType ExternOpNode::output_dtype(size_t i) const {
+  return output_placeholders[i]->ElementType();
+}
 
 ffi::Array<PrimExpr> ExternOpNode::output_shape(size_t i) const {
   return output_placeholders[i]->shape;
diff --git a/src/te/operation/placeholder_op.cc b/src/te/operation/placeholder_op.cc
index 17f4791d7615..36e4629ef6fe 100644
--- a/src/te/operation/placeholder_op.cc
+++ b/src/te/operation/placeholder_op.cc
@@ -35,7 +35,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { PlaceholderOpNode::RegisterReflection(); }
 
 int PlaceholderOpNode::num_outputs() const { return 1; }
 
-DataType PlaceholderOpNode::output_dtype(size_t i) const {
+PrimType PlaceholderOpNode::output_dtype(size_t i) const {  // only output 0 is valid
   TVM_FFI_ICHECK_EQ(i, 0U);
   return dtype;
 }
@@ -45,7 +45,7 @@ ffi::Array<PrimExpr> PlaceholderOpNode::output_shape(size_t i) const {
   return shape;
 }
 
-PlaceholderOp::PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, DataType dtype) {
+PlaceholderOp::PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, PrimType dtype) {
   auto n = ffi::make_object<PlaceholderOpNode>();
   n->name = name;
   n->shape = shape;
@@ -53,14 +53,14 @@ PlaceholderOp::PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, DataT
   data_ = std::move(n);
 }
 
-Tensor placeholder(ffi::Array<PrimExpr> shape, DataType dtype, std::string name) {
+Tensor placeholder(ffi::Array<PrimExpr> shape, PrimType dtype, std::string name) {
   return PlaceholderOp(name, shape, dtype).output(0);
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def("te.Placeholder", [](ffi::Variant<PrimExpr, ffi::Array<PrimExpr>> shape_arg,
-                                             DataType dtype, std::string name) {
+                                             DLDataType dtype, std::string name) {
     auto shape = [&]() -> ffi::Array<PrimExpr> {
       if (auto arg_expr = shape_arg.as<PrimExpr>()) {
         return {arg_expr.value()};
@@ -70,7 +70,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         TVM_FFI_THROW(InternalError) << "Variant did not contain either allowed type";
       }
     }();
-    return placeholder(shape, dtype, name);
+    return placeholder(shape, PrimType(dtype), name);
   });
 }
 
diff --git a/src/te/operation/scan_op.cc b/src/te/operation/scan_op.cc
index 5e8d4361ec85..96232b293b18 100644
--- a/src/te/operation/scan_op.cc
+++ b/src/te/operation/scan_op.cc
@@ -36,7 +36,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { ScanOpNode::RegisterReflection(); }
 
 int ScanOpNode::num_outputs() const { return static_cast<int>(update.size()); }
 
-DataType ScanOpNode::output_dtype(size_t i) const { return update[i]->dtype; }
+PrimType ScanOpNode::output_dtype(size_t i) const { return update[i]->GetDataType(); }
 
 ffi::Array<PrimExpr> ScanOpNode::output_shape(size_t i) const {
   TVM_FFI_ICHECK_LT(i, state_placeholder.size());
diff --git a/src/te/tensor.cc b/src/te/tensor.cc
index e05f91cad049..d50349f6b508 100644
--- a/src/te/tensor.cc
+++ b/src/te/tensor.cc
@@ -41,15 +41,15 @@ void TensorNode::RegisterReflection() {
 TVM_FFI_STATIC_INIT_BLOCK() { TensorNode::RegisterReflection(); }
 
 IterVar thread_axis(Range dom, std::string tag) {
-  return IterVar(dom, Var(tag, dom.defined() ? dom->extent.dtype() : DataType::Int(32)),
-                 kThreadIndex, tag);
+  return IterVar(dom, Var(tag, dom.defined() ? dom->extent.ty() : PrimType::Int(32)), kThreadIndex,
+                 tag);  // Var type is dom->extent's type, or Int(32) when dom is undefined
 }
 
 IterVar reduce_axis(Range dom, std::string name) {
-  return IterVar(dom, Var(name, dom->extent.dtype()), kCommReduce);
+  return IterVar(dom, Var(name, dom->extent.ty()), kCommReduce);  // dom is dereferenced unchecked
 }
 
-Var var(std::string name_hint, DataType t) { return Var(name_hint, t); }
+Var var(std::string name_hint, PrimType t) { return Var(name_hint, t); }
 
 // Tensor
 inline PrimExpr Tensor::IndexTensor(ffi::Array<PrimExpr> indices,
@@ -65,7 +65,7 @@ inline PrimExpr Tensor::IndexTensor(ffi::Array<PrimExpr> indices,
   if (support_negative_indices) {
     for (size_t i = 0; i < shape.size(); i++) {
       PrimExpr new_index =
-          Select(indices[i] < IntImm(indices[i]->dtype, 0), indices[i] + shape[i], indices[i]);
+          Select(indices[i] < IntImm(indices[i].ty(), 0), indices[i] + shape[i], indices[i]);
       indices.Set(i, new_index);
     }
   }
@@ -105,7 +105,7 @@ Tensor Operation::output(size_t i) const {
   return Tensor(node);
 }
 
-Tensor::Tensor(ffi::Array<PrimExpr> shape, DataType dtype, Operation op, int value_index) {
+Tensor::Tensor(ffi::Array<PrimExpr> shape, PrimType dtype, Operation op, int value_index) {
   auto n = ffi::make_object<TensorNode>();
   n->shape = std::move(shape);
   n->dtype = dtype;
@@ -117,8 +117,8 @@ Tensor::Tensor(ffi::Array<PrimExpr> shape, DataType dtype, Operation op, int val
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def(
-      "te.Tensor", [](ffi::Array<PrimExpr> shape, DataType dtype, Operation op, int value_index) {
-        return Tensor(shape, dtype, op, value_index);
+      "te.Tensor", [](ffi::Array<PrimExpr> shape, DLDataType dtype, Operation op, int value_index) {
+        return Tensor(shape, PrimType(dtype), op, value_index);  // DLDataType -> PrimType
       });
 }
 
@@ -129,6 +129,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef()
       .def_method("te.TensorEqual", &Tensor::operator==)
+      .def("te.TensorDType", [](Tensor tensor) -> DLDataType { return tensor->dtype->dtype; })
       .def("te.TensorHash",
            [](Tensor tensor) -> int64_t {
              return static_cast<int64_t>(std::hash<Tensor>()(tensor));
diff --git a/src/tirx/analysis/deep_equal.cc b/src/tirx/analysis/deep_equal.cc
index 53700a85a94a..dbf2e53c561d 100644
--- a/src/tirx/analysis/deep_equal.cc
+++ b/src/tirx/analysis/deep_equal.cc
@@ -30,17 +30,25 @@
 namespace tvm {
 namespace tirx {
 
-#define DEFINE_DEEP_EQUAL_BIN_EXPR(OpNode)                              \
-  bool VisitExpr_(const OpNode* plhs, const PrimExpr& rhs) final {      \
-    const auto* prhs = rhs.as<OpNode>();                                \
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->a, prhs->a) && \
-           VisitExpr(plhs->b, prhs->b);                                 \
+namespace {
+
+template <typename LHS, typename RHS>  // Compares the ty() of two nodes.
+TVM_FFI_INLINE bool SameType(const LHS* lhs, const RHS* rhs) {
+  return lhs->ty() == rhs->ty();  // neither pointer is null-checked
+}
+
+}  // namespace
+
+#define DEFINE_DEEP_EQUAL_BIN_EXPR(OpNode)                                                     \
+  bool VisitExpr_(const OpNode* plhs, const PrimExpr& rhs) final {                             \
+    const auto* prhs = rhs.as<OpNode>();                                                       \
+    return SameType(plhs, prhs) && VisitExpr(plhs->a, prhs->a) && VisitExpr(plhs->b, prhs->b); \
   }
 
-#define DEFINE_DEEP_EQUAL_IMM_EXPR(OpNode)                           \
-  bool VisitExpr_(const OpNode* plhs, const PrimExpr& rhs) final {   \
-    const auto* prhs = rhs.as<OpNode>();                             \
-    return plhs->dtype == prhs->dtype && plhs->value == prhs->value; \
+#define DEFINE_DEEP_EQUAL_IMM_EXPR(OpNode)                         \
+  bool VisitExpr_(const OpNode* plhs, const PrimExpr& rhs) final { \
+    const auto* prhs = rhs.as<OpNode>();                           \
+    return SameType(plhs, prhs) && plhs->value == prhs->value;     \
   }
 
 class ExprDeepEqualChecker : private ExprFunctor<bool(const PrimExpr&, const PrimExpr&)> {
@@ -53,7 +61,7 @@ class ExprDeepEqualChecker : private ExprFunctor<bool(const PrimExpr&, const Pri
     if (lhs->type_index() != rhs->type_index()) return false;
     if (auto* plhs = lhs.as<IntImmNode>()) {
       auto* prhs = rhs.as<IntImmNode>();
-      return plhs->dtype == prhs->dtype && plhs->value == prhs->value;
+      return SameType(plhs, prhs) && plhs->value == prhs->value;
     }
     return ExprDeepEqualChecker().VisitExpr(lhs, rhs);
   }
@@ -104,7 +112,7 @@ class ExprDeepEqualChecker : private ExprFunctor<bool(const PrimExpr&, const Pri
   bool VisitExpr_(const BufferLoadNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<BufferLoadNode>();
     // we run pointer comparison of the buffer
-    return plhs->dtype == prhs->dtype && plhs->buffer.same_as(prhs->buffer) &&
+    return SameType(plhs, prhs) && plhs->buffer.same_as(prhs->buffer) &&
            ArrayDeepEqual(plhs->indices, prhs->indices) &&
            OptionalDeepEqual(plhs->predicate, prhs->predicate);
   }
@@ -112,26 +120,26 @@ class ExprDeepEqualChecker : private ExprFunctor<bool(const PrimExpr&, const Pri
   bool VisitExpr_(const ProducerLoadNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<ProducerLoadNode>();
     // run shallow pointer comparison of the producer
-    return plhs->dtype == prhs->dtype && plhs->producer.same_as(prhs->producer) &&
+    return SameType(plhs, prhs) && plhs->producer.same_as(prhs->producer) &&
            ArrayDeepEqual(plhs->indices, prhs->indices);
   }
 
   bool VisitExpr_(const LetNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<LetNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->var, prhs->var) &&
+    return SameType(plhs, prhs) && VisitExpr(plhs->var, prhs->var) &&
            VisitExpr(plhs->value, prhs->value) && VisitExpr(plhs->body, prhs->body);
   }
 
   bool VisitExpr_(const CallNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<CallNode>();
-    return plhs->dtype == prhs->dtype && plhs->op.same_as(prhs->op) &&
+    return SameType(plhs, prhs) && plhs->op.same_as(prhs->op) &&
            ArrayDeepEqual(plhs->args, prhs->args) &&
            ffi::StructuralEqual()(plhs->attrs, prhs->attrs);
   }
 
   bool VisitExpr_(const ReduceNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<ReduceNode>();
-    return plhs->dtype == prhs->dtype && plhs->combiner.same_as(prhs->combiner) &&
+    return SameType(plhs, prhs) && plhs->combiner.same_as(prhs->combiner) &&
            ArrayDeepEqual(plhs->source, prhs->source) && ArrayDeepEqual(plhs->init, prhs->init) &&
            ArrayDeepEqual(plhs->axis, prhs->axis) && VisitExpr(plhs->condition, prhs->condition) &&
            plhs->value_index == prhs->value_index;
@@ -139,36 +147,36 @@ class ExprDeepEqualChecker : private ExprFunctor<bool(const PrimExpr&, const Pri
 
   bool VisitExpr_(const CastNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<CastNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->value, prhs->value);
+    return SameType(plhs, prhs) && VisitExpr(plhs->value, prhs->value);
   }
 
   bool VisitExpr_(const NotNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<NotNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->a, prhs->a);
+    return SameType(plhs, prhs) && VisitExpr(plhs->a, prhs->a);
   }
 
   bool VisitExpr_(const SelectNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<SelectNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->condition, prhs->condition) &&
+    return SameType(plhs, prhs) && VisitExpr(plhs->condition, prhs->condition) &&
            VisitExpr(plhs->true_value, prhs->true_value) &&
            VisitExpr(plhs->false_value, prhs->false_value);
   }
 
   bool VisitExpr_(const RampNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<RampNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->base, prhs->base) &&
+    return SameType(plhs, prhs) && VisitExpr(plhs->base, prhs->base) &&
            VisitExpr(plhs->stride, prhs->stride) && VisitExpr(plhs->lanes, prhs->lanes);
   }
 
   bool VisitExpr_(const ShuffleNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<ShuffleNode>();
-    return plhs->dtype == prhs->dtype && ArrayDeepEqual(plhs->vectors, prhs->vectors) &&
+    return SameType(plhs, prhs) && ArrayDeepEqual(plhs->vectors, prhs->vectors) &&
            ArrayDeepEqual(plhs->indices, prhs->indices);
   }
 
   bool VisitExpr_(const BroadcastNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<BroadcastNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->value, prhs->value) &&
+    return SameType(plhs, prhs) && VisitExpr(plhs->value, prhs->value) &&
            VisitExpr(plhs->lanes, prhs->lanes);
   }
 
diff --git a/src/tirx/ir/buffer.cc b/src/tirx/ir/buffer.cc
index af3a75e5fc28..66c66149d500 100644
--- a/src/tirx/ir/buffer.cc
+++ b/src/tirx/ir/buffer.cc
@@ -51,10 +51,11 @@ ffi::Array<PrimExpr> SimplifyArray(arith::AnalyzerObj* ana, ffi::Array<PrimExpr>
   return array;
 }
 
-Buffer decl_buffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String name,
+Buffer decl_buffer(ffi::Array<PrimExpr> shape, DLDataType dtype, ffi::String name,
                    ffi::String storage_scope, ffi::Optional<ffi::Array<IntImm>> axis_separators,
                    Span span) {
-  DataType storage_dtype = (dtype == DataType::Bool() ? DataType::Int(8) : dtype);
+  DLDataType storage_dtype =
+      (dtype == DLDataType{kDLBool, 8, 1} ? DLDataType{kDLInt, 8, 1} : dtype);
   return Buffer(Var(name, PointerType(PrimType(storage_dtype), storage_scope), span), dtype, shape,
                 ffi::Array<PrimExpr>(), PrimExpr(), name, 0, 0, kDefault,
                 axis_separators.value_or(ffi::Array<IntImm>()), span, std::nullopt,
@@ -322,20 +323,20 @@ ffi::Array<PrimExpr> BufferNode::ElemOffset(ffi::Array<PrimExpr> input_indices,
 }
 
 inline ffi::Array<PrimExpr> BufferOffset(const BufferNode* n, ffi::Array<PrimExpr> index,
-                                         DataType dtype) {
+                                         PrimType dtype) {
   ffi::Array<PrimExpr> offsets = n->ElemOffset(index);
   // If the Buffer has element type with more than one lane, scale to
   // get the offset in number of scalars.
-  if (n->dtype.lanes() != 1) {
+  if (PrimType(n->dtype).lanes() != 1) {
     PrimExpr last_offset = offsets[offsets.size() - 1];
-    offsets.Set(offsets.size() - 1, last_offset * MakeConst(last_offset.dtype(), dtype.lanes()));
+    offsets.Set(offsets.size() - 1, last_offset * MakeConst(last_offset.ty(), dtype.lanes()));
   }
 
   // If the requested type has more than one lane, make a RampNode at
   // that offset.
   if (dtype.lanes() != 1) {
     PrimExpr last_offset = offsets[offsets.size() - 1];
-    PrimExpr stride = MakeConst(last_offset.dtype(), 1);
+    PrimExpr stride = MakeConst(last_offset.ty(), 1);
     offsets.Set(offsets.size() - 1, tirx::Ramp(last_offset, stride, dtype.lanes()));
   }
 
@@ -404,8 +405,7 @@ Buffer Buffer::GetFlattenedBuffer() const {
   // The axis_separators for the output buffer.
   ffi::Array<IntImm> output_axis_separators;
   for (size_t i = 0; i < self->axis_separators.size(); i++) {
-    auto dtype = self->axis_separators[i]->dtype;
-    output_axis_separators.push_back(IntImm(dtype, i + 1));
+    output_axis_separators.push_back(IntImm(self->axis_separators[i].ty(), i + 1));
   }
 
   if (output_shape.size() == self->shape.size() && self->strides.empty()) {
@@ -427,20 +427,26 @@ Buffer Buffer::GetFlattenedBuffer() const {
   }
 }
 
-PrimExpr Buffer::vload(ffi::Array<PrimExpr> begin, DataType value_dtype,
+PrimExpr Buffer::vload(ffi::Array<PrimExpr> begin, PrimType value_dtype,
                        ffi::Optional<PrimExpr> predicate) const {
-  // specially handle bool, stored as DataType::Int(8)
+  // Specially handle bool, stored as int8 in buffers.
   const BufferNode* n = operator->();
   TVM_FFI_ICHECK(n != nullptr);
-  TVM_FFI_ICHECK(value_dtype.element_of() == n->dtype.element_of() &&
-                 value_dtype.get_lanes_or_vscale_factor() % n->dtype.lanes() == 0)
+  PrimType buffer_dtype(n->dtype);
+  int value_lanes =
+      value_dtype.IsScalableVector() ? value_dtype.VScaleFactor() : value_dtype.lanes();
+  int buffer_lanes =
+      buffer_dtype.IsScalableVector() ? buffer_dtype.VScaleFactor() : buffer_dtype.lanes();
+  TVM_FFI_ICHECK(value_dtype.WithLanes(1)->dtype == buffer_dtype.WithLanes(1)->dtype &&
+                 value_lanes % buffer_lanes == 0)
       << "Cannot load " << value_dtype << " from buffer of " << n->dtype;
 
   ffi::Array<PrimExpr> indices = begin;
   PrimExpr base = indices[indices.size() - 1];
-  if (value_dtype.is_fixed_length_vector()) {
-    int factor = value_dtype.lanes() / n->dtype.lanes();
-    if (factor > 1 && base.dtype().is_scalar()) {
+  if (value_dtype.IsFixedLengthVector()) {
+    int factor = value_dtype.lanes() / buffer_dtype.lanes();
+    PrimType base_ty = base.ty();
+    if (factor > 1 && !base_ty.IsFixedLengthVector() && !base_ty.IsScalableVector()) {
       indices.Set(indices.size() - 1, Ramp(base, 1, factor));
     }
   }
@@ -449,19 +455,25 @@ PrimExpr Buffer::vload(ffi::Array<PrimExpr> begin, DataType value_dtype,
 
 Stmt Buffer::vstore(ffi::Array<PrimExpr> begin, PrimExpr value,
                     ffi::Optional<PrimExpr> predicate) const {
-  // specially handle bool, stored as DataType::Int(8)
+  // Specially handle bool, stored as int8 in buffers.
   const BufferNode* n = operator->();
   TVM_FFI_ICHECK(n != nullptr);
-  DataType value_dtype = value.dtype();
-  TVM_FFI_ICHECK(value_dtype.element_of() == n->dtype.element_of() &&
-                 value_dtype.get_lanes_or_vscale_factor() % n->dtype.lanes() == 0)
+  PrimType value_dtype = value.ty();
+  PrimType buffer_dtype(n->dtype);
+  int value_lanes =
+      value_dtype.IsScalableVector() ? value_dtype.VScaleFactor() : value_dtype.lanes();
+  int buffer_lanes =
+      buffer_dtype.IsScalableVector() ? buffer_dtype.VScaleFactor() : buffer_dtype.lanes();
+  TVM_FFI_ICHECK(value_dtype.WithLanes(1)->dtype == buffer_dtype.WithLanes(1)->dtype &&
+                 value_lanes % buffer_lanes == 0)
       << "Cannot store " << value_dtype << " to buffer of " << n->dtype;
 
   ffi::Array<PrimExpr> indices = begin;
   PrimExpr base = indices[indices.size() - 1];
-  if (value_dtype.is_fixed_length_vector()) {
-    int factor = value_dtype.lanes() / n->dtype.lanes();
-    if (factor > 1 && base.dtype().is_scalar()) {
+  if (value_dtype.IsFixedLengthVector()) {
+    int factor = value_dtype.lanes() / buffer_dtype.lanes();
+    PrimType base_ty = base.ty();
+    if (factor > 1 && !base_ty.IsFixedLengthVector() && !base_ty.IsScalableVector()) {
       indices.Set(indices.size() - 1, Ramp(base, 1, factor));
     }
   }
@@ -484,7 +496,7 @@ Buffer Buffer::MakeStrideView() const {
   const BufferNode* self = operator->();
   TVM_FFI_ICHECK(self != nullptr);
   auto n = ffi::make_object<BufferNode>(*self);
-  PrimExpr acc = MakeConst(n->DefaultIndexType(), 1);
+  PrimExpr acc = MakeConst(PrimType(n->DefaultIndexType()), 1);
   for (size_t i = n->shape.size(); i != 0; --i) {
     temp.push_back(acc);
     acc = acc * n->shape[i - 1];
@@ -537,14 +549,14 @@ Buffer Buffer::MakeSlice(ffi::Array<PrimExpr> begins, ffi::Array<PrimExpr> exten
   return slice;
 }
 
-PrimExpr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lanes, PrimExpr offset,
+PrimExpr Buffer::access_ptr(int access_mask, PrimType ptr_type, int content_lanes, PrimExpr offset,
                             ffi::Optional<PrimExpr> input_extent) const {
   const BufferNode* self = operator->();
   TVM_FFI_ICHECK(self != nullptr);
   PrimExpr e_dtype;
   PrimExpr extent;
   if (self->shape.size() == 0) {
-    extent = MakeConst(self->DefaultIndexType(), 1);
+    extent = MakeConst(PrimType(self->DefaultIndexType()), 1);
   } else if (self->strides.size() == self->shape.size()) {
     int highest_dim = 0;
     extent = self->strides[highest_dim] * self->shape[highest_dim] - offset;
@@ -555,9 +567,9 @@ PrimExpr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lane
   }
   PrimExpr elem_offset = self->elem_offset + offset;
   if (content_lanes > 1) {
-    e_dtype = tirx::TypeAnnotation(self->dtype.with_lanes(content_lanes));
-    extent = extent / MakeConst(self->elem_offset.dtype(), content_lanes);
-    elem_offset = self->elem_offset / MakeConst(self->elem_offset.dtype(), content_lanes);
+    e_dtype = tirx::TypeAnnotation(PrimType(self->dtype).WithLanes(content_lanes));
+    extent = extent / MakeConst(self->elem_offset.ty(), content_lanes);
+    elem_offset = self->elem_offset / MakeConst(self->elem_offset.ty(), content_lanes);
   } else {
     e_dtype = tirx::TypeAnnotation(self->dtype);
   }
@@ -570,14 +582,14 @@ PrimExpr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lane
   return tirx::Call(ptr_type, tirx::builtin::tvm_access_ptr(), acc_args);
 }
 
-Buffer::Buffer(Var data, DataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<PrimExpr> strides,
+Buffer::Buffer(Var data, PrimType dtype, ffi::Array<PrimExpr> shape, ffi::Array<PrimExpr> strides,
                PrimExpr elem_offset, ffi::String name, int data_alignment, int offset_factor,
                BufferType buffer_type, ffi::Array<IntImm> axis_separators, Span span,
                ffi::Optional<Layout> layout, ffi::Array<PrimExpr> allocated_addr) {
-  DataType storage_dtype = dtype;
+  DLDataType storage_dtype = dtype->dtype;
   // specially handle bool
-  if (storage_dtype == DataType::Bool()) {
-    storage_dtype = DataType::Int(8);
+  if (storage_dtype == DLDataType{kDLBool, 8, 1}) {
+    storage_dtype = DLDataType{kDLInt, 8, 1};
   }
   // The buffer dtype may differ from the dtype of the underlying
   // allocation, such as a single allocation that backs multiple
@@ -606,7 +618,7 @@ Buffer::Buffer(Var data, DataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<
   n->axis_separators = std::move(axis_separators);
   n->name = std::move(name);
   if (!elem_offset.defined()) {
-    elem_offset = IntImm(n->DefaultIndexType(), 0);
+    elem_offset = IntImm(PrimType(n->DefaultIndexType()), 0);
   }
   if (data_alignment <= 0) {
     data_alignment = runtime::kAllocAlignment;
@@ -620,7 +632,7 @@ Buffer::Buffer(Var data, DataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<
   n->buffer_type = buffer_type;
   if (n->buffer_type == kAutoBroadcast && n->shape.size() > 0 && n->strides.empty()) {
     for (size_t i = 0; i < n->shape.size(); ++i) {
-      n->strides.push_back(Var("stride", n->shape[i].dtype()));
+      n->strides.push_back(Var("stride", n->shape[i].ty()));
     }
   }
   n->span = std::move(span);
@@ -633,10 +645,11 @@ Buffer::Buffer(Var data, DataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<
   data_ = std::move(n);
 }
 
-tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, DataType dtype, std::string name,
-                                       int data_alignment, int offset_factor, bool compact,
-                                       std::string memory_scope) {
-  DataType storage_dtype = (dtype == DataType::Bool() ? DataType::Int(8) : dtype);
+tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, DLDataType dtype,
+                                       std::string name, int data_alignment, int offset_factor,
+                                       bool compact, std::string memory_scope) {
+  DLDataType storage_dtype =
+      (dtype == DLDataType{kDLBool, 8, 1} ? DLDataType{kDLInt, 8, 1} : dtype);
   auto data = tirx::Var(name, PointerType(PrimType(storage_dtype), memory_scope));
   bool has_any = false;
   if (!compact) {
@@ -651,7 +664,7 @@ tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, DataType dtyp
 
   PrimExpr elem_offset;
   if (offset_factor != 0) {
-    elem_offset = tirx::Var(name + "_elem_offset", shape[0].dtype());
+    elem_offset = tirx::Var(name + "_elem_offset", shape[0].ty());
   } else {
     elem_offset = PrimExpr();
   }
@@ -667,7 +680,7 @@ Buffer Buffer::with_allocated_addr(ffi::Array<PrimExpr> allocated_addr) const {
   return output;
 }
 
-Buffer Buffer::with_dtype(DataType dtype) const {
+Buffer Buffer::with_dtype(PrimType dtype) const {
   Buffer output = *this;
   auto writer = output.CopyOnWrite();
   writer->dtype = dtype;
@@ -682,7 +695,7 @@ Buffer Buffer::with_data(Var data) const {
 }
 
 PrimExpr Buffer::OffsetOf_p(const Array<PrimExpr>& indices) const {
-  return tirx::Call(DataType::Int(32), tirx::builtin::buffer_offset(),
+  return tirx::Call(PrimType::Int(32), tirx::builtin::buffer_offset(),  // call typed as Int(32)
                     {BufferLoad(*this, indices)});
 }
 
@@ -705,7 +718,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                     auto buffer_type = args[8].cast<ffi::String>();
                     BufferType type = (buffer_type == "auto_broadcast") ? kAutoBroadcast : kDefault;
                     auto data = args[0].cast<Var>();
-                    auto dtype = args[1].cast<DataType>();
+                    auto dtype = args[1].cast<PrimType>();
                     auto shape = args[2].cast<ffi::Array<PrimExpr>>();
                     auto strides = args[3].cast<ffi::Array<PrimExpr>>();
                     auto elem_offset = args[4].cast<PrimExpr>();
@@ -718,15 +731,21 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                     *ret = Buffer(data, dtype, shape, strides, elem_offset, name, data_alignment,
                                   offset_factor, type, axis_separators, span, layout);
                   })
-      .def_method("tirx.BufferAccessPtr", &Buffer::access_ptr)
+      .def_method(
+          "tirx.BufferAccessPtr",
+          static_cast<PrimExpr (Buffer::*)(int, PrimType, int, PrimExpr, ffi::Optional<PrimExpr>)
+                          const>(&Buffer::access_ptr))
       .def_method("tirx.BufferGetFlattenedBuffer", &Buffer::GetFlattenedBuffer)
       .def_method("tirx.BufferOffsetOf", &Buffer::OffsetOf)
       .def_method("tirx.BufferOffsetOfp", &Buffer::OffsetOf_p)
-      .def_method("tirx.BufferVLoad", &Buffer::vload)
+      .def_method("tirx.BufferVLoad",
+                  static_cast<PrimExpr (Buffer::*)(ffi::Array<PrimExpr>, PrimType,
+                                                   ffi::Optional<PrimExpr>) const>(&Buffer::vload))
       .def_method("tirx.BufferVStore", &Buffer::vstore)
       .def_method("tirx.BufferStorageScope", &Buffer::scope)
       .def_method("tirx.BufferWithAllocatedAddr", &Buffer::with_allocated_addr)
-      .def_method("tirx.BufferWithDtype", &Buffer::with_dtype)
+      .def_method("tirx.BufferWithDtype",
+                  static_cast<Buffer (Buffer::*)(DLDataType) const>(&Buffer::with_dtype))
       .def_method("tirx.BufferWithData", &Buffer::with_data)
       .def_method("tirx.BufferIsScalar", &Buffer::IsScalar);
 }
diff --git a/src/tirx/ir/buffer_common.h b/src/tirx/ir/buffer_common.h
index b6aebba2d327..41c4b15fbd81 100644
--- a/src/tirx/ir/buffer_common.h
+++ b/src/tirx/ir/buffer_common.h
@@ -23,8 +23,8 @@
 #ifndef TVM_TIR_IR_BUFFER_COMMON_H_
 #define TVM_TIR_IR_BUFFER_COMMON_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ir/type.h>
-#include <tvm/runtime/data_type.h>
 
 #include <optional>
 
@@ -36,11 +36,11 @@ namespace tirx {
  *
  * \param type The type to be checked.
  *
- * \return An std::optional<DataType> object. If the type is a pointer
+ * \return An std::optional<DLDataType> object. If the type is a pointer
  * to a primitive type, the object has a value which is the pointed-to
  * type. Otherwise the object is nullopt.
  */
-inline std::optional<runtime::DataType> GetPointerType(const Type& type) {
+inline std::optional<DLDataType> GetPointerType(const Type& type) {
   if (type.defined()) {
     if (auto* ptr_type = type.as<PointerTypeNode>()) {
       if (auto* prim_type = ptr_type->element_type.as<PrimTypeNode>()) {
diff --git a/src/tirx/ir/data_type_rewriter.cc b/src/tirx/ir/data_type_rewriter.cc
index 769f635a6957..29eb5d0e0197 100644
--- a/src/tirx/ir/data_type_rewriter.cc
+++ b/src/tirx/ir/data_type_rewriter.cc
@@ -49,10 +49,10 @@ Stmt DataTypeLegalizer::VisitStmt_(const ForNode* op) {
   PrimExpr e = VisitExpr(op->loop_var);
   Var var = e.as_or_throw<Var>();
   auto n = CopyOnWrite(op);
-  n->min = cast(var.dtype(), op->min);
-  n->extent = cast(var.dtype(), op->extent);
+  n->min = cast(var.ty(), op->min);
+  n->extent = cast(var.ty(), op->extent);
   if (op->step.has_value()) {
-    n->step = cast(var.dtype(), *op->step);
+    n->step = cast(var.ty(), *op->step);
   }
   return For(n);
 }
@@ -62,8 +62,8 @@ Stmt DataTypeLegalizer::VisitStmt_(const SBlockRealizeNode* op) {
   ffi::Array<PrimExpr> new_iter_values;
   bool changed = false;
   for (int i = 0; i < static_cast<int>(op->iter_values.size()); ++i) {
-    auto dtype = realize->block->iter_vars[i]->var->dtype;
-    if (op->iter_values[i]->dtype != dtype) {
+    PrimType dtype = realize->block->iter_vars[i]->var.ty();
+    if (op->iter_values[i].ty() != dtype) {
       new_iter_values.push_back(cast(dtype, realize->iter_values[i]));
       changed = true;
     } else {
@@ -80,8 +80,8 @@ Stmt DataTypeLegalizer::VisitStmt_(const SBlockNode* op) {
   SBlock new_block = StmtExprMutator::VisitStmt_(op).as_or_throw<SBlock>();
   ffi::Array<IterVar> new_iter_vars =
       MutateArray(new_block->iter_vars, [/*this*/](const IterVar& iter) {
-        auto dtype = iter->var.dtype();
-        if (iter->dom->min->dtype != dtype || iter->dom->extent->dtype != dtype) {
+        PrimType dtype = iter->var.ty();
+        if (iter->dom->min.ty() != dtype || iter->dom->extent.ty() != dtype) {
           IterVar new_iter = iter;
           new_iter.CopyOnWrite()->dom =
               Range(cast(dtype, iter->dom->min), cast(dtype, iter->dom->extent));
@@ -111,15 +111,17 @@ Stmt DataTypeLegalizer::VisitStmt_(const AttrStmtNode* op) {
       Range dom = iv->dom;
       if (dom.defined()) {
         PrimExpr extend = dom->extent;
-        TVM_FFI_ICHECK(extend.dtype().is_int() && var.dtype().is_int());
-        if (var.dtype().bits() != extend.dtype().bits()) {
-          DataType dtype = var.dtype();
-          dom = Range(cast(dtype, dom->min), cast(dtype, extend), dom->span);
+        PrimType extend_ty = extend.ty();
+        PrimType var_ty = var.ty();
+        TVM_FFI_ICHECK(extend_ty.MatchesCode(DLDataTypeCode::kDLInt) &&
+                       var_ty.MatchesCode(DLDataTypeCode::kDLInt));
+        if (var_ty.bits() != extend_ty.bits()) {
+          dom = Range(cast(var_ty, dom->min), cast(var_ty, extend), dom->span);
         }
       }
       ivmap_[iv] = IterVar(dom, var, iv->iter_type, iv->thread_tag);
     }
-    return AttrStmt(ivmap_[iv], op->attr_key, cast(var.dtype(), op->value), op->body);
+    return AttrStmt(ivmap_[iv], op->attr_key, cast(var.ty(), op->value), op->body);
   }
   return StmtExprMutator::VisitStmt_(op);
 }
@@ -128,8 +130,8 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const LetNode* op) {
   PrimExpr value = this->VisitExpr(op->value);
   Var var = op->var;
 
-  if (value.dtype() != op->var->dtype) {
-    var = op->var.copy_with_dtype(value.dtype());
+  if (value.ty() != op->var.ty()) {
+    var = op->var.copy_with_dtype(value.ty());
     var_remap_[op->var.get()] = var;
   }
 
@@ -146,8 +148,8 @@ Stmt DataTypeLegalizer::VisitStmt_(const BindNode* op) {
   PrimExpr value = this->VisitExpr(op->value);
   Var var = op->var;
 
-  if (value.dtype() != op->var->dtype) {
-    var = op->var.copy_with_dtype(value.dtype());
+  if (value.ty() != op->var.ty()) {
+    var = op->var.copy_with_dtype(value.ty());
     var_remap_[op->var.get()] = var;
   }
 
@@ -170,13 +172,15 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const SelectNode* op) {
   PrimExpr true_value = this->VisitExpr(op->true_value);
   PrimExpr false_value = this->VisitExpr(op->false_value);
   if (condition.same_as(op->condition) && true_value.same_as(op->true_value) &&
-      false_value.same_as(op->false_value) && true_value.dtype() == false_value.dtype()) {
+      false_value.same_as(op->false_value) && true_value.ty() == false_value.ty()) {
     return ffi::GetRef<PrimExpr>(op);
   } else {
-    int bits = std::max(true_value.dtype().bits(), false_value.dtype().bits());
-    DataType dtype = true_value.dtype().with_bits(bits);
-    if (true_value.dtype() != dtype) true_value = cast(dtype, true_value);
-    if (false_value.dtype() != dtype) false_value = cast(dtype, false_value);
+    PrimType true_dtype = true_value.ty();
+    PrimType false_dtype = false_value.ty();
+    int bits = std::max(true_dtype.bits(), false_dtype.bits());
+    PrimType dtype = true_dtype.WithBits(bits);
+    if (true_dtype != dtype) true_value = cast(dtype, true_value);
+    if (false_dtype != dtype) false_value = cast(dtype, false_value);
     return Select(condition, true_value, false_value);
   }
 }
@@ -184,14 +188,17 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const SelectNode* op) {
 PrimExpr DataTypeLegalizer::VisitExpr_(const RampNode* op) {
   PrimExpr base = VisitExpr(op->base);
   PrimExpr stride = VisitExpr(op->stride);
-  if (base.same_as(op->base) && stride.same_as(op->stride) && base.dtype() == stride.dtype()) {
+  if (base.same_as(op->base) && stride.same_as(op->stride) && base.ty() == stride.ty()) {
     return ffi::GetRef<PrimExpr>(op);
   } else {
-    TVM_FFI_ICHECK(base.dtype().is_int() && stride.dtype().is_int());
-    int bits = std::max(base.dtype().bits(), stride.dtype().bits());
-    DataType dtype = base.dtype().with_bits(bits);
-    if (base.dtype() != dtype) base = cast(dtype, base);
-    if (stride.dtype() != dtype) stride = cast(dtype, stride);
+    PrimType base_ty = base.ty();
+    PrimType stride_ty = stride.ty();
+    TVM_FFI_ICHECK(base_ty.MatchesCode(DLDataTypeCode::kDLInt) &&
+                   stride_ty.MatchesCode(DLDataTypeCode::kDLInt));
+    int wide_bits = std::max(base_ty.bits(), stride_ty.bits());  // the wider operand decides
+    PrimType wide_ty = base_ty.WithBits(wide_bits);
+    if (base_ty->dtype != wide_ty->dtype) base = cast(wide_ty, base);  // cast only on mismatch
+    if (stride_ty->dtype != wide_ty->dtype) stride = cast(wide_ty, stride);
     return Ramp(base, stride, op->lanes);
   }
 }
@@ -200,15 +207,15 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const CastNode* op) {
   return StmtExprMutator::VisitExpr_(op);
 }
 
-#define TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)             \
-  PrimExpr DataTypeLegalizer::VisitExpr_(const OP* op) {                  \
-    PrimExpr a = this->VisitExpr(op->a);                                  \
-    PrimExpr b = this->VisitExpr(op->b);                                  \
-    if (op->a.same_as(a) && op->b.same_as(b) && a.dtype() == b.dtype()) { \
-      return ffi::GetRef<PrimExpr>(op);                                   \
-    } else {                                                              \
-      return FUNC(a, b);                                                  \
-    }                                                                     \
+#define TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)       \
+  PrimExpr DataTypeLegalizer::VisitExpr_(const OP* op) {            \
+    PrimExpr a = this->VisitExpr(op->a);                            \
+    PrimExpr b = this->VisitExpr(op->b);                            \
+    if (op->a.same_as(a) && op->b.same_as(b) && a.ty() == b.ty()) { \
+      return ffi::GetRef<PrimExpr>(op);                             \
+    } else {                                                        \
+      return FUNC(a, b);                                            \
+    }                                                               \
   }
 
 TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(AddNode, operator+);
@@ -251,15 +258,18 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const CallNode* op) {
   if (op->op.same_as(pow_op)) {
     return pow(op->args[0], op->args[1]);
   } else if (op->op.same_as(builtin::if_then_else())) {
-    return Call(op->dtype, op->op, {op->args[0], op->args[1], op->args[2]}, op->attrs, op->span);
+    return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op, {op->args[0], op->args[1], op->args[2]},
+                op->attrs, op->span);
   } else if (op->op.same_as(clz_op)) {
-    DataType before_dtype = before->args[0]->dtype;
-    DataType after_dtype = op->args[0]->dtype;
-    TVM_FFI_ICHECK((before_dtype.is_int() || before_dtype.is_uint()) &&
+    PrimType before_dtype = before->args[0].ty();
+    PrimType after_dtype = op->args[0].ty();
+    TVM_FFI_ICHECK((before_dtype.code() == DLDataTypeCode::kDLInt ||
+                    before_dtype.code() == DLDataTypeCode::kDLUInt) &&
                    (before_dtype.bits() == 32 || before_dtype.bits() == 64))
         << "clz only supports 32 or 64 bit integer types, but get type before legalizing: "
         << before_dtype;
-    TVM_FFI_ICHECK((after_dtype.is_int() || after_dtype.is_uint()) &&
+    TVM_FFI_ICHECK((after_dtype.code() == DLDataTypeCode::kDLInt ||
+                    after_dtype.code() == DLDataTypeCode::kDLUInt) &&
                    (after_dtype.bits() == 32 || after_dtype.bits() == 64))
         << "clz only supports 32 or 64 bit integer types, but get type after legalizing: "
         << after_dtype;
@@ -434,7 +444,8 @@ Stmt IndexDataTypeRewriter::VisitStmt_(const BufferStoreNode* op) {
 
   Buffer new_buffer = VisitBufferUse(op->buffer);
   auto value = this->VisitExpr(op->value);
-  if (new_buffer->dtype != value->dtype && value->dtype.is_scalar()) {
+  PrimType value_dtype = value.ty();
+  if (new_buffer->dtype != value_dtype && value_dtype.IsScalar()) {
     value = cast(new_buffer->dtype, value);
   }
   auto indices = VisitIndices(op->indices);
@@ -514,12 +525,12 @@ Stmt IndexDataTypeRewriter::VisitStmt_(const ForNode* op) {
     For new_for = ffi::GetRef<For>(op);
     auto* n = new_for.CopyOnWrite();
     n->loop_var = new_loop_var;
-    n->min = cast(new_loop_var.dtype(), min);
-    n->extent = cast(new_loop_var.dtype(), extent);
+    n->min = cast(new_loop_var.ty(), min);
+    n->extent = cast(new_loop_var.ty(), extent);
     if (op->thread_binding.defined()) {
       auto old_thread_binding = op->thread_binding.value();
       auto* ptr = old_thread_binding.CopyOnWrite();
-      ptr->var = old_thread_binding->var.copy_with_dtype(new_loop_var.dtype());
+      ptr->var = old_thread_binding->var.copy_with_dtype(new_loop_var.ty());
       n->thread_binding = ffi::Optional<IterVar>(std::move(old_thread_binding));
     }
     n->body = new_body;
@@ -540,17 +551,18 @@ Stmt IndexDataTypeRewriter::VisitStmt_(const BindNode* op) {
   PrimExpr value = VisitExpr(op->value);
   Var var = var_remap_[bind_stmt->var.get()];
   is_enabled_ = is_enabled;
-  TVM_FFI_ICHECK(value.dtype() == var.dtype());
+  TVM_FFI_ICHECK(value.ty() == var.ty());
   return Bind(var, value, bind_stmt->span);
 }
 
-#define TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)                     \
-  PrimExpr IndexDataTypeRewriter::VisitExpr_(const OP* op) {                       \
-    bool is_enabled = is_enabled_;                                                 \
-    is_enabled_ = is_condition_ && op->a->dtype.is_int() && op->b->dtype.is_int(); \
-    auto result = Parent::VisitExpr_(op);                                          \
-    is_enabled_ = is_enabled;                                                      \
-    return result;                                                                 \
+#define TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)                       \
+  PrimExpr IndexDataTypeRewriter::VisitExpr_(const OP* op) {                         \
+    bool is_enabled = is_enabled_;                                                   \
+    is_enabled_ = is_condition_ && op->a.ty().MatchesCode(DLDataTypeCode::kDLInt) && \
+                  op->b.ty().MatchesCode(DLDataTypeCode::kDLInt);                    \
+    auto result = Parent::VisitExpr_(op);                                            \
+    is_enabled_ = is_enabled;                                                        \
+    return result;                                                                   \
   }
 
 TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(EQNode, operator==);
@@ -567,8 +579,8 @@ PrimExpr IndexDataTypeRewriter::VisitExpr_(const CallNode* op) {
     is_condition_ = true;
     PrimExpr cond = VisitExpr(op->args[0]);
     is_condition_ = is_condition;
-    return Call(op->dtype, op->op, {cond, VisitExpr(op->args[1]), VisitExpr(op->args[2])},
-                op->attrs, op->span);
+    return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op,
+                {cond, VisitExpr(op->args[1]), VisitExpr(op->args[2])}, op->attrs, op->span);
   }
   return Parent::VisitExpr_(op);
 }
@@ -582,20 +594,22 @@ PrimExpr IndexDataTypeRewriter::VisitExpr_(const SelectNode* op) {
   PrimExpr false_value = this->VisitExpr(op->false_value);
 
   if (condition.same_as(op->condition) && true_value.same_as(op->true_value) &&
-      false_value.same_as(op->false_value) && true_value.dtype() == false_value.dtype()) {
+      false_value.same_as(op->false_value) && true_value.ty() == false_value.ty()) {
     return ffi::GetRef<PrimExpr>(op);
   } else {
-    int bits = std::max(true_value.dtype().bits(), false_value.dtype().bits());
-    DataType dtype = true_value.dtype().with_bits(bits);
-    if (true_value.dtype() != dtype) true_value = cast(dtype, true_value);
-    if (false_value.dtype() != dtype) false_value = cast(dtype, false_value);
+    PrimType true_dtype = true_value.ty();
+    PrimType false_dtype = false_value.ty();
+    int bits = std::max(true_dtype.bits(), false_dtype.bits());
+    PrimType dtype = true_dtype.WithBits(bits);
+    if (true_dtype->dtype != dtype->dtype) true_value = cast(dtype, true_value);
+    if (false_dtype->dtype != dtype->dtype) false_value = cast(dtype, false_value);
     return Select(condition, true_value, false_value);
   }
 }
 
 #undef TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH
 
-IndexDataTypeNormalizer::IndexDataTypeNormalizer(DataType target_data_type)
+IndexDataTypeNormalizer::IndexDataTypeNormalizer(PrimType target_data_type)
     : target_data_type_(std::move(target_data_type)) {}
 
 PrimFunc IndexDataTypeNormalizer::Rewrite(PrimFunc func) {
@@ -612,7 +626,7 @@ PrimFunc IndexDataTypeNormalizer::Rewrite(PrimFunc func) {
   bool is_enabled = true;
   std::swap(is_enabled_, is_enabled);
   ffi::Array<Var> params = func->params.Map([this](Var param) {
-    if (param.dtype().is_int()) {
+    if (param.ty().MatchesCode(DLDataTypeCode::kDLInt)) {
       return this->VisitExpr(param).as_or_throw<Var>();
     } else {
       return param;
@@ -627,12 +641,12 @@ PrimFunc IndexDataTypeNormalizer::Rewrite(PrimFunc func) {
   return func;
 }
 
-bool IndexDataTypeNormalizer::CanRewriteDType(DataType dtype) const {
-  return dtype.is_int() && dtype.bits() >= 32;
+bool IndexDataTypeNormalizer::CanRewriteDType(PrimType dtype) const {
+  return dtype.code() == DLDataTypeCode::kDLInt && dtype.bits() >= 32;  // kDLInt, 32+ bits
 }
 
 PrimExpr IndexDataTypeNormalizer::VisitExpr_(const IntImmNode* op) {
-  if (is_enabled_ && CanRewriteDType(op->dtype)) {
+  if (is_enabled_ && CanRewriteDType(op->ty())) {
     TVM_FFI_ICHECK_LE(op->value, max_value(target_data_type_).as_or_throw<IntImm>()->value);
     return cast(target_data_type_, ffi::GetRef<IntImm>(op));
   }
@@ -640,7 +654,8 @@ PrimExpr IndexDataTypeNormalizer::VisitExpr_(const IntImmNode* op) {
 }
 
 PrimExpr IndexDataTypeNormalizer::VisitExpr_(const VarNode* op) {
-  if (is_enabled_ && CanRewriteDType(op->dtype) && op->dtype != target_data_type_ &&
+  PrimType dtype = op->ty();
+  if (is_enabled_ && CanRewriteDType(dtype) && dtype->dtype != target_data_type_->dtype &&
       !var_remap_.count(op)) {
     var_remap_[op] = ffi::GetRef<Var>(op).copy_with_dtype(target_data_type_);
   }
@@ -651,9 +666,10 @@ PrimExpr IndexDataTypeNormalizer::VisitExpr_(const CastNode* op) {
   // Unwrap the cast only when the dtype of this cast is integer dtype.
   // When the dtype of this cast is not integer dtype, it means that this cast
   // has some other purpose, and we should not unwrap the cast.
-  if (is_enabled_ && CanRewriteDType(op->dtype)) {
+  PrimType dtype = op->ty();
+  if (is_enabled_ && CanRewriteDType(dtype)) {
     PrimExpr value = IndexDataTypeNormalizer::VisitExpr(op->value);
-    return value->dtype == target_data_type_ ? value : Cast(target_data_type_, value);
+    return value.ty()->dtype == target_data_type_->dtype ? value : Cast(target_data_type_, value);
   }
   return IndexDataTypeRewriter::VisitExpr_(op);
 }
diff --git a/src/tirx/ir/data_type_rewriter.h b/src/tirx/ir/data_type_rewriter.h
index 1bea362f6283..193aa2e09b55 100644
--- a/src/tirx/ir/data_type_rewriter.h
+++ b/src/tirx/ir/data_type_rewriter.h
@@ -141,7 +141,7 @@ class IndexDataTypeRewriter : public DataTypeLegalizer {
  */
 class IndexDataTypeNormalizer : public IndexDataTypeRewriter {
  public:
-  explicit IndexDataTypeNormalizer(DataType target_data_type);
+  explicit IndexDataTypeNormalizer(PrimType target_data_type);
   PrimFunc Rewrite(PrimFunc func);
 
  protected:
@@ -153,9 +153,9 @@ class IndexDataTypeNormalizer : public IndexDataTypeRewriter {
   PrimExpr VisitExpr_(const CastNode* op) override;
 
   /*! \brief Specifies which data type we can rewrite */
-  virtual bool CanRewriteDType(DataType dtype) const;
+  virtual bool CanRewriteDType(PrimType dtype) const;
 
-  DataType target_data_type_ = DataType::Int(64);
+  PrimType target_data_type_ = PrimType::Int(64);
 };
 
 }  // namespace tirx
diff --git a/src/tirx/ir/exec_scope.cc b/src/tirx/ir/exec_scope.cc
index 582ac578ceac..072666610ed3 100644
--- a/src/tirx/ir/exec_scope.cc
+++ b/src/tirx/ir/exec_scope.cc
@@ -389,7 +389,7 @@ ffi::Array<PrimExpr> ResolveCuda(ScopeBinding binding,
       ffi::Array<PrimExpr> ret;
       for (int i = 0; i < out_dim; ++i) {
         ret.push_back(
-            tirx::Call(DataType::Int(32), ptx_fetch_register_op,
+            tirx::Call(PrimType::Int(32), ptx_fetch_register_op,
                        {IntImm::Int32(32), StringImm("clusterid." + std::string(1, 'x' + i))}));
       }
       return ret;
@@ -438,8 +438,8 @@ ffi::Array<PrimExpr> ScopeIdResolve::Resolve(ScopeBinding binding,
 
 PrimExpr ScopeIdResolve::ComputeWarpIdInCta(const LaunchParams& params) {
   PrimExpr warp_id = FloorDiv(GetLinearThreadIndex(params), 32);
-  PrimExpr mask = IntImm(DataType::UInt(32), 0xffffffff);
-  return Call(warp_id.dtype(), builtin::tvm_warp_shuffle(),
+  PrimExpr mask = IntImm(PrimType::UInt(32), 0xffffffff);  // uint32 with all 32 bits set
+  return Call(warp_id.ty(), builtin::tvm_warp_shuffle(),
               {mask, warp_id, IntImm::Int32(0), IntImm::Int32(32), IntImm::Int32(32)});
 }
 
diff --git a/src/tirx/ir/expr.cc b/src/tirx/ir/expr.cc
index c2e89c6ec0de..0e250924c296 100644
--- a/src/tirx/ir/expr.cc
+++ b/src/tirx/ir/expr.cc
@@ -55,6 +55,20 @@ std::optional<int> ExtractVscaleFactor(const PrimExpr& lanes) {
   }
   return std::nullopt;
 }
+
+int GetLanesOrVScaleFactor(const PrimType& ty) {
+  return ty.IsScalableVector() ? ty.VScaleFactor() : ty.lanes();  // scalable: factor, else lanes
+}
+
+TVM_FFI_INLINE const PrimTypeNode* GetPrimTypeNode(const PrimExpr& expr) {
+  // Hot path for expression constructors: skip the ObjectRef that PrimExpr::ty() would build.
+  const auto* expr_node = expr.get();
+  TVM_FFI_DCHECK(expr_node != nullptr);
+  TVM_FFI_DCHECK(expr_node->BaseExprNode::ty.defined());
+  const auto* type_node = expr_node->BaseExprNode::ty.as<PrimTypeNode>();
+  TVM_FFI_DCHECK(type_node != nullptr);  // null here means ty is not a PrimTypeNode
+  return type_node;
+}
 }  // namespace
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -109,44 +123,46 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   // src/script/printer/tirx/expr.cc (-> ReprPrintTIR which delegates to TVMScriptPrinter).
 }
 
-#define TVM_DEFINE_BINOP_CONSTRUCTOR(Name)                                    \
-  Name::Name(PrimExpr a, PrimExpr b, Span span) {                             \
-    using T = Name::ContainerType;                                            \
-    TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined\n";             \
-    TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined\n";             \
-    TVM_FFI_CHECK(a.dtype() == b.dtype(), TypeError)                          \
-        << "mismatched types. " << a.dtype() << " vs. " << b.dtype() << "\n"; \
-    ffi::ObjectPtr<T> node = ffi::make_object<T>();                           \
-    node->dtype = a.dtype();                                                  \
-    node->a = std::move(a);                                                   \
-    node->b = std::move(b);                                                   \
-    node->span = std::move(span);                                             \
-    data_ = std::move(node);                                                  \
+#define TVM_DEFINE_BINOP_CONSTRUCTOR(Name)                                        \
+  Name::Name(PrimExpr a, PrimExpr b, Span span) {                                 \
+    using T = Name::ContainerType;                                                \
+    TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined\n";                 \
+    TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined\n";                 \
+    const PrimTypeNode* a_ty = GetPrimTypeNode(a);                                \
+    const PrimTypeNode* b_ty = GetPrimTypeNode(b);                                \
+    TVM_FFI_CHECK(a_ty->dtype == b_ty->dtype, TypeError)                          \
+        << "mismatched types. " << a_ty->dtype << " vs. " << b_ty->dtype << "\n"; \
+    ffi::ObjectPtr<T> node = ffi::make_object<T>();                               \
+    node->BaseExprNode::ty = a.get()->BaseExprNode::ty;                           \
+    node->a = std::move(a);                                                       \
+    node->b = std::move(b);                                                       \
+    node->span = std::move(span);                                                 \
+    data_ = std::move(node);                                                      \
   }
 
-#define TVM_DEFINE_CMPOP_CONSTRUCTOR(Name)                                                  \
-  Name::Name(PrimExpr a, PrimExpr b, Span span) {                                           \
-    using T = Name::ContainerType;                                                          \
-    TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined\n";                           \
-    TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined\n";                           \
-    TVM_FFI_CHECK(a.dtype() == b.dtype(), TypeError)                                        \
-        << "mismatched types. " << a.dtype() << " vs. " << b.dtype() << "\n";               \
-    ffi::ObjectPtr<T> node = ffi::make_object<T>();                                         \
-    DataType a_dtype = a.dtype();                                                           \
-    node->dtype =                                                                           \
-        DataType::Bool(a_dtype.get_lanes_or_vscale_factor(), a_dtype.is_scalable_vector()); \
-    node->a = std::move(a);                                                                 \
-    node->b = std::move(b);                                                                 \
-    node->span = std::move(span);                                                           \
-    data_ = std::move(node);                                                                \
+#define TVM_DEFINE_CMPOP_CONSTRUCTOR(Name)                                        \
+  Name::Name(PrimExpr a, PrimExpr b, Span span) {                                 \
+    using T = Name::ContainerType;                                                \
+    TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined\n";                 \
+    TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined\n";                 \
+    const PrimTypeNode* a_ty = GetPrimTypeNode(a);                                \
+    const PrimTypeNode* b_ty = GetPrimTypeNode(b);                                \
+    TVM_FFI_CHECK(a_ty->dtype == b_ty->dtype, TypeError)                          \
+        << "mismatched types. " << a_ty->dtype << " vs. " << b_ty->dtype << "\n"; \
+    ffi::ObjectPtr<T> node = ffi::make_object<T>();                               \
+    node->BaseExprNode::ty = PrimType(DLDataType{kDLBool, 8, a_ty->dtype.lanes}); \
+    node->a = std::move(a);                                                       \
+    node->b = std::move(b);                                                       \
+    node->span = std::move(span);                                                 \
+    data_ = std::move(node);                                                      \
   }
 
 // Var
-Var::Var(ffi::String name_hint, DataType dtype, Span span) {
+Var::Var(ffi::String name_hint, PrimType dtype, Span span) {
   auto n = ffi::make_object<VarNode>();
   n->name_hint = std::move(name_hint);
-  n->type_annotation = GetTypeFromRuntimeDataType(dtype);
-  n->dtype = std::move(dtype);
+  n->type_annotation = dtype;  // annotation and expression type share one PrimType
+  n->BaseExprNode::ty = std::move(dtype);  // last use of the by-value argument: move it
   n->span = std::move(span);
   data_ = std::move(n);
 }
@@ -154,8 +170,12 @@ Var::Var(ffi::String name_hint, DataType dtype, Span span) {
 Var::Var(ffi::String name_hint, Type type_annotation, Span span) {
   auto n = ffi::make_object<VarNode>();
   n->name_hint = std::move(name_hint);
-  n->dtype = GetRuntimeDataType(type_annotation);
   n->type_annotation = std::move(type_annotation);
+  if (n->type_annotation.as<PrimTypeNode>() == nullptr) {  // annotation is not a PrimType
+    n->BaseExprNode::ty = PrimType(GetRuntimeDLDataType(n->type_annotation));
+  } else {
+    n->BaseExprNode::ty = n->type_annotation;  // already a PrimType: reuse it as-is
+  }
   n->span = std::move(span);
   data_ = std::move(n);
 }
@@ -176,7 +196,7 @@ Var Var::copy_with_suffix(const ffi::String& suffix) const {
   return this->copy_with_name(get()->name_hint + suffix);
 }
 
-Var Var::copy_with_dtype(DataType dtype) const {
+Var Var::copy_with_dtype(PrimType dtype) const {
   const VarNode* node = get();
   ffi::ObjectPtr<VarNode> new_ptr;
   if (auto* ptr = this->as<SizeVarNode>()) {
@@ -184,8 +204,8 @@ Var Var::copy_with_dtype(DataType dtype) const {
   } else {
     new_ptr = ffi::make_object<VarNode>(*node);
   }
-  new_ptr->type_annotation = GetTypeFromRuntimeDataType(dtype);
-  new_ptr->dtype = std::move(dtype);
+  new_ptr->type_annotation = dtype;
+  new_ptr->BaseExprNode::ty = dtype;
   return Var(new_ptr);
 }
 
@@ -195,17 +215,17 @@ TVM_FFI_STATIC_INIT_BLOCK() {
     if (type.as<Type>()) {
       return Var(name_hint, type.cast<Type>(), span);
     } else {
-      return Var(name_hint, type.cast<DataType>(), span);
+      return Var(name_hint, type.cast<PrimType>(), span);
     }
   });
 }
 
 // SizeVar
-SizeVar::SizeVar(ffi::String name_hint, DataType dtype, Span span) {
+SizeVar::SizeVar(ffi::String name_hint, PrimType dtype, Span span) {
   auto n = ffi::make_object<SizeVarNode>();
   n->name_hint = std::move(name_hint);
-  n->type_annotation = GetTypeFromRuntimeDataType(dtype);
-  n->dtype = std::move(dtype);
+  n->type_annotation = dtype;  // annotation and expression type share one PrimType
+  n->BaseExprNode::ty = std::move(dtype);  // last use of the by-value argument: move it
   n->span = std::move(span);
   data_ = std::move(n);
 }
@@ -213,8 +233,8 @@ SizeVar::SizeVar(ffi::String name_hint, DataType dtype, Span span) {
 SizeVar::SizeVar(ffi::String name_hint, Type type_annotation, Span span) {
   auto n = ffi::make_object<SizeVarNode>();
   n->name_hint = std::move(name_hint);
-  n->dtype = GetRuntimeDataType(type_annotation);
   n->type_annotation = std::move(type_annotation);
+  n->BaseExprNode::ty = PrimType(GetRuntimeDLDataType(n->type_annotation));  // always rebuilt
   n->span = std::move(span);
   data_ = std::move(n);
 }
@@ -222,20 +242,22 @@ SizeVar::SizeVar(ffi::String name_hint, Type type_annotation, Span span) {
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def("tirx.SizeVar",
-                        [](ffi::String s, DataType t, Span span) { return SizeVar(s, t, span); });
+                        [](ffi::String s, PrimType t, Span span) { return SizeVar(s, t, span); });
 }
 
 // IterVar
 IterVar::IterVar(Range dom, Var var, IterVarType t, ffi::String thread_tag, Span span) {
   ffi::ObjectPtr<IterVarNode> n = ffi::make_object<IterVarNode>();
   if (dom.defined() && dom->extent.defined()) {
-    TVM_FFI_ICHECK(dom->extent.dtype().is_int())
+    PrimType extent_ty = dom->extent.ty();
+    PrimType var_ty = var.ty();
+    TVM_FFI_ICHECK(extent_ty.code() == DLDataTypeCode::kDLInt)
         << "The dtype of the domain of an IterVar must be an integer type. However, the domain's "
            "dtype is "
-        << dom->extent.dtype();
-    TVM_FFI_ICHECK_EQ(dom->extent.dtype(), var.dtype())
-        << "The dtype of the extent of an IterVar (" << dom->extent.dtype()
-        << ") must match its associated Var's dtype (" << var.dtype() << ")";
+        << extent_ty->dtype;
+    TVM_FFI_ICHECK(extent_ty == var_ty)
+        << "The dtype of the extent of an IterVar (" << extent_ty->dtype
+        << ") must match its associated Var's dtype (" << var_ty->dtype << ")";
   }
   n->dom = dom;
   n->var = var;
@@ -256,7 +278,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // StringImm
 StringImm::StringImm(ffi::String value, Span span) {
   ffi::ObjectPtr<StringImmNode> node = ffi::make_object<StringImmNode>();
-  node->dtype = DataType::Handle();
+  node->BaseExprNode::ty = PrimType::Handle();
   node->value = std::move(value);
   node->span = std::move(span);
   data_ = std::move(node);
@@ -269,12 +291,12 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 }
 
 // Cast
-Cast::Cast(DataType t, PrimExpr value, Span span) {
+Cast::Cast(PrimType value_ty, PrimExpr value, Span span) {
   TVM_FFI_ICHECK(value.defined());
-  TVM_FFI_ICHECK_EQ(t.get_lanes_or_vscale_factor(), value.dtype().get_lanes_or_vscale_factor());
-  TVM_FFI_ICHECK(t.is_scalable_vector() == value.dtype().is_scalable_vector());
+  PrimType value_expr_ty = value.ty();
+  TVM_FFI_ICHECK_EQ(value_ty->dtype.lanes, value_expr_ty->dtype.lanes);
   ffi::ObjectPtr<CastNode> node = ffi::make_object<CastNode>();
-  node->dtype = t;
+  node->BaseExprNode::ty = std::move(value_ty);
   node->value = std::move(value);
   node->span = std::move(span);
   data_ = std::move(node);
@@ -282,7 +304,7 @@ Cast::Cast(DataType t, PrimExpr value, Span span) {
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("tirx.Cast", [](DataType dtype, PrimExpr value, Span span) {
+  refl::GlobalDef().def("tirx.Cast", [](PrimType dtype, PrimExpr value, Span span) {
     return Cast(dtype, value, span);
   });
 }
@@ -426,13 +448,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 And::And(PrimExpr a, PrimExpr b, Span span) {
   TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined";
   TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined";
-  TVM_FFI_ICHECK(a.dtype().is_bool());
-  TVM_FFI_ICHECK(b.dtype().is_bool());
-  TVM_FFI_CHECK(a.dtype() == b.dtype(), TypeError) << "mismatched types";
+  PrimType a_ty = a.ty();
+  PrimType b_ty = b.ty();
+  TVM_FFI_ICHECK(a_ty.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_ICHECK(b_ty.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_CHECK(a_ty == b_ty, TypeError) << "mismatched types";
 
   ffi::ObjectPtr<AndNode> node = ffi::make_object<AndNode>();
-  node->dtype =
-      DataType::Bool(a.dtype().get_lanes_or_vscale_factor(), a.dtype().is_scalable_vector());
+  node->BaseExprNode::ty = PrimType(DLDataType{kDLBool, 8, a_ty->dtype.lanes});
   node->a = std::move(a);
   node->b = std::move(b);
   node->span = std::move(span);
@@ -449,13 +472,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Or::Or(PrimExpr a, PrimExpr b, Span span) {
   TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined";
   TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined";
-  TVM_FFI_ICHECK(a.dtype().is_bool());
-  TVM_FFI_ICHECK(b.dtype().is_bool());
-  TVM_FFI_CHECK(a.dtype() == b.dtype(), TypeError) << "mismatched types";
+  PrimType a_ty = a.ty();
+  PrimType b_ty = b.ty();
+  TVM_FFI_ICHECK(a_ty.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_ICHECK(b_ty.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_CHECK(a_ty == b_ty, TypeError) << "mismatched types";
 
   ffi::ObjectPtr<OrNode> node = ffi::make_object<OrNode>();
-  node->dtype =
-      DataType::Bool(a.dtype().get_lanes_or_vscale_factor(), a.dtype().is_scalable_vector());
+  node->BaseExprNode::ty = PrimType(DLDataType{kDLBool, 8, a_ty->dtype.lanes});
   node->a = std::move(a);
   node->b = std::move(b);
   node->span = std::move(span);
@@ -471,11 +495,11 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // Not
 Not::Not(PrimExpr a, Span span) {
   TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined";
-  TVM_FFI_ICHECK(a.dtype().is_bool());
+  PrimType a_ty = a.ty();
+  TVM_FFI_ICHECK(a_ty.MatchesCode(DLDataTypeCode::kDLBool));
 
   ffi::ObjectPtr<NotNode> node = ffi::make_object<NotNode>();
-  DataType a_dtype = a.dtype();
-  node->dtype = DataType::Bool(a_dtype.get_lanes_or_vscale_factor(), a_dtype.is_scalable_vector());
+  node->BaseExprNode::ty = PrimType(DLDataType{kDLBool, 8, a_ty->dtype.lanes});
   node->a = std::move(a);
   node->span = std::move(span);
   data_ = std::move(node);
@@ -491,16 +515,18 @@ Select::Select(PrimExpr condition, PrimExpr true_value, PrimExpr false_value, Sp
   TVM_FFI_CHECK(condition.defined(), ValueError) << "condition is undefined";
   TVM_FFI_CHECK(true_value.defined(), ValueError) << "true_value is undefined";
   TVM_FFI_CHECK(false_value.defined(), ValueError) << "true_value is undefined";
-  TVM_FFI_ICHECK(condition.dtype().is_bool());
-  TVM_FFI_ICHECK(condition.dtype().get_lanes_or_vscale_factor() ==
-                     true_value.dtype().get_lanes_or_vscale_factor() ||
-                 condition.dtype().is_scalar());
-  TVM_FFI_CHECK(false_value.dtype() == true_value.dtype(), TypeError)
+  PrimType condition_ty = condition.ty();
+  PrimType true_ty = true_value.ty();
+  PrimType false_ty = false_value.ty();
+  TVM_FFI_ICHECK(condition_ty.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_ICHECK(GetLanesOrVScaleFactor(condition_ty) == GetLanesOrVScaleFactor(true_ty) ||
+                 condition_ty.IsScalar());
+  TVM_FFI_CHECK(false_ty == true_ty, TypeError)
       << "mismatched types. "
-      << "False type: " << false_value.dtype() << "; True type: " << true_value.dtype();
+      << "False type: " << false_ty->dtype << "; True type: " << true_ty->dtype;
 
   ffi::ObjectPtr<SelectNode> node = ffi::make_object<SelectNode>();
-  node->dtype = true_value.dtype();
+  node->BaseExprNode::ty = true_ty;
   node->condition = std::move(condition);
   node->true_value = std::move(true_value);
   node->false_value = std::move(false_value);
@@ -520,10 +546,12 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Ramp::Ramp(PrimExpr base, PrimExpr stride, PrimExpr lanes, Span span) {
   TVM_FFI_ICHECK(base.defined());
   TVM_FFI_ICHECK(stride.defined());
-  TVM_FFI_ICHECK(base.dtype().is_scalar());
-  TVM_FFI_ICHECK(stride.dtype().is_scalar());
-  if (stride.dtype() != base.dtype()) {
-    stride = cast(base.dtype(), stride);
+  PrimType base_ty = base.ty();
+  PrimType stride_ty = stride.ty();
+  TVM_FFI_ICHECK(base_ty.IsScalar());
+  TVM_FFI_ICHECK(stride_ty.IsScalar());
+  if (stride_ty != base_ty) {
+    stride = cast(base_ty, stride);
   }
 
   ffi::ObjectPtr<RampNode> node = ffi::make_object<RampNode>();
@@ -531,15 +559,16 @@ Ramp::Ramp(PrimExpr base, PrimExpr stride, PrimExpr lanes, Span span) {
   if (lanes_as_int) {
     int lanes = static_cast<int>(lanes_as_int->value);
     TVM_FFI_ICHECK_GT(lanes, 1);
-    node->dtype = base.dtype().with_lanes(lanes);
+    node->BaseExprNode::ty = base_ty.WithLanes(lanes);
     // Stick to int32 lanes for fixed length vectors
     node->lanes = lanes;
   } else { /* scalable vector */
     std::optional<int> vscale_factor = ExtractVscaleFactor(lanes);
     TVM_FFI_ICHECK(vscale_factor) << "Invalid expression for scalable lanes " << lanes;
 
-    node->dtype = base.dtype().with_scalable_vscale_factor(vscale_factor.value());
-    lanes = Mul(Call(DataType::Int(32), tirx::builtin::vscale(), {}), vscale_factor.value());
+    node->BaseExprNode::ty =
+        PrimType::ScalableVector(base_ty.code(), base_ty.bits(), vscale_factor.value());
+    lanes = Mul(Call(PrimType::Int(32), tirx::builtin::vscale(), {}), vscale_factor.value());
     node->lanes = lanes;
   }
   node->base = base;
@@ -558,22 +587,24 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // Broadcast
 Broadcast::Broadcast(PrimExpr value, PrimExpr lanes, Span span) {
   TVM_FFI_ICHECK(value.defined());
-  TVM_FFI_ICHECK(value.dtype().is_scalar());
+  PrimType value_ty = value.ty();
+  TVM_FFI_ICHECK(value_ty.IsScalar());
 
   ffi::ObjectPtr<BroadcastNode> node = ffi::make_object<BroadcastNode>();
   auto* lanes_int = lanes.as<IntImmNode>();
   if (lanes_int) {
     int lanes = static_cast<int>(lanes_int->value);
     TVM_FFI_ICHECK_GT(lanes, 1);
-    node->dtype = value.dtype().with_lanes(lanes);
+    node->BaseExprNode::ty = value_ty.WithLanes(lanes);
     // Stick to int32 lanes for fixed length vectors
     node->lanes = lanes;
   } else { /* scalable vector */
     std::optional<int> vscale_factor = ExtractVscaleFactor(lanes);
     TVM_FFI_ICHECK(vscale_factor) << "Invalid expression for scalable lanes " << lanes;
 
-    node->dtype = value.dtype().with_scalable_vscale_factor(vscale_factor.value());
-    lanes = Mul(Call(DataType::Int(32), tirx::builtin::vscale(), {}), vscale_factor.value());
+    node->BaseExprNode::ty =
+        PrimType::ScalableVector(value_ty.code(), value_ty.bits(), vscale_factor.value());
+    lanes = Mul(Call(PrimType::Int(32), tirx::builtin::vscale(), {}), vscale_factor.value());
     node->lanes = lanes;
   }
   node->value = std::move(value);
@@ -592,10 +623,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Let::Let(Var var, PrimExpr value, PrimExpr body, Span span) {
   TVM_FFI_ICHECK(value.defined());
   TVM_FFI_ICHECK(body.defined());
-  TVM_FFI_ICHECK_EQ(value.dtype(), var.dtype());
+  TVM_FFI_ICHECK(value.ty() == var.ty());
 
   ffi::ObjectPtr<LetNode> node = ffi::make_object<LetNode>();
-  node->dtype = body.dtype();
+  node->BaseExprNode::ty = body.ty();
   node->var = std::move(var);
   node->value = std::move(value);
   node->body = std::move(body);
@@ -628,7 +659,7 @@ static ffi::Array<PrimExpr> ConvertCallArgs(ffi::Array<CallArg> args) {
         if (is_one(r->extent)) {
           indices.push_back(r->min);
         } else if (r->extent.as<IntImmNode>()) {
-          indices.push_back(tirx::Ramp(r->min, MakeConst(r->min->dtype, 1), r->extent));
+          indices.push_back(tirx::Ramp(r->min, MakeConst(r->min.ty(), 1), r->extent));
         } else {
           TVM_FFI_THROW(ValueError)
               << "Cannot convert to BufferLoad: " << ffi::GetRef<BufferRegion>(br);
@@ -642,13 +673,13 @@ static ffi::Array<PrimExpr> ConvertCallArgs(ffi::Array<CallArg> args) {
   return prim_expr_args;
 }
 
-Call::Call(DataType dtype, RelaxExpr op, ffi::Array<PrimExpr> args, Attrs attrs, Span span) {
+Call::Call(PrimType ret_ty, RelaxExpr op, ffi::Array<PrimExpr> args, Attrs attrs, Span span) {
   for (size_t i = 0; i < args.size(); ++i) {
     TVM_FFI_ICHECK(args[i].defined()) << "arg " << i << " is not defined()";
   }
 
   ffi::ObjectPtr<CallNode> node = ffi::make_object<CallNode>();
-  node->dtype = dtype;
+  node->BaseExprNode::ty = std::move(ret_ty);
   node->op = std::move(op);
   node->args = std::move(args);
   node->attrs = std::move(attrs);
@@ -656,21 +687,21 @@ Call::Call(DataType dtype, RelaxExpr op, ffi::Array<PrimExpr> args, Attrs attrs,
   data_ = std::move(node);
 }
 
-Call::Call(DataType dtype, RelaxExpr op, ffi::Array<PrimExpr> args, Span span)
-    : Call(dtype, std::move(op), std::move(args), Attrs(), std::move(span)) {}
+Call::Call(PrimType ret_ty, RelaxExpr op, ffi::Array<PrimExpr> args, Span span)
+    : Call(std::move(ret_ty), std::move(op), std::move(args), Attrs(), std::move(span)) {}
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef()
       .def("tirx.Call",
-           [](ffi::Optional<DataType> dtype, RelaxExpr op, ffi::Array<CallArg> args, Span span) {
-             return Call(dtype.value_or(DataType::Void()), op, ConvertCallArgs(args), Attrs(),
+           [](ffi::Optional<PrimType> dtype, RelaxExpr op, ffi::Array<CallArg> args, Span span) {
+             return Call(dtype.value_or(PrimType::Void()), op, ConvertCallArgs(args), Attrs(),
                          span);
            })
       .def("tirx.CallWithAttrs",
-           [](ffi::Optional<DataType> dtype, RelaxExpr op, ffi::Array<CallArg> args,
+           [](ffi::Optional<PrimType> dtype, RelaxExpr op, ffi::Array<CallArg> args,
               ffi::Optional<Attrs> attrs, Span span) {
-             return Call(dtype.value_or(DataType::Void()), op, ConvertCallArgs(args),
+             return Call(dtype.value_or(PrimType::Void()), op, ConvertCallArgs(args),
                          attrs.value_or(Attrs()), span);
            });
 }
@@ -680,17 +711,18 @@ Shuffle::Shuffle(ffi::Array<PrimExpr> vectors, ffi::Array<PrimExpr> indices, Spa
   TVM_FFI_ICHECK_NE(vectors.size(), 0U);
   TVM_FFI_ICHECK_NE(indices.size(), 0U);
 
-  DataType base_type = vectors[0].dtype().element_of();
+  PrimType base_type = vectors[0].ty().WithLanes(1);
   int total_lanes = 0;
 
   for (PrimExpr val : vectors) {
-    TVM_FFI_ICHECK(val.dtype().element_of() == base_type);
-    total_lanes += val.dtype().lanes();
+    PrimType val_ty = val.ty();
+    TVM_FFI_ICHECK(val_ty.WithLanes(1)->dtype == base_type->dtype);
+    total_lanes += val_ty.lanes();
   }
   TVM_FFI_ICHECK_LE(indices.size(), static_cast<size_t>(total_lanes));
 
   ffi::ObjectPtr<ShuffleNode> node = ffi::make_object<ShuffleNode>();
-  node->dtype = base_type.with_lanes(static_cast<int>(indices.size()));
+  node->BaseExprNode::ty = base_type.WithLanes(static_cast<int>(indices.size()));
   node->vectors = std::move(vectors);
   node->indices = std::move(indices);
   node->span = std::move(span);
@@ -705,7 +737,7 @@ PrimExpr Shuffle::Concat(ffi::Array<PrimExpr> vectors, Span span) {
   ffi::Array<PrimExpr> indices;
   int index = 0;
   for (const PrimExpr& e : vectors) {
-    for (int i = 0; i < e.dtype().lanes(); ++i) {
+    for (int i = 0; i < e.ty().lanes(); ++i) {
       indices.push_back(IntImm::Int32(index++));
     }
   }
@@ -743,7 +775,7 @@ CommReducer::CommReducer(ffi::Array<Var> lhs, ffi::Array<Var> rhs, ffi::Array<Pr
   std::unordered_map<const VarNode*, PrimExpr> var_map;
   var_map.reserve(n_group * 2);
   for (int i = 0; i < static_cast<int>(n_group); ++i) {
-    DataType dtype = identity_element[i].dtype();
+    PrimType dtype = identity_element[i].ty();
     Var l = lhs[i].copy_with_dtype(dtype);
     Var r = rhs[i].copy_with_dtype(dtype);
     var_map[lhs[i].get()] = l;
@@ -815,7 +847,7 @@ Reduce::Reduce(CommReducer combiner, ffi::Array<PrimExpr> source, ffi::Array<Ite
           << "but received " << init[i] << " of type " << init[i]->GetTypeKey();
     }
   }
-  n->dtype = source[value_index].dtype();
+  n->BaseExprNode::ty = source[value_index].ty();
   n->combiner = std::move(combiner);
   n->source = std::move(source);
   n->init = std::move(init);
@@ -838,28 +870,30 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // BufferLoad
 void BufferLoadNode::LegalizeDType() {
   for (int i = 0; i < static_cast<int>(indices.size()) - 1; i++) {
-    TVM_FFI_ICHECK(indices[i].dtype().is_scalar())
+    TVM_FFI_ICHECK(indices[i].ty().IsScalar())
         << "Only the last index of a buffer access may be a vector type.";
   }
 
   if (indices.empty()) {
-    this->dtype = buffer->dtype;
+    this->BaseExprNode::ty = buffer->dtype;  // no indices: take the buffer's type as-is
   } else {
-    auto index_dtype = indices.back().dtype();
-    bool is_buffer_dtype_scalable = buffer->dtype.is_scalable_vector();
-    bool is_index_scalable = index_dtype.is_scalable_vector();
+    PrimType index_ty = indices.back().ty();  // only the last index may be a vector
+    int16_t buffer_encoded_lanes = static_cast<int16_t>(buffer->dtype->dtype.lanes);
+    bool is_buffer_dtype_scalable = buffer_encoded_lanes < -1;  // lanes == -vscale_factor
+    bool is_index_scalable = index_ty.IsScalableVector();
 
     TVM_FFI_ICHECK(!(is_index_scalable && is_buffer_dtype_scalable))
         << "Index dtype and buffer dtype can't both be scalable.";
 
     if (is_index_scalable) {
-      this->dtype = buffer->dtype.with_scalable_vscale_factor(index_dtype.vscale_factor() *
-                                                              buffer->dtype.lanes());
+      this->BaseExprNode::ty =
+          PrimType::ScalableVector(buffer->dtype.code(), buffer->dtype.bits(),
+                                   index_ty.VScaleFactor() * buffer->dtype.lanes());
     } else if (is_buffer_dtype_scalable) {
-      this->dtype = buffer->dtype.with_scalable_vscale_factor(buffer->dtype.vscale_factor() *
-                                                              index_dtype.lanes());
+      this->BaseExprNode::ty = PrimType::ScalableVector(buffer->dtype.code(), buffer->dtype.bits(),
+                                                        -buffer_encoded_lanes * index_ty.lanes());
     } else {
-      this->dtype = buffer->dtype.with_lanes(index_dtype.lanes() * buffer->dtype.lanes());
+      this->BaseExprNode::ty = buffer->dtype.WithLanes(index_ty.lanes() * buffer->dtype.lanes());
     }
   }
 }
@@ -872,25 +906,24 @@ BufferLoad::BufferLoad(Buffer buffer, ffi::Array<PrimExpr> indices,
       << "-dimensional indices provided.";
 
   if (predicate.defined()) {
-    DataType predicate_dtype = predicate.value().dtype();
-
-    bool is_index_scalable = indices.empty() ? false : indices.back().dtype().is_scalable_vector();
-    bool is_predicate_scalable = predicate_dtype.is_scalable_vector();
+    PrimType predicate_ty = predicate.value().ty();
+    bool is_index_scalable = indices.empty() ? false : indices.back().ty().IsScalableVector();
+    bool is_predicate_scalable = predicate_ty.IsScalableVector();
     TVM_FFI_ICHECK_EQ(is_index_scalable, is_predicate_scalable)
         << "Predicate mask dtype and load indices must both be scalable.";
 
-    int buffer_lanes = buffer->dtype.get_lanes_or_vscale_factor();
-    int index_lanes = indices.empty() ? 1 : indices.back().dtype().get_lanes_or_vscale_factor();
-    int predicate_lanes = predicate_dtype.get_lanes_or_vscale_factor();
+    int16_t buffer_encoded_lanes = static_cast<int16_t>(buffer->dtype->dtype.lanes);
+    int buffer_lanes = buffer_encoded_lanes < -1 ? -buffer_encoded_lanes : buffer_encoded_lanes;
+    int index_lanes = indices.empty() ? 1 : GetLanesOrVScaleFactor(indices.back().ty());
+    int predicate_lanes = GetLanesOrVScaleFactor(predicate_ty);
     TVM_FFI_ICHECK_EQ(index_lanes * buffer_lanes, predicate_lanes)
         << "Got a predicate mask with " << predicate_lanes
         << " lanes, but trying to load a vector with " << index_lanes
         << " lanes. The number of lanes must match.";
 
-    DataType predicate_element_dtype = predicate_dtype.element_of();
-    TVM_FFI_ICHECK(predicate_element_dtype.is_predicate_dtype())
-        << "Predicate mask elements must be boolean values, but got " << predicate_element_dtype
-        << ".";
+    TVM_FFI_ICHECK(predicate_ty.MatchesCode(DLDataTypeCode::kDLBool) ||
+                   predicate_ty.MatchesElementType(DLDataTypeCode::kDLUInt, 1))
+        << "Predicate mask elements must be boolean values, but got " << predicate_ty->dtype << ".";
   }
 
   ffi::ObjectPtr<BufferLoadNode> node = ffi::make_object<BufferLoadNode>();
@@ -913,7 +946,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // ProducerLoad
 ProducerLoad::ProducerLoad(DataProducer producer, ffi::Array<PrimExpr> indices, Span span) {
   ffi::ObjectPtr<ProducerLoadNode> node = ffi::make_object<ProducerLoadNode>();
-  node->dtype = producer->GetDataType();
+  node->BaseExprNode::ty = producer->GetDataType();
   node->producer = std::move(producer);
   node->indices = std::move(indices);
   node->span = std::move(span);
diff --git a/src/tirx/ir/expr_functor.cc b/src/tirx/ir/expr_functor.cc
index aba96aae8c3a..056ed9419bc8 100644
--- a/src/tirx/ir/expr_functor.cc
+++ b/src/tirx/ir/expr_functor.cc
@@ -155,7 +155,7 @@ PrimExpr ExprMutator::VisitExpr_(const CallNode* op) {
   if (args.same_as(op->args)) {
     return ffi::GetRef<PrimExpr>(op);
   } else {
-    return Call(op->dtype, op->op, args, op->attrs, op->span);
+    return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op, args, op->attrs, op->span);
   }
 }
 
@@ -227,7 +227,7 @@ PrimExpr ExprMutator::VisitExpr_(const CastNode* op) {
   if (value.same_as(op->value)) {
     return ffi::GetRef<PrimExpr>(op);
   } else {
-    return Cast(op->dtype, value);
+    return Cast(ffi::GetRef<PrimExpr>(op).ty(), value);
   }
 }
 
diff --git a/src/tirx/ir/function.cc b/src/tirx/ir/function.cc
index d6b171481ea7..7fc1439fb1ea 100644
--- a/src/tirx/ir/function.cc
+++ b/src/tirx/ir/function.cc
@@ -45,23 +45,25 @@ tvm::Type InferType(const PrimFunc& prim_func) {
       if (auto opt_buf = prim_func->buffer_map.Get(param)) {
         auto buf = opt_buf.value();
         relax::ShapeExpr shape(
-            buf->shape.Map([](PrimExpr dim) { return cast(DataType::Int(64), dim); }));
+            buf->shape.Map([](PrimExpr dim) { return cast(PrimType::Int(64), dim); }));
         return relax::TensorType(shape, buf->dtype);
       }
 
-      if (auto prim_type = param->type_annotation.as<PrimTypeNode>();
-          prim_type && prim_type->dtype.is_handle()) {
-        return relax::ObjectType();
+      if (auto prim_type = param->type_annotation.as<PrimTypeNode>()) {
+        const DLDataType& dtype = prim_type->dtype;
+        if (dtype.code == kDLOpaqueHandle && (dtype.bits != 0 || dtype.lanes != 0)) {
+          return relax::ObjectType();
+        }
       }
 
-      return PrimType(param->dtype);
+      return param.ty();
     }();
     params.push_back(param_ty);
   }
 
   tvm::Type ret = [&]() -> tvm::Type {
     if (const auto* prim = prim_func->ret_type.as<PrimTypeNode>()) {
-      return PrimType(prim->dtype);
+      return tvm::PrimType(prim->dtype);
     } else if (IsVoidType(prim_func->ret_type)) {
       return relax::TupleType(ffi::Array<tvm::Type>{});
     } else {
@@ -119,10 +121,10 @@ TensorIntrin::TensorIntrin(PrimFunc desc, PrimFunc impl) {
       << "The number of parameters of the description and the implementation of the "
          "tensor intrinsic doesn't match.";
   for (size_t i = 0; i < desc->params.size(); i++) {
-    TVM_FFI_CHECK(desc->params[i]->dtype.is_handle(), ValueError)
+    TVM_FFI_CHECK(desc->params[i].ty().IsHandle(), ValueError)
         << "Parameters of the description of the "
            "tensor intrinsic should be handle only.";
-    TVM_FFI_CHECK(impl->params[i]->dtype.is_handle(), ValueError)
+    TVM_FFI_CHECK(impl->params[i].ty().IsHandle(), ValueError)
         << "Parameters of the implementation of "
            "the tensor intrinsic should be handle only.";
   }
diff --git a/src/tirx/ir/index_map.cc b/src/tirx/ir/index_map.cc
index 4e9e7ecea8b6..382c75348941 100644
--- a/src/tirx/ir/index_map.cc
+++ b/src/tirx/ir/index_map.cc
@@ -53,7 +53,7 @@ IndexMap IndexMap::FromFunc(int ndim,
   ffi::Array<Var> initial_indices;
   initial_indices.reserve(ndim);
   for (int i = 0; i < ndim; ++i) {
-    initial_indices.push_back(Var("i" + std::to_string(i), DataType::Int(32)));
+    initial_indices.push_back(Var("i" + std::to_string(i), PrimType::Int(32)));
   }
   return IndexMap(initial_indices, func(initial_indices), std::move(inverse_index_map));
 }
@@ -83,7 +83,7 @@ std::pair<IndexMap, PrimExpr> IndexMapInverseImpl(const IndexMap& self,
     // should be named (X.outer,X.inner).
     std::stringstream ss;
     ss << "axis" << i;
-    Var var_index(ss.str(), index.dtype());
+    Var var_index(ss.str(), index.ty());
     output_vars.push_back(var_index);
   }
 
@@ -249,12 +249,13 @@ ffi::Array<Range> IndexMapNode::MapRanges(const ffi::Array<Range>& ranges,
   auto output_dtype = [&]() {
     int max_bits = ranges.empty() ? 32 : 0;
     for (const auto& range : ranges) {
-      max_bits = std::max(max_bits, range->extent.dtype().bits());
+      max_bits = std::max(max_bits, range->extent.ty().bits());
     }
-    return DataType::Int(max_bits);
+    return PrimType::Int(max_bits);
   }();
   output.MutateByApply([&](const Range& range) {
-    if (range->min.dtype() != output_dtype || range->extent.dtype() != output_dtype) {
+    if (range->min.ty()->dtype != output_dtype->dtype ||
+        range->extent.ty()->dtype != output_dtype->dtype) {
       return Range::FromMinExtent(cast(output_dtype, range->min),
                                   cast(output_dtype, range->extent));
     } else {
@@ -275,7 +276,7 @@ ffi::Array<PrimExpr> IndexMapNode::MapShape(const ffi::Array<PrimExpr>& shape,
 
   ffi::Array<Range> ranges;
   for (auto& dim : shape) {
-    ranges.push_back(Range(IntImm(dim.dtype(), 0), dim));
+    ranges.push_back(Range(IntImm(dim.ty(), 0), dim));
   }
   ffi::Array<Range> mapped = MapRanges(std::move(ranges), analyzer);
 
@@ -366,7 +367,7 @@ IndexMap IndexMap::RenameVariables(
           ffi::String name = opt_name.value();
           TVM_FFI_ICHECK(!name_supply->ContainsName(name, /*add_prefix=*/false));
           name_supply->ReserveName(name, /*add_prefix=*/false);
-          var_remap.Set(var, Var(name, var->dtype));
+          var_remap.Set(var, Var(name, var.ty()));
         }
       });
     });
diff --git a/src/tirx/ir/layout/axis_registry.cc b/src/tirx/ir/layout/axis_registry.cc
index 2afd290037c8..633296cee629 100644
--- a/src/tirx/ir/layout/axis_registry.cc
+++ b/src/tirx/ir/layout/axis_registry.cc
@@ -169,7 +169,7 @@ ffi::Array<Iter> SplitterGen(const Iter& iter, const Axis& axis_outer, const Axi
              analyzer->CanProveEqual(floormod(iter->extent * iter->stride, e_inner), 0)) {
     const auto& d = analyzer->Simplify(floordiv(e_inner, iter->stride));
     const auto& c = analyzer->Simplify(floordiv(iter->extent, d));
-    return {Iter(c, IntImm(e_inner.dtype(), 1), axis_outer), Iter(d, iter->stride, axis_inner)};
+    return {Iter(c, IntImm(e_inner.ty(), 1), axis_outer), Iter(d, iter->stride, axis_inner)};
   } else if (analyzer->CanProveEqual(floormod(iter->stride, e_inner), 0)) {
     const auto& d = analyzer->Simplify(floordiv(iter->stride, e_inner));
     return {Iter(iter->extent, d, axis_outer)};
diff --git a/src/tirx/ir/layout/tile_slice.cc b/src/tirx/ir/layout/tile_slice.cc
index 8b4181d2bfa8..b172f7fec0ff 100644
--- a/src/tirx/ir/layout/tile_slice.cc
+++ b/src/tirx/ir/layout/tile_slice.cc
@@ -118,7 +118,7 @@ ffi::Optional<TileLayout> SlicePerGroup(TileLayout layout, PrimExpr begin, PrimE
     return TileLayout(new_shard, layout->replica, new_offset);
   }
 
-  PrimExpr two = MakeConst(rem.dtype(), 2);
+  PrimExpr two = MakeConst(rem.ty(), 2);
   PrimExpr c = analyzer->Simplify(floordiv(rem, two));
   bool even = analyzer->CanProveEqual(floormod(rem, two), 0);
   bool mid = analyzer->CanProveEqual(analyzer->Simplify(d0[pivot] + c), Ek);
@@ -131,7 +131,7 @@ ffi::Optional<TileLayout> SlicePerGroup(TileLayout layout, PrimExpr begin, PrimE
       PrimExpr delta =
           analyzer->Simplify((pivot > 0 ? shard[pivot - 1]->stride : PrimExpr(0)) - (Ek - c) * Sk);
       std::vector<Iter> new_shard;
-      new_shard.push_back(Iter(MakeConst(c.dtype(), 2), delta, ak));
+      new_shard.push_back(Iter(MakeConst(c.ty(), 2), delta, ak));
       new_shard.push_back(Iter(c, Sk, ak));
       new_shard.insert(new_shard.end(), peeled_rev.rbegin(), peeled_rev.rend());
       return TileLayout(new_shard, layout->replica, new_offset);
diff --git a/src/tirx/ir/layout/utils.cc b/src/tirx/ir/layout/utils.cc
index 477f512a4e42..05828a66001c 100644
--- a/src/tirx/ir/layout/utils.cc
+++ b/src/tirx/ir/layout/utils.cc
@@ -73,7 +73,7 @@ std::vector<PrimExpr> GetDefaultStrides(const ffi::Array<PrimExpr>& data, PrimEx
   // get int32 strides and structurally differ from parser output.
   PrimExpr current_stride = initial_stride;
   if (const auto* imm = current_stride.as<IntImmNode>()) {
-    current_stride = MakeConst(data[0].dtype(), imm->value);
+    current_stride = MakeConst(data[0].ty(), imm->value);
   }
   for (int i = static_cast<int>(n) - 1; i >= 0; --i) {
     strides[i] = current_stride;
diff --git a/src/tirx/ir/script/script_complete.cc b/src/tirx/ir/script/script_complete.cc
index c432731ebad5..bb915e96acf8 100644
--- a/src/tirx/ir/script/script_complete.cc
+++ b/src/tirx/ir/script/script_complete.cc
@@ -45,8 +45,9 @@ class ScriptCompleter : public StmtMutator {
   ffi::Map<Var, Buffer>* buffer_var_map_;
   Stmt VisitStmt_(const SBlockRealizeNode* op) final {
     for (const PrimExpr& value : op->iter_values) {
-      TVM_FFI_ICHECK(value.dtype().is_int())
-          << "BlockRealize iter_value expected a IntImm, but got " << value.dtype();
+      PrimType value_ty = value.ty();  // the check is on the type code, not on the node kind
+      TVM_FFI_ICHECK(value_ty.code() == DLDataTypeCode::kDLInt)
+          << "BlockRealize iter_value expected a IntImm, but got " << value_ty->dtype;
     }
     return StmtMutator::VisitStmt_(op);
   }
diff --git a/src/tirx/ir/stmt.cc b/src/tirx/ir/stmt.cc
index 66f48355c5ce..d4908df436c3 100644
--- a/src/tirx/ir/stmt.cc
+++ b/src/tirx/ir/stmt.cc
@@ -33,6 +33,14 @@
 namespace tvm {
 namespace tirx {
 
+namespace {
+// Lane count of `ty`, or its vscale factor when `ty` is a scalable vector.
+int GetLanesOrVScaleFactor(const PrimType& ty) {
+  return ty.IsScalableVector() ? ty.VScaleFactor() : ty.lanes();
+}
+
+}  // namespace
+
 TVM_FFI_STATIC_INIT_BLOCK() {
   StmtNode::RegisterReflection();
   BindNode::RegisterReflection();
@@ -59,12 +67,12 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // Bind
 Bind::Bind(Var var, PrimExpr value, Span span) {
   TVM_FFI_ICHECK(value.defined());
-  auto vdtype = value.dtype();
+  PrimType value_ty = value.ty();
   // It is still valid to bind a pointer type var to a value that is of type handle.
   if (var->type_annotation.as<PointerTypeNode>()) {
-    TVM_FFI_ICHECK(vdtype.is_handle());
+    TVM_FFI_ICHECK(value_ty.IsHandle());
   } else {
-    TVM_FFI_ICHECK_EQ(value.dtype(), var.dtype());
+    TVM_FFI_ICHECK(value.ty() == var.ty());
   }
 
   ffi::ObjectPtr<BindNode> node = ffi::make_object<BindNode>();
@@ -108,9 +116,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 AssertStmt::AssertStmt(PrimExpr condition, StringImm error_kind,
                        ffi::Array<StringImm> message_parts, Span span) {
   TVM_FFI_ICHECK(condition.defined());
-  TVM_FFI_ICHECK(condition.dtype().is_predicate_dtype())
+  PrimType condition_ty = condition.ty();
+  TVM_FFI_ICHECK(condition_ty.MatchesCode(DLDataTypeCode::kDLBool))
       << "AssertStmt should have boolean condition, "
-      << "but received " << condition << " with dtype " << condition.dtype();
+      << "but received " << condition << " with dtype " << condition_ty;
   TVM_FFI_ICHECK(error_kind.defined());
 
   ffi::ObjectPtr<AssertStmtNode> node = ffi::make_object<AssertStmtNode>();
@@ -139,8 +148,9 @@ For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForKind kind, Stmt body,
   TVM_FFI_ICHECK(body.defined());
 
   auto require_scalar_int_dtype = [&](PrimExpr expr, const char* field_name) {
-    auto dtype = expr.dtype();
-    TVM_FFI_ICHECK(dtype.is_scalar() && (dtype.is_int() || dtype.is_uint()))
+    PrimType dtype = expr.ty();
+    TVM_FFI_ICHECK(dtype.IsScalar() &&
+                   (dtype.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)))
         << "TIR For nodes require a scalar integer as the " << field_name << ", but received "
         << expr << " with dtype " << dtype;
   };
@@ -151,12 +161,14 @@ For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForKind kind, Stmt body,
   // When extent, min or step is an IntImm but has narrower dtype than loop_var
   // we directly promote them without raising errors.
   auto try_promote_imm_dtype = [&](const PrimExpr& e) {
-    TVM_FFI_ICHECK(e.dtype().bits() <= loop_var.dtype().bits())
-        << " Loop variable's dtype (" << loop_var.dtype()
-        << ") is narrower than that of `min` or `extent` (" << e.dtype() << ")";
+    PrimType e_ty = e.ty();
+    PrimType loop_var_ty = loop_var.ty();
+    TVM_FFI_ICHECK(e_ty.bits() <= loop_var_ty.bits())
+        << " Loop variable's dtype (" << loop_var_ty
+        << ") is narrower than that of `min` or `extent` (" << e_ty << ")";
     const IntImmNode* a = e.as<IntImmNode>();
-    if (a && e.dtype().bits() < loop_var.dtype().bits()) {
-      return MakeConst(loop_var.dtype(), a->value);
+    if (a && e_ty.bits() < loop_var_ty.bits()) {
+      return MakeConst(loop_var_ty, a->value);
     } else {
       return e;
     }
@@ -165,15 +177,14 @@ For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForKind kind, Stmt body,
   min = try_promote_imm_dtype(min);
   extent = try_promote_imm_dtype(extent);
 
-  TVM_FFI_ICHECK(loop_var.dtype() == min.dtype()) << loop_var.dtype() << " vs " << min.dtype();
-  TVM_FFI_ICHECK(loop_var.dtype() == extent.dtype())
-      << loop_var.dtype() << " vs " << extent.dtype();
+  TVM_FFI_ICHECK(loop_var.ty() == min.ty()) << loop_var.ty() << " vs " << min.ty();
+  TVM_FFI_ICHECK(loop_var.ty() == extent.ty()) << loop_var.ty() << " vs " << extent.ty();
 
   if (step.has_value()) {
     require_scalar_int_dtype(*step, "step");
     step = try_promote_imm_dtype(*step);
-    TVM_FFI_ICHECK(loop_var.dtype() == (*step).dtype())
-        << loop_var.dtype() << " vs " << (*step).dtype();
+    TVM_FFI_ICHECK(loop_var.ty() == step.value().ty())
+        << loop_var.ty() << " vs " << step.value().ty();
   }
 
   ffi::ObjectPtr<ForNode> node = ffi::make_object<ForNode>();
@@ -226,7 +237,7 @@ std::ostream& operator<<(std::ostream& out, ForKind type) {  // NOLINT(*)
 // While
 While::While(PrimExpr condition, Stmt body, Span span) {
   TVM_FFI_ICHECK(condition.defined());
-  TVM_FFI_ICHECK(condition.dtype().is_scalar());
+  TVM_FFI_ICHECK(condition.ty().IsScalar());
   TVM_FFI_ICHECK(body.defined());
 
   ffi::ObjectPtr<WhileNode> node = ffi::make_object<WhileNode>();
@@ -393,19 +404,21 @@ BufferStore::BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> ind
       << "-dimensional indices provided.";
 
   for (int i = 0; i < static_cast<int>(indices.size()) - 1; i++) {
-    TVM_FFI_ICHECK(indices[i].dtype().is_scalar())
+    TVM_FFI_ICHECK(indices[i].ty().IsScalar())
         << "Only the last index of a buffer access may be a vector type.";
   }
 
-  bool is_index_scalable = indices.empty() ? false : indices.back().dtype().is_scalable_vector();
-  bool is_buffer_dtype_scalable = buffer->dtype.is_scalable_vector();
-  bool is_value_dtype_scalable = value.dtype().is_scalable_vector();
+  bool is_index_scalable = indices.empty() ? false : indices.back().ty().IsScalableVector();
+  int16_t buffer_encoded_lanes = static_cast<int16_t>(buffer->dtype->dtype.lanes);
+  bool is_buffer_dtype_scalable = buffer_encoded_lanes < -1;
+  PrimType value_ty = value.ty();
+  bool is_value_dtype_scalable = value_ty.IsScalableVector();
 
   TVM_FFI_ICHECK(!(is_index_scalable && is_buffer_dtype_scalable))
       << "Index dtype and buffer dtype can't both be scalable.";
 
   if (predicate.defined()) {
-    bool is_predicate_dtype_scalable = predicate.value().dtype().is_scalable_vector();
+    bool is_predicate_dtype_scalable = predicate.value().ty().IsScalableVector();
     TVM_FFI_ICHECK_EQ(is_value_dtype_scalable, is_predicate_dtype_scalable)
         << "Predicate mask dtype and value dtype must both be scalable.";
   }
@@ -414,9 +427,9 @@ BufferStore::BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> ind
     TVM_FFI_ICHECK(is_value_dtype_scalable) << "Can't store non-scalable data into scalable buffer";
   }
 
-  int index_lanes = indices.empty() ? 1 : indices.back().dtype().get_lanes_or_vscale_factor();
-  int buffer_lanes = buffer->dtype.get_lanes_or_vscale_factor();
-  int value_dtype_lanes = value.dtype().get_lanes_or_vscale_factor();
+  int index_lanes = indices.empty() ? 1 : GetLanesOrVScaleFactor(indices.back().ty());
+  int buffer_lanes = is_buffer_dtype_scalable ? -buffer_encoded_lanes : buffer_encoded_lanes;
+  int value_dtype_lanes = GetLanesOrVScaleFactor(value_ty);
 
   TVM_FFI_ICHECK_EQ(index_lanes * buffer_lanes, value_dtype_lanes)
       << "Cannot store value with " << value_dtype_lanes << ", expected value with "
@@ -424,31 +437,33 @@ BufferStore::BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> ind
       << " buffer element lanes)";
 
   if (predicate.defined()) {
-    DataType predicate_dtype = predicate.value().dtype();
-    int predicate_dtype_lanes = predicate_dtype.get_lanes_or_vscale_factor();
+    PrimType predicate_ty = predicate.value().ty();
+    int predicate_dtype_lanes = GetLanesOrVScaleFactor(predicate_ty);
     TVM_FFI_ICHECK_EQ(value_dtype_lanes, predicate_dtype_lanes)
         << "Got a predicate mask with " << predicate_dtype_lanes
         << " lanes, but trying to store a value with " << value_dtype_lanes
         << " lanes. The number of lanes must match.";
 
-    DataType predicate_element_dtype = predicate_dtype.element_of();
-    TVM_FFI_ICHECK(predicate_element_dtype.is_predicate_dtype())
-        << "Predicate mask elements must be boolean values, but got " << predicate_element_dtype
+    PrimType predicate_element_ty = predicate_ty.WithLanes(1);
+    TVM_FFI_ICHECK(predicate_element_ty.MatchesCode(DLDataTypeCode::kDLBool) ||
+                   predicate_element_ty.MatchesElementType(DLDataTypeCode::kDLUInt, 1))
+        << "Predicate mask elements must be boolean values, but got " << predicate_element_ty
         << ".";
   }
 
-  runtime::DataType buffer_dtype;
+  PrimType buffer_dtype = PrimType::Void();
   if (is_index_scalable || is_buffer_dtype_scalable) {
-    buffer_dtype = buffer->dtype.with_scalable_vscale_factor(buffer_lanes * index_lanes);
+    buffer_dtype = PrimType::ScalableVector(buffer->dtype.code(), buffer->dtype.bits(),
+                                            buffer_lanes * index_lanes);
   } else {
-    buffer_dtype = buffer->dtype.with_lanes(buffer_lanes * index_lanes);
+    buffer_dtype = buffer->dtype.WithLanes(buffer_lanes * index_lanes);
   }
-  if (buffer_dtype != value.dtype()) {
+  if (buffer_dtype != value_ty) {
     TVM_FFI_THROW(TypeError) << "dtype mismatch on BufferStore: "                 //
                              << "buffer's dtype is `" << buffer->dtype            //
                              << "`, the lanes of indexing are: `" << index_lanes  //
-                             << "`, the scalability is: `" << buffer_dtype.is_scalable_vector()
-                             << "`, but RHS's dtype is `" << value.dtype() << "`";
+                             << "`, the scalability is: `" << buffer_dtype.IsScalableVector()
+                             << "`, but RHS's dtype is `" << value_ty << "`";
   }
 
   ffi::ObjectPtr<BufferStoreNode> node = ffi::make_object<BufferStoreNode>();
@@ -478,7 +493,7 @@ PrimExpr BufferRegionNode::ToPrimExpr() const {
     if (tvm::tirx::is_one(r->extent)) {
       indices.push_back(r->min);
     } else if (r->extent.as<IntImmNode>()) {
-      indices.push_back(tirx::Ramp(r->min, tvm::tirx::MakeConst(r->min->dtype, 1), r->extent));
+      indices.push_back(tirx::Ramp(r->min, tvm::tirx::MakeConst(r->min.ty(), 1), r->extent));
     } else {
       TVM_FFI_THROW(ValueError) << "Cannot convert to BufferLoad: "
                                 << ffi::GetRef<BufferRegion>(this);
@@ -512,7 +527,7 @@ BufferRegion BufferRegion::FromPoint(Buffer buffer, ffi::Array<PrimExpr> indices
       region.push_back(
           Range::FromMinExtent(ramp_index->base, ramp_index->stride * ramp_index->lanes));
     } else {
-      region.push_back(Range::FromMinExtent(index, MakeConst(index.dtype(), 1)));
+      region.push_back(Range::FromMinExtent(index, MakeConst(index.ty(), 1)));
     }
   }
   return BufferRegion(buffer, region);
@@ -652,7 +667,8 @@ SBlockRealize::SBlockRealize(ffi::Array<PrimExpr> values, PrimExpr predicate, SB
                              Span span) {
   TVM_FFI_CHECK_EQ(block->iter_vars.size(), values.size(), ValueError)
       << "BlockRealize needs to have the same number of iter_vars and binding values";
-  TVM_FFI_CHECK(predicate.dtype().is_bool() || predicate.dtype() == DataType::UInt(1), TypeError)
+  PrimType predicate_ty = predicate.ty();
+  TVM_FFI_CHECK(predicate_ty.MatchesCode(DLDataTypeCode::kDLBool), TypeError)
       << "Expect Block.predicate to be a bool expression";
   ffi::ObjectPtr<SBlockRealizeNode> node = ffi::make_object<SBlockRealizeNode>();
   node->iter_values = std::move(values);
@@ -670,7 +686,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   });
 }
 
-PrimExpr TypeAnnotation(DataType dtype, Span span) {
+PrimExpr TypeAnnotation(PrimType dtype, Span span) {
   static const Op& type_annotation_op = Op::Get("tirx.type_annotation");
   return tirx::Call(dtype, type_annotation_op, {}, {}, span);
 }
diff --git a/src/tirx/ir/stmt_functor.cc b/src/tirx/ir/stmt_functor.cc
index 83864c88af8a..1fd06190e321 100644
--- a/src/tirx/ir/stmt_functor.cc
+++ b/src/tirx/ir/stmt_functor.cc
@@ -775,11 +775,11 @@ class IRSubstitute : public StmtExprMutator {
     if (ret.defined()) {
       // Allow substitution of void variables with any expression. The TVM script parser
       // uses void variables for lambda parameters (since exact types are not known yet).
-      if (!var.dtype().is_void()) {
+      if (!var.ty().IsVoid()) {
         PrimExpr ret_ex = ret.value().as_or_throw<PrimExpr>();
-        TVM_FFI_ICHECK(ret_ex.dtype() == var.dtype())
-            << "substituting " << var << ":" << var.dtype() << " -> " << ret_ex << ":"
-            << ret_ex.dtype();
+        TVM_FFI_ICHECK(ret_ex.ty()->dtype == var.ty()->dtype)
+            << "substituting " << var << ":" << var.ty()->dtype << " -> " << ret_ex << ":"
+            << ret_ex.ty()->dtype;
       }
       return ret.value();
     }
diff --git a/src/tirx/op/op.cc b/src/tirx/op/op.cc
index 21f9f601f809..e67dec179a82 100644
--- a/src/tirx/op/op.cc
+++ b/src/tirx/op/op.cc
@@ -49,6 +49,38 @@ bool IsVScaleCall(const PrimExpr& expr) {
   }
   return false;
 }
+
+TVM_FFI_INLINE const PrimTypeNode* GetPrimTypeNode(const PrimExpr& expr) {
+  // Avoid PrimExpr::ty() ObjectRef materialization on binary operator hot paths.
+  const auto* node = expr.get();
+  TVM_FFI_DCHECK(node != nullptr);
+  TVM_FFI_DCHECK(node->BaseExprNode::ty.defined());
+  const auto* prim_ty = node->BaseExprNode::ty.as<PrimTypeNode>();
+  TVM_FFI_DCHECK(prim_ty != nullptr);
+  return prim_ty;
+}
+
+bool IsFloatType(const PrimType& ty) { return ty.code() == DLDataTypeCode::kDLFloat; }
+
+bool IsBFloat16Type(const PrimType& ty) {
+  return ty.code() == DLDataTypeCode::kDLBfloat && ty.bits() == 16;
+}
+
+bool IsFloat8Type(const PrimType& ty) {
+  DLDataTypeCode code = ty.code();
+  return code == DLDataTypeCode::kDLFloat8_e3m4 || code == DLDataTypeCode::kDLFloat8_e4m3 ||
+         code == DLDataTypeCode::kDLFloat8_e4m3b11fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e4m3fn || code == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e5m2 || code == DLDataTypeCode::kDLFloat8_e5m2fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e8m0fnu;
+}
+
+bool IsFloat6Type(const PrimType& ty) {
+  DLDataTypeCode code = ty.code();
+  return code == DLDataTypeCode::kDLFloat6_e2m3fn || code == DLDataTypeCode::kDLFloat6_e3m2fn;
+}
+
+bool IsFloat4Type(const PrimType& ty) { return ty.code() == DLDataTypeCode::kDLFloat4_e2m1fn; }
 }  // namespace
 
 // macro to register an unary op
@@ -61,16 +93,16 @@ bool IsVScaleCall(const PrimExpr& expr) {
   TVM_TIR_REGISTER_OP(OpName).set_num_inputs(2).set_attr<TCallEffectKind>( \
       "TCallEffectKind", static_cast<int64_t>(CallEffectKind::kPure))
 
-runtime::DataType GetRuntimeDataType(const Type& type) {
+DLDataType GetRuntimeDLDataType(const Type& type) {
   if (auto* n = type.as<PrimTypeNode>()) {
     return n->dtype;
   } else if (type.as<PointerTypeNode>()) {
-    return DataType::Handle();
+    return DLDataType{kDLOpaqueHandle, 64, 1};
   } else if (IsVoidType(type)) {
-    return DataType::Void();
+    return DLDataType{kDLOpaqueHandle, 0, 0};
   } else {
     TVM_FFI_THROW(InternalError) << "Type " << type
-                                 << " does not have a corresponding runtime::DataType";
+                                 << " does not have a corresponding runtime DLPack dtype";
   }
 }
 
@@ -94,7 +126,7 @@ Type GetType(const PrimExpr& expr) {
       TVM_FFI_ICHECK(type_annotation->op.same_as(type_annotation_op))
           << "Expected the first argument of builtin tvm_access_ptr() "
           << "to be a type annotation, but found " << type_annotation->op;
-      return PointerType(PrimType(type_annotation->dtype));
+      return PointerType(type_annotation.ty());
     }
     if (access->op.same_as(builtin::ptr_byte_offset())) {
       TVM_FFI_ICHECK_EQ(access->args.size(), 3U);
@@ -102,7 +134,7 @@ Type GetType(const PrimExpr& expr) {
       TVM_FFI_ICHECK(type_annotation->op.same_as(type_annotation_op))
           << "Expected the third argument of builtin ptr_byte_offset() "
           << "to be a type annotation, but found " << type_annotation->op;
-      return PointerType(PrimType(type_annotation->dtype));
+      return PointerType(type_annotation.ty());
     }
   }
 
@@ -113,16 +145,16 @@ Type GetType(const PrimExpr& expr) {
           << address_of->args;
       auto* address = address_of->args[0].as<BufferLoadNode>();
       if (address) {
-        return PointerType(PrimType(address->dtype));
+        return PointerType(ffi::GetRef<PrimExpr>(address).ty());
       }
 
       if (auto* var = address_of->args[0].as<VarNode>()) {
         if (auto* ptr = var->type_annotation.as<PointerTypeNode>()) {
           if (ptr->element_type.as<TensorMapTypeNode>()) {
-            return PrimType(DataType::UInt(64));
+            return PrimType::UInt(64);
           }
         }
-        return PointerType(PrimType(var->dtype));
+        return PointerType(ffi::GetRef<PrimExpr>(var).ty());
       }
 
       TVM_FFI_ICHECK(false)
@@ -130,163 +162,149 @@ Type GetType(const PrimExpr& expr) {
           << "received argument " << address_of->args[0];
     }
   }
-  // Default: return the type indicated by the dtype.
-  runtime::DataType dtype = expr.dtype();
-  return GetTypeFromRuntimeDataType(dtype);
+  return expr.ty();
 }
 
-Type GetTypeFromRuntimeDataType(const DataType& dtype) {
-  if (dtype.is_void()) {
-    return VoidType();
-  }
-  return PrimType(dtype);
-}
+Type GetTypeFromRuntimeDataType(DLDataType dtype) { return PrimType(dtype); }
 
 // LargeUIntImm
-PrimExpr LargeUIntImm(DataType t, int64_t low, int64_t high, Span span) {
-  return tirx::Call(t, tirx::builtin::large_uint_imm(),
-                    {IntImm(DataType::UInt(32), low, span), IntImm(DataType::UInt(32), high, span)},
+PrimExpr LargeUIntImm(PrimType value_ty, int64_t low, int64_t high, Span span) {
+  return tirx::Call(value_ty, tirx::builtin::large_uint_imm(),
+                    {IntImm(PrimType::UInt(32), low, span), IntImm(PrimType::UInt(32), high, span)},
                     {}, span);
 }
 
 // Q-multiplication
 PrimExpr q_multiply_shift(PrimExpr x, PrimExpr y, PrimExpr q, PrimExpr s, Span span) {
-  return tirx::Call(DataType::Int(32, x.dtype().lanes()), tirx::builtin::q_multiply_shift(),
+  return tirx::Call(PrimType::Int(32, x.ty().lanes()), tirx::builtin::q_multiply_shift(),
                     {x, y, q, s}, {}, span);
 }
 
 void BroadcastToMatchLanes(PrimExpr& op_a, PrimExpr& op_b) {  // NOLINT(*)
-  DataType dtype_a = op_a.dtype();
-  DataType dtype_b = op_b.dtype();
-
-  if (!dtype_a.is_scalable_or_fixed_length_vector() &&
-      dtype_b.is_scalable_or_fixed_length_vector()) {
-    if (dtype_b.is_scalable_vector()) {
-      op_a = tirx::Broadcast(
-          op_a, tirx::Mul(dtype_b.vscale_factor(), Call(DataType::Int(32), builtin::vscale(), {})));
+  PrimType ty_a = op_a.ty();
+  PrimType ty_b = op_b.ty();
+
+  if (!ty_a.IsScalableVector() && !ty_a.IsFixedLengthVector() &&
+      (ty_b.IsScalableVector() || ty_b.IsFixedLengthVector())) {
+    if (ty_b.IsScalableVector()) {
+      PrimType i32_ty = PrimType::Int(32);
+      op_a = tirx::Broadcast(op_a,
+                             tirx::Mul(ty_b.VScaleFactor(), Call(i32_ty, builtin::vscale(), {})));
+    } else {
+      op_a = tirx::Broadcast(op_a, ty_b.lanes());
+    }
+  }
+}
+
+PrimType PromoteBinaryOpType(PrimType lhs_ty, PrimType rhs_ty) {
+  if (lhs_ty->dtype == rhs_ty->dtype) {
+    return lhs_ty;
+  }
+
+  // Keep conversion behavior consistent with the previous DataType-based path.
+  if (IsFloatType(lhs_ty) && IsFloatType(rhs_ty)) {
+    return lhs_ty.bits() < rhs_ty.bits() ? rhs_ty : lhs_ty;
+  } else if (!IsFloatType(lhs_ty) && IsFloatType(rhs_ty)) {
+    return rhs_ty;
+  } else if (IsFloatType(lhs_ty) && !IsFloatType(rhs_ty)) {
+    return lhs_ty;
+  } else if (!IsBFloat16Type(lhs_ty) && IsBFloat16Type(rhs_ty)) {
+    return rhs_ty;
+  } else if (IsBFloat16Type(lhs_ty) && !IsBFloat16Type(rhs_ty)) {
+    return lhs_ty;
+  } else if (!IsFloat8Type(lhs_ty) && IsFloat8Type(rhs_ty)) {
+    return rhs_ty;
+  } else if (IsFloat8Type(lhs_ty) && !IsFloat8Type(rhs_ty)) {
+    return lhs_ty;
+  } else if (!IsFloat6Type(lhs_ty) && IsFloat6Type(rhs_ty)) {
+    return rhs_ty;
+  } else if (IsFloat6Type(lhs_ty) && !IsFloat6Type(rhs_ty)) {
+    return lhs_ty;
+  } else if (!IsFloat4Type(lhs_ty) && IsFloat4Type(rhs_ty)) {
+    return rhs_ty;
+  } else if (IsFloat4Type(lhs_ty) && !IsFloat4Type(rhs_ty)) {
+    return lhs_ty;
+  } else if (lhs_ty.MatchesCode(DLDataTypeCode::kDLBool) &&
+             rhs_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
+    return rhs_ty;
+  } else if (lhs_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt) &&
+             rhs_ty.MatchesCode(DLDataTypeCode::kDLBool)) {
+    return lhs_ty;
+  } else if ((lhs_ty.MatchesCode(DLDataTypeCode::kDLInt) &&
+              rhs_ty.MatchesCode(DLDataTypeCode::kDLInt)) ||
+             (lhs_ty.MatchesCode(DLDataTypeCode::kDLUInt) &&
+              rhs_ty.MatchesCode(DLDataTypeCode::kDLUInt))) {
+    return lhs_ty.bits() < rhs_ty.bits() ? rhs_ty : lhs_ty;
+  } else if ((lhs_ty.MatchesCode(DLDataTypeCode::kDLInt) &&
+              rhs_ty.MatchesCode(DLDataTypeCode::kDLUInt)) ||
+             (lhs_ty.MatchesCode(DLDataTypeCode::kDLUInt) &&
+              rhs_ty.MatchesCode(DLDataTypeCode::kDLInt))) {
+    if (lhs_ty.bits() < rhs_ty.bits()) {
+      return rhs_ty;
+    } else if (lhs_ty.bits() > rhs_ty.bits()) {
+      return lhs_ty;
     } else {
-      op_a = tirx::Broadcast(op_a, dtype_b.lanes());
+      return lhs_ty.MatchesCode(DLDataTypeCode::kDLUInt) ? lhs_ty
+                                                         : lhs_ty.WithCode(DLDataTypeCode::kDLUInt);
     }
+  } else {
+    TVM_FFI_THROW(InternalError) << "Cannot match type " << lhs_ty->dtype << " vs "
+                                 << rhs_ty->dtype;
   }
+  return lhs_ty;
 }
 
 // The public function with a quick checking path.
 void BinaryOpMatchTypes(PrimExpr& lhs, PrimExpr& rhs, Span span) {  // NOLINT(*)
   TVM_FFI_CHECK(lhs.defined(), ValueError) << "`lhs` is null in the binary operator";
   TVM_FFI_CHECK(rhs.defined(), ValueError) << "`rhs` is null in the binary operator";
-  if (lhs.dtype() == rhs.dtype()) return;
+  const PrimTypeNode* lhs_ty_node = GetPrimTypeNode(lhs);
+  const PrimTypeNode* rhs_ty_node = GetPrimTypeNode(rhs);
+  if (lhs_ty_node == rhs_ty_node || lhs_ty_node->dtype == rhs_ty_node->dtype) return;
 
   BroadcastToMatchLanes(lhs, rhs);
   BroadcastToMatchLanes(rhs, lhs);
 
-  DataType ltype = lhs.dtype();
-  DataType rtype = rhs.dtype();
+  PrimType lhs_ty = lhs.ty();
+  PrimType rhs_ty = rhs.ty();
 
-  TVM_FFI_ICHECK(ltype.is_scalable_vector() == rtype.is_scalable_vector())
+  TVM_FFI_ICHECK(lhs_ty.IsScalableVector() == rhs_ty.IsScalableVector())
       << "Can't match scalable and fixed length vectors";
 
   bool lanes_match = false;
 
-  if (ltype.is_scalable_vector()) {
-    lanes_match = ltype.vscale_factor() == rtype.vscale_factor();
+  if (lhs_ty.IsScalableVector()) {
+    lanes_match = lhs_ty.VScaleFactor() == rhs_ty.VScaleFactor();
   } else {
-    lanes_match = ltype.lanes() == rtype.lanes();
+    lanes_match = lhs_ty.lanes() == rhs_ty.lanes();
   }
 
-  TVM_FFI_ICHECK(lanes_match) << "Cannot match type " << ltype << " vs " << rtype;
-  if (lhs.dtype() == rhs.dtype()) return;
-
-  ltype = lhs.dtype();
-  rtype = rhs.dtype();
-  // We keep dtypes conversion to be relatively consistent to reduce the amount code generated by
-  // operators. This can be helpful for users to find potential type conversion problems. The
-  // following are exceptions:
-  if (ltype.is_float() && rtype.is_float()) {
-    // Given two dissimilar floats, cast the lower bit version to the higher bit version.
-    // E.g. fp16 + fp32 --> fp32 + fp32
-    if (ltype.bits() < rtype.bits()) {
-      lhs = cast(rtype, lhs);
-    } else {
-      rhs = cast(ltype, rhs);
-    }
-  } else if (!ltype.is_float() && rtype.is_float()) {
-    // Cast int->float when the other operand is a float
-    lhs = cast(rtype, lhs);
-  } else if (ltype.is_float() && !rtype.is_float()) {
-    // Cast int->float when the other operand is a float
-    rhs = cast(ltype, rhs);
-  } else if (!ltype.is_bfloat16() && rtype.is_bfloat16()) {
-    // Cast int->bfloat16 when the other operand is a bfloat16
-    lhs = cast(rtype, lhs);
-  } else if (ltype.is_bfloat16() && !rtype.is_bfloat16()) {
-    // Cast int->bfloat16 when the other operand is a bfloat16
-    rhs = cast(ltype, rhs);
-  } else if (!ltype.is_float8() && rtype.is_float8()) {
-    // Cast int->float8 for lhs when rhs is a float8
-    lhs = cast(rtype, lhs);
-  } else if (ltype.is_float8() && !rtype.is_float8()) {
-    // Cast int->float8 for rhs when lhs is a float8
-    rhs = cast(ltype, rhs);
-  } else if (!ltype.is_float6() && rtype.is_float6()) {
-    // Cast int->float6 for lhs when rhs is a float6
-    lhs = cast(rtype, lhs);
-  } else if (ltype.is_float6() && !rtype.is_float6()) {
-    // Cast int->float6 for rhs when lhs is a float6
-    rhs = cast(ltype, rhs);
-  } else if (!ltype.is_float4() && rtype.is_float4()) {
-    // Cast int->float4 for lhs when rhs is a float4
-    lhs = cast(rtype, lhs);
-  } else if (ltype.is_float4() && !rtype.is_float4()) {
-    // Cast int->float4 for rhs when lhs is a float4
-    rhs = cast(ltype, rhs);
-  } else if (ltype.is_bool() && (rtype.is_int() || rtype.is_uint())) {
-    // Cast bool to int for lhs when rhs is a int or uint
-    lhs = cast(rtype, lhs);
-  } else if ((ltype.is_int() || ltype.is_uint()) && rtype.is_bool()) {
-    // Cast bool to int for rhs when lhs is a int or uint
-    rhs = cast(ltype, rhs);
-  } else if ((ltype.is_int() && rtype.is_int()) || (ltype.is_uint() && rtype.is_uint())) {
-    // Promote int to higher bits e.g. int8 + int16 --> int16 + int16
-    if (ltype.bits() < rtype.bits()) {
-      lhs = cast(rtype, lhs);
-    } else {
-      rhs = cast(ltype, rhs);
-    }
-  } else if ((ltype.is_int() && rtype.is_uint()) || (ltype.is_uint() && rtype.is_int())) {
-    // Handle mixing signed and unsigned integers
-    if (ltype.bits() < rtype.bits()) {
-      lhs = cast(rtype, lhs);
-    } else if (ltype.bits() > rtype.bits()) {
-      rhs = cast(ltype, rhs);
-    } else {
-      // The width of signed and unsigned integers is same.
-      if (ltype.is_uint()) {
-        rhs = cast(ltype, rhs);
-      } else {
-        lhs = cast(rtype, lhs);
-      }
-    }
-  } else {
-    LOG(INFO) << lhs << " " << rhs;
-    TVM_FFI_THROW(InternalError) << "Cannot match type " << ltype << " vs " << rtype;
+  TVM_FFI_ICHECK(lanes_match) << "Cannot match type " << lhs_ty->dtype << " vs " << rhs_ty->dtype;
+
+  PrimType promoted_ty = PromoteBinaryOpType(lhs_ty, rhs_ty);
+  if (lhs_ty->dtype != promoted_ty->dtype) {
+    lhs = cast(promoted_ty, lhs, span);
+  }
+  if (rhs_ty->dtype != promoted_ty->dtype) {
+    rhs = cast(promoted_ty, rhs, span);
   }
 }
 
 PrimExpr ret(PrimExpr value, Span span) {
   TVM_FFI_ICHECK(value.defined());
-  return tirx::Call(value.dtype(), tirx::builtin::ret(), {value}, {}, span);
+  return tirx::Call(value.ty(), tirx::builtin::ret(), {value}, {}, span);
 }
 
 PrimExpr thread_return(Span span) {
-  return tirx::Call(DataType::Void(), tirx::builtin::thread_return(), {}, {}, span);
+  return tirx::Call(PrimType::Void(), tirx::builtin::thread_return(), {}, {}, span);
 }
 
 PrimExpr continue_loop(Span span) {
-  return tirx::Call(DataType::Void(), tirx::builtin::continue_loop(), {}, {}, span);
+  return tirx::Call(PrimType::Void(), tirx::builtin::continue_loop(), {}, {}, span);
 }
 
 PrimExpr break_loop(Span span) {
-  return tirx::Call(DataType::Void(), tirx::builtin::break_loop(), {}, {}, span);
+  return tirx::Call(PrimType::Void(), tirx::builtin::break_loop(), {}, {}, span);
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -299,128 +317,131 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 };
 
 // maximum and min limits
-PrimExpr max_value(const DataType& dtype, Span span) {
+PrimExpr max_value(PrimType value_ty, Span span) {
   using namespace tirx;
+  PrimType dtype = value_ty;
   TVM_FFI_ICHECK_EQ(dtype.lanes(), 1);
-  if (dtype.is_int()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     if (dtype.bits() == 64) {
-      return IntImm(dtype, std::numeric_limits<int64_t>::max(), span);
+      return IntImm(value_ty, std::numeric_limits<int64_t>::max(), span);
     } else if (dtype.bits() < 64) {
       int64_t val = 1;
       val = (val << (dtype.bits() - 1)) - 1;
-      return IntImm(dtype, val, span);
+      return IntImm(value_ty, val, span);
     }
-  } else if (dtype.is_uint()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
     if (dtype.bits() == 64) {
       return MakeConst(dtype, std::numeric_limits<uint64_t>::max(), span);
     } else if (dtype.bits() < 64) {
       uint64_t val = 1;
       val = (val << static_cast<uint64_t>(dtype.bits())) - 1;
-      return IntImm(dtype, static_cast<int64_t>(val), span);
+      return IntImm(value_ty, static_cast<int64_t>(val), span);
     }
-  } else if (dtype.is_float()) {
+  } else if (IsFloatType(dtype)) {
     if (dtype.bits() == 64) {
-      return FloatImm(dtype, std::numeric_limits<double>::max(), span);
+      return FloatImm(value_ty, std::numeric_limits<double>::max(), span);
     } else if (dtype.bits() == 32) {
-      return FloatImm(dtype, std::numeric_limits<float>::max(), span);
+      return FloatImm(value_ty, std::numeric_limits<float>::max(), span);
     } else if (dtype.bits() == 16) {
-      return FloatImm(dtype, 65504.0, span);
+      return FloatImm(value_ty, 65504.0, span);
     }
-  } else if (dtype.is_bfloat16()) {
-    return FloatImm(dtype, std::numeric_limits<float>::max(), span);
-  } else if (dtype.is_float8()) {
+  } else if (IsBFloat16Type(dtype)) {
+    return FloatImm(value_ty, std::numeric_limits<float>::max(), span);
+  } else if (IsFloat8Type(dtype)) {
     // according to https://arxiv.org/pdf/2209.05433.pdf
-    if (dtype.code() == DataType::TypeCode::kFloat8_e5m2) {
-      return FloatImm(dtype, 57344.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e5m2fnuz) {
-      return FloatImm(dtype, 57344.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3fn) {
-      return FloatImm(dtype, 448.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3fnuz ||
-               dtype.code() == DataType::TypeCode::kFloat8_e4m3) {
-      return FloatImm(dtype, 448.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3b11fnuz) {
-      return FloatImm(dtype, 30.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e3m4) {
-      return FloatImm(dtype, 31.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e8m0fnu) {
-      return FloatImm(dtype, 3.4028236692093846e+38, span);
+    if (dtype.code() == DLDataTypeCode::kDLFloat8_e5m2) {
+      return FloatImm(value_ty, 57344.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e5m2fnuz) {
+      return FloatImm(value_ty, 57344.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fn) {
+      return FloatImm(value_ty, 448.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+               dtype.code() == DLDataTypeCode::kDLFloat8_e4m3) {
+      return FloatImm(value_ty, 448.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3b11fnuz) {
+      return FloatImm(value_ty, 30.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e3m4) {
+      return FloatImm(value_ty, 31.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e8m0fnu) {
+      return FloatImm(value_ty, 3.4028236692093846e+38, span);
     }
-  } else if (dtype.is_float6()) {
-    if (dtype.code() == DataType::TypeCode::kFloat6_e2m3fn) {
-      return FloatImm(dtype, 7.5, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat6_e3m2fn) {
-      return FloatImm(dtype, 28.0, span);
+  } else if (IsFloat6Type(dtype)) {
+    if (dtype.code() == DLDataTypeCode::kDLFloat6_e2m3fn) {
+      return FloatImm(value_ty, 7.5, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat6_e3m2fn) {
+      return FloatImm(value_ty, 28.0, span);
     }
-  } else if (dtype.is_float4()) {
-    return FloatImm(dtype, 6.0, span);
+  } else if (IsFloat4Type(dtype)) {
+    return FloatImm(value_ty, 6.0, span);
   }
   TVM_FFI_THROW(InternalError) << "Cannot decide max_value for type" << dtype;
 }
 
-PrimExpr min_value(const DataType& dtype, Span span) {
+PrimExpr min_value(PrimType value_ty, Span span) {
   using namespace tirx;
+  PrimType dtype = value_ty;
   TVM_FFI_ICHECK_EQ(dtype.lanes(), 1);
-  if (dtype.is_int()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     if (dtype.bits() == 64) {
-      return IntImm(dtype, std::numeric_limits<int64_t>::lowest(), span);
+      return IntImm(value_ty, std::numeric_limits<int64_t>::lowest(), span);
     } else if (dtype.bits() < 64) {
       int64_t val = 1;
       val = -(val << (dtype.bits() - 1));
-      return IntImm(dtype, val, span);
+      return IntImm(value_ty, val, span);
     }
-  } else if (dtype.is_uint()) {
-    return IntImm(dtype, 0, span);
-  } else if (dtype.is_float()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+    return IntImm(value_ty, 0, span);
+  } else if (IsFloatType(dtype)) {
     if (dtype.bits() == 64) {
-      return FloatImm(dtype, std::numeric_limits<double>::lowest(), span);
+      return FloatImm(value_ty, std::numeric_limits<double>::lowest(), span);
     } else if (dtype.bits() == 32) {
-      return FloatImm(dtype, std::numeric_limits<float>::lowest(), span);
+      return FloatImm(value_ty, std::numeric_limits<float>::lowest(), span);
     } else if (dtype.bits() == 16) {
-      return FloatImm(dtype, -65504.0, span);
+      return FloatImm(value_ty, -65504.0, span);
     }
-  } else if (dtype.is_bfloat16()) {
-    return FloatImm(dtype, std::numeric_limits<float>::lowest(), span);
-  } else if (dtype.is_float8()) {
+  } else if (IsBFloat16Type(dtype)) {
+    return FloatImm(value_ty, std::numeric_limits<float>::lowest(), span);
+  } else if (IsFloat8Type(dtype)) {
     // according to https://arxiv.org/pdf/2209.05433.pdf
-    if (dtype.code() == DataType::TypeCode::kFloat8_e5m2) {
-      return FloatImm(dtype, -57344.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e5m2fnuz) {
-      return FloatImm(dtype, 0.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3fn) {
-      return FloatImm(dtype, -448.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3fnuz) {
-      return FloatImm(dtype, 0.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3) {
-      return FloatImm(dtype, -448.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3b11fnuz) {
-      return FloatImm(dtype, 0.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e3m4) {
-      return FloatImm(dtype, -31.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e8m0fnu) {
-      return FloatImm(dtype, 0.0, span);
+    if (dtype.code() == DLDataTypeCode::kDLFloat8_e5m2) {
+      return FloatImm(value_ty, -57344.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e5m2fnuz) {
+      return FloatImm(value_ty, 0.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fn) {
+      return FloatImm(value_ty, -448.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fnuz) {
+      return FloatImm(value_ty, 0.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3) {
+      return FloatImm(value_ty, -448.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3b11fnuz) {
+      return FloatImm(value_ty, 0.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e3m4) {
+      return FloatImm(value_ty, -31.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e8m0fnu) {
+      return FloatImm(value_ty, 0.0, span);
     }
-  } else if (dtype.is_float6()) {
-    if (dtype.code() == DataType::TypeCode::kFloat6_e2m3fn) {
-      return FloatImm(dtype, -7.5, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat6_e3m2fn) {
-      return FloatImm(dtype, -28.0, span);
+  } else if (IsFloat6Type(dtype)) {
+    if (dtype.code() == DLDataTypeCode::kDLFloat6_e2m3fn) {
+      return FloatImm(value_ty, -7.5, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat6_e3m2fn) {
+      return FloatImm(value_ty, -28.0, span);
     }
-  } else if (dtype.is_float4()) {
-    return FloatImm(dtype, -6.0, span);
+  } else if (IsFloat4Type(dtype)) {
+    return FloatImm(value_ty, -6.0, span);
   }
   TVM_FFI_THROW(InternalError) << "Cannot decide min_value for type" << dtype;
 }
 
 // infinity
-PrimExpr infinity(const DataType& dtype, Span span) {
+PrimExpr infinity(PrimType value_ty, Span span) {
   using namespace tirx;
+  PrimType dtype = value_ty;
   TVM_FFI_ICHECK_EQ(dtype.lanes(), 1);
-  if (dtype.is_float()) {
+  if (IsFloatType(dtype)) {
     if (dtype.bits() == 64) {
-      return FloatImm(dtype, std::numeric_limits<double>::infinity(), span);
+      return FloatImm(value_ty, std::numeric_limits<double>::infinity(), span);
     } else if (dtype.bits() == 32 || dtype.bits() == 16) {
-      return FloatImm(dtype, std::numeric_limits<float>::infinity(), span);
+      return FloatImm(value_ty, std::numeric_limits<float>::infinity(), span);
     }
   }
   TVM_FFI_THROW(InternalError) << "Cannot decide infinity for type " << dtype;
@@ -450,72 +471,88 @@ bool is_const_power_of_two_integer(const PrimExpr& x, int* shift) {
 }
 }  // namespace tirx
 
-PrimExpr cast(const DataType& t, PrimExpr value, Span span) {
+PrimExpr cast(PrimType t, PrimExpr value, Span span) {
   using tirx::FloatImmNode;
-  if (value.dtype() == t) return value;
+  PrimType dtype = t;
+  if (value.ty()->dtype == dtype->dtype) return value;
   // const fold IntImm as they are used in index computations
-  if (t.is_scalar()) {
+  if (dtype.IsScalar()) {
     if (const IntImmNode* op = value.as<IntImmNode>()) {
-      return MakeConst(t, op->value, op->span);
+      return MakeConst(dtype, op->value, op->span);
     } else if (const FloatImmNode* op = value.as<FloatImmNode>()) {
-      return MakeConst(t, op->value, op->span);
+      return MakeConst(dtype, op->value, op->span);
     }
-    TVM_FFI_ICHECK(!value.dtype().is_handle()) << "Can't cast a handle to other types.";
-    return tirx::Cast(t, value, span);
+    TVM_FFI_ICHECK(!value.ty().IsHandle()) << "Can't cast a handle to other types.";
+    return tirx::Cast(std::move(t), value, span);
   } else {
-    DataType vtype = t.element_of();
-    if (!value.dtype().is_scalable_or_fixed_length_vector()) {
+    PrimType elem_ty = dtype.WithLanes(1);
+    if (!value.ty().IsScalableVector() && !value.ty().IsFixedLengthVector()) {
       // manually unroll cast
-      if (value.dtype() != vtype) {
+      if (value.ty()->dtype != elem_ty->dtype) {
         if (const IntImmNode* op = value.as<IntImmNode>()) {
-          value = MakeConst(vtype, op->value, op->span);
+          value = MakeConst(elem_ty, op->value, op->span);
         } else if (const FloatImmNode* op = value.as<FloatImmNode>()) {
-          value = MakeConst(vtype, op->value, op->span);
+          value = MakeConst(elem_ty, op->value, op->span);
         } else {
-          value = tirx::Cast(vtype, value, span);
+          value = tirx::Cast(elem_ty, value, span);
         }
       }
-      if (t.is_scalable_vector()) {
+      if (dtype.IsScalableVector()) {
         return tirx::Broadcast(
-            value, tirx::Mul(t.vscale_factor(), Call(DataType::Int(32), builtin::vscale(), {})),
+            value, tirx::Mul(dtype.VScaleFactor(), Call(PrimType::Int(32), builtin::vscale(), {})),
             span);
       } else {
-        return tirx::Broadcast(value, t.lanes(), span);
+        return tirx::Broadcast(value, dtype.lanes(), span);
       }
     } else { /* value is a vector */
-      TVM_FFI_ICHECK(value.dtype().is_scalable_vector() == t.is_scalable_vector());
+      TVM_FFI_ICHECK(value.ty().IsScalableVector() == dtype.IsScalableVector());
 
       bool lanes_match = false;
-      if (value.dtype().is_scalable_vector()) {
-        lanes_match = value.dtype().vscale_factor() == t.vscale_factor();
+      if (value.ty().IsScalableVector()) {
+        lanes_match = value.ty().VScaleFactor() == dtype.VScaleFactor();
       } else {
-        lanes_match = value.dtype().lanes() == t.lanes();
+        lanes_match = value.ty().lanes() == dtype.lanes();
       }
       TVM_FFI_ICHECK(lanes_match);
       if (const auto* broadcast = value.as<tirx::BroadcastNode>()) {
-        return tirx::Broadcast(cast(vtype, broadcast->value, span), broadcast->lanes, span);
+        return tirx::Broadcast(cast(elem_ty, broadcast->value, span), broadcast->lanes, span);
       } else if (const auto* ramp = value.as<tirx::RampNode>()) {
-        if (t.is_int() || t.is_uint()) {
+        if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
           // only cast to index data type can be folded to ramp
-          return tirx::Ramp(cast(vtype, ramp->base, span), cast(vtype, ramp->stride, span),
+          return tirx::Ramp(cast(elem_ty, ramp->base, span), cast(elem_ty, ramp->stride, span),
                             ramp->lanes, span);
         }
       }
-      return tirx::Cast(t, value, span);
+      return tirx::Cast(std::move(t), value, span);
     }
   }
 }
 
+PrimExpr cast(DLDataType t, PrimExpr value, Span span) {
+  return cast(PrimType(t), std::move(value), std::move(span));
+}
+
 // reinterpret
-PrimExpr reinterpret(const DataType& t, PrimExpr value, Span span) {
-  if (value.dtype() == t) return value;
-  if (!t.is_scalable_vector() && !value.dtype().is_scalable_vector()) {
-    TVM_FFI_ICHECK(value.dtype().bits() * value.dtype().lanes() == t.bits() * t.lanes() ||
-                   ((value.dtype().is_float4_e2m1fn() || t.is_float4_e2m1fn()) &&
-                    value.dtype().bytes() * value.dtype().lanes() == t.bytes() * t.lanes()))
-        << "Reinterpret requires size match " << t << " vs " << value.dtype();
+PrimExpr reinterpret(PrimType t, PrimExpr value, Span span) {
+  PrimType target_dtype = t;
+  PrimType value_dtype = value.ty();
+  if (value.ty()->dtype == t->dtype) return value;
+  if (!target_dtype.IsScalableVector() && !value_dtype.IsScalableVector()) {
+    int value_bits = value_dtype.bits() * value_dtype.lanes();
+    int target_bits = target_dtype.bits() * target_dtype.lanes();
+    auto storage_bytes = [](PrimType dtype) { return (dtype.bits() + 7) / 8; };
+    TVM_FFI_ICHECK(value_bits == target_bits ||
+                   ((value_dtype.code() == DLDataTypeCode::kDLFloat4_e2m1fn ||
+                     target_dtype.code() == DLDataTypeCode::kDLFloat4_e2m1fn) &&
+                    storage_bytes(value_dtype) * value_dtype.lanes() ==
+                        storage_bytes(target_dtype) * target_dtype.lanes()))
+        << "Reinterpret requires size match " << target_dtype << " vs " << value_dtype;
   }
-  return tirx::Call(t, tirx::builtin::reinterpret(), {value}, {}, span);
+  return tirx::Call(std::move(t), tirx::builtin::reinterpret(), {value}, {}, span);
+}
+
+PrimExpr reinterpret(DLDataType t, PrimExpr value, Span span) {
+  return reinterpret(PrimType(t), std::move(value), std::move(span));
 }
 
 // operator+
@@ -535,9 +572,9 @@ PrimExpr neg(PrimExpr a, Span span) {
   using tirx::IntImmNode;
   const IntImmNode* pa = a.as<IntImmNode>();
   const FloatImmNode* fa = a.as<FloatImmNode>();
-  if (pa) return IntImm(a.dtype(), -pa->value, span);
-  if (fa) return FloatImm(a.dtype(), -fa->value, span);
-  return MakeConst(a.dtype(), 0, span) - a;
+  if (pa) return IntImm(a.ty(), -pa->value, span);
+  if (fa) return FloatImm(a.ty(), -fa->value, span);
+  return MakeConst(a.ty(), 0, span) - a;
 }
 
 PrimExpr operator-(PrimExpr a, PrimExpr b) { return sub(a, b); }
@@ -562,8 +599,8 @@ PrimExpr div(PrimExpr a, PrimExpr b, Span span) {
 }
 
 PrimExpr truncdiv(PrimExpr a, PrimExpr b, Span span) {
-  TVM_FFI_ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
-  TVM_FFI_ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
+  TVM_FFI_ICHECK(a.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << a;
+  TVM_FFI_ICHECK(b.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << b;
   return div(a, b, span);
 }
 
@@ -585,16 +622,16 @@ PrimExpr shapediv(PrimExpr a, PrimExpr b, Span span) { return ceildiv(a, b, span
 PrimExpr indexmod(PrimExpr a, PrimExpr b, Span span) { return floormod(a, b, span); }
 
 PrimExpr floordiv(PrimExpr a, PrimExpr b, Span span) {
-  TVM_FFI_ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
-  TVM_FFI_ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
+  TVM_FFI_ICHECK(a.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << a;
+  TVM_FFI_ICHECK(b.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << b;
   BinaryOpMatchTypes(a, b, span);
   if (auto ret = arith::TryConstFold<tirx::FloorDiv>(a, b)) return ret.value();
   return tirx::FloorDiv(a, b, span);
 }
 
 PrimExpr logaddexp(PrimExpr a, PrimExpr b, Span span) {
-  TVM_FFI_ICHECK(a.dtype().is_float()) << a;
-  TVM_FFI_ICHECK(b.dtype().is_float()) << b;
+  TVM_FFI_ICHECK(IsFloatType(a.ty())) << a;
+  TVM_FFI_ICHECK(IsFloatType(b.ty())) << b;
   BinaryOpMatchTypes(a, b, span);
   PrimExpr exp_sum = add(exp(a), exp(b));
   PrimExpr log_exp_sum = log(exp_sum);
@@ -602,16 +639,16 @@ PrimExpr logaddexp(PrimExpr a, PrimExpr b, Span span) {
 }
 
 PrimExpr ceildiv(PrimExpr a, PrimExpr b, Span span) {
-  TVM_FFI_ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
-  TVM_FFI_ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
+  TVM_FFI_ICHECK(a.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << a;
+  TVM_FFI_ICHECK(b.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << b;
   BinaryOpMatchTypes(a, b, span);
   if (auto ret = arith::TryConstFold<tirx::FloorDiv>(a + b - 1, b)) return ret.value();
   return tirx::FloorDiv(a + b - 1, b, span);
 }
 
 PrimExpr floormod(PrimExpr a, PrimExpr b, Span span) {
-  TVM_FFI_ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
-  TVM_FFI_ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
+  TVM_FFI_ICHECK(a.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << a;
+  TVM_FFI_ICHECK(b.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << b;
   BinaryOpMatchTypes(a, b, span);
   if (auto ret = arith::TryConstFold<tirx::FloorMod>(a, b)) return ret.value();
   return tirx::FloorMod(a, b, span);
@@ -645,7 +682,7 @@ PrimExpr max(PrimExpr a, PrimExpr b, Span span) {
 
 // if_then_else
 PrimExpr if_then_else(PrimExpr cond, PrimExpr true_value, PrimExpr false_value, Span span) {
-  TVM_FFI_ICHECK(cond.dtype() == DataType::Bool())
+  TVM_FFI_ICHECK(cond.ty()->dtype == (DLDataType{DLDataTypeCode::kDLBool, 8, 1}))
       << "if_then_else only accept the condition to be boolean type.";
   BinaryOpMatchTypes(true_value, false_value, span);
   if (const IntImmNode* op = cond.as<IntImmNode>()) {
@@ -656,14 +693,14 @@ PrimExpr if_then_else(PrimExpr cond, PrimExpr true_value, PrimExpr false_value,
     }
   }
 
-  return tirx::Call(true_value.dtype(), tirx::builtin::if_then_else(),
-                    {cond, true_value, false_value}, {}, span);
+  return tirx::Call(true_value.ty(), tirx::builtin::if_then_else(), {cond, true_value, false_value},
+                    {}, span);
 }
 
 // likely
 PrimExpr likely(PrimExpr cond, Span span) {
   if (is_const_int(cond)) return cond;
-  return tirx::Call(cond.dtype(), tirx::builtin::likely(), {cond}, {}, span);
+  return tirx::Call(cond.ty(), tirx::builtin::likely(), {cond}, {}, span);
 }
 
 // operator>
@@ -712,38 +749,44 @@ PrimExpr not_equal(PrimExpr a, PrimExpr b, Span span) {
 
 namespace {
 void type_check_boolean_args(const PrimExpr& arg, const char* op) {
-  TVM_FFI_ICHECK(arg.dtype().is_bool()) << "Expected boolean argument for " << op
-                                        << ", but received " << arg << " of type " << arg.dtype();
+  TVM_FFI_ICHECK(arg.ty().MatchesCode(DLDataTypeCode::kDLBool))
+      << "Expected boolean argument for " << op << ", but received " << arg << " of type "
+      << arg.ty();
 }
 void type_check_boolean_args(const PrimExpr& lhs, const PrimExpr& rhs, const char* op) {
-  TVM_FFI_ICHECK(lhs.dtype().is_bool()) << "Expected boolean argument as LHS of " << op
-                                        << ", but received " << lhs << " of type " << lhs.dtype();
-  TVM_FFI_ICHECK(rhs.dtype().is_bool()) << "Expected boolean argument as RHS of " << op
-                                        << ", but received " << rhs << " of type " << rhs.dtype();
+  TVM_FFI_ICHECK(lhs.ty().MatchesCode(DLDataTypeCode::kDLBool))
+      << "Expected boolean argument as LHS of " << op << ", but received " << lhs << " of type "
+      << lhs.ty();
+  TVM_FFI_ICHECK(rhs.ty().MatchesCode(DLDataTypeCode::kDLBool))
+      << "Expected boolean argument as RHS of " << op << ", but received " << rhs << " of type "
+      << rhs.ty();
 }
 
 void type_check_int_or_bool_args(const PrimExpr& arg, const char* op) {
-  TVM_FFI_ICHECK(arg.dtype().is_int() || arg.dtype().is_uint() || arg.dtype().is_bool())
+  TVM_FFI_ICHECK(arg.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                                      DLDataTypeCode::kDLBool))
       << "Expected integer or boolean argument for " << op << ", but received " << arg
-      << " of type " << arg.dtype();
+      << " of type " << arg.ty();
 }
 
 void type_check_integer_args(const PrimExpr& lhs, const PrimExpr& rhs, const char* op) {
-  TVM_FFI_ICHECK(lhs.dtype().is_int() || lhs.dtype().is_uint())
+  TVM_FFI_ICHECK(lhs.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))
       << "Expected integer argument as LHS of " << op << ", but received " << lhs << " of type "
-      << lhs.dtype();
-  TVM_FFI_ICHECK(rhs.dtype().is_int() || rhs.dtype().is_uint())
+      << lhs.ty();
+  TVM_FFI_ICHECK(rhs.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))
       << "Expected integer argument as RHS of " << op << ", but received " << rhs << " of type "
-      << rhs.dtype();
+      << rhs.ty();
 }
 
 void type_check_int_or_bool_args(const PrimExpr& lhs, const PrimExpr& rhs, const char* op) {
-  TVM_FFI_ICHECK(lhs.dtype().is_int() || lhs.dtype().is_uint() || lhs.dtype().is_bool())
+  TVM_FFI_ICHECK(lhs.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                                      DLDataTypeCode::kDLBool))
       << "Expected integer argument as LHS of " << op << ", but received " << lhs << " of type "
-      << lhs.dtype();
-  TVM_FFI_ICHECK(rhs.dtype().is_int() || rhs.dtype().is_uint() || rhs.dtype().is_bool())
+      << lhs.ty();
+  TVM_FFI_ICHECK(rhs.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                                      DLDataTypeCode::kDLBool))
       << "Expected integer argument as RHS of " << op << ", but received " << rhs << " of type "
-      << rhs.dtype();
+      << rhs.ty();
 }
 }  // namespace
 
@@ -776,20 +819,20 @@ PrimExpr right_shift(PrimExpr a, PrimExpr b, Span span) {
 
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pb)
-      TVM_FFI_ICHECK(pb->value >= 0 && pb->value < rtype.bits())
-          << "Shift amount must be non-negative and less than " << rtype.bits() << " for type "
-          << rtype;
+      TVM_FFI_ICHECK(pb->value >= 0 && pb->value < result_ty.bits())
+          << "Shift amount must be non-negative and less than " << result_ty.bits() << " for type "
+          << result_ty;
     if (pa && pb) {
-      return IntImm(rtype, (pa->value >> pb->value), span);
+      return IntImm(result_ty, (pa->value >> pb->value), span);
     }
     if (pb) {
       if (pb->value == 0) return a;
     }
   });
 
-  return tirx::Call(a.dtype(), tirx::builtin::shift_right(), {a, b}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::shift_right(), {a, b}, {}, span);
 }
 
 // shift left
@@ -798,17 +841,17 @@ PrimExpr left_shift(PrimExpr a, PrimExpr b, Span span) {
   type_check_integer_args(a, b, "<< operator (left shift)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pb)
-      TVM_FFI_ICHECK(pb->value >= 0 && pb->value < rtype.bits())
-          << "Shift amount must be non-negative and less than " << rtype.bits() << " for type "
-          << rtype;
-    if (pa && pb) return IntImm(rtype, (pa->value << pb->value), span);
+      TVM_FFI_ICHECK(pb->value >= 0 && pb->value < result_ty.bits())
+          << "Shift amount must be non-negative and less than " << result_ty.bits() << " for type "
+          << result_ty;
+    if (pa && pb) return IntImm(result_ty, (pa->value << pb->value), span);
     if (pb) {
       if (pb->value == 0) return a;
     }
   });
-  return tirx::Call(a.dtype(), tirx::builtin::shift_left(), {a, b}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::shift_left(), {a, b}, {}, span);
 }
 
 // bitwise and
@@ -817,10 +860,10 @@ PrimExpr bitwise_and(PrimExpr a, PrimExpr b, Span span) {
   type_check_int_or_bool_args(a, b, "& operator (bitwise AND)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, (pa->value & pb->value), span);
+    PrimType result_ty = a.ty();
+    if (pa && pb) return IntImm(result_ty, (pa->value & pb->value), span);
   });
-  return tirx::Call(a.dtype(), tirx::builtin::bitwise_and(), {a, b}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::bitwise_and(), {a, b}, {}, span);
 }
 
 // bitwise_or
@@ -829,10 +872,10 @@ PrimExpr bitwise_or(PrimExpr a, PrimExpr b, Span span) {
   type_check_int_or_bool_args(a, b, "| operator (bitwise OR)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, (pa->value | pb->value), span);
+    PrimType result_ty = a.ty();
+    if (pa && pb) return IntImm(result_ty, (pa->value | pb->value), span);
   });
-  return tirx::Call(a.dtype(), tirx::builtin::bitwise_or(), {a, b}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::bitwise_or(), {a, b}, {}, span);
 }
 
 // bitwise_xor
@@ -841,10 +884,10 @@ PrimExpr bitwise_xor(PrimExpr a, PrimExpr b, Span span) {
   type_check_int_or_bool_args(a, b, "^ operator (bitwise XOR)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, (pa->value ^ pb->value), span);
+    PrimType result_ty = a.ty();
+    if (pa && pb) return IntImm(result_ty, (pa->value ^ pb->value), span);
   });
-  return tirx::Call(a.dtype(), tirx::builtin::bitwise_xor(), {a, b}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::bitwise_xor(), {a, b}, {}, span);
 }
 
 // bitwise_not
@@ -852,7 +895,7 @@ PrimExpr operator~(PrimExpr a) { return bitwise_neg(a); }
 
 PrimExpr bitwise_neg(PrimExpr a, Span span) {
   type_check_int_or_bool_args(a, "~ operator (bitwise NOT)");
-  return tirx::Call(a.dtype(), tirx::builtin::bitwise_not(), {a}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::bitwise_not(), {a}, {}, span);
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -864,10 +907,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // pow
 PrimExpr pow(PrimExpr x, PrimExpr y, Span span) {
   BinaryOpMatchTypes(x, y, span);
-  TVM_FFI_ICHECK(x.dtype().is_float()) << "power only applies to float";
+  TVM_FFI_ICHECK(IsFloatType(x.ty())) << "power only applies to float";
 
   // If we detect pow(x, 3), suggest using x * x * x
-  if (y.dtype().is_int()) {
+  if (y.ty().MatchesCode(DLDataTypeCode::kDLInt)) {
     using tirx::IntImmNode;
     const IntImmNode* px = y.as<IntImmNode>();
     if (px) {
@@ -878,7 +921,7 @@ PrimExpr pow(PrimExpr x, PrimExpr y, Span span) {
                "`pow(x, 2) * pow(x, 2) ...`.";
       }
     }
-  } else if (y.dtype().is_float()) {
+  } else if (IsFloatType(y.ty())) {
     using tirx::FloatImmNode;
     const FloatImmNode* fx = y.as<FloatImmNode>();
     if (fx) {
@@ -892,33 +935,33 @@ PrimExpr pow(PrimExpr x, PrimExpr y, Span span) {
   }
 
   static const Op& pow_op = Op::Get("tirx.pow");
-  return tirx::Call(x.dtype(), pow_op, {x, y}, {}, span);
+  return tirx::Call(x.ty(), pow_op, {x, y}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_BINARY_OP("pow").set_attr<TVectorizable>("TVectorizable", true);
 
 // abs
 PrimExpr abs(PrimExpr x, Span span) {
-  if (x.dtype().is_int()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt)) {
     using tirx::IntImmNode;
     const IntImmNode* px = x.as<IntImmNode>();
     if (px) {
-      return IntImm(x.dtype(), std::abs(px->value), px->span);
+      return IntImm(x.ty(), std::abs(px->value), px->span);
     }
     // MakeConst can handle both vector and scalar types.
-    return tirx::Select(x >= MakeConst(x.dtype(), 0), x, -x, span);
-  } else if (x.dtype().is_float() || x.dtype().is_bfloat()) {
+    return tirx::Select(x >= MakeConst(x.ty(), 0), x, -x, span);
+  } else if (IsFloatType(x.ty()) || IsBFloat16Type(x.ty())) {
     using tirx::FloatImmNode;
     const FloatImmNode* fx = x.as<FloatImmNode>();
     if (fx) {
-      return FloatImm(x.dtype(), std::fabs(fx->value), fx->span);
+      return FloatImm(x.ty(), std::fabs(fx->value), fx->span);
     }
     static const Op& fabs_op = Op::Get("tirx.fabs");
-    return tirx::Call(x.dtype(), fabs_op, {x}, {}, span);
-  } else if (x.dtype().is_uint()) {
+    return tirx::Call(x.ty(), fabs_op, {x}, {}, span);
+  } else if (x.ty().MatchesCode(DLDataTypeCode::kDLUInt)) {
     return x;
   } else {
-    TVM_FFI_THROW(InternalError) << "Data type " << x.dtype()
+    TVM_FFI_THROW(InternalError) << "Data type " << x.ty()
                                  << " not supported for absolute op. Skipping absolute op...";
     return x;
   }
@@ -928,39 +971,40 @@ TVM_TIR_REGISTER_PURE_UNARY_OP("fabs").set_attr<TVectorizable>("TVectorizable",
 
 // isnan
 PrimExpr isnan(PrimExpr x, Span span) {
-  DataType t = DataType::Bool(x.dtype().lanes());
-  if (x.dtype().is_int() || x.dtype().is_uint()) {
+  PrimType t = PrimType::Bool(x.ty().lanes());
+  PrimType bool_ty(t);
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     return MakeConst(t, false);
-  } else if (x.dtype().is_float()) {
+  } else if (IsFloatType(x.ty())) {
     using tirx::FloatImmNode;
     const FloatImmNode* fx = x.as<FloatImmNode>();
     if (fx) {
       return MakeConst(t, std::isnan(fx->value), fx->span);
     }
-    if (x.dtype().bits() == 16) {
+    if (x.ty().bits() == 16) {
       static const Op& isnan_op = Op::Get("tirx.isnan");
-      return tirx::Call(t, isnan_op, {cast(DataType::Float(32, t.lanes()), std::move(x), span)}, {},
-                        span);
+      PrimType f32_ty = PrimType::Float(32, t.lanes());
+      return tirx::Call(bool_ty, isnan_op, {cast(f32_ty, std::move(x), span)}, {}, span);
     } else {
       static const Op& isnan_op = Op::Get("tirx.isnan");
-      return tirx::Call(t, isnan_op, {x}, {}, span);
+      return tirx::Call(bool_ty, isnan_op, {x}, {}, span);
     }
   } else {
-    TVM_FFI_THROW(InternalError) << "Data type " << x.dtype()
+    TVM_FFI_THROW(InternalError) << "Data type " << x.ty()
                                  << " not supported for isnan op. Skipping isnan op...";
   }
 }
 
 // isinf
 PrimExpr isinf(PrimExpr x, Span span) {
-  DataType t = DataType::Bool(x.dtype().lanes());
-  if (x.dtype().is_int() || x.dtype().is_uint()) {
+  PrimType t = PrimType::Bool(x.ty().lanes());
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     return MakeConst(t, false, span);
-  } else if (x.dtype().is_float()) {
-    PrimExpr infX = infinity(x.dtype(), span);
+  } else if (IsFloatType(x.ty())) {
+    PrimExpr infX = infinity(x.ty(), span);
     return abs(x, span) == infX && !isnan(x, span);
   } else {
-    TVM_FFI_THROW(InternalError) << "Data type " << x.dtype()
+    TVM_FFI_THROW(InternalError) << "Data type " << x.ty()
                                  << " not supported for finiteness ops. Skipping it...";
   }
 }
@@ -969,57 +1013,57 @@ PrimExpr isinf(PrimExpr x, Span span) {
 PrimExpr isfinite(PrimExpr x, Span span) { return !isinf(x, span) && !isnan(x, span); }
 
 PrimExpr sum(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
-  Var x("x", source.dtype(), span), y("y", source.dtype(), span);
+  Var x("x", source.ty(), span), y("y", source.ty(), span);
   PrimExpr result = tirx::Add(x, y, span);
-  PrimExpr identity_element = MakeConst(source.dtype(), 0, span);
+  PrimExpr identity_element = MakeConst(source.ty(), 0, span);
   tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
   return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
 }
 
 PrimExpr all(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
   type_check_boolean_args(source, "tvm::all");
-  Var x("x", source.dtype(), span), y("y", source.dtype());
+  Var x("x", source.ty(), span), y("y", source.ty(), span);
   PrimExpr result = tirx::And(x, y, span);
-  PrimExpr identity_element = MakeConst(source.dtype(), true, span);
+  PrimExpr identity_element = MakeConst(source.ty(), true, span);
   tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
   return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
 }
 
 PrimExpr any(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
   type_check_boolean_args(source, "tvm::any");
-  Var x("x", source.dtype(), span), y("y", source.dtype(), span);
+  Var x("x", source.ty(), span), y("y", source.ty(), span);
   PrimExpr result = tirx::Or(x, y, span);
-  PrimExpr identity_element = MakeConst(source.dtype(), false, span);
+  PrimExpr identity_element = MakeConst(source.ty(), false, span);
   tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
   return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
 }
 
 PrimExpr max(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
-  Var x("x", source.dtype(), span), y("y", source.dtype(), span);
+  Var x("x", source.ty(), span), y("y", source.ty(), span);
   PrimExpr result = tirx::Max(x, y, span);
-  PrimExpr identity_element = min_value(source.dtype(), span);
+  PrimExpr identity_element = min_value(source.ty(), span);
   tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
   return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
 }
 
 PrimExpr min(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
-  Var x("x", source.dtype(), span), y("y", source.dtype(), span);
+  Var x("x", source.ty(), span), y("y", source.ty(), span);
   PrimExpr result = tirx::Min(x, y, span);
-  PrimExpr identity_element = max_value(source.dtype(), span);
+  PrimExpr identity_element = max_value(source.ty(), span);
   tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
   return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
 }
 
 PrimExpr prod(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
-  if (source.dtype().is_bool()) {
+  if (source.ty().MatchesCode(DLDataTypeCode::kDLBool)) {
     // Bool product (prod) has the same truth table as logical AND.  Reuse all() to
     // avoid lowering bool prod through Mul, which LLVM codegen does not support.
     return all(source, rdom, init, span);
   } else {
     // For non-bool types, we lower prod through Mul.
-    Var x("x", source.dtype(), span), y("y", source.dtype(), span);
+    Var x("x", source.ty(), span), y("y", source.ty(), span);
     PrimExpr result = tirx::Mul(x, y, span);
-    PrimExpr identity_element = MakeConst(source.dtype(), 1, span);
+    PrimExpr identity_element = MakeConst(source.ty(), 1, span);
     tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
     return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
   }
@@ -1028,82 +1072,87 @@ PrimExpr prod(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> in
 // fmod
 PrimExpr fmod(PrimExpr x, PrimExpr y, Span span) {
   BinaryOpMatchTypes(x, y, span);
-  TVM_FFI_ICHECK(x.dtype().is_float()) << "fmod only applies to float";
+  TVM_FFI_ICHECK(IsFloatType(x.ty())) << "fmod only applies to float";
   static const Op& fmod_op = Op::Get("tirx.fmod");
-  return tirx::Call(x.dtype(), fmod_op, {x, y}, {}, span);
+  return tirx::Call(x.ty(), fmod_op, {x, y}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("fmod");
 
 // floor
 PrimExpr floor(PrimExpr x, Span span) {
-  if (x.dtype().is_int() || x.dtype().is_uint() || x.dtype().is_bool()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                         DLDataTypeCode::kDLBool)) {
     return x;
   }
   using tirx::FloatImmNode;
   const FloatImmNode* fx = x.as<FloatImmNode>();
-  if (fx) return FloatImm(x.dtype(), std::floor(fx->value), fx->span);
+  if (fx) return FloatImm(x.ty(), std::floor(fx->value), fx->span);
   static const Op& floor_op = Op::Get("tirx.floor");
-  return tirx::Call(x.dtype(), floor_op, {x}, {}, span);
+  return tirx::Call(x.ty(), floor_op, {x}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("floor").set_attr<TVectorizable>("TVectorizable", true);
 
 // ceil
 PrimExpr ceil(PrimExpr x, Span span) {
-  if (x.dtype().is_int() || x.dtype().is_uint() || x.dtype().is_bool()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                         DLDataTypeCode::kDLBool)) {
     return x;
   }
   using tirx::FloatImmNode;
   const FloatImmNode* fx = x.as<FloatImmNode>();
-  if (fx) return FloatImm(x.dtype(), std::ceil(fx->value), fx->span);
+  if (fx) return FloatImm(x.ty(), std::ceil(fx->value), fx->span);
   static const Op& ceil_op = Op::Get("tirx.ceil");
-  return tirx::Call(x.dtype(), ceil_op, {x}, {}, span);
+  return tirx::Call(x.ty(), ceil_op, {x}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("ceil").set_attr<TVectorizable>("TVectorizable", true);
 
 // round
 PrimExpr round(PrimExpr x, Span span) {
-  if (x.dtype().is_int() || x.dtype().is_uint() || x.dtype().is_bool()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                         DLDataTypeCode::kDLBool)) {
     return x;
   }
   using tirx::FloatImmNode;
   const FloatImmNode* fx = x.as<FloatImmNode>();
-  if (fx) return FloatImm(x.dtype(), std::nearbyint(fx->value), fx->span);
+  if (fx) return FloatImm(x.ty(), std::nearbyint(fx->value), fx->span);
   static const Op& round_op = Op::Get("tirx.round");
-  return tirx::Call(x.dtype(), round_op, {x}, {}, span);
+  return tirx::Call(x.ty(), round_op, {x}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("round").set_attr<TVectorizable>("TVectorizable", true);
 
 // nearbyint
 PrimExpr nearbyint(PrimExpr x, Span span) {
-  if (x.dtype().is_int() || x.dtype().is_uint() || x.dtype().is_bool()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                         DLDataTypeCode::kDLBool)) {
     return x;
   }
   using tirx::FloatImmNode;
   const FloatImmNode* fx = x.as<FloatImmNode>();
-  if (fx) return FloatImm(x.dtype(), std::nearbyint(fx->value), fx->span);
+  if (fx) return FloatImm(x.ty(), std::nearbyint(fx->value), fx->span);
   static const Op& nearbyint_op = Op::Get("tirx.nearbyint");
-  return tirx::Call(x.dtype(), nearbyint_op, {x}, {}, span);
+  return tirx::Call(x.ty(), nearbyint_op, {x}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("nearbyint");
 
 // trunc
 PrimExpr trunc(PrimExpr x, Span span) {
-  if (x.dtype().is_int() || x.dtype().is_uint() || x.dtype().is_bool()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                         DLDataTypeCode::kDLBool)) {
     return x;
   }
   using tirx::FloatImmNode;
   const FloatImmNode* fx = x.as<FloatImmNode>();
   if (fx) {
-    return FloatImm(x.dtype(), (fx->value < 0 ? std::ceil(fx->value) : std::floor(fx->value)),
+    return FloatImm(x.ty(), (fx->value < 0 ? std::ceil(fx->value) : std::floor(fx->value)),
                     fx->span);
   }
   static const Op& trunc_op = Op::Get("tirx.trunc");
-  return tirx::Call(x.dtype(), trunc_op, {x}, {}, span);
+  return tirx::Call(x.ty(), trunc_op, {x}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("trunc").set_attr<TVectorizable>("TVectorizable", true);
@@ -1185,9 +1234,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
       .def_packed("node._const",
                   [](ffi::PackedArgs args, ffi::Any* ret) {
                     if (auto opt = args[0].try_cast<int64_t>()) {
-                      *ret = tirx::MakeConst(args[1].cast<DataType>(), *opt, args[2].cast<Span>());
+                      *ret = tirx::MakeConst(args[1].cast<PrimType>(), *opt, args[2].cast<Span>());
                     } else if (auto opt = args[0].try_cast<double>()) {
-                      *ret = tirx::MakeConst(args[1].cast<DataType>(), *opt, args[2].cast<Span>());
+                      *ret = tirx::MakeConst(args[1].cast<PrimType>(), *opt, args[2].cast<Span>());
                     } else {
                       TVM_FFI_THROW(InternalError)
                           << "First argument to tvm.tirx.const must be int, float, or bool, "
@@ -1196,9 +1245,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                     }
                   })
       .def("node.LargeUIntImm", LargeUIntImm)
-      .def("tirx.min_value", min_value)
-      .def("tirx.max_value", max_value)
-      .def("tirx.infinity", infinity)
+      .def("tirx.min_value", static_cast<PrimExpr (*)(PrimType, Span)>(&min_value))
+      .def("tirx.max_value", static_cast<PrimExpr (*)(PrimType, Span)>(&max_value))
+      .def("tirx.infinity", static_cast<PrimExpr (*)(PrimType, Span)>(&infinity))
       .def("tirx.abs", tvm::abs)
       .def("tirx.likely", tvm::likely)
       .def("tirx.isnan", tvm::isnan)
@@ -1209,8 +1258,11 @@ TVM_FFI_STATIC_INIT_BLOCK() {
       .def("tirx.round", tvm::round)
       .def("tirx.nearbyint", tvm::nearbyint)
       .def("tirx.trunc", tvm::trunc)
-      .def("tirx._cast", tvm::cast)
-      .def("tirx.reinterpret", tvm::reinterpret);
+      .def("tirx._cast",
+           [](PrimType dtype, PrimExpr value, Span span) { return tvm::cast(dtype, value, span); })
+      .def("tirx.reinterpret", [](PrimType dtype, PrimExpr value, Span span) {
+        return tvm::reinterpret(dtype, value, span);
+      });
 }
 
 // operator overloading, smarter than make
@@ -1269,24 +1321,25 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 }
 
 PrimExpr fast_erf_float_expr(PrimExpr arg, int bits) {
-  auto plus_4 = FloatImm(DataType::Float(bits), 4.f);
-  auto minus_4 = FloatImm(DataType::Float(bits), -4.f);
+  PrimType fp_ty = PrimType::Float(bits);
+  auto plus_4 = FloatImm(fp_ty, 4.f);
+  auto minus_4 = FloatImm(fp_ty, -4.f);
 
   // The monomial coefficients of the numerator polynomial (odd).
-  auto alpha_1 = FloatImm(DataType::Float(bits), -1.60960333262415e-02f);
-  auto alpha_3 = FloatImm(DataType::Float(bits), -2.95459980854025e-03f);
-  auto alpha_5 = FloatImm(DataType::Float(bits), -7.34990630326855e-04f);
-  auto alpha_7 = FloatImm(DataType::Float(bits), -5.69250639462346e-05f);
-  auto alpha_9 = FloatImm(DataType::Float(bits), -2.10102402082508e-06f);
-  auto alpha_11 = FloatImm(DataType::Float(bits), 2.77068142495902e-08f);
-  auto alpha_13 = FloatImm(DataType::Float(bits), -2.72614225801306e-10f);
+  auto alpha_1 = FloatImm(fp_ty, -1.60960333262415e-02f);
+  auto alpha_3 = FloatImm(fp_ty, -2.95459980854025e-03f);
+  auto alpha_5 = FloatImm(fp_ty, -7.34990630326855e-04f);
+  auto alpha_7 = FloatImm(fp_ty, -5.69250639462346e-05f);
+  auto alpha_9 = FloatImm(fp_ty, -2.10102402082508e-06f);
+  auto alpha_11 = FloatImm(fp_ty, 2.77068142495902e-08f);
+  auto alpha_13 = FloatImm(fp_ty, -2.72614225801306e-10f);
 
   // The monomial coefficients of the denominator polynomial (even).
-  auto beta_0 = FloatImm(DataType::Float(bits), -1.42647390514189e-02f);
-  auto beta_2 = FloatImm(DataType::Float(bits), -7.37332916720468e-03f);
-  auto beta_4 = FloatImm(DataType::Float(bits), -1.68282697438203e-03f);
-  auto beta_6 = FloatImm(DataType::Float(bits), -2.13374055278905e-04f);
-  auto beta_8 = FloatImm(DataType::Float(bits), -1.45660718464996e-05f);
+  auto beta_0 = FloatImm(fp_ty, -1.42647390514189e-02f);
+  auto beta_2 = FloatImm(fp_ty, -7.37332916720468e-03f);
+  auto beta_4 = FloatImm(fp_ty, -1.68282697438203e-03f);
+  auto beta_6 = FloatImm(fp_ty, -2.13374055278905e-04f);
+  auto beta_8 = FloatImm(fp_ty, -1.45660718464996e-05f);
 
   // clamp x
   auto x = tvm::max(tvm::min(arg, plus_4), minus_4);
@@ -1340,18 +1393,20 @@ int ExtractInt(const ffi::PackedArgs& args, int index) {
   }
 }
 
-PrimExpr PrintOpPacked(Var data, DataType dtype, bool is_string, bool is_scalar, int dim_num,
+PrimExpr PrintOpPacked(Var data, DLDataType dtype, bool is_string, bool is_scalar, int dim_num,
                        ffi::Array<PrimExpr> shape) {
+  PrimType value_ty(dtype);
+  PrimType u32_ty = PrimType::UInt(32);
   ffi::Array<PrimExpr> args;
   args.push_back(data);
   args.push_back(tirx::StringImm(ffi::DLDataTypeToString(dtype)));
   args.push_back(IntImm::Bool(is_string));
   args.push_back(IntImm::Bool(is_scalar));
-  args.push_back(IntImm(DataType::UInt(32), dim_num));
+  args.push_back(IntImm(u32_ty, dim_num));
   for (const auto& dim : shape) {
     args.push_back(dim);
   }
-  return tirx::Call(dtype, tirx::builtin::print_buffer(), args);
+  return tirx::Call(value_ty, tirx::builtin::print_buffer(), args);
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -1359,7 +1414,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   refl::GlobalDef().def_packed("tirx.print_buffer", [](ffi::PackedArgs args, ffi::Any* ret) {
     // Expected arguments:
     // args[0]: buffer_var (Var)
-    // args[1]: dtype (DataType)
+    // args[1]: dtype (DLDataType)
     // args[2]: is_string (bool or IntImm)
     // args[3]: is_scalar (bool or IntImm)
     // args[4]: dim_num (int or IntImm)
@@ -1368,7 +1423,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
     TVM_FFI_ICHECK_GE(args.size(), 5) << "print_buffer expects at least 5 arguments";
 
     Var buffer_var = args[0].cast<Var>();
-    DataType dtype = args[1].cast<DataType>();
+    DLDataType dtype = args[1].cast<DLDataType>();
     bool is_string = ExtractBool(args, 2);
     bool is_scalar = ExtractBool(args, 3);
     int dim_num = ExtractInt(args, 4);
diff --git a/src/tirx/script/builder/ir.cc b/src/tirx/script/builder/ir.cc
index a75025a0ddd1..a732a14958b7 100644
--- a/src/tirx/script/builder/ir.cc
+++ b/src/tirx/script/builder/ir.cc
@@ -41,7 +41,7 @@ namespace tirx {
 using tvm::tirx::IterVar;
 using tvm::tirx::Layout;
 
-Buffer BufferDecl(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String buffer_name,
+Buffer BufferDecl(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String buffer_name,
                   ffi::Optional<Var> data, ffi::Optional<ffi::Array<PrimExpr>> strides,
                   ffi::Optional<PrimExpr> elem_offset, ffi::String storage_scope, int align,
                   int offset_factor, ffi::String buffer_type,
@@ -57,16 +57,16 @@ Buffer BufferDecl(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String buffer
   }
   Var buffer_data;
   if (!data.defined()) {
-    DataType storage_dtype = dtype;
-    if (storage_dtype == DataType::Bool()) {
-      storage_dtype = DataType::Int(8);
+    DLDataType storage_dtype = dtype->dtype;
+    if (storage_dtype == DLDataType{kDLBool, 8, 1}) {
+      storage_dtype = DLDataType{kDLInt, 8, 1};
     }
     buffer_data = tvm::tirx::Var(buffer_name, PointerType(PrimType(storage_dtype), storage_scope));
   } else {
     buffer_data = data.value();
   }
   if (!elem_offset.defined() && offset_factor) {
-    DataType shape_dtype = shape.empty() ? DataType::Int(32) : shape[0]->dtype;
+    PrimType shape_dtype = shape.empty() ? PrimType::Int(32) : shape[0].ty();
     elem_offset = tvm::tirx::Var("elem_offset", shape_dtype);
   }
   return Buffer(buffer_data, dtype, shape, strides.value_or(ffi::Array<PrimExpr>()),
@@ -100,7 +100,7 @@ Var Arg(ffi::String name, Var var) {
 Buffer Arg(ffi::String name, Buffer buffer) {
   PrimFuncFrame frame = FindPrimFuncFrame("T.Arg");
   details::Namer::Name(buffer, name);
-  Var handle(buffer->name + "_handle", DataType::Handle());
+  Var handle(buffer->name + "_handle", PrimType::Handle());
   frame->args.push_back(handle);
   frame->buffer_map.Set(handle, buffer);
   return buffer;
@@ -148,7 +148,7 @@ tvm::Type FuncRet(tvm::Type ret_type) {
   return ret_type;
 }
 
-Buffer MatchBuffer(ffi::ObjectRef param, ffi::Array<PrimExpr> shape, DataType dtype,
+Buffer MatchBuffer(ffi::ObjectRef param, ffi::Array<PrimExpr> shape, PrimType dtype,
                    ffi::Optional<Var> data, ffi::Array<PrimExpr> strides, PrimExpr elem_offset,
                    ffi::String storage_scope, int align, int offset_factor,
                    ffi::String buffer_type_str, ffi::Optional<ffi::Array<IntImm>> axis_separators,
@@ -367,7 +367,7 @@ void BlockAttrs(ffi::Map<ffi::String, Any> attrs) {
 }
 
 ffi::Variant<Buffer, AllocBufferFrame> SBlockAllocBuffer(
-    ffi::Array<PrimExpr> shape, DataType dtype, ffi::Optional<Var> data,
+    ffi::Array<PrimExpr> shape, PrimType dtype, ffi::Optional<Var> data,
     ffi::Array<PrimExpr> strides, PrimExpr elem_offset, ffi::String storage_scope, int align,
     int offset_factor, ffi::String buffer_type_str,
     ffi::Optional<ffi::Array<IntImm>> axis_separators, ffi::Optional<Layout> layout,
@@ -418,14 +418,17 @@ IterVar PushBlockVar(IterVar iter_var, PrimExpr binding) {
   return iter_var;
 }
 
-#define TVM_TIRX_IR_BUILDER_AXIS(Method, Kind, Name)                                          \
-  Var Method(Range dom, PrimExpr binding, DataType dtype) {                                   \
-    TVM_FFI_ICHECK(dom.defined()) << Name << " axis must have a domain";                      \
-    int bits = std::max({dom->min.dtype().bits(), dom->extent.dtype().bits(), dtype.bits()}); \
-    return PushBlockVar(IterVar(/*dom=*/dom, /*var=*/Var("", dtype.with_bits(bits)),          \
-                                /*iter_type=*/Kind, /*thread_tag=*/""),                       \
-                        binding)                                                              \
-        ->var;                                                                                \
+#define TVM_TIRX_IR_BUILDER_AXIS(Method, Kind, Name)                                      \
+  Var Method(Range dom, PrimExpr binding, PrimType dtype) {                               \
+    TVM_FFI_ICHECK(dom.defined()) << Name << " axis must have a domain";                  \
+    PrimType min_ty = dom->min.ty();                                                      \
+    PrimType extent_ty = dom->extent.ty();                                                \
+    int bits = std::max({min_ty.bits(), extent_ty.bits(), dtype.bits()});                 \
+    PrimType var_ty = dtype.WithBits(bits);                                               \
+    return PushBlockVar(IterVar(/*dom=*/dom, /*var=*/Var("", var_ty), /*iter_type=*/Kind, \
+                                /*thread_tag=*/""),                                       \
+                        binding)                                                          \
+        ->var;                                                                            \
   }
 TVM_TIRX_IR_BUILDER_AXIS(Spatial, tvm::tirx::IterVarType::kDataPar, "Spatial");
 TVM_TIRX_IR_BUILDER_AXIS(Reduce, tvm::tirx::IterVarType::kCommReduce, "Reduction");
@@ -433,7 +436,7 @@ TVM_TIRX_IR_BUILDER_AXIS(Scan, tvm::tirx::IterVarType::kOrdered, "Scan");
 TVM_TIRX_IR_BUILDER_AXIS(Opaque, tvm::tirx::IterVarType::kOpaque, "Opaque");
 #undef TVM_TIRX_IR_BUILDER_AXIS
 
-ffi::Array<Var> Remap(ffi::String kinds, ffi::Array<PrimExpr> bindings, DataType dtype) {
+ffi::Array<Var> Remap(ffi::String kinds, ffi::Array<PrimExpr> bindings, PrimType dtype) {
   using namespace tvm::tirx;
   ffi::Array<Var> results;
   TVM_FFI_ICHECK_EQ(kinds.size(), bindings.size());
@@ -462,7 +465,7 @@ ffi::Array<Var> Remap(ffi::String kinds, ffi::Array<PrimExpr> bindings, DataType
     }
     TVM_FFI_ICHECK(dom.defined()) << "TypeError: Variable is not in the loop: "
                                   << ffi::GetRef<Var>(v);
-    DataType dtype = v->dtype;
+    PrimType dtype = v->ty();
     if (c == 'S') {
       results.push_back(PushBlockVar(IterVar(/*dom=*/dom,
                                              /*var=*/Var("", dtype),
@@ -493,8 +496,10 @@ ffi::Array<Var> Remap(ffi::String kinds, ffi::Array<PrimExpr> bindings, DataType
     PrimExpr min = start;                                                                     \
     PrimExpr extent = arith::Analyzer()->Simplify(stop - start);                              \
     ffi::ObjectPtr<ForFrameNode> n = ffi::make_object<ForFrameNode>();                        \
-    int bits = std::max(min.dtype().bits(), extent.dtype().bits());                           \
-    n->vars = {Var("v", DataType(min.dtype().code(), bits, 1))};                              \
+    PrimType min_ty = min.ty();                                                               \
+    PrimType extent_ty = extent.ty();                                                         \
+    int bits = std::max(min_ty.bits(), extent_ty.bits());                                     \
+    n->vars = {Var("v", min_ty.WithBits(bits).WithLanes(1))};                                 \
     n->doms = {Range::FromMinExtent(min, extent)};                                            \
     n->steps = {step};                                                                        \
     n->f_make_for_loop = [annotations](ffi::Array<Var> vars, ffi::Array<Range> doms,          \
@@ -522,8 +527,10 @@ ForFrame ThreadBinding(PrimExpr start, PrimExpr stop, ffi::String thread,
   PrimExpr min = start;
   PrimExpr extent = arith::Analyzer()->Simplify(stop - start);
   ffi::ObjectPtr<ForFrameNode> n = ffi::make_object<ForFrameNode>();
-  int bits = std::max(min.dtype().bits(), extent.dtype().bits());
-  DataType dtype = DataType(min.dtype().code(), bits, 1);
+  PrimType min_ty = min.ty();
+  PrimType extent_ty = extent.ty();
+  int bits = std::max(min_ty.bits(), extent_ty.bits());
+  PrimType dtype = min_ty.WithBits(bits).WithLanes(1);
   n->vars = {Var("v", dtype)};
   n->doms = {Range::FromMinExtent(min, extent)};
   n->steps = {std::nullopt};
@@ -549,12 +556,12 @@ ForFrame Grid(ffi::Array<ffi::Variant<PrimExpr, ffi::Tuple<PrimExpr, PrimExpr>>>
   for (const auto& extent : extents) {
     if (auto prim_expr = extent.as<PrimExpr>()) {
       // extent is a single PrimExpr
-      DataType dtype = prim_expr.value().dtype();
+      PrimType dtype = prim_expr.value().ty();
       n->vars.push_back(Var("v", dtype));
       n->doms.push_back(Range(tvm::IntImm(dtype, 0), prim_expr.value()));
     } else if (auto tuple = extent.as<ffi::Tuple<PrimExpr, PrimExpr>>()) {
       // extent is a tuple of two PrimExpr (start, extent)
-      DataType dtype = tuple.value().get<0>().dtype();
+      PrimType dtype = tuple.value().get<0>().ty();
       n->vars.push_back(Var("v", dtype));
       n->doms.push_back(Range::FromMinExtent(tuple.value().get<0>(), tuple.value().get<1>()));
     } else {
@@ -598,7 +605,7 @@ Var Bind(PrimExpr value, ffi::Optional<Type> type_annotation, ffi::Optional<Var>
     } else if (type_annotation.defined()) {
       return Var("v", type_annotation.value());
     } else {
-      return Var("v", value.dtype());
+      return Var("v", value.ty());
     }
   }();
   AddToParent(tvm::tirx::Bind(bind_var, value));
@@ -621,7 +628,7 @@ LaunchThreadFrame LaunchThread(Var var, PrimExpr extent) {
   ffi::ObjectPtr<LaunchThreadFrameNode> n = ffi::make_object<LaunchThreadFrameNode>();
   if (!iter_var->dom.defined()) {
     const_cast<tvm::tirx::IterVarNode*>(iter_var.get())->dom =
-        Range(tvm::IntImm(extent.dtype(), 0), extent);
+        Range(tvm::IntImm(extent.ty(), 0), extent);
   } else if (!arith::Analyzer()->CanProveEqual(iter_var->dom->extent, extent)) {
     TVM_FFI_THROW(InternalError) << "ValueError: Inconsistent extents of environment thread. "
                                  << iter_var->dom->extent << " vs " << extent;
@@ -633,7 +640,7 @@ LaunchThreadFrame LaunchThread(Var var, PrimExpr extent) {
 }
 
 LaunchThreadFrame LaunchThread(ffi::String thread_tag, PrimExpr extent) {
-  return LaunchThread(EnvThread(thread_tag, extent.dtype()), extent);
+  return LaunchThread(EnvThread(thread_tag, extent.ty()), extent);
 }
 
 AttrFrame Attr(ffi::Any node, ffi::String attr_key, PrimExpr value) {
@@ -721,7 +728,7 @@ ComposeOpFrame ComposeOp(ffi::Map<ffi::String, Buffer> workspace,
   return ComposeOpFrame(n);
 }
 
-Var EnvThread(ffi::String thread_tag, DataType dtype) {
+Var EnvThread(ffi::String thread_tag, PrimType dtype) {
   IterVar iter_var(Range{nullptr}, Var("", dtype), tvm::tirx::IterVarType::kThreadIndex,
                    thread_tag);
   Var var = iter_var->var;
@@ -735,9 +742,10 @@ Var EnvThread(ffi::String thread_tag, DataType dtype) {
 
 void BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> indices,
                  ffi::Optional<PrimExpr> predicate = std::nullopt) {
-  runtime::DataType buffer_dtype = buffer->dtype;
-  bool is_index_scalable = indices.empty() ? false : indices.back().dtype().is_scalable_vector();
-  bool is_buffer_dtype_scalable = buffer_dtype.is_scalable_vector();
+  PrimType buffer_dtype = buffer->dtype;
+  PrimType index_ty = indices.empty() ? PrimType::Int(32) : indices.back().ty();
+  bool is_index_scalable = !indices.empty() && index_ty.IsScalableVector();
+  bool is_buffer_dtype_scalable = buffer_dtype.IsScalableVector();
 
   TVM_FFI_ICHECK(!(is_index_scalable && is_buffer_dtype_scalable))
       << "Index dtype and buffer dtype can't both be scalable.";
@@ -746,29 +754,30 @@ void BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> indices,
   if (indices.empty()) {
     index_lanes = 1;
   } else if (is_index_scalable) {
-    index_lanes = indices.back().dtype().vscale_factor();
+    index_lanes = index_ty.VScaleFactor();
   } else {
-    index_lanes = indices.back().dtype().lanes();
+    index_lanes = index_ty.lanes();
   }
 
-  int buffer_lanes = is_buffer_dtype_scalable ? buffer_dtype.vscale_factor() : buffer_dtype.lanes();
+  int buffer_lanes = is_buffer_dtype_scalable ? buffer_dtype.VScaleFactor() : buffer_dtype.lanes();
 
-  runtime::DataType lhs_dtype;
+  PrimType lhs_dtype = buffer_dtype;
   if (is_buffer_dtype_scalable || is_index_scalable) {
-    lhs_dtype = buffer_dtype.with_scalable_vscale_factor(buffer_lanes * index_lanes);
+    lhs_dtype = PrimType::ScalableVector(buffer_dtype.code(), buffer_dtype.bits(),
+                                         buffer_lanes * index_lanes);
   } else {
-    lhs_dtype = buffer_dtype.with_lanes(buffer_dtype.lanes() * index_lanes);
+    lhs_dtype = buffer_dtype.WithLanes(buffer_dtype.lanes() * index_lanes);
   }
 
-  runtime::DataType rhs_dtype = value->dtype;
+  PrimType rhs_dtype = value.ty();
 
   if (lhs_dtype != rhs_dtype) {
-    TVM_FFI_ICHECK(lhs_dtype.is_scalable_vector() == rhs_dtype.is_scalable_vector())
+    TVM_FFI_ICHECK(lhs_dtype.IsScalableVector() == rhs_dtype.IsScalableVector())
         << "Can't mix scalable and fixed length vectors in a statement";
 
     bool lanes_match = false;
-    if (lhs_dtype.is_scalable_vector()) {
-      lanes_match = lhs_dtype.vscale_factor() == rhs_dtype.vscale_factor();
+    if (lhs_dtype.IsScalableVector()) {
+      lanes_match = lhs_dtype.VScaleFactor() == rhs_dtype.VScaleFactor();
     } else {
       lanes_match = lhs_dtype.lanes() == rhs_dtype.lanes();
     }
@@ -781,14 +790,13 @@ void BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> indices,
     if (lhs_dtype.code() != rhs_dtype.code()) {
       if (
           // Case 1. lhs is handle, and rhs needs to be casted to handle.
-          (lhs_dtype.code() == runtime::DataType::kHandle) ||
+          (lhs_dtype.code() == DLDataTypeCode::kDLOpaqueHandle) ||
           // Case 2. rhs is handle, and it needs to be casted to non-handle.
-          (rhs_dtype.code() == runtime::DataType::kHandle) ||
+          (rhs_dtype.code() == DLDataTypeCode::kDLOpaqueHandle) ||
           // Case 3. rhs is float or bfloat, and casting to non-float can lose precision.
-          ((lhs_dtype.code() == runtime::DataType::kInt ||
-            lhs_dtype.code() == runtime::DataType::kUInt) &&
-           (rhs_dtype.code() == runtime::DataType::kFloat ||
-            rhs_dtype.code() == runtime::DataType::kBFloat))) {
+          ((lhs_dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) &&
+           (rhs_dtype.code() == DLDataTypeCode::kDLFloat ||
+            rhs_dtype.code() == DLDataTypeCode::kDLBfloat))) {
         LOG(WARNING) << "Casting in BufferStore may lose precision"
                      << ": LHS is `" << lhs_dtype << "`, RHS is `" << rhs_dtype
                      << "`, indexing lanes: " << index_lanes;
@@ -799,7 +807,7 @@ void BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> indices,
   AddToParent(tvm::tirx::BufferStore(buffer, value, indices, predicate));
 }
 
-DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String buffer_name,
+DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String buffer_name,
                            ffi::Optional<Var> data, ffi::Optional<ffi::Array<PrimExpr>> strides,
                            ffi::Optional<PrimExpr> elem_offset, ffi::String storage_scope,
                            int align, int offset_factor, ffi::String buffer_type,
@@ -841,7 +849,7 @@ DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::Stri
   return DeclBufferFrame(n);
 }
 
-Buffer AllocBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String storage_scope,
+Buffer AllocBuffer(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String storage_scope,
                    ffi::Optional<ffi::Map<ffi::String, ffi::Any>> annotations) {
   Buffer buffer = BufferDecl(shape, dtype, "", std::nullopt, std::nullopt, std::nullopt,
                              storage_scope, 0, 0, "", std::nullopt);
@@ -852,8 +860,7 @@ Buffer AllocBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String stora
 
 void Evaluate(PrimExpr value) { AddToParent(tvm::tirx::Evaluate(value)); }
 
-PrimExpr Ptr(runtime::DataType dtype, ffi::String storage_scope = "global",
-             bool is_size_var = false) {
+PrimExpr Ptr(DLDataType dtype, ffi::String storage_scope = "global", bool is_size_var = false) {
   PointerType type_annotation(PrimType(dtype), storage_scope);
   return is_size_var ? tvm::tirx::SizeVar("", type_annotation)
                      : tvm::tirx::Var("", type_annotation);
@@ -922,7 +929,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef()
       .def("script.ir_builder.tirx.Buffer",
-           static_cast<Buffer (*)(ffi::Array<PrimExpr>, DataType, ffi::String, ffi::Optional<Var>,
+           static_cast<Buffer (*)(ffi::Array<PrimExpr>, PrimType, ffi::String, ffi::Optional<Var>,
                                   ffi::Optional<ffi::Array<PrimExpr>>, ffi::Optional<PrimExpr>,
                                   ffi::String, int, int, ffi::String,
                                   ffi::Optional<ffi::Array<IntImm>>, ffi::Optional<Layout>,
diff --git a/src/tirx/script/builder/utils.h b/src/tirx/script/builder/utils.h
index 4d7821a84d5a..a8cd6e5b496b 100644
--- a/src/tirx/script/builder/utils.h
+++ b/src/tirx/script/builder/utils.h
@@ -129,7 +129,7 @@ inline IfFrame FindIfFrame(const ffi::String& method) {
 inline tvm::tirx::BufferRegion BufferRegionFromLoad(tvm::tirx::BufferLoad buffer_load) {
   ffi::Array<Range> ranges;
   for (const PrimExpr& index : buffer_load->indices) {
-    ranges.push_back(Range::FromMinExtent(index, IntImm(index->dtype, 1)));
+    ranges.push_back(Range::FromMinExtent(index, IntImm(index.ty(), 1)));
   }
   return tvm::tirx::BufferRegion(buffer_load->buffer, ranges);
 }
diff --git a/src/tirx/script/printer/block.cc b/src/tirx/script/printer/block.cc
index 6d7902a4a89f..71fc0b2e7ecb 100644
--- a/src/tirx/script/printer/block.cc
+++ b/src/tirx/script/printer/block.cc
@@ -149,7 +149,10 @@ Doc PrintBlock(IRDocsifier d, tirx::SBlock block, AccessPath block_p,  //
 
   // Step 2. Handle block predicate
   if (realize) {
-    TVM_FFI_ICHECK(realize->predicate.defined() && realize->predicate->dtype.is_bool());
+    // Check definedness before reading the type, as the short-circuit in the old check did.
+    TVM_FFI_ICHECK(realize->predicate.defined());
+    PrimType predicate_ty = realize->predicate.ty();
+    TVM_FFI_ICHECK(predicate_ty.MatchesCode(DLDataTypeCode::kDLBool));
     if (!tirx::is_one(realize->predicate)) {
       (*frame)->stmts.push_back(ExprStmtDoc(
           TIR(d, "where")
diff --git a/src/tirx/script/printer/buffer.cc b/src/tirx/script/printer/buffer.cc
index 6dd24e6b9a3c..015c3685817e 100644
--- a/src/tirx/script/printer/buffer.cc
+++ b/src/tirx/script/printer/buffer.cc
@@ -93,9 +93,9 @@ ffi::Map<ffi::String, ExprDoc> BufferAttrs(tirx::Buffer buffer, const AccessPath
   }
   // Step 2. Handle `buffer.dtype`
   {
-    DataType default_buf_dtype = d->cfg->buffer_dtype;
-    if (buffer->dtype != default_buf_dtype) {
-      kwargs.Set("dtype", LiteralDoc::DataType(buffer->dtype, buffer_p->Attr("dtype")));
+    DLDataType default_buf_dtype = d->cfg->buffer_dtype;
+    if (buffer->dtype->dtype != default_buf_dtype) {
+      kwargs.Set("dtype", LiteralDoc::DataType(buffer->dtype->dtype, buffer_p->Attr("dtype")));
     }
   }
   // Step 3. Handle `buffer.data`
@@ -145,7 +145,7 @@ ffi::Map<ffi::String, ExprDoc> BufferAttrs(tirx::Buffer buffer, const AccessPath
   // Step 5. Handle `buffer.elem_offset`
   bool needs_print_factor = false;
   if (const auto* int_imm = buffer->elem_offset.as<IntImmNode>()) {
-    if (int_imm->value != 0 || int_imm->dtype != buffer->DefaultIndexType()) {
+    if (int_imm->value != 0 || int_imm->ty()->dtype != buffer->DefaultIndexType()) {
       kwargs.Set("elem_offset",
                  d->AsDoc<ExprDoc>(buffer->elem_offset,  //
                                    buffer_p->Attr("elem_offset")));
@@ -329,7 +329,7 @@ ExprDoc BufferAttn(const tirx::Buffer& buffer, const AccessPath& p, const Frame&
       BufferAttrs(buffer, p, frame, d, BufferVarDefinition::DataPointer);
   ExprDoc shape = attrs.Get("shape").value();
   ExprDoc dtype =
-      attrs.Get("dtype").value_or(LiteralDoc::DataType(buffer->dtype, p->Attr("dtype")));
+      attrs.Get("dtype").value_or(LiteralDoc::DataType(buffer->dtype->dtype, p->Attr("dtype")));
   return TIR(d, "Buffer")->Call({shape, dtype}, {}, {});
 }
 
diff --git a/src/tirx/script/printer/expr.cc b/src/tirx/script/printer/expr.cc
index 32a6251d54d3..1d2168b13a03 100644
--- a/src/tirx/script/printer/expr.cc
+++ b/src/tirx/script/printer/expr.cc
@@ -54,7 +54,7 @@ ExprDoc PrintVarCreation(const tirx::Var& var, const AccessPath& var_p, const IR
       rhs = TIR(d, "TensorMap")->Call({}, {}, {});
     }
   } else {
-    rhs = TIR(d, DType2Str(var->dtype));
+    rhs = TIR(d, DType2Str(var.ty()->dtype));
     rhs->source_paths.push_back(var_p->Attr("dtype"));
     rhs = rhs->Call({}, kwargs_keys, kwargs_values);
   }
@@ -121,7 +121,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tirx::Cast>("", [](tirx::Cast cast, AccessPath p, IRDocsifier d) -> Doc {
-      ExprDoc dtype = LiteralDoc::DataType(cast->dtype, p->Attr("dtype"));
+      ExprDoc dtype = LiteralDoc::DataType(cast.ty()->dtype, p->Attr("dtype"));
       ExprDoc value = d->AsDoc<ExprDoc>(cast->value, p->Attr("value"));
       return TIR(d, "Cast")->Call({dtype, value});
     });
@@ -258,6 +258,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tirx::Call>("", [](tirx::Call call, AccessPath call_p, IRDocsifier d) -> Doc {
+      DLDataType call_dtype = call.ty()->dtype;
       if (call->attrs.defined()) {
         ffi::Array<ExprDoc> call_args;
         int n_args = call->args.size();
@@ -269,7 +270,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
                              ? LiteralDoc::Str(call->op.as<Op>().value()->name, call_p->Attr("op"))
                              : d->AsDoc<ExprDoc>(call->op, call_p->Attr("op"));
         return TIR(d, "Call")->Call(
-            {LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")), op_doc, ListDoc(call_args)},
+            {LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")), op_doc, ListDoc(call_args)},
             {"attrs"}, {d->AsDoc<ExprDoc>(call->attrs, call_p->Attr("attrs"))});
       }
       static const OpAttrMap<tirx::TScriptPrinterName>& op_names =
@@ -297,7 +298,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
           ffi::Array<ExprDoc> args;
           args.reserve(n_args + 1);
           if (dtype_print_location == tirx::ScriptDtypePrintLocation::kFirst) {
-            args.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")));
+            args.push_back(LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")));
           }
 
           for (int i = 0; i < n_args; ++i) {
@@ -309,7 +310,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
             }
           }
           if (dtype_print_location == tirx::ScriptDtypePrintLocation::kLast) {
-            args.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")));
+            args.push_back(LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")));
           }
           return prefix.value()->Call(args);
         }
@@ -334,9 +335,9 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
           kw_keys.push_back("source_code");
           kw_vals.push_back(src);
           // If non-void return type, print return_type keyword.
-          if (call->dtype != DataType::Void()) {
+          if (!call.ty().IsVoid()) {
             kw_keys.push_back("return_type");
-            kw_vals.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")));
+            kw_vals.push_back(LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")));
           }
           return prefix.value()->Call(args, kw_keys, kw_vals);
         }
@@ -349,14 +350,14 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
       int n_args = call->args.size();
       args.reserve(n_args + 1);
       if (dtype_print_location == tirx::ScriptDtypePrintLocation::kFirst) {
-        args.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")));
+        args.push_back(LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")));
       }
 
       for (int i = 0; i < n_args; ++i) {
         args.push_back(d->AsDoc<ExprDoc>(call->args[i], call_p->Attr("args")->ArrayItem(i)));
       }
       if (dtype_print_location == tirx::ScriptDtypePrintLocation::kLast) {
-        args.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")));
+        args.push_back(LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")));
       }
       return prefix.value()->Call(args);
     });
@@ -391,8 +392,10 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
       if (!ret->IsInstance<tirx::DivNode>()) {
         return TIR(d, "Div")->Call({a, b});
       }
-      if ((node->a->dtype.is_int() || node->a->dtype.is_uint()) &&
-          (node->b->dtype.is_int() || node->b->dtype.is_uint())) {
+      PrimType a_ty = node->a.ty();
+      PrimType b_ty = node->b.ty();
+      if ((a_ty.code() == DLDataTypeCode::kDLInt || a_ty.code() == DLDataTypeCode::kDLUInt) &&
+          (b_ty.code() == DLDataTypeCode::kDLInt || b_ty.code() == DLDataTypeCode::kDLUInt)) {
         return TIR(d, "Div")->Call({a, b});
       }
       return OperationDoc(OperationDocNode::Kind::kDiv, {a, b});
diff --git a/src/tirx/script/printer/for_loop.cc b/src/tirx/script/printer/for_loop.cc
index 249e151b9774..a1edcb8fe5e7 100644
--- a/src/tirx/script/printer/for_loop.cc
+++ b/src/tirx/script/printer/for_loop.cc
@@ -34,8 +34,8 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
       };
       if (d->cfg->syntax_sugar) {
         for (const tirx::ForNode* l = loop.get(); l != nullptr; l = l->body.as<tirx::ForNode>()) {
-          TVM_FFI_ICHECK(l->loop_var->dtype == l->min->dtype);
-          TVM_FFI_ICHECK(l->loop_var->dtype == l->extent->dtype);
+          TVM_FFI_ICHECK(l->loop_var.ty()->dtype == l->min.ty()->dtype);
+          TVM_FFI_ICHECK(l->loop_var.ty()->dtype == l->extent.ty()->dtype);
           if (l->kind != tirx::ForKind::kSerial ||  //
               !tirx::is_zero(l->min) ||             //
               !l->annotations.empty() ||            //
diff --git a/src/tirx/script/printer/ir.cc b/src/tirx/script/printer/ir.cc
index d7817da8269d..d5d399a33d01 100644
--- a/src/tirx/script/printer/ir.cc
+++ b/src/tirx/script/printer/ir.cc
@@ -28,10 +28,10 @@ TVM_FFI_STATIC_INIT_BLOCK() { TIRFrameNode::RegisterReflection(); }
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<IntImm>("", [](IntImm imm, AccessPath imm_p, IRDocsifier d) -> Doc {
-      DataType dtype = imm->dtype;
+      DLDataType dtype = imm->ty()->dtype;
       if (dtype == d->cfg->int_dtype) {
         return LiteralDoc::Int(imm->value, imm_p->Attr("value"));
-      } else if (dtype == DataType::Bool()) {
+      } else if (dtype == DLDataType{kDLBool, 8, 1}) {
         return TIR(d, DType2Str(dtype))
             ->Call({LiteralDoc::Boolean(imm->value, imm_p->Attr("value"))});
       } else {
@@ -41,7 +41,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<FloatImm>("", [](FloatImm imm, AccessPath imm_p, IRDocsifier d) -> Doc {
-      DataType dtype = imm->dtype;
+      DLDataType dtype = imm->ty()->dtype;
       if (dtype == d->cfg->float_dtype) {
         return LiteralDoc::Float(imm->value, imm_p->Attr("value"));
       } else {
diff --git a/src/tirx/script/printer/stmt.cc b/src/tirx/script/printer/stmt.cc
index 0aa697175355..392b1c7c86da 100644
--- a/src/tirx/script/printer/stmt.cc
+++ b/src/tirx/script/printer/stmt.cc
@@ -502,7 +502,7 @@ ffi::Optional<ExprDoc> TryDeclBufferSugarWithParent(const tirx::Buffer& child, c
     }
     if (shapes_compatible) {
       ExprDoc dtype_doc =
-          LiteralDoc::Str(DType2Str(child->dtype), p->Attr("buffer")->Attr("dtype"));
+          LiteralDoc::Str(DType2Str(child->dtype->dtype), p->Attr("buffer")->Attr("dtype"));
       return pdoc->Attr("view")->Call({dtype_doc});
     }
   }
@@ -723,7 +723,7 @@ Doc AllocBufferDoc(tirx::AllocBuffer stmt, AccessPath p, IRDocsifier d) {
       d->Define(stmt->buffer->data, d->frames.back(),
                 [d, buf, p]() { return d->AsDoc<ExprDoc>(buf, p->Attr("buffer"))->Attr("data"); });
     }
-    ExprDoc type_ann = TIR(d, DType2Str(stmt->buffer->dtype));
+    ExprDoc type_ann = TIR(d, DType2Str(stmt->buffer->dtype->dtype));
     return AssignDoc(lhs, std::nullopt, type_ann);
   }
   ExprDoc rhs = BufferDecl(stmt->buffer, "alloc_buffer", {}, p->Attr("buffer"), d->frames.back(), d,
@@ -814,7 +814,7 @@ ExprDoc DocsifyLaunchThread(const tirx::AttrStmt& attr_stmt, const AccessPath& a
 /*! \brief Check whether an AttrStmt has node=IntImm(int32, 0) (the dict-attr pattern). */
 static bool IsDictAttrPattern(const tirx::AttrStmt& stmt) {
   if (auto int_imm = stmt->node.as<IntImmNode>()) {
-    return int_imm->dtype == DataType::Int(32) && int_imm->value == 0;
+    return int_imm->ty()->dtype == DLDataType{kDLInt, 32, 1} && int_imm->value == 0;
   }
   return false;
 }
diff --git a/src/tirx/transform/common_subexpr_elim.cc b/src/tirx/transform/common_subexpr_elim.cc
index 2221df935226..8bca3931cb10 100644
--- a/src/tirx/transform/common_subexpr_elim.cc
+++ b/src/tirx/transform/common_subexpr_elim.cc
@@ -296,7 +296,8 @@ class CSEPlanner : public StmtExprVisitor {
     // the predicate directly. BoolImm is already filtered above as an IntImm
     // leaf, so this rule only affects compound bool expressions
     // (LT/LE/GT/GE/EQ/NE/And/Or/Not/Cast-to-bool/Select-of-bool).
-    if (expr.dtype().is_bool()) return false;
+    PrimType expr_ty = expr.ty();
+    if (expr_ty.MatchesCode(DLDataTypeCode::kDLBool)) return false;
     if (CheckContains::ExprContains(expr, IsForbiddenNode)) return false;
     return true;
   }
@@ -662,7 +663,7 @@ class CSEPlanner : public StmtExprVisitor {
       // entry->repr may already contain CSE vars from shallower entries.
       ++counter;
       std::string name = "cse_v" + std::to_string(counter);
-      Var cse_var(name, entry->repr.dtype());
+      Var cse_var(name, entry->repr.ty());
       Stmt bind = Bind(cse_var, entry->repr);
 
       // Step 3c: Record in output tables.
diff --git a/src/tirx/transform/dtype_conversion.cc b/src/tirx/transform/dtype_conversion.cc
index 7cf1593d822b..08d70dab2b33 100644
--- a/src/tirx/transform/dtype_conversion.cc
+++ b/src/tirx/transform/dtype_conversion.cc
@@ -27,30 +27,41 @@ namespace tvm {
 namespace tirx {
 
 PrimExpr ReinterpretAsUInt(PrimExpr value) {
-  return reinterpret(GetStorageUIntDType(value.dtype()), value);
+  return reinterpret(GetStorageUIntDType(value.ty()), value);
 }
 
-DataType GetStorageUIntDType(DataType dtype) { return DataType::UInt(dtype.bits(), dtype.lanes()); }
+PrimType GetStorageUIntDType(PrimType dtype) {
+  if (dtype.IsScalableVector()) {
+    return PrimType::ScalableVector(DLDataTypeCode::kDLUInt, dtype.bits(), dtype.VScaleFactor());
+  }
+  return PrimType::UInt(dtype.bits(), dtype.lanes());
+}
 
-PrimExpr DTypeConversion(PrimExpr src_value, DataType tgt_dtype, RoundingMode round_mode) {
-  DataType src_dtype = src_value.dtype();
+PrimExpr DTypeConversion(PrimExpr src_value, PrimType tgt_dtype, RoundingMode round_mode) {
+  PrimType src_dtype = src_value.ty();
   // Step 1: check dtype
   // The lanes of src dtype and target dtype must match.
-  TVM_FFI_ICHECK_EQ(src_dtype.lanes(), tgt_dtype.lanes())
+  TVM_FFI_ICHECK_EQ(src_dtype->dtype.lanes, tgt_dtype->dtype.lanes)
       << "The lanes for data type for source value must matches the target datatype.";
-  auto is_floating_point = [](DataType dtype) {
-    return dtype.is_float() || dtype.is_bfloat16() || dtype.is_float8() || dtype.is_float6() ||
-           dtype.is_float4();
+  auto is_floating_point = [](PrimType dtype) {
+    DLDataTypeCode code = dtype.code();
+    return code == DLDataTypeCode::kDLFloat ||
+           (code == DLDataTypeCode::kDLBfloat && dtype.bits() == 16) ||
+           code == DLDataTypeCode::kDLFloat8_e3m4 || code == DLDataTypeCode::kDLFloat8_e4m3 ||
+           code == DLDataTypeCode::kDLFloat8_e4m3b11fnuz ||
+           code == DLDataTypeCode::kDLFloat8_e4m3fn || code == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+           code == DLDataTypeCode::kDLFloat8_e5m2 || code == DLDataTypeCode::kDLFloat8_e5m2fnuz ||
+           code == DLDataTypeCode::kDLFloat8_e8m0fnu || code == DLDataTypeCode::kDLFloat6_e2m3fn ||
+           code == DLDataTypeCode::kDLFloat6_e3m2fn || code == DLDataTypeCode::kDLFloat4_e2m1fn;
   };
   // Both source dtype and target dtype should be floating point.
   TVM_FFI_ICHECK(is_floating_point(src_dtype) && is_floating_point(tgt_dtype));
-  FloatConfig src_fp = FloatConfig::FromDataType(src_value.dtype()),
+  FloatConfig src_fp = FloatConfig::FromDataType(src_dtype),
               tgt_fp = FloatConfig::FromDataType(tgt_dtype);
   int exponent_delta = tgt_fp.exponent - src_fp.exponent;
   int bias_delta = tgt_fp.bias - src_fp.bias;
   int mantissa_delta = tgt_fp.mantissa - src_fp.mantissa;
-  DataType src_uint = GetStorageUIntDType(src_value.dtype()),
-           tgt_uint = GetStorageUIntDType(tgt_dtype);
+  PrimType src_uint = GetStorageUIntDType(src_dtype), tgt_uint = GetStorageUIntDType(tgt_dtype);
   PrimExpr src_uint_value = ReinterpretAsUInt(src_value);
   if (mantissa_delta < 0) {
     // use rounding
diff --git a/src/tirx/transform/dtype_conversion.h b/src/tirx/transform/dtype_conversion.h
index 21bd5bf355bd..d6026cf75fe6 100644
--- a/src/tirx/transform/dtype_conversion.h
+++ b/src/tirx/transform/dtype_conversion.h
@@ -98,12 +98,20 @@ class FloatConfig {
    * \param dtype The data type, must be a floating point.
    * \return The FloatConfig class containing internal floating point representation.
    */
-  static FloatConfig FromDataType(DataType dtype) {
-    TVM_FFI_ICHECK(dtype.is_float() || dtype.is_bfloat16() || dtype.is_float8() ||
-                   dtype.is_float6() || dtype.is_float4())
+  static FloatConfig FromDataType(PrimType dtype) {
+    DLDataTypeCode code = dtype.code();
+    TVM_FFI_ICHECK(
+        code == DLDataTypeCode::kDLFloat ||
+        (code == DLDataTypeCode::kDLBfloat && dtype.bits() == 16) ||
+        code == DLDataTypeCode::kDLFloat8_e3m4 || code == DLDataTypeCode::kDLFloat8_e4m3 ||
+        code == DLDataTypeCode::kDLFloat8_e4m3b11fnuz || code == DLDataTypeCode::kDLFloat8_e4m3fn ||
+        code == DLDataTypeCode::kDLFloat8_e4m3fnuz || code == DLDataTypeCode::kDLFloat8_e5m2 ||
+        code == DLDataTypeCode::kDLFloat8_e5m2fnuz || code == DLDataTypeCode::kDLFloat8_e8m0fnu ||
+        code == DLDataTypeCode::kDLFloat6_e2m3fn || code == DLDataTypeCode::kDLFloat6_e3m2fn ||
+        code == DLDataTypeCode::kDLFloat4_e2m1fn)
         << "FloatConfig is only applicable to floating point data types, got " << dtype
         << " instead.";
-    if (dtype.is_float()) {
+    if (code == DLDataTypeCode::kDLFloat) {
       // IEEE 754 binary formats
       // Reference: https://en.wikipedia.org/wiki/Floating-point_arithmetic
       switch (dtype.bits()) {
@@ -115,46 +123,53 @@ class FloatConfig {
           // float64
           return FloatConfig(11, 52, 1023, InftyStyle::kIEEE, NaNStyle::kIEEE);
       }
-    } else if (dtype.is_bfloat16()) {
+    } else if (code == DLDataTypeCode::kDLBfloat && dtype.bits() == 16) {
       // bfloat16,
       return FloatConfig(8, 7, 127, InftyStyle::kIEEE, NaNStyle::kIEEE);
-    } else if (dtype.is_float8()) {  // float8
+    } else if (code == DLDataTypeCode::kDLFloat8_e3m4 || code == DLDataTypeCode::kDLFloat8_e4m3 ||
+               code == DLDataTypeCode::kDLFloat8_e4m3b11fnuz ||
+               code == DLDataTypeCode::kDLFloat8_e4m3fn ||
+               code == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+               code == DLDataTypeCode::kDLFloat8_e5m2 ||
+               code == DLDataTypeCode::kDLFloat8_e5m2fnuz ||
+               code == DLDataTypeCode::kDLFloat8_e8m0fnu) {  // float8
       // NVIDIA/Arm/Intel's FP8 formats for Deep Learning
       // Reference: https://arxiv.org/abs/2209.05433
-      switch (dtype.code()) {
-        case DataType::kFloat8_e3m4:
+      switch (code) {
+        case DLDataTypeCode::kDLFloat8_e3m4:
           // E3M4 format, not consistent with IEEE-754
           return FloatConfig(3, 4, 3, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e4m3:
+        case DLDataTypeCode::kDLFloat8_e4m3:
           // E4M3 format, not consistent with IEEE-754
           return FloatConfig(4, 3, 7, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e4m3b11fnuz:
+        case DLDataTypeCode::kDLFloat8_e4m3b11fnuz:
           // E4M3 variant with b11 encoding, not consistent with IEEE-754
           return FloatConfig(4, 3, 7, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e4m3fn:
+        case DLDataTypeCode::kDLFloat8_e4m3fn:
           // E4M3 format, not consistent with IEEE-754
           return FloatConfig(4, 3, 7, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e4m3fnuz:
+        case DLDataTypeCode::kDLFloat8_e4m3fnuz:
           // UE4M3 format, not consistent with IEEE-754
           return FloatConfig(4, 3, 7, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e5m2:
+        case DLDataTypeCode::kDLFloat8_e5m2:
           // UE5M2 format, consistent with IEEE-754
           return FloatConfig(5, 2, 15, InftyStyle::kIEEE, NaNStyle::kIEEE);
-        case DataType::kFloat8_e5m2fnuz:
+        case DLDataTypeCode::kDLFloat8_e5m2fnuz:
           // UE5M2 format, not consistent with IEEE-754
           return FloatConfig(5, 2, 15, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e8m0fnu:
+        case DLDataTypeCode::kDLFloat8_e8m0fnu:
           // UE8M0 format, not consistent with IEEE-754
           return FloatConfig(8, 0, 127, InftyStyle::kNone, NaNStyle::kAllOnes);
         default:
           TVM_FFI_THROW(InternalError) << "Unknown float8 variant: " << dtype;
       }
-    } else if (dtype.is_float6()) {  // float6
-      switch (dtype.code()) {
-        case DataType::kFloat6_e2m3fn:
+    } else if (code == DLDataTypeCode::kDLFloat6_e2m3fn ||
+               code == DLDataTypeCode::kDLFloat6_e3m2fn) {  // float6
+      switch (code) {
+        case DLDataTypeCode::kDLFloat6_e2m3fn:
           // E2M3 format, not consistent with IEEE-754
           return FloatConfig(2, 3, 1, InftyStyle::kNone, NaNStyle::kNone);
-        case DataType::kFloat6_e3m2fn:
+        case DLDataTypeCode::kDLFloat6_e3m2fn:
           // E3M2 format, not consistent with IEEE-754
           return FloatConfig(3, 2, 3, InftyStyle::kNone, NaNStyle::kNone);
         default:
@@ -182,7 +197,7 @@ PrimExpr ReinterpretAsUInt(PrimExpr value);
  * \return The uint data type, the number of bits is
  *   the same as input dtype.
  */
-DataType GetStorageUIntDType(DataType dtype);
+PrimType GetStorageUIntDType(PrimType dtype);
 
 /*!
  * \brief Conversion routine from value stored in one floating point data type to another floating
@@ -193,7 +208,7 @@ DataType GetStorageUIntDType(DataType dtype);
  * \return The converted value in target floating point data type.
  * \note Used when there is no native data type conversion implementation.
  */
-PrimExpr DTypeConversion(PrimExpr src_value, DataType tgt_dtype,
+PrimExpr DTypeConversion(PrimExpr src_value, PrimType tgt_dtype,
                          RoundingMode round_mode = RoundingMode::kHalfToEven);
 
 }  // namespace tirx
diff --git a/src/tirx/transform/flatten_buffer.cc b/src/tirx/transform/flatten_buffer.cc
index c18a3bccb964..48e7edc4171f 100644
--- a/src/tirx/transform/flatten_buffer.cc
+++ b/src/tirx/transform/flatten_buffer.cc
@@ -24,6 +24,7 @@
 #include <tvm/arith/iter_affine_map.h>
 #include <tvm/ffi/cast.h>
 #include <tvm/ffi/reflection/registry.h>
+#include <tvm/ir/type.h>
 #include <tvm/tirx/analysis.h>
 #include <tvm/tirx/layout.h>
 #include <tvm/tirx/stmt_functor.h>
@@ -113,9 +114,9 @@ class BufferFlattener : public arith::IRMutatorWithAnalyzer {
 
     auto new_buf = GetFlattenedBuffer(node->buffer);
     // TODO(Lunderberg): Move the handling of boolean into a dedicated pass.
-    if (new_buf->dtype == DataType::Bool()) {
+    if (new_buf->dtype->dtype == DLDataType{kDLBool, 8, 1}) {
       auto writer = new_buf.CopyOnWrite();
-      writer->dtype = DataType::Int(8);
+      writer->dtype = PrimType::Int(8);
     }
     if (!node->buffer.same_as(new_buf)) {
       node.CopyOnWrite()->buffer = new_buf;
@@ -145,8 +146,8 @@ class BufferFlattener : public arith::IRMutatorWithAnalyzer {
 
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
-    if (flattened->dtype == DataType::Bool()) {
-      writer->dtype = DataType::Int(8);
+    if (flattened->dtype->dtype == DLDataType{kDLBool, 8, 1}) {
+      writer->dtype = PrimType::Int(8);
     }
     // canonicalize shape
     for (size_t i = 0; i < flattened->shape.size(); ++i) {
@@ -160,7 +161,8 @@ class BufferFlattener : public arith::IRMutatorWithAnalyzer {
 
   Stmt VisitStmt_(const BufferStoreNode* op) final {
     BufferStore store = StmtExprMutator::VisitStmt_(op).as_or_throw<BufferStore>();
-    bool store_returns_bool = (op->value.dtype() == DataType::Bool());
+    // Exact match: the int8 rewrite below is scalar-only, so vector bools must not qualify.
+    bool store_returns_bool = (op->value.ty() == PrimType::Bool());
     store = VisitBufferAccess(store);
 
     // Handle casts from the value's dtype to the dtype of the
@@ -168,27 +170,28 @@ class BufferFlattener : public arith::IRMutatorWithAnalyzer {
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
     if (store_returns_bool) {
-      TVM_FFI_ICHECK_EQ(store->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(store->buffer->dtype->dtype, (DLDataType{kDLInt, 8, 1}))
           << "Expected int8 backing array for boolean tensor";
       auto writer = store.CopyOnWrite();
-      writer->value = tvm::cast(DataType::Int(8), store->value);
+      writer->value = tvm::cast(PrimType::Int(8), store->value);
       return store;
     }
     return store;
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
-    bool load_returns_bool = (op->dtype == DataType::Bool());
+    // Exact match: the int8/bool rewrite below is scalar-only, so vector bools must not qualify.
+    bool load_returns_bool = (op->ty() == PrimType::Bool());
     BufferLoad load = StmtExprMutator::VisitExpr_(op).as_or_throw<BufferLoad>();
     load = VisitBufferAccess(load);
     // Handle casts from dtype of the backing array to value's dtype.
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
     if (load_returns_bool) {
-      TVM_FFI_ICHECK_EQ(load->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(load->buffer->dtype->dtype, (DLDataType{kDLInt, 8, 1}))
           << "Expected int8 backing array for boolean tensor";
-      load.CopyOnWrite()->dtype = DataType::Int(8);
-      return tvm::cast(DataType::Bool(), load);
+      load.CopyOnWrite()->BaseExprNode::ty = PrimType::Int(8);
+      return tvm::cast(PrimType::Bool(), load);
     } else {
       return load;
     }
diff --git a/src/tirx/transform/force_narrow_index_to_i32.cc b/src/tirx/transform/force_narrow_index_to_i32.cc
index 68ae7e73f636..65988f0e9647 100644
--- a/src/tirx/transform/force_narrow_index_to_i32.cc
+++ b/src/tirx/transform/force_narrow_index_to_i32.cc
@@ -38,7 +38,7 @@ class Int32DTypeNarrower : public IndexDataTypeNormalizer {
   static PrimFunc RewriteDataType(PrimFunc func) {
     // Check if the integer parameter buffers have dtype other than int32.
     for (auto it : func->buffer_map) {
-      if (it.second->dtype.is_int() && it.second->dtype.bits() > 32) {
+      if (it.second->dtype.code() == DLDataTypeCode::kDLInt && it.second->dtype.bits() > 32) {
         TVM_FFI_THROW(InternalError)
             << "The buffer " << it.second << " in the function buffer map has dtype "
             << it.second->dtype << ". The function is " << func;
@@ -51,11 +51,11 @@ class Int32DTypeNarrower : public IndexDataTypeNormalizer {
 
  private:
   explicit Int32DTypeNarrower(PrimFunc func)
-      : IndexDataTypeNormalizer(DataType::Int(32)), func_(std::move(func)) {}
+      : IndexDataTypeNormalizer(PrimType::Int(32)), func_(std::move(func)) {}
 
   PrimExpr VisitExpr_(const IntImmNode* op) final {
     // ignore the enabled condition and always rewrite i64
-    if (op->dtype == DataType::Int(64)) {
+    if (op->ty() == PrimType::Int(64)) {
       TVM_FFI_ICHECK_LE(op->value, max_value(target_data_type_).as_or_throw<IntImm>()->value);
       return IntImm::Int32(op->value);
     }
@@ -66,7 +66,7 @@ class Int32DTypeNarrower : public IndexDataTypeNormalizer {
     SBlock block_ = IndexDataTypeNormalizer::VisitStmt_(block).as_or_throw<SBlock>();
     // Check if the allocated integer buffers have dtype other than int32.
     for (const Buffer& buf : block_->alloc_buffers) {
-      if (buf->dtype.is_int() && buf->dtype.bits() > 32) {
+      if (buf->dtype.code() == DLDataTypeCode::kDLInt && buf->dtype.bits() > 32) {
         TVM_FFI_THROW(InternalError)
             << "The buffer " << buf << " allocated in the function has dtype " << buf->dtype
             << ". The function is " << func_;
diff --git a/src/tirx/transform/ir_utils.cc b/src/tirx/transform/ir_utils.cc
index c81b7c686775..342fb5df025e 100644
--- a/src/tirx/transform/ir_utils.cc
+++ b/src/tirx/transform/ir_utils.cc
@@ -459,7 +459,7 @@ class IRConvertSSA final : public StmtExprMutator {
           if (var->type_annotation.defined()) {
             return Var(var->name_hint, var->type_annotation);
           } else {
-            return Var(var->name_hint, var->dtype);
+            return Var(var->name_hint, var.ty());
           }
         }();
 
@@ -542,9 +542,9 @@ class IRConvertSSA final : public StmtExprMutator {
       }
     } else {
       if (is_size_var) {
-        return SizeVar(old_var->name_hint, old_var->dtype);
+        return SizeVar(old_var->name_hint, old_var.ty());
       } else {
-        return Var(old_var->name_hint, old_var->dtype);
+        return Var(old_var->name_hint, old_var.ty());
       }
     }
   }
@@ -750,7 +750,8 @@ ffi::Optional<arith::IntConstraints> ConditionalBoundsContext::TrySolveCondition
         if (obj.same_as(e)) {
           return;
         } else if (const VarNode* var = obj.as<VarNode>()) {
-          if (var->dtype.is_int() || var->dtype.is_uint()) {
+          PrimType var_ty = var->ty();
+          if (var_ty.code() == DLDataTypeCode::kDLInt || var_ty.code() == DLDataTypeCode::kDLUInt) {
             cand_vars.push_back(ffi::GetRef<Var>(var));
           }
         } else {
diff --git a/src/tirx/transform/ir_utils.h b/src/tirx/transform/ir_utils.h
index 1bb9aac7f25d..d103ff9f583a 100644
--- a/src/tirx/transform/ir_utils.h
+++ b/src/tirx/transform/ir_utils.h
@@ -95,7 +95,7 @@ inline ffi::Array<T> UpdateArray(ffi::Array<T> arr, F fupdate) {
  * \param kind The data kind.
  * \return the get expression.
  */
-inline PrimExpr TVMStructGet(DataType dtype, Var handle, int index,
+inline PrimExpr TVMStructGet(PrimType dtype, Var handle, int index,
                              builtin::TVMStructFieldKind kind) {
   ffi::Array<PrimExpr> args = {handle, IntImm::Int32(index), IntImm::Int32(static_cast<int>(kind))};
   return Call(dtype, builtin::tvm_struct_get(), args);
@@ -107,14 +107,14 @@ inline PrimExpr TVMStructGet(DataType dtype, Var handle, int index,
  * \param dtype The data type.
  * \param offset the offset index.
  */
-inline PrimExpr AddressOffset(Var handle, DataType dtype, int offset) {
+inline PrimExpr AddressOffset(Var handle, PrimType dtype, int offset) {
   PrimExpr offset_expr = IntImm::Int32(offset * dtype.lanes());
   ffi::Array<PrimExpr> shape = {offset_expr + 1};
-  Buffer dummy_buf(handle, dtype, shape, {}, 0, handle->name_hint, 0, 0, kDefault, {}, Span(),
-                   std::nullopt);
+  Buffer dummy_buf(handle, dtype->dtype, shape, {}, 0, handle->name_hint, 0, 0, kDefault, {},
+                   Span(), std::nullopt);
   BufferLoad buf_load(dummy_buf, {offset_expr});
 
-  return Call(DataType::Handle(), builtin::address_of(), {buf_load});
+  return Call(PrimType::Handle(), builtin::address_of(), {buf_load});
 }
 
 /*!
@@ -123,18 +123,19 @@ inline PrimExpr AddressOffset(Var handle, DataType dtype, int offset) {
  * \param dtype The data type.
  * \param offset the offset index.
  */
-inline PrimExpr AddressOffset(Var handle, DataType dtype, PrimExpr offset) {
+inline PrimExpr AddressOffset(Var handle, PrimType dtype, PrimExpr offset) {
   if (dtype.lanes() != 1) {
-    offset = offset * MakeConst(offset.dtype(), dtype.lanes());
-    offset = Ramp(offset, MakeConst(offset.dtype(), 1), dtype.lanes());
+    PrimType offset_ty = offset.ty();
+    offset = offset * MakeConst(offset_ty, dtype.lanes());
+    offset = Ramp(offset, MakeConst(offset_ty, 1), dtype.lanes());
   }
 
   ffi::Array<PrimExpr> shape = {offset + 1};
-  Buffer dummy_buf(handle, dtype.element_of(), shape, {}, 0, handle->name_hint, 0, 0, kDefault, {},
-                   Span(), std::nullopt);
+  Buffer dummy_buf(handle, dtype.WithLanes(1)->dtype, shape, {}, 0, handle->name_hint, 0, 0,
+                   kDefault, {}, Span(), std::nullopt);
   BufferLoad buf_load(dummy_buf, {offset});
 
-  return Call(DataType::Handle(), builtin::address_of(), {buf_load});
+  return Call(PrimType::Handle(), builtin::address_of(), {buf_load});
 }
 
 /*!
@@ -148,7 +149,7 @@ inline PrimExpr AddressOffset(Var handle, DataType dtype, PrimExpr offset) {
 inline Stmt TVMStructSet(Var handle, int index, builtin::TVMStructFieldKind kind, PrimExpr value) {
   ffi::Array<PrimExpr> args = {handle, IntImm::Int32(index), IntImm::Int32(static_cast<int>(kind)),
                                value};
-  return Evaluate(Call(DataType::Int(32), builtin::tvm_struct_set(), args));
+  return Evaluate(Call(PrimType::Int(32), builtin::tvm_struct_set(), args));
 }
 
 /*!
@@ -156,13 +157,15 @@ inline Stmt TVMStructSet(Var handle, int index, builtin::TVMStructFieldKind kind
  * \param t The original type.
  * \return The corresponding API type.
  */
-inline DataType APIType(DataType t) {
-  TVM_FFI_ICHECK(!t.is_void()) << "Cannot pass void type through packed API.";
-  if (t.is_handle()) return t;
+inline PrimType APIType(const PrimType& t) {
+  TVM_FFI_ICHECK(!t.IsVoid()) << "Cannot pass void type through packed API.";
+  if (t.IsHandle()) return t;
   TVM_FFI_ICHECK_EQ(t.lanes(), 1) << "Cannot pass vector type through packed API.";
-  if (t.is_bool() || t.is_uint() || t.is_int()) return DataType::Int(64);
-  TVM_FFI_ICHECK(t.is_float());
-  return DataType::Float(64);
+  if (t.MatchesCode(DLDataTypeCode::kDLBool, DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)) {
+    return PrimType::Int(64);
+  }
+  TVM_FFI_ICHECK_EQ(t.code(), DLDataTypeCode::kDLFloat);
+  return PrimType::Float(64);
 }
 
 /*!
@@ -171,10 +174,10 @@ inline DataType APIType(DataType t) {
  * \param const_size The constant size of the array.
  * \return the alignment
  */
-inline int GetTempAllocaAlignment(DataType type, int32_t const_size) {
+inline int GetTempAllocaAlignment(const PrimType& type, int32_t const_size) {
   int align = runtime::kTempAllocaAlignment;
   if (const_size > 0) {
-    int64_t const_s = static_cast<int64_t>(const_size) * type.bits() * type.lanes() / 8;
+    int64_t const_s = static_cast<int64_t>(const_size) * type.StorageBytes();
     while (align > const_s) {
       align = align / 2;
     }
@@ -200,7 +203,7 @@ inline PrimExpr ConstInt32(size_t index) {
  */
 inline PrimExpr StackAlloca(std::string type, size_t num) {
   ffi::Array<PrimExpr> args = {StringImm(type), ConstInt32(num)};
-  return Call(DataType::Handle(), builtin::tvm_stack_alloca(), args);
+  return Call(PrimType::Handle(), builtin::tvm_stack_alloca(), args);
 }
 
 /*!
diff --git a/src/tirx/transform/lower_intrin.cc b/src/tirx/transform/lower_intrin.cc
index 804a582d900b..cd2527579a3d 100644
--- a/src/tirx/transform/lower_intrin.cc
+++ b/src/tirx/transform/lower_intrin.cc
@@ -117,8 +117,8 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
     op = ret.as<FloorDivNode>();
     if (op == nullptr) return ret;
     int shift;
-    const DataType& dtype = op->dtype;
-    TVM_FFI_ICHECK(dtype.is_int() || dtype.is_uint());
+    PrimType dtype = op->ty();
+    TVM_FFI_ICHECK(dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt));
 
     if (support_bitwise_op_ && is_const_power_of_two_integer(op->b, &shift)) {
       // lower to right shift if possible.
@@ -145,7 +145,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
       // condition on b >= 0.
       // truncmod(a, b) < 0 will implies ceildiv,
       // So we need to correct these cases.
-      if ((dtype == DataType::Int(32) || dtype == DataType::Int(64)) && support_bitwise_op_) {
+      if ((dtype == PrimType::Int(32) || dtype == PrimType::Int(64)) && support_bitwise_op_) {
         // equivalent to rdiv + (rmod >= 0 ? 0: -1);
         return rdiv + (rmod >> MakeConst(dtype, dtype.bits() - 1));
       } else {
@@ -153,7 +153,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
       }
 
     } else {
-      if (dtype.is_float()) {
+      if (dtype.code() == DLDataTypeCode::kDLFloat) {
         // floor(a / b)
         return VisitExpr_(tvm::floor(op->a / op->b).as<CallNode>());
       } else {
@@ -178,8 +178,8 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
     if (op == nullptr) return ret;
     // Lower floordiv to native truncdiv.
     int shift;
-    const DataType& dtype = op->dtype;
-    TVM_FFI_ICHECK(dtype.is_int() || dtype.is_uint());
+    PrimType dtype = op->ty();
+    TVM_FFI_ICHECK(dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt));
 
     if (support_bitwise_op_ && is_const_power_of_two_integer(op->b, &shift)) {
       // lower to masking if possible.
@@ -205,7 +205,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
       // mod(a, b) < 0 will imply we are doing ceildiv,
       // So we need to correct these cases.
       PrimExpr rmod = truncmod(op->a, op->b);
-      if ((dtype == DataType::Int(32) || dtype == DataType::Int(64)) && support_bitwise_op_) {
+      if ((dtype == PrimType::Int(32) || dtype == PrimType::Int(64)) && support_bitwise_op_) {
         // (rmod >> shift) & b
         // -> (rmod >= 0 ? 0: -1) & b
         // -> rmod >= 0 ? 0 : b
@@ -215,7 +215,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
       }
 
     } else {
-      if (dtype.is_float()) {
+      if (dtype.code() == DLDataTypeCode::kDLFloat) {
         // a - floor(a / b) * b
         return op->a - (VisitExpr_(tvm::floor(op->a / op->b).as<CallNode>()) * op->b);
       } else {
@@ -274,24 +274,28 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
     if (const BroadcastNode* bcast = e.as<BroadcastNode>()) {
       if (const CastNode* cast = bcast->value.as<CastNode>()) {
         auto should_swap = [&]() {
+          PrimType cast_ty = cast->ty();
+          PrimType value_ty = cast->value.ty();
           // Maintain behaviour (int8 -> int16, fp16 -> fp32).
-          if (cast->dtype.bits() == cast->value.dtype().bits() * 2) {
+          if (cast_ty.bits() == value_ty.bits() * 2) {
             return true;
           }
           // Check both operands are integer-like.
-          if (!cast->dtype.is_uint() && !cast->dtype.is_int()) {
+          if (cast_ty.code() != DLDataTypeCode::kDLUInt &&
+              cast_ty.code() != DLDataTypeCode::kDLInt) {
             return false;
           }
-          if (!cast->value.dtype().is_uint() && !cast->value.dtype().is_int()) {
+          if (value_ty.code() != DLDataTypeCode::kDLUInt &&
+              value_ty.code() != DLDataTypeCode::kDLInt) {
             return false;
           }
           // If both are integer-like, swap if we have a widening cast.
-          return cast->dtype.bits() > cast->value.dtype().bits();
+          return cast_ty.bits() > value_ty.bits();
         };
 
         if (should_swap()) {
           PrimExpr new_bcast = Broadcast(cast->value, bcast->lanes);
-          return Cast(bcast->dtype, new_bcast);
+          return Cast(bcast->ty(), new_bcast);
         }
       }
     }
@@ -303,8 +307,8 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
     PrimExpr lhs = SwapBroadcastCast(a);
     PrimExpr rhs = SwapBroadcastCast(b);
 
-    if (fma_ != nullptr && op->dtype.is_float()) {
-      PrimExpr r = fma_(Call(op->dtype, builtin::fma(), {lhs, rhs, c}));
+    if (fma_ != nullptr && op->ty().code() == DLDataTypeCode::kDLFloat) {
+      PrimExpr r = fma_(Call(op->ty(), builtin::fma(), {lhs, rhs, c}));
       if (r.defined()) return this->VisitExpr(r);
     } else {
       if (!lhs.same_as(a) || !rhs.same_as(b)) {
@@ -334,8 +338,10 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
     if (const_int_bound_a->min_value >= 0) {
       return std::nullopt;
     }
+    PrimType a_ty = a.ty();
+    // This overflow check is scalar element based. Lane count is intentionally ignored.
     const int64_t max_value_of_dtype =
-        tvm::max_value(a->dtype.element_of()).as_or_throw<IntImm>()->value;
+        tvm::max_value(PrimType(a_ty.code(), a_ty.bits())).as_or_throw<IntImm>()->value;
 
     // NOTE: ensures that (b-1) - a_min does not overflow
     // also note: max_value_of_dtype + const_int_bound_a->min_value won't overflow
diff --git a/src/tirx/transform/lower_tirx_cleanup.cc b/src/tirx/transform/lower_tirx_cleanup.cc
index 62f9bd31246e..b1586b9aed23 100644
--- a/src/tirx/transform/lower_tirx_cleanup.cc
+++ b/src/tirx/transform/lower_tirx_cleanup.cc
@@ -171,8 +171,8 @@ class LayoutApplier : public arith::IRMutatorWithAnalyzer {
     }
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
-    if (flattened->dtype == DataType::Bool()) {
-      writer->dtype = DataType::Int(8);
+    if (flattened->dtype == PrimType::Bool()) {
+      writer->dtype = PrimType::Int(8);  // bool tensors use an int8 backing array
     }
     // canonicalize shape
     for (size_t i = 0; i < flattened->shape.size(); ++i) {
@@ -187,7 +187,8 @@ class LayoutApplier : public arith::IRMutatorWithAnalyzer {
 
   Stmt VisitStmt_(const BufferStoreNode* op) final {
     BufferStore store = StmtExprMutator::VisitStmt_(op).as_or_throw<BufferStore>();
-    bool store_returns_bool = (op->value.dtype() == DataType::Bool());
+    // Exact match: the int8 rewrite below is scalar-only, so vector bools must not qualify.
+    bool store_returns_bool = (op->value.ty() == PrimType::Bool());
     store = VisitBufferAccess(store);
 
     // Handle casts from the value's dtype to the dtype of the
@@ -195,27 +196,28 @@ class LayoutApplier : public arith::IRMutatorWithAnalyzer {
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
     if (store_returns_bool) {
-      TVM_FFI_ICHECK_EQ(store->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(store->buffer->dtype, PrimType::Int(8))
           << "Expected int8 backing array for boolean tensor";
       auto writer = store.CopyOnWrite();
-      writer->value = tvm::cast(DataType::Int(8), store->value);
+      writer->value = tvm::cast(PrimType::Int(8), store->value);
       return std::move(store);
     }
     return std::move(store);
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
-    bool load_returns_bool = (op->dtype == DataType::Bool());
+    // Exact match: the int8/bool rewrite below is scalar-only, so vector bools must not qualify.
+    bool load_returns_bool = (op->ty() == PrimType::Bool());
     BufferLoad load = StmtExprMutator::VisitExpr_(op).as_or_throw<BufferLoad>();
     load = VisitBufferAccess(load);
     // Handle casts from dtype of the backing array to value's dtype.
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
     if (load_returns_bool) {
-      TVM_FFI_ICHECK_EQ(load->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(load->buffer->dtype, PrimType::Int(8))
           << "Expected int8 backing array for boolean tensor";
-      load.CopyOnWrite()->dtype = DataType::Int(8);
-      return tvm::cast(DataType::Bool(), load);
+      load.CopyOnWrite()->BaseExprNode::ty = PrimType::Int(8);
+      return tvm::cast(PrimType::Bool(), load);
     } else {
       return std::move(load);
     }
diff --git a/src/tirx/transform/lower_tirx_opaque.cc b/src/tirx/transform/lower_tirx_opaque.cc
index 9c822e1f9558..03e40d4ec824 100644
--- a/src/tirx/transform/lower_tirx_opaque.cc
+++ b/src/tirx/transform/lower_tirx_opaque.cc
@@ -146,8 +146,8 @@ class TIRxOpaqueLower : public StmtExprMutator {
       return var;
     } else {
       PrimExpr expr = it->second;
-      if (expr.dtype() != var.dtype()) {
-        expr = tvm::cast(var.dtype(), std::move(expr));
+      if (expr.ty() != var.ty()) {
+        expr = tvm::cast(var.ty(), std::move(expr));
       }
       return expr;
     }
diff --git a/src/tirx/transform/lower_tvm_builtin.cc b/src/tirx/transform/lower_tvm_builtin.cc
index 14772cead49f..606bfbea52aa 100644
--- a/src/tirx/transform/lower_tvm_builtin.cc
+++ b/src/tirx/transform/lower_tvm_builtin.cc
@@ -39,6 +39,14 @@
 namespace tvm {
 namespace tirx {
 
+namespace {
+/*! \brief Size in bytes reported by PrimType::StorageBytes(), converted to int. */
+TVM_FFI_INLINE int GetVectorBytes(const PrimType& dtype) {
+  const auto storage_bytes = dtype.StorageBytes();
+  return static_cast<int>(storage_bytes);
+}
+}  // namespace
+
 // Calculate the statistics of packed function.
 // These information are needed during codegen.
 class BuiltinLower : public StmtExprMutator {
@@ -99,8 +107,8 @@ class BuiltinLower : public StmtExprMutator {
   // Record stack frame for existing scope.
   struct AllocaScope {
     Buffer stack_shape;
-    Var stack_array = Var("stack_array", DataType::Handle());
-    Var stack_ffi_any = Var("stack_ffi_any", DataType::Handle());
+    Var stack_array = Var("stack_array", PrimType::Handle());
+    Var stack_ffi_any = Var("stack_ffi_any", PrimType::Handle());
 
     StackSizes max_sizes;
     StackSizes run_sizes;
@@ -130,7 +138,7 @@ class BuiltinLower : public StmtExprMutator {
     {
       // NOTE: this scope reference is invalid after any mutation is applied to alloca_scope_.
       auto& scope = precheck.alloca_scope_.back();
-      scope.stack_shape = decl_buffer({IntImm::Int64(0)}, DataType::Int(64), "stack_shape");
+      scope.stack_shape = decl_buffer({IntImm::Int64(0)}, PrimType::Int(64), "stack_shape");
     }
 
     precheck.VisitStmt(stmt);
@@ -171,7 +179,7 @@ class BuiltinLower : public StmtExprMutator {
 
       if (scope.max_sizes.shape_stack != -1) {
         scope.stack_shape = decl_buffer({IntImm::Int64(scope.max_sizes.shape_stack)},
-                                        DataType::Int(64), "stack_shape");
+                                        PrimType::Int(64), "stack_shape");
         alloca_stmts.push_back(
             Bind(scope.stack_shape->data, StackAlloca("shape", scope.max_sizes.shape_stack)));
         stmt = SeqStmt::Flatten(DeclBuffer(scope.stack_shape), stmt);
@@ -245,7 +253,7 @@ class BuiltinLower : public StmtExprMutator {
         return stmt;
       }
     }
-    if (op->buffer->dtype.is_scalable_vector()) {
+    if (op->buffer->dtype.IsScalableVector()) {
       return stmt;
     }
     int64_t nbytes = GetVectorBytes(op->buffer->dtype);
@@ -261,22 +269,22 @@ class BuiltinLower : public StmtExprMutator {
         }
       }
     }
-    PrimExpr total_bytes = IntImm(DataType::UInt(64), nbytes);
+    PrimExpr total_bytes = IntImm(PrimType::UInt(64), nbytes);
     for (size_t i = 0; i < op->buffer->shape.size(); ++i) {
       total_bytes = total_bytes * op->buffer->shape[i];
     }
     TVM_FFI_ICHECK(device_type_) << "Unknown device type in current IR";
     TVM_FFI_ICHECK(device_id_) << "Unknown device id in current IR";
-    Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {}));
+    Stmt throw_last_error = Evaluate(Call(PrimType::Int(32), builtin::tvm_throw_last_error(), {}));
 
     Stmt alloc_nullptr_check = IfThenElse(
-        Call(DataType::Bool(), builtin::isnullptr(), {op->buffer->data}), throw_last_error);
+        Call(PrimType::Bool(), builtin::isnullptr(), {op->buffer->data}), throw_last_error);
 
     static const Op& free_workspace_op = Op::Get("tirx.TVMBackendFreeWorkspace");
     static const Op& alloc_workspace_op = Op::Get("tirx.TVMBackendAllocWorkspace");
-    PrimExpr free_op = Call(DataType::Int(32), free_workspace_op,
-                            {cast(DataType::Int(32), device_type_.value()),
-                             cast(DataType::Int(32), device_id_.value()), op->buffer->data});
+    PrimExpr free_op = Call(PrimType::Int(32), free_workspace_op,
+                            {cast(PrimType::Int(32), device_type_.value()),
+                             cast(PrimType::Int(32), device_id_.value()), op->buffer->data});
     Stmt free_stmt = IfThenElse(free_op != IntImm::Int32(0), throw_last_error);
 
     // Push free to enclosing scope's pending_frees (LIFO ordering preserved).
@@ -284,9 +292,9 @@ class BuiltinLower : public StmtExprMutator {
 
     Stmt alloc_bind = Bind(
         op->buffer->data,
-        Call(op->buffer->data.dtype(), alloc_workspace_op,
-             {cast(DataType::Int(32), device_type_.value()),
-              cast(DataType::Int(32), device_id_.value()), total_bytes,
+        Call(op->buffer->data.ty(), alloc_workspace_op,
+             {cast(PrimType::Int(32), device_type_.value()),
+              cast(PrimType::Int(32), device_id_.value()), total_bytes,
               IntImm::Int32(op->buffer->dtype.code()), IntImm::Int32(op->buffer->dtype.bits())}));
 
     return SeqStmt({alloc_bind, alloc_nullptr_check});
@@ -390,7 +398,7 @@ class BuiltinLower : public StmtExprMutator {
     } else if (op->op.same_as(builtin::tvm_stack_make_array())) {
       return MakeArray(op);
     } else if (op->op.same_as(builtin::tvm_context_id())) {
-      return IntImm(op->dtype, 0);
+      return IntImm(ffi::GetRef<PrimExpr>(op).ty(), 0);
     } else if (op->op.same_as(builtin::dma_copy())) {
       return MakeDMACopy(op);
     } else if (op->op.same_as(builtin::dma_wait())) {
@@ -426,7 +434,7 @@ class BuiltinLower : public StmtExprMutator {
     PrimExpr bypass_cache = op->args[4];
 
     auto method_name = GetDeviceMethodName("dma_copy");
-    Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(),
+    Call call_packed = Call(PrimType::Int(32), builtin::tvm_call_packed(),
                             {method_name, queue_id, dst, src, size, bypass_cache});
     return VisitExpr(call_packed);
   }
@@ -437,7 +445,7 @@ class BuiltinLower : public StmtExprMutator {
 
     auto method_name = GetDeviceMethodName("dma_wait");
     Call call_packed =
-        Call(DataType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id, inflight});
+        Call(PrimType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id, inflight});
     return VisitExpr(call_packed);
   }
 
@@ -445,7 +453,7 @@ class BuiltinLower : public StmtExprMutator {
     PrimExpr queue_id = op->args[0];
 
     auto method_name = GetDeviceMethodName("dma_start_group");
-    Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id});
+    Call call_packed = Call(PrimType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id});
     return VisitExpr(call_packed);
   }
 
@@ -453,7 +461,7 @@ class BuiltinLower : public StmtExprMutator {
     PrimExpr queue_id = op->args[0];
 
     auto method_name = GetDeviceMethodName("dma_end_group");
-    Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id});
+    Call call_packed = Call(PrimType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id});
     return VisitExpr(call_packed);
   }
 
@@ -472,10 +480,10 @@ class BuiltinLower : public StmtExprMutator {
     op = expr.as<CallNode>();
     // no need to perform any store for a scalar shape
     for (size_t i = 0; i < op->args.size(); ++i) {
-      prep_seq.emplace_back(BufferStore(scope.stack_shape, cast(DataType::Int(64), op->args[i]),
+      prep_seq.emplace_back(BufferStore(scope.stack_shape, cast(PrimType::Int(64), op->args[i]),
                                         {ConstInt32(stack_begin + i)}));
     }
-    return AddressOffset(scope.stack_shape->data, DataType::Int(64), stack_begin);
+    return AddressOffset(scope.stack_shape->data, PrimType::Int(64), stack_begin);
   }
   // make array
   PrimExpr MakeArray(const CallNode* op) {
@@ -499,31 +507,31 @@ class BuiltinLower : public StmtExprMutator {
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorStrides, strides));
     prep_seq.emplace_back(
         TVMStructSet(scope.stack_array, idx, builtin::kDLTensorNDim, op->args[3]));
-    DataType dtype = op->args[4].dtype();
+    PrimType dtype = op->args[4].ty();
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorTypeCode,
-                                       IntImm(DataType::UInt(8), static_cast<int>(dtype.code()))));
+                                       IntImm(PrimType::UInt(8), static_cast<int>(dtype.code()))));
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorTypeBits,
-                                       IntImm(DataType::UInt(8), dtype.bits())));
+                                       IntImm(PrimType::UInt(8), dtype.bits())));
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorTypeLanes,
-                                       IntImm(DataType::UInt(16), dtype.lanes())));
+                                       IntImm(PrimType::UInt(16), dtype.lanes())));
     // set byte offset
     int data_bytes = GetVectorBytes(dtype);
     PrimExpr elem_offset = op->args[5];
     PrimExpr byte_offset;
     if (!is_zero(elem_offset)) {
-      byte_offset = elem_offset * MakeConst(elem_offset.dtype(), data_bytes);
+      byte_offset = elem_offset * MakeConst(elem_offset.ty(), data_bytes);
     } else {
       byte_offset = elem_offset;
     }
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorByteOffset,
-                                       cast(DataType::UInt(64), byte_offset)));
+                                       cast(PrimType::UInt(64), byte_offset)));
     TVM_FFI_ICHECK(device_type_) << "Unknown device type in current IR";
     TVM_FFI_ICHECK(device_id_) << "Unknown device id in current IR";
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorDeviceId,
-                                       cast(DataType::Int(32), device_id_.value())));
+                                       cast(PrimType::Int(32), device_id_.value())));
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorDeviceType,
-                                       cast(DataType::Int(32), device_type_.value())));
-    return TVMStructGet(DataType::Handle(), scope.stack_array, idx, builtin::kDLTensorAddr);
+                                       cast(PrimType::Int(32), device_type_.value())));
+    return TVMStructGet(PrimType::Handle(), scope.stack_array, idx, builtin::kDLTensorAddr);
   }
 
   void SetPackedArg(PrimExpr arg, const Var& args_stack, size_t stack_offset,
@@ -533,26 +541,28 @@ class BuiltinLower : public StmtExprMutator {
       // call runtime function to set anylist
       static const Op& anylist_set_packed_arg_op = Op::Get("tirx.TVMBackendAnyListSetPackedArg");
       prep_seq->emplace_back(Evaluate(Call(
-          DataType::Int(32), anylist_set_packed_arg_op,
+          PrimType::Int(32), anylist_set_packed_arg_op,
           {call_pattern->args[0], call_pattern->args[1], args_stack, ConstInt32(stack_offset)})));
     } else {
-      DataType api_dtype = APIType(arg.dtype());
-      if (arg.dtype() != api_dtype) {
-        arg = Cast(api_dtype, arg);
+      PrimType arg_ty = arg.ty();
+      PrimType api_ty = APIType(arg_ty);
+      if (arg_ty != api_ty) {
+        arg = Cast(api_ty, arg);
       }
 
       int arg_type_index = [&]() {
-        if (api_dtype.is_bool()) return ffi::TypeIndex::kTVMFFIBool;
-        if (api_dtype.is_int() || api_dtype.is_uint()) return ffi::TypeIndex::kTVMFFIInt;
-        if (api_dtype.is_float()) return ffi::TypeIndex::kTVMFFIFloat;
-        if (api_dtype.is_handle() && arg.as<StringImmNode>()) {
+        if (api_ty.MatchesCode(DLDataTypeCode::kDLBool)) return ffi::TypeIndex::kTVMFFIBool;
+        if (api_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))
+          return ffi::TypeIndex::kTVMFFIInt;
+        if (api_ty.code() == DLDataTypeCode::kDLFloat) return ffi::TypeIndex::kTVMFFIFloat;
+        if (api_ty.IsHandle() && arg.as<StringImmNode>()) {
           return ffi::TypeIndex::kTVMFFIRawStr;
         } else if (IsArrayHandle(arg)) {
           return ffi::TypeIndex::kTVMFFIDLTensorPtr;
-        } else if (api_dtype.is_handle()) {
+        } else if (api_ty.IsHandle()) {
           return ffi::TypeIndex::kTVMFFIOpaquePtr;
         } else {
-          TVM_FFI_THROW(InternalError) << "Unsupported type: " << api_dtype;
+          TVM_FFI_THROW(InternalError) << "Unsupported type: " << api_ty;
           TVM_FFI_UNREACHABLE();
         }
       }();
@@ -560,7 +570,7 @@ class BuiltinLower : public StmtExprMutator {
       // opaque handle need to set the kind properly
       if (arg_type_index == ffi::TypeIndex::kTVMFFIOpaquePtr) {
         prep_seq->emplace_back(
-            IfThenElse(Call(DataType::Bool(), builtin::isnullptr(), {arg}),
+            IfThenElse(Call(PrimType::Bool(), builtin::isnullptr(), {arg}),
                        TVMStructSet(args_stack, stack_offset, builtin::kTVMFFIAnyTypeIndex,
                                     ConstInt32(ffi::TypeIndex::kTVMFFINone)),
                        TVMStructSet(args_stack, stack_offset, builtin::kTVMFFIAnyTypeIndex,
@@ -592,7 +602,7 @@ class BuiltinLower : public StmtExprMutator {
     prep_seq.emplace_back(Evaluate(call));
     static const Op& anylist_move_from_packed_return_op =
         Op::Get("tirx.TVMBackendAnyListMoveFromPackedReturn");
-    return Call(DataType::Int(32), anylist_move_from_packed_return_op,
+    return Call(PrimType::Int(32), anylist_move_from_packed_return_op,
                 {list_handle, list_index, args_stack, ret_offset});
   }
   /*!
@@ -652,16 +662,18 @@ class BuiltinLower : public StmtExprMutator {
       // used by call_packed_traced
       packed_args.push_back(op->args[op->args.size() - 1]);
     }
-    return Call(op->dtype, lowered_packed_op, packed_args);
+    return Call(ffi::GetRef<PrimExpr>(op).ty(), lowered_packed_op, packed_args);
   }
 
   Stmt MakeNdMemAllocWithScope(const BindNode* let, const CallNode* call) {
     TVM_FFI_ICHECK(device_type_) << "Unknown device type in current IR";
     TVM_FFI_ICHECK(device_id_) << "Unknown device id in current IR";
-    Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {}));
+    Stmt throw_last_error = Evaluate(Call(PrimType::Int(32), builtin::tvm_throw_last_error(), {}));
 
-    DataType dtype =
-        let->var->type_annotation.as<PointerTypeNode>()->element_type.as<PrimTypeNode>()->dtype;
+    const auto* dtype_node =
+        let->var->type_annotation.as<PointerTypeNode>()->element_type.as<PrimTypeNode>();
+    TVM_FFI_ICHECK(dtype_node);
+    PrimType dtype = ffi::GetRef<PrimType>(dtype_node);
 
     ffi::Array<PrimExpr> args = {
         GetDeviceMethodName("alloc_nd"), device_type_.value(),        device_id_.value(),
@@ -672,14 +684,14 @@ class BuiltinLower : public StmtExprMutator {
       args.push_back(call->args[i]);
     }
 
-    Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), args);
+    Call call_packed = Call(let->var.ty(), builtin::tvm_call_packed(), args);
     Stmt null_check =
-        IfThenElse(Call(DataType::Bool(), builtin::isnullptr(), {let->var}), throw_last_error);
+        IfThenElse(Call(PrimType::Bool(), builtin::isnullptr(), {let->var}), throw_last_error);
 
     // Construct free_nd call and register in current scope.
     // The free will be emitted on scope exit, matching the old LetStmt body semantics.
     PrimExpr storage_scope = call->args[0];
-    Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(),
+    Call free_op = Call(PrimType::Int(32), builtin::tvm_call_packed(),
                         {GetDeviceMethodName("free_nd"), device_type_.value(), device_id_.value(),
                          storage_scope, let->var});
     Stmt free_stmt = IfThenElse(free_op != IntImm::Int32(0), throw_last_error);
diff --git a/src/tirx/transform/lower_warp_memory.cc b/src/tirx/transform/lower_warp_memory.cc
index be21efaa3694..57b9dde61fed 100644
--- a/src/tirx/transform/lower_warp_memory.cc
+++ b/src/tirx/transform/lower_warp_memory.cc
@@ -155,9 +155,10 @@ class WarpStoreCoeffFinder : private StmtExprVisitor {
                                              << "Has FlattenBuffer been run?";
 
     PrimExpr index = op->indices[0];
-    if (op->value.dtype().lanes() != 1) {
+    PrimType value_ty = op->value.ty();
+    if (value_ty.lanes() != 1) {
       arith::PVar<PrimExpr> base;
-      TVM_FFI_ICHECK(arith::ramp(base, 1, op->value.dtype().lanes()).Match(index))
+      TVM_FFI_ICHECK(arith::ramp(base, 1, value_ty.lanes()).Match(index))
           << "LowerWarpMemory failed due to store index=" << index
           << ", can only handle continuous store";
       UpdatePattern(base.Eval());
@@ -294,7 +295,7 @@ class WarpAccessRewriter : protected StmtExprMutator {
         new_args.Set(i + 1, local_index);
       }
     }
-    return Call(op->dtype, op->op, new_args, op->attrs, op->span);
+    return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op, new_args, op->attrs, op->span);
   }
 
   PrimExpr VisitExpr_(const CallNode* op) override {
@@ -390,8 +391,8 @@ class WarpAccessRewriter : protected StmtExprMutator {
       return load;
     }
 
-    PrimExpr mask = Call(DataType::UInt(32), builtin::tvm_warp_activemask(), {});
-    return Call(load.dtype(), builtin::tvm_warp_shuffle(), {mask, load, group, width_, warp_size_});
+    PrimExpr mask = Call(PrimType::UInt(32), builtin::tvm_warp_activemask(), {});
+    return Call(load.ty(), builtin::tvm_warp_shuffle(), {mask, load, group, width_, warp_size_});
   }
 
   // Split the index to the two component
@@ -400,15 +401,16 @@ class WarpAccessRewriter : protected StmtExprMutator {
   // source index is the corresponding source index
   // in this access pattern.
   std::pair<PrimExpr, PrimExpr> SplitIndexByGroup(const PrimExpr& index) {
-    if (index.dtype().lanes() != 1) {
+    PrimType index_ty = index.ty();
+    if (index_ty.lanes() != 1) {
       arith::PVar<PrimExpr> base;
-      TVM_FFI_ICHECK(arith::ramp(base, 1, index.dtype().lanes()).Match(index));
+      TVM_FFI_ICHECK(arith::ramp(base, 1, index_ty.lanes()).Match(index));
 
       auto [local_index, group] = SplitIndexByGroup(base.Eval());
-      local_index = Ramp(local_index, MakeConst(local_index.dtype(), 1), index.dtype().lanes());
+      local_index = Ramp(local_index, MakeConst(local_index.ty(), 1), index_ty.lanes());
       return std::make_pair(local_index, group);
     }
-    PrimExpr m = MakeConst(index.dtype(), warp_coeff_);
+    PrimExpr m = MakeConst(index_ty, warp_coeff_);
 
     // simple case, warp index is on the highest.
     if (warp_group_ == 1) {
@@ -417,9 +419,9 @@ class WarpAccessRewriter : protected StmtExprMutator {
       return std::make_pair(x, z);
     } else {
       PrimExpr x = analyzer_->canonical_simplify(indexmod(index, m));
-      PrimExpr y = index / MakeConst(index.dtype(), warp_coeff_ * width_);
+      PrimExpr y = index / MakeConst(index_ty, warp_coeff_ * width_);
       y = y * m + x;
-      PrimExpr z = indexdiv(indexmod(index, MakeConst(index.dtype(), warp_coeff_ * width_)), m);
+      PrimExpr z = indexdiv(indexmod(index, MakeConst(index_ty, warp_coeff_ * width_)), m);
       return std::make_pair(analyzer_->canonical_simplify(y), analyzer_->canonical_simplify(z));
     }
   }
diff --git a/src/tirx/transform/make_packed_api.cc b/src/tirx/transform/make_packed_api.cc
index d2d4113cdcfc..56a30ba6eb91 100644
--- a/src/tirx/transform/make_packed_api.cc
+++ b/src/tirx/transform/make_packed_api.cc
@@ -80,22 +80,22 @@ class ReturnRewriter : public StmtMutator {
     ConvertedInfo info;
 
     // convert val's data type to FFI data type, return type code
-    DataType dtype = val.dtype();
-    if (dtype.is_bool()) {
+    PrimType dtype = val.ty();
+    if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
       info.type_index = ffi::TypeIndex::kTVMFFIBool;
-      info.expr = Cast(DataType::Int(64), val);
+      info.expr = Cast(PrimType::Int(64), val);
 
-    } else if (dtype.is_int() || dtype.is_uint()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
       info.type_index = ffi::TypeIndex::kTVMFFIInt;
-      info.expr = Cast(DataType::Int(64), val);
-    } else if (dtype.is_float()) {
+      info.expr = Cast(PrimType::Int(64), val);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat) {
       info.type_index = ffi::TypeIndex::kTVMFFIFloat;
-      info.expr = Cast(DataType::Float(64), val);
-    } else if (dtype.is_void()) {
+      info.expr = Cast(PrimType::Float(64), val);
+    } else if (dtype.IsVoid()) {
       info.type_index = ffi::TypeIndex::kTVMFFINone;
       info.expr = val;
     } else {
-      TVM_FFI_THROW(InternalError) << "data type " << dtype << " not supported yet";
+      TVM_FFI_THROW(InternalError) << "data type " << dtype->dtype << " not supported yet";
     }
     return info;
   }
@@ -103,15 +103,15 @@ class ReturnRewriter : public StmtMutator {
   Stmt WriteToOut(PrimExpr val) {
     auto info = ConvertForFFI(val);
     Stmt store_tindex = tirx::Evaluate(
-        tirx::Call(DataType::Int(32), tirx::builtin::tvm_struct_set(),
+        tirx::Call(PrimType::Int(32), tirx::builtin::tvm_struct_set(),
                    {ret_var_, IntImm::Int32(0), IntImm::Int32(tirx::builtin::kTVMFFIAnyTypeIndex),
                     IntImm::Int32(info.type_index)}));
     Stmt store_zero_padding = tirx::Evaluate(
-        tirx::Call(DataType::Int(32), tirx::builtin::tvm_struct_set(),
+        tirx::Call(PrimType::Int(32), tirx::builtin::tvm_struct_set(),
                    {ret_var_, IntImm::Int32(0), IntImm::Int32(tirx::builtin::kTVMFFIAnyZeroPadding),
                     IntImm::Int32(0)}));
     Stmt store_val =
-        tirx::Evaluate(tirx::Call(DataType::Int(32), tirx::builtin::tvm_struct_set(),
+        tirx::Evaluate(tirx::Call(PrimType::Int(32), tirx::builtin::tvm_struct_set(),
                                   {ret_var_, IntImm::Int32(0),
                                    IntImm::Int32(tirx::builtin::kTVMFFIAnyUnionValue), info.expr}));
     Stmt ret_zero = Evaluate(tvm::ret(0));
@@ -154,7 +154,7 @@ class SubroutineCallRewriter : public StmtExprMutator {
         // push an empty handle to be compatible with current cpacked convention
         cpacked_args.push_back(tirx::ConstHandle(0));
         made_change_ = true;
-        return tirx::Call(node->dtype, tirx::builtin::tvm_call_cpacked(), cpacked_args);
+        return tirx::Call(node.ty(), tirx::builtin::tvm_call_cpacked(), cpacked_args);
       }
     }
 
@@ -219,14 +219,14 @@ PrimFunc MakePackedAPI(PrimFunc func) {
   const Stmt nop = Evaluate(0);
 
   // Data field definitions
-  Var v_self_handle("self_handle", DataType::Handle());
-  Var v_packed_args("args", DataType::Handle());
-  Var v_num_packed_args("num_args", DataType::Int(32));
-  Var v_result("result", PointerType(PrimType(DataType::Void())));
+  Var v_self_handle("self_handle", PrimType::Handle());
+  Var v_packed_args("args", PrimType::Handle());
+  Var v_num_packed_args("num_args", PrimType::Int(32));
+  Var v_result("result", PointerType(PrimType::Void()));
 
   // The device context
   Var device_id("dev_id");
-  IntImm device_type(DataType::Int(32), target_device_type);
+  IntImm device_type(PrimType::Int(32), target_device_type);
 
   // Create TVMFFIABIBuilder and decode all packed args
   TVMFFIABIBuilder binder(name_hint, func_ptr->params, func_ptr->buffer_map, v_packed_args,
@@ -257,7 +257,7 @@ PrimFunc MakePackedAPI(PrimFunc func) {
 
     if (runtime::DeviceAPI::NeedSetDevice(target_device_type)) {
       Stmt set_device =
-          Evaluate(Call(DataType::Int(32), builtin::tvm_call_packed(),
+          Evaluate(Call(PrimType::Int(32), builtin::tvm_call_packed(),
                         {StringImm(runtime::symbol::tvm_set_device), device_type, device_id}));
       body = SeqStmt({set_device, body});
     }
@@ -278,7 +278,7 @@ PrimFunc MakePackedAPI(PrimFunc func) {
       << " are used, but are not passed in as API arguments";
 
   func_ptr->buffer_map = ffi::Map<Var, Buffer>();
-  func_ptr->ret_type = PrimType(DataType::Int(32));
+  func_ptr->ret_type = PrimType::Int(32);
 
   // return the function.
   return func;
diff --git a/src/tirx/transform/narrow_datatype.cc b/src/tirx/transform/narrow_datatype.cc
index 9dfdf88c0c06..fa9e431f9253 100644
--- a/src/tirx/transform/narrow_datatype.cc
+++ b/src/tirx/transform/narrow_datatype.cc
@@ -79,15 +79,16 @@ class DataTypeVisitor final : public StmtExprVisitor {
   explicit DataTypeVisitor(int target_bits) : bits_(target_bits), target_bits_(target_bits) {}
 
   void VisitExpr(const PrimExpr& e) {
-    if (e.dtype().is_int()) {
+    PrimType e_ty = e.ty();
+    if (e_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
       int bits = max_bits_;
       if (bound_.find(e) == bound_.end()) {
         analyzer_->const_int_bound(e, &bound_);
       }
       ConstIntBound bound = bound_[e];
-      int64_t ubound = max_value(DataType::Int(target_bits_)).as_or_throw<IntImm>()->value;
-      int64_t lbound = min_value(DataType::Int(target_bits_)).as_or_throw<IntImm>()->value;
-      if (e.dtype().bits() <= target_bits_ ||
+      int64_t ubound = max_value(PrimType::Int(target_bits_)).as_or_throw<IntImm>()->value;
+      int64_t lbound = min_value(PrimType::Int(target_bits_)).as_or_throw<IntImm>()->value;
+      if (e_ty.bits() <= target_bits_ ||
           (bound->max_value <= ubound && bound->min_value >= lbound)) {
         bits = target_bits_;
       }
@@ -109,14 +110,14 @@ class DataTypeVisitor final : public StmtExprVisitor {
 
   void VisitStmt_(const ForNode* op) {
     analyzer_->Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent));
-    vextent_[op->loop_var.as<VarNode>()] = op->extent.dtype();
+    vextent_.insert_or_assign(op->loop_var.as<VarNode>(), op->extent.ty());
     return StmtExprVisitor::VisitStmt_(op);
   }
 
   void VisitStmt_(const SBlockNode* op) {
     for (const IterVar& iter : op->iter_vars) {
       analyzer_->Bind(iter->var, Range::FromMinExtent(iter->dom->min, iter->dom->extent));
-      vextent_[iter->var.as<VarNode>()] = iter->dom->extent.dtype();
+      vextent_.insert_or_assign(iter->var.as<VarNode>(), iter->dom->extent.ty());
     }
     StmtExprVisitor::VisitStmt_(op);
   }
@@ -126,7 +127,7 @@ class DataTypeVisitor final : public StmtExprVisitor {
       IterVar iv = op->node.as_or_throw<IterVar>();
       TVM_FFI_ICHECK_NE(iv->thread_tag.length(), 0U);
       analyzer_->Bind(iv->var, Range::FromMinExtent(0, op->value));
-      vextent_[iv->var.as<VarNode>()] = op->value.dtype();
+      vextent_.insert_or_assign(iv->var.as<VarNode>(), op->value.ty());
       StmtExprVisitor::VisitStmt_(op);
     } else {
       StmtExprVisitor::VisitStmt_(op);
@@ -137,57 +138,59 @@ class DataTypeVisitor final : public StmtExprVisitor {
     // Setup the domain information before simplification.
     for (const IterVar& iv : op->axis) {
       analyzer_->Bind(iv->var, iv->dom);
-      vextent_[iv->var.as<VarNode>()] = iv->dom->extent.dtype();
+      vextent_.insert_or_assign(iv->var.as<VarNode>(), iv->dom->extent.ty());
     }
     // Recursively call simplification when necessary.
     StmtExprVisitor::VisitExpr_(op);
   }
 
   void VisitExpr_(const VarNode* op) {
-    if (vextent_.find(op) != vextent_.end()) {
+    if (auto vextent_it = vextent_.find(op); vextent_it != vextent_.end()) {
       // We only narrow and never promote, so the result dtype
       // is upperbounded by its original dtype before rewrite.
-      int bits = std::min(vextent_[op].bits(), bits_);
-      if (vmap.find(op) == vmap.end()) {
-        vmap[op] = op->dtype.with_bits(bits);
+      int bits = std::min(vextent_it->second.bits(), bits_);
+      if (auto it = vmap.find(op); it == vmap.end()) {
+        vmap.emplace(op, op->ty().WithBits(bits));
       } else {
         // We take maximum bits for all the possible Expr where a var occurs
-        vmap[op] = op->dtype.with_bits(std::max(vmap[op].bits(), bits));
+        it->second = op->ty().WithBits(std::max(it->second.bits(), bits));
       }
     }
     StmtExprVisitor::VisitExpr_(op);
   }
 
   void VisitExpr_(const IntImmNode* op) {
-    if (op->dtype.is_int()) {
+    PrimType op_ty = op->ty();
+    if (op_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
       // We only narrow and never promote, so the result dtype
       // is upperbounded by its original dtype before rewrite.
-      int bits = std::min(op->dtype.bits(), bits_);
-      if (vmap.find(op) == vmap.end()) {
-        vmap[op] = op->dtype.with_bits(bits);
+      int bits = std::min(op_ty.bits(), bits_);
+      if (auto it = vmap.find(op); it == vmap.end()) {
+        vmap.emplace(op, op_ty.WithBits(bits));
       } else {
-        vmap[op] = op->dtype.with_bits(std::max(vmap[op].bits(), bits));
+        it->second = op_ty.WithBits(std::max(it->second.bits(), bits));
       }
     }
     StmtExprVisitor::VisitExpr_(op);
   }
 
   void VisitExpr_(const CastNode* op) {
-    if (op->dtype.is_int()) {
+    PrimType op_ty = op->ty();
+    if (op_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
       // We only narrow and never promote, so the result dtype
       // is upperbounded by its original dtype before rewrite.
-      int bits = std::min(op->dtype.bits(), bits_);
-      if (vmap.find(op) == vmap.end()) {
-        vmap[op] = op->dtype.with_bits(bits);
+      int bits = std::min(op_ty.bits(), bits_);
+      if (auto it = vmap.find(op); it == vmap.end()) {
+        vmap.emplace(op, op_ty.WithBits(bits));
       } else {
-        vmap[op] = op->dtype.with_bits(std::max(vmap[op].bits(), bits));
+        it->second = op_ty.WithBits(std::max(it->second.bits(), bits));
       }
     }
     StmtExprVisitor::VisitExpr_(op);
   }
 
   // the narrowed datatype of Var and IntImm
-  std::unordered_map<const PrimExprNode*, DataType> vmap;
+  std::unordered_map<const PrimExprNode*, PrimType> vmap;
 
  protected:
   // internal analyzer
@@ -201,7 +204,7 @@ class DataTypeVisitor final : public StmtExprVisitor {
   // the target bits
   int target_bits_;
   // the extent of vars to be rewritten
-  std::unordered_map<const VarNode*, DataType> vextent_;
+  std::unordered_map<const VarNode*, PrimType> vextent_;
   // the memorized bound generated by ConstIntBoundAnalyzer
   arith::ConstIntBoundAnalyzer::BoundMapType bound_;
 };
@@ -215,7 +218,7 @@ class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
     visitor_(s);
     for (auto i = visitor_.vmap.begin(), last = visitor_.vmap.end(); i != last;) {
       PrimExpr e = ffi::GetRef<PrimExpr>(i->first);
-      if (e.dtype() == i->second) {
+      if (e.ty() == i->second) {
         i = visitor_.vmap.erase(i);
       } else {
         ++i;
@@ -243,7 +246,7 @@ class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
   PrimExpr VisitExpr_(const IntImmNode* op) final {
     if (is_enabled_) {
       if (visitor_.vmap.find(op) != visitor_.vmap.end()) {
-        return IntImm(visitor_.vmap[op], op->value);
+        return IntImm(visitor_.vmap.at(op), op->value);
       }
     }
     return Parent::VisitExpr_(op);
@@ -256,8 +259,8 @@ class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
       TVM_FFI_ICHECK(new_op != nullptr) << "Expected type to be CastNode"
                                         << ", but get " << e->GetTypeKey();
       PrimExpr new_value = new_op->value;
-      DataType cast_type = visitor_.vmap[op];
-      if (new_value.dtype() != cast_type) {
+      PrimType cast_type = visitor_.vmap.at(op);
+      if (new_value.ty() != cast_type) {
         new_value = Cast(cast_type, new_value);
       }
       return new_value;
@@ -265,24 +268,24 @@ class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
     return Parent::VisitExpr_(op);
   }
 
-#define TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)             \
-  PrimExpr VisitExpr_(const OP* op) {                                     \
-    PrimExpr a = this->VisitExpr(op->a);                                  \
-    PrimExpr b = this->VisitExpr(op->b);                                  \
-    if (op->a.same_as(a) && op->b.same_as(b) && a.dtype() == b.dtype()) { \
-      return ffi::GetRef<PrimExpr>(op);                                   \
-    } else {                                                              \
-      if (a.dtype() != b.dtype()) {                                       \
-        bool is_enabled = is_enabled_;                                    \
-        is_enabled_ = true;                                               \
-        PrimExpr lhs = this->VisitExpr(op->a);                            \
-        PrimExpr rhs = this->VisitExpr(op->b);                            \
-        is_enabled_ = is_enabled;                                         \
-        return FUNC(lhs, rhs);                                            \
-      } else {                                                            \
-        return FUNC(a, b);                                                \
-      }                                                                   \
-    }                                                                     \
+#define TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)       \
+  PrimExpr VisitExpr_(const OP* op) {                               \
+    PrimExpr a = this->VisitExpr(op->a);                            \
+    PrimExpr b = this->VisitExpr(op->b);                            \
+    if (op->a.same_as(a) && op->b.same_as(b) && a.ty() == b.ty()) { \
+      return ffi::GetRef<PrimExpr>(op);                             \
+    } else {                                                        \
+      if (a.ty() != b.ty()) {                                       \
+        bool is_enabled = is_enabled_;                              \
+        is_enabled_ = true;                                         \
+        PrimExpr lhs = this->VisitExpr(op->a);                      \
+        PrimExpr rhs = this->VisitExpr(op->b);                      \
+        is_enabled_ = is_enabled;                                   \
+        return FUNC(lhs, rhs);                                      \
+      } else {                                                      \
+        return FUNC(a, b);                                          \
+      }                                                             \
+    }                                                               \
   }
 
   TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(AddNode, operator+);
diff --git a/src/tirx/transform/split_host_device.cc b/src/tirx/transform/split_host_device.cc
index d66d1d23f226..6eaa75d57f6c 100644
--- a/src/tirx/transform/split_host_device.cc
+++ b/src/tirx/transform/split_host_device.cc
@@ -141,7 +141,7 @@ class HostDeviceSplitter : public StmtMutator {
         std::sort(params.begin(), params.end(), [](const Var& a, const Var& b) {
           auto sort_key = [](const Var& var) {
             return std::tuple{
-                !var->dtype.is_handle(),
+                !var->ty().IsHandle(),
                 var->name_hint,
             };
           };
@@ -167,10 +167,10 @@ class HostDeviceSplitter : public StmtMutator {
       auto kind = device_target->GetTargetDeviceType();
       return kind == kDLCPU || kind == kDLExtDev || kind == kDLHexagon;
     }();
-    IntImm success(DataType::Int(32), 0);
+    IntImm success(PrimType::Int(32), 0);
     Type kernel_ret_type;
     if (can_propagate_errors) {
-      kernel_ret_type = PrimType(DataType::Int(32));
+      kernel_ret_type = PrimType::Int(32);
       body = SeqStmt::Flatten(body, Evaluate(ret(success)));
     } else {
       kernel_ret_type = VoidType();
@@ -202,14 +202,14 @@ class HostDeviceSplitter : public StmtMutator {
     ffi::Array<PrimExpr> args = params.Map([](const Var& var) -> PrimExpr { return var; });
 
     if (can_propagate_errors) {
-      Var kernel_error_code("kernel_error_code", success->dtype);
-      Call kernel_call(success->dtype, kernel_symbol_global, args);
+      Var kernel_error_code("kernel_error_code", success.ty());
+      Call kernel_call(success.ty(), kernel_symbol_global, args);
       AssertStmt assert_success(kernel_error_code == success, StringImm("RuntimeError"),
                                 {StringImm("Error executing compute kernel")});
       return SeqStmt({Bind(kernel_error_code, kernel_call), assert_success});
 
     } else {
-      return Evaluate(Call(DataType::Void(), kernel_symbol_global, args));
+      return Evaluate(Call(PrimType::Void(), kernel_symbol_global, args));
     }
   }
 
@@ -353,7 +353,7 @@ class DeviceInfoCollector : public StmtVisitor {
       for (const auto& extent : op->buffer->shape) {
         dyn_size *= extent;
       }
-      dyn_size *= op->buffer->dtype.bytes();
+      dyn_size *= IntImm::Int64(static_cast<int64_t>(op->buffer->dtype.StorageBytes()));
 
       // Inline any locally-bound variables (e.g. from CSE).
       if (bind_map_.size()) {
@@ -570,7 +570,7 @@ class DeviceKernelMutator : public StmtExprMutator {
         for (const auto& arg : node->args) {
           args.push_back(arg);
         }
-        return Call(node->dtype, builtin::call_extern(), args);
+        return Call(node.ty(), builtin::call_extern(), args);
       }
     }
 
@@ -607,9 +607,9 @@ class DeviceKernelMutator : public StmtExprMutator {
       call_args.push_back(Substitute(launch_arg, param_map));
     }
 
-    auto dtype = node->dtype.is_void() ? DataType::Int(32) : node->dtype;
+    PrimType ret_ty = node->ty().IsVoid() ? PrimType::Int(32) : node.ty();
 
-    return Call(dtype, builtin::tvm_call_packed(), call_args);
+    return Call(ret_ty, builtin::tvm_call_packed(), call_args);
   }
 
   ffi::Optional<Target> current_target_;
diff --git a/src/tirx/transform/storage_rewrite.cc b/src/tirx/transform/storage_rewrite.cc
index 81f71f9b74ce..02ca714cb474 100644
--- a/src/tirx/transform/storage_rewrite.cc
+++ b/src/tirx/transform/storage_rewrite.cc
@@ -52,6 +52,22 @@ namespace tirx {
 using runtime::StorageRank;
 using runtime::StorageScope;
 
+namespace {
+// Hash/equality functors over a PrimType's DLDataType, for std::unordered_set keys.
+struct PrimTypeHash {
+  size_t operator()(const PrimType& ty) const {
+    DLDataType dtype = ty->dtype;
+    return (static_cast<size_t>(dtype.code) << 24) ^ (static_cast<size_t>(dtype.bits) << 16) ^
+           static_cast<size_t>(dtype.lanes);
+  }
+};
+struct PrimTypeEqual {
+  bool operator()(const PrimType& lhs, const PrimType& rhs) const {
+    return lhs->dtype == rhs->dtype;  // Same key that PrimTypeHash hashes.
+  }
+};
+}  // namespace
+
 // Find a linear pattern of storage access
 // Used for liveness analysis.
 // Composite scopes(loop/thread_launch/IfThen) is represented by two points:
@@ -356,7 +372,7 @@ class InplaceOpVerifier : public StmtExprVisitor {
       return;
     }
     if (src_ == buf) {
-      if (store_ == nullptr || store_->value.dtype() != op->dtype) {
+      if (store_ == nullptr || store_->value.ty() != op->ty()) {
         result_ = false;
         return;
       }
@@ -482,7 +498,7 @@ class StoragePlanRewriter : public StmtExprMutator {
   PrimExpr VisitExpr_(const CallNode* op) final {
     if (op->op.same_as(builtin::tvm_access_ptr())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 5U);
-      DataType dtype = op->args[0].dtype();
+      PrimType dtype = op->args[0].ty();
       const VarNode* buffer = op->args[1].as<VarNode>();
       auto it = alloc_map_.find(buffer);
       if (it == alloc_map_.end()) {
@@ -494,10 +510,10 @@ class StoragePlanRewriter : public StmtExprMutator {
       uint64_t elem_bits = dtype.bits() * dtype.lanes();
       TVM_FFI_ICHECK_EQ(se->bits_offset % elem_bits, 0U);
       if (se->bits_offset != 0) {
-        offset = MakeConst(offset.dtype(), se->bits_offset / elem_bits) + offset;
+        offset = MakeConst(offset.ty(), se->bits_offset / elem_bits) + offset;
       }
-      return Call(op->dtype, op->op, {op->args[0], se->alloc_var, offset, extent, op->args[4]},
-                  op->attrs, op->span);
+      return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op,
+                  {op->args[0], se->alloc_var, offset, extent, op->args[4]}, op->attrs, op->span);
     } else {
       return StmtExprMutator::VisitExpr_(op);
     }
@@ -589,7 +605,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     // The var expr of new allocation.
     Var alloc_var;
     // The allocation element type.
-    DataType elem_type;
+    PrimType elem_type = PrimType::Void();
     // Whether any constituent allocation was marked volatile.
     bool is_volatile{false};
     // This is non-zero if this alloc_buffer is folded into another one
@@ -629,11 +645,11 @@ class StoragePlanRewriter : public StmtExprMutator {
     return body;
   }
   // Remap the index
-  PrimExpr RemapIndex(DataType dtype, PrimExpr index, StorageEntry* e) {
+  PrimExpr RemapIndex(PrimType dtype, PrimExpr index, StorageEntry* e) {
     if (e->bits_offset == 0) return index;
     uint64_t elem_bits = dtype.bits();
     TVM_FFI_ICHECK_EQ(e->bits_offset % elem_bits, 0U);
-    return MakeConst(index.dtype(), e->bits_offset / elem_bits) + index;
+    return MakeConst(index.ty(), e->bits_offset / elem_bits) + index;
   }
   // Prepare the new allocations
   void PrepareNewAlloc() {
@@ -667,7 +683,7 @@ class StoragePlanRewriter : public StmtExprMutator {
           NewAllocTagMerged(e);
           continue;
         }
-        if (e->allocs.size() == 1 && e->allocs[0]->buffer->dtype.is_scalable_vector()) {
+        if (e->allocs.size() == 1 && e->allocs[0]->buffer->dtype.IsScalableVector()) {
           // Scalable vector lanes are runtime-dependent.  Keep these allocations exact rather
           // than trying to compare or merge their compile-time bit size.
           e->alloc_var = e->allocs[0]->buffer->data;
@@ -681,7 +697,7 @@ class StoragePlanRewriter : public StmtExprMutator {
         }
         // Get the allocation size;
         e->alloc_var = e->allocs[0]->buffer->data;
-        DataType alloc_type = e->allocs[0]->buffer->dtype;
+        PrimType alloc_type = e->allocs[0]->buffer->dtype;
         for (const AllocBufferNode* op : e->allocs) {
           if (op->buffer->dtype.lanes() > alloc_type.lanes()) {
             alloc_type = op->buffer->dtype;
@@ -691,7 +707,7 @@ class StoragePlanRewriter : public StmtExprMutator {
         bool all_allocs_identical = std::all_of(
             e->allocs.begin() + 1, e->allocs.end(), [&](const AllocBufferNode* op) -> bool {
               const AllocBufferNode* first = *e->allocs.begin();
-              if (op->buffer->dtype != first->buffer->dtype) {
+              if (op->buffer->dtype->dtype != first->buffer->dtype->dtype) {
                 return false;
               }
               if (op->buffer->shape.size() != first->buffer->shape.size()) {
@@ -789,7 +805,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     }
     uint64_t type_bits = e->elem_type.bits() * e->elem_type.lanes();
     PrimExpr alloc_size =
-        MakeConst(e->allocs[0]->buffer->shape[0].dtype(), (total_bits + type_bits - 1) / type_bits);
+        MakeConst(e->allocs[0]->buffer->shape[0].ty(), (total_bits + type_bits - 1) / type_bits);
     Buffer buf(e->alloc_var, e->elem_type, {alloc_size}, {}, PrimExpr(), e->alloc_var->name_hint, 0,
                0, BufferType::kDefault);
     bool any_volatile = e->is_volatile;
@@ -888,8 +904,8 @@ class StoragePlanRewriter : public StmtExprMutator {
                 StorageEntry* src_entry = alloc_map_.at(src);
                 if (src_entry->scope == storage_scope &&
                     src_entry->attach_scope_ == thread_scope_ &&
-                    !alloc->buffer->dtype.is_scalable_vector() &&
-                    src_entry->elem_type == alloc->buffer->dtype.element_of() &&
+                    !alloc->buffer->dtype.IsScalableVector() &&
+                    src_entry->elem_type == alloc->buffer->dtype.WithLanes(1) &&
                     visitor.Check(s.stmt, var, src)) {
                   int64_t const_size = AllocBuffer(ffi::GetRef<AllocBuffer>(alloc))
                                            .ConstantAllocationSize()
@@ -957,7 +973,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     auto entry = std::make_unique<StorageEntry>();
     entry->attach_scope_ = attach_scope;
     entry->scope = scope;
-    entry->elem_type = op->buffer->dtype.element_of();
+    entry->elem_type = op->buffer->dtype.WithLanes(1);
     entry->const_nbits = const_nbits;
     StorageEntry* e = entry.get();
     alloc_vec_.emplace_back(std::move(entry));
@@ -971,7 +987,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     // skip plan for local variable,
     // compiler can do a better job with register allocation.
     const uint64_t match_range = 16;
-    bool is_scalable_vector = op->buffer->dtype.is_scalable_vector();
+    bool is_scalable_vector = op->buffer->dtype.IsScalableVector();
     uint64_t op_elem_bits =
         is_scalable_vector ? 0 : op->buffer->dtype.bits() * op->buffer->dtype.lanes();
     int64_t const_size =
@@ -991,7 +1007,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     // disable reuse of small arrays, they will be lowered to registers in LLVM
     // This rules only apply if we are using non special memory
     bool is_small_array = (scope.tag.length() == 0) &&
-                          (scope.rank >= StorageRank::kWarp || op->buffer->dtype.is_handle() ||
+                          (scope.rank >= StorageRank::kWarp || op->buffer->dtype.IsHandle() ||
                            (is_known_size && const_nbits <= 32));
 
     if (is_scalable_vector || !enable_reuse || is_small_array || !is_flat_memory_space) {
@@ -1023,7 +1039,7 @@ class StoragePlanRewriter : public StmtExprMutator {
         StorageEntry* e = it->second;
         if (e->attach_scope_ != attach_scope) continue;
         if (e->scope != scope) continue;
-        if (e->elem_type != op->buffer->dtype.element_of()) continue;
+        if (e->elem_type != op->buffer->dtype.WithLanes(1)) continue;
         if (reuse_require_exact_matched_dtype && e->elem_type != op->buffer->dtype) {
           continue;
         }
@@ -1037,7 +1053,7 @@ class StoragePlanRewriter : public StmtExprMutator {
         StorageEntry* e = *it;
         if (e->attach_scope_ != attach_scope) continue;
         if (e->scope != scope) continue;
-        if (e->elem_type != op->buffer->dtype.element_of()) continue;
+        if (e->elem_type != op->buffer->dtype.WithLanes(1)) continue;
         sym_free_list_.erase(it);
         return e;
       }
@@ -1055,8 +1071,8 @@ class StoragePlanRewriter : public StmtExprMutator {
     // This rules only apply if we are using non special memory
     if (e->scope.tag.length() == 0) {
       // Disable sharing of local memory.
-      if (e->scope.rank >= StorageRank::kWarp || e->allocs[0]->buffer->dtype.is_handle() ||
-          e->allocs[0]->buffer->dtype.is_scalable_vector()) {
+      if (e->scope.rank >= StorageRank::kWarp || e->allocs[0]->buffer->dtype.IsHandle() ||
+          e->allocs[0]->buffer->dtype.IsScalableVector()) {
         return;
       }
       // disable reuse of small arrays
@@ -1113,7 +1129,7 @@ struct BufferVarInfo {
   Var var;
 
   // The data type of an element of the buffer.
-  DataType element_dtype;
+  PrimType element_dtype;
 
   /* The extent of the buffer.
    *
@@ -1130,18 +1146,18 @@ struct BufferVarInfo {
   // differ both in base type (e.g. int32* cast to float32* after
   // packing in StorageRewrite) or in number of lanes (e.g. float16*
   // cast to float16x4*).
-  std::unordered_set<DataType> access_dtype;
+  std::unordered_set<PrimType, PrimTypeHash, PrimTypeEqual> access_dtype;
   // Data types used for scalar reads. This is used to record vectorized read dtypes that can be
   // shuffled for scalar reads when rewrite_scalar_read_to_vector_shuffle is enabled.
-  std::unordered_set<DataType> scalar_read_dtype;
+  std::unordered_set<PrimType, PrimTypeHash, PrimTypeEqual> scalar_read_dtype;
 
-  DataType get_preferred_dtype() const {
-    std::unordered_set<DataType> base_access_dtype;
+  PrimType get_preferred_dtype() const {
+    std::unordered_set<PrimType, PrimTypeHash, PrimTypeEqual> base_access_dtype;
     for (auto dtype : access_dtype) {
-      base_access_dtype.insert(dtype.element_of());
+      base_access_dtype.insert(dtype.WithLanes(1));
     }
     for (auto dtype : scalar_read_dtype) {
-      base_access_dtype.insert(dtype.element_of());
+      base_access_dtype.insert(dtype.WithLanes(1));
     }
     // If the array is accessed as multiple base types within a
     // function, no point in changing the declared type.  CodeGenC can
@@ -1152,7 +1168,7 @@ struct BufferVarInfo {
       return element_dtype;
     }
 
-    DataType preferred_base_type = *base_access_dtype.begin();
+    PrimType preferred_base_type = *base_access_dtype.begin();
 
     // If there is only one vectorizable size used to access the
     // buffer, and if that access size is compatible with the array
@@ -1177,7 +1193,7 @@ struct BufferVarInfo {
       }
     }
 
-    return preferred_base_type.with_lanes(preferred_lanes);
+    return preferred_base_type.WithLanes(preferred_lanes);
   }
 };
 
@@ -1208,7 +1224,7 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
     for (auto it : buffer_map) {
       Buffer& buffer = it.second;
       Var buffer_var = buffer->data;
-      DataType dtype = buffer->dtype;
+      PrimType dtype = buffer->dtype;
       PrimExpr extent = buffer->shape.size() ? buffer->shape[buffer->shape.size() - 1] : 0;
       OnArrayDeclaration(buffer_var, dtype, extent, BufferVarInfo::kPrimFuncParam);
     }
@@ -1218,7 +1234,7 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
     for (Var buffer_var : params) {
       auto pointer_type = GetPointerType(buffer_var->type_annotation);
       if (pointer_type.has_value() && (buffer_map.count(buffer_var) == 0)) {
-        DataType dtype = pointer_type.value();
+        PrimType dtype(pointer_type.value());
         PrimExpr extent = 0;
         OnArrayDeclaration(buffer_var, dtype, extent, BufferVarInfo::kPrimFuncBufferMap);
       }
@@ -1226,18 +1242,18 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
   }
 
   void VisitExpr_(const BufferLoadNode* op) final {
-    OnArrayAccess(op->dtype, op->buffer->data.get(), op->indices, /*is_buffer_load=*/true);
+    OnArrayAccess(op->ty(), op->buffer->data.get(), op->indices, /*is_buffer_load=*/true);
     StmtExprVisitor::VisitExpr_(op);
   }
 
   void VisitStmt_(const BufferStoreNode* op) final {
-    OnArrayAccess(op->value.dtype(), op->buffer->data.get(), op->indices, /*is_buffer_load=*/false);
+    OnArrayAccess(op->value.ty(), op->buffer->data.get(), op->indices, /*is_buffer_load=*/false);
     StmtExprVisitor::VisitStmt_(op);
   }
 
   void VisitExpr_(const CallNode* op) final {
     if (op->op.same_as(builtin::tvm_access_ptr())) {
-      DataType dtype = op->args[0].dtype();
+      PrimType dtype = op->args[0].ty();
       const VarNode* buffer = op->args[1].as<VarNode>();
       PrimExpr index = op->args[2];
       // args[1] may be a nested Call (e.g. another tvm_access_ptr) rather
@@ -1248,7 +1264,7 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
       }
     } else if (op->op.same_as(builtin::address_of())) {
       BufferLoad load = op->args[0].as_or_throw<BufferLoad>();
-      OnArrayAccess(load->dtype, load->buffer->data.get(), load->indices, /*is_buffer_load=*/false);
+      OnArrayAccess(load->ty(), load->buffer->data.get(), load->indices, /*is_buffer_load=*/false);
     }
     StmtExprVisitor::VisitExpr_(op);
   }
@@ -1273,12 +1289,12 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
   }
 
   void HandleLetNode(Var let_var) {
-    if (let_var->dtype.is_handle()) {
+    if (let_var.ty().IsHandle()) {
       auto pointer_type = GetPointerType(let_var->type_annotation);
       if (pointer_type.has_value()) {
-        OnArrayDeclaration(let_var, pointer_type.value(), 0, BufferVarInfo::kLetNode);
+        OnArrayDeclaration(let_var, PrimType(pointer_type.value()), 0, BufferVarInfo::kLetNode);
       } else if (allow_untyped_pointers_) {
-        OnArrayDeclaration(let_var, let_var->dtype, 0, BufferVarInfo::kLetNode);
+        OnArrayDeclaration(let_var, let_var.ty(), 0, BufferVarInfo::kLetNode);
       } else {
         TVM_FFI_THROW(InternalError) << "Let statement of variable " << let_var->name_hint
                                      << " is missing a type annotation, "
@@ -1300,15 +1316,16 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
    * @param declaration_location How the buffer was allocated, so that
    * some locations can be rewritten without others.
    */
-  void OnArrayDeclaration(Var buffer, DataType element_dtype, PrimExpr extent,
+  void OnArrayDeclaration(Var buffer, PrimType element_dtype, PrimExpr extent,
                           BufferVarInfo::DeclarationLocation declaration_location) {
     TVM_FFI_ICHECK(info_map_.find(buffer.get()) == info_map_.end())
         << "Array declaration of " << buffer->name_hint << " occurred multiple times.";
 
-    if (element_dtype == DataType::Bool()) {
-      element_dtype = DataType::Int(8).with_lanes(element_dtype.lanes());
+    if (element_dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
+      element_dtype = PrimType::Int(8, element_dtype.lanes());
     }
-    info_map_[buffer.get()] = BufferVarInfo{buffer, element_dtype, extent, declaration_location};
+    info_map_.insert_or_assign(buffer.get(),
+                               BufferVarInfo{buffer, element_dtype, extent, declaration_location});
   }
 
   /* Update the type map for a buffer based on its usage
@@ -1322,13 +1339,13 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
    *
    * @param is_buffer_load Whether the access is BufferLoad
    */
-  void OnArrayAccess(DataType value_dtype, const VarNode* buffer,
+  void OnArrayAccess(PrimType value_dtype, const VarNode* buffer,
                      const ffi::Array<PrimExpr>& indices, bool is_buffer_load) {
     auto it = info_map_.find(buffer);
     TVM_FFI_ICHECK(it != info_map_.end()) << "Load/Store of buffer " << buffer->name_hint << " ("
                                           << buffer << ") occurred before its declaration.";
 
-    if (value_dtype.is_scalable_vector()) {
+    if (value_dtype.IsScalableVector()) {
       // Scalable types are not currently supported in storage_rewrite. Scalable buffer
       // accesses are not currently checked and therefore are not rewritten.
       return;
@@ -1336,24 +1353,24 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
 
     BufferVarInfo& var_info = it->second;
 
-    if (value_dtype.element_of() == DataType::Bool()) {
-      value_dtype = DataType::Int(8).with_lanes(value_dtype.lanes());
+    if (value_dtype.WithLanes(1).MatchesCode(DLDataTypeCode::kDLBool)) {
+      value_dtype = PrimType::Int(8, value_dtype.lanes());
     }
 
-    if (var_info.element_dtype.is_handle()) {
+    if (var_info.element_dtype.IsHandle()) {
       TVM_FFI_ICHECK(allow_untyped_pointers_)
           << "Variable " << buffer->name_hint
           << " was missing a type annotation in its declaration";
-      var_info.element_dtype = value_dtype.element_of();
+      var_info.element_dtype = value_dtype.WithLanes(1);
     }
 
     for (int i = 0; i < static_cast<int>(indices.size()) - 1; i++) {
-      TVM_FFI_ICHECK(indices[i].dtype().is_scalar())
+      TVM_FFI_ICHECK(indices[i].ty().IsScalar())
           << "Only the last index of a buffer access may be a vector type.";
     }
-    int index_lanes = indices.size() ? indices.back().dtype().lanes() : 1;
+    int index_lanes = indices.size() ? indices.back().ty().lanes() : 1;
 
-    DataType access_dtype = value_dtype;
+    PrimType access_dtype = value_dtype;
 
     int lanes_used = var_info.element_dtype.lanes();
 
@@ -1366,7 +1383,7 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
     if (index_lanes * var_info.element_dtype.lanes() != value_dtype.lanes()) {
       TVM_FFI_ICHECK_EQ(index_lanes, value_dtype.lanes());
       lanes_used = 1;
-      var_info.element_dtype = var_info.element_dtype.with_lanes(1);
+      var_info.element_dtype = var_info.element_dtype.WithLanes(1);
     }
 
     // TODO(Lunderberg): Uncomment this check once it can be applied.
@@ -1399,13 +1416,13 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
 
     if (detect_scalar_read_patterns_ && is_buffer_load && indices.size()) {
       const PrimExpr last_dim_index = indices[indices.size() - 1];
-      if (last_dim_index.dtype().lanes() == 1) {
+      if (last_dim_index.ty().lanes() == 1) {
         arith::ModularSet me = analyzer_->modular_set(last_dim_index);
-        var_info.scalar_read_dtype.emplace(access_dtype.with_lanes(me->coeff));
+        var_info.scalar_read_dtype.emplace(access_dtype.WithLanes(me->coeff));
         return;
       }
     }
-    var_info.access_dtype.insert(access_dtype.with_lanes(lanes_used));
+    var_info.access_dtype.insert(access_dtype.WithLanes(lanes_used));
   }
 
   // Map of buffer variable information determined
@@ -1488,15 +1505,16 @@ class VectorTypeRewriter : public StmtExprMutator {
     // Rewrite any buffer variables whose preferred type isn't their current type.
     for (const auto& pair : info_map) {
       const auto& var_info = pair.second;
-      DataType preferred = var_info.get_preferred_dtype();
+      PrimType preferred = var_info.get_preferred_dtype();
       if (preferred != var_info.element_dtype && (rewrite_mask & var_info.declaration_location)) {
         Var old_buffer_var = var_info.var;
         Var new_buffer_var(old_buffer_var->name_hint,
-                           PointerType(PrimType(preferred), GetPtrStorageScope(old_buffer_var)),
+                           PointerType(preferred, GetPtrStorageScope(old_buffer_var)),
                            old_buffer_var->span);
 
-        rewrite_map_[var_info.var.get()] = {var_info.var, new_buffer_var, var_info.element_dtype,
-                                            preferred};
+        rewrite_map_.insert_or_assign(
+            var_info.var.get(),
+            RewriteInfo{var_info.var, new_buffer_var, var_info.element_dtype, preferred});
       }
     }
   }
@@ -1523,7 +1541,7 @@ class VectorTypeRewriter : public StmtExprMutator {
     const PrimExpr& last_dim_index = indices[indices.size() - 1];
     const RampNode* ramp_index = indices[indices.size() - 1].as<RampNode>();
 
-    if (node->buffer->dtype.is_scalable_vector() || last_dim_index.dtype().is_scalable_vector()) {
+    if (node->buffer->dtype.IsScalableVector() || last_dim_index.ty().IsScalableVector()) {
       // Scalable types are not currently supported in storage_rewrite. Scalable buffer
       // accesses are not currently checked and therefore are not rewritten.
       return {node, shuffle_index};
@@ -1531,17 +1549,17 @@ class VectorTypeRewriter : public StmtExprMutator {
 
     if (ramp_index && is_one(ramp_index->stride) && ramp_index->lanes->IsInstance<IntImmNode>()) {
       int lanes = static_cast<int>(ramp_index->lanes.as_or_throw<IntImm>()->value);
-      PrimExpr new_index = ramp_index->base / MakeConst(ramp_index->base.dtype(), lanes);
+      PrimExpr new_index = ramp_index->base / MakeConst(ramp_index->base.ty(), lanes);
       if (lanes != info.factor()) {
         TVM_FFI_ICHECK(info.factor() && lanes % info.factor() == 0);
         int new_lanes = lanes / info.factor();
         new_index = Ramp(new_index * new_lanes, ramp_index->stride, new_lanes, ramp_index->span);
       }
       indices.Set(indices.size() - 1, new_index);
-    } else if (last_dim_index.dtype().lanes() == 1 && info.factor() > 1) {
+    } else if (last_dim_index.ty().lanes() == 1 && info.factor() > 1) {
       arith::ModularSet me = analyzer_->modular_set(last_dim_index);
       TVM_FFI_ICHECK(me->coeff == 0 || info.factor() % me->coeff == 0);
-      PrimExpr new_index = last_dim_index / MakeConst(last_dim_index.dtype(), info.factor());
+      PrimExpr new_index = last_dim_index / MakeConst(last_dim_index.ty(), info.factor());
       shuffle_index = me->base % info.factor();
       indices.Set(indices.size() - 1, new_index);
     }
@@ -1612,7 +1630,7 @@ class VectorTypeRewriter : public StmtExprMutator {
 
       ffi::Array<PrimExpr> shape = buf->shape;
       PrimExpr last_dim = shape[shape.size() - 1];
-      shape.Set(shape.size() - 1, last_dim / MakeConst(last_dim.dtype(), info.factor()));
+      shape.Set(shape.size() - 1, last_dim / MakeConst(last_dim.ty(), info.factor()));
 
       auto writer = buf.CopyOnWrite();
       writer->data = info.new_buffer_var;
@@ -1647,13 +1665,13 @@ class VectorTypeRewriter : public StmtExprMutator {
 
       PrimExpr e_dtype = tirx::TypeAnnotation(info.new_element_dtype);
       int factor = info.factor();
-      extent = extent / MakeConst(extent.dtype(), factor);
-      index = index / MakeConst(index.dtype(), factor);
+      extent = extent / MakeConst(extent.ty(), factor);
+      index = index / MakeConst(index.ty(), factor);
       ffi::Array<PrimExpr> acc_args{e_dtype, info.new_buffer_var, index, extent, flag};
       // tvm_access_ptr produces a pointer; its Call.dtype must be handle
       // (the lowering rule in src/target/intrin_rule.cc ICHECKs this).
       // The element dtype is conveyed via the first arg (e_dtype marker).
-      return Call(DataType::Handle(), builtin::tvm_access_ptr(), acc_args);
+      return Call(PrimType::Handle(), builtin::tvm_access_ptr(), acc_args);
 
     } else {
       return StmtExprMutator::VisitExpr_(op);
@@ -1710,8 +1728,8 @@ class VectorTypeRewriter : public StmtExprMutator {
   struct RewriteInfo {
     Var old_buffer_var;
     Var new_buffer_var;
-    DataType old_element_dtype;
-    DataType new_element_dtype;
+    PrimType old_element_dtype;
+    PrimType new_element_dtype;
 
     int factor() const {
       int old_lanes = old_element_dtype.lanes();
diff --git a/src/tirx/transform/tile_primitive_dispatch.cc b/src/tirx/transform/tile_primitive_dispatch.cc
index 6052adcdc7ac..213264b1a2ae 100644
--- a/src/tirx/transform/tile_primitive_dispatch.cc
+++ b/src/tirx/transform/tile_primitive_dispatch.cc
@@ -605,7 +605,7 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
     // Synthesize the warp_id_in_cta helper (CUDA only) when threadIdx is set.
     if (launch_params_.count("threadIdx.x") > 0) {
       PrimExpr shuffled = ScopeIdResolve::ComputeWarpIdInCta(launch_params_);
-      Var warp_id_in_cta_var("warp_id_in_cta", shuffled.dtype());
+      Var warp_id_in_cta_var("warp_id_in_cta", shuffled.ty());
       scope_binds->push_back({warp_id_in_cta_var, shuffled});
       IterVar warp_iv(Range::FromMinExtent(0, 1), warp_id_in_cta_var, kThreadIndex,
                       "warp_id_in_cta");
@@ -664,8 +664,8 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
         // to map Vars back to their ScopeBinding.
         Var bind_var = def->def_ids[i];
         PrimExpr value = resolved[i];
-        if (bind_var->dtype != value.dtype()) {
-          value = Cast(bind_var->dtype, value);
+        if (bind_var.ty() != value.ty()) {
+          value = Cast(bind_var.ty(), value);
         }
         scope_binds->push_back({bind_var, value});
         if (is_implicit(bind_var)) {
@@ -1157,8 +1157,8 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
         << "TIRxError: tirx.filter expects (var, cond); got " << call->args.size() << " args";
     auto target = ResolveScopeIdTarget(call->args[0]);
     if (target && ElectSyncFinder::Contains(call->args[1])) {
-      PrimExpr selector = tirx::Call(call->args[0].dtype(), tirx::builtin::selector(),
-                                     {call->args[0], call->args[1]});
+      PrimExpr selector =
+          tirx::Call(call->args[0].ty(), tirx::builtin::selector(), {call->args[0], call->args[1]});
       int pushed = TryPushSelectorForTarget(*target, selector) ? 1 : 0;
       return pushed + PushPredicateCtx(call->args[1]);
     }
@@ -1269,7 +1269,7 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
         auto lane = FindLaneScopeVar();
         if (!lane) return -1;
         ScopeIdTarget target{ScopeBinding::kWarpThread, 0, 1};
-        PrimExpr selector = tirx::Call(lane->dtype(), tirx::builtin::selector(), {*lane, cond});
+        PrimExpr selector = tirx::Call(lane->ty(), tirx::builtin::selector(), {*lane, cond});
         return TryPushSelectorForTarget(target, selector) ? 1 : 0;
       }
       return -1;
@@ -1337,7 +1337,7 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
     if (!lane) return false;
     ScopeIdTarget target{ScopeBinding::kWarpThread, 0, 1};
     PrimExpr selector =
-        tirx::Call(lane->dtype(), tirx::builtin::selector(), {*lane, atom.elect_sync_call});
+        tirx::Call(lane->ty(), tirx::builtin::selector(), {*lane, atom.elect_sync_call});
     return TryPushSelectorForTarget(target, selector);
   }
 
@@ -1399,17 +1399,18 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
         args.push_back(new_arg);
       }
       if (changed) {
-        return tirx::Call(call->dtype, call->op, args, call->attrs, call->span);
+        return tirx::Call(call->ty(), call->op, args, call->attrs, call->span);
       }
     }
     return pred;
   }
 
   PrimExpr AsBool(PrimExpr pred) const {
-    if (pred.dtype().is_bool()) {
+    PrimType pred_ty = pred.ty();
+    if (pred_ty.MatchesCode(DLDataTypeCode::kDLBool)) {
       return pred;
     }
-    return pred != IntImm(pred.dtype(), 0);
+    return pred != IntImm(pred_ty, 0);  // Reuse the type queried above.
   }
 
   ffi::Map<Var, Range> var_range_map_;
diff --git a/src/tirx/transform/tvm_ffi_binder.cc b/src/tirx/transform/tvm_ffi_binder.cc
index ba0e671dd94b..fd659b53d7f0 100644
--- a/src/tirx/transform/tvm_ffi_binder.cc
+++ b/src/tirx/transform/tvm_ffi_binder.cc
@@ -69,10 +69,10 @@ TVMFFIABIBuilder::TVMFFIABIBuilder(const ffi::String& func_name, const ffi::Arra
         shape_os << buf->shape[j];
         os << shape_os.str();
       }
-      os << "], " << buf->dtype << ")";
+      os << "], " << buf->dtype->dtype << ")";
       param_names_[static_cast<int>(i)] = buf_name;
     } else {
-      os << param->name_hint << ": " << param.dtype();
+      os << param->name_hint << ": " << param.ty()->dtype;
       param_names_[static_cast<int>(i)] = param->name_hint;
     }
   }
@@ -87,7 +87,7 @@ TVMFFIABIBuilder::TVMFFIABIBuilder(const ffi::String& func_name, const ffi::Arra
 
   // Emit null-pointer check for packed args (early check)
   if (num_args > 0) {
-    EmitAssert(!Call(DataType::Bool(), builtin::isnullptr(), {v_packed_args}),
+    EmitAssert(!Call(PrimType::Bool(), builtin::isnullptr(), {v_packed_args}),
                "TypeError",  //
                "args pointer is NULL", when_calling_imm_, sig_imm_, "`");
   }
@@ -163,7 +163,7 @@ int TVMFFIABIBuilder::GetParamIndex(const ffi::reflection::AccessPath& path) con
 
 bool TVMFFIABIBuilder::BindScalar(const PrimExpr& arg, const PrimExpr& value,
                                   const ffi::reflection::AccessPath& path, bool with_lets) {
-  TVM_FFI_ICHECK_EQ(arg.dtype(), value.dtype());
+  TVM_FFI_ICHECK(arg.ty()->dtype == value.ty()->dtype);
   if (arg.as<VarNode>()) {
     Var v_arg = arg.as_or_throw<Var>();
     auto it = var_defs_.find(v_arg.get());
@@ -368,8 +368,8 @@ void TVMFFIABIBuilder::BindBuffer(const Buffer& arg, const Buffer& value,
     if (BindScalar(arg->elem_offset, value->elem_offset, offset_path, false)) {
       if (arg->offset_factor > 1) {
         PrimExpr offset = value->elem_offset;
-        PrimExpr factor = IntImm(offset.dtype(), arg->offset_factor);
-        PrimExpr zero = IntImm(offset.dtype(), 0);
+        PrimExpr factor = IntImm(offset.ty(), arg->offset_factor);
+        PrimExpr zero = IntImm(offset.ty(), 0);
         PrimExpr acond = analyzer_->Simplify(truncmod(offset, factor) == zero);
         if (is_zero(acond)) {
           TVM_FFI_THROW(InternalError)
@@ -377,7 +377,9 @@ void TVMFFIABIBuilder::BindBuffer(const Buffer& arg, const Buffer& value,
         }
         if (!is_one(acond)) {
           int param_index = GetParamIndex(base_path);
-          int data_bytes = GetVectorBytes(arg->dtype);
+          int data_bytes =
+              ((((arg->dtype->dtype).bits * static_cast<int16_t>((arg->dtype->dtype).lanes)) + 7) /
+               8);
           EmitAssert(acond, "ValueError",  //
                      "Misaligned buffer data on argument #", std::to_string(param_index),
                      when_calling_imm_, sig_imm_, "`,\n  expected data alignment=",
@@ -422,12 +424,12 @@ void TVMFFIABIBuilder::BindBuffer(const Buffer& arg, const Buffer& value,
 
 /*! \brief Load the i-th packed argument as the given type. */
 PrimExpr TVMFFIABIBuilder::LoadTVMFFIAnyUnionValue(const Var& v_packed_args, int param_index,
-                                                   DataType arg_type) {
+                                                   PrimType arg_type) {
   ffi::Array<PrimExpr> call_args{v_packed_args, IntImm::Int32(param_index),
                                  IntImm::Int32(builtin::kTVMFFIAnyUnionValue)};
-  DataType api_type = APIType(arg_type);
+  PrimType api_type = APIType(arg_type);
   PrimExpr res = Call(api_type, builtin::tvm_struct_get(), call_args);
-  if (api_type != arg_type) {
+  if (api_type->dtype != arg_type->dtype) {
     res = Cast(arg_type, res);
   }
   return res;
@@ -447,8 +449,8 @@ PrimExpr TVMFFIABIBuilder::DecodeParamOpaqueHandle(int param_index, const Var& t
   const int64_t object_cell_offset = sizeof(TVMFFIObject);
   static_assert(sizeof(TVMFFIObject) == 24);
   PrimExpr arg_value =
-      LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, params_[param_index].dtype());
-  PrimExpr handle_from_tensor = Call(DataType::Handle(), tirx::builtin::handle_add_byte_offset(),
+      LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, params_[param_index].ty());
+  PrimExpr handle_from_tensor = Call(PrimType::Handle(), tirx::builtin::handle_add_byte_offset(),
                                      {arg_value, IntImm::Int32(object_cell_offset)});
   return Select(type_index == ffi::TypeIndex::kTVMFFITensor, handle_from_tensor, arg_value);
 }
@@ -459,11 +461,11 @@ PrimExpr TVMFFIABIBuilder::DecodeParamBool(int param_index, const Var& type_inde
       param_index,
       type_index == ffi::TypeIndex::kTVMFFIBool || type_index == ffi::TypeIndex::kTVMFFIInt,
       "boolean");
-  return Cast(DataType::Bool(),
-              LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, DataType::Int(64)));
+  return Cast(PrimType::Bool(),
+              LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, PrimType::Int(64)));
 }
 
-PrimExpr TVMFFIABIBuilder::DecodeParamInt(int param_index, const Var& type_index, DataType dtype) {
+PrimExpr TVMFFIABIBuilder::DecodeParamInt(int param_index, const Var& type_index, PrimType dtype) {
   // ── Type check: accept int or bool ─────────────────────────
   EmitTypeIndexCheck(
       param_index,
@@ -472,7 +474,7 @@ PrimExpr TVMFFIABIBuilder::DecodeParamInt(int param_index, const Var& type_index
 }
 
 PrimExpr TVMFFIABIBuilder::DecodeParamFloat(int param_index, const Var& type_index,
-                                            DataType dtype) {
+                                            PrimType dtype) {
   // ── Type check: accept float, int, or bool ─────────────────
   EmitTypeIndexCheck(param_index,
                      type_index == ffi::TypeIndex::kTVMFFIFloat ||
@@ -483,7 +485,7 @@ PrimExpr TVMFFIABIBuilder::DecodeParamFloat(int param_index, const Var& type_ind
       type_index == ffi::TypeIndex::kTVMFFIFloat,
       /* true_value = */ LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, dtype),
       /* false_value = */
-      Cast(dtype, LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, DataType::Int(64))));
+      Cast(dtype, LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, PrimType::Int(64))));
 }
 
 // ============================================================
@@ -492,24 +494,24 @@ PrimExpr TVMFFIABIBuilder::DecodeParamFloat(int param_index, const Var& type_ind
 
 void TVMFFIABIBuilder::DecodeParam(int param_index) {
   Var param = params_[param_index];
-  DataType dtype = param.dtype();
+  PrimType dtype = param.ty();
 
   // Extract type_index from packed_args
-  Var type_index(param->name_hint + ".type_index", DataType::Int(32));
-  init_nest_.push_back(Bind(type_index, tirx::Call(DataType::Int(32), builtin::tvm_struct_get(),
+  Var type_index(param->name_hint + ".type_index", PrimType::Int(32));
+  init_nest_.push_back(Bind(type_index, tirx::Call(PrimType::Int(32), builtin::tvm_struct_get(),
                                                    {v_packed_args_, IntImm::Int32(param_index),
                                                     IntImm::Int32(builtin::kTVMFFIAnyTypeIndex)})));
 
   // Type-check and load value via per-dtype dispatch
   PrimExpr arg_value;
-  if (dtype.is_handle()) {
+  if (dtype.IsHandle()) {
     arg_value = DecodeParamOpaqueHandle(param_index, type_index);
-  } else if (dtype.is_bool()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
     arg_value = DecodeParamBool(param_index, type_index);
-  } else if (dtype.is_int() || dtype.is_uint()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     arg_value = DecodeParamInt(param_index, type_index, dtype);
   } else {
-    TVM_FFI_ICHECK(dtype.is_float());
+    TVM_FFI_ICHECK_EQ(dtype.code(), DLDataTypeCode::kDLFloat);
     arg_value = DecodeParamFloat(param_index, type_index, dtype);
   }
 
@@ -553,9 +555,9 @@ void TVMFFIABIBuilder::DecodeAllParams() {
 
 Var TVMFFIABIBuilder::DLTensorGetFieldPtr(const Var& handle, int field_kind,
                                           const std::string& var_name) {
-  Var ptr(var_name, DataType::Handle());
+  Var ptr(var_name, PrimType::Handle());
   init_nest_.emplace_back(
-      Bind(ptr, TVMStructGet(DataType::Handle(), handle, 0,
+      Bind(ptr, TVMStructGet(PrimType::Handle(), handle, 0,
                              static_cast<builtin::TVMStructFieldKind>(field_kind))));
   return ptr;
 }
@@ -565,7 +567,7 @@ Var TVMFFIABIBuilder::DLTensorGetFieldPtr(const Var& handle, int field_kind,
 // ============================================================
 
 PrimExpr TVMFFIABIBuilder::LoadInt64ArrayElem(const Var& ptr, int index) {
-  return TVMStructGet(DataType::ShapeIndex(), ptr, index, builtin::kInt64ArrayElem);
+  return TVMStructGet(DefaultIndexPrimType(), ptr, index, builtin::kInt64ArrayElem);
 }
 
 // ============================================================
@@ -575,7 +577,7 @@ PrimExpr TVMFFIABIBuilder::LoadInt64ArrayElem(const Var& ptr, int index) {
 void TVMFFIABIBuilder::BindCompactStrides(const Buffer& buffer, const Var& strides_ptr,
                                           const PrimExpr& v_strides_is_null,
                                           const ffi::reflection::AccessPath& param_path) {
-  DataType stype = buffer->DefaultIndexType();
+  PrimType stype(buffer->DefaultIndexType());
   PrimExpr expect_stride = MakeConst(stype, 1);
   ffi::Array<PrimExpr> conds;
   for (size_t i = buffer->shape.size(); i != 0; --i) {
@@ -602,11 +604,11 @@ void TVMFFIABIBuilder::BindCompactStrides(const Buffer& buffer, const Var& strid
 void TVMFFIABIBuilder::BindAutoBroadcastStrides(const Buffer& buffer, const Var& strides_ptr,
                                                 const PrimExpr& v_strides_is_null,
                                                 const ffi::reflection::AccessPath& param_path) {
-  DataType stype = buffer->DefaultIndexType();
+  PrimType stype(buffer->DefaultIndexType());
   PrimExpr stride = MakeConst(stype, 1);
   for (size_t i = buffer->shape.size(); i != 0; --i) {
     size_t k = i - 1;
-    PrimExpr value = cast(buffer->shape[k].dtype(), LoadInt64ArrayElem(strides_ptr, k));
+    PrimExpr value = cast(buffer->shape[k].ty(), LoadInt64ArrayElem(strides_ptr, k));
     value = tvm::if_then_else(v_strides_is_null, stride, value);
     value = tvm::if_then_else(buffer->shape[k] == 1, 0, value);
     ffi::reflection::AccessPath strides_k_path =
@@ -621,13 +623,13 @@ void TVMFFIABIBuilder::BindRegularStrides(const Buffer& buffer, const Var& strid
                                           const ffi::reflection::AccessPath& param_path) {
   PrimExpr stride_from_shape = 1;
   for (int k = buffer->strides.size() - 1; k >= 0; k--) {
-    PrimExpr explicit_stride = cast(buffer->shape[k].dtype(), LoadInt64ArrayElem(strides_ptr, k));
+    PrimExpr explicit_stride = cast(buffer->shape[k].ty(), LoadInt64ArrayElem(strides_ptr, k));
     ffi::reflection::AccessPath strides_k_path =
         param_path->Attr(ffi::String("strides"))->ArrayItem(k);
     BindScalar(buffer->strides[k],
                tvm::if_then_else(v_strides_is_null, stride_from_shape, explicit_stride),
                strides_k_path, true);
-    stride_from_shape *= cast(buffer->shape[k].dtype(), LoadInt64ArrayElem(shape_ptr, k));
+    stride_from_shape *= cast(buffer->shape[k].ty(), LoadInt64ArrayElem(shape_ptr, k));
   }
 }
 
@@ -639,14 +641,14 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
                                            const PrimExpr& device_id, const Var& handle,
                                            const std::string& arg_name,
                                            ffi::reflection::AccessPath base_path) {
-  const DataType tvm_ndim_type = DataType::Int(32);
+  const PrimType tvm_ndim_type = PrimType::Int(32);
 
   std::string buf_name = buffer->name;
   ffi::reflection::AccessPath param_path = base_path;
   int param_index = GetParamIndex(base_path);
 
   // ── Section: Null pointer check ──────────────────────────────
-  EmitTypeIndexCheck(param_index, !Call(DataType::Bool(), builtin::isnullptr(), {handle}),
+  EmitTypeIndexCheck(param_index, !Call(PrimType::Bool(), builtin::isnullptr(), {handle}),
                      "Tensor");
 
   // ── Section: ndim ────────────────────────────────────────────
@@ -658,16 +660,21 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
 
   // ── Section: dtype ───────────────────────────────────────────
   {
-    PrimExpr cond = (TVMStructGet(DataType::UInt(8), handle, 0, builtin::kDLTensorTypeCode) ==
-                         IntImm(DataType::UInt(8), buffer->dtype.code()) &&
-                     TVMStructGet(DataType::UInt(8), handle, 0, builtin::kDLTensorTypeBits) ==
-                         IntImm(DataType::UInt(8), buffer->dtype.bits()) &&
-                     TVMStructGet(DataType::UInt(16), handle, 0, builtin::kDLTensorTypeLanes) ==
-                         IntImm(DataType::UInt(16), buffer->dtype.lanes()));
-    if (!(buffer->dtype == DataType::Int(1) || buffer->dtype == DataType::Int(4) ||
-          buffer->dtype == DataType::UInt(4))) {
+    PrimExpr code_matches =
+        TVMStructGet(PrimType::UInt(8), handle, 0, builtin::kDLTensorTypeCode) ==
+        IntImm(PrimType::UInt(8), buffer->dtype.code());
+    PrimExpr bits_matches =
+        TVMStructGet(PrimType::UInt(8), handle, 0, builtin::kDLTensorTypeBits) ==
+        IntImm(PrimType::UInt(8), buffer->dtype.bits());
+    PrimExpr lanes_matches =
+        TVMStructGet(PrimType::UInt(16), handle, 0, builtin::kDLTensorTypeLanes) ==
+        IntImm(PrimType::UInt(16), buffer->dtype.lanes());
+    PrimExpr cond = cast(PrimType::Bool(), code_matches) && cast(PrimType::Bool(), bits_matches) &&
+                    cast(PrimType::Bool(), lanes_matches);
+    if (!(buffer->dtype == PrimType::Int(1) || buffer->dtype == PrimType::Int(4) ||
+          buffer->dtype == PrimType::UInt(4))) {
       std::ostringstream dtype_os;
-      dtype_os << buffer->dtype;
+      dtype_os << buffer->dtype->dtype;
       EmitAssert(cond, "TypeError",  //
                  "Mismatched ", buf_name, ".dtype on argument #", std::to_string(param_index),
                  when_calling_imm_, sig_imm_, "`,\n  expected ", dtype_os.str());
@@ -677,18 +684,18 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
   // ── Section: shape ───────────────────────────────────────────
   Var shape_ptr = DLTensorGetFieldPtr(handle, builtin::kDLTensorShape, arg_name + "_shape");
   for (size_t k = 0; k < buffer->shape.size(); ++k) {
-    if (buffer->dtype == DataType::Int(4) || buffer->dtype == DataType::UInt(4) ||
-        buffer->dtype == DataType::Int(1)) {
+    if (buffer->dtype == PrimType::Int(4) || buffer->dtype == PrimType::UInt(4) ||
+        buffer->dtype == PrimType::Int(1)) {
       break;
     }
     ffi::reflection::AccessPath shape_k_path = param_path->Attr(ffi::String("shape"))->ArrayItem(k);
-    BindScalar(buffer->shape[k], cast(buffer->shape[k].dtype(), LoadInt64ArrayElem(shape_ptr, k)),
+    BindScalar(buffer->shape[k], cast(buffer->shape[k].ty(), LoadInt64ArrayElem(shape_ptr, k)),
                shape_k_path, true);
   }
 
   // ── Section: strides ─────────────────────────────────────────
   Var strides_ptr = DLTensorGetFieldPtr(handle, builtin::kDLTensorStrides, arg_name + "_strides");
-  PrimExpr v_strides_is_null = Call(DataType::Bool(), builtin::isnullptr(), {strides_ptr});
+  PrimExpr v_strides_is_null = Call(PrimType::Bool(), builtin::isnullptr(), {strides_ptr});
   if (buffer->strides.size() == 0) {
     BindCompactStrides(buffer, strides_ptr, v_strides_is_null, param_path);
   } else if (buffer->buffer_type == kAutoBroadcast) {
@@ -698,22 +705,22 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
   }
 
   // ── Section: byte_offset ─────────────────────────────────────
-  int data_bytes = GetVectorBytes(buffer->dtype);
+  int data_bytes = static_cast<int>(buffer->dtype.StorageBytes());
   ffi::reflection::AccessPath byte_offset_path = param_path->Attr(ffi::String("byte_offset"));
   if (const auto* const_offset = buffer->elem_offset.as<IntImmNode>()) {
-    BindScalar(IntImm(DataType::UInt(64), const_offset->value * data_bytes),
-               TVMStructGet(DataType::UInt(64), handle, 0, builtin::kDLTensorByteOffset),
+    BindScalar(IntImm(PrimType::UInt(64), const_offset->value * data_bytes),
+               TVMStructGet(PrimType::UInt(64), handle, 0, builtin::kDLTensorByteOffset),
                byte_offset_path, true);
   } else {
     if (BindScalar(buffer->elem_offset,
-                   cast(buffer->elem_offset.dtype(),
-                        (TVMStructGet(DataType::UInt(64), handle, 0, builtin::kDLTensorByteOffset) /
-                         MakeConst(DataType::UInt(64), data_bytes))),
+                   cast(buffer->elem_offset.ty(),
+                        (TVMStructGet(PrimType::UInt(64), handle, 0, builtin::kDLTensorByteOffset) /
+                         MakeConst(PrimType::UInt(64), data_bytes))),
                    byte_offset_path, true)) {
       if (buffer->offset_factor > 1) {
         PrimExpr offset = buffer->elem_offset;
-        PrimExpr factor = IntImm(offset.dtype(), buffer->offset_factor);
-        PrimExpr zero = IntImm(offset.dtype(), 0);
+        PrimExpr factor = IntImm(offset.ty(), buffer->offset_factor);
+        PrimExpr zero = IntImm(offset.ty(), 0);
         PrimExpr acond = analyzer_->Simplify(truncmod(offset, factor) == zero);
         if (is_zero(acond)) {
           TVM_FFI_THROW(InternalError)
@@ -732,7 +739,7 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
   // ── Section: device ──────────────────────────────────────────
   {
     PrimExpr actual_device_type =
-        TVMStructGet(DataType::Int(32), handle, 0, builtin::kDLTensorDeviceType);
+        TVMStructGet(PrimType::Int(32), handle, 0, builtin::kDLTensorDeviceType);
     // Use custom assertion for device_type to show human-readable device name
     if (const auto* const_dt = device_type_.as<IntImmNode>()) {
       PrimExpr cond = analyzer_->Simplify(IntImm::Int32(const_dt->value) == actual_device_type);
@@ -748,7 +755,7 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
       BindScalar(device_type_, actual_device_type, device_type_path, true);
     }
     ffi::reflection::AccessPath device_id_path = param_path->Attr(ffi::String("device_id"));
-    BindScalar(device_id_, TVMStructGet(DataType::Int(32), handle, 0, builtin::kDLTensorDeviceId),
+    BindScalar(device_id_, TVMStructGet(PrimType::Int(32), handle, 0, builtin::kDLTensorDeviceId),
                device_id_path, true);
   }
 
@@ -756,12 +763,12 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
   {
     ffi::reflection::AccessPath data_path = param_path->Attr(ffi::String("data"));
     if (BindScalar(buffer->data,
-                   TVMStructGet(DataType::Handle(), handle, 0, builtin::kDLTensorData), data_path,
+                   TVMStructGet(PrimType::Handle(), handle, 0, builtin::kDLTensorData), data_path,
                    true)) {
       Var vptr(buffer->data);
 
       auto alloc_size = [&]() -> PrimExpr {
-        PrimExpr product = IntImm(buffer->DefaultIndexType(), 1);
+        PrimExpr product = IntImm(PrimType(buffer->DefaultIndexType()), 1);
         for (const auto& dim : buffer->shape) {
           product *= dim;
         }
@@ -769,9 +776,10 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
       }();
       // Data pointer null and alignment checks go to asserts_ because alloc_size
       // references buffer->shape which may contain forward-referenced symbolic vars.
+      PrimExpr empty_alloc = cast(PrimType::Bool(), alloc_size == 0);
+      PrimExpr data_non_null = !Call(PrimType::Bool(), builtin::isnullptr(), {vptr});
       asserts_.emplace_back(AssertStmt(
-          alloc_size == 0 || !Call(DataType::Bool(), builtin::isnullptr(), {vptr}),
-          StringImm("ValueError"),
+          empty_alloc || data_non_null, StringImm("ValueError"),
           ffi::Array<StringImm>({StringImm(buf_name),
                                  StringImm(" data pointer is NULL on argument #"),
                                  StringImm(std::to_string(param_index)), when_calling_imm_,
@@ -781,10 +789,10 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
         // Check data pointer alignment
         if (buffer->data_alignment > 1) {
           PrimExpr ptr_as_int =
-              Call(DataType::UInt(64), builtin::reinterpret(), {cast(DataType::Handle(), vptr)});
+              Call(PrimType::UInt(64), builtin::reinterpret(), {cast(PrimType::Handle(), vptr)});
           PrimExpr align_cond =
-              truncmod(ptr_as_int, IntImm(DataType::UInt(64), buffer->data_alignment)) ==
-              IntImm(DataType::UInt(64), 0);
+              truncmod(ptr_as_int, IntImm(PrimType::UInt(64), buffer->data_alignment)) ==
+              IntImm(PrimType::UInt(64), 0);
           asserts_.emplace_back(AssertStmt(
               alloc_size == 0 || align_cond, StringImm("ValueError"),
               ffi::Array<StringImm>({StringImm("Misaligned Tensor data on argument #"),
diff --git a/src/tirx/transform/tvm_ffi_binder.h b/src/tirx/transform/tvm_ffi_binder.h
index 92af52df6bcb..a465025ad517 100644
--- a/src/tirx/transform/tvm_ffi_binder.h
+++ b/src/tirx/transform/tvm_ffi_binder.h
@@ -180,7 +180,7 @@ class TVMFFIABIBuilder {
 
   /*! \brief Load the i-th packed argument as the given type from the union value. */
   static PrimExpr LoadTVMFFIAnyUnionValue(const Var& v_packed_args, int param_index,
-                                          DataType arg_type);
+                                          PrimType arg_type);
 
   // ── Per-dtype type-check + value-load methods ──────────────────
   //
@@ -211,7 +211,7 @@ class TVMFFIABIBuilder {
    * \param dtype The expected data type for this parameter.
    * \return The loaded argument value.
    */
-  PrimExpr DecodeParamInt(int param_index, const Var& type_index, DataType dtype);
+  PrimExpr DecodeParamInt(int param_index, const Var& type_index, PrimType dtype);
 
   /*!
    * \brief Type-check and load a float argument.
@@ -220,7 +220,7 @@ class TVMFFIABIBuilder {
    * \param dtype The expected data type for this parameter.
    * \return The loaded argument value.
    */
-  PrimExpr DecodeParamFloat(int param_index, const Var& type_index, DataType dtype);
+  PrimExpr DecodeParamFloat(int param_index, const Var& type_index, PrimType dtype);
 
   // ── Private binding submethods (all take ffi::reflection::AccessPath) ───────────
 
diff --git a/src/tirx/transform/unroll_loop.cc b/src/tirx/transform/unroll_loop.cc
index 740176f50498..c41e717ca8f1 100644
--- a/src/tirx/transform/unroll_loop.cc
+++ b/src/tirx/transform/unroll_loop.cc
@@ -225,7 +225,7 @@ class LoopUnroller : public StmtExprMutator {
     ffi::Map<Var, PrimExpr> vmap;
     ffi::Array<Stmt> unrolled;
     for (int i = 0; i < value; ++i) {
-      vmap.Set(op->loop_var, op->min + MakeConst(op->loop_var.dtype(), i));
+      vmap.Set(op->loop_var, op->min + MakeConst(op->loop_var.ty(), i));
       Stmt step = Substitute(body, vmap);
       unrolled.push_back(step);
     }
diff --git a/src/tirx/transform/unsupported_dtype_legalize.cc b/src/tirx/transform/unsupported_dtype_legalize.cc
index 01ae31938117..cc9725dc7a23 100644
--- a/src/tirx/transform/unsupported_dtype_legalize.cc
+++ b/src/tirx/transform/unsupported_dtype_legalize.cc
@@ -37,6 +37,31 @@
 namespace tvm {
 namespace tirx {
 
+namespace {
+/*! \brief Whether \p type has a 16-bit kDLBfloat element type. */
+bool IsBFloat16Type(const PrimType& type) {
+  return type.MatchesElementType(DLDataTypeCode::kDLBfloat, 16);
+}
+
+/*! \brief Whether the type code of \p type is one of the kDLFloat8_* codes listed below. */
+bool IsFloat8Type(const PrimType& type) {
+  DLDataTypeCode code = type.code();
+  return code == DLDataTypeCode::kDLFloat8_e3m4 || code == DLDataTypeCode::kDLFloat8_e4m3 ||
+         code == DLDataTypeCode::kDLFloat8_e4m3b11fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e4m3fn || code == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e5m2 || code == DLDataTypeCode::kDLFloat8_e5m2fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e8m0fnu;
+}
+
+/*! \brief Apply predicate \p f to \p type viewed as a PrimType; false for any other type. */
+template <typename F>
+bool MatchPrimType(const Type& type, F f) {
+  const auto* as_prim = type.as<PrimTypeNode>();
+  return as_prim != nullptr && f(ffi::GetRef<PrimType>(as_prim));
+}
+
+}  // namespace
+
 // NOTE: do not touch buffer on function boundary
 // remap internal fp8/bf16 buffer to f32 if they meet the following condition
 // - constant allocation size
@@ -47,7 +72,7 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
  public:
   ComputeLegalizePlanner(
       std::unordered_map<Buffer, Buffer, ffi::ObjectPtrHash, ffi::ObjectPtrEqual>* buffer_remap,
-      std::unordered_map<Var, Var>* var_remap, DataType promote_dtype)
+      std::unordered_map<Var, Var>* var_remap, PrimType promote_dtype)
       : buffer_remap_(buffer_remap), var_remap_(var_remap), promote_dtype_(promote_dtype) {}
 
   // run planning to populate buffer remap and var remap.
@@ -74,7 +99,7 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
     }
   }
 
-  virtual bool MatchDType(DataType dtype) const = 0;
+  virtual bool MatchType(const Type& type) const = 0;
 
   void VisitStmt_(const BufferStoreNode* op) final {
     StmtExprVisitor::VisitStmt_(op);
@@ -88,14 +113,13 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
 
   void VisitStmt_(const AllocBufferNode* op) final {
     // remap all intermediate constant buffer to promote data types (fp16/fp32)
-    if (MatchDType(op->buffer->dtype)) {
-      DataType dtype = promote_dtype_.with_lanes(op->buffer->dtype.lanes());
+    if (MatchType(op->buffer->dtype)) {
+      PrimType dtype = promote_dtype_.WithLanes(op->buffer->dtype.lanes());
       ffi::String storage_scope = "global";
       if (auto* ptr_type = op->buffer->data->type_annotation.as<PointerTypeNode>()) {
         storage_scope = ptr_type->storage_scope;
       }
-      Var buffer_var =
-          Var(op->buffer->data->name_hint, PointerType(PrimType(dtype), storage_scope));
+      Var buffer_var = Var(op->buffer->data->name_hint, PointerType(dtype, storage_scope));
       (*var_remap_)[op->buffer->data] = buffer_var;
     }
     return StmtExprVisitor::VisitStmt_(op);
@@ -109,7 +133,7 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
   void VisitExpr_(const VarNode* op) final {
     StmtExprVisitor::VisitExpr_(op);
     Var buffer_var = ffi::GetRef<Var>(op);
-    if (buffer_var.dtype().is_handle()) {
+    if (buffer_var.ty().IsHandle()) {
       opaque_var_access_.insert(buffer_var);
     }
   }
@@ -119,7 +143,7 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
     auto var_it = var_remap_->find(buf->data);
     if (var_it == var_remap_->end()) return;
 
-    Buffer new_buffer(var_it->second, promote_dtype_.with_lanes(buf->dtype.lanes()), buf->shape,
+    Buffer new_buffer(var_it->second, promote_dtype_.WithLanes(buf->dtype.lanes()), buf->shape,
                       buf->strides, buf->elem_offset, buf->name, buf->data_alignment,
                       buf->offset_factor, buf->buffer_type, buf->axis_separators, buf->span,
                       buf->layout, buf->allocated_addr);
@@ -129,25 +153,29 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
   std::unordered_map<Buffer, Buffer, ffi::ObjectPtrHash, ffi::ObjectPtrEqual>* buffer_remap_;
   std::unordered_map<Var, Var>* var_remap_;
   std::unordered_set<Var> opaque_var_access_;
-  DataType promote_dtype_;
+  PrimType promote_dtype_;
 };
 
 class BF16ComputeLegalizePlanner : public ComputeLegalizePlanner {
  public:
   explicit BF16ComputeLegalizePlanner(
       std::unordered_map<Buffer, Buffer, ffi::ObjectPtrHash, ffi::ObjectPtrEqual>* buffer_remap,
-      std::unordered_map<Var, Var>* var_remap, DataType promote_dtype)
+      std::unordered_map<Var, Var>* var_remap, PrimType promote_dtype)
       : ComputeLegalizePlanner(buffer_remap, var_remap, promote_dtype) {}
-  bool MatchDType(DataType dtype) const { return dtype.is_bfloat16(); }
+  bool MatchType(const Type& type) const override {  // overrides ComputeLegalizePlanner
+    return MatchPrimType(type, IsBFloat16Type);  // false when `type` is not a PrimType
+  }
 };
 
 class FP8ComputeLegalizePlanner : public ComputeLegalizePlanner {
  public:
   explicit FP8ComputeLegalizePlanner(
       std::unordered_map<Buffer, Buffer, ffi::ObjectPtrHash, ffi::ObjectPtrEqual>* buffer_remap,
-      std::unordered_map<Var, Var>* var_remap, DataType promote_dtype)
+      std::unordered_map<Var, Var>* var_remap, PrimType promote_dtype)
       : ComputeLegalizePlanner(buffer_remap, var_remap, promote_dtype) {}
-  bool MatchDType(DataType dtype) const { return dtype.is_float8(); }
+  bool MatchType(const Type& type) const override {  // overrides ComputeLegalizePlanner
+    return MatchPrimType(type, IsFloat8Type);  // false when `type` is not a PrimType
+  }
 };
 
 #define DEFINE_BIOP_EXPR_LEGALIZE(OP, FUNC)                      \
@@ -169,7 +197,7 @@ class FP8ComputeLegalizePlanner : public ComputeLegalizePlanner {
 // point in the TIR lowering phases.
 class ComputeLegalizer : public StmtExprMutator {
  public:
-  explicit ComputeLegalizer(DataType promote_dtype) : promote_dtype_(promote_dtype) {}
+  explicit ComputeLegalizer(PrimType promote_dtype) : promote_dtype_(promote_dtype) {}
 
   PrimFunc LegalizeWithPlanner(PrimFunc func, ComputeLegalizePlanner* planner) {
     planner->Plan(func);
@@ -180,21 +208,22 @@ class ComputeLegalizer : public StmtExprMutator {
 
   virtual PrimFunc Legalize(PrimFunc func) = 0;
 
-  virtual bool MatchDType(DataType dtype) const = 0;
+  virtual bool MatchType(const Type& type) const = 0;
 
  protected:
   PrimExpr VisitExpr_(const CastNode* op) final {
     auto op_val = PromoteToTarget(this->VisitExpr(op->value));
 
     // all casts to matched data type (fp8/bf16) becomes f32
-    if (MatchDType(op->dtype)) {
-      return cast(promote_dtype_.with_lanes(op->dtype.lanes()), op_val);
+    PrimType op_ty = op->ty();
+    if (MatchType(op_ty)) {
+      return cast(promote_dtype_.WithLanes(op_ty.lanes()), op_val);
     }
 
     if (op_val.same_as(op->value)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      return cast(op->dtype, op_val);
+      return cast(op_ty, op_val);
     }
   }
 
@@ -237,18 +266,19 @@ class ComputeLegalizer : public StmtExprMutator {
     // update normal computations to return f32 instead.
     auto fmutate = [this](const PrimExpr& e) { return PromoteToTarget(this->VisitExpr(e)); };
     ffi::Array<PrimExpr> args = op->args.Map(fmutate);
-    if (MatchDType(op->dtype)) {
-      return Call(promote_dtype_.with_lanes(op->dtype.lanes()), op->op, args, op->attrs, op->span);
+    PrimType op_ty = op->ty();
+    if (MatchType(op_ty)) {
+      return Call(promote_dtype_.WithLanes(op_ty.lanes()), op->op, args, op->attrs, op->span);
     }
     if (args.same_as(op->args)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      return Call(op->dtype, op->op, args, op->attrs, op->span);
+      return Call(op_ty, op->op, args, op->attrs, op->span);  // keep the original result type
     }
   }
 
   PrimExpr VisitExpr_(const FloatImmNode* op) final {
-    if (MatchDType(op->dtype)) {
+    if (MatchType(op->ty())) {
       return FloatImm(promote_dtype_, op->value);
     }
     return ffi::GetRef<PrimExpr>(op);
@@ -268,8 +298,8 @@ class ComputeLegalizer : public StmtExprMutator {
   PrimExpr VisitExpr_(const LetNode* op) final {
     PrimExpr value = PromoteToTarget(op->value);
     Var var = op->var;
-    if (value.dtype() != op->value.dtype()) {
-      var = op->var.copy_with_dtype(op->value.dtype());
+    if (value.ty() != op->value.ty()) {
+      var = op->var.copy_with_dtype(op->value.ty());
       var_remap_[op->var] = var;
     }
 
@@ -298,8 +328,8 @@ class ComputeLegalizer : public StmtExprMutator {
   Stmt VisitStmt_(const BindNode* op) final {
     PrimExpr value = PromoteToTarget(op->value);
     Var var = op->var;
-    if (value.dtype() != op->value.dtype()) {
-      var = op->var.copy_with_dtype(op->value.dtype());
+    if (value.ty() != op->value.ty()) {
+      var = op->var.copy_with_dtype(op->value.ty());
       var_remap_[op->var] = var;
     }
 
@@ -321,17 +351,17 @@ class ComputeLegalizer : public StmtExprMutator {
     if (value.same_as(op->value) && indices.same_as(op->indices) && new_buf.same_as(op->buffer)) {
       return ffi::GetRef<Stmt>(op);
     } else {
-      if (MatchDType(new_buf->dtype)) {
-        int index_lanes = indices.size() ? indices.back().dtype().lanes() : 1;
+      if (MatchType(new_buf->dtype)) {
+        int index_lanes = indices.size() ? indices.back().ty().lanes() : 1;
         int buffer_lanes = new_buf->dtype.lanes();
-        DataType legalized_dtype = new_buf->dtype.with_lanes(index_lanes * buffer_lanes);
+        PrimType legalized_dtype = new_buf->dtype.WithLanes(index_lanes * buffer_lanes);
         value = CastTargetToDType(value, legalized_dtype);
       }
-      if (value.dtype() != new_buf->dtype) {
+      if (value.ty() != new_buf->dtype) {
         // this happens when buffer get rewritten to f32
         // but values remain as fp8/bf16
-        TVM_FFI_ICHECK(MatchDType(value->dtype));
-        value = DTypeConversion(value, new_buf->dtype.with_lanes(value.dtype().lanes()));
+        TVM_FFI_ICHECK(MatchType(value.ty()));
+        value = DTypeConversion(value, new_buf->dtype.WithLanes(value.ty().lanes()));
       }
       TVM_FFI_ICHECK(!op->predicate.defined())
           << "Predicated buffer store is not currently supported in "
@@ -360,12 +390,12 @@ class ComputeLegalizer : public StmtExprMutator {
       // Remap input variables
       for (size_t i = 0; i < legalized_identity_elements.size(); i++) {
         Var lhs_var = reducer->lhs[i];
-        if (lhs_var.dtype() != legalized_identity_elements[i].dtype()) {
-          var_remap_[lhs_var] = lhs_var.copy_with_dtype(legalized_identity_elements[i].dtype());
+        if (lhs_var.ty() != legalized_identity_elements[i].ty()) {
+          var_remap_[lhs_var] = lhs_var.copy_with_dtype(legalized_identity_elements[i].ty());
         }
         Var rhs_var = reducer->rhs[i];
-        if (rhs_var.dtype() != legalized_identity_elements[i].dtype()) {
-          var_remap_[rhs_var] = rhs_var.copy_with_dtype(legalized_identity_elements[i].dtype());
+        if (rhs_var.ty() != legalized_identity_elements[i].ty()) {
+          var_remap_[rhs_var] = rhs_var.copy_with_dtype(legalized_identity_elements[i].ty());
         }
       }
 
@@ -442,12 +472,12 @@ class ComputeLegalizer : public StmtExprMutator {
    * \return The converted value.
    */
   PrimExpr PromoteToTarget(PrimExpr value) {
-    if (!MatchDType(value.dtype())) return value;
+    PrimType value_ty = value.ty();
+    if (!MatchType(value_ty)) return value;
     if (const CastNode* cast = value.as<CastNode>()) {
-      if (cast->value.dtype() == promote_dtype_.with_lanes(value.dtype().lanes()))
-        return cast->value;
+      if (cast->value.ty() == promote_dtype_.WithLanes(value_ty.lanes())) return cast->value;
     }
-    return DTypeConversion(value, promote_dtype_.with_lanes(value.dtype().lanes()));
+    return DTypeConversion(value, promote_dtype_.WithLanes(value_ty.lanes()));
   }
 
   /*!
@@ -456,9 +486,10 @@ class ComputeLegalizer : public StmtExprMutator {
    * \param value The input value
    * \return The converted value.
    */
-  PrimExpr CastTargetToDType(PrimExpr value, DataType dtype) {
-    if (!value.dtype().is_float()) return value;
-    TVM_FFI_ICHECK_EQ(value.dtype(), this->promote_dtype_.with_lanes(value.dtype().lanes()));
+  PrimExpr CastTargetToDType(PrimExpr value, PrimType dtype) {
+    PrimType value_ty = value.ty();
+    if (value_ty.code() != DLDataTypeCode::kDLFloat) return value;
+    TVM_FFI_ICHECK_EQ(value_ty, this->promote_dtype_.WithLanes(value_ty.lanes()));
     return DTypeConversion(value, dtype);
   }
 
@@ -471,29 +502,33 @@ class ComputeLegalizer : public StmtExprMutator {
   }
 
  protected:
-  DataType promote_dtype_;
+  PrimType promote_dtype_;
   std::unordered_map<Buffer, Buffer, ffi::ObjectPtrHash, ffi::ObjectPtrEqual> buffer_remap_;
   std::unordered_map<Var, Var> var_remap_;
 };
 
 class BF16ComputeLegalizer : public ComputeLegalizer {
  public:
-  BF16ComputeLegalizer() : ComputeLegalizer(DataType::Float(32)) {}
+  BF16ComputeLegalizer() : ComputeLegalizer(PrimType::Float(32)) {}
   PrimFunc Legalize(PrimFunc func) {
     BF16ComputeLegalizePlanner planner(&buffer_remap_, &var_remap_, promote_dtype_);
     return LegalizeWithPlanner(func, &planner);
   }
-  bool MatchDType(DataType dtype) const { return dtype.is_bfloat16(); }
+  bool MatchType(const Type& type) const override {
+    return MatchPrimType(type, [](const PrimType& prim_type) { return IsBFloat16Type(prim_type); });
+  }
 };
 
 class FP8ComputeLegalizer : public ComputeLegalizer {
  public:
-  explicit FP8ComputeLegalizer(DataType promote_dtype) : ComputeLegalizer(promote_dtype) {}
+  explicit FP8ComputeLegalizer(PrimType promote_dtype) : ComputeLegalizer(promote_dtype) {}
   PrimFunc Legalize(PrimFunc func) {
     FP8ComputeLegalizePlanner planner(&buffer_remap_, &var_remap_, promote_dtype_);
     return LegalizeWithPlanner(func, &planner);
   }
-  bool MatchDType(DataType dtype) const { return dtype.is_float8(); }
+  bool MatchType(const Type& type) const override {
+    return MatchPrimType(type, [](const PrimType& prim_type) { return IsFloat8Type(prim_type); });
+  }
 };
 
 /*!
@@ -529,13 +564,13 @@ class StorageLegalizer : public StmtExprMutator {
     // in a rare case the buffer didn't get remapped
     // because the original var is not bfloat*
     // force remap here
-    if (MatchDType(buf->dtype)) {
-      DataType new_dtype = GetStorageUIntDType(buf->dtype);
+    if (MatchType(buf->dtype)) {
+      PrimType new_dtype = GetStorageUIntDType(buf->dtype);
       ffi::String storage_scope = "global";
       if (auto* ptr_type = buf->data->type_annotation.as<PointerTypeNode>()) {
         storage_scope = ptr_type->storage_scope;
       }
-      Var new_data = Var(buf->data->name_hint, PointerType(PrimType(new_dtype), storage_scope));
+      Var new_data = Var(buf->data->name_hint, PointerType(new_dtype, storage_scope));
       var_remap_[buf->data] = new_data;
       buf = Buffer(new_data, new_dtype, buf->shape, buf->strides, buf->elem_offset, buf->name,
                    buf->data_alignment, buf->offset_factor, buf->buffer_type, buf->axis_separators,
@@ -556,7 +591,7 @@ class StorageLegalizer : public StmtExprMutator {
     // in a rare case the buffer didn't get remapped
     // because the original var is not bfloat*
     // force remap here
-    if (MatchDType(buf->dtype)) {
+    if (MatchType(buf->dtype)) {
       buf = Buffer(buf->data, GetStorageUIntDType(buf->dtype), buf->shape, buf->strides,
                    buf->elem_offset, buf->name, buf->data_alignment, buf->offset_factor,
                    buf->buffer_type, buf->axis_separators, buf->span, buf->layout,
@@ -600,8 +635,8 @@ class StorageLegalizer : public StmtExprMutator {
     if (new_buf.same_as(op->buffer) && indices.same_as(op->indices) && value.same_as(op->value)) {
       return ffi::GetRef<Stmt>(op);
     } else {
-      if (MatchDType(op->value.dtype())) {
-        TVM_FFI_ICHECK(new_buf->dtype.is_uint());
+      if (MatchType(op->value.ty())) {
+        TVM_FFI_ICHECK(new_buf->dtype.MatchesCode(DLDataTypeCode::kDLUInt));
       }
       TVM_FFI_ICHECK(!op->predicate.defined())
           << "Predicated buffer store is not currently supported in "
@@ -647,20 +682,21 @@ class StorageLegalizer : public StmtExprMutator {
     if (op->op.same_as(builtin::reinterpret())) {
       PrimExpr value = VisitExpr(op->args[0]);
       // sometimes the input dtype can change and we can skip.
-      if (value.dtype() == op->dtype) return value;
-      if (MatchDType(op->dtype)) {
-        return reinterpret(GetStorageUIntDType(op->dtype), value);
+      PrimType op_dtype = op->ty();
+      if (value.ty() == op_dtype) return value;
+      if (MatchType(op_dtype)) {
+        return reinterpret(GetStorageUIntDType(op_dtype), value);
       }
       if (op->args[0].same_as(value)) {
         return ffi::GetRef<PrimExpr>(op);
       } else {
-        return reinterpret(op->dtype, value);
+        return reinterpret(op_dtype, value);
       }
     }
     return StmtExprMutator::VisitExpr_(op);
   }
 
-  virtual bool MatchDType(DataType dtype) const = 0;
+  virtual bool MatchType(const Type& type) const = 0;
 
  private:
   /*!
@@ -669,10 +705,11 @@ class StorageLegalizer : public StmtExprMutator {
    * \return The converted value.
    */
   PrimExpr ChangeToUInt(PrimExpr value) {
-    if (!MatchDType(value->dtype)) return value;
+    PrimType value_dtype = value.ty();
+    if (!MatchType(value_dtype)) return value;
     auto* call = value.as<CallNode>();
     if (call && call->op.same_as(builtin::reinterpret())) {
-      return reinterpret(GetStorageUIntDType(value->dtype), call->args[0]);
+      return reinterpret(GetStorageUIntDType(value_dtype), call->args[0]);
     } else {
       return value;
     }
@@ -680,13 +717,13 @@ class StorageLegalizer : public StmtExprMutator {
 
   Var RemapVarDef(Var var) {
     // remap the var
-    if (var.dtype().is_handle()) {
+    if (var.ty().IsHandle()) {
       if (auto* ptr_type = var->type_annotation.as<PointerTypeNode>()) {
         if (auto* elem_type = ptr_type->element_type.as<PrimTypeNode>()) {
-          if (MatchDType(elem_type->dtype)) {
-            Var new_var =
-                Var(var->name_hint, PointerType(PrimType(GetStorageUIntDType(elem_type->dtype)),
-                                                ptr_type->storage_scope));
+          PrimType elem_prim_type = ffi::GetRef<PrimType>(elem_type);
+          if (MatchType(elem_prim_type)) {
+            Var new_var = Var(var->name_hint, PointerType(GetStorageUIntDType(elem_prim_type),
+                                                          ptr_type->storage_scope));
             var_remap_[var] = new_var;
             return new_var;
           }
@@ -704,12 +741,12 @@ class StorageLegalizer : public StmtExprMutator {
     Buffer new_buf = buf;
     auto var_it = var_remap_.find(buf->data);
     if (var_it != var_remap_.end()) {
-      DataType dtype = MatchDType(buf->dtype) ? GetStorageUIntDType(buf->dtype) : buf->dtype;
+      PrimType dtype = MatchType(buf->dtype) ? GetStorageUIntDType(buf->dtype) : buf->dtype;
       new_buf = Buffer(var_it->second, dtype, buf->shape, buf->strides, buf->elem_offset, buf->name,
                        buf->data_alignment, buf->offset_factor, buf->buffer_type,
                        buf->axis_separators, buf->span, buf->layout, buf->allocated_addr);
     } else {
-      TVM_FFI_ICHECK(!MatchDType(buf->dtype)) << "Cannot find var remap for " << buf;
+      TVM_FFI_ICHECK(!MatchType(buf->dtype)) << "Cannot find var remap for " << buf;
     }
 
     buffer_remap_[buf] = new_buf;
@@ -723,12 +760,16 @@ class StorageLegalizer : public StmtExprMutator {
 
 class BF16StorageLegalizer : public StorageLegalizer {
  public:
-  bool MatchDType(DataType dtype) const { return dtype.is_bfloat16(); }
+  bool MatchType(const Type& type) const override {
+    return MatchPrimType(type, [](const PrimType& prim_type) { return IsBFloat16Type(prim_type); });
+  }
 };
 
 class FP8StorageLegalizer : public StorageLegalizer {
  public:
-  bool MatchDType(DataType dtype) const { return dtype.is_float8(); }
+  bool MatchType(const Type& type) const override {
+    return MatchPrimType(type, [](const PrimType& prim_type) { return IsFloat8Type(prim_type); });
+  }
 };
 
 namespace transform {
@@ -787,7 +828,7 @@ Pass FP8ComputeLegalize(ffi::String promote_dtype) {
         CheckDataTypeSupport(opt_target.value(), "tvm.support.nvcc.supports_fp8")) {
       return f;
     }
-    return FP8ComputeLegalizer(DataType(ffi::StringToDLDataType(promote_dtype))).Legalize(f);
+    return FP8ComputeLegalizer(PrimType(ffi::StringToDLDataType(promote_dtype))).Legalize(f);
   };
   return CreatePrimFuncPass(pass_func, 0, "tirx.FP8ComputeLegalize", {});
 }
diff --git a/src/tirx/transform/vectorize_loop.cc b/src/tirx/transform/vectorize_loop.cc
index 03ee2d3eefde..271a9f20efa9 100644
--- a/src/tirx/transform/vectorize_loop.cc
+++ b/src/tirx/transform/vectorize_loop.cc
@@ -39,13 +39,20 @@
 #include <vector>
 
 #include "../../tirx/analysis/check_contains.h"
-#include "tvm/runtime/data_type.h"
+#include "tvm/ffi/dtype.h"
 #include "tvm/tirx/buffer.h"
 
 namespace tvm {
 namespace tirx {
 
 namespace {
+// File-local helper: the vscale factor of a scalable vector type, otherwise
+// its lane count.
+int GetLanesOrVScaleFactor(const PrimType& ty) {
+  const bool is_scalable = ty.IsScalableVector();
+  return is_scalable ? ty.VScaleFactor() : ty.lanes();
+}
+
 // File-local helper: true if `expr` is a call to tirx::builtin::vscale().
 bool IsVScaleCall(const PrimExpr& expr) {
   if (const auto* call = expr.as<CallNode>()) {
@@ -56,9 +63,8 @@ bool IsVScaleCall(const PrimExpr& expr) {
 
 bool TargetHasRVV(Target target) {
   if (!target.defined()) return false;
-  static auto target_has_feature_fn =
-      tvm::ffi::Function::GetGlobalRequired("target.target_has_feature");
-  return target_has_feature_fn("v", target).cast<bool>();
+  static auto has_feature_fn = tvm::ffi::Function::GetGlobal("target.target_has_feature");
+  return has_feature_fn.has_value() ? (*has_feature_fn)("v", target).cast<bool>() : false;
 }
 
 // File-local helper: true if the target supports Variable-Length Array extensions
@@ -66,6 +72,16 @@ bool TargetHasRVV(Target target) {
 bool TargetHasVLA(Target target) {
   if (!target.defined()) return false;
   bool has_vla = target->GetAttr<bool>("feature.has_sve").value_or(false);
+  if (!has_vla) {
+    // "feature.has_sve" is absent or false: fall back to scanning the target's
+    // "mattr" list for an explicit "+sve" entry.
+    if (auto mattr = target->GetAttr<ffi::Array<ffi::String>>("mattr")) {
+      for (const ffi::String& attr : mattr.value()) {
+        has_vla = (attr == "+sve");
+        if (has_vla) break;
+      }
+    }
+  }
   has_vla |= TargetHasRVV(target);
   return has_vla;
 }
@@ -78,7 +94,7 @@ bool ContainsCallNode(const Stmt& stmt) {
 
 inline PrimExpr CreateNewLanes(bool is_scalable, int lanes_or_vscale_factor) {
   if (is_scalable) {
-    return Mul(Call(DataType::Int(32), builtin::vscale(), {}), lanes_or_vscale_factor);
+    return Mul(Call(PrimType::Int(32), builtin::vscale(), {}), lanes_or_vscale_factor);
   } else {
     return lanes_or_vscale_factor;
   }
@@ -86,23 +102,21 @@ inline PrimExpr CreateNewLanes(bool is_scalable, int lanes_or_vscale_factor) {
 
 inline PrimExpr BroadcastTo(PrimExpr e, int lanes, bool is_scalable) {
   // Check if e is already in the expected form
-  if (e.dtype().get_lanes_or_vscale_factor() == lanes &&
-      e.dtype().is_scalable_vector() == is_scalable)
-    return e;
+  if (GetLanesOrVScaleFactor(e.ty()) == lanes && e.ty().IsScalableVector() == is_scalable) return e;
 
   if (const BroadcastNode* op = e.as<BroadcastNode>()) {
-    TVM_FFI_ICHECK(op->dtype.is_scalable_vector() == is_scalable)
+    TVM_FFI_ICHECK(op->ty().IsScalableVector() == is_scalable)
         << "Can't broadcast between scalable and fixed length vectors.";
-    int e_lanes = op->dtype.get_lanes_or_vscale_factor();
+    int e_lanes = GetLanesOrVScaleFactor(op->ty());
 
     if (lanes % e_lanes == 0) {
       return Broadcast(op->value, CreateNewLanes(is_scalable, lanes));
     }
   }
 
-  TVM_FFI_ICHECK(e.dtype().is_scalar())
-      << "Cannot broadcast lanes=" << e.dtype().get_lanes_or_vscale_factor()
-      << " is_scalable=" << e.dtype().is_scalable_vector() << " to " << lanes;
+  TVM_FFI_ICHECK(e.ty().IsScalar())
+      << "Cannot broadcast lanes=" << GetLanesOrVScaleFactor(e.ty())
+      << " is_scalable=" << e.ty().IsScalableVector() << " to " << lanes;
 
   return Broadcast(e, CreateNewLanes(is_scalable, lanes));
 }
@@ -219,9 +233,10 @@ class TryPredicateBufferAccesses : public StmtExprMutator {
       }
     }
 
-    DataType buf_predicate_dtype =
-        DataType(DataType::kUInt, 1, ramp->dtype.get_lanes_or_vscale_factor(),
-                 ramp->dtype.is_scalable_vector());
+    PrimType buf_predicate_dtype =
+        ramp->ty().IsScalableVector() ? PrimType::ScalableVector(DLDataTypeCode::kDLUInt, 1,
+                                                                 GetLanesOrVScaleFactor(ramp->ty()))
+                                      : PrimType::UInt(1, GetLanesOrVScaleFactor(ramp->ty()));
     Call lane_mask = Call(buf_predicate_dtype, builtin::get_active_lane_mask(), {base_, limit_});
 
     num_accesses_rewritten_ += 1;
@@ -354,7 +369,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
 
   Vectorizer(Var var, PrimExpr var_lanes, Target target)
       : var_(var), var_lanes_(var_lanes), target_(target) {
-    ramp_ = Ramp(IntImm(var->dtype, 0), IntImm(var->dtype, 1), var_lanes);
+    ramp_ = Ramp(IntImm(var.ty(), 0), IntImm(var.ty(), 1), var_lanes);
   }
 
   Stmt VisitStmt(const Stmt& stmt) final {
@@ -384,28 +399,28 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (a.same_as(op->a) && b.same_as(op->b)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      bool is_vec_a = a.dtype().is_scalable_or_fixed_length_vector();
-      bool is_vec_b = b.dtype().is_scalable_or_fixed_length_vector();
+      bool is_vec_a = a.ty().IsScalableVector() || a.ty().IsFixedLengthVector();
+      bool is_vec_b = b.ty().IsScalableVector() || b.ty().IsFixedLengthVector();
       if (is_vec_a && is_vec_b) {
         // Let's not multiply scalable and fixed length vectors
-        TVM_FFI_ICHECK(a.dtype().is_scalable_vector() == b.dtype().is_scalable_vector())
+        TVM_FFI_ICHECK(a.ty().IsScalableVector() == b.ty().IsScalableVector())
             << "Fixed length and scalable vectors can't be mixed in multiplication.";
       }
       if (is_vec_a || is_vec_b) {
         const RampNode* b_ramp = b.as<RampNode>();
         const RampNode* a_ramp = a.as<RampNode>();
-        if (a_ramp && b.dtype().is_scalar() && analyzer_->CanProve(b > 0)) {
+        if (a_ramp && b.ty().IsScalar() && analyzer_->CanProve(b > 0)) {
           PrimExpr lanes = a_ramp->lanes;
           return Ramp(a_ramp->base * b, a_ramp->stride * b, lanes);
         }
-        if (b_ramp && a.dtype().is_scalar() && analyzer_->CanProve(a > 0)) {
+        if (b_ramp && a.ty().IsScalar() && analyzer_->CanProve(a > 0)) {
           PrimExpr lanes = b_ramp->lanes;
           return Ramp(b_ramp->base * a, b_ramp->stride * a, lanes);
         }
-        int a_lanes = a.dtype().get_lanes_or_vscale_factor();
-        int b_lanes = b.dtype().get_lanes_or_vscale_factor();
+        int a_lanes = GetLanesOrVScaleFactor(a.ty());
+        int b_lanes = GetLanesOrVScaleFactor(b.ty());
         int max_lanes = std::max(a_lanes, b_lanes);
-        bool is_scalable = a.dtype().is_scalable_vector() || b.dtype().is_scalable_vector();
+        bool is_scalable = a.ty().IsScalableVector() || b.ty().IsScalableVector();
         return Mul(BroadcastTo(a, max_lanes, is_scalable), BroadcastTo(b, max_lanes, is_scalable));
       }
     }
@@ -438,22 +453,22 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   PrimExpr VisitExpr_(const RampNode* op) final {
     PrimExpr base = this->VisitExpr(op->base);
     PrimExpr stride = this->VisitExpr(op->stride);
-    TVM_FFI_ICHECK(!base.dtype().is_scalable_vector())
+    TVM_FFI_ICHECK(!base.ty().IsScalableVector())
         << "Creating scalable vectors from existing vectors is not supported.";
-    TVM_FFI_ICHECK(!stride.dtype().is_scalable_vector())
+    TVM_FFI_ICHECK(!stride.ty().IsScalableVector())
         << "Ramp stride with scalable dtype is not supported";
-    if (base.dtype().is_fixed_length_vector() && stride.dtype().is_scalar()) {
+    if (base.ty().IsFixedLengthVector() && stride.ty().IsScalar()) {
       TVM_FFI_ICHECK(op->lanes->IsInstance<IntImmNode>())
           << "Vectorizing over existing scalable vectors is not supported.";
       const RampNode* base_ramp = base.as<RampNode>();
       int op_lanes = static_cast<int>(op->lanes.as_or_throw<IntImm>()->value);
       int base_ramp_lanes = static_cast<int>(base_ramp->lanes.as_or_throw<IntImm>()->value);
       if (analyzer_->CanProve(base_ramp->stride ==
-                              stride * MakeConst(stride.dtype(), base_ramp_lanes))) {
+                              stride * MakeConst(stride.ty(), base_ramp_lanes))) {
         return Ramp(base_ramp->base, stride, op_lanes * base_ramp_lanes);
       }
     }
-    int lanes = std::max(base.dtype().lanes(), stride.dtype().lanes());
+    int lanes = std::max(base.ty().lanes(), stride.ty().lanes());
     base = BroadcastTo(base, lanes, false);
     stride = BroadcastTo(stride, lanes, false);
     ffi::Array<PrimExpr> elems;
@@ -466,7 +481,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
 
   PrimExpr VisitExpr_(const BroadcastNode* op) final {
     PrimExpr value = this->VisitExpr(op->value);
-    if (value.dtype().is_scalable_or_fixed_length_vector()) {
+    if (value.ty().IsScalableVector() || value.ty().IsFixedLengthVector()) {
       need_scalarize_ = true;
       return ffi::GetRef<PrimExpr>(op);
     }
@@ -484,12 +499,12 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (cond.same_as(op->condition) && t.same_as(op->true_value) && f.same_as(op->false_value)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      int cond_lanes = cond.dtype().get_lanes_or_vscale_factor();
-      int t_lanes = t.dtype().get_lanes_or_vscale_factor();
-      int f_lanes = f.dtype().get_lanes_or_vscale_factor();
+      int cond_lanes = GetLanesOrVScaleFactor(cond.ty());
+      int t_lanes = GetLanesOrVScaleFactor(t.ty());
+      int f_lanes = GetLanesOrVScaleFactor(f.ty());
       int lanes = std::max(std::max(cond_lanes, t_lanes), f_lanes);
-      bool is_scalable = cond.dtype().is_scalable_vector() || t.dtype().is_scalable_vector() ||
-                         f.dtype().is_scalable_vector();
+      bool is_scalable =
+          cond.ty().IsScalableVector() || t.ty().IsScalableVector() || f.ty().IsScalableVector();
       return Select(BroadcastTo(cond, lanes, is_scalable), BroadcastTo(t, lanes, is_scalable),
                     BroadcastTo(f, lanes, is_scalable));
     }
@@ -500,10 +515,12 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (value.same_as(op->value)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      if (value.dtype().is_scalable_vector()) {
-        return Cast(op->dtype.with_scalable_vscale_factor(value.dtype().vscale_factor()), value);
+      if (value.ty().IsScalableVector()) {
+        return Cast(
+            PrimType::ScalableVector(op->ty().code(), op->ty().bits(), value.ty().VScaleFactor()),
+            value);
       } else {
-        return Cast(op->dtype.with_lanes(value.dtype().lanes()), value);
+        return Cast(op->ty().WithLanes(value.ty().lanes()), value);
       }
     }
   }
@@ -531,7 +548,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   // IfThenElse expr
   PrimExpr MutateIfThenElseExpr_(const CallNode* op) {
     PrimExpr cond = this->VisitExpr(op->args[0]);
-    if (cond.dtype().is_scalable_or_fixed_length_vector()) {
+    if (cond.ty().IsScalableVector() || cond.ty().IsFixedLengthVector()) {
       need_scalarize_ = true;
       return ffi::GetRef<PrimExpr>(op);
     }
@@ -540,17 +557,17 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (cond.same_as(op->args[0]) && t.same_as(op->args[1]) && f.same_as(op->args[2])) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      int t_lanes = t.dtype().get_lanes_or_vscale_factor();
-      int f_lanes = f.dtype().get_lanes_or_vscale_factor();
+      int t_lanes = GetLanesOrVScaleFactor(t.ty());
+      int f_lanes = GetLanesOrVScaleFactor(f.ty());
       int lanes = std::max(t_lanes, f_lanes);
-      bool is_scalable = t.dtype().is_scalable_vector() || f.dtype().is_scalable_vector();
+      bool is_scalable = t.ty().IsScalableVector() || f.ty().IsScalableVector();
       t = BroadcastTo(t, lanes, is_scalable);
       f = BroadcastTo(f, lanes, is_scalable);
       if (is_scalable) {
-        return Call(op->dtype.with_scalable_vscale_factor(lanes), op->op, {cond, t, f}, op->attrs,
-                    op->span);
+        return Call(PrimType::ScalableVector(op->ty().code(), op->ty().bits(), lanes), op->op,
+                    {cond, t, f}, op->attrs, op->span);
       } else {
-        return Call(op->dtype.with_lanes(lanes), op->op, {cond, t, f}, op->attrs, op->span);
+        return Call(op->ty().WithLanes(lanes), op->op, {cond, t, f}, op->attrs, op->span);
       }
     }
   }
@@ -561,16 +578,16 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (value.same_as(op->args[0])) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      int lanes = value.dtype().get_lanes_or_vscale_factor();
-      if (value.dtype().is_scalable_vector()) {
-        return Call(op->dtype.with_scalable_vscale_factor(lanes), op->op, {value}, op->attrs,
-                    op->span);
+      int lanes = GetLanesOrVScaleFactor(value.ty());
+      if (value.ty().IsScalableVector()) {
+        return Call(PrimType::ScalableVector(op->ty().code(), op->ty().bits(), lanes), op->op,
+                    {value}, op->attrs, op->span);
       } else {
-        int new_lanes = (op->dtype != DataType::Float4E2M1FN() &&
-                         op->args[0].dtype() != DataType::Float4E2M1FN())
-                            ? (value.dtype().bits() * value.dtype().lanes()) / op->dtype.bits()
-                            : value.dtype().lanes();
-        return Call(op->dtype.with_lanes(new_lanes), op->op, {value}, op->attrs, op->span);
+        int new_lanes = (op->ty().code() != DLDataTypeCode::kDLFloat4_e2m1fn &&
+                         op->args[0].ty().code() != DLDataTypeCode::kDLFloat4_e2m1fn)
+                            ? (value.ty().bits() * value.ty().lanes()) / op->ty().bits()
+                            : value.ty().lanes();
+        return Call(op->ty().WithLanes(new_lanes), op->op, {value}, op->attrs, op->span);
       }
     }
   }
@@ -581,46 +598,46 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     } else if (op->op.same_as(builtin::texture2d_load())) {
       int lane = 0;
       ffi::Array<PrimExpr> fcd = MutateArray({op->args.back()}, &lane);
-      auto dtype = op->args[0]
-                       .as<VarNode>()
-                       ->type_annotation.as<PointerTypeNode>()
-                       ->element_type.as<PrimTypeNode>()
-                       ->dtype;
-      TVM_FFI_ICHECK(lane * dtype.bits() <= op->args[4].as<IntImmNode>()->value)
+      DLDataType dtype = op->args[0]
+                             .as<VarNode>()
+                             ->type_annotation.as<PointerTypeNode>()
+                             ->element_type.as<PrimTypeNode>()
+                             ->dtype;
+      TVM_FFI_ICHECK(lane * dtype.bits <= op->args[4].as<IntImmNode>()->value)
           << "Expected Data to be Read is lesser than or equal to Texture Load length";
 
       auto new_args = op->args;
       new_args.pop_back();
       new_args.push_back(fcd[0]);
-      return Call(op->dtype.with_lanes(lane), op->op, new_args, op->attrs, op->span);
+      return Call(op->ty().WithLanes(lane), op->op, new_args, op->attrs, op->span);
     } else if (op->op.same_as(builtin::texture2d_store())) {
       int lane = 0;
       // Vectorize the value to store
       ffi::Array<PrimExpr> value{op->args.back()};
       ffi::Array<PrimExpr> mutated_value = MutateArray(value, &lane);
-      auto dtype = op->args[0]
-                       .as<VarNode>()
-                       ->type_annotation.as<PointerTypeNode>()
-                       ->element_type.as<PrimTypeNode>()
-                       ->dtype;
-      TVM_FFI_ICHECK(lane * dtype.bits() == op->args[4].as<IntImmNode>()->value)
+      DLDataType dtype = op->args[0]
+                             .as<VarNode>()
+                             ->type_annotation.as<PointerTypeNode>()
+                             ->element_type.as<PrimTypeNode>()
+                             ->dtype;
+      TVM_FFI_ICHECK(lane * dtype.bits == op->args[4].as<IntImmNode>()->value)
           << "Expected Data to be Written equal to Texture Store length";
       ffi::Array<PrimExpr> new_args{op->args[0], op->args[1], op->args[2],
                                     op->args[3], op->args[4], mutated_value[0]};
-      return Call(op->dtype.with_lanes(lane), op->op, new_args, op->attrs, op->span);
+      return Call(op->ty().WithLanes(lane), op->op, new_args, op->attrs, op->span);
     } else if (op->op.same_as(builtin::reinterpret())) {
       return MutateReinterpretExpr_(op);
     }
     auto optional_op = op->op.as<Op>();
     bool vectorizable = optional_op && op_vectorizable_.get(optional_op.value(), false) &&
-                        !op->dtype.is_scalable_vector();
+                        !op->ty().IsScalableVector();
 
     if (!vectorizable) {
       // Cannot vectorize this op
       ffi::Array<PrimExpr> new_args;
       for (auto arg : op->args) {
         auto new_arg = this->VisitExpr(arg);
-        if (new_arg.dtype().is_scalable_or_fixed_length_vector()) {
+        if (new_arg.ty().IsScalableVector() || new_arg.ty().IsFixedLengthVector()) {
           need_scalarize_ = true;
           return ffi::GetRef<PrimExpr>(op);
         }
@@ -629,7 +646,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       if (op->args.same_as(new_args)) {
         return ffi::GetRef<PrimExpr>(op);
       } else {
-        return Call(op->dtype, op->op, new_args, op->attrs, op->span);
+        return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op, new_args, op->attrs, op->span);
       }
     } else {
       int lane = 0;
@@ -655,7 +672,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       if (op->args.same_as(new_args)) {
         return ffi::GetRef<PrimExpr>(op);
       } else {
-        return Call(op->dtype.with_lanes(lane), op->op, new_args, op->attrs, op->span);
+        return Call(op->ty().WithLanes(lane), op->op, new_args, op->attrs, op->span);
       }
     }
   }
@@ -688,9 +705,8 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       TVM_FFI_ICHECK(deep_equal_(it->second, value))
           << "Let cannot bind the same var to two different values";
     }
-    if (value.dtype().get_lanes_or_vscale_factor() !=
-        op->value.dtype().get_lanes_or_vscale_factor()) {
-      Var new_var(op->var->name_hint, value.dtype());
+    if (GetLanesOrVScaleFactor(value.ty()) != GetLanesOrVScaleFactor(op->value.ty())) {
+      Var new_var(op->var->name_hint, value.ty());
       let_binding_[op->var] = new_var;
       return Let(new_var, value, this->VisitExpr(op->body));
     } else {
@@ -715,7 +731,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       return ffi::GetRef<PrimExpr>(op);
     }
 
-    int new_vec_length = var_lanes_.as_or_throw<IntImm>()->value / op->vectors[0].dtype().lanes();
+    int new_vec_length = var_lanes_.as_or_throw<IntImm>()->value / op->vectors[0].ty().lanes();
     PrimExpr updated_index = indices[0];
     // Check that the indices satisfy the specific patterns.
     auto f_check_index = [this, op](const PrimExpr& index) {
@@ -741,7 +757,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
                 ramp->lanes.as_or_throw<IntImm>()->value ==
                     var_lanes_.as_or_throw<IntImm>()->value &&
                 broadcast->value->IsInstance<IntImmNode>() &&
-                broadcast->value.as_or_throw<IntImm>()->value == op->vectors[0]->dtype.lanes() &&
+                broadcast->value.as_or_throw<IntImm>()->value == op->vectors[0].ty().lanes() &&
                 broadcast->lanes->IsInstance<IntImmNode>() &&
                 broadcast->lanes.as_or_throw<IntImm>()->value ==
                     var_lanes_.as_or_throw<IntImm>()->value) {
@@ -756,12 +772,12 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     TVM_FFI_ICHECK(f_check_index(updated_index));
 
     if (new_vec_length == 1) {
-      return tirx::Substitute(op->vectors[0], {{var_, tvm::IntImm(var_->dtype, 0)}});
+      return tirx::Substitute(op->vectors[0], {{var_, tvm::IntImm(var_.ty(), 0)}});
     } else {
       PrimExpr prev_ramp = ramp_;
       PrimExpr prev_var_lanes = var_lanes_;
-      ramp_ = Ramp(IntImm(var_->dtype, 0), IntImm(var_->dtype, 2), new_vec_length);
-      var_lanes_ = tvm::IntImm(var_lanes_.dtype(), new_vec_length);
+      ramp_ = Ramp(IntImm(var_.ty(), 0), IntImm(var_.ty(), 2), new_vec_length);
+      var_lanes_ = tvm::IntImm(var_lanes_.ty(), new_vec_length);
       lane_vectors = 0;
       vectors = MutateArray(op->vectors, &lane_vectors);
       ramp_ = prev_ramp;
@@ -779,28 +795,28 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     PrimExpr value = this->VisitExpr(op->value);
 
     if (!indices.same_as(op->indices) || !value.same_as(op->value)) {
-      TVM_FFI_ICHECK(!op->buffer->dtype.is_scalable_vector())
+      TVM_FFI_ICHECK(!op->buffer->dtype.IsScalableVector())
           << "Vectorizing over scalable buffer elements is not supported in vectorizer.";
       // How many lanes of indexing are present in the index and
       // buffer element type, excluding the last index.
       int other_index_lanes = op->buffer->dtype.lanes();
       for (size_t i = 0; i < indices.size() - 1; i++) {
-        other_index_lanes *= indices[i].dtype().lanes();
+        other_index_lanes *= indices[i].ty().lanes();
         // Only allow the last index to be scalable
-        TVM_FFI_ICHECK(!indices[i].dtype().is_scalable_vector())
+        TVM_FFI_ICHECK(!indices[i].ty().IsScalableVector())
             << "Only the last index can be scalable.";
       }
 
       // The total number of lanes of indexing, including the last index.
-      auto last_index_dtype = indices[indices.size() - 1].dtype();
-      int lanes_in_last_index = last_index_dtype.get_lanes_or_vscale_factor();
+      PrimType last_index_dtype = indices[indices.size() - 1].ty();
+      int lanes_in_last_index = GetLanesOrVScaleFactor(last_index_dtype);
       int index_lanes = other_index_lanes * lanes_in_last_index;
 
       // The total number of lanes in this store operation.  Either
       // the index or the value will be broadcast out to this number
       // of lanes, depending on which has more lanes.
-      int value_dtype_lanes = value.dtype().get_lanes_or_vscale_factor();
-      bool is_last_index_scalable = last_index_dtype.is_scalable_vector();
+      int value_dtype_lanes = GetLanesOrVScaleFactor(value.ty());
+      bool is_last_index_scalable = last_index_dtype.IsScalableVector();
       int total_lanes = std::max(index_lanes, value_dtype_lanes);
 
       TVM_FFI_ICHECK_EQ(total_lanes % other_index_lanes, 0)
@@ -826,9 +842,9 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       LOG(WARNING) << "Detect vectorize inside vectorized loop, ignoring...";
     }
     TVM_FFI_ICHECK(is_zero(op->min));
-    TVM_FFI_ICHECK(!op->extent.dtype().is_scalable_or_fixed_length_vector());
+    TVM_FFI_ICHECK(!op->extent.ty().IsScalableVector() && !op->extent.ty().IsFixedLengthVector());
     PrimExpr extent = this->VisitExpr(op->extent);
-    if (extent.dtype().is_scalable_or_fixed_length_vector()) {
+    if (extent.ty().IsScalableVector() || extent.ty().IsFixedLengthVector()) {
       return Scalarize(ffi::GetRef<Stmt>(op));
     }
     Stmt body = this->VisitStmt(op->body);
@@ -843,7 +859,8 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   }
   // IfThenElse
   Stmt VisitStmt_(const IfThenElseNode* op) final {
-    TVM_FFI_ICHECK(!op->condition.dtype().is_scalable_or_fixed_length_vector());
+    TVM_FFI_ICHECK(!op->condition.ty().IsScalableVector() &&
+                   !op->condition.ty().IsFixedLengthVector());
     PrimExpr condition = this->VisitExpr(op->condition);
     // need scalarize can be marked as true during visit of condition
     bool cond_need_scalarize = false;
@@ -857,7 +874,8 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     }
     // Check if we can rewrite the condition with predicated buffers
     if (EnableBufferLevelPredication(target_) &&
-        condition.dtype().is_scalable_or_fixed_length_vector() && !else_case.defined()) {
+        (condition.ty().IsScalableVector() || condition.ty().IsFixedLengthVector()) &&
+        !else_case.defined()) {
       std::pair<bool, Stmt> success_stmt_pair =
           TryPredicateBufferAccesses(TargetHasRVV(target_)).Run(then_case, condition);
       bool can_remove_if_then_else = success_stmt_pair.first;
@@ -866,7 +884,8 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       }
     }
 
-    if (cond_need_scalarize || condition.dtype().is_scalable_or_fixed_length_vector()) {
+    if (cond_need_scalarize || condition.ty().IsScalableVector() ||
+        condition.ty().IsFixedLengthVector()) {
       return Scalarize(ffi::GetRef<Stmt>(op));
     }
     if (condition.same_as(op->condition) && then_case.same_as(op->then_case) &&
@@ -893,9 +912,8 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     TVM_FFI_ICHECK(!let_binding_.count(op->var)) << "SSA violation, a single var is binded twice";
     let_binding_[op->var] = value;
 
-    if (value.dtype().get_lanes_or_vscale_factor() !=
-        op->value.dtype().get_lanes_or_vscale_factor()) {
-      Var new_var(op->var->name_hint, value.dtype());
+    if (GetLanesOrVScaleFactor(value.ty()) != GetLanesOrVScaleFactor(op->value.ty())) {
+      Var new_var(op->var->name_hint, value.ty());
       let_binding_[op->var] = new_var;
       return Bind(new_var, value);
     } else {
@@ -912,9 +930,9 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
 
   // scalarize the statment
   Stmt Scalarize(Stmt stmt) {
-    Var idx(var_->name_hint + ".s", var_->dtype);
+    Var idx(var_->name_hint + ".s", var_.ty());
     stmt = Substitute(stmt, {{var_, idx}});
-    return For(idx, IntImm(var_->dtype, 0), var_lanes_, ForKind::kSerial, stmt);
+    return For(idx, IntImm(var_.ty(), 0), var_lanes_, ForKind::kSerial, stmt);
   }
 
  private:
@@ -949,11 +967,11 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       PrimExpr new_elem = this->VisitExpr(old_elem);
       if (!new_elem.same_as(old_elem)) changed = true;
       new_arr[i] = new_elem;
-      lanes = std::max(lanes, new_elem.dtype().lanes());
+      lanes = std::max(lanes, new_elem.ty().lanes());
     }
 
     for (size_t i = 0; i < arr.size(); ++i) {
-      if (new_arr[i].dtype().lanes() != lanes) {
+      if (new_arr[i].ty().lanes() != lanes) {
         new_arr[i] = BroadcastTo(new_arr[i], lanes, false);
         changed = true;
       }
@@ -969,10 +987,10 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (a.same_as(op->a) && b.same_as(op->b)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      int a_lanes = a.dtype().get_lanes_or_vscale_factor();
-      int b_lanes = b.dtype().get_lanes_or_vscale_factor();
+      int a_lanes = GetLanesOrVScaleFactor(a.ty());
+      int b_lanes = GetLanesOrVScaleFactor(b.ty());
       int lanes = std::max(a_lanes, b_lanes);
-      bool is_scalable = a.dtype().is_scalable_vector() || b.dtype().is_scalable_vector();
+      bool is_scalable = a.ty().IsScalableVector() || b.ty().IsScalableVector();
       return TOp(BroadcastTo(a, lanes, is_scalable), BroadcastTo(b, lanes, is_scalable));
     }
   }
@@ -983,21 +1001,21 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (a.same_as(op->a) && b.same_as(op->b)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      int a_lanes = a.dtype().get_lanes_or_vscale_factor();
-      int b_lanes = b.dtype().get_lanes_or_vscale_factor();
+      int a_lanes = GetLanesOrVScaleFactor(a.ty());
+      int b_lanes = GetLanesOrVScaleFactor(b.ty());
       int lanes = std::max(a_lanes, b_lanes);
       if (lanes != 1) {
         const RampNode* b_ramp = b.as<RampNode>();
         const RampNode* a_ramp = a.as<RampNode>();
-        if (a.dtype().is_scalar() && b_ramp) {
+        if (a.ty().IsScalar() && b_ramp) {
           return Ramp(fcompute(a, b_ramp->base),
-                      fcompute(IntImm(b_ramp->stride.dtype(), 0), b_ramp->stride), b_ramp->lanes);
+                      fcompute(IntImm(b_ramp->stride.ty(), 0), b_ramp->stride), b_ramp->lanes);
         }
-        if (b.dtype().is_scalar() && a_ramp) {
+        if (b.ty().IsScalar() && a_ramp) {
           return Ramp(fcompute(a_ramp->base, b), a_ramp->stride, a_ramp->lanes);
         }
       }
-      bool is_scalable = a.dtype().is_scalable_vector() || b.dtype().is_scalable_vector();
+      bool is_scalable = a.ty().IsScalableVector() || b.ty().IsScalableVector();
       return fcompute(BroadcastTo(a, lanes, is_scalable), BroadcastTo(b, lanes, is_scalable));
     }
   }
@@ -1050,13 +1068,13 @@ class LoopVectorizer : public StmtMutator {
     // Match the existing TIRx scalable-vector convention.  LLVM/RVV still
     // selects the runtime vector length with vsetvli.
     static constexpr int kDefaultVScaleFactor = 4;
-    DataType index_dtype = op->loop_var->dtype;
+    PrimType index_dtype = op->loop_var.ty();
     PrimExpr zero = IntImm(index_dtype, 0);
     PrimExpr fixed_extent = IntImm(index_dtype, extent);
     PrimExpr scalable_lanes = CreateNewLanes(/*is_scalable=*/true, kDefaultVScaleFactor);
-    DataType lane_dtype = scalable_lanes.dtype();
+    PrimType lane_dtype = scalable_lanes.ty();
     PrimExpr scalable_lanes_index = scalable_lanes;
-    if (scalable_lanes_index.dtype() != index_dtype) {
+    if (scalable_lanes_index.ty() != index_dtype) {
       scalable_lanes_index = Cast(index_dtype, scalable_lanes_index);
     }
     PrimExpr num_chunks = ceildiv(fixed_extent, scalable_lanes_index);
@@ -1064,7 +1082,7 @@ class LoopVectorizer : public StmtMutator {
     Var outer(op->loop_var->name_hint + ".vla.o", index_dtype);
     Var inner(op->loop_var->name_hint + ".vla.i", lane_dtype);
     PrimExpr inner_index = inner;
-    if (inner_index.dtype() != index_dtype) {
+    if (inner_index.ty() != index_dtype) {
       inner_index = Cast(index_dtype, inner_index);
     }
     PrimExpr index = outer * scalable_lanes_index + inner_index;
diff --git a/src/topi/einsum.cc b/src/topi/einsum.cc
index 5d3a7936967b..b9610d5fcedd 100644
--- a/src/topi/einsum.cc
+++ b/src/topi/einsum.cc
@@ -127,7 +127,7 @@ PrimExpr GetIndexForBroadcastedDim(const Var& index, const PrimExpr& extent,
   // Check if current dimension is being broadcasted to `broadcasted_extent` (symbolic shape is
   // handled)
   if (is_one(extent) && !is_one(broadcasted_extent)) {
-    return IntImm(index.dtype(), 0);
+    return IntImm(index.ty(), 0);
   }
   return index;
 }
@@ -219,7 +219,7 @@ class EinsumBuilder {
     PrepareOutputIndicesMapping(indices, &label_to_index, &ellipsis_indices);
     PrepareReductionIndicesMapping(indices, &label_to_index, &ellipsis_indices, &reduce_axes);
 
-    auto zero = MakeConst(inputs[0]->dtype, 0);
+    auto zero = MakeConst(PrimType(inputs[0]->dtype), 0);
 
     PrimExpr result = zero;
     for (int i = 0, n = static_cast<int>(inputs.size()); i < n; ++i) {
@@ -288,9 +288,9 @@ class EinsumBuilder {
         }
       } else {
         // Normal label
-        reduction_axes->push_back(IterVar(
-            Range(0, label_to_extent_[label]),
-            Var(std::string(1, label), label_to_extent_[label].dtype()), IterVarType::kCommReduce));
+        reduction_axes->push_back(IterVar(Range(0, label_to_extent_[label]),
+                                          Var(std::string(1, label), label_to_extent_[label].ty()),
+                                          IterVarType::kCommReduce));
         label_to_index->emplace(label, reduction_axes->back()->var);
       }
     }
diff --git a/src/topi/elemwise.cc b/src/topi/elemwise.cc
index 922c40619908..4b9d26f276e1 100644
--- a/src/topi/elemwise.cc
+++ b/src/topi/elemwise.cc
@@ -92,11 +92,11 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                   })
       .def_packed("topi.cast",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
-                    *rv = cast(args[0].cast<te::Tensor>(), args[1].cast<DataType>());
+                    *rv = cast(args[0].cast<te::Tensor>(), args[1].cast<PrimType>());
                   })
       .def_packed("topi.reinterpret",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
-                    *rv = reinterpret(args[0].cast<te::Tensor>(), args[1].cast<DataType>());
+                    *rv = reinterpret(args[0].cast<te::Tensor>(), args[1].cast<PrimType>());
                   })
       .def_packed("topi.elemwise_sum",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
@@ -106,7 +106,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                                   ffi::Any* rv) { *rv = sign(args[0].cast<te::Tensor>()); })
       .def_packed("topi.full",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
-                    *rv = full(args[0].cast<ffi::Array<PrimExpr>>(), args[1].cast<DataType>(),
+                    *rv = full(args[0].cast<ffi::Array<PrimExpr>>(), args[1].cast<PrimType>(),
                                args[2].cast<PrimExpr>());
                   })
       .def_packed("topi.full_like",
diff --git a/src/topi/nn.cc b/src/topi/nn.cc
index e7b0d9c69e44..cd4968a46145 100644
--- a/src/topi/nn.cc
+++ b/src/topi/nn.cc
@@ -91,7 +91,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def_packed("topi.nn.dense", [](ffi::PackedArgs args, ffi::Any* rv) {
     *rv = nn::dense(args[0].cast<te::Tensor>(), args[1].cast<te::Tensor>(),
-                    args[2].cast<te::Tensor>(), args[3].cast<DataType>());
+                    args[2].cast<te::Tensor>(), args[3].cast<PrimType>());
   });
 }
 
diff --git a/src/topi/transform.cc b/src/topi/transform.cc
index f0d9225fb567..a9d994c2a883 100644
--- a/src/topi/transform.cc
+++ b/src/topi/transform.cc
@@ -86,11 +86,11 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                   })
       .def_packed("topi.shape",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
-                    *rv = shape(args[0].cast<te::Tensor>(), args[1].cast<DataType>());
+                    *rv = shape(args[0].cast<te::Tensor>(), args[1].cast<PrimType>());
                   })
       .def_packed("topi.tensor_size",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
-                    *rv = tensor_size(args[0].cast<te::Tensor>(), args[1].cast<DataType>());
+                    *rv = tensor_size(args[0].cast<te::Tensor>(), args[1].cast<PrimType>());
                   })
       .def_packed("topi.split",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
@@ -141,7 +141,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
       .def_packed("topi.arange",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
                     *rv = arange(args[0].cast<PrimExpr>(), args[1].cast<PrimExpr>(),
-                                 args[2].cast<PrimExpr>(), args[3].cast<DataType>());
+                                 args[2].cast<PrimExpr>(), args[3].cast<PrimType>());
                   })
       .def_packed("topi.meshgrid",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
@@ -261,7 +261,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                   [](ffi::PackedArgs args, ffi::Any* rv) {
                     int depth = args[3].cast<int>();
                     int axis = args[4].cast<int>();
-                    DataType dtype = args[5].cast<DataType>();
+                    PrimType dtype = args[5].cast<PrimType>();
                     *rv = one_hot(args[0].cast<te::Tensor>(), args[1].cast<PrimExpr>(),
                                   args[2].cast<PrimExpr>(), depth, axis, dtype);
                   })
diff --git a/tests/cpp/arith_simplify_test.cc b/tests/cpp/arith_simplify_test.cc
index d5050446d6a5..a08968c9f954 100644
--- a/tests/cpp/arith_simplify_test.cc
+++ b/tests/cpp/arith_simplify_test.cc
@@ -99,7 +99,7 @@ TEST(AnalyzerObjectRef, CloneIsIndependent) {
 TEST(ConstantFold, Broadcast) {
   tvm::ffi::StructuralEqual checker;
   auto i32x4 = tvm::tirx::Broadcast(tvm::IntImm::Int32(10), 4);
-  auto i64x4 = tvm::cast(i32x4->dtype.with_bits(64), i32x4);
+  auto i64x4 = tvm::cast(i32x4.ty().WithBits(64), i32x4);
   auto i64x4_expected = tvm::tirx::Broadcast(tvm::IntImm::Int64(10), 4);
   ASSERT_TRUE(checker(i64x4, i64x4_expected));
 }
@@ -107,11 +107,11 @@ TEST(ConstantFold, Broadcast) {
 TEST(ConstantFold, Ramp) {
   tvm::ffi::StructuralEqual checker;
   auto i32x4 = tvm::tirx::Ramp(tvm::IntImm::Int32(10), tvm::IntImm::Int32(1), 4);
-  auto i64x4 = tvm::cast(i32x4->dtype.with_bits(64), i32x4);
+  auto i64x4 = tvm::cast(i32x4.ty().WithBits(64), i32x4);
   auto i64x4_expected = tvm::tirx::Ramp(tvm::IntImm::Int64(10), tvm::IntImm::Int64(1), 4);
   ASSERT_TRUE(checker(i64x4, i64x4_expected));
 
-  auto f32x4 = tvm::cast(tvm::DataType::Float(32, 4), i32x4);
-  auto f32x4_expected = tvm::tirx::Cast(tvm::DataType::Float(32, 4), i32x4);
+  auto f32x4 = tvm::cast(tvm::PrimType::Float(32, 4), i32x4);
+  auto f32x4_expected = tvm::tirx::Cast(tvm::PrimType::Float(32, 4), i32x4);
   ASSERT_TRUE(checker(f32x4, f32x4_expected));
 }
diff --git a/tests/cpp/expr_test.cc b/tests/cpp/expr_test.cc
index 2470dc25d6fd..f9083525732d 100644
--- a/tests/cpp/expr_test.cc
+++ b/tests/cpp/expr_test.cc
@@ -39,13 +39,22 @@ TEST(Expr, Basic) {
 TEST(Expr, VarTypeAnnotation) {
   using namespace tvm;
   using namespace tvm::tirx;
-  Var x("x", DataType::Float(32));
-  Var y("y", PrimType(DataType::Float(32)));
+  Var x("x", PrimType::Float(32));
+  Var y("y", PrimType::Float(32));
   tvm::ffi::StructuralEqual checker;
-  TVM_FFI_ICHECK(checker(x->dtype, y->dtype));
+  TVM_FFI_ICHECK(checker(x.ty(), y.ty()));
   TVM_FFI_ICHECK(checker(x->type_annotation, y->type_annotation));
 }
 
+TEST(Expr, PrimTypeBoolLanes) {
+  using namespace tvm;
+  PrimType boolx4 = PrimType::Bool(4);
+  TVM_FFI_ICHECK(boolx4.IsFixedLengthVector());
+  TVM_FFI_ICHECK(boolx4.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_ICHECK_EQ(boolx4.lanes(), 4);
+  TVM_FFI_ICHECK(boolx4.MatchesElementType(DLDataTypeCode::kDLBool, 8));
+}
+
 TEST(ExprNodeRef, Basic) {
   using namespace tvm;
   using namespace tvm::tirx;
diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc
index 2befce821d79..62f05fd90dbf 100644
--- a/tests/cpp/ir_functor_test.cc
+++ b/tests/cpp/ir_functor_test.cc
@@ -152,8 +152,8 @@ TEST(IRF, StmtVisitor) {
   auto fmaketest = [&]() {
     auto z = x + 1;
     Stmt eval_body = Evaluate(z);
-    DataType dtype = DataType::Float(32);
-    Var data_var("b", PointerType(PrimType(dtype)));
+    PrimType dtype = PrimType::Float(32);
+    Var data_var("b", PointerType(dtype));
     Buffer buf(data_var, dtype, {z, z}, {}, PrimExpr(), "b", 0, 0, BufferType::kDefault);
     // AllocBuffer is flat (no body). Return as SeqStmt with eval.
     return SeqStmt({AllocBuffer(buf), eval_body});
@@ -166,8 +166,8 @@ TEST(IRF, StmtVisitor) {
   {
     // tests for block and block_realize
     Stmt body = fmaketest();
-    DataType dtype = DataType::Float(32);
-    Var buf_var("b", PointerType(PrimType(dtype)));
+    PrimType dtype = PrimType::Float(32);
+    Var buf_var("b", PointerType(dtype));
     Buffer buffer = decl_buffer({16});
     body = SeqStmt({DeclBuffer(buffer), std::move(body)});
     BufferRegion buffer_region(buffer, {Range::FromMinExtent(x + 1, 1)});
@@ -206,8 +206,8 @@ TEST(IRF, StmtMutator) {
   };
   auto fmakealloc = [&]() {
     auto z = x + 1;
-    DataType dtype = DataType::Float(32);
-    Var data_var("b", PointerType(PrimType(dtype)));
+    PrimType dtype = PrimType::Float(32);
+    Var data_var("b", PointerType(dtype));
     Buffer buf(data_var, dtype, {1, z}, {}, PrimExpr(), "b", 0, 0, BufferType::kDefault);
     return AllocBuffer(buf);
   };
@@ -258,7 +258,7 @@ TEST(IRF, StmtMutator) {
 
   {
     auto body =
-        Evaluate(Call(DataType::Int(32), builtin::call_extern(), {StringImm("xyz"), x + 1}));
+        Evaluate(Call(PrimType::Int(32), builtin::call_extern(), {StringImm("xyz"), x + 1}));
     auto res = v(std::move(body));
     TVM_FFI_ICHECK(res.as<EvaluateNode>()->value.as<CallNode>()->args[1].same_as(x));
   }
@@ -330,13 +330,13 @@ TEST(IRF, StmtMutator) {
 TEST(IRF, Substitute) {
   using namespace tvm;
   using namespace tvm::tirx;
-  DataType dtype = DataType::Float(32);
-  Var x("x", PointerType(PrimType(dtype), ""));
-  Var n("n", DataType::Int(32));
+  PrimType dtype = PrimType::Float(32);
+  Var x("x", PointerType(dtype, ""));
+  Var n("n", PrimType::Int(32));
 
   auto fmakebuffer = [&]() {
     return Buffer{/*data=*/x,
-                  /*dtype=*/DataType::Float(32),
+                  /*dtype=*/PrimType::Float(32),
                   /*shape=*/{n},
                   /*strides=*/{},
                   /*elem_offset=*/PrimExpr(),
@@ -349,7 +349,7 @@ TEST(IRF, Substitute) {
   {
     // test substitute buffer data var and shape var via DeclBuffer
     Var y = x.copy_with_suffix("subst");
-    Var m("m", DataType::Int(32));
+    Var m("m", PrimType::Int(32));
     Buffer buffer = fmakebuffer();
     Stmt store = BufferStore(buffer, FloatImm(dtype, 0), {IntImm::Int32(0)});
     Stmt decl = SeqStmt({DeclBuffer(buffer), store});
diff --git a/tests/cpp/ndarray_test.cc b/tests/cpp/ndarray_test.cc
index c02efecc5148..79695a299d49 100644
--- a/tests/cpp/ndarray_test.cc
+++ b/tests/cpp/ndarray_test.cc
@@ -24,7 +24,7 @@
 using namespace tvm;
 
 TEST(TensorTest, IsContiguous_ContiguousStride) {
-  auto array = runtime::Tensor::Empty({5, 10}, DataType::Float(32), {kDLCPU});
+  auto array = runtime::Tensor::Empty({5, 10}, DLDataType{kDLFloat, 32, 1}, {kDLCPU});
   DLManagedTensor* managed_tensor = array.ToDLPack();
 
   int64_t strides[] = {10, 1};
@@ -36,7 +36,7 @@ TEST(TensorTest, IsContiguous_ContiguousStride) {
 }
 
 TEST(TensorTest, IsContiguous_NullStride) {
-  auto array = runtime::Tensor::Empty({5, 10}, DataType::Float(32), {kDLCPU});
+  auto array = runtime::Tensor::Empty({5, 10}, DLDataType{kDLFloat, 32, 1}, {kDLCPU});
   DLManagedTensor* managed_tensor = array.ToDLPack();
 
   managed_tensor->dl_tensor.strides = nullptr;
@@ -47,7 +47,7 @@ TEST(TensorTest, IsContiguous_NullStride) {
 }
 
 TEST(TensorTest, IsContiguous_AnyStrideForSingular) {
-  auto array = runtime::Tensor::Empty({5, 1, 10}, DataType::Float(32), {kDLCPU});
+  auto array = runtime::Tensor::Empty({5, 1, 10}, DLDataType{kDLFloat, 32, 1}, {kDLCPU});
   DLManagedTensor* managed_tensor = array.ToDLPack();
 
   int64_t strides[] = {10, 1, 1};  // strides[1] is normalized to 1 because shape[1] == 1.
@@ -60,7 +60,7 @@ TEST(TensorTest, IsContiguous_AnyStrideForSingular) {
 }
 
 TEST(TensorTest, IsContiguous_UncontiguousStride) {
-  auto array = runtime::Tensor::Empty({5, 1, 10}, DataType::Float(32), {kDLCPU});
+  auto array = runtime::Tensor::Empty({5, 1, 10}, DLDataType{kDLFloat, 32, 1}, {kDLCPU});
   DLManagedTensor* managed_tensor = array.ToDLPack();
 
   int64_t strides[] = {1, 1, 1};
diff --git a/tests/cpp/nested_msg_test.cc b/tests/cpp/nested_msg_test.cc
index 26bfee06f47d..96f645924382 100644
--- a/tests/cpp/nested_msg_test.cc
+++ b/tests/cpp/nested_msg_test.cc
@@ -18,11 +18,11 @@
  */
 
 #include <gtest/gtest.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/structural_equal.h>
 #include <tvm/relax/block_builder.h>
 #include <tvm/relax/nested_msg.h>
 #include <tvm/relax/type.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/logging.h>
 #include <tvm/tirx/expr.h>
 
@@ -40,6 +40,17 @@
 using namespace tvm;
 using namespace tvm::relax;
 
+namespace {
+
+TensorType ScalarTensorType(PrimType dtype) {
+  auto n = tvm::ffi::make_object<TensorTypeNode>();
+  n->dtype = std::move(dtype);
+  n->ndim = 0;
+  return TensorType(n);
+}
+
+}  // namespace
+
 TEST(NestedMsg, Basic) {
   // start with no annotation
   relax::Var x("x", std::nullopt), y("y", std::nullopt);
@@ -145,9 +156,9 @@ TEST(NestedMsg, Equal) {
 }
 
 TEST(NestedMsg, MapAndDecompose) {
-  relax::Var x("x", PrimType(runtime::DataType::Int(16)));
-  relax::Var y("y", PrimType(runtime::DataType::Int(32)));
-  relax::Var z("z", PrimType(runtime::DataType::Int(64)));
+  relax::Var x("x", PrimType::Int(16));
+  relax::Var y("y", PrimType::Int(32));
+  relax::Var z("z", PrimType::Int(64));
 
   BlockBuilder bb = BlockBuilder::Create(std::nullopt);
   relax::Expr t0 = bb->Normalize(Tuple({x, y}));
@@ -171,7 +182,7 @@ TEST(NestedMsg, MapAndDecompose) {
   auto output2 = MapToNestedMsg<IntImm>(GetType(t1), [&](Type ty) -> NestedMsg<IntImm> {
     const auto* prim_ty = ty.as<PrimTypeNode>();
     if (prim_ty == nullptr) return std::nullopt;
-    int bits = prim_ty->dtype.bits();
+    int bits = prim_ty->dtype.bits;
     if (bits == 16) return c0;
     if (bits == 32) return c1;
     if (bits == 64) return c2;
@@ -201,7 +212,7 @@ TEST(NestedMsg, MapAndDecompose) {
 }
 
 TEST(NestedMsg, MapToNestedMsgByType) {
-  auto sf0 = TensorType(DataType::Float(32), /*ndim=*/0);
+  auto sf0 = ScalarTensorType(PrimType::Float(32));
   auto sf1 = TupleType({sf0, sf0});
   auto sf2 = TupleType({sf0, sf0});
   auto x = relax::Var("x", TupleType({sf1, sf2, sf0}));
@@ -223,7 +234,7 @@ TEST(NestedMsg, MapToNestedMsgByType) {
 }
 
 TEST(NestedMsg, NestedMsgToExpr) {
-  auto sf0 = TensorType(DataType::Float(32), /*ndim=*/0);
+  auto sf0 = ScalarTensorType(PrimType::Float(32));
   auto sf1 = TupleType({sf0, sf0});
 
   auto c0 = IntImm::Int32(0);
@@ -306,7 +317,7 @@ TEST(NestedMsg, TransformTupleLeaf) {
   NInt msg1 = {c0, {c0, c1}, c2, {c0, {c1, c2}}};
   NInt msg2 = {c1, {c2, c0}, c2, {c1, {c2, c0}}};
 
-  PrimType s = PrimType(runtime::DataType::Int(32));
+  PrimType s = PrimType::Int(32);
   relax::Var x("x", s), y("y", s), z("z", s);
   BlockBuilder bb = BlockBuilder::Create(std::nullopt);
   Expr expr = bb->Normalize(Tuple({x, Tuple({x, x}), x, Tuple({x, Tuple({x, x})})}));
diff --git a/tests/cpp/pattern_match_test.cc b/tests/cpp/pattern_match_test.cc
index ab668e9a4204..e9075c6faf9f 100644
--- a/tests/cpp/pattern_match_test.cc
+++ b/tests/cpp/pattern_match_test.cc
@@ -27,9 +27,9 @@ TEST(Pattern, Basic) {
   using namespace tvm::tirx;
   using namespace tvm::arith;
   tvm::tirx::Var x("x"), y("y"), z("z");
-  PrimExpr scalable_lanes = Mul(Call(DataType::Int(32), builtin::vscale(), {}), 4);
+  PrimExpr scalable_lanes = Mul(Call(PrimType::Int(32), builtin::vscale(), {}), 4);
   arith::PVar<PrimExpr> px, py, pz;
-  arith::PVar<DataType> pt;
+  arith::PVar<PrimType> pt;
   arith::PVar<PrimExpr> planes;
   arith::PCallExpr<PVscaleOp> vscale;
 
@@ -101,14 +101,14 @@ TEST(Pattern, Basic) {
   // cast pattern
   {
     TVM_FFI_ICHECK(
-        !cast(PConst<DataType>(DataType::Int(32)), px).Match(tirx::Cast(DataType::Float(64), x)));
-    TVM_FFI_ICHECK(cast(pt, px).Match(tirx::Cast(DataType::Float(64), x)));
-    TVM_FFI_ICHECK(pt.Eval() == DataType::Float(64));
+        !cast(PConst<PrimType>(PrimType::Int(32)), px).Match(tirx::Cast(PrimType::Float(64), x)));
+    TVM_FFI_ICHECK(cast(pt, px).Match(tirx::Cast(PrimType::Float(64), x)));
+    TVM_FFI_ICHECK(pt.Eval() == PrimType::Float(64));
     auto zz = cast(pt, px).Eval();
     TVM_FFI_ICHECK(
         (cast(pt, px) - cast(pt, py))
-            .Match(tirx::Cast(DataType::Float(64), x) - tirx::Cast(DataType::Int(64), x)));
-    auto expr = tirx::Cast(DataType::Int(32), tirx::Cast(DataType::Float(64), x));
+            .Match(tirx::Cast(PrimType::Float(64), x) - tirx::Cast(PrimType::Int(64), x)));
+    auto expr = tirx::Cast(PrimType::Int(32), tirx::Cast(PrimType::Float(64), x));
     TVM_FFI_ICHECK(!(cast(pt, cast(pt, px))).Match(expr));
   }
   // ramp pattern
@@ -150,21 +150,21 @@ TEST(Pattern, IntImm) {
 TEST(Pattern, MatchWithType) {
   using namespace tvm;
   // match expr with specified dtype
-  arith::PVarWithDataType<PrimExpr, arith::PConst<DataType>> pat(DataType::Float(32));
-  tirx::Var x("x", DataType::Float(32));
-  tirx::Var y("y", DataType::Float(32));
-  tirx::Var x_int("x", DataType::Int(32));
-  tirx::Var y_int("y", DataType::Int(32));
+  arith::PVarWithDataType<PrimExpr, arith::PConst<PrimType>> pat(PrimType::Float(32));
+  tirx::Var x("x", PrimType::Float(32));
+  tirx::Var y("y", PrimType::Float(32));
+  tirx::Var x_int("x", PrimType::Int(32));
+  tirx::Var y_int("y", PrimType::Int(32));
   TVM_FFI_ICHECK(pat.Match(x + y * 2.0f));
   TVM_FFI_ICHECK(!pat.Match(x_int + y_int * 2));
 
   // match vectorized expr with specified element dtype
-  arith::PVecDataType vec_ty(DataType::Float(32));
+  arith::PVecDataType vec_ty(PrimType::Float(32));
   arith::PVarWithDataType<PrimExpr, arith::PVecDataType> vpat(vec_ty);
-  tirx::Var vx = tirx::Var("x", DataType::Float(32, 8));
-  tirx::Var vy("y", DataType::Float(32, 8));
-  tirx::Var vx_int("x", DataType::Int(32, 8));
-  tirx::Var vy_int("y", DataType::Int(32, 8));
+  tirx::Var vx = tirx::Var("x", PrimType::Float(32, 8));
+  tirx::Var vy("y", PrimType::Float(32, 8));
+  tirx::Var vx_int("x", PrimType::Int(32, 8));
+  tirx::Var vy_int("y", PrimType::Int(32, 8));
   TVM_FFI_ICHECK(vpat.Match(vx + vy * tirx::Broadcast(2.0f, 8)));
   TVM_FFI_ICHECK(!vpat.Match(vx_int + vy_int * tirx::Broadcast(2, 8)));
 }
diff --git a/tests/cpp/te_compute_test.cc b/tests/cpp/te_compute_test.cc
index 6f1e6aa9b8cc..30397fb765bb 100644
--- a/tests/cpp/te_compute_test.cc
+++ b/tests/cpp/te_compute_test.cc
@@ -27,8 +27,8 @@ TEST(Tensor, Basic) {
 
   Var m("m"), n("n"), l("l");
 
-  Tensor A = placeholder({m, l}, DataType::Float(32), "A");
-  Tensor B = placeholder({n, l}, DataType::Float(32), "B");
+  Tensor A = placeholder({m, l}, PrimType::Float(32), "A");
+  Tensor B = placeholder({n, l}, PrimType::Float(32), "B");
 
   auto C = compute({m, n}, [&](Var i, Var j) { return A[i][j]; }, "C");
 
@@ -40,8 +40,8 @@ TEST(Tensor, Reduce) {
   using namespace tvm::te;
 
   Var m("m"), n("n"), l("l");
-  te::Tensor A = te::placeholder({m, l}, DataType::Float(32), "A");
-  te::Tensor B = te::placeholder({n, l}, DataType::Float(32), "B");
+  te::Tensor A = te::placeholder({m, l}, PrimType::Float(32), "A");
+  te::Tensor B = te::placeholder({n, l}, PrimType::Float(32), "B");
   IterVar rv = reduce_axis(Range{0, l}, "k");
 
   auto C = te::compute(
@@ -53,5 +53,5 @@ TEST(Tensor, Indexing) {
   using namespace tvm::te;
 
   Var x("x"), y("y");
-  te::Tensor A = te::placeholder({x, y}, DataType::Float(32), "A");
+  te::Tensor A = te::placeholder({x, y}, PrimType::Float(32), "A");
 }
diff --git a/tests/cpp/tir_analysis_side_effect.cc b/tests/cpp/tir_analysis_side_effect.cc
index bcc7128647b4..1183f37abee6 100644
--- a/tests/cpp/tir_analysis_side_effect.cc
+++ b/tests/cpp/tir_analysis_side_effect.cc
@@ -25,11 +25,11 @@
 
 TEST(SimplePasses, SideEffect) {
   using namespace tvm;
-  auto buf = tirx::decl_buffer({16}, DataType::Float(32));
-  auto i = tirx::Var("i", DataType::Int(32));
+  auto buf = tirx::decl_buffer({16}, PrimType::Float(32));
+  auto i = tirx::Var("i", PrimType::Int(32));
   TVM_FFI_ICHECK(tirx::SideEffect(tirx::BufferLoad(buf, {i})) == tirx::CallEffectKind::kReadState);
-  TVM_FFI_ICHECK(tirx::SideEffect(exp(tirx::Cast(DataType::Float(32), i + 1))) ==
+  TVM_FFI_ICHECK(tirx::SideEffect(exp(tirx::Cast(PrimType::Float(32), i + 1))) ==
                  tirx::CallEffectKind::kPure);
-  TVM_FFI_ICHECK(tirx::SideEffect(tirx::Call(DataType::Handle(), tirx::builtin::tvm_storage_sync(),
+  TVM_FFI_ICHECK(tirx::SideEffect(tirx::Call(PrimType::Handle(), tirx::builtin::tvm_storage_sync(),
                                              {})) == tirx::CallEffectKind::kUpdateState);
 }
diff --git a/tests/cpp/tir_scalable_datatype.cc b/tests/cpp/tir_scalable_datatype.cc
index a81915c74b97..015bae4564a1 100644
--- a/tests/cpp/tir_scalable_datatype.cc
+++ b/tests/cpp/tir_scalable_datatype.cc
@@ -19,7 +19,7 @@
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include <tvm/runtime/data_type.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/script/printer/printer.h>
 #include <tvm/tirx/builtin.h>
 #include <tvm/tirx/expr.h>
@@ -33,67 +33,68 @@
 using ::testing::HasSubstr;
 
 // ---------
-// Data Type
+// Prim Type
 // ---------
-TEST(ScalableDataType, TestCreateScalableType) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 4, true);
+TEST(ScalablePrimType, TestCreateScalableType) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 4);
   ASSERT_EQ(scalable_type.code(), kDLInt);
   ASSERT_EQ(scalable_type.bits(), 32);
-  ASSERT_EQ(scalable_type.vscale_factor(), 4);
-  ASSERT_TRUE(scalable_type.is_scalable_vector());
-  ASSERT_TRUE(scalable_type.is_scalable_or_fixed_length_vector());
+  ASSERT_EQ(scalable_type.VScaleFactor(), 4);
+  ASSERT_TRUE(scalable_type.IsScalableVector());
+  ASSERT_TRUE(scalable_type.IsScalableVector() || scalable_type.IsFixedLengthVector());
 }
 
-TEST(ScalableDataType, TestScalableWithBits) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 1, 8, true);
-  scalable_type = scalable_type.with_bits(32);
+TEST(ScalablePrimType, TestScalableWithBits) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 1, 8);
+  scalable_type = scalable_type.WithBits(32);
   ASSERT_EQ(scalable_type.bits(), 32);
-  ASSERT_TRUE(scalable_type.is_scalable_vector());
-  ASSERT_TRUE(scalable_type.is_scalable_or_fixed_length_vector());
+  ASSERT_TRUE(scalable_type.IsScalableVector());
+  ASSERT_TRUE(scalable_type.IsScalableVector() || scalable_type.IsFixedLengthVector());
 }
 
-TEST(ScalableDataType, TestScalableWithVscaleFactor) {
-  tvm::DataType type = tvm::DataType(kDLInt, 32, 1);
-  tvm::DataType scalable_type = type.with_scalable_vscale_factor(4);
-  ASSERT_EQ(scalable_type.vscale_factor(), 4);
-  ASSERT_TRUE(scalable_type.is_scalable_vector());
-  ASSERT_TRUE(scalable_type.is_scalable_or_fixed_length_vector());
+TEST(ScalablePrimType, TestScalableWithVscaleFactor) {
+  tvm::PrimType type = tvm::PrimType::Int(32);
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(type.code(), type.bits(), 4);
+  ASSERT_EQ(scalable_type.VScaleFactor(), 4);
+  ASSERT_TRUE(scalable_type.IsScalableVector());
+  ASSERT_TRUE(scalable_type.IsScalableVector() || scalable_type.IsFixedLengthVector());
 }
 
-TEST(ScalableDataType, TestAssignScalableDataType) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 2, true);
-  tvm::DataType scalable_type_copy = scalable_type;
-  ASSERT_TRUE(scalable_type_copy.is_scalable_vector());
-  ASSERT_TRUE(scalable_type_copy.is_scalable_or_fixed_length_vector());
+TEST(ScalablePrimType, TestAssignScalablePrimType) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 2);
+  tvm::PrimType scalable_type_copy = scalable_type;
+  ASSERT_TRUE(scalable_type_copy.IsScalableVector());
+  ASSERT_TRUE(scalable_type_copy.IsScalableVector() || scalable_type_copy.IsFixedLengthVector());
 }
 
-TEST(ScalableDataType, TestScalableDataTypeEquality) {
-  ASSERT_TRUE(tvm::DataType(kDLInt, 32, 4, true) == tvm::DataType(kDLInt, 32, 4, true));
+TEST(ScalablePrimType, TestScalablePrimTypeEquality) {
+  ASSERT_TRUE(tvm::PrimType::ScalableVector(kDLInt, 32, 4) ==
+              tvm::PrimType::ScalableVector(kDLInt, 32, 4));
 }
 
-TEST(ScalableDataType, TestScalableDataTypeAndNonScalableDataTypeInequality) {
-  ASSERT_FALSE(tvm::DataType(kDLInt, 32, 4, true) == tvm::DataType(kDLInt, 32, 4));
+TEST(ScalablePrimType, TestScalablePrimTypeAndNonScalablePrimTypeInequality) {
+  ASSERT_FALSE(tvm::PrimType::ScalableVector(kDLInt, 32, 4) == tvm::PrimType::Int(32, 4));
 }
 
-TEST(ScalableDataType, TestIsScalar) {
-  ASSERT_FALSE(tvm::DataType(kDLInt, 32, 4, true).is_scalar());
-  ASSERT_TRUE(tvm::DataType(kDLInt, 32, 1, false).is_scalar());
-  ASSERT_FALSE(tvm::DataType(kDLInt, 32, 4, false).is_scalar());
-  ASSERT_FALSE(tvm::DataType(kDLOpaqueHandle, 1, 0, false).is_scalar());
+TEST(ScalablePrimType, TestIsScalar) {
+  ASSERT_FALSE(tvm::PrimType::ScalableVector(kDLInt, 32, 4).IsScalar());
+  ASSERT_TRUE(tvm::PrimType::Int(32).IsScalar());
+  ASSERT_FALSE(tvm::PrimType::Int(32, 4).IsScalar());
+  ASSERT_FALSE(tvm::PrimType::Void().IsScalar());
 }
 
-TEST(ScalableDataType, TestScalableDataTypeToString) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 4, true);
-  EXPECT_EQ(tvm::ffi::DLDataTypeToString(scalable_type), "int32xvscalex4");
+TEST(ScalablePrimType, TestScalablePrimTypeToString) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 4);
+  EXPECT_EQ(tvm::ffi::DLDataTypeToString(scalable_type->dtype), "int32xvscalex4");
 }
 
-TEST(ScalableDataType, TestStringToScalableDataType) {
+TEST(ScalablePrimType, TestStringToScalablePrimType) {
   std::string scalable_type_str = "int32xvscalex4";
-  EXPECT_EQ(tvm::DataType(tvm::ffi::StringToDLDataType(scalable_type_str)),
-            tvm::DataType(kDLInt, 32, 4, true));
+  EXPECT_EQ(tvm::PrimType(tvm::ffi::StringToDLDataType(scalable_type_str)),
+            tvm::PrimType::ScalableVector(kDLInt, 32, 4));
 }
 
-TEST(ScalableDataType, TestInvalidStringToScalableDataType) {
+TEST(ScalablePrimType, TestInvalidStringToScalablePrimType) {
   std::string scalable_type_str = "int32x4xvscale";
   EXPECT_THROW(
       {
@@ -107,12 +108,13 @@ TEST(ScalableDataType, TestInvalidStringToScalableDataType) {
       tvm::ffi::Error);
 }
 
-TEST(ScalableDataType, TestGetScalableVectorBytes) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 4, true);
+TEST(ScalablePrimType, TestGetScalableVectorBytes) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 4);
   EXPECT_THROW(
       {
         try {
-          tvm::runtime::GetVectorBytes(scalable_type);
+          int bytes = (scalable_type.bits() * scalable_type.lanes() + 7) / 8;
+          static_cast<void>(bytes);
         } catch (const tvm::ffi::Error& e) {
           EXPECT_THAT(e.what(),
                       HasSubstr("Can't fetch the lanes of a scalable vector at a compile time"));
@@ -122,11 +124,11 @@ TEST(ScalableDataType, TestGetScalableVectorBytes) {
       tvm::ffi::Error);
 }
 
-TEST(ScalableDataType, TestScalableDataTypeInvalidLanesError) {
+TEST(ScalablePrimType, TestScalablePrimTypeInvalidLanesError) {
   EXPECT_THROW(
       {
         try {
-          tvm::DataType(kDLFloat, 62, 1, true);
+          tvm::PrimType::ScalableVector(kDLFloat, 62, 1);
         } catch (const tvm::ffi::Error& e) {
           EXPECT_THAT(e.what(), HasSubstr("Invalid value for vscale factor"));
           throw;
@@ -135,14 +137,14 @@ TEST(ScalableDataType, TestScalableDataTypeInvalidLanesError) {
       tvm::ffi::Error);
 }
 
-TEST(ScalableDataType, TestScalableDataTypeInvalidVscaleFactorAccess) {
-  tvm::DataType fixed_length_type = tvm::DataType(kDLFloat, 32, 4);
-  ASSERT_TRUE(fixed_length_type.is_fixed_length_vector());
-  ASSERT_TRUE(fixed_length_type.is_scalable_or_fixed_length_vector());
+TEST(ScalablePrimType, TestScalablePrimTypeInvalidVscaleFactorAccess) {
+  tvm::PrimType fixed_length_type = tvm::PrimType::Float(32, 4);
+  ASSERT_TRUE(fixed_length_type.IsFixedLengthVector());
+  ASSERT_TRUE(fixed_length_type.IsScalableVector() || fixed_length_type.IsFixedLengthVector());
   EXPECT_THROW(
       {
         try {
-          fixed_length_type.vscale_factor();
+          fixed_length_type.VScaleFactor();
         } catch (const tvm::ffi::Error& e) {
           EXPECT_THAT(e.what(), HasSubstr("A fixed length vector doesn't have a vscale factor"));
           throw;
@@ -151,8 +153,8 @@ TEST(ScalableDataType, TestScalableDataTypeInvalidVscaleFactorAccess) {
       tvm::ffi::Error);
 }
 
-TEST(ScalableDataType, TestScalableDataTypeInvalidLanesAccess) {
-  tvm::DataType scalable_type = tvm::DataType(kDLFloat, 32, 4, true);
+TEST(ScalablePrimType, TestScalablePrimTypeInvalidLanesAccess) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLFloat, 32, 4);
   EXPECT_THROW(
       {
         try {
@@ -166,28 +168,28 @@ TEST(ScalableDataType, TestScalableDataTypeInvalidLanesAccess) {
       tvm::ffi::Error);
 }
 
-TEST(ScalableDataType, TestScalableBool) {
-  tvm::DataType scalable_type = tvm::DataType::Bool(4, true);
+TEST(ScalablePrimType, TestScalableBool) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLBool, 8, 4);
   ASSERT_EQ(scalable_type.code(), kDLBool);
   ASSERT_EQ(scalable_type.bits(), 8);
-  ASSERT_EQ(scalable_type.vscale_factor(), 4);
-  ASSERT_TRUE(scalable_type.is_scalable_vector());
+  ASSERT_EQ(scalable_type.VScaleFactor(), 4);
+  ASSERT_TRUE(scalable_type.IsScalableVector());
 }
 
-TEST(ScalableDataType, TestScalableUInt) {
-  tvm::DataType scalable_type = tvm::DataType::UInt(1, 4, true);
+TEST(ScalablePrimType, TestScalableUInt) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLUInt, 1, 4);
   ASSERT_EQ(scalable_type.code(), kDLUInt);
   ASSERT_EQ(scalable_type.bits(), 1);
-  ASSERT_EQ(scalable_type.vscale_factor(), 4);
-  ASSERT_TRUE(scalable_type.is_scalable_vector());
+  ASSERT_EQ(scalable_type.VScaleFactor(), 4);
+  ASSERT_TRUE(scalable_type.IsScalableVector());
 }
 
 // -----------
 // Integration
 // -----------
 #ifdef TVM_LLVM_VERSION
-TEST(ScalableDataType, TestScalableIntrinCall) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 4, true);
+TEST(ScalablePrimType, TestScalableIntrinCall) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 4);
   tvm::tirx::Call call = tvm::tirx::Call(scalable_type, tvm::tirx::builtin::call_llvm_intrin(),
 #if TVM_LLVM_VERSION >= 200
                                          {tvm::IntImm::Int32(::llvm::Intrinsic::stepvector)});
@@ -195,7 +197,7 @@ TEST(ScalableDataType, TestScalableIntrinCall) {
                                          {tvm::IntImm::Int32(
                                              ::llvm::Intrinsic::experimental_stepvector)});
 #endif
-  ASSERT_EQ(call->dtype, scalable_type);
+  ASSERT_EQ(tvm::PrimType(call.ty()->dtype), scalable_type);
   ASSERT_EQ(tvm::Script(call),
 #if TVM_LLVM_VERSION >= 200
             "T.call_llvm_intrin(\"int32xvscalex4\", \"llvm.stepvector\")");
@@ -205,7 +207,7 @@ TEST(ScalableDataType, TestScalableIntrinCall) {
 }
 #endif
 
-TEST(ScalableDataType, TestTIRScriptScalableDtype2Str) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 4, true);
-  ASSERT_EQ(tvm::script::printer::DType2Str(scalable_type), "int32xvscalex4");
+TEST(ScalablePrimType, TestTIRScriptScalableDtype2Str) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 4);
+  ASSERT_EQ(tvm::script::printer::DType2Str(scalable_type->dtype), "int32xvscalex4");
 }
diff --git a/tests/cpp/topi_ewise_test.cc b/tests/cpp/topi_ewise_test.cc
index 9f4457de5192..41b48ed3c5be 100644
--- a/tests/cpp/topi_ewise_test.cc
+++ b/tests/cpp/topi_ewise_test.cc
@@ -26,7 +26,7 @@ namespace topi {
 TEST(Tensor, Basic) {
   using namespace tvm;
   Var m("m"), l("l");
-  Tensor A = placeholder({m, l}, DataType::Float(32), "A");
+  Tensor A = placeholder({m, l}, PrimType::Float(32), "A");
   auto C = topi::exp(A);
 }
 }  // namespace topi
diff --git a/tests/python/codegen/test_target_codegen_llvm.py b/tests/python/codegen/test_target_codegen_llvm.py
index 624d587b825f..186b2c6de318 100644
--- a/tests/python/codegen/test_target_codegen_llvm.py
+++ b/tests/python/codegen/test_target_codegen_llvm.py
@@ -1145,7 +1145,7 @@ def test_call_packed_returning_void():
 
     The LLVM codegen uses the CallNode's dtype to cast the return type
     of the PackedFunc into the appropriate LLVM output type.  However,
-    there is no API type for `DataType::Void()`.  When the return type
+    there is no runtime dtype value for a void return.  When the return type
     of a PackedFunc is void, the generated code should not attempt to
     read the return value.
 
diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py
index 33d37ccdd372..abc814e52346 100644
--- a/tests/python/contrib/test_sort.py
+++ b/tests/python/contrib/test_sort.py
@@ -56,11 +56,13 @@ def test_sort():
     dev = tvm.cpu(0)
     target = "llvm"
     f = tvm.compile(te.create_prim_func([data, sort_num, out]), target=target)
-    a = tvm.runtime.tensor(np.array(input_data).astype(data.dtype), dev)
-    b = tvm.runtime.tensor(np.array(sort_num_input).astype(sort_num.dtype), dev)
-    c = tvm.runtime.tensor(np.zeros(a.shape, dtype=out.dtype), dev)
+    a = tvm.runtime.tensor(np.array(input_data).astype(data.dtype.dtype), dev)
+    b = tvm.runtime.tensor(np.array(sort_num_input).astype(sort_num.dtype.dtype), dev)
+    c = tvm.runtime.tensor(np.zeros(a.shape, dtype=out.dtype.dtype), dev)
     f(a, b, c)
-    tvm.testing.assert_allclose(c.numpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
+    tvm.testing.assert_allclose(
+        c.numpy(), np.array(sorted_index).astype(out.dtype.dtype), rtol=1e-5
+    )
 
 
 def test_sort_np():
@@ -88,9 +90,9 @@ def test_sort_np():
     np_data = np.random.uniform(size=dshape)
     np_out = np.argsort(np_data, axis=axis)
     sort_num_input = np.full(reduced_shape, dshape[axis])
-    a = tvm.runtime.tensor(np.array(np_data).astype(data.dtype), dev)
-    b = tvm.runtime.tensor(np.array(sort_num_input).astype(sort_num.dtype), dev)
-    c = tvm.runtime.tensor(np.zeros(a.shape, dtype=out.dtype), dev)
+    a = tvm.runtime.tensor(np.array(np_data).astype(data.dtype.dtype), dev)
+    b = tvm.runtime.tensor(np.array(sort_num_input).astype(sort_num.dtype.dtype), dev)
+    c = tvm.runtime.tensor(np.zeros(a.shape, dtype=out.dtype.dtype), dev)
     f(a, b, c)
     tvm.testing.assert_allclose(c.numpy(), np_out, rtol=1e-5)
 
diff --git a/tests/python/relax/frontend_nn_extern_module.cc b/tests/python/relax/frontend_nn_extern_module.cc
index 1ca2b300d3a8..33755cce581b 100644
--- a/tests/python/relax/frontend_nn_extern_module.cc
+++ b/tests/python/relax/frontend_nn_extern_module.cc
@@ -22,19 +22,18 @@
  */
 #include <dlpack/dlpack.h>
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
-#include <tvm/runtime/data_type.h>
 
 namespace {
 
 int _scalar_add(DLTensor* a, DLTensor* b, DLTensor* c) {
-  using namespace tvm::runtime;
   TVM_FFI_ICHECK(a->ndim == 0);
   TVM_FFI_ICHECK(b->ndim == 0);
   TVM_FFI_ICHECK(c->ndim == 0);
-  TVM_FFI_ICHECK(DataType(a->dtype) == DataType::Float(32));
-  TVM_FFI_ICHECK(DataType(b->dtype) == DataType::Float(32));
-  TVM_FFI_ICHECK(DataType(c->dtype) == DataType::Float(32));
+  TVM_FFI_ICHECK((a->dtype == DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK((b->dtype == DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK((c->dtype == DLDataType{kDLFloat, 32, 1}));
   float* a_data = static_cast<float*>(a->data);
   float* b_data = static_cast<float*>(b->data);
   float* c_data = static_cast<float*>(c->data);
@@ -43,13 +42,12 @@ int _scalar_add(DLTensor* a, DLTensor* b, DLTensor* c) {
 }
 
 int _test_sym(DLTensor* a, DLTensor* b, DLTensor* c) {
-  using namespace tvm::runtime;
   TVM_FFI_ICHECK(a->ndim == 3);  // [x, y, 1]
   TVM_FFI_ICHECK(b->ndim == 3);  // [y, z, 5]
   TVM_FFI_ICHECK(c->ndim == 4);  // [x, y, z, 9]
-  TVM_FFI_ICHECK(DataType(a->dtype) == DataType::Float(32));
-  TVM_FFI_ICHECK(DataType(b->dtype) == DataType::Float(32));
-  TVM_FFI_ICHECK(DataType(c->dtype) == DataType::Float(32));
+  TVM_FFI_ICHECK((a->dtype == DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK((b->dtype == DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK((c->dtype == DLDataType{kDLFloat, 32, 1}));
   int x = a->shape[0];
   int y = a->shape[1];
   int z = b->shape[1];
diff --git a/tests/python/relax/test_analysis_well_formed.py b/tests/python/relax/test_analysis_well_formed.py
index a123dfe75e29..b13e87ea3fe4 100644
--- a/tests/python/relax/test_analysis_well_formed.py
+++ b/tests/python/relax/test_analysis_well_formed.py
@@ -662,7 +662,7 @@ def test_pass_dltensor_arg_to_tir():
 
     In TIR, a `DLTensor*` argument with unknown shape and dtype is
     represented as a `tirx.Var` with
-    `tvm::PrimType(DataType::Handle())`, and with no entry in the
+    `tvm::PrimType::Handle()`, and with no entry in the
     `PrimFuncNode::buffer_map`.  In Relax, this is represented as
     `R.Tensor`.  Calls from Relax to TIR that pass a tensor of unknown
     rank/shape are well-formed.
diff --git a/tests/python/te/test_te_create_primfunc.py b/tests/python/te/test_te_create_primfunc.py
index 6aa5689ad10d..2249b5bd4ab6 100644
--- a/tests/python/te/test_te_create_primfunc.py
+++ b/tests/python/te/test_te_create_primfunc.py
@@ -353,8 +353,8 @@ def test_constant():
 
     func = te.create_prim_func([C, A])
     func = tvm.compile(func)
-    a_np = np.random.uniform(size=(M,)).astype(A.dtype)
-    c = tvm.runtime.tensor(np.zeros(M, dtype=C.dtype))
+    a_np = np.random.uniform(size=(M,)).astype(A.dtype.dtype)
+    c = tvm.runtime.tensor(np.zeros(M, dtype=C.dtype.dtype))
     x = func(c, tvm.runtime.tensor(a_np))
     tvm.testing.assert_allclose(a_np + 2, c.numpy())
 
@@ -393,9 +393,9 @@ def test_data_dependent_access():
     func = te.create_prim_func([C, A, B])
     func = tvm.compile(func)
 
-    a_np = np.random.uniform(size=(10,)).astype(A.dtype)
-    b_np = np.arange(10, dtype=B.dtype)
-    c = tvm.runtime.tensor(np.zeros(10, dtype=C.dtype))
+    a_np = np.random.uniform(size=(10,)).astype(A.dtype.dtype)
+    b_np = np.arange(10, dtype=B.dtype.dtype)
+    c = tvm.runtime.tensor(np.zeros(10, dtype=C.dtype.dtype))
     func(c, tvm.runtime.tensor(a_np), tvm.runtime.tensor(b_np))
     tvm.testing.assert_allclose(a_np[b_np], c.numpy())
 
diff --git a/tests/python/tirx-base/test_tir_buffer.py b/tests/python/tirx-base/test_tir_buffer.py
index bcdd0830a7f3..1fb83a8bb9cb 100644
--- a/tests/python/tirx-base/test_tir_buffer.py
+++ b/tests/python/tirx-base/test_tir_buffer.py
@@ -33,7 +33,7 @@ def test_buffer():
     Bb = tvm.tirx.decl_buffer((n, l), "float32")
 
     assert isinstance(Ab, tvm.tirx.Buffer)
-    assert Ab.dtype == "float32"
+    assert Ab.dtype == tvm.ir.PrimType("float32")
     assert tuple(Ab.shape) == (m, n)
 
 
@@ -43,7 +43,7 @@ def test_buffer_access_ptr():
     Ab = tvm.tirx.decl_buffer((m, n), "float32", strides=[n + 1, 1])
     aptr = Ab.access_ptr("rw")
     tvm.ir.assert_structural_equal(aptr.args[3], Ab.strides[0] * m)
-    assert aptr.args[0].dtype == Ab.dtype
+    assert aptr.args[0].dtype == Ab.dtype.dtype
     assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
     aptr = Ab.access_ptr("w")
     assert aptr.args[4].value == Buffer.WRITE
diff --git a/tests/python/tirx-base/test_tir_intrin.py b/tests/python/tirx-base/test_tir_intrin.py
index db4a42f2584e..43cf7fa2ebb6 100644
--- a/tests/python/tirx-base/test_tir_intrin.py
+++ b/tests/python/tirx-base/test_tir_intrin.py
@@ -48,8 +48,8 @@ def test_nearbyint():
 
     dev = tvm.cpu(0)
     n = 10
-    a = tvm.runtime.tensor(np.random.uniform(high=100, size=n).astype(A.dtype), dev)
-    a_rounded = tvm.runtime.tensor(np.random.uniform(size=n).astype(A_rounded.dtype), dev)
+    a = tvm.runtime.tensor(np.random.uniform(high=100, size=n).astype(A.dtype.dtype), dev)
+    a_rounded = tvm.runtime.tensor(np.random.uniform(size=n).astype(A_rounded.dtype.dtype), dev)
     func(a, a_rounded)
     # Note that numpys rint rounds to nearest integer with
     # ties to halfway is broken by rounding to even.
@@ -125,8 +125,8 @@ def run_test(tvm_intrin, np_func, atol=1e-5, rtol=1e-5):
 
         dev = tvm.cpu(0)
         n = 10
-        a = tvm.runtime.tensor(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype), dev)
-        b = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev)
+        a = tvm.runtime.tensor(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype.dtype), dev)
+        b = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype.dtype), dev)
         func(a, b)
         tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=atol, rtol=rtol)
 
@@ -140,7 +140,7 @@ def run_test(tvm_intrin, np_func, atol=1e-5, rtol=1e-5):
                     np.random.uniform(1.1, 2.0, size=n // 2),
                     np.random.uniform(-2.0, -1.1, size=n // 2),
                 ]
-            ).astype(A.dtype)
+            ).astype(A.dtype.dtype)
             a2 = tvm.runtime.tensor(out_np, dev)
             b2 = tvm.runtime.tensor(np.empty_like(out_np), dev)
             func(a2, b2)
@@ -148,7 +148,7 @@ def run_test(tvm_intrin, np_func, atol=1e-5, rtol=1e-5):
             assert np.all(np.isnan(b2.numpy()))
         if name == "exp":
             n = 8
-            out_np = np.random.randint(-20, 20, size=n).astype(A.dtype)
+            out_np = np.random.randint(-20, 20, size=n).astype(A.dtype.dtype)
             a2 = tvm.runtime.tensor(out_np, dev)
             b2 = tvm.runtime.tensor(np.empty_like(out_np), dev)
             func(a2, b2)
@@ -239,9 +239,9 @@ def run_test(tvm_intrin, np_func):
 
         dev = tvm.cpu(0)
         n = 10
-        a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
-        b = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(B.dtype), dev)
-        c = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev)
+        a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype.dtype), dev)
+        b = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(B.dtype.dtype), dev)
+        c = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype.dtype), dev)
         func(a, b, c)
         tvm.testing.assert_allclose(c.numpy(), np_func(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5)
 
@@ -266,9 +266,9 @@ def test_ldexp():
 
     dev = tvm.cpu(0)
     n = 10
-    a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
-    b = tvm.runtime.tensor(np.random.randint(0, 5, size=n).astype(B.dtype), dev)
-    c = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev)
+    a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype.dtype), dev)
+    b = tvm.runtime.tensor(np.random.randint(0, 5, size=n).astype(B.dtype.dtype), dev)
+    c = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype.dtype), dev)
     func(a, b, c)
     tvm.testing.assert_allclose(c.numpy(), np.ldexp(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5)
 
diff --git a/tests/python/tirx-base/test_tir_specialize.py b/tests/python/tirx-base/test_tir_specialize.py
index 0529bd90a4db..f4c530584761 100644
--- a/tests/python/tirx-base/test_tir_specialize.py
+++ b/tests/python/tirx-base/test_tir_specialize.py
@@ -302,8 +302,8 @@ def test_specialize_buffer_var_to_expr():
     """Handle specialization of buffer var
 
     The `tirx::Buffer::data` field must be an explicit `tirx::Var`, and
-    cannot be replaced with a `tirx::PrimExpr` of type
-    `DataType::Handle()`.  However, these substitutions are useful
+    cannot be replaced with a handle-typed `tirx::PrimExpr`.  However,
+    these substitutions are useful
     when lowering.  If these occur, a binding of the `tirx::Var` is
     included in the specialized function.
     """
diff --git a/tests/python/tvmscript/test_tvmscript_parser_tir.py b/tests/python/tvmscript/test_tvmscript_parser_tir.py
index 9c1e26459dc8..f8cc6da3b95b 100644
--- a/tests/python/tvmscript/test_tvmscript_parser_tir.py
+++ b/tests/python/tvmscript/test_tvmscript_parser_tir.py
@@ -29,14 +29,14 @@ def test_tir_buffer_proxy():
     assert (
         isinstance(buffer_0, tirx.Buffer)
         and list(buffer_0.shape) == [128, 128]
-        and buffer_0.dtype == "float32"
+        and buffer_0.dtype == ir.PrimType("float32")
     )
 
     buffer_1 = T.Buffer((64, 64, 64), "int32")
     assert (
         isinstance(buffer_1, tirx.Buffer)
         and list(buffer_1.shape) == [64, 64, 64]
-        and buffer_1.dtype == "int32"
+        and buffer_1.dtype == ir.PrimType("int32")
     )
 
 
diff --git a/tests/python/tvmscript/test_tvmscript_roundtrip.py b/tests/python/tvmscript/test_tvmscript_roundtrip.py
index bdcaf668718e..03950d6b3569 100644
--- a/tests/python/tvmscript/test_tvmscript_roundtrip.py
+++ b/tests/python/tvmscript/test_tvmscript_roundtrip.py
@@ -2482,12 +2482,12 @@ def test_void_ptr_vs_handle():
     one of the two C++ representations.
     """
 
-    # Generates PointerType(PrimType(DataType::Void()))
+    # Generates PointerType(PrimType::Void())
     @T.prim_func(s_tir=True)
     def void_ptr(out_ret_value: T.handle("void")):
         T.evaluate(out_ret_value)
 
-    # Generates PrimType(DataType::Handle())
+    # Generates PrimType::Handle()
     @T.prim_func(s_tir=True)
     def handle(out_ret_value: T.handle):
         T.evaluate(out_ret_value)