From 286d16bec0e7c8693d1776b23acf648671a12db4 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 23 Jun 2026 15:43:02 +0000
Subject: [PATCH 1/8] [REFACTOR][IR] Use PrimType for compiler dtypes

Use PrimType as the compiler-facing dtype/type carrier so primitive expression dtype information is unified with Expr.ty instead of flowing through a dedicated dtype path. This keeps compiler IR type information in the type system and leaves room for future expression type annotations.

Use raw DLDataType at runtime, ABI, storage-helper, and dtype-valued attr boundaries where a plain DLPack dtype value is the real interface. Keep the PrimType API minimal and hot-path friendly with value equality, matching helpers, documented factories, and cached common constructors.

Update TIRX, TE, TOPI, Relax, codegen, Python bindings, and tests to follow the compiler PrimType versus runtime DLDataType boundary.
---
 include/tvm/ir/base_expr.h                    | 311 +++++++
 include/tvm/ir/expr.h                         | 135 +--
 include/tvm/ir/type.h                         |  65 +-
 include/tvm/relax/attrs/create.h              |   2 +-
 include/tvm/relax/attrs/datatype.h            |   4 +-
 include/tvm/relax/attrs/image.h               |   4 +-
 include/tvm/relax/attrs/linear_algebra.h      |   2 +-
 include/tvm/relax/attrs/nn.h                  |  12 +-
 include/tvm/relax/attrs/qdq.h                 |   2 +-
 include/tvm/relax/attrs/sampling.h            |   4 +-
 include/tvm/relax/attrs/sorting.h             |   8 +-
 include/tvm/relax/attrs/statistical.h         |   2 +-
 include/tvm/relax/dataflow_pattern.h          |   8 +-
 include/tvm/relax/distributed/global_info.h   |   1 +
 include/tvm/relax/expr.h                      |   4 +-
 include/tvm/relax/transform.h                 |   5 +-
 include/tvm/relax/type.h                      |  14 +-
 include/tvm/runtime/data_type.h               | 522 ------------
 include/tvm/runtime/disco/builtin.h           |   4 +-
 include/tvm/runtime/tensor.h                  |   4 +-
 include/tvm/runtime/vm/bytecode.h             |   2 +-
 include/tvm/runtime/vm/tensor_cache_support.h |   2 +-
 include/tvm/s_tir/data_layout.h               |   4 +-
 include/tvm/s_tir/meta_schedule/arg_info.h    |   6 +-
 include/tvm/script/printer/config.h           |   9 +-
 include/tvm/script/printer/doc.h              |  10 +-
 include/tvm/script/printer/ir_docsifier.h     |   2 +-
 include/tvm/te/operation.h                    |  32 +-
 include/tvm/te/tensor.h                       |  10 +-
 include/tvm/tirx/buffer.h                     |  73 +-
 include/tvm/tirx/expr.h                       |   8 +-
 include/tvm/tirx/op.h                         | 166 ++--
 include/tvm/tirx/script/builder/ir.h          | 147 ++--
 include/tvm/tirx/stmt.h                       |   2 +-
 include/tvm/tirx/var.h                        |  10 +-
 include/tvm/topi/broadcast.h                  |  12 +-
 include/tvm/topi/contrib/cublas.h             |   4 +-
 include/tvm/topi/detail/broadcast.h           |  18 +-
 include/tvm/topi/detail/extern.h              |  13 +-
 include/tvm/topi/detail/strided_slice.h       |   6 +-
 include/tvm/topi/detail/tensor_utils.h        |   8 +-
 include/tvm/topi/elemwise.h                   | 111 ++-
 include/tvm/topi/nn.h                         |  40 +-
 include/tvm/topi/nn/bnn.h                     |   8 +-
 include/tvm/topi/nn/dense.h                   |   2 +-
 include/tvm/topi/nn/dilate.h                  |   2 +-
 include/tvm/topi/nn/group_norm.h              |  14 +-
 include/tvm/topi/nn/instance_norm.h           |  22 +-
 include/tvm/topi/nn/layer_norm.h              |  21 +-
 include/tvm/topi/nn/local_response_norm.h     |   9 +-
 include/tvm/topi/nn/pooling.h                 |  31 +-
 include/tvm/topi/nn/rms_norm.h                |  10 +-
 include/tvm/topi/reduction.h                  |  15 +-
 include/tvm/topi/transform.h                  | 112 ++-
 python/tvm/ir/expr.py                         |   5 +-
 python/tvm/ir/type.py                         |  29 +
 python/tvm/relax/frontend/nn/extern.py        |   2 +-
 .../torch/base_fx_graph_translator.py         |   5 +-
 python/tvm/relax/op/create.py                 |   8 +-
 python/tvm/relax/op/manipulate.py             |  11 +-
 .../relax/transform/legalize_ops/common.py    |  13 +-
 .../transform/legalize_ops/manipulate.py      |   4 +-
 .../tvm/relax/transform/legalize_ops/qdq.py   |   7 +-
 python/tvm/relax/type.py                      |   8 +-
 python/tvm/runtime/object_generic.py          |   6 +-
 python/tvm/s_tir/schedule/schedule.py         |  14 +-
 python/tvm/script/parser/core/evaluator.py    |   2 +-
 python/tvm/te/tensor.py                       |  14 +
 python/tvm/tirx/buffer.py                     |   2 +-
 python/tvm/tirx/expr.py                       |  31 +-
 python/tvm/tirx/script/parser/operation.py    |  69 +-
 python/tvm/topi/math.py                       |  54 +-
 python/tvm/topi/scatter.py                    |   4 +-
 python/tvm/topi/sort.py                       |   2 +-
 src/arith/analyzer.cc                         |  11 +-
 src/arith/bound_deducer.cc                    |   3 +-
 src/arith/canonical_simplify.cc               | 106 +--
 src/arith/const_fold.h                        | 134 +--
 src/arith/const_int_bound.cc                  |  41 +-
 src/arith/detect_linear_equation.cc           |  24 +-
 src/arith/int_constraints.cc                  |  10 +-
 src/arith/int_set.cc                          |  46 +-
 src/arith/ir_mutator_with_analyzer.cc         |  10 +-
 src/arith/ir_visitor_with_analyzer.cc         |   2 +-
 src/arith/iter_affine_map.cc                  |  72 +-
 src/arith/pattern_match.h                     |  38 +-
 src/arith/product_normal_form.h               |   5 +-
 src/arith/rewrite_simplify.cc                 | 114 +--
 src/arith/solve_linear_equation.cc            |  28 +-
 src/arith/solve_linear_inequality.cc          |  25 +-
 src/arith/transitive_comparison_analyzer.cc   |   3 +-
 src/arith/unwrap_vector_expr.cc               |   6 +-
 src/arith/z3_prover.cc                        |  44 +-
 src/backend/cuda/codegen/codegen_cuda.cc      | 623 +++++++-------
 src/backend/cuda/codegen/codegen_cuda.h       |  19 +-
 src/backend/cuda/codegen/intrin_rule_cuda.cc  |  26 +-
 .../cuda/codegen/llvm/codegen_nvptx.cc        |  12 +-
 .../cuda/codegen/llvm/intrin_rule_nvptx.cc    |   9 +-
 src/backend/cuda/runtime/cuda_device_api.cc   |  16 +-
 .../hexagon/codegen/llvm/codegen_hexagon.cc   |  42 +-
 .../codegen/llvm/intrin_rule_hexagon.cc       |  40 +-
 .../hexagon/runtime/ops/conv2d_fp16_hvx.cc    |   4 +-
 src/backend/metal/codegen/codegen_metal.cc    |  73 +-
 src/backend/metal/codegen/codegen_metal.h     |   7 +-
 .../metal/codegen/intrin_rule_metal.cc        |   6 +-
 src/backend/opencl/codegen/codegen_opencl.cc  | 125 +--
 src/backend/opencl/codegen/codegen_opencl.h   |  20 +-
 .../opencl/codegen/intrin_rule_opencl.cc      |   4 +-
 src/backend/opencl/runtime/opencl_common.h    |  21 +-
 .../opencl/runtime/opencl_device_api.cc       |   6 +-
 src/backend/opencl/runtime/texture.h          |  10 +-
 .../rocm/codegen/llvm/codegen_amdgpu.cc       |   9 +-
 .../rocm/codegen/llvm/intrin_rule_rocm.cc     |  22 +-
 src/backend/trn/codegen/codegen_trn.cc        |  25 +-
 src/backend/trn/codegen/codegen_trn.h         |   4 +-
 .../trn/transform/lower_trainium_layout.cc    |  20 +-
 src/backend/vulkan/codegen/codegen_spirv.cc   | 112 +--
 src/backend/vulkan/codegen/codegen_spirv.h    |  14 +-
 .../vulkan/codegen/intrin_rule_spirv.cc       |  13 +-
 src/backend/vulkan/codegen/ir_builder.cc      | 221 ++---
 src/backend/vulkan/codegen/ir_builder.h       |  10 +-
 src/backend/webgpu/codegen/codegen_webgpu.cc  | 154 ++--
 src/backend/webgpu/codegen/codegen_webgpu.h   |  15 +-
 .../webgpu/codegen/intrin_rule_webgpu.cc      |  10 +-
 src/ir/expr.cc                                | 150 ++--
 src/ir/type.cc                                | 105 ++-
 src/relax/analysis/tir_op_pattern_kind.cc     |   7 +-
 src/relax/analysis/type_analysis.cc           |  18 +-
 src/relax/analysis/well_formed.cc             |   4 +-
 .../backend/contrib/codegen_c/codegen_c.h     |  15 +-
 src/relax/backend/contrib/cublas/codegen.cc   |   4 +-
 src/relax/backend/contrib/utils.h             |   4 +-
 src/relax/backend/vm/codegen_vm_tir.cc        |  32 +-
 src/relax/backend/vm/lower_runtime_builtin.cc |   4 +-
 src/relax/backend/vm/vm_shape_lower.cc        |  19 +-
 src/relax/ir/dataflow_expr_rewriter.cc        |   2 +-
 src/relax/ir/dataflow_matcher.cc              |   3 +-
 src/relax/ir/dataflow_pattern.cc              |  10 +-
 src/relax/ir/dependent_type.cc                |  15 +-
 src/relax/ir/emit_te.cc                       |   2 +-
 src/relax/ir/expr.cc                          |  14 +-
 src/relax/op/ccl/ccl.cc                       |   4 +-
 src/relax/op/distributed/binary.cc            |   2 +-
 src/relax/op/distributed/binary.h             |   4 +-
 src/relax/op/distributed/distributed.cc       |   2 +-
 src/relax/op/distributed/linear_algebra.cc    |   6 +-
 src/relax/op/distributed/nn.cc                |   4 +-
 src/relax/op/distributed/unary.cc             |   2 +-
 src/relax/op/distributed/unary.h              |  11 +-
 src/relax/op/image/resize.cc                  |  20 +-
 src/relax/op/image/resize.h                   |   4 +-
 src/relax/op/memory/view.cc                   |  28 +-
 src/relax/op/nn/attention.cc                  |   2 +-
 src/relax/op/nn/convolution.cc                |  82 +-
 src/relax/op/nn/convolution.h                 |  16 +-
 src/relax/op/nn/nn.cc                         |  69 +-
 src/relax/op/nn/pooling.cc                    |   6 +-
 src/relax/op/op.cc                            |  20 +-
 src/relax/op/op_common.h                      |  40 +-
 src/relax/op/tensor/binary.cc                 |   8 +-
 src/relax/op/tensor/create.cc                 |  73 +-
 src/relax/op/tensor/create.h                  |  20 +-
 src/relax/op/tensor/datatype.cc               |   8 +-
 src/relax/op/tensor/datatype.h                |   4 +-
 src/relax/op/tensor/index.cc                  |  27 +-
 src/relax/op/tensor/inspect.cc                | 122 +--
 src/relax/op/tensor/inspect.h                 |  16 +-
 src/relax/op/tensor/linear_algebra.cc         |  26 +-
 src/relax/op/tensor/linear_algebra.h          |   2 +-
 src/relax/op/tensor/manipulate.cc             | 125 +--
 src/relax/op/tensor/qdq.cc                    |  63 +-
 src/relax/op/tensor/qdq.h                     |   4 +-
 src/relax/op/tensor/sampling.cc               |  20 +-
 src/relax/op/tensor/sampling.h                |   3 +-
 src/relax/op/tensor/search.cc                 |  15 +-
 src/relax/op/tensor/set.cc                    |  14 +-
 src/relax/op/tensor/sorting.cc                |  10 +-
 src/relax/op/tensor/sorting.h                 |   4 +-
 src/relax/op/tensor/statistical.cc            |  16 +-
 src/relax/op/tensor/statistical.h             |   4 +-
 src/relax/op/tensor/ternary.cc                |   4 +-
 src/relax/op/tensor/unary.cc                  |   2 +-
 src/relax/op/vision/nms.cc                    |  38 +-
 src/relax/script/printer/dependent_type.cc    |   2 +-
 src/relax/script/printer/distributed.cc       |   4 +-
 src/relax/script/printer/expr.cc              |  18 +-
 src/relax/script/printer/tir.cc               |   7 +-
 src/relax/transform/adjust_matmul_order.cc    |  20 +-
 src/relax/transform/allocate_workspace.cc     |   4 +-
 src/relax/transform/alter_op_impl.cc          |  10 +-
 src/relax/transform/call_tir_rewrite.cc       |  16 +-
 .../transform/combine_parallel_matmul.cc      |   2 +-
 src/relax/transform/compute_prim_value.cc     |   7 +-
 src/relax/transform/convert_layout.cc         |   2 +-
 src/relax/transform/dataflow_inplace.cc       |   2 +-
 src/relax/transform/decompose_ops.cc          |  10 +-
 src/relax/transform/expand_matmul_of_sum.cc   |   3 +-
 src/relax/transform/fold_constant.cc          |  10 +-
 src/relax/transform/fuse_tir.cc               |  38 +-
 src/relax/transform/gradient.cc               |   7 +-
 src/relax/transform/infer_amp_utils.cc        |  22 +-
 src/relax/transform/infer_amp_utils.h         |  11 +-
 src/relax/transform/lazy_transform_params.cc  |   5 +-
 src/relax/transform/legalize_ops.cc           |   2 +-
 src/relax/transform/lower_alloc_tensor.cc     |   5 +-
 src/relax/transform/remove_unused_outputs.cc  |   2 +-
 .../transform/remove_unused_parameters.cc     |   2 +-
 .../transform/reorder_take_after_matmul.cc    |   4 +-
 .../transform/split_call_tir_by_pattern.cc    |  14 +-
 .../transform/split_layout_rewrite_preproc.cc |   4 +-
 .../transform/static_plan_block_memory.cc     |  50 +-
 src/relax/transform/to_mixed_precision.cc     |  53 +-
 src/relax/transform/utils.h                   |  32 +-
 src/relax/utils.cc                            |   8 +-
 src/runtime/extra/contrib/cblas/cblas.cc      |   2 +-
 src/runtime/extra/contrib/cblas/dnnl_blas.cc  |   2 +-
 src/runtime/extra/contrib/cblas/gemm_common.h |  17 +-
 src/runtime/extra/contrib/cblas/mkl.cc        |   2 +-
 .../extra/contrib/coreml/coreml_runtime.mm    |  16 +-
 src/runtime/extra/contrib/cublas/cublas.cc    |   6 +-
 .../extra/contrib/cudnn/conv_backward.cc      |   2 +-
 .../extra/contrib/cudnn/conv_forward.cc       |   2 +-
 .../extra/contrib/cudnn/cudnn_utils.cc        |   2 +-
 .../extra/contrib/cutlass/fp16_group_gemm.cuh |  12 +-
 .../cutlass/fp8_groupwise_scaled_gemm.cuh     |  50 +-
 .../fp8_groupwise_scaled_group_gemm_sm100.cu  |  17 +-
 src/runtime/extra/contrib/dnnl/dnnl_utils.cc  |   8 +-
 src/runtime/extra/contrib/dnnl/dnnl_utils.h   |   2 +-
 src/runtime/extra/contrib/hipblas/hipblas.cc  |   2 +-
 src/runtime/extra/contrib/json/json_node.h    |   2 +-
 .../extra/contrib/nvshmem/memory_allocator.cc |   4 +-
 src/runtime/extra/contrib/random/random.cc    |   2 +-
 src/runtime/extra/contrib/sort/sort.cc        |   4 +-
 src/runtime/extra/contrib/vllm/cache_alloc.cc |   4 +-
 .../extra/contrib/vllm/cache_kernels.cu       |   6 +-
 src/runtime/extra/disco/builtin.cc            |   4 +-
 .../extra/disco/cuda_ipc/cuda_ipc_memory.cc   |  10 +-
 .../extra/disco/cuda_ipc/custom_allreduce.cc  |   2 +-
 src/runtime/extra/disco/loader.cc             |   9 +-
 src/runtime/extra/disco/nccl/nccl.cc          |  20 +-
 src/runtime/extra/disco/nccl/nccl_context.h   |  26 +-
 src/runtime/tensor.cc                         |   8 +-
 src/runtime/vm/attn_backend.h                 |   4 +-
 src/runtime/vm/attn_utils.h                   |   8 +-
 src/runtime/vm/builtin.cc                     |  31 +-
 src/runtime/vm/executable.cc                  |   3 +-
 src/runtime/vm/lm_support.cc                  |  23 +-
 src/runtime/vm/paged_kv_cache.cc              |  22 +-
 src/runtime/vm/rnn_state.cc                   |   2 +-
 src/runtime/vm/tensor_cache_support.cc        |   4 +-
 .../analysis/calculate_allocated_memory.cc    |   2 +-
 src/s_tir/analysis/estimate_flops.cc          |  10 +-
 .../analysis/sblock_access_region_detector.cc |   2 +-
 src/s_tir/analysis/verify_gpu_code.cc         |  77 +-
 .../backend/adreno/inject_texture_alloc.cc    |   4 +-
 src/s_tir/backend/adreno/texture_flatten.cc   |   4 +-
 src/s_tir/data_layout.cc                      |  42 +-
 src/s_tir/meta_schedule/arg_info.cc           |  13 +-
 .../meta_schedule/database/database_utils.cc  |   5 +-
 .../feature_extractor/per_store_feature.cc    |  24 +-
 .../measure_callback/add_to_database.cc       |   2 +-
 src/s_tir/meta_schedule/mutator/mutator.cc    |  22 +-
 .../postproc/rewrite_cooperative_fetch.cc     |   6 +-
 src/s_tir/meta_schedule/profiler.cc           |   2 +-
 .../schedule/cuda/thread_bind.cc              |   2 +-
 .../schedule_rule/cross_thread_reduction.cc   |   2 +-
 .../schedule_rule/multi_level_tiling.cc       |  10 +-
 .../multi_level_tiling_tensor_core.cc         |   8 +-
 .../parallel_vectorize_unroll.cc              |   2 +-
 .../schedule_rule/schedule_rule.cc            |   2 +-
 src/s_tir/meta_schedule/utils.h               |   2 +-
 src/s_tir/schedule/analysis/layout.cc         |   6 +-
 src/s_tir/schedule/analysis/reducer.cc        |   2 +-
 src/s_tir/schedule/concrete_schedule.cc       |   8 +-
 src/s_tir/schedule/concrete_schedule.h        |   2 +-
 src/s_tir/schedule/ir_comparator.cc           |  18 +-
 .../schedule/primitive/block_annotate.cc      |  18 +-
 .../schedule/primitive/blockize_tensorize.cc  |  12 +-
 src/s_tir/schedule/primitive/cache_index.cc   |  24 +-
 .../schedule/primitive/cache_read_write.cc    |  30 +-
 src/s_tir/schedule/primitive/compute_at.cc    |   8 +-
 .../schedule/primitive/compute_inline.cc      |   4 +-
 .../schedule/primitive/decompose_padding.cc   |   4 +-
 src/s_tir/schedule/primitive/for_kind.cc      |   6 +-
 .../primitive/layout_transformation.cc        |  42 +-
 .../schedule/primitive/loop_transformation.cc |  34 +-
 src/s_tir/schedule/primitive/pad_einsum.cc    |  16 +-
 src/s_tir/schedule/primitive/reduction.cc     |  38 +-
 src/s_tir/schedule/transform.cc               |   4 +-
 src/s_tir/schedule/transform.h                |   2 +-
 src/s_tir/schedule/utils.h                    |   4 +-
 src/s_tir/transform/bound_checker.cc          |  16 +-
 src/s_tir/transform/canonicalize_loop.cc      |   4 +-
 src/s_tir/transform/compact_buffer_region.cc  |  23 +-
 src/s_tir/transform/default_gpu_schedule.cc   |   8 +-
 src/s_tir/transform/inject_double_buffer.cc   |  18 +-
 src/s_tir/transform/inject_permuted_layout.cc |   2 +-
 src/s_tir/transform/inject_ptx_async_copy.cc  |  10 +-
 src/s_tir/transform/inject_ptx_ldg32.cc       |   5 +-
 .../transform/inject_software_pipeline.cc     |   8 +-
 src/s_tir/transform/inject_virtual_thread.cc  |  15 +-
 src/s_tir/transform/lift_thread_binding.cc    |   2 +-
 src/s_tir/transform/loop_partition.cc         |   8 +-
 src/s_tir/transform/lower_async_dma.cc        |  20 +-
 .../transform/lower_cross_thread_reduction.cc |  12 +-
 src/s_tir/transform/lower_match_buffer.cc     |  21 +-
 src/s_tir/transform/lower_opaque_block.cc     |   4 +-
 src/s_tir/transform/lower_thread_allreduce.cc |  78 +-
 src/s_tir/transform/lower_vtcm_alloc.cc       |   4 +-
 .../transform/memhammer_tensorcore_rewrite.cc |  53 +-
 .../merge_shared_memory_allocations.cc        |  41 +-
 .../transform/profile_instrumentation.cc      |   8 +-
 src/s_tir/transform/renew_defs.cc             |   4 +-
 .../transform/renormalize_split_pattern.cc    |  26 +-
 src/s_tir/transform/rewrite_unsafe_select.cc  |   5 +-
 src/s_tir/transform/storage_access.cc         |   8 +-
 src/s_tir/transform/storage_access.h          |   2 +-
 src/s_tir/transform/thread_storage_sync.cc    |   4 +-
 src/s_tir/transform/unify_thread_binding.cc   |  18 +-
 .../printer/doc_printer/python_doc_printer.cc |   3 +-
 src/script/printer/ir/distributed.cc          |   1 +
 src/script/printer/script_printer.cc          |   6 +-
 src/script/printer/utils.h                    |   6 +-
 src/target/build_common.h                     |   2 +-
 src/target/intrin_rule.cc                     |  63 +-
 src/target/intrin_rule.h                      |  16 +-
 src/target/llvm/codegen_arm.cc                |  34 +-
 src/target/llvm/codegen_cpu.cc                |  47 +-
 src/target/llvm/codegen_cpu.h                 |   4 +-
 src/target/llvm/codegen_llvm.cc               | 317 +++----
 src/target/llvm/codegen_llvm.h                |  32 +-
 src/target/llvm/codegen_params.cc             |  46 +-
 src/target/llvm/codegen_x86_64.cc             |  15 +-
 src/target/llvm/intrin_rule_llvm.cc           |  15 +-
 src/target/llvm/intrin_rule_llvm.h            |   8 +-
 src/target/source/codegen_c.cc                | 223 ++---
 src/target/source/codegen_c.h                 |  30 +-
 src/target/source/codegen_c_host.cc           |  28 +-
 src/target/source/codegen_c_host.h            |   4 +-
 src/target/source/codegen_params.cc           |  57 +-
 src/target/source/codegen_source_base.cc      |  31 +-
 src/target/source/codegen_source_base.h       |   9 +-
 src/target/source/source_module.cc            |   2 +-
 src/te/operation/compute_op.cc                |  12 +-
 src/te/operation/create_primfunc.cc           |  24 +-
 src/te/operation/create_primfunc.h            |   4 +-
 src/te/operation/extern_op.cc                 |   4 +-
 src/te/operation/placeholder_op.cc            |  10 +-
 src/te/operation/scan_op.cc                   |   2 +-
 src/te/tensor.cc                              |  17 +-
 src/tirx/analysis/deep_equal.cc               |  50 +-
 src/tirx/ir/buffer.cc                         | 109 ++-
 src/tirx/ir/buffer_common.h                   |   6 +-
 src/tirx/ir/data_type_rewriter.cc             | 152 ++--
 src/tirx/ir/data_type_rewriter.h              |   6 +-
 src/tirx/ir/exec_scope.cc                     |   6 +-
 src/tirx/ir/expr.cc                           | 279 ++++---
 src/tirx/ir/expr_functor.cc                   |   4 +-
 src/tirx/ir/function.cc                       |  18 +-
 src/tirx/ir/index_map.cc                      |  15 +-
 src/tirx/ir/layout/axis_registry.cc           |   2 +-
 src/tirx/ir/layout/tile_slice.cc              |   4 +-
 src/tirx/ir/layout/utils.cc                   |   2 +-
 src/tirx/ir/script/script_complete.cc         |   5 +-
 src/tirx/ir/stmt.cc                           |  98 ++-
 src/tirx/ir/stmt_functor.cc                   |   8 +-
 src/tirx/op/op.cc                             | 777 ++++++++++--------
 src/tirx/script/builder/ir.cc                 | 111 +--
 src/tirx/script/builder/utils.h               |   2 +-
 src/tirx/script/printer/block.cc              |   4 +-
 src/tirx/script/printer/buffer.cc             |  10 +-
 src/tirx/script/printer/expr.cc               |  25 +-
 src/tirx/script/printer/for_loop.cc           |   4 +-
 src/tirx/script/printer/ir.cc                 |   6 +-
 src/tirx/script/printer/stmt.cc               |   6 +-
 src/tirx/transform/common_subexpr_elim.cc     |   5 +-
 src/tirx/transform/dtype_conversion.cc        |  33 +-
 src/tirx/transform/dtype_conversion.h         |  57 +-
 src/tirx/transform/flatten_buffer.cc          |  25 +-
 .../transform/force_narrow_index_to_i32.cc    |   8 +-
 src/tirx/transform/ir_utils.cc                |   9 +-
 src/tirx/transform/ir_utils.h                 |  43 +-
 src/tirx/transform/lower_intrin.cc            |  38 +-
 src/tirx/transform/lower_tirx_cleanup.cc      |  20 +-
 src/tirx/transform/lower_tirx_opaque.cc       |   4 +-
 src/tirx/transform/lower_tvm_builtin.cc       | 110 +--
 src/tirx/transform/lower_warp_memory.cc       |  24 +-
 src/tirx/transform/make_packed_api.cc         |  40 +-
 src/tirx/transform/narrow_datatype.cc         |  97 +--
 src/tirx/transform/split_host_device.cc       |  20 +-
 src/tirx/transform/storage_rewrite.cc         | 158 ++--
 src/tirx/transform/tile_primitive_dispatch.cc |  21 +-
 src/tirx/transform/tvm_ffi_binder.cc          | 136 +--
 src/tirx/transform/tvm_ffi_binder.h           |   6 +-
 src/tirx/transform/unroll_loop.cc             |   2 +-
 .../transform/unsupported_dtype_legalize.cc   | 183 +++--
 src/tirx/transform/vectorize_loop.cc          | 244 +++---
 src/topi/einsum.cc                            |  10 +-
 src/topi/elemwise.cc                          |   6 +-
 src/topi/nn.cc                                |   2 +-
 src/topi/transform.cc                         |   8 +-
 tests/cpp/arith_simplify_test.cc              |   8 +-
 tests/cpp/expr_test.cc                        |  15 +-
 tests/cpp/ir_functor_test.cc                  |  24 +-
 tests/cpp/ndarray_test.cc                     |   8 +-
 tests/cpp/nested_msg_test.cc                  |  27 +-
 tests/cpp/pattern_match_test.cc               |  34 +-
 tests/cpp/te_compute_test.cc                  |  10 +-
 tests/cpp/tir_analysis_side_effect.cc         |   8 +-
 tests/cpp/tir_scalable_datatype.cc            | 132 +--
 tests/cpp/topi_ewise_test.cc                  |   2 +-
 .../codegen/test_target_codegen_llvm.py       |   2 +-
 tests/python/contrib/test_sort.py             |  16 +-
 .../python/relax/frontend_nn_extern_module.cc |  16 +-
 .../python/relax/test_analysis_well_formed.py |   2 +-
 tests/python/tirx-base/test_tir_buffer.py     |   4 +-
 tests/python/tirx-base/test_tir_intrin.py     |  24 +-
 tests/python/tirx-base/test_tir_specialize.py |   4 +-
 .../tvmscript/test_tvmscript_parser_tir.py    |   4 +-
 .../tvmscript/test_tvmscript_roundtrip.py     |   4 +-
 420 files changed, 6481 insertions(+), 5737 deletions(-)
 create mode 100644 include/tvm/ir/base_expr.h
 delete mode 100644 include/tvm/runtime/data_type.h

diff --git a/include/tvm/ir/base_expr.h b/include/tvm/ir/base_expr.h
new file mode 100644
index 000000000000..fbde9ec26aca
--- /dev/null
+++ b/include/tvm/ir/base_expr.h
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/ir/base_expr.h
+ * \brief Base expression and primitive type nodes.
+ */
+#ifndef TVM_IR_BASE_EXPR_H_
+#define TVM_IR_BASE_EXPR_H_
+
+#include <tvm/ffi/cast.h>
+#include <tvm/ffi/dtype.h>
+#include <tvm/ffi/reflection/registry.h>
+#include <tvm/ir/source_map.h>
+
+#include <cstdint>
+
+namespace tvm {
+
+/*!
+ * \brief Type is the base type of all types.
+ *
+ * TVM's type system contains the following subclasses:
+ *
+ * - PrimType: type of primitive type values used in the low-level IR.
+ * - FuncType: type of a function.
+ * - TensorType: type of certain Tensor values in the expression.
+ *
+ * There are also advanced types to support generic (polymorphic) types.
+ * \sa Type
+ */
+class TypeNode : public ffi::Object {
+ public:
+  /*!
+   * \brief Span that points to the original source code.
+   *        Reserved debug information.
+   */
+  mutable Span span;
+
+  static void RegisterReflection() {
+    namespace refl = tvm::ffi::reflection;
+    // span does not participate in structural equal and hash.
+    refl::ObjectDef<TypeNode>().def_ro("span", &TypeNode::span, refl::DefaultValue(Span()),
+                                       refl::AttachFieldFlag::SEqHashIgnore());
+  }
+
+  static constexpr TVMFFISEqHashKind _type_s_eq_hash_kind = kTVMFFISEqHashKindTreeNode;
+
+  static constexpr const uint32_t _type_child_slots = 14;
+  TVM_FFI_DECLARE_OBJECT_INFO("ir.Type", TypeNode, ffi::Object);
+};
+
+/*!
+ * \brief Managed reference to TypeNode.
+ * \sa TypeNode
+ */
+class Type : public ffi::ObjectRef {
+ public:
+  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Type, ffi::ObjectRef, TypeNode);
+};
+
+/*!
+ * \brief Primitive data types used in the low-level IR.
+ *
+ * PrimType represents POD-values and handles that are
+ * not automatically managed by the runtime.
+ *
+ * \sa PrimType
+ */
+class PrimTypeNode final : public TypeNode {
+ public:
+  /*!
+   * \brief The raw DLPack dtype represented by this primitive type.
+   */
+  DLDataType dtype;
+
+  static void RegisterReflection() {
+    namespace refl = tvm::ffi::reflection;
+    refl::ObjectDef<PrimTypeNode>().def_ro("dtype", &PrimTypeNode::dtype);
+  }
+  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("ir.PrimType", PrimTypeNode, TypeNode);
+};
+
+/*!
+ * \brief Managed reference to PrimTypeNode.
+ * \sa PrimTypeNode
+ */
+class PrimType final : public Type {
+ public:
+  /*!
+   * \brief Construct from a raw DLPack dtype.
+   * \param dtype The corresponding DLPack dtype.
+   */
+  TVM_DLL explicit PrimType(DLDataType dtype);
+
+  /*!
+   * \brief Construct from DLPack dtype fields.
+   * \param code The DLPack dtype code.
+   * \param bits The scalar bit width.
+   * \param lanes The fixed lane count.
+   */
+  TVM_DLL PrimType(DLDataTypeCode code, int bits, int lanes = 1);
+
+  /*! \brief Construct a signed integer type with fixed lanes. */
+  TVM_DLL static PrimType Int(int bits, int lanes = 1);
+  /*! \brief Construct an unsigned integer type with fixed lanes. */
+  TVM_DLL static PrimType UInt(int bits, int lanes = 1);
+  /*! \brief Construct a floating-point type with fixed lanes. */
+  TVM_DLL static PrimType Float(int bits, int lanes = 1);
+  /*! \brief Construct a bfloat type with fixed lanes. */
+  TVM_DLL static PrimType BFloat(int bits, int lanes = 1);
+  /*! \brief Construct a boolean type with fixed lanes. */
+  TVM_DLL static PrimType Bool(int lanes = 1);
+  /*! \brief Construct an opaque handle type. */
+  TVM_DLL static PrimType Handle(int bits = 64, int lanes = 1);
+  /*! \brief Construct the void sentinel type, encoded as handle(0, 0). */
+  TVM_DLL static PrimType Void();
+  /*!
+   * \brief Construct a scalable vector type.
+   * \param code The DLPack dtype code.
+   * \param bits The scalar bit width.
+   * \param lanes The positive vscale factor to encode in the DLPack lane field.
+   */
+  TVM_DLL static PrimType ScalableVector(DLDataTypeCode code, int bits, int lanes);
+
+  /*! \return The DLPack dtype code. */
+  TVM_FFI_INLINE DLDataTypeCode code() const {
+    return static_cast<DLDataTypeCode>(static_cast<int>(get()->dtype.code));
+  }
+
+  /*! \return The scalar bit width. */
+  TVM_FFI_INLINE int32_t bits() const { return get()->dtype.bits; }
+
+  /*!
+   * \return The fixed lane count.
+   * \note Throws on scalable vector types, where the encoded lane field stores a vscale factor.
+   */
+  TVM_FFI_INLINE int32_t lanes() const {
+    int16_t encoded_lanes = static_cast<int16_t>(get()->dtype.lanes);
+    if (TVM_FFI_PREDICT_FALSE(encoded_lanes < 0)) {
+      TVM_FFI_THROW(InternalError)
+          << "Can't fetch the lanes of a scalable vector at a compile time.";
+    }
+    return encoded_lanes;
+  }
+
+  /*!
+   * \brief Check the scalar element code and bit width.
+   * \note Lane count and scalable-vector encoding are intentionally ignored.
+   */
+  TVM_FFI_INLINE bool MatchesElementType(DLDataTypeCode code, int bits) const {
+    DLDataType dtype = get()->dtype;
+    return dtype.code == static_cast<uint8_t>(code) && dtype.bits == bits;
+  }
+
+  /*!
+   * \brief Check whether the dtype code matches any of the provided DLPack codes.
+   * \note Bit width and lanes are intentionally ignored.
+   */
+  template <typename... Codes>
+  TVM_FFI_INLINE bool MatchesCode(Codes... codes) const {
+    uint8_t dtype_code = get()->dtype.code;
+    return ((dtype_code == static_cast<uint8_t>(codes)) || ...);
+  }
+
+  /*! \brief Whether this type is a scalar, excluding fixed and scalable vectors. */
+  TVM_FFI_INLINE bool IsScalar() const {
+    int16_t encoded_lanes = static_cast<int16_t>(get()->dtype.lanes);
+    return encoded_lanes == 1;
+  }
+
+  /*! \brief Whether this type is the void sentinel `handle(0, 0)`. */
+  TVM_FFI_INLINE bool IsVoid() const {
+    DLDataType dtype = get()->dtype;
+    return dtype.code == static_cast<uint8_t>(DLDataTypeCode::kDLOpaqueHandle) && dtype.bits == 0 &&
+           static_cast<int16_t>(dtype.lanes) == 0;
+  }
+
+  /*! \brief Whether this type is an opaque handle, excluding the void sentinel. */
+  TVM_FFI_INLINE bool IsHandle() const {
+    return this->code() == DLDataTypeCode::kDLOpaqueHandle && !this->IsVoid();
+  }
+
+  /*! \brief Whether this type is a scalable vector. */
+  TVM_FFI_INLINE bool IsScalableVector() const {
+    return static_cast<int16_t>(get()->dtype.lanes) < -1;
+  }
+
+  /*! \brief Whether this type is a fixed-length vector. */
+  TVM_FFI_INLINE bool IsFixedLengthVector() const {
+    return static_cast<int16_t>(get()->dtype.lanes) > 1;
+  }
+
+  /*! \brief Return the same type with a different dtype code, preserving bits and lanes. */
+  TVM_FFI_INLINE PrimType WithCode(DLDataTypeCode code) const {
+    DLDataType dtype = get()->dtype;
+    int16_t encoded_lanes = static_cast<int16_t>(dtype.lanes);
+    if (encoded_lanes < -1) {
+      return ScalableVector(code, dtype.bits, -encoded_lanes);
+    }
+    return PrimType(code, dtype.bits, encoded_lanes);
+  }
+
+  /*! \brief Return the same type with a different scalar bit width, preserving code and lanes. */
+  TVM_FFI_INLINE PrimType WithBits(int bits) const {
+    DLDataType dtype = get()->dtype;
+    int16_t encoded_lanes = static_cast<int16_t>(dtype.lanes);
+    if (encoded_lanes < -1) {
+      return ScalableVector(this->code(), bits, -encoded_lanes);
+    }
+    return PrimType(this->code(), bits, encoded_lanes);
+  }
+
+  /*! \brief Return the same scalar element type with a fixed lane count. */
+  TVM_FFI_INLINE PrimType WithLanes(int lanes) const {
+    return PrimType(this->code(), this->bits(), lanes);
+  }
+
+  /*! \return The vscale factor encoded in a scalable vector type. */
+  TVM_FFI_INLINE int32_t VScaleFactor() const {
+    int16_t encoded_lanes = static_cast<int16_t>(get()->dtype.lanes);
+    if (encoded_lanes >= -1) {
+      TVM_FFI_THROW(InternalError) << "A fixed length vector doesn't have a vscale factor.";
+    }
+    return -encoded_lanes;
+  }
+
+  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NOTNULLABLE(PrimType, Type, PrimTypeNode);
+};
+
+inline bool operator==(const PrimType& lhs, const PrimType& rhs) {
+  return lhs->dtype == rhs->dtype;
+}
+
+inline bool operator!=(const PrimType& lhs, const PrimType& rhs) { return !(lhs == rhs); }
+
+/*!
+ * \brief Base type of all the expressions.
+ * \sa Expr
+ */
+class BaseExprNode : public ffi::Object {
+ public:
+  /*!
+   * \brief Span that points to the original source code.
+   *        Reserved debug information.
+   */
+  mutable Span span;
+
+  /*!
+   * \brief The deduced or annotated type of the expression.
+   *
+   * This field is intentionally nullable because type information may
+   * be populated by later analysis passes instead of expression
+   * constructors.
+   */
+  mutable Type ty;
+
+  static void RegisterReflection() {
+    namespace refl = tvm::ffi::reflection;
+    // span and ty do not participate in structural equal and hash.
+    refl::ObjectDef<BaseExprNode>()
+        .def_ro("span", &BaseExprNode::span, refl::DefaultValue(Span()),
+                refl::AttachFieldFlag::SEqHashIgnore())
+        .def_ro("ty", &BaseExprNode::ty, refl::DefaultValue(Type()),
+                refl::AttachFieldFlag::SEqHashIgnore());
+  }
+
+  static constexpr TVMFFISEqHashKind _type_s_eq_hash_kind = kTVMFFISEqHashKindTreeNode;
+
+  static constexpr const uint32_t _type_child_slots = 64;
+  TVM_FFI_DECLARE_OBJECT_INFO("ir.BaseExpr", BaseExprNode, ffi::Object);
+};
+
+/*!
+ * \brief Managed reference to BaseExprNode.
+ * \sa BaseExprNode
+ */
+class BaseExpr : public ffi::ObjectRef {
+ public:
+  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(BaseExpr, ffi::ObjectRef, BaseExprNode);
+};
+
+namespace ffi {
+template <>
+inline constexpr bool use_default_type_traits_v<PrimType> = false;
+
+template <>
+struct TypeTraits<PrimType> : public ObjectRefWithFallbackTraitsBase<PrimType, DLDataType> {
+  TVM_FFI_INLINE static PrimType ConvertFallbackValue(DLDataType dtype) { return PrimType(dtype); }
+};
+}  // namespace ffi
+
+}  // namespace tvm
+
+#endif  // TVM_IR_BASE_EXPR_H_
diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h
index b81e4c2feda7..70e1ffeb480c 100644
--- a/include/tvm/ir/expr.h
+++ b/include/tvm/ir/expr.h
@@ -24,12 +24,13 @@
 #ifndef TVM_IR_EXPR_H_
 #define TVM_IR_EXPR_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/dataclass.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ffi/string.h>
+#include <tvm/ir/base_expr.h>
 #include <tvm/ir/cow.h>
 #include <tvm/ir/source_map.h>
-#include <tvm/runtime/data_type.h>
 
 #include <algorithm>
 #include <functional>
@@ -54,82 +55,6 @@ class VirtualDevice;
  * There are also advanced types to support generic(polymorphic types).
  * \sa Type
  */
-class TypeNode : public ffi::Object {
- public:
-  /*!
-   * \brief Span that points to the original source code.
-   *        Reserved debug information.
-   */
-  mutable Span span;
-
-  static void RegisterReflection() {
-    namespace refl = tvm::ffi::reflection;
-    // span do not participate in structural equal and hash.
-    refl::ObjectDef<TypeNode>().def_ro("span", &TypeNode::span, refl::DefaultValue(Span()),
-                                       refl::AttachFieldFlag::SEqHashIgnore());
-  }
-
-  static constexpr TVMFFISEqHashKind _type_s_eq_hash_kind = kTVMFFISEqHashKindTreeNode;
-
-  static constexpr const uint32_t _type_child_slots = 14;
-  TVM_FFI_DECLARE_OBJECT_INFO("ir.Type", TypeNode, ffi::Object);
-};
-
-/*!
- * \brief Managed reference to TypeNode.
- * \sa TypeNode
- */
-class Type : public ffi::ObjectRef {
- public:
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Type, ffi::ObjectRef, TypeNode);
-};
-
-/*!
- * \brief Base type of all the expressions.
- * \sa Expr
- */
-class BaseExprNode : public ffi::Object {
- public:
-  /*!
-   * \brief Span that points to the original source code.
-   *        Reserved debug information.
-   */
-  mutable Span span;
-
-  /*!
-   * \brief The deduced or annotated type of the expression.
-   *
-   * This field is intentionally nullable because type information may
-   * be populated by later analysis passes instead of expression
-   * constructors.
-   */
-  mutable Type ty;
-
-  static void RegisterReflection() {
-    namespace refl = tvm::ffi::reflection;
-    // span and ty do not participate in structural equal and hash.
-    refl::ObjectDef<BaseExprNode>()
-        .def_ro("span", &BaseExprNode::span, refl::DefaultValue(Span()),
-                refl::AttachFieldFlag::SEqHashIgnore())
-        .def_ro("ty", &BaseExprNode::ty, refl::DefaultValue(Type()),
-                refl::AttachFieldFlag::SEqHashIgnore());
-  }
-
-  static constexpr TVMFFISEqHashKind _type_s_eq_hash_kind = kTVMFFISEqHashKindTreeNode;
-
-  static constexpr const uint32_t _type_child_slots = 64;
-  TVM_FFI_DECLARE_OBJECT_INFO("ir.BaseExpr", BaseExprNode, ffi::Object);
-};
-
-/*!
- * \brief Managed reference to BaseExprNode.
- * \sa BaseExprNode
- */
-class BaseExpr : public ffi::ObjectRef {
- public:
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(BaseExpr, ffi::ObjectRef, BaseExprNode);
-};
-
 /*!
  * \brief Base node of all primitive expressions.
  *
@@ -144,25 +69,16 @@ class BaseExpr : public ffi::ObjectRef {
  */
 class PrimExprNode : public BaseExprNode {
  public:
-  /*!
-   * \brief The runtime data type of the primitive expression.
-   *
-   * runtime::DataType(dtype) provides coarse grained type information
-   * during compile time and runtime. It is eagerly built in
-   * PrimExpr expression construction and can be used for
-   * quick type checking.
-   *
-   * dtype is sufficient to decide the Type of the PrimExpr
-   * when it corresponds to POD value types such as i32.
-   *
-   * When dtype is DataType::Handle(), the expression could corresponds to
-   * a more fine-grained Type, and we can get the type by running lazy type inference.
-   */
-  DataType dtype;
+  /*! \return BaseExprNode::ty as a PrimType; DCHECKs it is defined and a PrimTypeNode. */
+  PrimType ty() const {
+    TVM_FFI_DCHECK(this->BaseExprNode::ty.defined());  // qualified: ty() hides the base field
+    TVM_FFI_DCHECK(this->BaseExprNode::ty->IsInstance<PrimTypeNode>());
+    return ffi::GetRef<PrimType>(static_cast<const PrimTypeNode*>(this->BaseExprNode::ty.get()));
+  }
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
-    refl::ObjectDef<PrimExprNode>().def_ro("dtype", &PrimExprNode::dtype);
+    refl::ObjectDef<PrimExprNode>();
   }
 
   static constexpr const uint32_t _type_child_slots = 40;
@@ -186,8 +102,13 @@ class PrimExpr : public BaseExpr {
    */
   TVM_DLL PrimExpr(float value);  // NOLINT(*)
 
-  /*! \return the data type of this expression. */
-  DataType dtype() const { return static_cast<const PrimExprNode*>(get())->dtype; }
+  /*! \return the node's BaseExprNode::ty as a PrimType; same DCHECKs as PrimExprNode::ty(). */
+  PrimType ty() const {
+    const auto* node = static_cast<const PrimExprNode*>(get());  // get() is not null-checked
+    TVM_FFI_DCHECK(node->BaseExprNode::ty.defined());
+    TVM_FFI_DCHECK(node->BaseExprNode::ty->IsInstance<PrimTypeNode>());
+    return ffi::GetRef<PrimType>(static_cast<const PrimTypeNode*>(node->BaseExprNode::ty.get()));
+  }
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(PrimExpr, BaseExpr, PrimExprNode);
 
@@ -554,11 +475,11 @@ class IntImm : public PrimExpr {
  public:
   /*!
    * \brief Constructor.
-   * \param dtype The data type of the value.
+   * \param value_ty The primitive type of the value.
    * \param value The internal value.
    * \param span The location of this object in the source code.
    */
-  TVM_DLL IntImm(DataType dtype, int64_t value, Span span = Span());
+  TVM_DLL IntImm(PrimType value_ty, int64_t value, Span span = Span());
 
   /*!
    * \brief Construct a scalar boolean constant.
@@ -566,7 +487,7 @@ class IntImm : public PrimExpr {
    * \param span The location of this object in the source code.
    */
   static IntImm Bool(bool value, Span span = Span()) {
-    return IntImm(DataType::Bool(), value, span);
+    return IntImm(PrimType::Bool(), value, span);
   }
 
   /*!
@@ -575,7 +496,7 @@ class IntImm : public PrimExpr {
    * \param span The location of this object in the source code.
    */
   static IntImm Int32(int64_t value, Span span = Span()) {
-    return IntImm(DataType::Int(32), value, span);
+    return IntImm(PrimType::Int(32), value, span);
   }
 
   /*!
@@ -584,7 +505,7 @@ class IntImm : public PrimExpr {
    * \param span The location of this object in the source code.
    */
   static IntImm Int64(int64_t value, Span span = Span()) {
-    return IntImm(DataType::Int(64), value, span);
+    return IntImm(PrimType::Int(64), value, span);
   }
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(IntImm, PrimExpr, IntImmNode);
@@ -616,11 +537,11 @@ class FloatImm : public PrimExpr {
  public:
   /*!
    * \brief Constructor.
-   * \param dtype The data type of the value.
+   * \param value_ty The primitive type of the value.
    * \param value The internal value.
    * \param span The location in the source code.
    */
-  TVM_DLL FloatImm(DataType dtype, double value, Span span = Span());
+  TVM_DLL FloatImm(PrimType value_ty, double value, Span span = Span());
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(FloatImm, PrimExpr, FloatImmNode);
   TVM_DEFINE_OBJECT_REF_COW_METHOD(FloatImmNode);
@@ -688,11 +609,11 @@ inline constexpr bool use_default_type_traits_v<IntImm> = false;
 template <>
 struct TypeTraits<IntImm> : public ObjectRefWithFallbackTraitsBase<IntImm, int64_t> {
   TVM_FFI_INLINE static IntImm ConvertFallbackValue(int64_t value) {
-    auto dtype =
+    auto value_ty =
         (value > std::numeric_limits<int>::max() || value < std::numeric_limits<int>::min())
-            ? DataType::Int(64)
-            : DataType::Int(32);
-    return IntImm(dtype, value);
+            ? PrimType::Int(64)
+            : PrimType::Int(32);
+    return IntImm(value_ty, value);
   }
 };
 
@@ -702,7 +623,7 @@ inline constexpr bool use_default_type_traits_v<FloatImm> = false;
 template <>
 struct TypeTraits<FloatImm> : public ObjectRefWithFallbackTraitsBase<FloatImm, double> {
   TVM_FFI_INLINE static FloatImm ConvertFallbackValue(double value) {
-    return FloatImm(runtime::DataType::Float(32), value);
+    return FloatImm(PrimType::Float(32), value);
   }
 };
 
diff --git a/include/tvm/ir/type.h b/include/tvm/ir/type.h
index 9c56d0376405..f63b5d261500 100644
--- a/include/tvm/ir/type.h
+++ b/include/tvm/ir/type.h
@@ -26,21 +26,19 @@
  *
  * This file contains types that are common across IR variants.
  *
- * ## Relation between Type and runtime::DataType
+ * ## Relation between Type and DLPack dtype
  *
- * Besides Type, we also store a dtype field in the low-level PrimExpr.
- * runtime::DataType(dtype) provides coarse grained type information
- * during compile time and runtime. It is eagerly built in
- * low-level expression construction and can be used for
- * quick type checking in the low-level IR.
- * For example, when an Expr's dtype is int32,
- * we know for sure that its type is also int32.
+ * PrimExpr stores a PrimType in its `ty` field, backed by a DLPack
+ * `DLDataType`. This provides coarse grained scalar/vector element type
+ * information during compile time and runtime. It is eagerly built in
+ * low-level expression construction and can be used for quick type checking
+ * in the low-level IR. For example, when an Expr's dtype is int32, we know
+ * for sure that its PrimType is also int32.
  *
  * On the other hand, Type provides more fine grained information.
- * For example, a low level expression can have DataType::Handle() as
- * its dtype and MemRef[float32] as its type.
- * Types are usually lazily constructed via type checking,
- * so they may not readily be available during IR construction.
+ * For example, a low-level expression can have a handle dtype while
+ * a node-specific type annotation records a PointerType to a float32
+ * element.
  *
  * The unified Type serves as a common bridge across IR dialects.
  * For example, we require all the functions to have a type signature,
@@ -49,55 +47,16 @@
 #ifndef TVM_IR_TYPE_H_
 #define TVM_IR_TYPE_H_
 
-#include <tvm/ffi/cast.h>
 #include <tvm/ffi/container/array.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/ir/expr.h>
+#include <tvm/ir/base_expr.h>
 #include <tvm/ir/source_map.h>
-#include <tvm/runtime/data_type.h>
 
 #include <string>
 
 namespace tvm {
 
-/*!
- * \brief Primitive data types used in the low-level IR.
- *
- * PrimType represents POD-values and handles that are
- * not automatically managed by the runtime.
- *
- * \sa PrimType
- */
-class PrimTypeNode : public TypeNode {
- public:
-  /*!
-   * \brief The corresponding dtype field.
-   */
-  runtime::DataType dtype;
-
-  static void RegisterReflection() {
-    namespace refl = tvm::ffi::reflection;
-    refl::ObjectDef<PrimTypeNode>().def_ro("dtype", &PrimTypeNode::dtype);
-  }
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("ir.PrimType", PrimTypeNode, TypeNode);
-};
-
-/*
- * \brief Managed reference to PrimTypeNode.
- * \sa PrimTypeNode
- */
-class PrimType : public Type {
- public:
-  /*!
-   * \brief Constructor
-   * \param dtype The corresponding dtype.
-   * \param span The span
-   */
-  TVM_DLL explicit PrimType(runtime::DataType dtype, Span span = Span());
-
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NOTNULLABLE(PrimType, Type, PrimTypeNode);
-};
-
 /*!
  * \brief Low-level raw pointer type.
  *
diff --git a/include/tvm/relax/attrs/create.h b/include/tvm/relax/attrs/create.h
index 14a3402f2503..76ef219a862c 100644
--- a/include/tvm/relax/attrs/create.h
+++ b/include/tvm/relax/attrs/create.h
@@ -31,7 +31,7 @@ namespace relax {
 
 /*! \brief Attributes used in full/full_like, ones/ones_like, and zeros/zeros_like operators */
 struct InitAttrs : public AttrsNode {
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
diff --git a/include/tvm/relax/attrs/datatype.h b/include/tvm/relax/attrs/datatype.h
index f67223edb546..aeac65e64484 100644
--- a/include/tvm/relax/attrs/datatype.h
+++ b/include/tvm/relax/attrs/datatype.h
@@ -31,7 +31,7 @@ namespace relax {
 
 /*! \brief Attributes used in astype operator */
 struct AstypeAttrs : public AttrsNode {
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -42,7 +42,7 @@ struct AstypeAttrs : public AttrsNode {
 
 /*! \brief Attributes used in wrap_param operator */
 struct WrapParamAttrs : public AttrsNode {
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
diff --git a/include/tvm/relax/attrs/image.h b/include/tvm/relax/attrs/image.h
index c9a720374036..8f512f28e55f 100644
--- a/include/tvm/relax/attrs/image.h
+++ b/include/tvm/relax/attrs/image.h
@@ -39,7 +39,7 @@ struct Resize2DAttrs : public AttrsNode {
   double cubic_alpha;
   int cubic_exclude;
   double extrapolation_value;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -88,7 +88,7 @@ struct Resize3DAttrs : public AttrsNode {
   double cubic_alpha;
   int cubic_exclude;
   double extrapolation_value;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
diff --git a/include/tvm/relax/attrs/linear_algebra.h b/include/tvm/relax/attrs/linear_algebra.h
index 817885edb871..19a5982bfe12 100644
--- a/include/tvm/relax/attrs/linear_algebra.h
+++ b/include/tvm/relax/attrs/linear_algebra.h
@@ -31,7 +31,7 @@ namespace relax {
 
 /*! \brief Attributes for matmul operator */
 struct MatmulAttrs : public AttrsNode {
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
diff --git a/include/tvm/relax/attrs/nn.h b/include/tvm/relax/attrs/nn.h
index 52d9c40d742d..aa3c0f4736f0 100644
--- a/include/tvm/relax/attrs/nn.h
+++ b/include/tvm/relax/attrs/nn.h
@@ -38,7 +38,7 @@ struct Conv1DAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -82,7 +82,7 @@ struct Conv2DAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -128,7 +128,7 @@ struct Conv3DAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -177,7 +177,7 @@ struct Conv1DTransposeAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -226,7 +226,7 @@ struct Conv2DTransposeAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -277,7 +277,7 @@ struct Conv3DTransposeAttrs : public AttrsNode {
   ffi::String data_layout;
   ffi::String kernel_layout;
   ffi::String out_layout;
-  DataType out_dtype;
+  DLDataType out_dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
diff --git a/include/tvm/relax/attrs/qdq.h b/include/tvm/relax/attrs/qdq.h
index 83ec2223c3c7..be95b9e7b8ed 100644
--- a/include/tvm/relax/attrs/qdq.h
+++ b/include/tvm/relax/attrs/qdq.h
@@ -31,7 +31,7 @@ namespace relax {
 
 /*! \brief Attributes for relax.quantize/relax.dequantize operator */
 struct QuantizeAttrs : public AttrsNode {
-  DataType out_dtype;
+  DLDataType out_dtype;
   int axis;
 
   static void RegisterReflection() {
diff --git a/include/tvm/relax/attrs/sampling.h b/include/tvm/relax/attrs/sampling.h
index 11bbfb6eba31..07b7de25e553 100644
--- a/include/tvm/relax/attrs/sampling.h
+++ b/include/tvm/relax/attrs/sampling.h
@@ -31,13 +31,13 @@ namespace relax {
 
 /*! \brief Attributes used in multinomial_from_uniform operator */
 struct MultinomialFromUniformAttrs : public AttrsNode {
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
     refl::ObjectDef<MultinomialFromUniformAttrs>().def_ro(
         "dtype", &MultinomialFromUniformAttrs::dtype, "Data type of the output indices.",
-        refl::DefaultValue(DataType::Int(64)));
+        refl::DefaultValue((DLDataType{kDLInt, 64, 1})));
   }
   TVM_FFI_DECLARE_OBJECT_INFO_FINAL("relax.attrs.MultinomialFromUniformAttrs",
                                     MultinomialFromUniformAttrs, AttrsNode);
diff --git a/include/tvm/relax/attrs/sorting.h b/include/tvm/relax/attrs/sorting.h
index e8bf65d55a43..ef21bf9a637e 100644
--- a/include/tvm/relax/attrs/sorting.h
+++ b/include/tvm/relax/attrs/sorting.h
@@ -54,7 +54,7 @@ struct SortAttrs : public AttrsNode {
 struct ArgsortAttrs : public AttrsNode {
   int axis;
   bool descending;
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -68,7 +68,7 @@ struct ArgsortAttrs : public AttrsNode {
                 "If it is not specified, it defaults to the ascending order.",
                 refl::DefaultValue(false))
         .def_ro("dtype", &ArgsortAttrs::dtype, "DType of the output indices.",
-                refl::DefaultValue(DataType::Void()));
+                refl::DefaultValue((DLDataType{kDLOpaqueHandle, 0, 0})));
   }
   TVM_FFI_DECLARE_OBJECT_INFO_FINAL("relax.attrs.ArgsortAttrs", ArgsortAttrs, AttrsNode);
 };  // struct ArgsortAttrs
@@ -79,7 +79,7 @@ struct TopKAttrs : public AttrsNode {
   int axis;
   bool largest;
   ffi::String ret_type;
-  DataType dtype;
+  DLDataType dtype;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -98,7 +98,7 @@ struct TopKAttrs : public AttrsNode {
                 "By default, return the largest k elements.",
                 refl::DefaultValue(true))
         .def_ro("dtype", &TopKAttrs::dtype, "Data type of the output indices.",
-                refl::DefaultValue(DataType::Void()));
+                refl::DefaultValue((DLDataType{kDLOpaqueHandle, 0, 0})));
   }
   TVM_FFI_DECLARE_OBJECT_INFO_FINAL("relax.attrs.TopKAttrs", TopKAttrs, AttrsNode);
 };  // struct TopKAttrs
diff --git a/include/tvm/relax/attrs/statistical.h b/include/tvm/relax/attrs/statistical.h
index 66996c802cc3..a815e0e07e51 100644
--- a/include/tvm/relax/attrs/statistical.h
+++ b/include/tvm/relax/attrs/statistical.h
@@ -50,7 +50,7 @@ struct StatisticalAttrs : public AttrsNode {
 /*! \brief Attributes used in scan operators like cumsum, cumprod */
 struct ScanopAttrs : public AttrsNode {
   ffi::Optional<int64_t> axis;
-  DataType dtype;
+  DLDataType dtype;
   bool exclusive = false;
 
   static void RegisterReflection() {
diff --git a/include/tvm/relax/dataflow_pattern.h b/include/tvm/relax/dataflow_pattern.h
index 27894da3addd..0511395f8a67 100644
--- a/include/tvm/relax/dataflow_pattern.h
+++ b/include/tvm/relax/dataflow_pattern.h
@@ -116,8 +116,8 @@ class DFPattern : public ffi::ObjectRef {
   TVM_DLL AttrPattern HasAttr(const ffi::Map<ffi::String, Any>& attrs) const;
   /*! \brief Syntatic Sugar for creating a TypePattern */
   TVM_DLL TypePattern HasType(const Type& ty) const;
-  /*! \brief Syntatic Sugar for creating a DataTypePattern with a DataType */
-  TVM_DLL DataTypePattern HasDtype(const DataType& dtype) const;
+  /*! \brief Syntactic Sugar for creating a DataTypePattern with a dtype */
+  TVM_DLL DataTypePattern HasDtype(DLDataType dtype) const;
   /*! \brief Syntatic Sugar for creating a DataTypePattern with a data type's name */
   TVM_DLL DataTypePattern HasDtype(const std::string& dtype) const;
   /*! \brief Syntatic Sugar for creating a ShapePattern */
@@ -860,7 +860,7 @@ class SameShapeConstraint : public DFConstraint {
 class DataTypePatternNode : public DFPatternNode {
  public:
   DFPattern pattern; /*!< The root pattern to match */
-  DataType dtype;    /*!< The data type to match */
+  DLDataType dtype;  /*!< The data type to match */
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -878,7 +878,7 @@ class DataTypePatternNode : public DFPatternNode {
  */
 class DataTypePattern : public DFPattern {
  public:
-  TVM_DLL DataTypePattern(DFPattern pattern, DataType dtype);
+  TVM_DLL DataTypePattern(DFPattern pattern, DLDataType dtype);
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(DataTypePattern, DFPattern, DataTypePatternNode);
 };
 
diff --git a/include/tvm/relax/distributed/global_info.h b/include/tvm/relax/distributed/global_info.h
index 62ff904fc1a4..0347ec3b85a8 100644
--- a/include/tvm/relax/distributed/global_info.h
+++ b/include/tvm/relax/distributed/global_info.h
@@ -25,6 +25,7 @@
 #ifndef TVM_RELAX_DISTRIBUTED_GLOBAL_INFO_H_
 #define TVM_RELAX_DISTRIBUTED_GLOBAL_INFO_H_
 
+#include <tvm/ffi/container/shape.h>
 #include <tvm/ir/expr.h>
 #include <tvm/ir/module.h>
 namespace tvm {
diff --git a/include/tvm/relax/expr.h b/include/tvm/relax/expr.h
index 937091255b6f..0b75bf27a7d2 100644
--- a/include/tvm/relax/expr.h
+++ b/include/tvm/relax/expr.h
@@ -471,7 +471,7 @@ class StringImm : public LeafExpr {
 class DataTypeImmNode : public LeafExprNode {
  public:
   /*! \brief The data value. */
-  DataType value;
+  DLDataType value;
 
   static void RegisterReflection() {
     namespace refl = tvm::ffi::reflection;
@@ -491,7 +491,7 @@ class DataTypeImm : public LeafExpr {
    * \param value The value input.
    * \param span The source span of the expression.
    */
-  TVM_DLL explicit DataTypeImm(DataType value, Span span = Span());
+  TVM_DLL explicit DataTypeImm(DLDataType value, Span span = Span());
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(DataTypeImm, LeafExpr, DataTypeImmNode);
   TVM_DEFINE_OBJECT_REF_COW_METHOD(DataTypeImmNode);
diff --git a/include/tvm/relax/transform.h b/include/tvm/relax/transform.h
index d0d0d1bb5441..5c757ba15161 100644
--- a/include/tvm/relax/transform.h
+++ b/include/tvm/relax/transform.h
@@ -663,9 +663,8 @@ TVM_DLL Pass DataflowUseInplaceCalls();
  *
  * \note Mainly operates within dataflow blocks. ConvertToDataflow may need to be called first.
  */
-TVM_DLL Pass
-ToMixedPrecision(const DataType& out_dtype,
-                 ffi::Optional<ffi::Array<ffi::String>> fp16_input_names = std::nullopt);
+TVM_DLL Pass ToMixedPrecision(
+    DLDataType out_dtype, ffi::Optional<ffi::Array<ffi::String>> fp16_input_names = std::nullopt);
 
 /*!
  * \brief Rewrite a Relax module for executing with CUDA graph. This pass identifies
diff --git a/include/tvm/relax/type.h b/include/tvm/relax/type.h
index 9c27b627a7d6..a77a3cc66c38 100644
--- a/include/tvm/relax/type.h
+++ b/include/tvm/relax/type.h
@@ -124,7 +124,7 @@ class ShapeTypeNode : public TypeNode {
    * \brief The number of dimension of the shape, can be unknown.
    * \sa kUnknownNDim
    */
-  int ndim;
+  int ndim{kUnknownNDim};
 
   /*! \return Whether the type contains unknown ndim. */
   bool IsUnknownNdim() const { return ndim == kUnknownNDim; }
@@ -174,19 +174,19 @@ class TensorTypeNode : public TypeNode {
    *  is expected to be executed.
    */
   ffi::Optional<VDevice> vdevice;
-  /*! \brief The content data type, use void to denote the dtype is unknown. */
-  DataType dtype;
+  /*! \brief The content dtype; void (kDLOpaqueHandle, 0 bits, 0 lanes) means it is unknown. */
+  tvm::PrimType dtype{DLDataType{kDLOpaqueHandle, 0, 0}};
   /*!
    * \brief The number of dimension of the tensor, can be unknown.
    * \sa kUnknownNDim
    */
-  int ndim;
+  int ndim{kUnknownNDim};
 
   /*! \return Whether the type contains unknown ndim. */
   bool IsUnknownNdim() const { return ndim == kUnknownNDim; }
 
   /*! \return Whether the type contains unknown dtype. */
-  bool IsUnknownDtype() const { return dtype.is_void(); }
+  bool IsUnknownDtype() const { return dtype->dtype == DLDataType{kDLOpaqueHandle, 0, 0}; }
 
   /*! \return Shape if it is known. */
   ffi::Optional<ffi::Array<PrimExpr>> GetShape() const {
@@ -230,7 +230,7 @@ class TensorType : public Type {
    *
    * \note shape must already be normalized.
    */
-  TVM_DLL TensorType(Expr shape, DataType dtype, ffi::Optional<VDevice> vdevice = std::nullopt,
+  TVM_DLL TensorType(Expr shape, tvm::PrimType dtype, ffi::Optional<VDevice> vdevice = std::nullopt,
                      Span span = Span());
 
   /*!
@@ -240,7 +240,7 @@ class TensorType : public Type {
    * \param vdevice The virtual device.
    * \param span The span of the AST.
    */
-  TVM_DLL TensorType(DataType dtype, int ndim, ffi::Optional<VDevice> vdevice = std::nullopt,
+  TVM_DLL TensorType(tvm::PrimType dtype, int ndim, ffi::Optional<VDevice> vdevice = std::nullopt,
                      Span span = Span());
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NOTNULLABLE(TensorType, Type, TensorTypeNode);
diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h
deleted file mode 100644
index 9f230cac824e..000000000000
--- a/include/tvm/runtime/data_type.h
+++ /dev/null
@@ -1,522 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-/*
- * \file tvm/runtime/data_type.h
- * \brief Primitive runtime data type.
- */
-// Acknowledgement: DataType structure design originates from Halide.
-#ifndef TVM_RUNTIME_DATA_TYPE_H_
-#define TVM_RUNTIME_DATA_TYPE_H_
-
-#include <tvm/ffi/container/shape.h>
-#include <tvm/ffi/dtype.h>
-#include <tvm/ffi/error.h>
-#include <tvm/runtime/base.h>
-
-#include <cstring>
-#include <string>
-#include <type_traits>
-
-namespace tvm {
-namespace runtime {
-
-/*!
- * \brief Runtime primitive data type.
- *
- *  This class is a thin wrapper of DLDataType.
- *  We also make use of DataType in compiler to store quick hint
- */
-class DataType {
- public:
-  /*!
-   * \brief Type code for the DataType.
-   *
-   * DLPack consistency:
-   * 1) kInt is consistent with kDLInt
-   * 2) kUInt is consistent with kDLUInt
-   * 3) kFloat is consistent with kDLFloat
-   */
-  enum TypeCode {
-    kInt = kDLInt,
-    kUInt = kDLUInt,
-    kFloat = kDLFloat,
-    kHandle = kDLOpaqueHandle,
-    kBFloat = kDLBfloat,
-    kBool = kDLBool,
-    kFloat8_e3m4 = kDLFloat8_e3m4,
-    kFloat8_e4m3 = kDLFloat8_e4m3,
-    kFloat8_e4m3b11fnuz = kDLFloat8_e4m3b11fnuz,
-    kFloat8_e4m3fn = kDLFloat8_e4m3fn,
-    kFloat8_e4m3fnuz = kDLFloat8_e4m3fnuz,
-    kFloat8_e5m2 = kDLFloat8_e5m2,
-    kFloat8_e5m2fnuz = kDLFloat8_e5m2fnuz,
-    kFloat8_e8m0fnu = kDLFloat8_e8m0fnu,
-    kFloat6_e2m3fn = kDLFloat6_e2m3fn,
-    kFloat6_e3m2fn = kDLFloat6_e3m2fn,
-    kFloat4_e2m1fn = kDLFloat4_e2m1fn,
-    kCustomBegin = 129
-  };
-  /*! \brief default constructor */
-  DataType() { data_ = DataType::Void(); }
-  /*!
-   * \brief Constructor
-   * \param dtype The DLDataType
-   */
-  explicit DataType(DLDataType dtype) : data_(dtype) {}
-  /*!
-   * \brief Constructor
-   * \param code The type code.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes.
-   * \param is_scalable Whether the data type is scalable.
-   */
-  DataType(int code, int bits, int lanes, bool is_scalable = false) {
-    data_.code = static_cast<uint8_t>(code);
-    data_.bits = static_cast<uint8_t>(bits);
-    if (is_scalable) {
-      TVM_FFI_ICHECK(lanes > 1) << "Invalid value for vscale factor" << lanes;
-    }
-    data_.lanes = is_scalable ? static_cast<uint16_t>(-lanes) : static_cast<uint16_t>(lanes);
-    if (code == kBFloat) {
-      TVM_FFI_ICHECK_EQ(bits, 16);
-    }
-    if (code == kFloat8_e3m4 || code == kFloat8_e4m3 || code == kFloat8_e4m3b11fnuz ||
-        code == kFloat8_e4m3fn || code == kFloat8_e4m3fnuz || code == kFloat8_e5m2 ||
-        code == kFloat8_e5m2fnuz || code == kFloat8_e8m0fnu) {
-      TVM_FFI_ICHECK_EQ(bits, 8);
-    }
-    if (code == kFloat6_e2m3fn || code == kFloat6_e3m2fn) {
-      TVM_FFI_ICHECK_EQ(bits, 6);
-    }
-    if (code == kFloat4_e2m1fn) {
-      TVM_FFI_ICHECK_EQ(bits, 4);
-    }
-  }
-  /*! \return The type code. */
-  int code() const { return static_cast<int>(data_.code); }
-  /*! \return number of bits in the data. */
-  int bits() const { return static_cast<int>(data_.bits); }
-  /*! \return number of bytes to store each scalar. */
-  int bytes() const { return (bits() + 7) / 8; }
-  /*! \return number of lanes in the data. */
-  int lanes() const {
-    int lanes_as_int = static_cast<int16_t>(data_.lanes);
-    if (lanes_as_int < 0) {
-      TVM_FFI_THROW(InternalError)
-          << "Can't fetch the lanes of a scalable vector at a compile time.";
-    }
-    return lanes_as_int;
-  }
-  /*! \return the integer multiplier of vscale in a scalable vector. */
-  int vscale_factor() const {
-    int lanes_as_int = static_cast<int16_t>(data_.lanes);
-    if (lanes_as_int >= -1) {
-      TVM_FFI_THROW(InternalError) << "A fixed length vector doesn't have a vscale factor.";
-    }
-    return -lanes_as_int;
-  }
-  /*! \return get vscale factor or lanes depending on scalability of the vector. */
-  int get_lanes_or_vscale_factor() const {
-    return is_scalable_vector() ? vscale_factor() : lanes();
-  }
-  /*! \return whether type is a scalar type. */
-  bool is_scalar() const { return !is_scalable_vector() && lanes() == 1; }
-  /*! \return whether type is a bool type. */
-  bool is_bool() const { return code() == DataType::kBool; }
-  /*! \return whether type can be used in a predicate expression. */
-  bool is_predicate_dtype() const { return is_bool() || (is_uint() && bits() == 1); }
-  /*! \return whether type is a float type. */
-  bool is_float() const { return code() == DataType::kFloat; }
-  /*! \return whether type is a bfloat type. */
-  bool is_bfloat() const { return code() == DataType::kBFloat; }
-  /*! \return whether type is any 8-bit custom Float8 variant. */
-  bool is_float8() const {
-    return bits() == 8 &&
-           (code() == DataType::kFloat8_e3m4 || code() == DataType::kFloat8_e4m3 ||
-            code() == DataType::kFloat8_e4m3b11fnuz || code() == DataType::kFloat8_e4m3fn ||
-            code() == DataType::kFloat8_e4m3fnuz || code() == DataType::kFloat8_e5m2 ||
-            code() == DataType::kFloat8_e5m2fnuz || code() == DataType::kFloat8_e8m0fnu);
-  }
-  /*! \return whether type is any 6-bit custom Float6 variant. */
-  bool is_float6() const {
-    return bits() == 6 &&
-           (code() == DataType::kFloat6_e2m3fn || code() == DataType::kFloat6_e3m2fn);
-  }
-  /*! \return whether type is the 4-bit custom Float4_e2m1fn variant. */
-  bool is_float4() const { return bits() == 4 && code() == DataType::kFloat4_e2m1fn; }
-  /*! \return whether type is Float8E3M4. */
-  bool is_float8_e3m4() const { return bits() == 8 && code() == DataType::kFloat8_e3m4; }
-  /*! \return whether type is Float8E4M3. */
-  bool is_float8_e4m3() const { return bits() == 8 && code() == DataType::kFloat8_e4m3; }
-  /*! \return whether type is Float8E4M3B11FNUZ. */
-  bool is_float8_e4m3b11fnuz() const {
-    return bits() == 8 && code() == DataType::kFloat8_e4m3b11fnuz;
-  }
-  /*! \return whether type is Float8E4M3FN. */
-  bool is_float8_e4m3fn() const { return bits() == 8 && code() == DataType::kFloat8_e4m3fn; }
-  /*! \return whether type is Float8E4M3FNUZ. */
-  bool is_float8_e4m3fnuz() const { return bits() == 8 && code() == DataType::kFloat8_e4m3fnuz; }
-  /*! \return whether type is Float8E5M2. */
-  bool is_float8_e5m2() const { return bits() == 8 && code() == DataType::kFloat8_e5m2; }
-  /*! \return whether type is Float8E5M2FNUZ. */
-  bool is_float8_e5m2fnuz() const { return bits() == 8 && code() == DataType::kFloat8_e5m2fnuz; }
-  /*! \return whether type is Float8E8M0FNU. */
-  bool is_float8_e8m0fnu() const { return bits() == 8 && code() == DataType::kFloat8_e8m0fnu; }
-  /*! \return whether type is Float6E2M3FN. */
-  bool is_float6_e2m3fn() const { return bits() == 6 && code() == DataType::kFloat6_e2m3fn; }
-  /*! \return whether type is Float6E3M2FN. */
-  bool is_float6_e3m2fn() const { return bits() == 6 && code() == DataType::kFloat6_e3m2fn; }
-  /*! \return whether type is Float4E2M1FN. */
-  bool is_float4_e2m1fn() const { return bits() == 4 && code() == DataType::kFloat4_e2m1fn; }
-  /*! \return whether type is a float16 type. */
-  bool is_float16() const { return is_float() && bits() == 16; }
-  /*! \return whether type is a bfloat16 type. */
-  bool is_bfloat16() const { return code() == DataType::kBFloat && bits() == 16; }
-  /*! \return whether type is an int type. */
-  bool is_int() const { return code() == DataType::kInt; }
-  /*! \return whether type is an uint type. */
-  bool is_uint() const { return code() == DataType::kUInt; }
-  /*! \return whether type is a handle type. */
-  bool is_handle() const { return code() == DataType::kHandle && !is_void(); }
-  /*! \return whether type is a vector type. */
-  bool is_scalable_or_fixed_length_vector() const {
-    int encoded_lanes = static_cast<int16_t>(data_.lanes);
-    return (encoded_lanes < -1) || (1 < encoded_lanes);
-  }
-  /*! \return Whether the type is a fixed length vector. */
-  bool is_fixed_length_vector() const { return static_cast<int16_t>(data_.lanes) > 1; }
-  /*! \return Whether the type is a scalable vector. */
-  bool is_scalable_vector() const { return static_cast<int16_t>(data_.lanes) < -1; }
-  /*! \return whether type is a vector type. */
-  bool is_vector() const { return lanes() > 1; }
-  /*! \return whether type is a bool vector type. */
-  bool is_vector_bool() const { return is_scalable_or_fixed_length_vector() && is_bool(); }
-  /*! \return whether type is a Void type. */
-  bool is_void() const {
-    return code() == DataType::kHandle && bits() == 0 && static_cast<int16_t>(data_.lanes) == 0;
-  }
-  /*!
-   * \brief Create a new data type by change lanes to a specified value.
-   * \param lanes The target number of lanes.
-   * \return the result type.
-   */
-  DataType with_lanes(int lanes) const { return DataType(data_.code, data_.bits, lanes); }
-  /*!
-   * \brief Create a new scalable vector data type by changing the vscale multiplier to a specified
-   * value. We'll use the data_.lanes field for this value. \param vscale_factor The vscale
-   * multiplier. \return A copy of the old DataType with the number of scalable lanes.
-   */
-  DataType with_scalable_vscale_factor(int vscale_factor) const {
-    return DataType(data_.code, data_.bits, -vscale_factor);
-  }
-  /*!
-   * \brief Create a new data type by change bits to a specified value.
-   * \param bits The target number of bits.
-   * \return the result type.
-   */
-  DataType with_bits(int bits) const { return DataType(data_.code, bits, data_.lanes); }
-  /*!
-   * \brief Get the scalar version of the type.
-   * \return the result type.
-   */
-  DataType element_of() const { return with_lanes(1); }
-  /*!
-   * \brief Assignment operator.
-   */
-  DataType& operator=(const DataType& rhs) {
-    if (this == &rhs) {
-      return *this;
-    }
-    data_ = rhs.data_;
-    return *this;
-  }
-  /*!
-   * \brief Equal comparator.
-   * \param other The data type to compare against.
-   * \return The comparison result.
-   */
-  bool operator==(const DataType& other) const {
-    return data_.code == other.data_.code && data_.bits == other.data_.bits &&
-           data_.lanes == other.data_.lanes;
-  }
-  /*!
-   * \brief NotEqual comparator.
-   * \param other The data type to compare against.
-   * \return The comparison result.
-   */
-  bool operator!=(const DataType& other) const { return !operator==(other); }
-  /*!
-   * \brief Converter to DLDataType
-   * \return the result.
-   */
-  operator DLDataType() const { return data_; }
-
-  /*!
-   * \brief Construct an int type.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes.
-   * \return The constructed data type.
-   */
-  static DataType Int(int bits, int lanes = 1) { return DataType(kDLInt, bits, lanes); }
-  /*!
-   * \brief Construct an uint type.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes.
-   * \param is_scalable Whether the data type is scalable.
-   * \return The constructed data type.
-   */
-  static DataType UInt(int bits, int lanes = 1, bool is_scalable = false) {
-    return DataType(kDLUInt, bits, lanes, is_scalable);
-  }
-  /*!
-   * \brief Construct an float type.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float(int bits, int lanes = 1) { return DataType(kDLFloat, bits, lanes); }
-  /*!
-   * \brief Construct an bfloat type.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType BFloat(int bits, int lanes = 1) { return DataType(kDLBfloat, bits, lanes); }
-  /*!
-   * \brief Construct float8 e3m4 datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E3M4(int lanes = 1) { return DataType(kFloat8_e3m4, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e4m3 datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E4M3(int lanes = 1) { return DataType(kFloat8_e4m3, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e4m3b11fnuz datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E4M3B11FNUZ(int lanes = 1) {
-    return DataType(kFloat8_e4m3b11fnuz, 8, lanes);
-  }
-
-  /*!
-   * \brief Construct float8 e4m3fn datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E4M3FN(int lanes = 1) { return DataType(kFloat8_e4m3fn, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e4m3fnuz datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E4M3FNUZ(int lanes = 1) { return DataType(kFloat8_e4m3fnuz, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e5m2 datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E5M2(int lanes = 1) { return DataType(kFloat8_e5m2, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e5m2fnuz datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E5M2FNUZ(int lanes = 1) { return DataType(kFloat8_e5m2fnuz, 8, lanes); }
-
-  /*!
-   * \brief Construct float8 e8m0fnu datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float8E8M0FNU(int lanes = 1) { return DataType(kFloat8_e8m0fnu, 8, lanes); }
-
-  /*!
-   * \brief Construct float6 e2m3fn datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float6E2M3FN(int lanes = 1) { return DataType(kFloat6_e2m3fn, 6, lanes); }
-
-  /*!
-   * \brief Construct float6 e3m2fn datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float6E3M2FN(int lanes = 1) { return DataType(kFloat6_e3m2fn, 6, lanes); }
-
-  /*!
-   * \brief Construct float4 e2m1fn datatype.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Float4E2M1FN(int lanes = 1) { return DataType(kFloat4_e2m1fn, 4, lanes); }
-  /*!
-   * \brief Construct a bool type.
-   * \param lanes The number of lanes.
-   * \param is_scalable Whether the data type is scalable.
-   * \return The constructed data type.
-   */
-  static DataType Bool(int lanes = 1, bool is_scalable = false) {
-    return DataType(kDLBool, 8, lanes, is_scalable);
-  }
-  /*!
-   * \brief Construct a handle type.
-   * \param bits The number of bits in the type.
-   * \param lanes The number of lanes
-   * \return The constructed data type.
-   */
-  static DataType Handle(int bits = 64, int lanes = 1) { return DataType(kHandle, bits, lanes); }
-  /*!
-   * \brief Construct a Void type.
-   * \return The constructed data type.
-   */
-  static DataType Void() { return DataType(kHandle, 0, 0); }
-  /*!
-   * \brief Get the corresponding type of TVMShapeIndex.
-   * \return The type of TVM shape index.
-   */
-  static DataType ShapeIndex() {
-    if (std::is_signed<ffi::Shape::index_type>::value) {
-      return DataType::Int(sizeof(ffi::Shape::index_type) * 8);
-    } else {
-      return DataType::UInt(sizeof(ffi::Shape::index_type) * 8);
-    }
-  }
-
- private:
-  DLDataType data_;
-};
-
-/*!
- * \brief Get the number of bytes needed in a vector.
- * \param dtype The data type.
- * \return Number of bytes needed.
- */
-inline int GetVectorBytes(DataType dtype) {
-  int data_bits = dtype.bits() * dtype.lanes();
-  // allow bool to exist
-  if (dtype == DataType::Bool() || dtype == DataType::Int(4) || dtype == DataType::UInt(4) ||
-      dtype == DataType::Int(1) || dtype == DataType::Float4E2M1FN() ||
-      dtype == DataType::Float6E2M3FN() || dtype == DataType::Float6E3M2FN()) {
-    return 1;
-  }
-  TVM_FFI_ICHECK_EQ(data_bits % 8, 0U) << "Need to load/store by multiple of bytes";
-  return data_bits / 8;
-}
-
-/*!
- * \brief Check whether type matches the given spec.
- * \param t The type
- * \param code The type code.
- * \param bits The number of bits to be matched.
- * \param lanes The number of lanes in the type.
- */
-inline bool TypeMatch(DLDataType t, int code, int bits, int lanes = 1) {
-  return t.code == code && t.bits == bits && t.lanes == lanes;
-}
-/*!
- * \brief Check whether two types are equal .
- * \param lhs The left operand.
- * \param rhs The right operand.
- */
-inline bool TypeEqual(DLDataType lhs, DLDataType rhs) {
-  return lhs.code == rhs.code && lhs.bits == rhs.bits && lhs.lanes == rhs.lanes;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const DataType& dtype) {  // NOLINT(*)
-  return os << dtype.operator DLDataType();
-}
-}  // namespace runtime
-
-using DataType = runtime::DataType;
-
-namespace ffi {
-
-// runtime::DataType
-template <>
-struct TypeTraits<runtime::DataType> : public TypeTraitsBase {
-  static constexpr int32_t field_static_type_index = TypeIndex::kTVMFFIDataType;
-
-  TVM_FFI_INLINE static void CopyToAnyView(const runtime::DataType& src, TVMFFIAny* result) {
-    // clear padding part to ensure the equality check can always check the v_uint64 part
-    result->v_uint64 = 0;
-    result->zero_padding = 0;
-    result->type_index = TypeIndex::kTVMFFIDataType;
-    result->v_dtype = src;
-  }
-
-  TVM_FFI_INLINE static void MoveToAny(runtime::DataType src, TVMFFIAny* result) {
-    // clear padding part to ensure the equality check can always check the v_uint64 part
-    result->v_uint64 = 0;
-    result->zero_padding = 0;
-    result->type_index = TypeIndex::kTVMFFIDataType;
-    result->v_dtype = src;
-  }
-
-  TVM_FFI_INLINE static std::optional<runtime::DataType> TryCastFromAnyView(const TVMFFIAny* src) {
-    auto opt_dtype = TypeTraits<DLDataType>::TryCastFromAnyView(src);
-    if (opt_dtype) {
-      return runtime::DataType(opt_dtype.value());
-    }
-    return std::nullopt;
-  }
-
-  TVM_FFI_INLINE static bool CheckAnyStrict(const TVMFFIAny* src) {
-    return TypeTraits<DLDataType>::CheckAnyStrict(src);
-  }
-
-  TVM_FFI_INLINE static runtime::DataType CopyFromAnyViewAfterCheck(const TVMFFIAny* src) {
-    return runtime::DataType(TypeTraits<DLDataType>::CopyFromAnyViewAfterCheck(src));
-  }
-
-  TVM_FFI_INLINE static std::string TypeStr() { return ffi::StaticTypeKey::kTVMFFIDataType; }
-
-  TVM_FFI_INLINE static std::string TypeSchema() {
-    return R"({"type":")" + std::string(ffi::StaticTypeKey::kTVMFFIDataType) + R"("})";
-  }
-};
-
-}  // namespace ffi
-}  // namespace tvm
-
-namespace std {
-template <>
-struct hash<tvm::DataType> {
-  inline int cantor_pairing_function(int a, int b) const { return (a + b) * (a + b + 1) / 2 + b; }
-  std::size_t operator()(tvm::DataType const& dtype) const {
-    int a = dtype.code();
-    int b = dtype.bits();
-    int c = dtype.lanes();
-    int d = cantor_pairing_function(a, b);
-    return cantor_pairing_function(c, d);
-  }
-};
-}  // namespace std
-
-#endif  //  TVM_RUNTIME_DATA_TYPE_H_
diff --git a/include/tvm/runtime/disco/builtin.h b/include/tvm/runtime/disco/builtin.h
index a9487c866acc..9d66a09507c5 100644
--- a/include/tvm/runtime/disco/builtin.h
+++ b/include/tvm/runtime/disco/builtin.h
@@ -19,8 +19,8 @@
 #ifndef TVM_RUNTIME_DISCO_BUILTIN_H_
 #define TVM_RUNTIME_DISCO_BUILTIN_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/module.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/tensor.h>
 
 #include <string>
@@ -70,7 +70,7 @@ TVM_RUNTIME_DLL ffi::Module LoadVMModule(std::string path, ffi::Optional<Device>
  * \param device The device the Tensor is created on. If None, use the thread local default device
  * \return The Tensor created
  */
-TVM_RUNTIME_DLL Tensor DiscoEmptyTensor(ffi::Shape shape, DataType dtype,
+TVM_RUNTIME_DLL Tensor DiscoEmptyTensor(ffi::Shape shape, DLDataType dtype,
                                         ffi::Optional<Device> device);
 /*!
  * \brief Perform an allreduce operation using the underlying communication library
diff --git a/include/tvm/runtime/tensor.h b/include/tvm/runtime/tensor.h
index d3497c8ff78f..cb93c4abd741 100644
--- a/include/tvm/runtime/tensor.h
+++ b/include/tvm/runtime/tensor.h
@@ -26,10 +26,10 @@
 
 #include <tvm/ffi/container/shape.h>
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/optional.h>
 #include <tvm/ffi/string.h>
 #include <tvm/runtime/base.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/support/io.h>
 #include <tvm/support/serializer.h>
@@ -59,7 +59,7 @@ class Tensor : public tvm::ffi::Tensor {
   Tensor(const ffi::Tensor& other) : tvm::ffi::Tensor(other) {}        // NOLINT(*)
 
   ffi::ShapeView Shape() const { return this->shape(); }
-  runtime::DataType DataType() const { return runtime::DataType(this->dtype()); }
+  DLDataType DataType() const { return this->dtype(); }
 
   // DLPack handling
   static Tensor FromDLPack(DLManagedTensor* tensor) {
diff --git a/include/tvm/runtime/vm/bytecode.h b/include/tvm/runtime/vm/bytecode.h
index 0f1927e0cbcb..ea246da5d354 100644
--- a/include/tvm/runtime/vm/bytecode.h
+++ b/include/tvm/runtime/vm/bytecode.h
@@ -24,8 +24,8 @@
 #ifndef TVM_RUNTIME_VM_BYTECODE_H_
 #define TVM_RUNTIME_VM_BYTECODE_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
-#include <tvm/runtime/data_type.h>
 
 #include <iostream>
 #include <vector>
diff --git a/include/tvm/runtime/vm/tensor_cache_support.h b/include/tvm/runtime/vm/tensor_cache_support.h
index ea997f0755bd..b112043c376f 100644
--- a/include/tvm/runtime/vm/tensor_cache_support.h
+++ b/include/tvm/runtime/vm/tensor_cache_support.h
@@ -54,7 +54,7 @@ struct TensorCacheMetadata {
       /*! \brief Shape of the parameter */
       ffi::Shape shape;
       /*! \brief Data type of the parameter */
-      DataType dtype;
+      DLDataType dtype;
       /*! \brief Format of the parameter */
       std::string format;
       /*! \brief Number of bytes */
diff --git a/include/tvm/s_tir/data_layout.h b/include/tvm/s_tir/data_layout.h
index 48836c5a53d5..ee6d51832dba 100644
--- a/include/tvm/s_tir/data_layout.h
+++ b/include/tvm/s_tir/data_layout.h
@@ -140,10 +140,10 @@ class SLayout : public ffi::ObjectRef {
    *        the corresponding lower case with factor size
    *        indicates the split dimension.
    *        return undefined layout if "__undef__" is passed.
-   * \param dtype The dtype of generated axes vars in the returned layout.
+   * \param index_ty The type of generated axes vars in the returned layout.
    *        It is required to be integer type.
    */
-  TVM_DLL SLayout(const std::string& name, DataType dtype = DataType::Int(32));  // NOLINT(*)
+  TVM_DLL SLayout(const std::string& name, PrimType index_ty = PrimType::Int(32));  // NOLINT(*)
 
   /*!
    * \brief access the internal node container
diff --git a/include/tvm/s_tir/meta_schedule/arg_info.h b/include/tvm/s_tir/meta_schedule/arg_info.h
index 463e73b0e246..a346a73dd441 100644
--- a/include/tvm/s_tir/meta_schedule/arg_info.h
+++ b/include/tvm/s_tir/meta_schedule/arg_info.h
@@ -20,9 +20,9 @@
 #define TVM_S_TIR_META_SCHEDULE_ARG_INFO_H_
 
 #include <tvm/ffi/container/shape.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/module.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/tirx/function.h>
 
 namespace tvm {
@@ -77,7 +77,7 @@ class ArgInfo : public ffi::ObjectRef {
 class TensorInfoNode : public ArgInfoNode {
  public:
   /*! \brief The data type of the tensor. */
-  runtime::DataType dtype;
+  DLDataType dtype;
   /*! \brief The shape of the tensor. */
   ffi::Shape shape;
 
@@ -104,7 +104,7 @@ class TensorInfo : public ArgInfo {
    * \param dtype The data type of the tensor argument.
    * \param shape The shape tuple of the tensor argument.
    */
-  TVM_DLL explicit TensorInfo(runtime::DataType dtype, ffi::Shape shape);
+  TVM_DLL explicit TensorInfo(DLDataType dtype, ffi::Shape shape);
   /*!
    * \brief Parse the argument information from a JSON object.
    * \param json_obj The json object to parse.
diff --git a/include/tvm/script/printer/config.h b/include/tvm/script/printer/config.h
index beea4042470c..e0ed32d38094 100644
--- a/include/tvm/script/printer/config.h
+++ b/include/tvm/script/printer/config.h
@@ -30,10 +30,11 @@
 #include <tvm/ffi/any.h>
 #include <tvm/ffi/container/array.h>
 #include <tvm/ffi/container/map.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/access_path.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ffi/string.h>
-#include <tvm/runtime/data_type.h>
+#include <tvm/runtime/base.h>
 
 #include <string>
 
@@ -53,15 +54,15 @@ class PrinterConfigNode : public ffi::Object {
    */
   ffi::String module_alias = "cls";
   /*! \brief Default buffer dtype */
-  DataType buffer_dtype = DataType::Float(32);
+  DLDataType buffer_dtype = DLDataType{kDLFloat, 32, 1};
   /*! \brief Default data type of integer literals */
-  DataType int_dtype = DataType::Int(32);
+  DLDataType int_dtype = DLDataType{kDLInt, 32, 1};
   /*!
    * \brief Default data type of float literals. Right now we always print out the explicit type
    * of floating point values, so setting it to Void means we do not print without the
    * T.float32/T.float64 wrapper.
    */
-  DataType float_dtype = DataType::Void();
+  DLDataType float_dtype = DLDataType{kDLOpaqueHandle, 0, 0};
   /*! \brief Whether or not to verbose print expressions. */
   bool verbose_expr = false;
   /*! \brief Number of spaces used for indentation*/
diff --git a/include/tvm/script/printer/doc.h b/include/tvm/script/printer/doc.h
index 2389c1b50d15..bc90e5365734 100644
--- a/include/tvm/script/printer/doc.h
+++ b/include/tvm/script/printer/doc.h
@@ -19,10 +19,11 @@
 #ifndef TVM_SCRIPT_PRINTER_DOC_H_
 #define TVM_SCRIPT_PRINTER_DOC_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/access_path.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/expr.h>
-#include <tvm/runtime/data_type.h>
+#include <tvm/ir/type.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/script/printer/config.h>
 
@@ -293,7 +294,7 @@ class LiteralDoc : public ExprDoc {
    * \param p The object path
    */
   static LiteralDoc Float(double v, const ffi::Optional<AccessPath>& p) {
-    return LiteralDoc(FloatImm(DataType::Float(64), v), p);
+    return LiteralDoc(FloatImm(PrimType::Float(64), v), p);
   }
   /*!
    * \brief Create a LiteralDoc to represent string.
@@ -308,8 +309,9 @@ class LiteralDoc : public ExprDoc {
-   * \param v The string value.
+   * \param v The data type value.
    * \param p The object path
    */
-  static LiteralDoc DataType(const runtime::DataType& v, const ffi::Optional<AccessPath>& p) {
-    std::string dtype = v.is_void() ? "void" : ffi::DLDataTypeToString(v);
+  static LiteralDoc DataType(DLDataType v, const ffi::Optional<AccessPath>& p) {
+    std::string dtype =
+        v == DLDataType{kDLOpaqueHandle, 0, 0} ? "void" : ffi::DLDataTypeToString(v);
     return LiteralDoc::Str(dtype, p);
   }
   /*!
diff --git a/include/tvm/script/printer/ir_docsifier.h b/include/tvm/script/printer/ir_docsifier.h
index 98249c6f30bd..e9c82265ff27 100644
--- a/include/tvm/script/printer/ir_docsifier.h
+++ b/include/tvm/script/printer/ir_docsifier.h
@@ -333,7 +333,7 @@ inline TDoc IRDocsifierNode::AsDoc(const Any& value, const AccessPath& path) con
       return LiteralDoc::Str(string_value, path).as_or_throw<TDoc>();
     }
     case ffi::TypeIndex::kTVMFFIDataType:
-      return LiteralDoc::DataType(value.as<runtime::DataType>().value(), path).as_or_throw<TDoc>();
+      return LiteralDoc::DataType(value.as<DLDataType>().value(), path).as_or_throw<TDoc>();
     case ffi::TypeIndex::kTVMFFIDevice:
       return LiteralDoc::Device(value.as<DLDevice>().value(), path).as_or_throw<TDoc>();
     default: {
diff --git a/include/tvm/te/operation.h b/include/tvm/te/operation.h
index c9d35a77fe99..ba5267a8ce85 100644
--- a/include/tvm/te/operation.h
+++ b/include/tvm/te/operation.h
@@ -34,6 +34,7 @@
 
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 namespace tvm {
@@ -67,11 +68,11 @@ class TVM_DLL OperationNode : public ffi::Object {
   /*! \return number of outputs */
   virtual int num_outputs() const = 0;
   /*!
-   * \brief Get data type. i-th output tensor.
+   * \brief Get the primitive element type of the i-th output tensor.
    * \param i The output index.
-   * \return type of i-th output.
+   * \return primitive element type of i-th output.
    */
-  virtual DataType output_dtype(size_t i) const = 0;
+  virtual PrimType output_dtype(size_t i) const = 0;
   /*!
    * \brief Get shape of i-th output tensor.
    * \param i The output index.
@@ -101,11 +102,11 @@ class PlaceholderOpNode : public OperationNode {
  public:
   /*! \brief The shape of the input */
   ffi::Array<PrimExpr> shape;
-  /*! \brief The data type of the input. */
-  DataType dtype;
+  /*! \brief Primitive element type of the input; defaults to the void dtype. */
+  PrimType dtype{DLDataType{kDLOpaqueHandle, 0, 0}};
   // override behavior.
   int num_outputs() const final;
-  DataType output_dtype(size_t i) const final;
+  PrimType output_dtype(size_t i) const final;
   ffi::Array<PrimExpr> output_shape(size_t i) const final;
   ffi::Array<Tensor> InputTensors() const final;
 
@@ -124,7 +125,9 @@ class PlaceholderOpNode : public OperationNode {
  */
 class PlaceholderOp : public Operation {
  public:
-  TVM_DLL PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, DataType dtype);
+  TVM_DLL PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, PrimType dtype);
+  PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, DLDataType dtype)
+      : PlaceholderOp(std::move(name), std::move(shape), PrimType(dtype)) {}
 
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(PlaceholderOp, Operation, PlaceholderOpNode);
 };
@@ -162,7 +165,7 @@ class TVM_DLL ComputeOpNode : public BaseComputeOpNode {
   ComputeOpNode() {}
   // override functions
   int num_outputs() const final;
-  DataType output_dtype(size_t i) const final;
+  PrimType output_dtype(size_t i) const final;
   ffi::Array<Tensor> InputTensors() const final;
 
   static void RegisterReflection() {
@@ -217,7 +220,7 @@ class ScanOpNode : public OperationNode {
   ScanOpNode() {}
   // override behavior.
   int num_outputs() const final;
-  DataType output_dtype(size_t i) const final;
+  PrimType output_dtype(size_t i) const final;
   ffi::Array<PrimExpr> output_shape(size_t i) const final;
   ffi::Array<Tensor> InputTensors() const final;
 
@@ -266,7 +269,7 @@ class ExternOpNode : public OperationNode {
   ExternOpNode() {}
   // override functions
   int num_outputs() const final;
-  DataType output_dtype(size_t i) const final;
+  PrimType output_dtype(size_t i) const final;
   ffi::Array<PrimExpr> output_shape(size_t i) const final;
   ffi::Array<Tensor> InputTensors() const final;
 
@@ -299,7 +302,7 @@ class ExternOp : public Operation {
  * \param name_hint The name hint for the expression
  * \param t The type of the expression
  */
-TVM_DLL Var var(std::string name_hint, DataType t = DataType::Int(32));
+TVM_DLL Var var(std::string name_hint, PrimType t = PrimType::Int(32));
 
 /*!
  * \brief Create a new IterVar that represents an axis in thread.
@@ -329,9 +332,14 @@ using FBatchCompute = std::function<ffi::Array<PrimExpr>(const ffi::Array<Var>&
  * \param dtype the data type of the tensor.
  * \param name The name of the Tensor.
  */
-TVM_DLL Tensor placeholder(ffi::Array<PrimExpr> shape, DataType dtype = DataType::Float(32),
+TVM_DLL Tensor placeholder(ffi::Array<PrimExpr> shape, PrimType dtype = PrimType::Float(32),
                            std::string name = "placeholder");
 
+inline Tensor placeholder(ffi::Array<PrimExpr> shape, DLDataType dtype,
+                          std::string name = "placeholder") {
+  return placeholder(std::move(shape), PrimType(dtype), std::move(name));
+}
+
 /*!
  * \brief Construct a new tensor by computing over shape,
  *  using the computation rule: result_tensor[axis] = fcompute(axis)
diff --git a/include/tvm/te/tensor.h b/include/tvm/te/tensor.h
index ed07a35fb2da..760d308623f8 100644
--- a/include/tvm/te/tensor.h
+++ b/include/tvm/te/tensor.h
@@ -71,8 +71,8 @@ class TensorNode : public DataProducerNode {
  public:
   /*! \brief The shape of the tensor */
   ffi::Array<PrimExpr> shape;
-  /*! \brief data type in the content of the tensor */
-  DataType dtype;
+  /*! \brief Primitive element type of the tensor contents; defaults to the void dtype. */
+  PrimType dtype{DLDataType{kDLOpaqueHandle, 0, 0}};
   /*! \brief the source operation, can be None */
   Operation op;
   /*! \brief the output index from source operation */
@@ -82,7 +82,7 @@ class TensorNode : public DataProducerNode {
 
   ffi::Array<PrimExpr> GetShape() const final { return shape; }
 
-  DataType GetDataType() const final { return dtype; }
+  PrimType GetDataType() const final { return dtype; }
 
   TVM_DLL PrimExpr ToPrimExpr() const final;
 
@@ -108,7 +108,9 @@ class Tensor : public DataProducer {
   inline PrimExpr IndexTensor(ffi::Array<PrimExpr> indices, bool support_negative_indices) const;
 
  public:
-  TVM_DLL Tensor(ffi::Array<PrimExpr> shape, DataType dtype, Operation op, int value_index);
+  TVM_DLL Tensor(ffi::Array<PrimExpr> shape, PrimType dtype, Operation op, int value_index);
+  Tensor(ffi::Array<PrimExpr> shape, DLDataType dtype, Operation op, int value_index)
+      : Tensor(std::move(shape), PrimType(dtype), std::move(op), value_index) {}
   /*!
    * \brief check if two tensors equals each other.
    * \param other tensor to be checked.
diff --git a/include/tvm/tirx/buffer.h b/include/tvm/tirx/buffer.h
index 1456787d688b..71d4c974dbb8 100644
--- a/include/tvm/tirx/buffer.h
+++ b/include/tvm/tirx/buffer.h
@@ -40,11 +40,20 @@ namespace tirx {
 #define TVM_INDEX_DEFAULT_I64 1
 #endif
 /*! \brief if TVM_INDEX_DEFAULT_I64 is set, return int64, otherwise return int32 */
-inline DataType DefaultIndexType() {
+inline PrimType DefaultIndexPrimType() {
 #if TVM_INDEX_DEFAULT_I64
-  return DataType::Int(64);
+  static const PrimType default_index_ty = PrimType::Int(64);
 #else
-  return DataType::Int(32);
+  static const PrimType default_index_ty = PrimType::Int(32);
+#endif
+  return default_index_ty;
+}
+
+inline DLDataType DefaultIndexType() {
+#if TVM_INDEX_DEFAULT_I64
+  return DLDataType{kDLInt, 64, 1};
+#else
+  return DLDataType{kDLInt, 32, 1};
 #endif
 }
 
@@ -67,8 +76,8 @@ class BufferNode : public ffi::Object {
    * \sa data_alignment The alignment of data in bytes.
    */
   Var data;
-  /*! \brief data type in the content of the tensor */
-  DataType dtype;
+  /*! \brief Primitive element type of the buffer contents; defaults to the void dtype. */
+  PrimType dtype{DLDataType{kDLOpaqueHandle, 0, 0}};
   /*! \brief The type of the buffer prior to flattening
    *
    * This contains the shape as it is accessed by
@@ -147,10 +156,13 @@ class BufferNode : public ffi::Object {
   }
 
   /*! \return preferred index type for this buffer node */
-  DataType DefaultIndexType() const {
-    return shape.size() != 0 ? shape[0].dtype() : tvm::tirx::DefaultIndexType();
+  DLDataType DefaultIndexType() const {
+    return shape.size() != 0 ? shape[0].ty()->dtype : tvm::tirx::DefaultIndexType();
   }
 
+  /*! \return primitive element type for compiler-side uses. */
+  PrimType ElementType() const { return dtype; }
+
   /*! \brief Determine the offset in the buffer of the given index.
    *
    * Returns the buffer offset, in number of elements of type dtype,
@@ -176,11 +188,19 @@ class Buffer : public ffi::ObjectRef {
  public:
   // User can specify data_alignment and offset_factor to be 0
   // A default value will be picked.
-  TVM_DLL Buffer(Var data, DataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<PrimExpr> strides,
+  TVM_DLL Buffer(Var data, PrimType dtype, ffi::Array<PrimExpr> shape, ffi::Array<PrimExpr> strides,
                  PrimExpr elem_offset, ffi::String name, int data_alignment, int offset_factor,
                  BufferType buffer_type, ffi::Array<IntImm> axis_separators = {},
                  Span span = Span(), ffi::Optional<Layout> layout = std::nullopt,
                  ffi::Array<PrimExpr> allocated_addr = {});
+  Buffer(Var data, DLDataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<PrimExpr> strides,
+         PrimExpr elem_offset, ffi::String name, int data_alignment, int offset_factor,
+         BufferType buffer_type, ffi::Array<IntImm> axis_separators = {}, Span span = Span(),
+         ffi::Optional<Layout> layout = std::nullopt, ffi::Array<PrimExpr> allocated_addr = {})
+      : Buffer(std::move(data), PrimType(dtype), std::move(shape), std::move(strides),
+               std::move(elem_offset), std::move(name), data_alignment, offset_factor, buffer_type,
+               std::move(axis_separators), std::move(span), std::move(layout),
+               std::move(allocated_addr)) {}
 
   /*!
    * \brief Return a new buffer that is equivalent with current one
@@ -205,7 +225,7 @@ class Buffer : public ffi::ObjectRef {
    * \param offset The offset of ptr.
    * \param input_extent The extent of ptr.
    */
-  TVM_DLL PrimExpr access_ptr(int access_mask, DataType ptr_type = DataType::Handle(),
+  TVM_DLL PrimExpr access_ptr(int access_mask, PrimType ptr_type = PrimType::Handle(),
                               int content_lanes = 1, PrimExpr offset = IntImm::Int32(0),
                               ffi::Optional<PrimExpr> input_extent = std::nullopt) const;
   /*!
@@ -215,7 +235,7 @@ class Buffer : public ffi::ObjectRef {
    * \param predicate A vector mask of boolean values indicating which lanes of a vector are to be
    * loaded. The number lanes of the mask must be equal to the number of lanes in being loaded.
    */
-  TVM_DLL PrimExpr vload(ffi::Array<PrimExpr> begin, DataType dtype,
+  TVM_DLL PrimExpr vload(ffi::Array<PrimExpr> begin, PrimType dtype,
                          ffi::Optional<PrimExpr> predicate = std::nullopt) const;
   /*!
    * \brief Create a Stmt that does a vector store at begin index.
@@ -267,7 +287,11 @@ class Buffer : public ffi::ObjectRef {
   /*!
    * \brief Return a new buffer with the dtype.
    */
-  TVM_DLL Buffer with_dtype(DataType dtype) const;
+  TVM_DLL Buffer with_dtype(PrimType dtype) const;
+  Buffer with_dtype(DLDataType dtype) const { return with_dtype(PrimType(dtype)); }
+
+  /*! \return The primitive element type of the buffer, for compiler-side use. */
+  PrimType ElementType() const { return (*this)->ElementType(); }
 
   /*!
    * \brief Return a new buffer with the data.
@@ -289,11 +313,20 @@ class Buffer : public ffi::ObjectRef {
  * \return The created buffer.
  * \sa Buffer for complete constructor.
  */
-TVM_DLL Buffer decl_buffer(ffi::Array<PrimExpr> shape, DataType dtype = DataType::Float(32),
+TVM_DLL Buffer decl_buffer(ffi::Array<PrimExpr> shape,
+                           DLDataType dtype = DLDataType{kDLFloat, 32, 1},
                            ffi::String name = "buffer", ffi::String storage_scope = "",
                            ffi::Optional<ffi::Array<IntImm>> axis_separators = std::nullopt,
                            Span span = Span());
 
+inline Buffer decl_buffer(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String name = "buffer",
+                          ffi::String storage_scope = "",
+                          ffi::Optional<ffi::Array<IntImm>> axis_separators = std::nullopt,
+                          Span span = Span()) {
+  return decl_buffer(std::move(shape), dtype->dtype, std::move(name), std::move(storage_scope),
+                     std::move(axis_separators), std::move(span));
+}
+
 /*!
  * \brief Base node for data producers.
  *
@@ -316,10 +349,10 @@ class DataProducerNode : public PrimExprConvertibleNode {
    */
   virtual ffi::Array<PrimExpr> GetShape() const = 0;
   /*!
-   * \brief Get the data type of the result.
-   * \return The data type.
+   * \brief Get the primitive element type of the result.
+   * \return The primitive element type.
    */
-  virtual DataType GetDataType() const = 0;
+  virtual PrimType GetDataType() const = 0;
   /*!
    * \brief Get the name hint of the data producer.
    * \return The data type.
@@ -350,10 +383,18 @@ class DataProducer : public PrimExprConvertible {
  * \param compact If the statement has already bound to a compact buffer.
  * \param memory_scope memory scope of the buffer
  */
-TVM_DLL tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, DataType dtype,
+TVM_DLL tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, DLDataType dtype,
                                                std::string name, int data_alignment,
                                                int offset_factor, bool compact,
                                                std::string memory_scope = "");
+
+inline tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, PrimType dtype,
+                                              std::string name, int data_alignment,
+                                              int offset_factor, bool compact,
+                                              std::string memory_scope = "") {
+  return BufferWithOffsetAlignment(std::move(shape), dtype->dtype, std::move(name), data_alignment,
+                                   offset_factor, compact, std::move(memory_scope));
+}
 }  // namespace tirx
 }  // namespace tvm
 #endif  // TVM_TIR_BUFFER_H_
diff --git a/include/tvm/tirx/expr.h b/include/tvm/tirx/expr.h
index cd51108b0d23..bf4c9004e84d 100644
--- a/include/tvm/tirx/expr.h
+++ b/include/tvm/tirx/expr.h
@@ -27,13 +27,13 @@
 
 #include <tvm/ffi/container/array.h>
 #include <tvm/ffi/container/map.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/string.h>
 #include <tvm/ir/attrs.h>
 #include <tvm/ir/cow.h>
 #include <tvm/ir/expr.h>
 #include <tvm/ir/node_functor.h>
 #include <tvm/runtime/base.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/tirx/buffer.h>
 #include <tvm/tirx/var.h>
 
@@ -96,7 +96,7 @@ class CastNode : public PrimExprNode {
  */
 class Cast : public PrimExpr {
  public:
-  TVM_DLL Cast(DataType dtype, PrimExpr value, Span span = Span());
+  TVM_DLL Cast(PrimType value_ty, PrimExpr value, Span span = Span());
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Cast, PrimExpr, CastNode);
   TVM_DEFINE_OBJECT_REF_COW_METHOD(CastNode);
 };
@@ -752,9 +752,9 @@ class CallNode : public PrimExprNode {
  */
 class Call : public PrimExpr {
  public:
-  TVM_DLL Call(DataType dtype, RelaxExpr op, ffi::Array<PrimExpr> args, Attrs attrs = Attrs(),
+  TVM_DLL Call(PrimType ret_ty, RelaxExpr op, ffi::Array<PrimExpr> args, Attrs attrs = Attrs(),
                Span span = Span());
-  TVM_DLL Call(DataType dtype, RelaxExpr op, ffi::Array<PrimExpr> args, Span span);
+  TVM_DLL Call(PrimType ret_ty, RelaxExpr op, ffi::Array<PrimExpr> args, Span span);
   TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Call, PrimExpr, CallNode);
   TVM_DEFINE_OBJECT_REF_COW_METHOD(CallNode);
 };
diff --git a/include/tvm/tirx/op.h b/include/tvm/tirx/op.h
index 416aff73ee29..be827b9ef534 100644
--- a/include/tvm/tirx/op.h
+++ b/include/tvm/tirx/op.h
@@ -39,6 +39,7 @@
 #include <algorithm>
 #include <limits>
 #include <type_traits>
+#include <utility>
 
 namespace tvm {
 
@@ -58,34 +59,36 @@ namespace tvm {
 /*!
  * \brief Get the type of the expression under the unified type system.
  *
- * This function could return a more refined type than
- * the runtime type provided by expr->dtype
+ * This function could return a more refined type than the runtime dtype
+ * implied by PrimExpr::ty().
  *
  * \param expr The input parameter.
  * \return The result type.
  *
- * \sa tvm/ir/type.h for discussion about the relation between Type and runtime::DataType.
+ * \sa tvm/ir/type.h for discussion about the relation between Type and DLPack dtype.
  */
 TVM_DLL Type GetType(const PrimExpr& expr);
 
 /*!
- * \brief Get the type corresponding to DataType
- * \param dtype The data type
+ * \brief Get the type corresponding to a runtime DLPack dtype.
+ * \param dtype The runtime dtype.
  * \return The result type
  *
- * \sa tvm/ir/type.h for discussion about the relation between Type and runtime::DataType.
+ * \sa tvm/ir/type.h for discussion about the relation between Type and DLPack dtype.
  */
-TVM_DLL Type GetTypeFromRuntimeDataType(const DataType& dtype);
+TVM_DLL Type GetTypeFromRuntimeDataType(DLDataType dtype);
 
 /*!
- * \brief Get the implied DataType for storing values with type during runtime.
+ * \brief Get the implied DLPack dtype for storing values of the given type at runtime.
  *
  * \param type The input type.
- * \return The result runtime::DataType.
+ * \return The result DLPack dtype.
  *
- * \sa tvm/ir/type.h for discussion about the relation between Type and runtime::DataType.
+ * \sa tvm/ir/type.h for discussion about the relation between Type and DLPack dtype.
  */
-TVM_DLL runtime::DataType GetRuntimeDataType(const Type& type);
+TVM_DLL DLDataType GetRuntimeDLDataType(const Type& type);
+
+inline DLDataType GetRuntimeDataType(const Type& type) { return GetRuntimeDLDataType(type); }
 
 /*!
  * \brief Return the value.
@@ -120,27 +123,27 @@ TVM_DLL PrimExpr break_loop(Span span = Span());
 
 /*!
  * Query the maximum possible value of dtype.
- * \param dtype The data type.
+ * \param dtype The primitive type.
  * \param span The location of this operation in the source.
  * \return the maximum possible value in this format.
  */
-TVM_DLL PrimExpr max_value(const DataType& dtype, Span span = Span());
+TVM_DLL PrimExpr max_value(PrimType dtype, Span span = Span());
 
 /*!
  * Query the minimum possible value of dtype.
- * \param dtype The data type.
+ * \param dtype The primitive type.
  * \param span The location of this operation in the source.
  * \return the minimum possible value in this format.
  */
-TVM_DLL PrimExpr min_value(const DataType& dtype, Span span = Span());
+TVM_DLL PrimExpr min_value(PrimType dtype, Span span = Span());
 
 /*!
  * Get the value of infinity.
- * \param dtype The data type.
+ * \param dtype The primitive type.
  * \param span The location of this operation in the source.
  * \return the infinity value in this format.
  */
-TVM_DLL PrimExpr infinity(const DataType& dtype, Span span = Span());
+TVM_DLL PrimExpr infinity(PrimType dtype, Span span = Span());
 
 /*!
  * \brief cast value to type.
@@ -151,7 +154,7 @@ TVM_DLL PrimExpr infinity(const DataType& dtype, Span span = Span());
  * \return The result expression.
  * \note This function may return value if the type is the same.
  */
-TVM_DLL PrimExpr cast(const DataType& t, PrimExpr value, Span span = Span());
+TVM_DLL PrimExpr cast(PrimType t, PrimExpr value, Span span = Span());
 /*!
  * \brief perform reinterpret cast value to type.
  *
@@ -161,7 +164,7 @@ TVM_DLL PrimExpr cast(const DataType& t, PrimExpr value, Span span = Span());
  * \return The result expression.
  * \note This function may return value if the type is the same.
  */
-TVM_DLL PrimExpr reinterpret(const DataType& t, PrimExpr value, Span span = Span());
+TVM_DLL PrimExpr reinterpret(PrimType t, PrimExpr value, Span span = Span());
 /*!
  * \brief add operator
  *
@@ -691,13 +694,13 @@ TVM_DLL PrimExpr trunc(PrimExpr x, Span span = Span());
 
 /*!
  * \brief Construct a large uint constant by its low 32 bits and high 32bits.
- * \param dtype The final data type.
+ * \param value_ty The final primitive type.
  * \param low The lower 32 bits.
  * \param high The higher 32 bits.
  * \param span The location of this operation in the source.
  * \return The constructed expression.
  */
-TVM_DLL PrimExpr LargeUIntImm(DataType dtype, int64_t low, int64_t high, Span span = Span());
+TVM_DLL PrimExpr LargeUIntImm(PrimType value_ty, int64_t low, int64_t high, Span span = Span());
 
 /*!
  * \brief Execute a multiplication between two Q-numbers x and y
@@ -731,29 +734,35 @@ TVM_DLL PrimExpr q_multiply_shift(PrimExpr x, PrimExpr y, PrimExpr q, PrimExpr s
  */
 TVM_DLL PrimExpr fast_erf_float_expr(PrimExpr arg, int bits);
 
-inline void CheckMathUnaryOpInputDType(const char* op_name, DataType dtype) {
-  TVM_FFI_CHECK(dtype.is_float() || dtype.is_bfloat16(), TypeError)
+inline void CheckMathUnaryOpInputDType(const char* op_name, const PrimType& dtype) {
+  TVM_FFI_CHECK(dtype.code() == DLDataTypeCode::kDLFloat ||
+                    dtype.MatchesElementType(DLDataTypeCode::kDLBfloat, 16),
+                TypeError)
       << "tirx." << op_name << " only supports floating-point inputs, but got " << dtype;
 }
 
 // Intrinsic operators
-#define TVM_DECLARE_INTRIN_UNARY_WITH_CHECK(OpName, CheckInputDType)         \
-  inline PrimExpr OpName(PrimExpr x, Span span = Span()) {                   \
-    static const Op op = Op::Get("tirx." #OpName);                           \
-    CheckInputDType(#OpName, x.dtype());                                     \
-    if (x.dtype().is_bfloat16()) {                                           \
-      DataType bf16_dtype = x.dtype();                                       \
-      DataType fp32_dtype(kDLFloat, 32, bf16_dtype.lanes());                 \
-      PrimExpr x_fp32 = tirx::Cast(fp32_dtype, {x}, span);                   \
-      PrimExpr result_fp32 = tirx::Call(fp32_dtype, op, {x_fp32}, {}, span); \
-      return tirx::Cast(bf16_dtype, {result_fp32}, span);                    \
-    } else {                                                                 \
-      return tirx::Call(x.dtype(), op, {x}, {}, span);                       \
-    }                                                                        \
+#define TVM_DECLARE_INTRIN_UNARY_WITH_CHECK(OpName, CheckInputDType)                        \
+  inline PrimExpr OpName(PrimExpr x, Span span = Span()) {                                  \
+    static const Op op = Op::Get("tirx." #OpName);                                          \
+    PrimType x_ty = x.ty();                                                                 \
+    CheckInputDType(#OpName, x_ty);                                                         \
+    if (x_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {                           \
+      PrimType bf16_ty = x_ty;                                                              \
+      PrimType f32_ty =                                                                     \
+          x_ty.IsScalableVector()                                                           \
+              ? PrimType::ScalableVector(DLDataTypeCode::kDLFloat, 32, x_ty.VScaleFactor()) \
+              : PrimType::Float(32, x_ty.lanes());                                          \
+      PrimExpr x_fp32 = tirx::Cast(f32_ty, x, span);                                        \
+      PrimExpr result_fp32 = tirx::Call(f32_ty, op, {x_fp32}, {}, span);                    \
+      return tirx::Cast(bf16_ty, result_fp32, span);                                        \
+    } else {                                                                                \
+      return tirx::Call(x_ty, op, {x}, {}, span);                                           \
+    }                                                                                       \
   }
 
 #define TVM_DECLARE_INTRIN_UNARY(OpName) \
-  TVM_DECLARE_INTRIN_UNARY_WITH_CHECK(OpName, [](const char*, DataType) {})
+  TVM_DECLARE_INTRIN_UNARY_WITH_CHECK(OpName, [](const char*, const PrimType&) {})
 
 #define TVM_DECLARE_FLOAT_INTRIN_UNARY(OpName) \
   TVM_DECLARE_INTRIN_UNARY_WITH_CHECK(OpName, CheckMathUnaryOpInputDType)
@@ -787,7 +796,7 @@ TVM_DECLARE_INTRIN_UNARY(clz);
 #define TVM_DECLARE_INTRIN_BINARY(OpName)                              \
   inline PrimExpr OpName(PrimExpr x, PrimExpr y, Span span = Span()) { \
     static const Op op = Op::Get("tirx." #OpName);                     \
-    return tirx::Call(x.dtype(), op, {x, y}, {}, span);                \
+    return tirx::Call(x.ty(), op, {x, y}, {}, span);                   \
   }
 
 TVM_DECLARE_INTRIN_BINARY(atan2);
@@ -804,7 +813,7 @@ namespace tirx {
  * \param element_type The corresponding element type.
  * \return The check results
  */
-inline bool IsPointerType(const Type& type, const DataType& element_type) {
+inline bool IsPointerType(const Type& type, DLDataType element_type) {
   if (!type.defined()) return false;
   if (const auto* ptr_type = type.as<PointerTypeNode>()) {
     if (const auto* prim_type = ptr_type->element_type.as<PrimTypeNode>()) {
@@ -832,7 +841,7 @@ inline bool IsPointerType(const Type& type, const DataType& element_type) {
 template <typename ValueType,
           typename = typename std::enable_if<std::is_standard_layout<ValueType>::value &&
                                              std::is_trivial<ValueType>::value>::type>
-inline PrimExpr MakeConst(DataType dtype, ValueType value, Span span = Span());
+inline PrimExpr MakeConst(PrimType dtype, ValueType value, Span span = Span());
 /*!
  * \brief Make a constant handle value.
  * \param value The integer payload to reinterpret as a handle.
@@ -970,9 +979,12 @@ inline bool is_no_op(const tirx::Stmt& stmt) {
 }
 
 template <typename ValueType>
-inline PrimExpr MakeConstScalar(DataType dtype, ValueType value, Span span = Span()) {
-  if (dtype.is_int() || dtype.is_bool()) return IntImm(dtype, static_cast<int64_t>(value), span);
-  if (dtype.is_uint()) {
+inline PrimExpr MakeConstScalar(PrimType dtype, ValueType value, Span span = Span()) {
+  DLDataTypeCode code = dtype.code();
+  if (code == DLDataTypeCode::kDLInt || code == DLDataTypeCode::kDLBool) {
+    return IntImm(dtype, static_cast<int64_t>(value), span);
+  }
+  if (code == DLDataTypeCode::kDLUInt) {
     // Use IntImm if it is a small integer
     uint64_t uval = static_cast<uint64_t>(value);
     if (value < static_cast<ValueType>(0)) {
@@ -986,8 +998,13 @@ inline PrimExpr MakeConstScalar(DataType dtype, ValueType value, Span span = Spa
       return LargeUIntImm(dtype, static_cast<int64_t>(low), static_cast<int64_t>(high), span);
     }
   }
-  if (dtype.is_float() || dtype.is_bfloat16() || dtype.is_float8() || dtype.is_float6() ||
-      dtype.is_float4()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLFloat, DLDataTypeCode::kDLFloat8_e3m4,
+                        DLDataTypeCode::kDLFloat8_e4m3, DLDataTypeCode::kDLFloat8_e4m3b11fnuz,
+                        DLDataTypeCode::kDLFloat8_e4m3fn, DLDataTypeCode::kDLFloat8_e4m3fnuz,
+                        DLDataTypeCode::kDLFloat8_e5m2, DLDataTypeCode::kDLFloat8_e5m2fnuz,
+                        DLDataTypeCode::kDLFloat8_e8m0fnu, DLDataTypeCode::kDLFloat6_e2m3fn,
+                        DLDataTypeCode::kDLFloat6_e3m2fn, DLDataTypeCode::kDLFloat4_e2m1fn) ||
+      dtype.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     return FloatImm(dtype, static_cast<double>(value), span);
   }
   TVM_FFI_THROW(InternalError) << "cannot make const for type " << dtype;
@@ -995,27 +1012,26 @@ inline PrimExpr MakeConstScalar(DataType dtype, ValueType value, Span span = Spa
 }
 
 template <>
-inline PrimExpr MakeConstScalar(DataType dtype, bool value, Span span) {
+inline PrimExpr MakeConstScalar(PrimType dtype, bool value, Span span) {
   return MakeConstScalar(dtype, static_cast<int>(value), span);
 }
 
 template <typename ValueType, typename>
-inline PrimExpr MakeConst(DataType dtype, ValueType value, Span span) {
-  if (dtype.is_scalar()) {
+inline PrimExpr MakeConst(PrimType dtype, ValueType value, Span span) {
+  if (!dtype.IsScalableVector() && !dtype.IsFixedLengthVector()) {
     return MakeConstScalar(dtype, value, span);
-  } else {
-    if (dtype.is_fixed_length_vector()) {
-      return tirx::Broadcast(MakeConstScalar(dtype.element_of(), value, span), dtype.lanes(), span);
-    } else {
-      PrimExpr lanes = tirx::Mul(tirx::Call(DataType::Int(32), tirx::builtin::vscale(), {}),
-                                 dtype.vscale_factor());
-      return tirx::Broadcast(MakeConstScalar(dtype.element_of(), value, span), lanes, span);
-    }
   }
+  PrimType elem_ty = dtype.WithLanes(1);
+  if (dtype.IsFixedLengthVector()) {
+    return tirx::Broadcast(MakeConstScalar(elem_ty, value, span), dtype.lanes(), span);
+  }
+  PrimExpr lanes =
+      tirx::Mul(tirx::Call(PrimType::Int(32), tirx::builtin::vscale(), {}), dtype.VScaleFactor());
+  return tirx::Broadcast(MakeConstScalar(elem_ty, value, span), lanes, span);
 }
 
 inline PrimExpr ConstHandle(int64_t value, Span span) {
-  return reinterpret(DataType::Handle(), IntImm(DataType::UInt(64), value, span));
+  return reinterpret(PrimType::Handle(), IntImm(PrimType::UInt(64), value, span));
 }
 
 }  // namespace tirx
@@ -1027,17 +1043,13 @@ inline PrimExpr ConstHandle(int64_t value, Span span) {
     return a;                                       \
   }
 
-#define TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(Name)                                   \
-  inline PrimExpr Name(const PrimExpr& a, float b) { return Name(a, PrimExpr(b)); } \
-  inline PrimExpr Name(float a, const PrimExpr& b) { return Name(PrimExpr(a), b); } \
-  inline PrimExpr Name(int a, const PrimExpr& b) {                                  \
-    return Name(tirx::MakeConst(b.dtype(), a), b);                                  \
-  }                                                                                 \
-  inline PrimExpr Name(const PrimExpr& a, int b) {                                  \
-    return Name(a, tirx::MakeConst(a.dtype(), b));                                  \
-  }                                                                                 \
-  inline PrimExpr Name(const PrimExpr& a, double b) {                               \
-    return Name(a, FloatImm(DataType::Float(64), b));                               \
+#define TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(Name)                                                \
+  inline PrimExpr Name(const PrimExpr& a, float b) { return Name(a, PrimExpr(b)); }              \
+  inline PrimExpr Name(float a, const PrimExpr& b) { return Name(PrimExpr(a), b); }              \
+  inline PrimExpr Name(int a, const PrimExpr& b) { return Name(tirx::MakeConst(b.ty(), a), b); } \
+  inline PrimExpr Name(const PrimExpr& a, int b) { return Name(a, tirx::MakeConst(a.ty(), b)); } \
+  inline PrimExpr Name(const PrimExpr& a, double b) {                                            \
+    return Name(a, FloatImm(PrimType::Float(64), b));                                            \
   }
 
 #define TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD_SPANNED(Name)                 \
@@ -1048,13 +1060,13 @@ inline PrimExpr ConstHandle(int64_t value, Span span) {
     return Name(PrimExpr(a), b, span);                                    \
   }                                                                       \
   inline PrimExpr Name(int a, const PrimExpr& b, Span span = Span()) {    \
-    return Name(tirx::MakeConst(b.dtype(), a), b, span);                  \
+    return Name(tirx::MakeConst(b.ty(), a), b, span);                     \
   }                                                                       \
   inline PrimExpr Name(const PrimExpr& a, int b, Span span = Span()) {    \
-    return Name(a, tirx::MakeConst(a.dtype(), b), span);                  \
+    return Name(a, tirx::MakeConst(a.ty(), b), span);                     \
   }                                                                       \
   inline PrimExpr Name(const PrimExpr& a, double b, Span span = Span()) { \
-    return Name(a, FloatImm(DataType::Float(64), b), span);               \
+    return Name(a, FloatImm(PrimType::Float(64), b), span);               \
   }
 
 #define TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(Name)                             \
@@ -1069,18 +1081,16 @@ inline PrimExpr ConstHandle(int64_t value, Span span) {
     return Name(PrimExpr(a), b, span);                                  \
   }
 
-#define TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(Name) \
-  inline PrimExpr Name(const PrimExpr& a, int b) { \
-    return Name(a, tirx::MakeConst(a.dtype(), b)); \
-  }                                                \
-  inline PrimExpr Name(int a, const PrimExpr& b) { return Name(tirx::MakeConst(b.dtype(), a), b); }
+#define TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(Name)                                               \
+  inline PrimExpr Name(const PrimExpr& a, int b) { return Name(a, tirx::MakeConst(a.ty(), b)); } \
+  inline PrimExpr Name(int a, const PrimExpr& b) { return Name(tirx::MakeConst(b.ty(), a), b); }
 
 #define TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD_SPANNED(Name)             \
   inline PrimExpr Name(const PrimExpr& a, int b, Span span = Span()) { \
-    return Name(a, tirx::MakeConst(a.dtype(), b), span);               \
+    return Name(a, tirx::MakeConst(a.ty(), b), span);                  \
   }                                                                    \
   inline PrimExpr Name(int a, const PrimExpr& b, Span span = Span()) { \
-    return Name(tirx::MakeConst(b.dtype(), a), b, span);               \
+    return Name(tirx::MakeConst(b.ty(), a), b, span);                  \
   }
 
 TVM_DEFINE_ASSIGN_OP_OVERLOAD(operator+=, operator+);
diff --git a/include/tvm/tirx/script/builder/ir.h b/include/tvm/tirx/script/builder/ir.h
index ad18d7ac4001..684653134a55 100644
--- a/include/tvm/tirx/script/builder/ir.h
+++ b/include/tvm/tirx/script/builder/ir.h
@@ -57,7 +57,7 @@ using tvm::tirx::Var;
  * \param axis_separators The separators between input axes when generating flattened output axes.
  * \return The declared buffer.
  */
-Buffer BufferDecl(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String buffer_name,
+Buffer BufferDecl(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String buffer_name,
                   ffi::Optional<Var> data, ffi::Optional<ffi::Array<PrimExpr>> strides,
                   ffi::Optional<PrimExpr> elem_offset, ffi::String storage_scope, int align,
                   int offset_factor, ffi::String buffer_type,
@@ -122,7 +122,7 @@ Type FuncRet(Type ret_type);
  * \return The matched buffer.
  */
 Buffer MatchBuffer(ffi::ObjectRef param, ffi::Array<PrimExpr> shape,
-                   DataType dtype = DataType::Float(32), ffi::Optional<Var> data = std::nullopt,
+                   PrimType dtype = PrimType::Float(32), ffi::Optional<Var> data = std::nullopt,
                    ffi::Array<PrimExpr> strides = {}, PrimExpr elem_offset = PrimExpr(),
                    ffi::String storage_scope = "global", int align = -1, int offset_factor = 0,
                    ffi::String buffer_type = "default",
@@ -197,7 +197,7 @@ void BlockAttrs(ffi::Map<ffi::String, ffi::Any> attrs);
  * T.prim_func(tirx=True).
  */
 ffi::Variant<Buffer, AllocBufferFrame> SBlockAllocBuffer(
-    ffi::Array<PrimExpr> shape, DataType dtype = DataType::Float(32),
+    ffi::Array<PrimExpr> shape, PrimType dtype = PrimType::Float(32),
     ffi::Optional<Var> data = std::nullopt, ffi::Array<PrimExpr> strides = {},
     PrimExpr elem_offset = PrimExpr(), ffi::String storage_scope = "", int align = -1,
     int offset_factor = 0, ffi::String buffer_type = "default",
@@ -213,7 +213,7 @@ namespace axis {
  * \param dtype The data type of the iteration variable.
  * \return The iteration variable.
  */
-Var Spatial(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+Var Spatial(Range dom, PrimExpr binding, PrimType dtype = PrimType::Int(32));
 
 /*!
  * \brief The reduced block axis defining function.
@@ -222,7 +222,7 @@ Var Spatial(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
  * \param dtype The data type of the iteration variable.
  * \return The iteration variable.
  */
-Var Reduce(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+Var Reduce(Range dom, PrimExpr binding, PrimType dtype = PrimType::Int(32));
 
 /*!
  * \brief The scanning block axis defining function.
@@ -231,7 +231,7 @@ Var Reduce(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
  * \param dtype The data type of the iteration variable.
  * \return The iteration variable.
  */
-Var Scan(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+Var Scan(Range dom, PrimExpr binding, PrimType dtype = PrimType::Int(32));
 
 /*!
  * \brief The opaque block axis defining function.
@@ -240,7 +240,7 @@ Var Scan(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
  * \param dtype The data type of the iteration variable.
  * \return The iteration variable.
  */
-Var Opaque(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
+Var Opaque(Range dom, PrimExpr binding, PrimType dtype = PrimType::Int(32));
 
 /*!
  * \brief The block axis remapping function.
@@ -250,7 +250,7 @@ Var Opaque(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32));
  * \return The iteration variables.
  */
 ffi::Array<Var> Remap(ffi::String kinds, ffi::Array<PrimExpr> bindings,
-                      DataType dtype = DataType::Int(32));
+                      PrimType dtype = PrimType::Int(32));
 
 }  // namespace axis
 
@@ -412,7 +412,7 @@ ElseFrame Else();
  * \param layout The layout of the buffer.
  * \return The declaration frame.
  */
-DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String buffer_name,
+DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String buffer_name,
                            ffi::Optional<Var> data, ffi::Optional<ffi::Array<PrimExpr>> strides,
                            ffi::Optional<PrimExpr> elem_offset, ffi::String storage_scope,
                            int align, int offset_factor, ffi::String buffer_type,
@@ -428,7 +428,7 @@ DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::Stri
  * \param annotations Optional annotations for the allocation.
  * \return The allocated buffer.
  */
-Buffer AllocBuffer(ffi::Array<PrimExpr> shape, DataType dtype = DataType::Float(32),
+Buffer AllocBuffer(ffi::Array<PrimExpr> shape, PrimType dtype = PrimType::Float(32),
                    ffi::String storage_scope = "global",
                    ffi::Optional<ffi::Map<ffi::String, ffi::Any>> annotations = std::nullopt);
 
@@ -465,7 +465,7 @@ ComposeOpFrame ComposeOp(ffi::Map<ffi::String, Buffer> workspace,
  * \param dtype The data type of the variable.
  * \return The result variable which gets bound to the thread env.
  */
-Var EnvThread(ffi::String thread_tag, DataType dtype = DataType::Int(32));
+Var EnvThread(ffi::String thread_tag, PrimType dtype = PrimType::Int(32));
 
 /*!
  * \brief Store data in a buffer.
@@ -494,21 +494,20 @@ void Evaluate(PrimExpr value);
  * \param is_size_var Whether the pointer is a size var.
  *
  * \param is_unknown_type Used to distinguish between
- * `PrimType(DataType::Handle())` and
- * `PointerType(PrimType(DataType::Void()))`.  If true, resolve dtype
+ * `PrimType::Handle()` and `PointerType(PrimType(DLDataType{kDLOpaqueHandle, 0, 0}))`.
+ * If true, resolve dtype
  * of `Void()` as `PrimType`, and if false resolve dtype of `Void()`
  * as a `PointerType`.
  *
  * \return The pointer.
  */
-inline Var Handle(runtime::DataType dtype = runtime::DataType::Void(),
-                  ffi::String storage_scope = "global", bool is_size_var = false,
-                  bool is_unknown_type = false) {
+inline Var Handle(PrimType dtype = PrimType::Handle(), ffi::String storage_scope = "global",
+                  bool is_size_var = false, bool is_unknown_type = false) {
   Type type_annotation{nullptr};
   if (is_unknown_type && storage_scope == "global") {
-    type_annotation = PrimType(runtime::DataType::Handle());
+    type_annotation = PrimType::Handle();
   } else {
-    type_annotation = PointerType(PrimType(dtype), storage_scope);
+    type_annotation = PointerType(dtype, storage_scope);
   }
   return is_size_var ? tvm::tirx::SizeVar("", type_annotation)
                      : tvm::tirx::Var("", type_annotation);
@@ -519,67 +518,67 @@ inline Var TensorMap() { return tvm::tirx::Var("", PointerType(TensorMapType()))
 #define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName, DType)                                 \
   inline PrimExpr FuncName(ffi::Optional<PrimExpr> expr = std::nullopt,                     \
                            bool is_size_var = false) {                                      \
-    DataType dtype = DType;                                                                 \
+    PrimType dtype(DType);                                                                  \
     return expr.defined()                                                                   \
                ? tvm::cast(dtype, expr.value())                                             \
                : (is_size_var ? tvm::tirx::SizeVar("", dtype) : tvm::tirx::Var("", dtype)); \
   }
 
-#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(DType, FDType) \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##8, FDType(8));      \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##16, FDType(16));    \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##32, FDType(32));    \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##64, FDType(64));
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(BFloat, DataType::BFloat);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(Float, DataType::Float);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(UInt, DataType::UInt);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(Int, DataType::Int);
-
-#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(FuncName, FDType, Size) \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x2, FDType(Size, 2))      \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x4, FDType(Size, 4));     \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x8, FDType(Size, 8));     \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x16, FDType(Size, 16));   \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x32, FDType(Size, 32));   \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x64, FDType(Size, 64));
-
-#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(DType, FDType) \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##8, FDType, 8);      \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##16, FDType, 16);    \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##32, FDType, 32);    \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##64, FDType, 64);
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(BFloat, DataType::BFloat);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(Float, DataType::Float);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(UInt, DataType::UInt);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(Int, DataType::Int);
-
-#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(DType, FDType) \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType, FDType(1));                    \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x2, FDType(2));                \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x4, FDType(4));                \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x8, FDType(8));                \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x16, FDType(16));              \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x32, FDType(32));              \
-  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x64, FDType(64));
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E3M4, DataType::Float8E3M4);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3, DataType::Float8E4M3);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3B11FNUZ, DataType::Float8E4M3B11FNUZ);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3FN, DataType::Float8E4M3FN);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3FNUZ, DataType::Float8E4M3FNUZ);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E5M2, DataType::Float8E5M2);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E5M2FNUZ, DataType::Float8E5M2FNUZ);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E8M0FNU, DataType::Float8E8M0FNU);
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float6E2M3FN, DataType::Float6E2M3FN);
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float6E3M2FN, DataType::Float6E3M2FN);
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float4E2M1FN, DataType::Float4E2M1FN);
-
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(Boolean, DataType::Bool());
-TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(Void, DataType::Void());
+#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(DType, Code)               \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##8, (DLDataType{Code, 8, 1}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##16, (DLDataType{Code, 16, 1})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##32, (DLDataType{Code, 32, 1})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##64, (DLDataType{Code, 64, 1}));
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(BFloat, kDLBfloat);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(Float, kDLFloat);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(UInt, kDLUInt);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES(Int, kDLInt);
+
+#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(FuncName, Code, Size)             \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x2, (DLDataType{Code, Size, 2}))    \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x4, (DLDataType{Code, Size, 4}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x8, (DLDataType{Code, Size, 8}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x16, (DLDataType{Code, Size, 16})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x32, (DLDataType{Code, Size, 32})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(FuncName##x64, (DLDataType{Code, Size, 64}));
+
+#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(DType, Code) \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##8, Code, 8);      \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##16, Code, 16);    \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##32, Code, 32);    \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES(DType##64, Code, 64);
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(BFloat, kDLBfloat);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(Float, kDLFloat);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(UInt, kDLUInt);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_SIZES_LANES(Int, kDLInt);
+
+#define TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(DType, Code, Bits)  \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType, (DLDataType{Code, Bits, 1}));       \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x2, (DLDataType{Code, Bits, 2}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x4, (DLDataType{Code, Bits, 4}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x8, (DLDataType{Code, Bits, 8}));   \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x16, (DLDataType{Code, Bits, 16})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x32, (DLDataType{Code, Bits, 32})); \
+  TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(DType##x64, (DLDataType{Code, Bits, 64}));
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E3M4, kDLFloat8_e3m4, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3, kDLFloat8_e4m3, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3B11FNUZ, kDLFloat8_e4m3b11fnuz, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3FN, kDLFloat8_e4m3fn, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E4M3FNUZ, kDLFloat8_e4m3fnuz, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E5M2, kDLFloat8_e5m2, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E5M2FNUZ, kDLFloat8_e5m2fnuz, 8);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float8E8M0FNU, kDLFloat8_e8m0fnu, 8);
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float6E2M3FN, kDLFloat6_e2m3fn, 6);
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float6E3M2FN, kDLFloat6_e3m2fn, 6);
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST_LANES_FIXED_SIZE(Float4E2M1FN, kDLFloat4_e2m1fn, 4);
+
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(Boolean, (DLDataType{kDLBool, 8, 1}));
+TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST(Void, (DLDataType{kDLOpaqueHandle, 0, 0}));
 
 #undef TVM_TIRX_IR_BUILDER_DEF_DTYPE_CAST
 
diff --git a/include/tvm/tirx/stmt.h b/include/tvm/tirx/stmt.h
index 1ed4d5acac54..7eb004f8cf25 100644
--- a/include/tvm/tirx/stmt.h
+++ b/include/tvm/tirx/stmt.h
@@ -1282,7 +1282,7 @@ inline bool IsPragmaKey(const std::string& attr_key) {
  * \param span The location of this object in the source code.
  * \return Expr a expression with dtype.
  */
-TVM_DLL PrimExpr TypeAnnotation(DataType dtype, Span span = Span());
+TVM_DLL PrimExpr TypeAnnotation(PrimType dtype, Span span = Span());
 
 // overload printing of for type.
 TVM_DLL std::ostream& operator<<(std::ostream& os, ForKind kind);
diff --git a/include/tvm/tirx/var.h b/include/tvm/tirx/var.h
index 8c536ef0d668..3a4746a3f6a2 100644
--- a/include/tvm/tirx/var.h
+++ b/include/tvm/tirx/var.h
@@ -24,9 +24,9 @@
 #ifndef TVM_TIR_VAR_H_
 #define TVM_TIR_VAR_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ir/cow.h>
 #include <tvm/ir/expr.h>
-#include <tvm/runtime/data_type.h>
 
 #include <functional>
 #include <string>
@@ -57,7 +57,7 @@ class VarNode : public PrimExprNode {
    *
    * It is an optional field that provides a refined type of the variable than dtype.
    *
-   * \sa tvm/ir/type.h for discussion of relations between runtime::DataType and Type.
+   * \sa tvm/ir/type.h for discussion of relations between DLPack dtype and Type.
    */
   Type type_annotation;
 
@@ -84,7 +84,7 @@ class Var : public PrimExpr {
    * \param dtype data type
    * \param span The location of this object in the source code.
    */
-  TVM_DLL explicit Var(ffi::String name_hint = "v", DataType dtype = DataType::Int(32),
+  TVM_DLL explicit Var(ffi::String name_hint = "v", PrimType dtype = PrimType::Int(32),
                        Span span = Span());
   /*!
    * \brief Constructor which provides a more detailed type annotation.
@@ -110,7 +110,7 @@ class Var : public PrimExpr {
    * \param dtype The specified dtype
    * \return The new variable
    */
-  TVM_DLL Var copy_with_dtype(DataType dtype) const;
+  TVM_DLL Var copy_with_dtype(PrimType dtype) const;
 
   /*!
    * \brief Get pointer to the internal value.
@@ -150,7 +150,7 @@ class SizeVar : public Var {
    * \param t data type
    * \param span The location of this object in the source code.
    */
-  TVM_DLL explicit SizeVar(ffi::String name_hint = "s", DataType t = DataType::Int(32),
+  TVM_DLL explicit SizeVar(ffi::String name_hint = "s", PrimType t = PrimType::Int(32),
                            Span span = Span());
   /*!
    * \brief Constructor which provides a more detailed type annotation.
diff --git a/include/tvm/topi/broadcast.h b/include/tvm/topi/broadcast.h
index b0c6ac8f6722..26bf7c100ca5 100644
--- a/include/tvm/topi/broadcast.h
+++ b/include/tvm/topi/broadcast.h
@@ -252,7 +252,8 @@ TOPI_DEFINE_BCAST_OP(divide, { return div(a, b); });
  * \return The result.
  */
 TOPI_DEFINE_BCAST_OP(floor_divide, {
-  if (a.dtype().is_int() || a.dtype().is_uint()) {
+  PrimType a_ty = a.ty();
+  if (a_ty.code() == DLDataTypeCode::kDLInt || a_ty.code() == DLDataTypeCode::kDLUInt) {
     return floordiv(a, b);
   } else {
     return floor(div(a, b));
@@ -287,7 +288,8 @@ TOPI_DEFINE_BCAST_OP(log_add_exp, { return logaddexp(a, b); });
  * \return The result.
  */
 TOPI_DEFINE_BCAST_OP(trunc_divide, {
-  if (a.dtype().is_int() || a.dtype().is_uint()) {
+  PrimType a_ty = a.ty();
+  if (a_ty.code() == DLDataTypeCode::kDLInt || a_ty.code() == DLDataTypeCode::kDLUInt) {
     return truncdiv(a, b);
   } else {
     return trunc(div(a, b));
@@ -319,7 +321,8 @@ TOPI_DEFINE_BCAST_OP(mod, { return truncmod(a, b); });
  * \return The result.
  */
 TOPI_DEFINE_BCAST_OP(floor_mod, {
-  if (a.dtype().is_int() || a.dtype().is_uint()) {
+  PrimType a_ty = a.ty();
+  if (a_ty.code() == DLDataTypeCode::kDLInt || a_ty.code() == DLDataTypeCode::kDLUInt) {
     return floormod(a, b);
   } else {
     return a - floor_divide(a, b) * b;
@@ -338,7 +341,8 @@ TOPI_DEFINE_BCAST_OP(floor_mod, {
  * \return The result.
  */
 TOPI_DEFINE_BCAST_OP(trunc_mod, {
-  if (a.dtype().is_int() || a.dtype().is_uint()) {
+  PrimType a_ty = a.ty();
+  if (a_ty.code() == DLDataTypeCode::kDLInt || a_ty.code() == DLDataTypeCode::kDLUInt) {
     return truncmod(a, b);
   } else {
     return a - trunc_divide(a, b) * b;
diff --git a/include/tvm/topi/contrib/cublas.h b/include/tvm/topi/contrib/cublas.h
index 3590b7a54458..18ad4320f489 100644
--- a/include/tvm/topi/contrib/cublas.h
+++ b/include/tvm/topi/contrib/cublas.h
@@ -48,7 +48,7 @@ inline Tensor cublas_matmul(const Tensor& lhs, const Tensor& rhs, bool transa, b
   auto m = transb ? rhs->shape[0] : rhs->shape[1];
 
   return make_extern(
-      {{n, m}}, {lhs->dtype}, {lhs, rhs},
+      {{n, m}}, {lhs->GetDataType()}, {lhs, rhs},
       [&](ffi::Array<Buffer> ins, ffi::Array<Buffer> outs) {
         return call_packed({StringImm("tvm.contrib.cublas.matmul"), pack_buffer(ins[0]),
                             pack_buffer(ins[1]), pack_buffer(outs[0]), transa, transb});
@@ -73,7 +73,7 @@ inline Tensor cublas_batch_matmul(const Tensor& lhs, const Tensor& rhs, bool tra
   auto m = transb ? rhs->shape[1] : rhs->shape[2];
 
   return make_extern(
-      {{b, n, m}}, {lhs->dtype}, {lhs, rhs},
+      {{b, n, m}}, {lhs->GetDataType()}, {lhs, rhs},
       [&](ffi::Array<Buffer> ins, ffi::Array<Buffer> outs) {
         return call_packed({StringImm("tvm.contrib.cublas.batch_matmul"), pack_buffer(ins[0]),
                             pack_buffer(ins[1]), pack_buffer(outs[0]), transa, transb});
diff --git a/include/tvm/topi/detail/broadcast.h b/include/tvm/topi/detail/broadcast.h
index c9dce9eb7489..e5984fd1d787 100644
--- a/include/tvm/topi/detail/broadcast.h
+++ b/include/tvm/topi/detail/broadcast.h
@@ -42,10 +42,12 @@ struct BroadcastHelper {
   std::deque<tvm::tirx::Var> vars2;
 };
 
-static inline DataType CommonType(DataType type1, DataType type2) {
-  TVM_FFI_ICHECK(type1.is_scalar() && type2.is_scalar());
+static inline PrimType CommonType(const PrimType& type1, const PrimType& type2) {
+  TVM_FFI_ICHECK(!type1.IsScalableVector() && !type2.IsScalableVector());
+  TVM_FFI_ICHECK_EQ(type1.lanes(), 1);
+  TVM_FFI_ICHECK_EQ(type2.lanes(), 1);
   TVM_FFI_ICHECK(type1.code() == type2.code());
-  return DataType(type1.code(), std::max(type1.bits(), type2.bits()), /*lanes=*/1);
+  return type1.bits() < type2.bits() ? type1.WithBits(type2.bits()) : type1;
 }
 
 inline BroadcastHelper BroadcastShape(const tvm::ffi::Array<tvm::PrimExpr>& shape1,
@@ -56,15 +58,15 @@ inline BroadcastHelper BroadcastShape(const tvm::ffi::Array<tvm::PrimExpr>& shap
   tvm::PrimExpr one(1);
   int i;
 
-  auto cast_if_needed = [](DataType to_type, PrimExpr expr) {
-    return to_type != expr.dtype() ? cast(to_type, expr) : expr;
+  auto cast_if_needed = [](PrimType to_type, PrimExpr expr) {
+    return to_type->dtype == expr.ty()->dtype ? expr : cast(to_type, expr);
   };
 
   for (i = 1; i <= std::min(s1_size, s2_size); ++i) {
     // TODO(@icemelon9): Need to revisit this part
     const IntImmNode* static_size1 = shape1[s1_size - i].as<IntImmNode>();
     const IntImmNode* static_size2 = shape2[s2_size - i].as<IntImmNode>();
-    DataType common_type = CommonType(shape1[s1_size - i].dtype(), shape2[s2_size - i].dtype());
+    PrimType common_type = CommonType(shape1[s1_size - i].ty(), shape2[s2_size - i].ty());
 
     bh.all_vars.push_front(tvm::tirx::Var("dim", common_type));
     if (topi::detail::EqualCheck(shape1[s1_size - i], shape2[s2_size - i])) {
@@ -104,7 +106,7 @@ inline BroadcastHelper BroadcastShape(const tvm::ffi::Array<tvm::PrimExpr>& shap
   auto& shape = (s1_size > s2_size) ? shape1 : shape2;
   auto& vars = (s1_size > s2_size) ? bh.vars1 : bh.vars2;
   for (; i <= max_size; ++i) {
-    bh.all_vars.push_front(tvm::tirx::Var("v", shape[max_size - 1].dtype()));
+    bh.all_vars.push_front(tvm::tirx::Var("v", shape[max_size - i].ty()));
     bh.common_shape.push_front(shape[max_size - i]);
     vars.push_front(bh.all_vars[0]);
   }
@@ -130,7 +132,7 @@ inline tvm::ffi::Array<tvm::PrimExpr> InputIndexFromBroadcast(
     // Only inject 0 here if we have not yet reached the dimension of I
     // (i.e. this must be a 1)
     if (!found && (ovars.size() - i) <= expected_dims) {
-      ivars.push_back(tvm::IntImm(ovars[i].dtype(), 0));
+      ivars.push_back(tvm::IntImm(ovars[i].ty(), 0));
     }
   }
   TVM_FFI_ICHECK(expected_dims == ivars.size());
diff --git a/include/tvm/topi/detail/extern.h b/include/tvm/topi/detail/extern.h
index 161d5291c38e..b0ce2d713bee 100644
--- a/include/tvm/topi/detail/extern.h
+++ b/include/tvm/topi/detail/extern.h
@@ -28,6 +28,7 @@
 #include <tvm/tirx/builtin.h>
 
 #include <string>
+#include <utility>
 #include <vector>
 
 namespace tvm {
@@ -61,7 +62,7 @@ using FExtern = std::function<PrimExpr(ffi::Array<Buffer>, ffi::Array<Buffer>)>;
  * element of out_types.
  */
 inline ffi::Array<Tensor> make_extern(const ffi::Array<ffi::Array<PrimExpr>>& out_shapes,
-                                      const std::vector<DataType>& out_types,
+                                      const std::vector<PrimType>& out_types,
                                       const ffi::Array<Tensor>& inputs, FExtern fextern,
                                       std::string name, std::string tag,
                                       ::tvm::ffi::Map<ffi::String, ffi::Any> attrs) {
@@ -100,10 +101,10 @@ inline ffi::Array<Tensor> make_extern(const ffi::Array<ffi::Array<PrimExpr>>& ou
 inline PrimExpr pack_buffer(Buffer buf) {
   TVM_FFI_ICHECK_GT(buf->shape.size(), 0) << "buf shape must have at least one element";
   auto shape =
-      tvm::tirx::Call(DataType::Handle(), tvm::tirx::builtin::tvm_stack_make_shape(), buf->shape);
+      tvm::tirx::Call(PrimType::Handle(), tvm::tirx::builtin::tvm_stack_make_shape(), buf->shape);
   PrimExpr strides;
   if (buf->strides.size() > 0) {
-    strides = tvm::tirx::Call(DataType::Handle(), tvm::tirx::builtin::tvm_stack_make_shape(),
+    strides = tvm::tirx::Call(PrimType::Handle(), tvm::tirx::builtin::tvm_stack_make_shape(),
                               buf->strides);
   } else {
     strides = 0;
@@ -112,9 +113,9 @@ inline PrimExpr pack_buffer(Buffer buf) {
                                  shape,
                                  strides,
                                  IntImm::Int32(static_cast<int64_t>(buf->shape.size())),
-                                 MakeConst(buf->dtype, 0),
+                                 MakeConst(PrimType(buf->dtype), 0),
                                  buf->elem_offset};
-  return tvm::tirx::Call(DataType::Handle(), tvm::tirx::builtin::tvm_stack_make_array(), pack_args);
+  return tvm::tirx::Call(PrimType::Handle(), tvm::tirx::builtin::tvm_stack_make_array(), pack_args);
 }
 
 /*!
@@ -127,7 +128,7 @@ inline PrimExpr pack_buffer(Buffer buf) {
  * \return An expression representing the invocation
  */
 inline PrimExpr call_packed(ffi::Array<PrimExpr> args) {
-  return tvm::tirx::Call(DataType::Int(32), tvm::tirx::builtin::tvm_call_packed(), args);
+  return tvm::tirx::Call(PrimType::Int(32), tvm::tirx::builtin::tvm_call_packed(), args);
 }
 
 }  // namespace detail
diff --git a/include/tvm/topi/detail/strided_slice.h b/include/tvm/topi/detail/strided_slice.h
index 19ee79a2086f..95ab3a38cbc0 100644
--- a/include/tvm/topi/detail/strided_slice.h
+++ b/include/tvm/topi/detail/strided_slice.h
@@ -91,7 +91,7 @@ inline ffi::Array<PrimExpr> StridedSliceCanonicalizeBegin(const ffi::Array<PrimE
                                                           const std::vector<int64_t>& begin,
                                                           const std::vector<int64_t>& strides,
                                                           const ffi::Array<int64_t>& axes,
-                                                          DataType dtype,
+                                                          PrimType dtype,
                                                           std::string slice_mode = "end") {
   ffi::Array<PrimExpr> begin_expr;
   for (size_t i = 0; i < axes.size(); ++i) {
@@ -140,9 +140,9 @@ inline ffi::Array<PrimExpr> StridedSliceOutputShape(
           static_cast<int>((interval + std::abs(strides[i]) - 1) / std::abs(strides[i]));
       TVM_FFI_ICHECK(strides[i] < 0 ? (end_i <= begin_i) : (begin_i <= end_i))
           << ": Input [Begin=" << begin[i] << ", End=" << end[i] << "] is invalid for axis=" << i;
-      out_shape.Set(ax, cast(out_shape[i].dtype(), PrimExpr(slice_size)));
+      out_shape.Set(ax, cast(out_shape[i].ty(), PrimExpr(slice_size)));
     } else {
-      out_shape.Set(ax, tvm::tirx::Var("dim", out_shape[i]->dtype));
+      out_shape.Set(ax, tvm::tirx::Var("dim", out_shape[i].ty()));
     }
   }
 
diff --git a/include/tvm/topi/detail/tensor_utils.h b/include/tvm/topi/detail/tensor_utils.h
index d67ad6359434..82649cd0b387 100644
--- a/include/tvm/topi/detail/tensor_utils.h
+++ b/include/tvm/topi/detail/tensor_utils.h
@@ -70,10 +70,10 @@ inline PrimExpr bilinear_sample_nchw(const Tensor& input, const ffi::Array<PrimE
   auto in_y = indices[2];
   auto in_x = indices[3];
 
-  auto y_low = tvm::cast(DataType::Int(32), tvm::floor(in_y));
+  auto y_low = tvm::cast(PrimType::Int(32), tvm::floor(in_y));
   auto y_high = y_low + 1;
 
-  auto x_low = tvm::cast(DataType::Int(32), tvm::floor(in_x));
+  auto x_low = tvm::cast(PrimType::Int(32), tvm::floor(in_x));
   auto x_high = x_low + 1;
 
   auto wy_h = in_y - y_low;
@@ -114,10 +114,10 @@ inline PrimExpr bilinear_sample_nhwc(const Tensor& input, const ffi::Array<PrimE
   auto in_y = indices[1];
   auto in_x = indices[2];
 
-  auto y_low = tvm::cast(DataType::Int(32), tvm::floor(in_y));
+  auto y_low = tvm::cast(PrimType::Int(32), tvm::floor(in_y));
   auto y_high = y_low + 1;
 
-  auto x_low = tvm::cast(DataType::Int(32), tvm::floor(in_x));
+  auto x_low = tvm::cast(PrimType::Int(32), tvm::floor(in_x));
   auto x_high = x_low + 1;
 
   auto wy_h = in_y - y_low;
diff --git a/include/tvm/topi/elemwise.h b/include/tvm/topi/elemwise.h
index 57225af9b493..b47204b46c25 100644
--- a/include/tvm/topi/elemwise.h
+++ b/include/tvm/topi/elemwise.h
@@ -82,22 +82,23 @@ TOPI_DECLARE_UNARY_OP(isinf);
 inline Tensor fast_tanh_float(const Tensor& in, std::string name, std::string tag) {
   // Clamp the inputs to the range [-9, 9] since anything outside
   // this range is +/-1.0f in single-precision.
-  auto x = maximum(MakeConst(in->dtype, -9.0), minimum(MakeConst(in->dtype, 9.0), in));
+  PrimType input_type = in->GetDataType();
+  auto x = maximum(MakeConst(input_type, -9.0), minimum(MakeConst(input_type, 9.0), in));
 
   // The monomial coefficients of the numerator polynomial (odd).
-  auto alpha_1 = MakeConst(in->dtype, 4.89352455891786e-03);
-  auto alpha_3 = MakeConst(in->dtype, 6.37261928875436e-04);
-  auto alpha_5 = MakeConst(in->dtype, 1.48572235717979e-05);
-  auto alpha_7 = MakeConst(in->dtype, 5.12229709037114e-08);
-  auto alpha_9 = MakeConst(in->dtype, -8.60467152213735e-11);
-  auto alpha_11 = MakeConst(in->dtype, 2.00018790482477e-13);
-  auto alpha_13 = MakeConst(in->dtype, -2.76076847742355e-16);
+  auto alpha_1 = MakeConst(input_type, 4.89352455891786e-03);
+  auto alpha_3 = MakeConst(input_type, 6.37261928875436e-04);
+  auto alpha_5 = MakeConst(input_type, 1.48572235717979e-05);
+  auto alpha_7 = MakeConst(input_type, 5.12229709037114e-08);
+  auto alpha_9 = MakeConst(input_type, -8.60467152213735e-11);
+  auto alpha_11 = MakeConst(input_type, 2.00018790482477e-13);
+  auto alpha_13 = MakeConst(input_type, -2.76076847742355e-16);
 
   // The monomial coefficients of the denominator polynomial (even).
-  auto beta_0 = MakeConst(in->dtype, 4.89352518554385e-03);
-  auto beta_2 = MakeConst(in->dtype, 2.26843463243900e-03);
-  auto beta_4 = MakeConst(in->dtype, 1.18534705686654e-04);
-  auto beta_6 = MakeConst(in->dtype, 1.19825839466702e-06);
+  auto beta_0 = MakeConst(input_type, 4.89352518554385e-03);
+  auto beta_2 = MakeConst(input_type, 2.26843463243900e-03);
+  auto beta_4 = MakeConst(input_type, 1.18534705686654e-04);
+  auto beta_6 = MakeConst(input_type, 1.19825839466702e-06);
 
   return compute(
       x->shape,
@@ -130,7 +131,7 @@ inline Tensor fast_tanh_float(const Tensor& in, std::string name, std::string ta
  */
 inline Tensor fast_tanh(const Tensor& x, std::string name = "T_fast_tanh",
                         std::string tag = kElementWise) {
-  if (x->dtype == DataType::Float(32)) {
+  if (x->GetDataType().MatchesElementType(DLDataTypeCode::kDLFloat, 32)) {
     // invoke fast_tanh_float implementation
     return fast_tanh_float(x, name, tag);
   } else {
@@ -209,9 +210,10 @@ inline Tensor sign(const Tensor& x, std::string name = "T_sign", std::string tag
   return compute(
       x->shape,
       [&](const ffi::Array<Var>& i) {
-        PrimExpr zero = MakeConst(x->dtype, 0);
-        PrimExpr one = MakeConst(x->dtype, 1);
-        PrimExpr minus_one = MakeConst(x->dtype, -1);
+        PrimType x_type(x->GetDataType());
+        PrimExpr zero = MakeConst(x_type, 0);
+        PrimExpr one = MakeConst(x_type, 1);
+        PrimExpr minus_one = MakeConst(x_type, -1);
         auto s1 = tvm::tirx::Select((x(i) < zero), minus_one, zero);
         auto s2 = tvm::tirx::Select((x(i) > zero), one, s1);
         return s2;
@@ -232,7 +234,7 @@ inline Tensor rsqrt(const Tensor& x, std::string name = "tensor", std::string ta
   return compute(
       x->shape,
       [&](const ffi::Array<Var>& i) {
-        PrimExpr one = MakeConst(x->dtype, 1);
+        PrimExpr one = MakeConst(x->GetDataType(), 1);
         return one / tvm::sqrt(x(i));
       },
       name, tag);
@@ -255,8 +257,9 @@ inline Tensor clip(const Tensor& x, const PrimExpr& a_min, const PrimExpr& a_max
   return compute(
       x->shape,
       [&](const ffi::Array<Var>& i) {
-        auto min_val = tvm::cast(x->dtype, a_min);
-        auto max_val = tvm::cast(x->dtype, a_max);
+        PrimType x_type(x->GetDataType());
+        auto min_val = tvm::cast(x_type, a_min);
+        auto max_val = tvm::cast(x_type, a_max);
         return tvm::max(tvm::min(x(i), max_val), min_val);  // NOLINT(*)
       },
       name, tag);
@@ -274,16 +277,24 @@ inline Tensor clip(const Tensor& x, const PrimExpr& a_min, const PrimExpr& a_max
  *
  * \return A Tensor whose op member is the cast operation
  */
-inline Tensor cast(const Tensor& x, DataType type, std::string name = "T_cast",
+inline Tensor cast(const Tensor& x, PrimType type, std::string name, std::string tag);
+
+inline Tensor cast(const Tensor& x, DLDataType type, std::string name = "T_cast",
+                   std::string tag = kElementWise) {
+  return cast(x, PrimType(type), std::move(name), std::move(tag));
+}
+
+inline Tensor cast(const Tensor& x, PrimType type, std::string name = "T_cast",
                    std::string tag = kElementWise) {
   return compute(
       x->shape,
       [&](const ffi::Array<Var>& i) -> PrimExpr {
         auto expr = x(i);
-        if (expr.dtype().code() == type.code() && expr.dtype().bits() == type.bits()) {
-          if (expr.dtype().lanes() == type.lanes()) {
+        PrimType expr_ty = expr.ty();
+        if (expr_ty.MatchesElementType(type.code(), type.bits())) {
+          if (expr_ty.lanes() == type.lanes()) {
             return expr;
-          } else if (expr.dtype().lanes() == 1 && type.is_vector()) {
+          } else if (expr_ty.lanes() == 1 && type.IsFixedLengthVector()) {
             return tvm::tirx::Broadcast(expr, type.lanes());
           }
         }
@@ -303,7 +314,14 @@ inline Tensor cast(const Tensor& x, DataType type, std::string name = "T_cast",
  *
  * \return A Tensor whose op member is the reinterpret operation
  */
-inline Tensor reinterpret(const Tensor& x, DataType type, std::string name = "tensor",
+inline Tensor reinterpret(const Tensor& x, PrimType type, std::string name, std::string tag);
+
+inline Tensor reinterpret(const Tensor& x, DLDataType type, std::string name = "tensor",
+                          std::string tag = kElementWise) {
+  return reinterpret(x, PrimType(type), std::move(name), std::move(tag));
+}
+
+inline Tensor reinterpret(const Tensor& x, PrimType type, std::string name = "tensor",
                           std::string tag = kElementWise) {
   return compute(
       x->shape, [&](const ffi::Array<Var>& i) { return reinterpret(type, x(i)); }, name, tag);
@@ -344,7 +362,15 @@ inline Tensor elemwise_sum(const ffi::Array<Tensor>& xs, std::string name = "T_e
  *
  * \return A Tensor whose op member is the full operation
  */
-inline Tensor full(const ffi::Array<PrimExpr>& shape, DataType dtype, const PrimExpr fill_value,
+inline Tensor full(const ffi::Array<PrimExpr>& shape, PrimType dtype, const PrimExpr fill_value,
+                   std::string name, std::string tag);
+
+inline Tensor full(const ffi::Array<PrimExpr>& shape, DLDataType dtype, const PrimExpr fill_value,
+                   std::string name = "T_full", std::string tag = kElementWise) {
+  return full(shape, PrimType(dtype), fill_value, std::move(name), std::move(tag));
+}
+
+inline Tensor full(const ffi::Array<PrimExpr>& shape, PrimType dtype, const PrimExpr fill_value,
                    std::string name = "T_full", std::string tag = kElementWise) {
   PrimExpr ev = cast(dtype, fill_value);
   if (!ev.defined()) {
@@ -366,7 +392,7 @@ inline Tensor full(const ffi::Array<PrimExpr>& shape, DataType dtype, const Prim
  */
 inline Tensor full_like(const Tensor& x, const PrimExpr fill_value,
                         std::string name = "T_full_like", std::string tag = kElementWise) {
-  PrimExpr ev = cast(x->dtype, fill_value);
+  PrimExpr ev = cast(x->GetDataType(), fill_value);
   return compute(x->shape, [&](const ffi::Array<Var>& i) { return ev; }, name, tag);
 }
 
@@ -392,19 +418,17 @@ inline Tensor full_like(const Tensor& x, const PrimExpr fill_value,
  * y = exp(f) = 1 + 2 * P(x**2)/(Q(x**2) - P(x**2))
  */
 inline Tensor fast_exp_float32(const Tensor& _x, std::string name, std::string tag) {
-  auto x_hi = FloatImm(DataType::Float(32), 88.3762626647950f);
-  auto x_lo = FloatImm(DataType::Float(32), -88.3762626647949f);
-  auto log2e = FloatImm(DataType::Float(32), 1.44269504088896341f);
-  auto ln2 = FloatImm(DataType::Float(32), 0.6931471805599453f);
-  PrimExpr p[6] = {FloatImm(DataType::Float(32), 1.9875691500E-4f),
-                   FloatImm(DataType::Float(32), 1.3981999507E-3f),
-                   FloatImm(DataType::Float(32), 8.3334519073E-3f),
-                   FloatImm(DataType::Float(32), 4.1665795894E-2f),
-                   FloatImm(DataType::Float(32), 1.6666665459E-1f),
-                   FloatImm(DataType::Float(32), 5.0000001201E-1f)};
-  auto one = FloatImm(DataType::Float(32), 1.0f);
-  auto one_half = FloatImm(DataType::Float(32), 0.5f);
-  auto b = FloatImm(DataType::Float(32), 127.0f);
+  PrimType f32_ty = PrimType::Float(32);
+  auto x_hi = FloatImm(f32_ty, 88.3762626647950f);
+  auto x_lo = FloatImm(f32_ty, -88.3762626647949f);
+  auto log2e = FloatImm(f32_ty, 1.44269504088896341f);
+  auto ln2 = FloatImm(f32_ty, 0.6931471805599453f);
+  PrimExpr p[6] = {FloatImm(f32_ty, 1.9875691500E-4f), FloatImm(f32_ty, 1.3981999507E-3f),
+                   FloatImm(f32_ty, 8.3334519073E-3f), FloatImm(f32_ty, 4.1665795894E-2f),
+                   FloatImm(f32_ty, 1.6666665459E-1f), FloatImm(f32_ty, 5.0000001201E-1f)};
+  auto one = FloatImm(f32_ty, 1.0f);
+  auto one_half = FloatImm(f32_ty, 0.5f);
+  auto b = FloatImm(f32_ty, 127.0f);
 
   return compute(
       _x->shape,
@@ -419,7 +443,7 @@ inline Tensor fast_exp_float32(const Tensor& _x, std::string name, std::string t
             (((((p[0] * f + p[1]) * f + p[2]) * f + p[3]) * f + p[4]) * f + p[5]) * f * f + f + one;
         // Return 2^m * exp(r).
         auto ef =
-            tvm::reinterpret(DataType::Float(32), ::tvm::cast(DataType::Int(32), n + b) << 23);
+            tvm::reinterpret(PrimType::Float(32), ::tvm::cast(PrimType::Int(32), n + b) << 23);
         return ::tvm::max(ef * y, _x(i));  // NOLINT(*)
       },
       name, tag);
@@ -437,7 +461,7 @@ inline Tensor fast_exp_float32(const Tensor& _x, std::string name, std::string t
  */
 inline Tensor fast_exp(const Tensor& x, std::string name = "T_fast_exp",
                        std::string tag = kElementWise) {
-  if (x->dtype == DataType::Float(32)) {
+  if (x->GetDataType().MatchesElementType(DLDataTypeCode::kDLFloat, 32)) {
     auto ret = fast_exp_float32(x, name, tag);
     return ret;
   } else {
@@ -474,10 +498,11 @@ inline Tensor fast_erf_float16(const Tensor& data, std::string name, std::string
  */
 inline Tensor fast_erf(const Tensor& x, std::string name = "T_fast_erf",
                        std::string tag = kElementWise) {
-  if (x->dtype == DataType::Float(32)) {
+  PrimType x_type(x->GetDataType());
+  if (x_type.MatchesElementType(DLDataTypeCode::kDLFloat, 32)) {
     auto ret = fast_erf_float32(x, name, tag);
     return ret;
-  } else if (x->dtype == DataType::Float(16)) {
+  } else if (x_type.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
     auto ret = fast_erf_float16(x, name, tag);
     return ret;
   } else {
diff --git a/include/tvm/topi/nn.h b/include/tvm/topi/nn.h
index 0a448620dae3..b864bfe53ea3 100644
--- a/include/tvm/topi/nn.h
+++ b/include/tvm/topi/nn.h
@@ -57,7 +57,7 @@ inline tvm::te::Tensor relu(const tvm::te::Tensor& t, T threshold = static_cast<
   return tvm::te::compute(
       t->shape,
       [&](const tvm::ffi::Array<tvm::tirx::Var>& i) {
-        auto threshold_const = tvm::tirx::MakeConst(t->dtype, threshold);
+        auto threshold_const = tvm::tirx::MakeConst(tvm::PrimType(t->dtype), threshold);
         return tvm::max(t(i), threshold_const);
       },
       name, tag);
@@ -80,7 +80,7 @@ inline tvm::te::Tensor leaky_relu(const tvm::te::Tensor& t, double alpha = 0.1,
       t->shape,
       [&](const tvm::ffi::Array<tvm::tirx::Var>& i) {
         auto value = t(i);
-        auto calpha = tvm::tirx::MakeConst(value.dtype(), alpha);
+        auto calpha = tvm::tirx::MakeConst(value.ty(), alpha);
         return tvm::tirx::Select(value > 0, value, value * calpha);
       },
       name, tag);
@@ -171,10 +171,10 @@ inline tvm::te::Tensor pad(
   tvm::ffi::Array<tvm::PrimExpr> pad_after_int32;
 
   for (const auto& ele : pad_before) {
-    pad_before_int32.push_back(tvm::cast(tvm::DataType::Int(32), ele));
+    pad_before_int32.push_back(tvm::cast(tvm::PrimType::Int(32), ele));
   }
   for (const auto& ele : pad_after) {
-    pad_after_int32.push_back(tvm::cast(tvm::DataType::Int(32), ele));
+    pad_after_int32.push_back(tvm::cast(tvm::PrimType::Int(32), ele));
   }
 
   tvm::ffi::Array<tvm::PrimExpr> output_shape;
@@ -194,7 +194,7 @@ inline tvm::te::Tensor pad(
   }
 
   if (!pad_value.defined()) {
-    pad_value = tvm::tirx::MakeConst(t->dtype, 0);
+    pad_value = tvm::tirx::MakeConst(tvm::PrimType(t->dtype), 0);
   }
 
   auto l = [&](tvm::ffi::Array<tvm::tirx::Var> ovars) {
@@ -495,19 +495,19 @@ inline tvm::te::Tensor space_to_batch_nd(const tvm::te::Tensor& data,
   tvm::ffi::Array<tvm::PrimExpr> pad_after_int32;
 
   // pad size for batch dimension is 0
-  pad_before_int32.push_back(tvm::cast(tvm::DataType::Int(32), 0));
-  pad_after_int32.push_back(tvm::cast(tvm::DataType::Int(32), 0));
+  pad_before_int32.push_back(tvm::cast(tvm::PrimType::Int(32), 0));
+  pad_after_int32.push_back(tvm::cast(tvm::PrimType::Int(32), 0));
   // insert pad sizes given for spatial dimensions
   for (const auto& ele : pad_before) {
-    pad_before_int32.push_back(tvm::cast(tvm::DataType::Int(32), ele));
+    pad_before_int32.push_back(tvm::cast(tvm::PrimType::Int(32), ele));
   }
   for (const auto& ele : pad_after) {
-    pad_after_int32.push_back(tvm::cast(tvm::DataType::Int(32), ele));
+    pad_after_int32.push_back(tvm::cast(tvm::PrimType::Int(32), ele));
   }
 
   // pad the input with paddings provided
   if (!pad_value.defined()) {
-    pad_value = tvm::tirx::MakeConst(data->dtype, 0);
+    pad_value = tvm::tirx::MakeConst(tvm::PrimType(data->dtype), 0);
   }
   padded_t = pad(data, pad_before_int32, pad_after_int32, pad_value);
 
@@ -629,9 +629,9 @@ inline tvm::te::Tensor batch_to_space_nd(const tvm::te::Tensor& data,
   // Crop the start and end of dimensions of out
   ffi::Array<ffi::Optional<IntImm>> begin_idx, end_idx;
   ffi::Array<IntImm> strides;
-  DataType index_dtype = DataType::Int(64);
+  PrimType index_ty = PrimType::Int(64);
   for (size_t i = 0; i < r_p_shape.size(); ++i) {
-    strides.push_back(IntImm(index_dtype, 1));
+    strides.push_back(IntImm(index_ty, 1));
     if (i > 0 && i <= num_block_dims) {
       // prepare begin and end index for spatial dimensions
       int64_t begin_i = GetConstInt(crop_begin_list[i - 1]);
@@ -640,12 +640,12 @@ inline tvm::te::Tensor batch_to_space_nd(const tvm::te::Tensor& data,
       TVM_FFI_ICHECK_GT(out_i, (begin_i + end_i))
           << "Incorrect crop sizes for (" << i << ")th dim, can not crop more than"
           << " output size" << out_i << " vs " << (begin_i + end_i);
-      begin_idx.push_back(IntImm(index_dtype, begin_i));
-      end_idx.push_back(IntImm(index_dtype, out_i - end_i));
+      begin_idx.push_back(IntImm(index_ty, begin_i));
+      end_idx.push_back(IntImm(index_ty, out_i - end_i));
     } else {
       // ignore the batch and remaining dimension
-      begin_idx.push_back(IntImm(index_dtype, 0));
-      end_idx.push_back(IntImm(index_dtype, GetConstInt(r_p_shape[i])));
+      begin_idx.push_back(IntImm(index_ty, 0));
+      end_idx.push_back(IntImm(index_ty, GetConstInt(r_p_shape[i])));
     }
   }
 
@@ -677,7 +677,7 @@ inline Tensor nll_loss(const Tensor& predictions, const Tensor& targets, const T
         [&](const tvm::ffi::Array<tvm::tirx::Var>& target_indices) {
           auto c = targets();
           return tvm::tirx::Select(c != ignore_index, -predictions(c) * weights(c),
-                                   tvm::tirx::MakeConst(predictions->dtype, 0));
+                                   tvm::tirx::MakeConst(tvm::PrimType(predictions->dtype), 0));
         },
         name, tag);
     if (reduction == "mean") {
@@ -686,7 +686,7 @@ inline Tensor nll_loss(const Tensor& predictions, const Tensor& targets, const T
           [&](const tvm::ffi::Array<tvm::tirx::Var>& target_indices) {
             auto c = targets();
             return tvm::tirx::Select(c != ignore_index, weights(c),
-                                     tvm::tirx::MakeConst(predictions->dtype, 0));
+                                     tvm::tirx::MakeConst(tvm::PrimType(predictions->dtype), 0));
           },
           name, tag);
       return topi::divide(T, W);
@@ -705,7 +705,7 @@ inline Tensor nll_loss(const Tensor& predictions, const Tensor& targets, const T
           pred_indices.push_back(target_indices[i]);  // indices for multidimensional loss
         }
         return tvm::tirx::Select(c != ignore_index, -predictions(pred_indices) * weights(c),
-                                 tvm::tirx::MakeConst(predictions->dtype, 0));
+                                 tvm::tirx::MakeConst(tvm::PrimType(predictions->dtype), 0));
       },
       name, tag);
   TVM_FFI_ICHECK(T->shape.size() != 0);
@@ -715,7 +715,7 @@ inline Tensor nll_loss(const Tensor& predictions, const Tensor& targets, const T
         [&](const tvm::ffi::Array<tvm::tirx::Var>& target_indices) {
           auto c = targets(target_indices);
           return tvm::tirx::Select(c != ignore_index, weights(c),
-                                   tvm::tirx::MakeConst(predictions->dtype, 0));
+                                   tvm::tirx::MakeConst(tvm::PrimType(predictions->dtype), 0));
         },
         name, tag);
     return topi::divide(topi::sum(T, tvm::ffi::Array<int64_t>(nullptr)),
diff --git a/include/tvm/topi/nn/bnn.h b/include/tvm/topi/nn/bnn.h
index 5faed879c005..56a6f3aaa815 100644
--- a/include/tvm/topi/nn/bnn.h
+++ b/include/tvm/topi/nn/bnn.h
@@ -71,14 +71,14 @@ inline tvm::te::Tensor binarize_pack(const tvm::te::Tensor& data, int axis,
           start_idx.push_back(i == static_cast<size_t>(axis) ? indices[i] * 32
                                                              : static_cast<PrimExpr>(indices[i]));
         }
-        PrimExpr packed = IntImm(DataType::UInt(32), 0);
+        PrimExpr packed = IntImm(PrimType::UInt(32), 0);
         for (size_t j = 0; j < 32; ++j) {
           ffi::Array<PrimExpr> idx;
           for (size_t i = 0; i < n; ++i) {
             idx.push_back(i == static_cast<size_t>(axis) ? start_idx[i] + static_cast<int>(j)
                                                          : start_idx[i]);
           }
-          auto sign = tvm::cast(DataType::UInt(32), data(idx) >= 0);
+          auto sign = tvm::cast(PrimType::UInt(32), data(idx) >= 0);
           packed = (packed | sign);
           if (j == 31) {
             return packed;
@@ -101,8 +101,8 @@ inline tvm::te::Tensor binarize_pack(const tvm::te::Tensor& data, int axis,
 inline tvm::te::Tensor binary_dense(const tvm::te::Tensor& data, const tvm::te::Tensor& weight) {
   TVM_FFI_ICHECK_EQ(data->shape.size(), 2) << "binary_dense requires 2-D data";
   TVM_FFI_ICHECK_EQ(weight->shape.size(), 2) << "binary_dense requires 2-D weight";
-  TVM_FFI_ICHECK_EQ(data->dtype, DataType::UInt(32)) << "binary_dense requires uint32 data";
-  TVM_FFI_ICHECK_EQ(weight->dtype, DataType::UInt(32)) << "binary_dense requires uint32 weight";
+  TVM_FFI_ICHECK_EQ(data->dtype, PrimType::UInt(32)) << "binary_dense requires uint32 data";
+  TVM_FFI_ICHECK_EQ(weight->dtype, PrimType::UInt(32)) << "binary_dense requires uint32 weight";
 
   auto batch = data->shape[0];
   auto in_dim = data->shape[1];
diff --git a/include/tvm/topi/nn/dense.h b/include/tvm/topi/nn/dense.h
index be0030cd40d5..2c7b2330505e 100644
--- a/include/tvm/topi/nn/dense.h
+++ b/include/tvm/topi/nn/dense.h
@@ -46,7 +46,7 @@ using namespace tvm::te;
  * \return Tensor with shape [batch, out_dim]
  */
 inline tvm::te::Tensor dense(const tvm::te::Tensor& data, const tvm::te::Tensor& weight,
-                             const tvm::te::Tensor& bias, const DataType& out_dtype) {
+                             const tvm::te::Tensor& bias, const PrimType& out_dtype) {
   TVM_FFI_ICHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data";
   TVM_FFI_ICHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight";
   if (bias.defined()) {
diff --git a/include/tvm/topi/nn/dilate.h b/include/tvm/topi/nn/dilate.h
index 0c8ea395c701..f45543eda337 100644
--- a/include/tvm/topi/nn/dilate.h
+++ b/include/tvm/topi/nn/dilate.h
@@ -95,7 +95,7 @@ inline Tensor dilate(const Tensor& x, ffi::Array<PrimExpr> strides, double dilat
         if (not_zero.size() > 0) {
           auto all_not_zero = all(not_zero);
           return tvm::if_then_else(all_not_zero, x(index_tuple),
-                                   MakeConst(x->dtype, dilation_value));
+                                   MakeConst(PrimType(x->dtype), dilation_value));
         }
         return x(index_tuple);
       },
diff --git a/include/tvm/topi/nn/group_norm.h b/include/tvm/topi/nn/group_norm.h
index 4962587a9396..7a778dea8ce5 100644
--- a/include/tvm/topi/nn/group_norm.h
+++ b/include/tvm/topi/nn/group_norm.h
@@ -45,9 +45,9 @@ inline Tensor group_norm(const Tensor& data, const Tensor& gamma, const Tensor&
   const auto& beta_type = beta.defined() ? beta->dtype : data_type;
   TVM_FFI_ICHECK(data_type == gamma_type && data_type == beta_type)
       << "group_norm: data, gamma and beta must have the same type";
-  TVM_FFI_ICHECK(data_type == DataType::Float(32) || data_type == DataType::Float(16))
+  TVM_FFI_ICHECK(data_type == PrimType::Float(32) || data_type == PrimType::Float(16))
       << "group_norm: only support float32 and float16 for now";
-  bool is_float16 = data_type == DataType::Float(16);
+  bool is_float16 = data_type == PrimType::Float(16);
   // reshape data C -> G, C/G
   int ndim = data->shape.size();
   channel_axis = GetRealAxis(static_cast<int>(ndim), ffi::Array<int64_t>({channel_axis}))[0];
@@ -65,7 +65,7 @@ inline Tensor group_norm(const Tensor& data, const Tensor& gamma, const Tensor&
   }
   Tensor data_reshaped;
   if (is_float16) {
-    data_reshaped = cast(reshape(data, new_shape), DataType::Float(32));
+    data_reshaped = cast(reshape(data, new_shape), PrimType::Float(32));
   } else {
     data_reshaped = reshape(data, new_shape);
   }
@@ -126,7 +126,7 @@ inline Tensor group_norm(const Tensor& data, const Tensor& gamma, const Tensor&
 
   auto temp_x = temp_x_x2[0];
   auto temp_x2 = temp_x_x2[1];
-  PrimExpr reduce_extent = FloatImm(DataType::Float(32), 1);
+  PrimExpr reduce_extent = FloatImm(PrimType::Float(32), 1);
   for (auto axis : new_axes) {
     reduce_extent *= data_reshaped->shape[axis];
   }
@@ -142,10 +142,11 @@ inline Tensor group_norm(const Tensor& data, const Tensor& gamma, const Tensor&
     gamma_indices = {indices[channel_axis], indices[channel_axis + 1]};
     auto mean = temp_x(non_reduce_indices) / reduce_extent;
     auto var = temp_x2(non_reduce_indices) / reduce_extent - mean * mean;
-    PrimExpr group_norm =
-        (data_reshaped(indices) - mean) * tvm::rsqrt(var + MakeConst(data->dtype, epsilon));
+    // Build epsilon in var's type; data->dtype would round it to float16 for half inputs.
+    PrimExpr group_norm =
+        (data_reshaped(indices) - mean) * tvm::rsqrt(var + MakeConst(var.ty(), epsilon));
     if (is_float16) {
-      group_norm = Cast(DataType::Float(16), group_norm);
+      group_norm = Cast(PrimType::Float(16), group_norm);
     }
     if (gamma.defined()) {
       group_norm = topi::multiply(group_norm, gamma_reshaped(gamma_indices));
diff --git a/include/tvm/topi/nn/instance_norm.h b/include/tvm/topi/nn/instance_norm.h
index 60361e8bc681..e246d97a59df 100644
--- a/include/tvm/topi/nn/instance_norm.h
+++ b/include/tvm/topi/nn/instance_norm.h
@@ -58,9 +58,9 @@ inline Tensor instance_norm(const Tensor& data, const Tensor& gamma, const Tenso
   const auto& beta_type = beta.defined() ? beta->dtype : data_type;
   TVM_FFI_ICHECK(data_type == gamma_type && data_type == beta_type)
       << "instance_norm: data, gamma and beta must have the same type";
-  TVM_FFI_ICHECK(data_type == DataType::Float(32) || data_type == DataType::Float(16))
+  TVM_FFI_ICHECK(data_type == PrimType::Float(32) || data_type == PrimType::Float(16))
       << "instance_norm: only support float32 and float16 for now";
-  bool is_float16 = data_type == DataType::Float(16);
+  bool is_float16 = data_type == PrimType::Float(16);
   // sum x and x^2
   auto ndim = data->shape.size();
   TVM_FFI_ICHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor";
@@ -69,9 +69,10 @@ inline Tensor instance_norm(const Tensor& data, const Tensor& gamma, const Tenso
   auto target_shape =
       MakeReduceTargetShape(real_axis, data, /*keepdims=*/false, /*atleast1d=*/true);
   auto func = MakeTupleSumReducer();
+  PrimType f32_ty = PrimType::Float(32);
 
-  auto compute = [ndim, is_float16, &real_axis, &reduce_axes, &func,
-                  &data](const ffi::Array<Var>& indices) {
+  auto compute = [ndim, is_float16, &real_axis, &reduce_axes, &func, &data,
+                  f32_ty](const ffi::Array<Var>& indices) {
     ffi::Array<PrimExpr> eval_range;
     int arg_counter = 0;
     int red_counter = 0;
@@ -86,15 +87,14 @@ inline Tensor instance_norm(const Tensor& data, const Tensor& gamma, const Tenso
         arg_counter++;
       }
     }
-    auto square = [is_float16](const PrimExpr& x) {
+    auto square = [is_float16, f32_ty](const PrimExpr& x) {
       if (is_float16) {
-        return Cast(DataType::Float(32), x) * Cast(DataType::Float(32), x);
+        return Cast(f32_ty, x) * Cast(f32_ty, x);
       }
       return x * x;
     };
     if (is_float16) {
-      return func({Cast(DataType::Float(32), data(eval_range)), square(data(eval_range))},
-                  reduce_axes, nullptr);
+      return func({Cast(f32_ty, data(eval_range)), square(data(eval_range))}, reduce_axes, nullptr);
     } else {
       return func({data(eval_range), square(data(eval_range))}, reduce_axes, nullptr);
     }
@@ -106,7 +106,8 @@ inline Tensor instance_norm(const Tensor& data, const Tensor& gamma, const Tenso
   auto temp_x = temp_x_x2[0];
   auto temp_x2 = temp_x_x2[1];
 
-  auto reduce_extent = MakeConst(data->dtype, 1);
+  // Count reduced elements in float32 for float16 inputs, like the float32 sums they divide.
+  auto reduce_extent = MakeConst(is_float16 ? f32_ty : PrimType(data->dtype), 1);
   for (int i : real_axis) {
     reduce_extent *= data->shape[i];
   }
@@ -124,9 +125,9 @@ inline Tensor instance_norm(const Tensor& data, const Tensor& gamma, const Tenso
     channel = indices[channel_axis];
     auto mean = temp_x(non_reduce_indices) / reduce_extent;
     auto var = temp_x2(non_reduce_indices) / reduce_extent - mean * mean;
-    auto instance_norm = (data(indices) - mean) * tvm::rsqrt(var + MakeConst(var->dtype, epsilon));
+    auto instance_norm = (data(indices) - mean) * tvm::rsqrt(var + MakeConst(var.ty(), epsilon));
     if (is_float16) {
-      instance_norm = Cast(DataType::Float(16), instance_norm);
+      instance_norm = Cast(PrimType::Float(16), instance_norm);
     }
     instance_norm = topi::multiply(instance_norm, gamma(channel));
     if (beta.defined()) {
diff --git a/include/tvm/topi/nn/layer_norm.h b/include/tvm/topi/nn/layer_norm.h
index fb8155ef654a..8a995d7b91fe 100644
--- a/include/tvm/topi/nn/layer_norm.h
+++ b/include/tvm/topi/nn/layer_norm.h
@@ -57,9 +57,9 @@ inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor&
   const auto& beta_type = beta.defined() ? beta->dtype : data_type;
   TVM_FFI_ICHECK(data_type == gamma_type && data_type == beta_type)
       << "layer_norm: data, gamma and beta must have the same type";
-  TVM_FFI_ICHECK(data_type == DataType::Float(32) || data_type == DataType::Float(16))
+  TVM_FFI_ICHECK(data_type == PrimType::Float(32) || data_type == PrimType::Float(16))
       << "layer_norm: only support float32 and float16 for now";
-  bool is_float16 = data_type == DataType::Float(16);
+  bool is_float16 = data_type == PrimType::Float(16);
   // Two-pass algorithm for improved numerical stability:
   //   pass1: mean = E[x]
   //   pass2: var = E[(x - mean)^2]
@@ -69,6 +69,7 @@ inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor&
   auto reduce_axes = MakeReduceAxes(real_axis, data);
   auto target_shape =
       MakeReduceTargetShape(real_axis, data, /*keepdims=*/false, /*atleast1d=*/false);
+  PrimType f32_ty = PrimType::Float(32);
 
   auto make_eval_range = [&real_axis, &reduce_axes,
                           ndim](const ffi::Array<Var>& non_reduce_indices) {
@@ -91,17 +92,17 @@ inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor&
 
   Tensor temp_sum = te::compute(
       target_shape,
-      [is_float16, &data, &reduce_axes, &make_eval_range](const ffi::Array<Var>& indices) {
+      [is_float16, &data, &reduce_axes, &make_eval_range, f32_ty](const ffi::Array<Var>& indices) {
         auto eval_range = make_eval_range(indices);
         PrimExpr x = data(eval_range);
         if (is_float16) {
-          x = Cast(DataType::Float(32), x);
+          x = Cast(f32_ty, x);
         }
         return sum(x, reduce_axes);
       },
       data->op->name + "_sum", kCommReduce);
 
-  DataType reduce_dtype = is_float16 ? DataType::Float(32) : data->dtype;
+  PrimType reduce_dtype = is_float16 ? PrimType::Float(32) : PrimType(data->dtype);
   PrimExpr reduce_extent = MakeConst(reduce_dtype, 1);
   for (int i : real_axis) {
     reduce_extent *= data->shape[i];
@@ -115,12 +116,12 @@ inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor&
 
   Tensor temp_var_sum = te::compute(
       target_shape,
-      [is_float16, &data, &reduce_axes, &make_eval_range,
-       &temp_mean](const ffi::Array<Var>& indices) {
+      [is_float16, &data, &reduce_axes, &make_eval_range, &temp_mean,
+       f32_ty](const ffi::Array<Var>& indices) {
         auto eval_range = make_eval_range(indices);
         PrimExpr x = data(eval_range);
         if (is_float16) {
-          x = Cast(DataType::Float(32), x);
+          x = Cast(f32_ty, x);
         }
         PrimExpr diff = x - temp_mean(indices);
         return sum(diff * diff, reduce_axes);
@@ -138,9 +139,9 @@ inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor&
     }
     auto mean = temp_mean(non_reduce_indices);
     auto var = temp_var_sum(non_reduce_indices) / reduce_extent;
-    auto layer_norm = (data(indices) - mean) * rsqrt(var + MakeConst(var->dtype, epsilon));
+    auto layer_norm = (data(indices) - mean) * rsqrt(var + MakeConst(var.ty(), epsilon));
     if (is_float16) {
-      layer_norm = Cast(DataType::Float(16), layer_norm);
+      layer_norm = Cast(PrimType::Float(16), layer_norm);
     }
     layer_norm = topi::multiply(layer_norm, gamma(reduce_indices));
     if (beta.defined()) {
diff --git a/include/tvm/topi/nn/local_response_norm.h b/include/tvm/topi/nn/local_response_norm.h
index 7407448f88c5..4f411076387d 100644
--- a/include/tvm/topi/nn/local_response_norm.h
+++ b/include/tvm/topi/nn/local_response_norm.h
@@ -55,7 +55,8 @@ inline Tensor lrn(const Tensor& data, int size, int axis = 1, float alpha = 0.00
   TVM_FFI_ICHECK_EQ(data->shape.size(), 4) << "LRN requires 4-D input";
   TVM_FFI_ICHECK_EQ(size % 2, 1) << "size should be odd number";
   TVM_FFI_ICHECK(axis == 1 || axis == 3) << "axis should be 1 or 3 for NCHW and NHWC";
-  TVM_FFI_ICHECK(data->dtype.is_float()) << "datatype should be float";
+  // LRN only requires a floating-point element kind; lane encoding is irrelevant here.
+  TVM_FFI_ICHECK_EQ(data->dtype.code(), DLDataTypeCode::kDLFloat) << "datatype should be float";
   auto input_shape = data->shape;
   ffi::Array<PrimExpr> pad_before{0, 0, 0, 0};
   ffi::Array<PrimExpr> pad_after{0, 0, 0, 0};
@@ -79,9 +80,9 @@ inline Tensor lrn(const Tensor& data, int size, int axis = 1, float alpha = 0.00
         },
         "tensor", "sqr_sum");
   }
-  PrimExpr alpha_imm = tvm::te::MakeConst(data->dtype, alpha);
-  PrimExpr beta_imm = tvm::te::MakeConst(data->dtype, beta);
-  PrimExpr bias_imm = tvm::te::MakeConst(data->dtype, bias);
+  PrimExpr alpha_imm = tvm::te::MakeConst(PrimType(data->dtype), alpha);
+  PrimExpr beta_imm = tvm::te::MakeConst(PrimType(data->dtype), beta);
+  PrimExpr bias_imm = tvm::te::MakeConst(PrimType(data->dtype), bias);
   auto sqrt_sum_up = tvm::te::compute(
       input_shape,
       [&](Var i, Var j, Var k, Var l) {
diff --git a/include/tvm/topi/nn/pooling.h b/include/tvm/topi/nn/pooling.h
index e8410d8add22..91b10e7d8df9 100644
--- a/include/tvm/topi/nn/pooling.h
+++ b/include/tvm/topi/nn/pooling.h
@@ -117,7 +117,8 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x,
         tvm::te::reduce_axis(Range(0, (kernel_width + stride_width - 1) / stride_width), "ww");
 
     auto argmax = MakeArgmaxReducer();
-    auto pad_x = do_pad ? pad(x, pad_before, pad_after, tvm::min_value(x->dtype), "pad_temp") : x;
+    auto pad_x =
+        do_pad ? pad(x, pad_before, pad_after, tvm::min_value(PrimType(x->dtype)), "pad_temp") : x;
 
     auto mp_argmax = tvm::te::compute(
         out_shape,
@@ -145,17 +146,17 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x,
           out_idx.Set(width_axis, (inds[width_axis] + pad_left) / stride_width - windoww);
 
           PrimExpr out_idx_lower_h = tirx::Select(
-              pad_inds[height_axis] < kernel_height, IntImm(pad_inds[height_axis].dtype(), 0),
+              pad_inds[height_axis] < kernel_height, IntImm(pad_inds[height_axis].ty(), 0),
               (pad_inds[height_axis] - kernel_height) / stride_height + 1);
           PrimExpr out_idx_lower_w = tirx::Select(
-              pad_inds[width_axis] < kernel_width, IntImm(pad_inds[width_axis].dtype(), 0),
+              pad_inds[width_axis] < kernel_width, IntImm(pad_inds[width_axis].ty(), 0),
               (pad_inds[width_axis] - kernel_width) / stride_width + 1);
 
           return tvm::sum(
               tvm::if_then_else(tirx::And(tirx::And(out_idx[height_axis] >= out_idx_lower_h,
                                                     out_idx[width_axis] >= out_idx_lower_w),
                                           mp_inds(out_idx) == idx),
-                                out_grad(out_idx), MakeConst(x->dtype, 0)),
+                                out_grad(out_idx), MakeConst(PrimType(x->dtype), 0)),
               {windowh, windoww});
         },
         "T_pool_grad", "pool_grad_max");
@@ -176,10 +177,10 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x,
           out_idx.Set(width_axis, (pad_w_idx / stride_width - windoww));
 
           PrimExpr out_idx_lower_h =
-              tirx::Select(pad_h_idx < kernel_height, IntImm(pad_h_idx.dtype(), 0),
+              tirx::Select(pad_h_idx < kernel_height, IntImm(pad_h_idx.ty(), 0),
                            (pad_h_idx - kernel_height) / stride_height + 1);
           PrimExpr out_idx_lower_w =
-              tirx::Select(pad_w_idx < kernel_width, IntImm(pad_w_idx.dtype(), 0),
+              tirx::Select(pad_w_idx < kernel_width, IntImm(pad_w_idx.ty(), 0),
                            (pad_w_idx - kernel_width) / stride_width + 1);
 
           PrimExpr divide_factor;  // number of pooled elements
@@ -191,16 +192,17 @@ inline Tensor pool_grad_impl(const Tensor& out_grad, const Tensor& x,
 
             PrimExpr h_end = min(h_start + kernel_height, height);
             PrimExpr w_end = min(w_start + kernel_width, width);
-            h_start = max(h_start, IntImm(h_start.dtype(), 0));
-            w_start = max(w_start, IntImm(w_start.dtype(), 0));
-            divide_factor = max((h_end - h_start) * (w_end - w_start), MakeConst(h_end.dtype(), 1));
+            h_start = max(h_start, IntImm(h_start.ty(), 0));
+            w_start = max(w_start, IntImm(w_start.ty(), 0));
+            divide_factor = max((h_end - h_start) * (w_end - w_start), MakeConst(h_end.ty(), 1));
           }
           return tvm::sum(
               tvm::if_then_else(tirx::And(tirx::And(out_idx[height_axis] >= out_idx_lower_h,
                                                     out_idx[height_axis] < out_height),
                                           tirx::And(out_idx[width_axis] >= out_idx_lower_w,
                                                     out_idx[width_axis] < out_width)),
-                                out_grad(out_idx) / divide_factor, MakeConst(out_grad->dtype, 0)),
+                                out_grad(out_idx) / divide_factor,
+                                MakeConst(PrimType(out_grad->dtype), 0)),
               {windowh, windoww});
         },
         "T_pool_grad", "pool_grad_avg");
@@ -384,9 +386,9 @@ inline Tensor adaptive_pool_impl(const Tensor& x, const ffi::Array<PrimExpr>& ou
           ffi::Array<tirx::IterVar> reduce_axes;
           std::tie(indices, reduce_axes) = get_iter_vars(output, false);
 
-          PrimExpr divide_factor = tvm::cast(x->dtype, 1);
+          PrimExpr divide_factor = tvm::cast(PrimType(x->dtype), 1);
           for (size_t i = 0; i < n_dim; ++i) {
-            divide_factor *= tvm::cast(DataType::Int(32), reduce_axes[i]->dom->extent);
+            divide_factor *= tvm::cast(PrimType::Int(32), reduce_axes[i]->dom->extent);
           }
 
           return div(pool_sum(indices), divide_factor);
@@ -582,7 +584,8 @@ inline Tensor pool_impl_nd(const Tensor& x, const ffi::Array<PrimExpr>& kernel_s
 
   ffi::Map<ffi::String, ffi::Any> attrs;
   if (pool_type == kMaxPool) {
-    auto temp = do_pad ? pad(x, pad_before, pad_after, tvm::min_value(x->dtype), "pad_temp") : x;
+    auto temp =
+        do_pad ? pad(x, pad_before, pad_after, tvm::min_value(PrimType(x->dtype)), "pad_temp") : x;
     attrs.Set("schedule_rule", tvm::ffi::String("meta_schedule.pool_max"));
     return tvm::te::compute(
         out_shape,
@@ -657,7 +660,7 @@ inline Tensor pool_impl_nd(const Tensor& x, const ffi::Array<PrimExpr>& kernel_s
               // number that represents the number of steps along the dilated kernel to reach a
               // non-padded value. Otherwise this should be 0.
               PrimExpr jumps_to_non_pad = (dilation[i] - 1 - start[i]) / dilation[i];
-              jumps_to_non_pad = max(jumps_to_non_pad, IntImm(jumps_to_non_pad.dtype(), 0));
+              jumps_to_non_pad = max(jumps_to_non_pad, IntImm(jumps_to_non_pad.ty(), 0));
 
               end[i] = min(end[i], data_shape[ii] - 1);
               num_el *= (end[i] - (start[i] + dilation[i] * jumps_to_non_pad)) / dilation[i] + 1;
diff --git a/include/tvm/topi/nn/rms_norm.h b/include/tvm/topi/nn/rms_norm.h
index 294d82054e3e..29f46918a754 100644
--- a/include/tvm/topi/nn/rms_norm.h
+++ b/include/tvm/topi/nn/rms_norm.h
@@ -54,8 +54,8 @@ inline Tensor rms_norm(const Tensor& data, const Tensor& weight, const ffi::Arra
   const auto& weight_type = weight.defined() ? weight->dtype : data_type;
   TVM_FFI_ICHECK(data_type == weight_type) << "rms_norm: data and weight must have the same type";
 
-  const auto& data_fp32 = cast(data, DataType::Float(32));
-  const auto& weight_fp32 = cast(weight, DataType::Float(32));
+  const auto& data_fp32 = cast(data, PrimType::Float(32));
+  const auto& weight_fp32 = cast(weight, PrimType::Float(32));
 
   auto square = multiply(data_fp32, data_fp32);
   auto square_sum = sum(square, axis, /*keepdims=*/false, /*atleast1d=*/true);
@@ -63,7 +63,7 @@ inline Tensor rms_norm(const Tensor& data, const Tensor& weight, const ffi::Arra
   auto ndim = data_fp32->shape.size();
   TVM_FFI_ICHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor";
   auto real_axis = GetRealAxis(static_cast<int>(ndim), axis);
-  auto reduce_extent = MakeConst(data_fp32->dtype, 1);
+  auto reduce_extent = MakeConst(PrimType(data_fp32->dtype), 1);
   for (int i : real_axis) {
     reduce_extent *= data_fp32->shape[i];
   }
@@ -74,8 +74,9 @@ inline Tensor rms_norm(const Tensor& data, const Tensor& weight, const ffi::Arra
         non_reduce_indices.push_back(indices[i]);
       }
     }
-    auto output =
-        tvm::rsqrt(square_sum(non_reduce_indices) / reduce_extent + MakeConst(data_type, epsilon));
+    // Epsilon follows reduce_extent's type (built from data_fp32), not the input dtype.
+    auto output = tvm::rsqrt(square_sum(non_reduce_indices) / reduce_extent +
+                             MakeConst(reduce_extent.ty(), epsilon));
     return output;
   };
   auto rsqrt_shape = ffi::Array<PrimExpr>();
diff --git a/include/tvm/topi/reduction.h b/include/tvm/topi/reduction.h
index e6b4c5af1dea..fbea4a57eabf 100644
--- a/include/tvm/topi/reduction.h
+++ b/include/tvm/topi/reduction.h
@@ -259,7 +259,7 @@ inline Tensor CommReduceIdx(const Tensor& data, const ffi::Optional<ffi::Array<i
 using FCombine = std::function<ffi::Array<PrimExpr>(ffi::Array<Var> lhs, ffi::Array<Var> rhs)>;
 
 /*! \brief An initializer function for a reduction */
-using FIdentity = std::function<ffi::Array<PrimExpr>(std::vector<DataType> types)>;
+using FIdentity = std::function<ffi::Array<PrimExpr>(std::vector<PrimType> types)>;
 
 /*!
  * \brief Create a commutative reducer for a reduction
@@ -275,10 +275,10 @@ inline FCommReduce MakeCommReducer(FCombine fcombine, FIdentity fidentity,
   return [fcombine, fidentity, name](ffi::Array<PrimExpr> exprs, const ffi::Array<IterVar>& axis,
                                      PrimExpr* condition) {
     ffi::Array<Var> lhs, rhs;
-    std::vector<DataType> dtypes;
+    std::vector<PrimType> dtypes;
 
     for (size_t i = 0; i < exprs.size(); ++i) {
-      auto dtype = exprs[i].dtype();
+      PrimType dtype = exprs[i].ty();
       dtypes.push_back(dtype);
       lhs.push_back(var(name + "_lhs_" + std::to_string(i), dtype));
       rhs.push_back(var(name + "_rhs_" + std::to_string(i), dtype));
@@ -330,7 +330,8 @@ inline PrimExpr ProdOp(PrimExpr source, ffi::Array<IterVar> axis, ffi::Array<Pri
  */
 inline Tensor sum(const Tensor& data, const ffi::Optional<ffi::Array<int64_t>>& axis,
                   bool keepdims = false, bool atleast1d = false) {
-  if (data->dtype.is_bool()) {
+  // Reduction dispatch only depends on boolean element kind; lane encoding is irrelevant here.
+  if (data->dtype.code() == DLDataTypeCode::kDLBool) {
     return CommReduce(data, axis, tvm::any, keepdims, atleast1d);
   } else {
     return CommReduce(data, axis, tvm::sum, keepdims, atleast1d);
@@ -477,7 +478,7 @@ inline FCommReduce MakeArgminReducer(bool select_last_index = false) {
     result.push_back(tvm::tirx::Select(is_smaller, lhs[1], rhs[1]));    // val
     return result;
   };
-  auto fidentity = [&](std::vector<DataType> types) {
+  auto fidentity = [&](std::vector<PrimType> types) {
     ffi::Array<PrimExpr> result;
     result.push_back(tvm::tirx::MakeConst(types[0], -1));  // idx
     result.push_back(tvm::max_value(types[1]));            // val
@@ -539,7 +540,7 @@ inline FCommReduce MakeArgmaxReducer(bool select_last_index = false) {
     result.push_back(tvm::tirx::Select(is_bigger, lhs[1], rhs[1]));     // val
     return result;
   };
-  auto fidentity = [&](std::vector<DataType> types) {
+  auto fidentity = [&](std::vector<PrimType> types) {
     ffi::Array<PrimExpr> result;
     result.push_back(tvm::tirx::MakeConst(types[0], -1));  // idx
     result.push_back(tvm::min_value(types[1]));            // val
@@ -601,7 +602,7 @@ inline FCommReduce MakeTupleSumReducer() {
     }
     return result;
   };
-  auto fidentity = [](std::vector<DataType> types) {
+  auto fidentity = [](std::vector<PrimType> types) {
     ffi::Array<PrimExpr> result;
     for (size_t i = 0; i < types.size(); ++i) {
       result.push_back(tvm::tirx::MakeConst(types[i], 0));
diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index e216cf86ced4..f2ede7af8aa0 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -44,8 +44,8 @@
 #include <utility>
 #include <vector>
 
+#include "tvm/ffi/dtype.h"
 #include "tvm/ir/expr.h"
-#include "tvm/runtime/data_type.h"
 #include "tvm/tirx/expr.h"
 #include "tvm/tirx/op.h"
 #include "tvm/tirx/var.h"
@@ -338,7 +338,8 @@ inline Tensor reshape(const Tensor& x, ffi::Array<PrimExpr> newshape,
   // If either the input shape or the target shape contains a zero, return an empty tensor.
   if (is_empty_shape(target_shape) || is_empty_shape(x->shape)) {
     return compute(
-        target_shape, [&](const ffi::Array<Var>& indices) { return tvm::cast(x->dtype, 0); }, name,
+        target_shape,
+        [&](const ffi::Array<Var>& indices) { return tvm::cast(PrimType(x->dtype), 0); }, name,
         tag);
   } else {
     return compute(
@@ -679,7 +680,7 @@ inline PrimExpr CanonicalizeIndex(PrimExpr index, PrimExpr extent, PrimExpr stri
   if (index->IsInstance<tvm::IntImmNode>() && extent->IsInstance<tvm::IntImmNode>() &&
       stride->IsInstance<tvm::IntImmNode>()) {
     return tvm::IntImm(
-        tvm::DataType::Int(64),
+        tvm::PrimType::Int(64),
         StaticCanonicalizeIndex(GetConstInt(index), GetConstInt(extent), GetConstInt(stride)));
   }
   return DynamicCanonicalizeIndex(index, extent, stride);
@@ -835,14 +836,14 @@ inline te::Tensor dynamic_strided_slice(const te::Tensor& x, const te::Tensor& b
                                         bool assume_inbound = true,
                                         std::string name = "T_strided_slice_dynamic",
                                         std::string tag = topi::kInjective) {
-  DataType index_dtype = begin->shape[0]->dtype;
+  PrimType index_ty = begin->shape[0].ty();
   const int64_t num_dynamic_axes = begin->shape[0].as<IntImmNode>()->value;
   TVM_FFI_ICHECK_EQ(end->shape[0].as<IntImmNode>()->value, num_dynamic_axes);
   TVM_FFI_ICHECK_EQ(strides->shape[0].as<IntImmNode>()->value, num_dynamic_axes);
 
   ffi::Array<PrimExpr> begin_expr, end_expr, strides_expr;
   for (int64_t i = 0; i < num_dynamic_axes; ++i) {
-    auto ind = MakeConst(index_dtype, i);
+    auto ind = MakeConst(index_ty, i);
     begin_expr.push_back(begin(ind));
     end_expr.push_back(end(ind));
     strides_expr.push_back(strides(ind));
@@ -874,10 +875,10 @@ inline ffi::Array<PrimExpr> StridedSliceOutputShape(const ffi::Array<PrimExpr>&
                  axes.size() == strides.size());
   std::vector<int64_t> begin_vec, end_vec, strides_vec;
   std::tie(begin_vec, end_vec, strides_vec) = ConvertToVec(begin, end, strides, slice_mode);
-  DataType index_dtype =
-      (begin.size() > 0 && begin[0].defined()) ? begin[0].value()->dtype : DataType::Int(64);
+  PrimType index_ty =
+      (begin.size() > 0 && begin[0].defined()) ? begin[0].value().ty() : PrimType::Int(64);
   auto begin_canonicalized =
-      StridedSliceCanonicalizeBegin(ishape, begin_vec, strides_vec, axes, index_dtype, slice_mode);
+      StridedSliceCanonicalizeBegin(ishape, begin_vec, strides_vec, axes, index_ty, slice_mode);
   return StridedSliceOutputShape(ishape, begin_vec, end_vec, strides_vec, axes, slice_mode,
                                  begin_canonicalized, true);
 }
@@ -924,10 +925,10 @@ inline Tensor strided_slice_with_axes(
   std::vector<int64_t> begin_vec, end_vec, strides_vec;
   std::tie(begin_vec, end_vec, strides_vec) = ConvertToVec(begin, end, strides, slice_mode);
 
-  DataType index_dtype =
-      (begin.size() > 0 && begin[0].defined()) ? begin[0].value()->dtype : DataType::Int(64);
+  PrimType index_ty =
+      (begin.size() > 0 && begin[0].defined()) ? begin[0].value().ty() : PrimType::Int(64);
   auto begin_expr = StridedSliceCanonicalizeBegin(x->shape, begin_vec, strides_vec, normalized_axes,
-                                                  index_dtype, slice_mode);
+                                                  index_ty, slice_mode);
   auto out_shape = StridedSliceOutputShape(x->shape, begin_vec, end_vec, strides_vec,
                                            normalized_axes, slice_mode, begin_expr);
 
@@ -938,7 +939,7 @@ inline Tensor strided_slice_with_axes(
         for (size_t i = 0; i < out_shape.size(); ++i) real_indices.push_back(indices[i]);
         for (size_t i = 0; i < normalized_axes.size(); ++i) {
           int64_t ax = normalized_axes[i];
-          auto stride = MakeConst(strides[i]->dtype, strides_vec[i]);
+          auto stride = MakeConst(strides[i]->ty(), strides_vec[i]);
           PrimExpr ind = indices[ax] * stride + begin_expr[i];
           real_indices.Set(ax, ind);
         }
@@ -972,11 +973,11 @@ inline Tensor strided_slice(const Tensor& x, const ffi::Array<ffi::Optional<IntI
   ffi::Array<ffi::Optional<IntImm>> end_full(end);
   ffi::Array<IntImm> strides_full(strides);
 
-  DataType index_dtype =
-      (begin.size() > 0 && begin[0].defined()) ? begin[0].value()->dtype : DataType::Int(64);
-  const IntImm one = IntImm(index_dtype, 1);
-  const IntImm zero = IntImm(index_dtype, 0);
-  const IntImm max_range = max_value(index_dtype).as_or_throw<IntImm>();
+  PrimType index_ty =
+      (begin.size() > 0 && begin[0].defined()) ? begin[0].value().ty() : PrimType::Int(64);
+  const IntImm one = IntImm(index_ty, 1);
+  const IntImm zero = IntImm(index_ty, 0);
+  const IntImm max_range = max_value(index_ty).as_or_throw<IntImm>();
 
   for (size_t i = strides.size(); i < src_tensor_dim; ++i) {
     strides_full.push_back(one);
@@ -1073,7 +1074,8 @@ inline Tensor take(const Tensor& a, const Tensor& indices, int batch_dims,
         [&](const ffi::Array<Var>& out_index) {
           auto idx = tvm::if_then_else(
               indices(out_index) < 0 || indices(out_index) >= a_size,
-              tvm::FloatImm(a->dtype, std::numeric_limits<float>::quiet_NaN()), indices(out_index));
+              tvm::FloatImm(tvm::PrimType(a->dtype), std::numeric_limits<float>::quiet_NaN()),
+              indices(out_index));
           return a(UnravelIndex(idx, a_shape));
         },
         name, tag);
@@ -1116,9 +1118,9 @@ inline Tensor sequence_mask(const Tensor& data, const Tensor& valid_length, doub
         auto tid = out_index[axis];
         auto bid = out_index[1 - axis];
         len_index.push_back(bid);
-        PrimExpr ret =
-            tvm::if_then_else(tvm::cast(valid_length->dtype, tid) >= valid_length(len_index),
-                              tvm::tirx::MakeConst(data->dtype, mask_value), data(out_index));
+        PrimExpr ret = tvm::if_then_else(
+            tvm::cast(PrimType(valid_length->dtype), tid) >= valid_length(len_index),
+            tvm::tirx::MakeConst(PrimType(data->dtype), mask_value), data(out_index));
         return ret;
       },
       name, tag);
@@ -1293,7 +1295,7 @@ inline Tensor take(const Tensor& a, ffi::Variant<Tensor, PrimExpr> indices, int
           PrimExpr in_bounds = idx >= 0 && idx < axis_dim;
           return tvm::if_then_else(
               in_bounds, a(real_indices),
-              tvm::tirx::MakeConst(a->dtype, std::numeric_limits<float>::quiet_NaN()));
+              tvm::tirx::MakeConst(PrimType(a->dtype), std::numeric_limits<float>::quiet_NaN()));
         },
         name, tag);
   } else {  // mode == "wrap"
@@ -1443,8 +1445,8 @@ inline Tensor tile(const Tensor& x, ffi::Array<int64_t> reps, std::string name =
 
   if (is_empty_shape(new_shape)) {
     return compute(
-        new_shape, [&](const ffi::Array<Var>& indices) { return tvm::cast(x->dtype, 0); }, name,
-        tag);
+        new_shape, [&](const ffi::Array<Var>& indices) { return tvm::cast(PrimType(x->dtype), 0); },
+        name, tag);
   } else {
     return compute(
         new_shape,
@@ -1478,8 +1480,8 @@ inline Tensor dyn_tile(const Tensor& x, ffi::Array<PrimExpr> new_shape, size_t r
   size_t ndim = x->shape.size();
   if (is_empty_shape(new_shape)) {
     return compute(
-        new_shape, [&](const ffi::Array<Var>& indices) { return tvm::cast(x->dtype, 0); }, name,
-        tag);
+        new_shape, [&](const ffi::Array<Var>& indices) { return tvm::cast(PrimType(x->dtype), 0); },
+        name, tag);
   } else {
     return compute(
         new_shape,
@@ -1526,7 +1528,9 @@ inline Tensor gather(const Tensor& data, int axis, const Tensor& indices,
     size_t indices_dim_i = static_cast<size_t>(GetConstInt(indices->shape[axis]));
     TVM_FFI_ICHECK_GE(indices_dim_i, 1);
   }
-  TVM_FFI_ICHECK(indices->dtype.is_int() || indices->dtype.is_uint());
+  // Index tensors are validated by integer element kind; vector lane encoding is irrelevant here.
+  PrimType indices_ty = indices->dtype;
+  TVM_FFI_ICHECK(indices_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt));
 
   ffi::Array<PrimExpr> out_shape;
   for (size_t i = 0; i < ndim_i; ++i) {
@@ -1593,10 +1597,13 @@ inline Tensor gather_nd(const Tensor& data, const Tensor& indices, int batch_dim
         }
         for (size_t i = 0; i < indices_dim0; ++i) {
           indices_position.Set(0, IntImm::Int32(i));
-          if (indices->dtype.is_int() || indices->dtype.is_uint()) {
+          // Index tensors are validated by integer element kind; vector lane encoding is
+          // irrelevant for choosing whether an index cast is needed.
+          PrimType indices_ty = indices->dtype;
+          if (indices_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
             real_indices.push_back(indices(indices_position));
           } else {
-            real_indices.push_back(tvm::cast(tvm::DataType::Int(32), indices(indices_position)));
+            real_indices.push_back(tvm::cast(tvm::PrimType::Int(32), indices(indices_position)));
           }
         }
         if (real_indices.size() == ndim_d) {
@@ -1740,10 +1747,15 @@ inline Tensor tensordot(const Tensor& A, const tvm::te::Tensor& B, ffi::Array<Pr
 }
 
 inline Tensor arange(const PrimExpr& start, const PrimExpr& stop, const PrimExpr& step,
-                     DataType dtype, std::string name = "T_arange", std::string tag = kInjective) {
+                     PrimType dtype, std::string name = "T_arange", std::string tag = kInjective) {
   arith::Analyzer analyzer;
   PrimExpr num_elem;
-  bool is_all_int = start.dtype().is_int() && stop.dtype().is_int() && step.dtype().is_int();
+  PrimType start_ty = start.ty();
+  PrimType stop_ty = stop.ty();
+  PrimType step_ty = step.ty();
+  bool is_all_int = start_ty.code() == DLDataTypeCode::kDLInt &&
+                    stop_ty.code() == DLDataTypeCode::kDLInt &&
+                    step_ty.code() == DLDataTypeCode::kDLInt;
   if (is_all_int && analyzer->CanProveGreaterEqual(step, 1)) {
     // fast path for integer arange when step is positive
     num_elem = tvm::floordiv((stop - start + step - 1), step);
@@ -1752,8 +1764,8 @@ inline Tensor arange(const PrimExpr& start, const PrimExpr& stop, const PrimExpr
     num_elem = tvm::floordiv((start - stop - step - 1), -step);
   } else {
     // fallback path for non-integer or step of unknown sign
-    num_elem = tvm::cast(DefaultIndexType(),
-                         tvm::ceil(tvm::cast(tvm::DataType::Float(32), stop - start) / step));
+    num_elem = tvm::cast(PrimType(DefaultIndexType()),
+                         tvm::ceil(tvm::cast(tvm::PrimType::Float(32), stop - start) / step));
   }
   num_elem = analyzer->Simplify(num_elem);
 
@@ -1845,7 +1857,8 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout,
         for (size_t i = 0; i < src.ndim(); ++i) {
           in_range = in_range && (src_indices[i] < src->shape[i]);
         }
-        return if_then_else(in_range, src(src_indices), tvm::cast(src->dtype, PrimExpr(0)));
+        return if_then_else(in_range, src(src_indices),
+                            tvm::cast(PrimType(src->dtype), PrimExpr(0)));
       },
       name, tag, attrs);
 }
@@ -1960,7 +1973,7 @@ inline Tensor meta_schedule_layout_transform(
   ffi::Array<Range> iter_domain;
   iter_domain.reserve(src->shape.size());
   for (const PrimExpr& e : src->shape) {
-    iter_domain.push_back(Range::FromMinExtent(IntImm(e->dtype, 0), e));
+    iter_domain.push_back(Range::FromMinExtent(IntImm(e.ty(), 0), e));
   }
   ffi::Array<PrimExpr> post_transform_shape = index_map->MapShape(src->shape, analyzer);
   return compute(
@@ -1980,7 +1993,7 @@ inline Tensor meta_schedule_layout_transform(
  * \param tag output tensor tag.
  * \return Tensor of input shape.
  */
-inline Tensor shape(const Tensor& src, DataType dtype, const std::string name = "T_shape",
+inline Tensor shape(const Tensor& src, PrimType dtype, const std::string name = "T_shape",
                     const std::string tag = kInjective) {
   int ndim = static_cast<int>(src->shape.size());
   ffi::Array<PrimExpr> out_shape{ndim};
@@ -1997,6 +2010,11 @@ inline Tensor shape(const Tensor& src, DataType dtype, const std::string name =
       name, tag);
 }
 
+inline Tensor shape(const Tensor& src, DLDataType dtype, const std::string name = "T_shape",
+                    const std::string tag = kInjective) {
+  return shape(src, PrimType(dtype), name, tag);
+}
+
 /*!
  * \brief Get the size of input tensor.
  * \param src the input tensor.
@@ -2005,7 +2023,7 @@ inline Tensor shape(const Tensor& src, DataType dtype, const std::string name =
  * \param tag output tensor tag.
  * \return Tensor of input shape.
  */
-inline te::Tensor tensor_size(const te::Tensor& src, const DataType& dtype,
+inline te::Tensor tensor_size(const te::Tensor& src, PrimType dtype,
                               const std::string& name = "tensor_size",
                               const std::string& tag = kInjective) {
   int ndim = static_cast<int>(src->shape.size());
@@ -2022,6 +2040,12 @@ inline te::Tensor tensor_size(const te::Tensor& src, const DataType& dtype,
       name, tag);
 }
 
+inline te::Tensor tensor_size(const te::Tensor& src, DLDataType dtype,
+                              const std::string& name = "tensor_size",
+                              const std::string& tag = kInjective) {
+  return tensor_size(src, PrimType(dtype), name, tag);
+}
+
 /*!
  * \brief Returns a one-hot tensor where the locations repsented by indices take value on_value,
     other locations take value off_value.
@@ -2037,7 +2061,7 @@ inline te::Tensor tensor_size(const te::Tensor& src, const DataType& dtype,
  * \return one-hot tensor.
  */
 inline Tensor one_hot(const Tensor& indices, const PrimExpr on_value, const PrimExpr off_value,
-                      int depth, int axis, const DataType& dtype,
+                      int depth, int axis, PrimType dtype,
                       ffi::Array<PrimExpr> oshape = ffi::Array<PrimExpr>(),
                       const std::string name = "T_one_hot", const std::string tag = kInjective) {
   int true_axis = (axis == -1) ? indices->shape.size() : axis;
@@ -2073,6 +2097,14 @@ inline Tensor one_hot(const Tensor& indices, const PrimExpr on_value, const Prim
       name, tag);
 }
 
+inline Tensor one_hot(const Tensor& indices, const PrimExpr on_value, const PrimExpr off_value,
+                      int depth, int axis, DLDataType dtype,
+                      ffi::Array<PrimExpr> oshape = ffi::Array<PrimExpr>(),
+                      const std::string name = "T_one_hot", const std::string tag = kInjective) {
+  return one_hot(indices, on_value, off_value, depth, axis, PrimType(dtype), std::move(oshape),
+                 name, tag);
+}
+
 /*!
  * \brief Get a dense tensor.
  * \param sparse_indices sparse_indices[i] contains sparse_values[i] will be placed.
@@ -2088,7 +2120,9 @@ inline Tensor sparse_to_dense(const Tensor& sparse_indices,
                               const PrimExpr& default_value,
                               const std::string name = "T_sparse_to_dense",
                               const std::string tag = kInjective) {
-  TVM_FFI_ICHECK(sparse_indices->dtype.is_int()) << "sparse_indices only accepts integer values";
+  // Sparse indices are validated by signed integer element kind; lane encoding is irrelevant here.
+  TVM_FFI_ICHECK_EQ(sparse_indices->dtype.code(), DLDataTypeCode::kDLInt)
+      << "sparse_indices only accepts integer values";
   TVM_FFI_ICHECK_LE(sparse_indices->shape.size(), 3)
       << "sparse_indices tensor should be 0D, 1D, or 2D only";
   TVM_FFI_ICHECK_LE(sparse_values->shape.size(), 2)
diff --git a/python/tvm/ir/expr.py b/python/tvm/ir/expr.py
index 4fbebeddd0f5..e6a33ac9b4a6 100644
--- a/python/tvm/ir/expr.py
+++ b/python/tvm/ir/expr.py
@@ -43,7 +43,10 @@ class PrimExpr(BaseExpr):
     optimizations and integer analysis.
     """
 
-    dtype: str
+    @property
+    def dtype(self):
+        """Return the runtime dtype represented by this expression's PrimType."""
+        return self.ty.dtype
 
 
 @tvm_ffi.register_object("ir.RelaxExpr")
diff --git a/python/tvm/ir/type.py b/python/tvm/ir/type.py
index 567ebafa2d5c..96548439d70e 100644
--- a/python/tvm/ir/type.py
+++ b/python/tvm/ir/type.py
@@ -53,6 +53,35 @@ class PrimType(Type):
     def __init__(self, dtype):
         self.__init_handle_by_constructor__(_ffi_api.PrimType, dtype)
 
+    def __eq__(self, other):
+        if isinstance(other, str):
+            return self.dtype == other
+        return super().__eq__(other)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __hash__(self):
+        # __eq__ accepts str operands, so hash the dtype string to keep equal objects hash-equal.
+        return hash(str(self.dtype))
+
+    def __str__(self):
+        return str(self.dtype)
+
+    def matches_code(self, *codes) -> bool:
+        """Return whether this type has any of the given DLPack dtype codes."""
+        type_code = self.dtype.type_code
+        return any(type_code == int(code) for code in codes)
+
+    def matches_element_type(self, code, bits: int) -> bool:
+        """Return whether this type has the given scalar element code and bits."""
+        dtype = self.dtype
+        return dtype.type_code == int(code) and dtype.bits == bits
+
+    def is_scalar(self) -> bool:
+        """Return whether this type has exactly one fixed lane."""
+        return self.dtype.lanes == 1
+
 
 @tvm_ffi.register_object("ir.PointerType")
 class PointerType(Type):
diff --git a/python/tvm/relax/frontend/nn/extern.py b/python/tvm/relax/frontend/nn/extern.py
index 9c8efce690f1..6c7f3dc72c9f 100644
--- a/python/tvm/relax/frontend/nn/extern.py
+++ b/python/tvm/relax/frontend/nn/extern.py
@@ -145,7 +145,7 @@ def shape_dtype_inference(a, b):
 
         // those headers are guaranteed to be available
         #include <dlpack/dlpack.h>
-        #include <tvm/runtime/data_type.h>
+        #include <tvm/ffi/dtype.h>
         #include <tvm/ffi/function.h>
 
         namespace {
diff --git a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
index f987f48d4251..b9ab88da0b43 100644
--- a/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
+++ b/python/tvm/relax/frontend/torch/base_fx_graph_translator.py
@@ -29,6 +29,7 @@
 import tvm_ffi
 
 from tvm import relax, tirx
+from tvm.runtime import DataTypeCode
 
 
 class BaseFXGraphImporter(metaclass=abc.ABCMeta):
@@ -566,7 +567,7 @@ def _pow(self, node: fx.Node) -> relax.Var:
         if (
             isinstance(lhs, relax.Expr)
             and isinstance(lhs.ty, relax.TensorType)
-            and "int" in lhs.ty.dtype
+            and lhs.ty.dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT)
             and isinstance(rhs, int)
             and not isinstance(rhs, bool)
             and rhs >= 0
@@ -1607,7 +1608,7 @@ def transpose_and_reshape_back(tensor):
         if attn_mask is not None:
             attn_mask = self.env[attn_mask]
             msg = "Only a float mask is supported for the attn_mask input."
-            assert "float" in attn_mask.ty.dtype, msg
+            assert attn_mask.ty.dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT), msg
 
         attention_output = self.block_builder.emit(
             relax.op.nn.attention(query, key, value, bias=attn_mask, causal_mask=causal_mask)
diff --git a/python/tvm/relax/op/create.py b/python/tvm/relax/op/create.py
index 9d28ed92f9c5..1bbeeee8f272 100644
--- a/python/tvm/relax/op/create.py
+++ b/python/tvm/relax/op/create.py
@@ -17,6 +17,7 @@
 """Creation operators."""
 
 from tvm import DataType, DataTypeCode
+from tvm.ir import PrimType
 from tvm.ir.expr import PrimExpr
 
 from ..expr import Expr, PrimValue, ShapeExpr
@@ -267,7 +268,12 @@ def is_int(expr):
             return True
         if isinstance(expr, PrimValue):
             expr = expr.value
-        return isinstance(expr, PrimExpr) and DataType(expr.dtype).type_code == DataTypeCode.INT  # type: ignore
+        if isinstance(expr, PrimExpr):
+            dtype = expr.dtype  # type: ignore
+            if isinstance(dtype, PrimType):
+                dtype = dtype.dtype
+            return DataType(dtype).type_code == DataTypeCode.INT
+        return False
 
     if dtype is None:
         args = (start, end, step)
diff --git a/python/tvm/relax/op/manipulate.py b/python/tvm/relax/op/manipulate.py
index 4b787c265bc3..43a2bd400351 100644
--- a/python/tvm/relax/op/manipulate.py
+++ b/python/tvm/relax/op/manipulate.py
@@ -19,6 +19,7 @@
 from collections.abc import Callable
 
 from tvm.ir.expr import PrimExpr
+from tvm.runtime import DataTypeCode
 from tvm.tirx import FloatImm, IndexMap, IntImm
 
 from ..expr import Expr, PrimValue, ShapeExpr
@@ -151,10 +152,12 @@ def layout_transform(
     if pad_value is None:
         pass
     elif not isinstance(pad_value, PrimValue):
-        if "int" in x_dtype and isinstance(pad_value, int):
-            pad_value = IntImm(x_dtype, pad_value)
-        elif "float" in x_dtype and (isinstance(pad_value, int | float)):
-            pad_value = FloatImm(x_dtype, float(pad_value))
+        if x_dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT) and isinstance(pad_value, int):
+            pad_value = IntImm(x_dtype.dtype, pad_value)
+        elif x_dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT) and (
+            isinstance(pad_value, int | float)
+        ):
+            pad_value = FloatImm(x_dtype.dtype, float(pad_value))
         pad_value = PrimValue(pad_value)
 
     if axis_separators is None:
diff --git a/python/tvm/relax/transform/legalize_ops/common.py b/python/tvm/relax/transform/legalize_ops/common.py
index 1b7d1179a521..f464c248e363 100644
--- a/python/tvm/relax/transform/legalize_ops/common.py
+++ b/python/tvm/relax/transform/legalize_ops/common.py
@@ -20,6 +20,7 @@
 
 import tvm
 from tvm import te
+from tvm.runtime import DataTypeCode
 from tvm.tirx import FloatImm, IntImm
 
 from ...block_builder import BlockBuilder
@@ -38,9 +39,6 @@
 LegalizeFunc = Callable[[BlockBuilder, Call], Expr]
 
 
-##################### Utilities #####################
-
-
 def _try_convert_to_scalar_const(
     expr: Expr, python_native: bool = False
 ) -> Expr | FloatImm | IntImm | bool | float | int:
@@ -69,13 +67,14 @@ def _try_convert_to_scalar_const(
         # get the value of the scalar constant
         value = expr.data.numpy()[()].item()
         dtype = expr.ty.dtype
+        dtype_str = str(dtype.dtype)
         if python_native:
             return value
         # preserve the data type of the constant
-        if dtype.startswith("float"):
-            return tvm.tirx.FloatImm(dtype, value)
-        elif dtype.startswith("int") or dtype.startswith("uint") or dtype.startswith("bool"):
-            return tvm.tirx.IntImm(dtype, value)
+        if dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT):
+            return tvm.tirx.FloatImm(dtype_str, value)
+        elif dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT, DataTypeCode.BOOL):
+            return tvm.tirx.IntImm(dtype_str, value)
     return expr
 
 
diff --git a/python/tvm/relax/transform/legalize_ops/manipulate.py b/python/tvm/relax/transform/legalize_ops/manipulate.py
index f0cc8977d4ef..a59b1f9fe52e 100644
--- a/python/tvm/relax/transform/legalize_ops/manipulate.py
+++ b/python/tvm/relax/transform/legalize_ops/manipulate.py
@@ -19,7 +19,7 @@
 """Default legalization function for manipulate operators."""
 
 import tvm
-from tvm import relax, s_tir, te, tirx, topi
+from tvm import DataTypeCode, relax, s_tir, te, tirx, topi
 from tvm.relax.op.base import call_tir
 from tvm.relax.type import TensorType
 from tvm.relax.utils import gen_call_tir_inputs
@@ -337,7 +337,7 @@ def set_axis_sep(axis_sep: list, sch: s_tir.schedule, buffer_type: str):
     if pad_value is not None:
         pad_value = pad_value.value
     else:
-        if "int" in call.args[0].ty.dtype:
+        if call.args[0].ty.dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT):
             pad_value = 0
         else:
             pad_value = 0.0
diff --git a/python/tvm/relax/transform/legalize_ops/qdq.py b/python/tvm/relax/transform/legalize_ops/qdq.py
index aa86f6fca2c3..7a825e300e40 100644
--- a/python/tvm/relax/transform/legalize_ops/qdq.py
+++ b/python/tvm/relax/transform/legalize_ops/qdq.py
@@ -19,6 +19,7 @@
 
 import tvm
 from tvm import te, tirx
+from tvm.runtime import DataTypeCode
 
 from ...block_builder import BlockBuilder
 from ...expr import Call, Expr
@@ -140,7 +141,11 @@ def dequantize_compute(*indices):
                 zp_value = zp[(0,) * len(zp.shape)]
             else:
                 zp_value = zp[indices[axis]]
-            dtype = "float32" if "float" in data.dtype else "int32"
+            dtype = (
+                "float32"
+                if data.dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT)
+                else "int32"
+            )
             sub = te.subtract(data[indices].astype(dtype), zp_value)
             out = te.multiply(sub, scale_value.astype("float32"))
             if out_dtype == "float32":
diff --git a/python/tvm/relax/type.py b/python/tvm/relax/type.py
index ad8f469826ef..305f01750306 100644
--- a/python/tvm/relax/type.py
+++ b/python/tvm/relax/type.py
@@ -21,7 +21,7 @@
 import tvm_ffi
 from tvm_ffi import Array
 
-from tvm.ir import EnvFunc, PrimExpr, Span, TupleType, VDevice
+from tvm.ir import EnvFunc, PrimExpr, PrimType, Span, TupleType, VDevice
 
 from . import _ffi_api
 from .expr import Expr, ShapeExpr, Type
@@ -92,7 +92,7 @@ class TensorType(Type):
     """
 
     shape: Expr | None
-    dtype: str
+    dtype: PrimType
     vdevice: VDevice | None
     ndim: int
     span: Span
@@ -100,13 +100,15 @@ class TensorType(Type):
     def __init__(
         self,
         shape: Expr | None | list[PrimExpr] = None,
-        dtype: str = "float32",
+        dtype: str | PrimType | None = "float32",
         vdevice: VDevice | None | str = None,
         ndim: int = -1,
         span: Span = None,
     ) -> None:
         if isinstance(shape, list | tuple | Array):
             shape = ShapeExpr(shape)
+        if dtype is not None and not isinstance(dtype, PrimType):
+            dtype = PrimType(dtype)
         self.__init_handle_by_constructor__(
             _ffi_api.TensorType,
             shape,
diff --git a/python/tvm/runtime/object_generic.py b/python/tvm/runtime/object_generic.py
index 51c8805f9445..505613d0372e 100644
--- a/python/tvm/runtime/object_generic.py
+++ b/python/tvm/runtime/object_generic.py
@@ -66,5 +66,9 @@ def const(value, dtype=None, span=None):
     if dtype is None:
         dtype = _scalar_type_inference(value)
     if dtype == "uint64" and value >= (1 << 63):
-        return _ffi_node_api.LargeUIntImm(dtype, value & ((1 << 32) - 1), value >> 32, span)
+        from tvm.ir import PrimType  # pylint: disable=import-outside-toplevel
+
+        return _ffi_node_api.LargeUIntImm(
+            PrimType(dtype), value & ((1 << 32) - 1), value >> 32, span
+        )
     return _ffi_node_api._const(value, dtype, span)
diff --git a/python/tvm/s_tir/schedule/schedule.py b/python/tvm/s_tir/schedule/schedule.py
index 7f191df98d84..25b81239189d 100644
--- a/python/tvm/s_tir/schedule/schedule.py
+++ b/python/tvm/s_tir/schedule/schedule.py
@@ -24,7 +24,7 @@
 
 from tvm.error import register_error
 from tvm.ir import GlobalVar, IRModule, PrimExpr
-from tvm.runtime import Object
+from tvm.runtime import DataTypeCode, Object
 from tvm.tirx import Buffer, FloatImm, For, IntImm, PrimFunc, SBlock
 from tvm.tirx.function import IndexMap
 
@@ -3465,10 +3465,14 @@ def two_elementwise_transformed_intermediate_buffer(a: T.handle, c: T.handle) ->
             # buffer's type.  If the default `tvm.runtime.convert`
             # behavior is applied, these would be converted to
             # int32/float32, which may not match the buffer's type.
-            if "int" in buffer_obj.dtype and isinstance(pad_value, int):
-                pad_value = IntImm(buffer_obj.dtype, pad_value)
-            elif "float" in buffer_obj.dtype and isinstance(pad_value, float):
-                pad_value = FloatImm(buffer_obj.dtype, pad_value)
+            if buffer_obj.dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT) and isinstance(
+                pad_value, int
+            ):
+                pad_value = IntImm(buffer_obj.dtype.dtype, pad_value)
+            elif buffer_obj.dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT) and (
+                isinstance(pad_value, float)
+            ):
+                pad_value = FloatImm(buffer_obj.dtype.dtype, pad_value)
             pad_value = IndexMap.from_func(
                 lambda *indices: pad_value,
                 ndim=len(index_map.final_indices),
diff --git a/python/tvm/script/parser/core/evaluator.py b/python/tvm/script/parser/core/evaluator.py
index 4d38292b9b56..dec30f29a114 100644
--- a/python/tvm/script/parser/core/evaluator.py
+++ b/python/tvm/script/parser/core/evaluator.py
@@ -396,7 +396,7 @@ def _eval_if_exp(self, fields: dict[str, Any]) -> Any:
         orelse = self._eval_expr(fields["orelse"])
         if isinstance(test, bool):
             return body if test else orelse
-        elif isinstance(test, tvm.tirx.PrimExpr) and test.dtype == "bool":
+        elif isinstance(test, tvm.tirx.PrimExpr) and test.dtype.type_code == tvm.DataTypeCode.BOOL:
             return tvm.tirx.op.if_then_else(test, body, orelse)
         else:
             raise TypeError(f"Expected Python bool or TIR bool, but got {type(test)}")
diff --git a/python/tvm/te/tensor.py b/python/tvm/te/tensor.py
index 531915c6798a..b7238cf07eda 100644
--- a/python/tvm/te/tensor.py
+++ b/python/tvm/te/tensor.py
@@ -19,6 +19,7 @@
 # pylint: disable=invalid-name
 import tvm_ffi
 
+from tvm.ir import PrimType
 from tvm.runtime import Object, ObjectConvertible
 from tvm.tirx import DataProducer
 from tvm.tirx import expr as _expr
@@ -49,6 +50,10 @@ def dtype(self):
         """Data content of the tensor."""
         return self.tensor.dtype
 
+    def expr_ty(self):
+        """Compile-time element type, delegated to the underlying tensor's expr_ty()."""
+        return self.tensor.expr_ty()
+
 
 @tvm_ffi.register_object("te.Tensor")
 class Tensor(DataProducer, _expr.ExprOp):
@@ -86,6 +91,15 @@ def ndim(self):
         """Dimension of the tensor."""
         return len(self.shape)
 
+    @property
+    def dtype(self):
+        """Data type of the tensor, wrapped in a PrimType."""
+        return PrimType(_ffi_api.TensorDType(self))
+
+    def expr_ty(self):
+        """Compile-time element type of the tensor."""
+        return self.dtype
+
     @property
     def name(self):
         op = self.op
diff --git a/python/tvm/tirx/buffer.py b/python/tvm/tirx/buffer.py
index 4caf154547fa..d021bb317220 100644
--- a/python/tvm/tirx/buffer.py
+++ b/python/tvm/tirx/buffer.py
@@ -544,7 +544,7 @@ def decl_buffer(
         elem_offset = Var(f"{name}_elem_offset", shape_dtype)
     if data is None:
         # Bool is represented as uint1 in the IR, but stored as int8
-        storage_type = PrimType(dtype)
+        storage_type = dtype if isinstance(dtype, PrimType) else PrimType(dtype)
         storage_type = PrimType("int8") if storage_type.dtype == "bool" else storage_type
         data = Var(name, PointerType(storage_type, scope), span)
     return _ffi_api.Buffer(  # type: ignore
diff --git a/python/tvm/tirx/expr.py b/python/tvm/tirx/expr.py
index a97171e436ae..2e01f0b6d556 100644
--- a/python/tvm/tirx/expr.py
+++ b/python/tvm/tirx/expr.py
@@ -34,7 +34,7 @@
 from tvm import ir
 from tvm.ir import Op, PrimExpr
 from tvm.ir.base import Span
-from tvm.runtime import DataType, DataTypeCode, Object, ObjectConvertible, Scriptable, const
+from tvm.runtime import DataTypeCode, Object, ObjectConvertible, Scriptable, const
 
 from . import _ffi_api
 from . import generic as _generic
@@ -56,13 +56,17 @@ def div_ambiguity_error() -> RuntimeError:
 def _dtype_is_int(value):
     if isinstance(value, int):
         return True
-    return isinstance(value, ExprOp) and DataType(value.dtype).type_code == DataTypeCode.INT  # type: ignore
+    if isinstance(value, ExprOp):
+        return value.expr_ty().matches_code(DataTypeCode.INT)
+    return False
 
 
 def _dtype_is_float(value):
     if isinstance(value, float):
         return True
-    return isinstance(value, ExprOp) and DataType(value.dtype).type_code == DataTypeCode.FLOAT  # type: ignore
+    if isinstance(value, ExprOp):
+        return value.expr_ty().matches_code(DataTypeCode.FLOAT)
+    return False
 
 
 class ExprOp:
@@ -70,6 +74,13 @@ class ExprOp:
 
     # TODO(tkonolige): use inspect to add source information to these objects
 
+    def expr_ty(self) -> ir.PrimType:
+        """Return the `ty` attribute when it is a PrimType; otherwise raise TypeError."""
+        ty = getattr(self, "ty", None)
+        if isinstance(ty, ir.PrimType):
+            return ty
+        raise TypeError(f"Cannot determine PrimType for {type(self).__name__}")
+
     def __add__(self, other: PrimExpr) -> PrimExpr:
         return _generic.add(self, other)
 
@@ -259,6 +270,10 @@ def asobject(self) -> PrimExpr:
         """Convert object."""
         return _ffi_api._OpEQ(self.a, self.b, self.span)  # type: ignore
 
+    def expr_ty(self) -> ir.PrimType:
+        """Compile-time type of the equality result."""
+        return ir.PrimType("bool")
+
     def __repr__(self) -> str:
         return f"EqualOp({self.a!r}, {self.b!r})"
 
@@ -299,6 +314,10 @@ def asobject(self) -> PrimExpr:
         """Convert object."""
         return _ffi_api._OpNE(self.a, self.b, self.span)  # type: ignore
 
+    def expr_ty(self) -> ir.PrimType:
+        """Compile-time type of the inequality result."""
+        return ir.PrimType("bool")
+
     def __repr__(self) -> str:
         return f"NotEqualOp({self.a!r}, {self.b!r})"
 
@@ -473,6 +492,10 @@ def __init__(
             span,  # type: ignore
         )
 
+    def expr_ty(self) -> ir.PrimType:
+        """Compile-time type of the iteration variable."""
+        return self.var.ty
+
 
 @tvm_ffi.register_object("tirx.CommReducer")
 class CommReducer(Object, Scriptable):
@@ -1332,6 +1355,8 @@ def __init__(
             op = Op.get(op)
         if isinstance(attrs, dict):
             attrs = ir.make_node("ir.DictAttrs", **attrs)
+        if not isinstance(dtype, ir.PrimType):
+            dtype = ir.PrimType(dtype)
         if attrs:
             self.__init_handle_by_constructor__(  # type: ignore
                 _ffi_api.CallWithAttrs, dtype, op, args, attrs, span
diff --git a/python/tvm/tirx/script/parser/operation.py b/python/tvm/tirx/script/parser/operation.py
index dac8f06ebf80..c6cb50f291af 100644
--- a/python/tvm/tirx/script/parser/operation.py
+++ b/python/tvm/tirx/script/parser/operation.py
@@ -17,7 +17,8 @@
 """The tirx expression operation registration"""
 
 from tvm import tirx
-from tvm.runtime import DataType, DataTypeCode
+from tvm.ir import PrimType
+from tvm.runtime import DataTypeCode
 from tvm.script.parser._core import OpMethod, doc, register_op
 from tvm.tirx import IntImm
 from tvm.tirx.expr import FloatImm
@@ -26,12 +27,18 @@
 def _register_expr_op(ty: type):  # pylint: disable=invalid-name
     ty._dispatch_type = ty  # pylint: disable=protected-access
 
+    def _expr_ty(expr):
+        ty = expr.expr_ty()
+        if not isinstance(ty, PrimType):
+            raise TypeError(f"Expected a PrimType expression, but got {ty}")
+        return ty
+
     def _and(a, b):
         if isinstance(a, bool):
             a = IntImm("bool", a)
         if isinstance(b, bool):
             b = IntImm("bool", b)
-        if DataType(a.dtype).lanes > 1 or DataType(b.dtype).lanes > 1:
+        if not _expr_ty(a).is_scalar() or not _expr_ty(b).is_scalar():
             return a & b
         else:
             return tirx.And(a, b)
@@ -41,58 +48,56 @@ def _or(a, b):
             a = IntImm("bool", a)
         if isinstance(b, bool):
             b = IntImm("bool", b)
-        if DataType(a.dtype).lanes > 1 or DataType(b.dtype).lanes > 1:
+        if not _expr_ty(a).is_scalar() or not _expr_ty(b).is_scalar():
             return a | b
         else:
             return tirx.Or(a, b)
 
-    def _get_type_str(dtype: str):
-        if DataType(dtype).lanes == 1:
-            return dtype
-        index = dtype.find("x")
-        return dtype[0:index]
+    def _get_type_str(ty: PrimType):
+        dtype_str = str(ty.dtype)
+        if ty.is_scalar():
+            return dtype_str
+        index = dtype_str.find("x")
+        return dtype_str[0:index]
 
     def _auto_broadcast(a, b, op):
         if isinstance(a, int):
             if hasattr(b, "dtype"):
-                if (
-                    DataType(b.dtype).type_code == DataTypeCode.INT
-                    or DataType(b.dtype).type_code == DataTypeCode.UINT
-                    or DataType(b.dtype).type_code == DataTypeCode.BOOL
-                ):
-                    a = IntImm(_get_type_str(b.dtype), a)
-                elif DataType(b.dtype).type_code == DataTypeCode.FLOAT:
-                    a = FloatImm(_get_type_str(b.dtype), a)
+                b_ty = _expr_ty(b)
+                if b_ty.matches_code(DataTypeCode.INT, DataTypeCode.UINT, DataTypeCode.BOOL):
+                    a = IntImm(_get_type_str(b_ty), a)
+                elif b_ty.matches_code(DataTypeCode.FLOAT):
+                    a = FloatImm(_get_type_str(b_ty), a)
             elif isinstance(b, float):
                 a = FloatImm("float32", a)
             else:
                 a = IntImm("int32", a)
         elif isinstance(a, float):
-            if DataType(b.dtype).type_code == DataTypeCode.FLOAT:
-                a = FloatImm(_get_type_str(b.dtype), a)
+            b_ty = _expr_ty(b)
+            if b_ty.matches_code(DataTypeCode.FLOAT):
+                a = FloatImm(_get_type_str(b_ty), a)
             else:
                 a = FloatImm("float32", a)
 
         assert isinstance(a, tirx.PrimExpr), "Operand should be a PrimExpr."
         if isinstance(b, int):
-            if (
-                DataType(a.dtype).type_code == DataTypeCode.INT
-                or DataType(a.dtype).type_code == DataTypeCode.UINT
-                or DataType(a.dtype).type_code == DataTypeCode.BOOL
-            ):
-                b = IntImm(_get_type_str(a.dtype), b)
-            elif DataType(a.dtype).type_code == DataTypeCode.FLOAT:
-                b = FloatImm(_get_type_str(a.dtype), b)
+            a_ty = _expr_ty(a)
+            if a_ty.matches_code(DataTypeCode.INT, DataTypeCode.UINT, DataTypeCode.BOOL):
+                b = IntImm(_get_type_str(a_ty), b)
+            elif a_ty.matches_code(DataTypeCode.FLOAT):
+                b = FloatImm(_get_type_str(a_ty), b)
         elif isinstance(b, float):
-            b = FloatImm(_get_type_str(a.dtype), b)
+            b = FloatImm(_get_type_str(_expr_ty(a)), b)
 
-        if DataType(a.dtype).lanes == DataType(b.dtype).lanes:
+        a_ty = _expr_ty(a)
+        b_ty = _expr_ty(b)
+        if a_ty.dtype.lanes == b_ty.dtype.lanes:
             return op(a, b)
-        elif DataType(a.dtype).lanes == 1 and DataType(a.dtype).lanes != DataType(b.dtype).lanes:
-            broadcast_a = tirx.Broadcast(a, DataType(b.dtype).lanes)
+        elif a_ty.is_scalar() and a_ty.dtype.lanes != b_ty.dtype.lanes:
+            broadcast_a = tirx.Broadcast(a, b_ty.dtype.lanes)
             return op(broadcast_a, b)
-        elif DataType(b.dtype).lanes == 1 and DataType(a.dtype).lanes != DataType(b.dtype).lanes:
-            broadcast_b = tirx.Broadcast(b, DataType(a.dtype).lanes)
+        elif b_ty.is_scalar() and a_ty.dtype.lanes != b_ty.dtype.lanes:
+            broadcast_b = tirx.Broadcast(b, a_ty.dtype.lanes)
             return op(a, broadcast_b)
         else:
             raise TypeError("do not know how to deal with it.")
diff --git a/python/tvm/topi/math.py b/python/tvm/topi/math.py
index d3e8991c85c7..6088c4baa800 100644
--- a/python/tvm/topi/math.py
+++ b/python/tvm/topi/math.py
@@ -18,7 +18,7 @@
 
 # pylint: disable=redefined-builtin,unused-argument
 import tvm
-from tvm import DataType, DataTypeCode, te
+from tvm import DataTypeCode, te
 from tvm.tirx import PrimExpr
 
 from . import cpp, tag
@@ -26,11 +26,15 @@
 
 
 def _require_float_tensor(op_name, x):
-    if DataType(x.dtype).type_code not in (DataTypeCode.FLOAT, DataTypeCode.BFLOAT):
+    if not x.dtype.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT):
         raise TypeError(f"topi.{op_name} only supports floating-point inputs, but got {x.dtype}")
     return x
 
 
+def _is_integer_tensor(x):
+    return x.dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT)
+
+
 @tvm.te.tag_scope(tag=tag.ELEMWISE)
 def identity(x):
     """Take identity of input x.
@@ -478,7 +482,7 @@ def log(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int"):
+    if x.dtype.matches_code(DataTypeCode.INT):
         x = te.compute(x.shape, lambda *i: x(*i).astype("float32"))
     return te.compute(x.shape, lambda *i: te.log(x(*i)), tag=tag.ELEMWISE)
 
@@ -496,7 +500,7 @@ def log2(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int"):
+    if x.dtype.matches_code(DataTypeCode.INT):
         x = te.compute(x.shape, lambda *i: x(*i).astype("float32"))
     return te.compute(x.shape, lambda *i: te.log2(x(*i)), tag=tag.ELEMWISE)
 
@@ -514,7 +518,7 @@ def log10(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int"):
+    if x.dtype.matches_code(DataTypeCode.INT):
         x = te.compute(x.shape, lambda *i: x(*i).astype("float32"))
     return te.compute(x.shape, lambda *i: te.log10(x(*i)), tag=tag.ELEMWISE)
 
@@ -533,7 +537,7 @@ def sqrt(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int"):
+    if x.dtype.matches_code(DataTypeCode.INT):
         x = te.compute(x.shape, lambda *i: x(*i).astype("float32"))
     return te.compute(x.shape, lambda *i: te.sqrt(x(*i)))
 
@@ -552,7 +556,7 @@ def rsqrt(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int"):
+    if x.dtype.matches_code(DataTypeCode.INT):
         x = te.compute(x.shape, lambda *i: x(*i).astype("float32"))
     return te.compute(x.shape, lambda *i: te.rsqrt(x(*i)))
 
@@ -798,7 +802,7 @@ def fast_exp(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int") or x.dtype.startswith("uint"):
+    if _is_integer_tensor(x):
         x = cast(x, "float32")
     return cpp.fast_exp(x, x.dtype, tag.ELEMWISE)
 
@@ -816,7 +820,7 @@ def fast_tanh(x):
     y : tvm.te.Tensor
         The result.
     """
-    if x.dtype.startswith("int") or x.dtype.startswith("uint"):
+    if _is_integer_tensor(x):
         x = cast(x, "float32")
     return cpp.fast_tanh(x, x.dtype, tag.ELEMWISE)
 
@@ -855,24 +859,26 @@ def ceil_log2(x):
     if not isinstance(x, tvm.tirx.PrimExpr):
         x = tvm.tirx.const(x)
 
-    if "float" in x.dtype:
+    if x.ty.matches_code(DataTypeCode.FLOAT, DataTypeCode.BFLOAT):
         return tvm.tirx.ceil(tvm.tirx.log2(x))
 
     target = tvm.target.Target.current()
 
-    if "vulkan" in target.kind.name:
-        clz = tvm.tirx.clz(x)
-        bits = int(x.dtype[-2:])
-        res = tvm.tirx.if_then_else(x & (x - 1) == 0, bits - clz - 1, bits - clz)
-        if res.dtype != x.dtype:
-            return cast(res, x.dtype)
-        return res
-
-    if "adreno" in str(target.attrs.get("device", "")) or target.kind.name in [
-        "metal",
-        "rocm",
-        "webgpu",
-    ]:
-        return cast(tvm.tirx.ceil(tvm.tirx.log2(cast(x, "float32"))), x.dtype)
+    if target is not None:
+        target_name = target.kind.name
+        if "vulkan" in target_name:
+            clz = tvm.tirx.clz(x)
+            bits = x.ty.dtype.bits
+            res = tvm.tirx.if_then_else(x & (x - 1) == 0, bits - clz - 1, bits - clz)
+            if res.dtype != x.dtype:
+                return cast(res, x.dtype)
+            return res
+
+        if "adreno" in str(target.attrs.get("device", "")) or target_name in [
+            "metal",
+            "rocm",
+            "webgpu",
+        ]:
+            return cast(tvm.tirx.ceil(tvm.tirx.log2(cast(x, "float32"))), x.dtype)
 
     return cast(tvm.tirx.ceil(tvm.tirx.log2(cast(x, "float64"))), x.dtype)
diff --git a/python/tvm/topi/scatter.py b/python/tvm/topi/scatter.py
index bf5b86599854..de35577c4d85 100644
--- a/python/tvm/topi/scatter.py
+++ b/python/tvm/topi/scatter.py
@@ -18,7 +18,7 @@
 # ruff: noqa: E741
 """ScatterND operator"""
 
-from tvm import te, tirx  # hide redefinition of min and max
+from tvm import DataTypeCode, te, tirx  # hide redefinition of min and max
 from tvm.arith.analyzer import Analyzer
 from tvm.script.ir_builder import IRBuilder
 from tvm.script.ir_builder import tirx as T
@@ -49,7 +49,7 @@ def _verify_scatter_nd_inputs(data, indices, updates):
             f"of out_shape[{i}] ({data.shape[i]})."
         )
 
-    assert "int" in indices.dtype, (
+    assert indices.dtype.matches_code(DataTypeCode.INT, DataTypeCode.UINT), (
         f"Indices must be a tensor of integers, but its elements are {indices.dtype}."
     )
 
diff --git a/python/tvm/topi/sort.py b/python/tvm/topi/sort.py
index 81821e462dcf..846573db5036 100644
--- a/python/tvm/topi/sort.py
+++ b/python/tvm/topi/sort.py
@@ -110,7 +110,7 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
         f = tvm.compile(s, [data, out], "llvm")
         dev = tvm.cpu()
         tvm_data = tvm.runtime.tensor(np_data, dev)
-        tvm_out = tvm.runtime.tensor(np.zeros(dshape, dtype=data.dtype), dev)
+        tvm_out = tvm.runtime.tensor(np.zeros(dshape, dtype=data.dtype.dtype), dev)
         f(tvm_data, tvm_out)
     """
     data_buf = tvm.tirx.decl_buffer(
diff --git a/src/arith/analyzer.cc b/src/arith/analyzer.cc
index fc59f891e1bf..94eb8788846b 100644
--- a/src/arith/analyzer.cc
+++ b/src/arith/analyzer.cc
@@ -73,7 +73,8 @@ void AnalyzerObj::Bind(const Var& var, const Range& range, bool allow_override)
 void AnalyzerObj::MarkGlobalNonNegValue(const PrimExpr& value) {
   // decompose value as symbol * scale + offset
   int64_t offset = 0;
-  PrimExpr symbol_scale = tirx::MakeConst(value.dtype(), 0);
+  PrimType value_ty = value.ty();
+  PrimExpr symbol_scale = tirx::MakeConst(value_ty, 0);
 
   auto fcollect_sum = [&](PrimExpr val, int sign) {
     if (const auto* intimm = val.as<IntImmNode>()) {
@@ -90,7 +91,7 @@ void AnalyzerObj::MarkGlobalNonNegValue(const PrimExpr& value) {
 
   // split out the symbol and non-symbolic part
   int64_t cscale = 1;
-  PrimExpr symbol = tirx::MakeConst(value.dtype(), 1);
+  PrimExpr symbol = tirx::MakeConst(value_ty, 1);
   auto fcollect_prod = [&](PrimExpr val) {
     if (const auto* intimm = val.as<IntImmNode>()) {
       cscale *= intimm->value;
@@ -110,7 +111,7 @@ void AnalyzerObj::MarkGlobalNonNegValue(const PrimExpr& value) {
     Var var = ffi::GetRef<Var>(var_ptr);
     // skip non-index type, keep it to be compatible
     // with any_dim that do not represent any value
-    if (!IsIndexType(var.dtype())) return;
+    if (!IsIndexTypedExpr(var)) return;
     bool allow_override = true;
     // mark the constant bound is sufficient
     // we cannot mark interval set as that will cause relaxation of the var
@@ -169,7 +170,7 @@ bool AnalyzerObj::CanProveEqual(const PrimExpr& lhs, const PrimExpr& rhs) {
   const auto* clhs = lhs.as<IntImmNode>();
   const auto* crhs = rhs.as<IntImmNode>();
   if (clhs && crhs) return clhs->value == crhs->value;
-  if (lhs->dtype.is_handle() || rhs->dtype.is_handle()) {
+  if (lhs->ty().IsHandle() || rhs->ty().IsHandle()) {
     return lhs.same_as(rhs);
   }
   return CanProve(lhs - rhs == 0);
@@ -189,7 +190,7 @@ bool AnalyzerObj::CanProveLessEqualThanSymbolicShapeValue(const PrimExpr& lhs,
     }
   };
   UnpackReduction<tirx::MulNode>(shape, fcollect);
-  PrimExpr const_shape_bound = IntImm(shape.dtype(), std::abs(cscale));
+  PrimExpr const_shape_bound = IntImm(shape.ty(), std::abs(cscale));
   if (this->CanProve(lhs <= const_shape_bound, ProofStrength::kSymbolicBound)) return true;
   return false;
 }
diff --git a/src/arith/bound_deducer.cc b/src/arith/bound_deducer.cc
index 475a687cd462..bceeb4eafa2e 100644
--- a/src/arith/bound_deducer.cc
+++ b/src/arith/bound_deducer.cc
@@ -96,7 +96,8 @@ class BoundDeducer : public ExprFunctor<void(const PrimExpr&)> {
   void VisitExprDefault_(const ffi::Object* op) final { success_ = false; }
 
   SignType GetSignType(const PrimExpr& e) {
-    if (e.dtype().is_uint()) {
+    PrimType e_ty = e.ty();
+    if (e_ty.code() == DLDataTypeCode::kDLUInt) {
       return kPositive;
     }
     return expr_map_[e].GetSignType();
diff --git a/src/arith/canonical_simplify.cc b/src/arith/canonical_simplify.cc
index 12344cffd1d8..17a6ba022e2b 100644
--- a/src/arith/canonical_simplify.cc
+++ b/src/arith/canonical_simplify.cc
@@ -83,14 +83,14 @@ inline PrimExpr DivImpl(PrimExpr a, PrimExpr b, DivMode mode) {
  * \param analyzer The analyzer
  * \return whether value fits in dtype
  */
-bool CastIsSafe(DataType dtype, PrimExpr value, AnalyzerObj* analyzer) {
-  if (!IsIndexType(dtype)) {
+bool CastIsSafe(PrimType dtype, PrimExpr value, AnalyzerObj* analyzer) {
+  if (!IsIndexType(dtype->dtype)) {
     return false;
   }
   ConstIntBound bound = analyzer->const_int_bound(value);
   int64_t ubound = max_value(dtype).as_or_throw<IntImm>()->value;
   int64_t lbound = min_value(dtype).as_or_throw<IntImm>()->value;
-  if (value.dtype().bits() <= dtype.bits() ||  // upcast is safe
+  if (value.ty().bits() <= dtype.bits() ||  // upcast is safe
       (bound->max_value <= ubound && bound->min_value >= lbound)) {
     return true;
   }
@@ -128,7 +128,7 @@ class SplitExprNode : public CanonicalExprNode {
 
   PrimExpr NormalizeWithScale(int64_t sscale) const {
     PrimExpr res = this->index;
-    DataType dtype = this->dtype;
+    PrimType dtype = this->ty();
     if (this->scale == 0) {
       return IntImm(dtype, 0);
     }
@@ -140,7 +140,7 @@ class SplitExprNode : public CanonicalExprNode {
     }
     sscale *= this->scale;
     if (sscale != 1) {
-      TVM_FFI_ICHECK(!dtype.is_uint() || sscale > 0);
+      TVM_FFI_ICHECK(dtype.code() != DLDataTypeCode::kDLUInt || sscale > 0);
       res = res * MakeConst(dtype, sscale);
     }
     return res;
@@ -156,12 +156,12 @@ class SplitExprNode : public CanonicalExprNode {
    * \param analyzer The analyzer
    * \return whether the cast can be safely pushed to children
    */
-  bool CanPushCastToChildren(DataType dtype, AnalyzerObj* analyzer) const {
+  bool CanPushCastToChildren(PrimType dtype, AnalyzerObj* analyzer) const {
     // cast(dtype, index % upper_factor / lower_factor * scale) ==
     // cast(dtype, index) % upper_factor / lower_factor * scale
     // iff it is an upcast (dtype.bits >= self.dtype.bits) or all of
     // its intermediate results fit in the range of dtype
-    if (dtype.bits() >= this->dtype.bits()) {
+    if (dtype.bits() >= this->ty().bits()) {
       return true;  // upcast is safe
     }
     PrimExpr res = this->index;
@@ -172,20 +172,20 @@ class SplitExprNode : public CanonicalExprNode {
       return false;
     }
     if (this->upper_factor != SplitExprNode::kPosInf) {
-      res = ModImpl(res, MakeConst(this->dtype, this->upper_factor), div_mode);
+      res = ModImpl(res, MakeConst(this->ty(), this->upper_factor), div_mode);
       if (!CastIsSafe(dtype, res, analyzer)) {
         return false;
       }
     }
     if (this->lower_factor != 1) {
-      res = DivImpl(res, MakeConst(this->dtype, this->lower_factor), div_mode);
+      res = DivImpl(res, MakeConst(this->ty(), this->lower_factor), div_mode);
       if (!CastIsSafe(dtype, res, analyzer)) {
         return false;
       }
     }
     if (this->scale != 1) {
-      TVM_FFI_ICHECK(!this->dtype.is_uint() || this->scale > 0);
-      res = res * MakeConst(this->dtype, this->scale);
+      TVM_FFI_ICHECK(this->ty().code() != DLDataTypeCode::kDLUInt || this->scale > 0);
+      res = res * MakeConst(this->ty(), this->scale);
       if (!CastIsSafe(dtype, res, analyzer)) {
         return false;
       }
@@ -197,9 +197,9 @@ class SplitExprNode : public CanonicalExprNode {
    * \brief self = cast(dtype, self)
    * \param dtype The target datatype
    */
-  void PushCastToChildren(DataType dtype) {
+  void PushCastToChildren(PrimType dtype) {
     this->index = cast(dtype, this->index);
-    this->dtype = dtype;
+    this->BaseExprNode::ty = dtype;
   }
 
   inline bool IndexEqual(const SplitExpr& other) const;
@@ -252,9 +252,9 @@ class SumExprNode : public CanonicalExprNode {
   PrimExpr Normalize() const final {
     // quick path 1.
     if (this->args.size() == 0) {
-      return MakeConst(this->dtype, this->base);
+      return MakeConst(this->ty(), this->base);
     }
-    return Normalize_(this->dtype, SimplifySplitExprs(args), base);
+    return Normalize_(this->ty(), SimplifySplitExprs(args), base);
   }
   /*!
    * \brief Whether self is divisible by scale.
@@ -334,14 +334,14 @@ class SumExprNode : public CanonicalExprNode {
    * \param analyzer The analyzer
    * \return whether the cast can be safely pushed to children
    */
-  bool CanPushCastToChildren(DataType dtype, AnalyzerObj* analyzer) const {
+  bool CanPushCastToChildren(PrimType dtype, AnalyzerObj* analyzer) const {
     bool is_min_value = dtype.bits() == 64 ? base == std::numeric_limits<int64_t>::lowest()
                                            : base == -(1LL << (dtype.bits() - 1));
     // cast(dtype, arg_1 + arg_2 + ... arg_n) ==
     // cast(dtype, arg_1) + ... + cast(dtype, arg_n)
     // iff it is an upcast (dtype.bits >= self.dtype.bits) or all of
     // its intermediate results fit in the range of dtype
-    if (dtype.bits() >= this->dtype.bits()) {
+    if (dtype.bits() >= this->ty().bits()) {
       return true;  // upcast is safe
     }
     PrimExpr res = IntImm(dtype, 0);
@@ -386,11 +386,11 @@ class SumExprNode : public CanonicalExprNode {
    * \brief self = cast(dtype, self)
    * \param dtype The target datatype
    */
-  void PushCastToChildren(DataType dtype) {
+  void PushCastToChildren(PrimType dtype) {
     for (auto& arg : args) {
       arg.CopyOnWrite()->PushCastToChildren(dtype);
     }
-    this->dtype = dtype;
+    this->BaseExprNode::ty = dtype;
   }
   TVM_FFI_DECLARE_OBJECT_INFO_FINAL("arith.SumExpr", SumExprNode, CanonicalExprNode);
 
@@ -496,7 +496,7 @@ class SumExprNode : public CanonicalExprNode {
     std::stable_sort(args.begin(), args.end(), fcompare);
     return args;
   }
-  static PrimExpr Normalize_(DataType dtype, const std::vector<SplitExpr>& args, int64_t base) {
+  static PrimExpr Normalize_(PrimType dtype, const std::vector<SplitExpr>& args, int64_t base) {
     bool is_min_value = dtype.bits() == 64 ? base == std::numeric_limits<int64_t>::lowest()
                                            : base == -(1LL << (dtype.bits() - 1));
     // Positive scales first
@@ -648,7 +648,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl {
       expr = op->Normalize();
     }
     ffi::ObjectPtr<SplitExprNode> n = ffi::make_object<SplitExprNode>();
-    n->dtype = expr.dtype();
+    n->BaseExprNode::ty = expr.ty();
     n->index = std::move(expr);
     n->div_mode = kTruncDiv;
     return SplitExpr(n);
@@ -685,7 +685,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl {
       return op.value();
     }
     ffi::ObjectPtr<SumExprNode> n = ffi::make_object<SumExprNode>();
-    n->dtype = expr.dtype();
+    n->BaseExprNode::ty = expr.ty();
     if (const auto* op = expr.as<IntImmNode>()) {
       n->base = op->value;
       return SumExpr(n);
@@ -699,7 +699,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl {
 };
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const AddNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -723,7 +723,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const AddNode* op) {
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const SubNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -747,7 +747,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const SubNode* op) {
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const MulNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -794,8 +794,8 @@ void CanonicalSimplifier::Impl::SeparateDivisibleParts(const SumExprNode* psum,
                                                        SumExpr* out_non_divisible) {
   auto divisible = ffi::make_object<SumExprNode>();
   auto non_divisible = ffi::make_object<SumExprNode>();
-  divisible->dtype = psum->dtype;
-  non_divisible->dtype = psum->dtype;
+  divisible->BaseExprNode::ty = psum->ty();
+  non_divisible->BaseExprNode::ty = psum->ty();
 
   if (psum->base % coeff == 0) {
     divisible->base = psum->base;
@@ -834,11 +834,11 @@ SplitExpr CanonicalSimplifier::Impl::SplitDivConst(SplitExpr lhs, int64_t cval,
       return lhs;
     } else if (lhs->upper_factor <= (lhs->lower_factor * scaled_cval)) {
       // (x % c1) / c2  => 0 when c2 >= c1
-      return ToSplitExpr(IntImm(lhs.dtype(), 0));
+      return ToSplitExpr(IntImm(lhs.ty(), 0));
     } else {
       // move the upper_factor modular into index.
       lhs.CopyOnWrite()->index =
-          ModImpl(lhs->index, MakeConst(lhs.dtype(), lhs->upper_factor), div_mode);
+          ModImpl(lhs->index, MakeConst(lhs.ty(), lhs->upper_factor), div_mode);
       lhs.CopyOnWrite()->upper_factor = SplitExprNode::kPosInf;
       lhs.CopyOnWrite()->scale = 1;
       lhs.CopyOnWrite()->lower_factor *= scaled_cval;
@@ -862,8 +862,9 @@ bool CanonicalSimplifier::Impl::ProdDivSimplify(PrimExpr* plhs, PrimExpr* prhs,
   if (prhs->as<IntImmNode>()) return false;
   // collect lhs products and try to eliminate by matching them to prod in rhs
   ffi::Array<ffi::Optional<PrimExpr>> lhs_prods;
-  PrimExpr new_rhs = MakeConst(prhs->dtype(), 1);
-  PrimExpr new_common_scale = MakeConst(prhs->dtype(), 1);
+  PrimType rhs_ty = prhs->ty();
+  PrimExpr new_rhs = MakeConst(rhs_ty, 1);
+  PrimExpr new_common_scale = MakeConst(rhs_ty, 1);
   int64_t lhs_cscale = 1, rhs_cscale = 1;
   int num_elimination = 0;
 
@@ -905,18 +906,19 @@ bool CanonicalSimplifier::Impl::ProdDivSimplify(PrimExpr* plhs, PrimExpr* prhs,
   if (num_elimination == 0 && cscale_gcd == 1) return false;
 
   // construct prod via canonical form
-  PrimExpr new_lhs = MakeConst(plhs->dtype(), 1);
+  PrimType lhs_ty = plhs->ty();
+  PrimExpr new_lhs = MakeConst(lhs_ty, 1);
   for (ffi::Optional<PrimExpr> val : lhs_prods) {
     if (val.defined()) new_lhs = new_lhs * val.value();
   }
-  *plhs = new_lhs * MakeConst(plhs->dtype(), lhs_cscale);
-  *prhs = new_rhs * MakeConst(prhs->dtype(), rhs_cscale);
-  *common_scale = new_common_scale * MakeConst(prhs->dtype(), cscale_gcd);
+  *plhs = new_lhs * MakeConst(lhs_ty, lhs_cscale);
+  *prhs = new_rhs * MakeConst(rhs_ty, rhs_cscale);
+  *common_scale = new_common_scale * MakeConst(rhs_ty, cscale_gcd);
   return true;
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const DivNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
 
@@ -958,7 +960,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const DivNode* op) {
       // if a >= 0 && a < cval, then result == 0
       auto cbound = analyzer_->const_int_bound(Normalize(a));
       if (cbound->min_value >= 0 && cbound->max_value < cval) {
-        return IntImm(a.dtype(), 0);
+        return IntImm(a.ty(), 0);
       }
     }
     return SplitDivConst(ToSplitExpr(std::move(a)), cval, kTruncDiv);
@@ -980,7 +982,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const DivNode* op) {
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   PrimExpr a = this->CanonicalMutate(op->a);
@@ -1019,7 +1021,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
       // if a >= 0 && a < cval, then result == 0
       auto cbound = analyzer_->const_int_bound(Normalize(a));
       if (cbound->min_value >= 0 && cbound->max_value < cval) {
-        return IntImm(a.dtype(), 0);
+        return IntImm(a.ty(), 0);
       }
     }
     // Identity: floordiv(floormod(index, m*n), n) = floormod(floordiv(index, n), m)
@@ -1049,7 +1051,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
             }
             // Apply floormod(floordiv_result, m) to complete the identity
             PrimExpr div_result = Normalize(lhs);
-            return this->VisitExpr(floormod(div_result, MakeConst(a.dtype(), new_mod)));
+            return this->VisitExpr(floormod(div_result, MakeConst(a.ty(), new_mod)));
           }
         }
       }
@@ -1095,8 +1097,8 @@ SplitExpr CanonicalSimplifier::Impl::SplitModConst(SplitExpr lhs, int64_t cval,
       // Perhaps there are more chances in simplifying the index
       // Do a recursive call to simplify the mod with the new factor.
       if (new_upper_factor < lhs->upper_factor && lhs->upper_factor != SplitExprNode::kPosInf) {
-        auto updated = ToSplitExpr(this->VisitExpr(
-            ModImpl(lhs->index, MakeConst(lhs.dtype(), new_upper_factor), div_mode)));
+        auto updated = ToSplitExpr(
+            this->VisitExpr(ModImpl(lhs->index, MakeConst(lhs.ty(), new_upper_factor), div_mode)));
         // re-apply the lower_factor
         if (lhs->lower_factor != 1) {
           auto ret = SplitDivConst(updated, lhs->lower_factor, div_mode);
@@ -1126,7 +1128,7 @@ SplitExpr CanonicalSimplifier::Impl::SplitModConst(SplitExpr lhs, int64_t cval,
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ModNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -1144,7 +1146,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ModNode* op) {
       SumExpr lhs, extra;
       SeparateDivisibleParts(psum, cval, &lhs, &extra);
       if (extra->IsZero()) {
-        return IntImm(a.dtype(), 0);
+        return IntImm(a.ty(), 0);
       }
       // both lhs and extra are non-negative
       if (analyzer_->CanProveGreaterEqual(lhs->Normalize(), 0) &&
@@ -1200,7 +1202,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ModNode* op) {
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -1362,7 +1364,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ReduceNode* op) {
 }
 
 PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const CastNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Rewriter::VisitExpr_(op);
   }
   // normalize
@@ -1370,15 +1372,15 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const CastNode* op) {
   // PushCastToChildren
   if (value.as<SumExprNode>()) {
     SumExpr se = value.as_or_throw<SumExpr>();
-    if (se->CanPushCastToChildren(op->dtype, analyzer_)) {
-      se.CopyOnWrite()->PushCastToChildren(op->dtype);
+    if (se->CanPushCastToChildren(op->ty(), analyzer_)) {
+      se.CopyOnWrite()->PushCastToChildren(op->ty());
       return se;
     }
   }
   if (value.as<SplitExprNode>()) {
     SplitExpr se = value.as_or_throw<SplitExpr>();
-    if (se->CanPushCastToChildren(op->dtype, analyzer_)) {
-      se.CopyOnWrite()->PushCastToChildren(op->dtype);
+    if (se->CanPushCastToChildren(op->ty(), analyzer_)) {
+      se.CopyOnWrite()->PushCastToChildren(op->ty());
       return se;
     }
   }
@@ -1411,8 +1413,8 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const LTNode* op) {
     }
     SumExpr divisible, extra;
     SeparateDivisibleParts(lhs, gcd, &divisible, &extra);
-    DataType dtype = divisible->dtype;
-    TVM_FFI_ICHECK(extra->dtype == dtype);
+    PrimType dtype = divisible->ty();
+    TVM_FFI_ICHECK(extra->ty()->dtype == dtype->dtype);
     PrimExpr normal_extra = extra->Normalize();
     if (this->analyzer_->CanProve(normal_extra < MakeConst(dtype, gcd)) &&
         this->analyzer_->CanProve(normal_extra >= IntImm(dtype, 0))) {
diff --git a/src/arith/const_fold.h b/src/arith/const_fold.h
index fb1055660e3b..ed1fc2d1a7a6 100644
--- a/src/arith/const_fold.h
+++ b/src/arith/const_fold.h
@@ -72,18 +72,29 @@ inline ffi::Optional<PrimExpr> TryConstFold(PrimExpr a);
  * \param type The type to represent index.
  * \return the checked result.
  */
-inline bool IsIndexType(const DataType& type) {
-  return type.is_int() && !type.is_scalable_or_fixed_length_vector() &&
-         (type.bits() == 32 || type.bits() == 64);
+inline bool IsIndexType(DLDataType type) {
+  return type.code == static_cast<uint8_t>(DLDataTypeCode::kDLInt) &&
+         (type.bits == 32 || type.bits == 64) && type.lanes == 1;  // one lane, i.e. not a vector
+}
+/*! \brief Node-pointer overload: applies IsIndexType to the dtype of the node's PrimType. */
+inline bool IsIndexTypedExpr(const PrimExprNode* expr) {
+  TVM_FFI_DCHECK(expr != nullptr);
+  TVM_FFI_DCHECK(expr->BaseExprNode::ty.defined());
+  const auto* prim_ty = expr->BaseExprNode::ty.as<PrimTypeNode>();
+  TVM_FFI_DCHECK(prim_ty != nullptr);  // NOTE(review): presumably a debug-only check - confirm
+  return IsIndexType(prim_ty->dtype);
+}
+/*! \brief Reference overload: forwards the underlying node pointer to the overload above. */
+inline bool IsIndexTypedExpr(const PrimExpr& expr) {
+  return IsIndexTypedExpr(static_cast<const PrimExprNode*>(expr.get()));
 }
 
 /*! \brief Helper to get const folding result repr in int64. */
-inline int64_t GetFoldResultInt64Repr(int64_t x, const DataType& dtype) {
+inline int64_t GetFoldResultInt64Repr(int64_t x, const PrimType& dtype) {
   if (dtype.bits() < 64) {
     x &= (1LL << dtype.bits()) - 1;
   }
-  if (dtype.is_int()) {
-    // get sign extended value of integer with specified bits
+  if (dtype.code() == DLDataTypeCode::kDLInt) {  // sign-extend x from dtype.bits() bits
     int64_t m = 1LL << (dtype.bits() - 1);
     x = (x ^ m) - m;
   }
@@ -118,32 +129,30 @@ inline double GetFoldResultDoubleRepr(float x) {
   const FloatImmNode* fb = b.as<FloatImmNode>(); \
   BODY;
 
-#define TVM_INDEX_CONST_PROPAGATION(BODY)                 \
-  const IntImmNode* pa = a.as<IntImmNode>();              \
-  const IntImmNode* pb = b.as<IntImmNode>();              \
-  const DataType& ta = a.dtype();                         \
-  const DataType& tb = b.dtype();                         \
-  if (arith::IsIndexType(ta) && arith::IsIndexType(tb)) { \
-    BODY;                                                 \
+#define TVM_INDEX_CONST_PROPAGATION(BODY)                         \
+  const IntImmNode* pa = a.as<IntImmNode>();                      \
+  const IntImmNode* pb = b.as<IntImmNode>();                      \
+  if (arith::IsIndexTypedExpr(a) && arith::IsIndexTypedExpr(b)) { \
+    BODY;                                                         \
   }
 
 // specialization of constant folders.
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Add>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       int64_t res = pa->value + pb->value;
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa && pa->value == 0) return b;
     if (pb && pb->value == 0) return a;
     if (fa && fb) {
-      if (rtype.bits() == 32) {
-        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) +
-                                                       static_cast<float>(fb->value)));
-      } else if (rtype.bits() == 64) {
-        return FloatImm(rtype, fa->value + fb->value);
+      if (result_ty.bits() == 32) {
+        return FloatImm(result_ty, GetFoldResultDoubleRepr(static_cast<float>(fa->value) +
+                                                           static_cast<float>(fb->value)));
+      } else if (result_ty.bits() == 64) {
+        return FloatImm(result_ty, fa->value + fb->value);
       }
     }
     if (fa && fa->value == 0) return b;
@@ -155,22 +164,22 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Add>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Sub>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    TVM_FFI_ICHECK(!((pa && pa->dtype.is_uint() && pa->value == 0U) &&
-                     (pb && pb->dtype.is_uint() && pb->value > 0U)))
+    TVM_FFI_ICHECK(!((pa && pa->ty().code() == DLDataTypeCode::kDLUInt && pa->value == 0U) &&
+                     (pb && pb->ty().code() == DLDataTypeCode::kDLUInt && pb->value > 0U)))
         << "Checked failed. Minuend 's value is 0U and it's dtype is uint "
         << "while Subtrahend's dtype is uint; which will cause a negative uint";
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       int64_t res = pa->value - pb->value;
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pb && pb->value == 0) return a;
     if (fa && fb) {
-      if (rtype.bits() == 32) {
-        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) -
-                                                       static_cast<float>(fb->value)));
-      } else if (rtype.bits() == 64) {
-        return FloatImm(rtype, fa->value - fb->value);
+      if (result_ty.bits() == 32) {
+        return FloatImm(result_ty, GetFoldResultDoubleRepr(static_cast<float>(fa->value) -
+                                                           static_cast<float>(fb->value)));
+      } else if (result_ty.bits() == 64) {
+        return FloatImm(result_ty, fa->value - fb->value);
       }
     }
     if (fb && fb->value == 0) return a;
@@ -181,10 +190,10 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Sub>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Mul>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       int64_t res = pa->value * pb->value;
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa) {
       if (pa->value == 1) return b;
@@ -195,11 +204,11 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Mul>(PrimExpr a, PrimExpr b) {
       if (pb->value == 0) return b;
     }
     if (fa && fb) {
-      if (rtype.bits() == 32) {
-        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) *
-                                                       static_cast<float>(fb->value)));
-      } else if (rtype.bits() == 64) {
-        return FloatImm(rtype, fa->value * fb->value);
+      if (result_ty.bits() == 32) {
+        return FloatImm(result_ty, GetFoldResultDoubleRepr(static_cast<float>(fa->value) *
+                                                           static_cast<float>(fb->value)));
+      } else if (result_ty.bits() == 64) {
+        return FloatImm(result_ty, fa->value * fb->value);
       }
     }
     if (fa) {
@@ -217,13 +226,13 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Mul>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Div>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       // due to division and mod can have different modes
       // NOTE: this will assumes truc div.
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
       int64_t res = pa->value / pb->value;
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa) {
       if (pa->value == 0) return a;
@@ -234,11 +243,11 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Div>(PrimExpr a, PrimExpr b) {
     }
     if (fa && fb) {
       TVM_FFI_ICHECK_NE(fb->value, 0) << "Divide by zero";
-      if (rtype.bits() == 32) {
-        return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast<float>(fa->value) /
-                                                       static_cast<float>(fb->value)));
-      } else if (rtype.bits() == 64) {
-        return FloatImm(rtype, fa->value / fb->value);
+      if (result_ty.bits() == 32) {
+        return FloatImm(result_ty, GetFoldResultDoubleRepr(static_cast<float>(fa->value) /
+                                                           static_cast<float>(fb->value)));
+      } else if (result_ty.bits() == 64) {
+        return FloatImm(result_ty, fa->value / fb->value);
       }
     }
     if (fa && fa->value == 0) return a;
@@ -253,18 +262,18 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Div>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Mod>(PrimExpr a, PrimExpr b) {
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
       int64_t res = pa->value % pb->value;
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa) {
       if (pa->value == 0) return a;
     }
     if (pb) {
       // MakeConst can handle both vector and scalar types.
-      if (pb->value == 1) return tirx::MakeConst(rtype, 0);
+      if (pb->value == 1) return tirx::MakeConst(result_ty, 0);
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
     }
   });
@@ -274,11 +283,11 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Mod>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::FloorDiv>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
       int64_t res = arith::floordiv(pa->value, pb->value);
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa) {
       if (pa->value == 0) return a;
@@ -288,11 +297,12 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::FloorDiv>(PrimExpr a, PrimExpr
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
     }
     if (fa && fb && fb->value != 0) {
-      if (rtype.bits() == 32) {
-        return FloatImm(rtype, GetFoldResultDoubleRepr(std::floor(static_cast<float>(fa->value) /
-                                                                  static_cast<float>(fb->value))));
-      } else if (rtype.bits() == 64) {
-        return FloatImm(rtype, std::floor(fa->value / fb->value));
+      if (result_ty.bits() == 32) {
+        return FloatImm(result_ty,
+                        GetFoldResultDoubleRepr(std::floor(static_cast<float>(fa->value) /
+                                                           static_cast<float>(fb->value))));
+      } else if (result_ty.bits() == 64) {
+        return FloatImm(result_ty, std::floor(fa->value / fb->value));
       } else {
         return std::nullopt;
       }
@@ -309,18 +319,18 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::FloorDiv>(PrimExpr a, PrimExpr
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::FloorMod>(PrimExpr a, PrimExpr b) {
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pa && pb) {
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
       int64_t res = arith::floormod(pa->value, pb->value);
-      return IntImm(rtype, GetFoldResultInt64Repr(res, rtype));
+      return IntImm(result_ty, GetFoldResultInt64Repr(res, result_ty));
     }
     if (pa) {
       if (pa->value == 0) return a;
     }
     if (pb) {
       // MakeConst can handle both vector and scalar types.
-      if (pb->value == 1) return tirx::MakeConst(rtype, 0);
+      if (pb->value == 1) return tirx::MakeConst(result_ty, 0);
       TVM_FFI_ICHECK_NE(pb->value, 0) << "Divide by zero";
     }
   });
@@ -330,9 +340,9 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::FloorMod>(PrimExpr a, PrimExpr
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Min>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, std::min(pa->value, pb->value));
-    if (fa && fb) return FloatImm(rtype, std::min(fa->value, fb->value));
+    PrimType result_ty = a.ty();
+    if (pa && pb) return IntImm(result_ty, std::min(pa->value, pb->value));
+    if (fa && fb) return FloatImm(result_ty, std::min(fa->value, fb->value));
   });
   if (a.same_as(b)) return a;
   return std::nullopt;
@@ -341,9 +351,9 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Min>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Max>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, std::max(pa->value, pb->value));
-    if (fa && fb) return FloatImm(rtype, std::max(fa->value, fb->value));
+    PrimType result_ty = a.ty();
+    if (pa && pb) return IntImm(result_ty, std::max(pa->value, pb->value));
+    if (fa && fb) return FloatImm(result_ty, std::max(fa->value, fb->value));
   });
   if (a.same_as(b)) return a;
   return std::nullopt;
diff --git a/src/arith/const_int_bound.cc b/src/arith/const_int_bound.cc
index 4d700564ea05..3e8087af0eff 100644
--- a/src/arith/const_int_bound.cc
+++ b/src/arith/const_int_bound.cc
@@ -151,7 +151,7 @@ class ConstIntBoundAnalyzer::Impl
 
   // Override visitor behaviors
   Entry VisitExprDefault_(const ffi::Object* op) final {
-    return Everything(static_cast<const PrimExprNode*>(op)->dtype);
+    return Everything(static_cast<const PrimExprNode*>(op)->ty());
   }
 
   Entry VisitExpr(const PrimExpr& expr) final {
@@ -167,7 +167,7 @@ class ConstIntBoundAnalyzer::Impl
     if (bound_) {
       auto val = bound_->find(expr);
       if (val != bound_->end()) {
-        auto everything = Everything(expr->dtype);
+        auto everything = Everything(expr->ty());
         TVM_FFI_ICHECK(
             (val->second->min_value == res.min_value && val->second->max_value == res.max_value) ||
             (val->second->min_value == everything.min_value &&
@@ -203,7 +203,7 @@ class ConstIntBoundAnalyzer::Impl
       a = VisitExpr(op->value);
     }
 
-    Entry b = Everything(op->dtype);
+    Entry b = Everything(op->ty());
     return Intersect(a, b);
   }
 
@@ -263,7 +263,7 @@ class ConstIntBoundAnalyzer::Impl
   Entry VisitExpr_(const DivNode* op) final {
     Entry a = VisitExpr(op->a);
     Entry b = AssumeNoZeroDivisor(VisitExpr(op->b));
-    return HandleDivision(a, b, op->dtype, InfAwareDiv);
+    return HandleDivision(a, b, op->ty(), InfAwareDiv);
   }
 
   Entry VisitExpr_(const ModNode* op) final {
@@ -312,14 +312,14 @@ class ConstIntBoundAnalyzer::Impl
       TVM_FFI_ICHECK(!b.is_const(0)) << "mod by zero";
       // mod by negative value is rare,
       // and we just use the simpliest rule.
-      return Everything(op->dtype);
+      return Everything(op->ty());
     }
   }
 
   Entry VisitExpr_(const FloorDivNode* op) final {
     Entry a = VisitExpr(op->a);
     Entry b = AssumeNoZeroDivisor(VisitExpr(op->b));
-    return HandleDivision(a, b, op->dtype, InfAwareFloorDiv);
+    return HandleDivision(a, b, op->ty(), InfAwareFloorDiv);
   }
 
   Entry VisitExpr_(const FloorModNode* op) final {
@@ -385,7 +385,7 @@ class ConstIntBoundAnalyzer::Impl
       int64_t b_max_cap = InfAwareAdd(b.max_value, -1);
       return Intersect(MakeBound(std::min(static_cast<int64_t>(0), b_min_cap),
                                  std::max(static_cast<int64_t>(0), b_max_cap)),
-                       Everything(op->dtype));
+                       Everything(op->ty()));
     }
   }
 
@@ -424,7 +424,7 @@ class ConstIntBoundAnalyzer::Impl
     } else if (op->op.same_as(tirx::builtin::bitwise_and())) {
       return VisitBitwiseAnd(op);
     } else {
-      return Everything(op->dtype);
+      return Everything(op->ty());
     }
   }
 
@@ -434,7 +434,7 @@ class ConstIntBoundAnalyzer::Impl
     if (it != var_map_.end()) {
       return it->second;
     } else {
-      return Everything(op->dtype);
+      return Everything(op->ty());
     }
   }
 
@@ -456,7 +456,7 @@ class ConstIntBoundAnalyzer::Impl
       // If either operand can negative, we may run into undefined
       // behavior for some targets.  In these cases, avoid making any
       // assumptions about the result.
-      return Everything(op->dtype);
+      return Everything(op->ty());
     }
 
     return BinaryOpBoundary(a, b, InfAwareLeftShift);
@@ -481,7 +481,7 @@ class ConstIntBoundAnalyzer::Impl
       if (a.min_value >= 0) {
         return MakeBound(0, a.max_value);
       }
-      return Everything(op->dtype);
+      return Everything(op->ty());
     }
   }
 
@@ -549,7 +549,7 @@ class ConstIntBoundAnalyzer::Impl
    * \return The result.
    */
   template <typename F>
-  static Entry HandleDivision(Entry a, Entry b, DataType dt, const F& op) {
+  static Entry HandleDivision(Entry a, Entry b, PrimType dt, const F& op) {
     // Here we have a / b.
     // The largest value of the division will be for the smallest (with
     // respect to the absolute value) value of b. If the range of b starts
@@ -557,7 +557,7 @@ class ConstIntBoundAnalyzer::Impl
     // be closer to 0, because BinaryOpBoundary only checks end-points of
     // the domain ranges.
     // If the range of b contains 0, then some infinity will be involved
-    if (b.min_value <= 0 && 0 <= b.max_value && dt.is_int()) {
+    if (b.min_value <= 0 && 0 <= b.max_value && dt.code() == DLDataTypeCode::kDLInt) {
       Entry b_neg = b.min_value < 0 ? MakeBound(b.min_value, -1) : Everything(dt);
       Entry b_pos = b.max_value > 0 ? MakeBound(1, b.max_value) : Everything(dt);
 
@@ -566,7 +566,7 @@ class ConstIntBoundAnalyzer::Impl
 
       return MakeBound(std::min(e_neg.min_value, e_pos.min_value),
                        std::max(e_neg.max_value, e_pos.max_value));
-    } else if (b.min_value == 0 && dt.is_uint()) {
+    } else if (b.min_value == 0 && dt.code() == DLDataTypeCode::kDLUInt) {
       // uints only have one sided bounds
       Entry assumed_b = MakeBound(1, b.max_value);
       return BinaryOpBoundary(a, assumed_b, op);
@@ -727,16 +727,17 @@ class ConstIntBoundAnalyzer::Impl
    * \param dtype The data type.
    * \return Bound that represent everything dtype can represent.
    */
-  static Entry Everything(DataType dtype) {
-    if (!dtype.is_int() && !dtype.is_uint() && !dtype.is_bool()) {
+  static Entry Everything(PrimType dtype) {
+    if (dtype.code() != DLDataTypeCode::kDLInt && dtype.code() != DLDataTypeCode::kDLUInt &&
+        dtype.code() != DLDataTypeCode::kDLBool) {
       return MakeBound(kNegInf, kPosInf);
     }
-    if (dtype.is_bool()) {
+    if (dtype.code() == DLDataTypeCode::kDLBool) {
       return MakeBound(0, 1);
     }
     Entry ret;
-    int64_t vbits = dtype.bits() - static_cast<int>(dtype.is_int());
-    if (dtype.is_uint()) {
+    int64_t vbits = dtype.bits() - static_cast<int>(dtype.code() == DLDataTypeCode::kDLInt);
+    if (dtype.code() == DLDataTypeCode::kDLUInt) {
       ret.min_value = 0;
     } else {
       if (vbits >= 63) {
@@ -800,7 +801,7 @@ class ConstIntBoundAnalyzer::Impl
   static ffi::Optional<PrimExpr> FindCeilLog2Arg(const CastNode* op) {
     static const Op& ceil_op = Op::Get("tirx.ceil");
     static const Op& log2_op = Op::Get("tirx.log2");
-    if (op->dtype.is_int()) {
+    if (op->ty().code() == DLDataTypeCode::kDLInt) {
       if (auto as_call = op->value.as<CallNode>()) {
         if (as_call->op.same_as(ceil_op)) {
           PrimExpr ceil_arg = as_call->args[0];
diff --git a/src/arith/detect_linear_equation.cc b/src/arith/detect_linear_equation.cc
index 5e77dca59405..f7e04ee0ebf5 100644
--- a/src/arith/detect_linear_equation.cc
+++ b/src/arith/detect_linear_equation.cc
@@ -54,10 +54,10 @@ class LinearEqDetector : public ExprFunctor<LinearEqEntry(const PrimExpr&, const
     *ret = VisitExpr(e, e);
     if (fail_) return false;
     if (!ret->base.defined()) {
-      ret->base = IntImm(var_.dtype(), 0);
+      ret->base = IntImm(var_.ty(), 0);
     }
     if (!ret->coeff.defined()) {
-      ret->coeff = IntImm(var_.dtype(), 0);
+      ret->coeff = IntImm(var_.ty(), 0);
     }
     return true;
   }
@@ -101,8 +101,8 @@ class LinearEqDetector : public ExprFunctor<LinearEqEntry(const PrimExpr&, const
   LinearEqEntry VisitExpr_(const VarNode* op, const PrimExpr& e) final {
     LinearEqEntry ret;
     if (op == var_.get()) {
-      auto dtype = op->dtype;
-      ret.coeff = MakeConst(DataType::Int(dtype.bits(), dtype.lanes()), 1);
+      PrimType dtype = op->ty();
+      ret.coeff = MakeConst(PrimType::Int(dtype.bits(), dtype.lanes()), 1);
     } else {
       ret.base = e;
     }
@@ -194,19 +194,21 @@ bool DetectClipBound(const PrimExpr& cond,
   bool is_eq = false;
   PrimExpr canonical;
   if (const LTNode* op = cond.as<LTNode>()) {
-    if (!op->a.dtype().is_int()) return false;
-    canonical = op->b - op->a - MakeConst(op->a.dtype(), 1);
+    PrimType a_ty = op->a.ty();
+    if (a_ty.code() != DLDataTypeCode::kDLInt) return false;
+    canonical = op->b - op->a - MakeConst(a_ty, 1);
   } else if (const LENode* op = cond.as<LENode>()) {
-    if (!op->a.dtype().is_int()) return false;
+    if (op->a.ty().code() != DLDataTypeCode::kDLInt) return false;
     canonical = op->b - op->a;
   } else if (const GTNode* op = cond.as<GTNode>()) {
-    if (!op->a.dtype().is_int()) return false;
-    canonical = op->a - op->b - MakeConst(op->a.dtype(), 1);
+    PrimType a_ty = op->a.ty();
+    if (a_ty.code() != DLDataTypeCode::kDLInt) return false;
+    canonical = op->a - op->b - MakeConst(a_ty, 1);
   } else if (const GENode* op = cond.as<GENode>()) {
-    if (!op->a.dtype().is_int()) return false;
+    if (op->a.ty().code() != DLDataTypeCode::kDLInt) return false;
     canonical = op->a - op->b;
   } else if (const EQNode* op = cond.as<EQNode>()) {
-    if (!op->a.dtype().is_int()) return false;
+    if (op->a.ty().code() != DLDataTypeCode::kDLInt) return false;
     canonical = op->a - op->b;
     is_eq = true;
   } else {
diff --git a/src/arith/int_constraints.cc b/src/arith/int_constraints.cc
index 55db4fc774b6..bcd957aac0f2 100644
--- a/src/arith/int_constraints.cc
+++ b/src/arith/int_constraints.cc
@@ -74,7 +74,9 @@ ffi::Array<PrimExpr> AsConditions(const ffi::Array<Var>& variables,
 
 IntGroupBounds::IntGroupBounds(PrimExpr coef, ffi::Array<PrimExpr> lower,
                                ffi::Array<PrimExpr> equal, ffi::Array<PrimExpr> upper) {
-  TVM_FFI_ICHECK(coef.dtype().is_int() || coef.dtype().is_uint())
+  PrimType coef_ty = coef.ty();
+  TVM_FFI_ICHECK(coef_ty.code() == DLDataTypeCode::kDLInt ||
+                 coef_ty.code() == DLDataTypeCode::kDLUInt)
       << "Coefficient in IntGroupBounds must be integers";
   ffi::ObjectPtr<IntGroupBoundsNode> node = ffi::make_object<IntGroupBoundsNode>();
   node->coef = std::move(coef);
@@ -86,7 +88,7 @@ IntGroupBounds::IntGroupBounds(PrimExpr coef, ffi::Array<PrimExpr> lower,
 
 IntGroupBounds IntGroupBounds::FromRange(const Range& r) {
   Analyzer analyzer;
-  PrimExpr coef = tirx::MakeConst(r->min.dtype(), 1);
+  PrimExpr coef = tirx::MakeConst(r->min.ty(), 1);
   ffi::Array<PrimExpr> equal;
   ffi::Array<PrimExpr> lower;
   ffi::Array<PrimExpr> upper;
@@ -232,7 +234,9 @@ IntConstraints::IntConstraints(ffi::Array<Var> variables, ffi::Map<Var, Range> r
   }
   TVM_FFI_ICHECK(relations.defined());
   for (const auto& var : variables) {
-    TVM_FFI_ICHECK(var.dtype().is_int() || var.dtype().is_uint())
+    PrimType var_ty = var.ty();
+    TVM_FFI_ICHECK(var_ty.code() == DLDataTypeCode::kDLInt ||
+                   var_ty.code() == DLDataTypeCode::kDLUInt)
         << "Variables in IntConstraints must be integers";
   }
   node->variables = std::move(variables);
diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc
index d7bf32442497..ac966582e766 100644
--- a/src/arith/int_set.cc
+++ b/src/arith/int_set.cc
@@ -50,8 +50,8 @@ using tirx::MakeConst;
 
 TVM_FFI_STATIC_INIT_BLOCK() { IntervalSetNode::RegisterReflection(); }
 
-PrimExpr SymbolicLimits::pos_inf_ = Var("pos_inf", DataType::Handle());
-PrimExpr SymbolicLimits::neg_inf_ = Var("neg_inf", DataType::Handle());
+PrimExpr SymbolicLimits::pos_inf_ = Var("pos_inf", PrimType::Handle());
+PrimExpr SymbolicLimits::neg_inf_ = Var("neg_inf", PrimType::Handle());
 
 IntervalSet::IntervalSet(PrimExpr min_value, PrimExpr max_value) {
   auto node = ffi::make_object<IntervalSetNode>();
@@ -72,8 +72,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 IntervalSet Intersect(AnalyzerObj* analyzer, IntervalSet a, IntervalSet b) {
   PrimExpr max_value = min(a->max_value, b->max_value);
   PrimExpr min_value = max(a->min_value, b->min_value);
-  if ((max_value.dtype().is_int() || max_value.dtype().is_uint()) &&
-      (min_value.dtype().is_int() || min_value.dtype().is_uint()) &&
+  PrimType max_ty = max_value.ty();
+  PrimType min_ty = min_value.ty();
+  if ((max_ty.code() == DLDataTypeCode::kDLInt || max_ty.code() == DLDataTypeCode::kDLUInt) &&
+      (min_ty.code() == DLDataTypeCode::kDLInt || min_ty.code() == DLDataTypeCode::kDLUInt) &&
       analyzer->CanProve(max_value < min_value)) {
     return IntervalSet::Empty();
   } else {
@@ -121,7 +123,7 @@ TVM_DECLARE_LOGICAL_OP(Not);
  */
 template <typename Op, typename OpNode>
 inline IntervalSet Combine(AnalyzerObj* analyzer, IntervalSet a, IntervalSet b, const OpNode* op) {
-  DataType dtype = op->dtype;
+  PrimType dtype = op->ty();
   if (a->IsSinglePoint() && b->IsSinglePoint()) {
     PrimExpr expr;
     if (auto res = TryConstFold<Op>(a->min_value, b->min_value)) {
@@ -195,7 +197,7 @@ inline IntervalSet Combine<tirx::Mul>(AnalyzerObj* analyzer, IntervalSet a, Inte
       return IntervalSet(min_value, max_value);
     } else if (a->HasUpperBound() && a->HasLowerBound()) {
       using tirx::Select;
-      PrimExpr sign = b->min_value >= IntImm(b->min_value.dtype().element_of(), 0);
+      PrimExpr sign = b->min_value >= IntImm(b->min_value.ty().WithLanes(1), 0);
       PrimExpr e1 = a->min_value * b->min_value;
       PrimExpr e2 = a->max_value * b->min_value;
       return IntervalSet(Select(sign, e1, e2), Select(sign, e2, e1));
@@ -229,7 +231,7 @@ inline IntervalSet Combine<tirx::Div>(AnalyzerObj* analyzer, IntervalSet a, Inte
       return IntervalSet(min_value, max_value);
     } else if (a->HasUpperBound() && a->HasLowerBound()) {
       using tirx::Select;
-      PrimExpr sign = b->min_value >= IntImm(b->min_value.dtype().element_of(), 0);
+      PrimExpr sign = b->min_value >= IntImm(b->min_value.ty().WithLanes(1), 0);
       PrimExpr e1 = a->min_value / b->min_value;
       PrimExpr e2 = a->max_value / b->min_value;
       return IntervalSet(Select(sign, e1, e2), Select(sign, e2, e1));
@@ -258,7 +260,7 @@ inline IntervalSet Combine<tirx::Mod>(AnalyzerObj* analyzer, IntervalSet a, Inte
     // is the case of our application.
     // TODO(tqchen): add bound constraints for a.
     if (analyzer->CanProveGreaterEqual(divisor, 0)) {
-      return IntervalSet(IntImm(divisor.dtype(), 0), divisor - 1);
+      return IntervalSet(IntImm(divisor.ty(), 0), divisor - 1);
     } else {
       PrimExpr bound = abs(divisor) - 1;
       return IntervalSet(-bound, bound);
@@ -292,7 +294,7 @@ inline IntervalSet Combine<tirx::FloorDiv>(AnalyzerObj* analyzer, IntervalSet a,
       return IntervalSet(min_value, max_value);
     } else if (a->HasUpperBound() && a->HasLowerBound()) {
       using tirx::Select;
-      PrimExpr sign = b->min_value >= IntImm(b->min_value.dtype().element_of(), 0);
+      PrimExpr sign = b->min_value >= IntImm(b->min_value.ty().WithLanes(1), 0);
       PrimExpr e1 = floordiv(a->min_value, b->min_value);
       PrimExpr e2 = floordiv(a->max_value, b->min_value);
       return IntervalSet(Select(sign, e1, e2), Select(sign, e2, e1));
@@ -323,7 +325,7 @@ inline IntervalSet Combine<tirx::FloorMod>(AnalyzerObj* analyzer, IntervalSet a,
         auto qmin = a->HasLowerBound() ? floordiv(a->min_value, divisor) : neg_inf();
         // We can compare +/- inf against each other, but cannot use
         // operator== between the symbolic limits and an integer.
-        bool compatible_dtypes = !(qmin.dtype().is_handle() ^ qmax.dtype().is_handle());
+        bool compatible_dtypes = !(qmin.ty().IsHandle() ^ qmax.ty().IsHandle());
         if (compatible_dtypes && analyzer->CanProve(qmax == qmin)) {
           auto tmax = a->max_value - divisor * qmin;
           auto tmin = a->min_value - divisor * qmin;
@@ -348,12 +350,13 @@ inline IntervalSet Combine<tirx::FloorMod>(AnalyzerObj* analyzer, IntervalSet a,
             int64_t max_mod_result = max_quotient * gcd + (dividend_mod->base % gcd);
 
             if (max_mod_result >= 0 && max_mod_result < div_val) {
-              return IntervalSet(IntImm(op->dtype, 0), IntImm(op->dtype, max_mod_result));
+              PrimType result_ty = ffi::GetRef<PrimExpr>(op).ty();
+              return IntervalSet(IntImm(result_ty, 0), IntImm(result_ty, max_mod_result));
             }
           }
         }
       }
-      return IntervalSet(IntImm(divisor.dtype(), 0), divisor - 1);
+      return IntervalSet(IntImm(divisor.ty(), 0), divisor - 1);
     } else {
       PrimExpr bound = abs(divisor) - 1;
       return IntervalSet(-bound, bound);
@@ -522,7 +525,7 @@ class IntervalSetEvaluator : public ExprFunctor<IntervalSet(const PrimExpr&)> {
     IntervalSet base = Eval(op->base);
     PVar<IntImm> stride;
     if (stride.Match(op->stride)) {
-      DataType t = op->base.dtype();
+      PrimType t = op->base.ty();
       int64_t vstride = stride.Eval()->value;
       if (op->lanes->IsInstance<IntImmNode>()) {
         int lanes = static_cast<int>(op->lanes.as_or_throw<IntImm>()->value);
@@ -569,18 +572,19 @@ class IntervalSetEvaluator : public ExprFunctor<IntervalSet(const PrimExpr&)> {
     // short cut for the int set.
     if (value_set->min_value.same_as(value_set->max_value)) {
       if (value_set->IsEmpty()) return value_set;
-      return IntervalSet::SinglePoint(cast(op->dtype, value_set->min_value));
+      return IntervalSet::SinglePoint(cast(op->ty(), value_set->min_value));
     }
     PrimExpr min_value =
-        value_set->HasLowerBound() ? cast(op->dtype, value_set->min_value) : neg_inf();
+        value_set->HasLowerBound() ? cast(op->ty(), value_set->min_value) : neg_inf();
     PrimExpr max_value =
-        value_set->HasUpperBound() ? cast(op->dtype, value_set->max_value) : pos_inf();
+        value_set->HasUpperBound() ? cast(op->ty(), value_set->max_value) : pos_inf();
     return IntervalSet(min_value, max_value);
   }
 
   IntervalSet VisitExpr_(const BufferLoadNode* op) final {
-    if (!(op->dtype.is_int() || op->dtype.is_uint())) {
-      DLOG(WARNING) << "cannot evaluate set BufferLoad which loads from a " << op->dtype
+    PrimType op_ty = op->ty();
+    if (!(op_ty.code() == DLDataTypeCode::kDLInt || op_ty.code() == DLDataTypeCode::kDLUInt)) {
+      DLOG(WARNING) << "cannot evaluate set BufferLoad which loads from a " << op_ty->dtype
                     << " buffer";
       return IntervalSet::Everything();
     }
@@ -1048,7 +1052,7 @@ IntSet EvalSet(PrimExpr e, const ffi::Map<Var, IntSet>& dom_map) {
 
 IntSet IntSet::Vector(PrimExpr x) {
   // short cut: simply get single point
-  if (!x.dtype().is_scalable_or_fixed_length_vector()) {
+  if (!x.ty().IsScalableVector() && !x.ty().IsFixedLengthVector()) {
     return IntSet::SinglePoint(x);
   } else {
     // vector case.
@@ -1068,7 +1072,9 @@ IntSet EvalSet(PrimExpr e, const std::unordered_map<const VarNode*, IntSet>& dom
 
 IntSet EvalSet(Range r, const ffi::Map<Var, IntSet>& dom_map) {
   Analyzer ana;
-  if ((r->min->dtype.is_int() || r->min->dtype.is_uint()) && ana->CanProveEqual(r->extent, 1)) {
+  PrimType min_ty = r->min.ty();
+  if ((min_ty.code() == DLDataTypeCode::kDLInt || min_ty.code() == DLDataTypeCode::kDLUInt) &&
+      ana->CanProveEqual(r->extent, 1)) {
     return EvalSet(r->min, dom_map);
   }
   IntervalSetEvaluator m(ana.get(), dom_map);
diff --git a/src/arith/ir_mutator_with_analyzer.cc b/src/arith/ir_mutator_with_analyzer.cc
index 8dcef7a75a80..d6a264288b16 100644
--- a/src/arith/ir_mutator_with_analyzer.cc
+++ b/src/arith/ir_mutator_with_analyzer.cc
@@ -54,7 +54,7 @@ void AppendFloorDivConstraints(const FloorDivNode* div, int64_t value, CompareKi
   int64_t divisor_value = 0;
   if (!TryGetIntImm(div->b, &divisor_value) || divisor_value <= 0) return;
 
-  DataType dtype = div->a.dtype();
+  PrimType dtype = div->a.ty();
   PrimExpr divisor = MakeConst(dtype, divisor_value);
   PrimExpr k = MakeConst(dtype, value);
   PrimExpr lo = k * divisor;
@@ -117,7 +117,8 @@ void CollectDerivedConstraintFacts(const PrimExpr& condition, std::vector<PrimEx
   }
   if (const auto* call = condition.as<CallNode>()) {
     if (call->op.same_as(tirx::builtin::bitwise_and()) && call->args.size() == 2 &&
-        call->args[0].dtype().is_bool() && call->args[1].dtype().is_bool()) {
+        call->args[0].ty().MatchesElementType(DLDataTypeCode::kDLBool, 8) &&
+        call->args[1].ty().MatchesElementType(DLDataTypeCode::kDLBool, 8)) {
       CollectDerivedConstraintFacts(call->args[0], out);
       CollectDerivedConstraintFacts(call->args[1], out);
       return;
@@ -260,7 +261,7 @@ Stmt IRMutatorWithAnalyzer::VisitStmt_(const AttrStmtNode* op) {
     if (op->attr_key == tirx::attr::thread_extent || op->attr_key == s_tir::attr::virtual_thread) {
       IterVar iv = op->node.as_or_throw<IterVar>();
       TVM_FFI_ICHECK_NE(iv->thread_tag.length(), 0U);
-      Range dom = Range::FromMinExtent(IntImm(op->value.dtype(), 0), op->value);
+      Range dom = Range::FromMinExtent(IntImm(op->value.ty(), 0), op->value);
       analyzer_->Bind(iv->var, dom);
       iter_vars_.Set(iv->var, dom);
     }
@@ -313,7 +314,8 @@ PrimExpr IRMutatorWithAnalyzer::VisitExpr_(const CallNode* op) {
         false_value.same_as(op->args[2])) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      return Call(op->dtype, op->op, {cond, true_value, false_value}, op->attrs, op->span);
+      return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op, {cond, true_value, false_value},
+                  op->attrs, op->span);
     }
   }
   return StmtExprMutator::VisitExpr_(op);
diff --git a/src/arith/ir_visitor_with_analyzer.cc b/src/arith/ir_visitor_with_analyzer.cc
index ffe9c73bd6f2..0313dbfe4271 100644
--- a/src/arith/ir_visitor_with_analyzer.cc
+++ b/src/arith/ir_visitor_with_analyzer.cc
@@ -79,7 +79,7 @@ void IRVisitorWithAnalyzer::VisitStmt_(const AttrStmtNode* op) {
     if (op->attr_key == tirx::attr::thread_extent || op->attr_key == s_tir::attr::virtual_thread) {
       IterVar iv = op->node.as_or_throw<IterVar>();
       TVM_FFI_ICHECK_NE(iv->thread_tag.length(), 0U);
-      analyzer_->Bind(iv->var, Range::FromMinExtent(IntImm(op->value->dtype, 0), op->value));
+      analyzer_->Bind(iv->var, Range::FromMinExtent(IntImm(op->value.ty(), 0), op->value));
     }
     StmtExprVisitor::VisitStmt_(op);
   });
diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc
index c7f8819f944f..430a4ec5c839 100644
--- a/src/arith/iter_affine_map.cc
+++ b/src/arith/iter_affine_map.cc
@@ -66,8 +66,8 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 IterSplitExpr::IterSplitExpr(IterMark source) {
   auto n = ffi::make_object<IterSplitExprNode>();
-  auto one = MakeConst(source->source->dtype, 1);
-  n->dtype = source->source->dtype;
+  auto one = MakeConst(source->source.ty(), 1);
+  n->BaseExprNode::ty = source->source.ty();
   n->source = std::move(source);
   n->extent = n->source->extent;
   n->lower_factor = one;
@@ -77,8 +77,8 @@ IterSplitExpr::IterSplitExpr(IterMark source) {
 
 IterSplitExpr::IterSplitExpr(IterMark source, PrimExpr scale) {
   auto n = ffi::make_object<IterSplitExprNode>();
-  auto one = MakeConst(source->source->dtype, 1);
-  n->dtype = source->source->dtype;
+  auto one = MakeConst(source->source.ty(), 1);
+  n->BaseExprNode::ty = source->source.ty();
   n->source = std::move(source);
   n->extent = n->source->extent;
   n->lower_factor = one;
@@ -89,7 +89,7 @@ IterSplitExpr::IterSplitExpr(IterMark source, PrimExpr scale) {
 IterSplitExpr::IterSplitExpr(IterMark source, PrimExpr lower_factor, PrimExpr extent,
                              PrimExpr scale) {
   auto n = ffi::make_object<IterSplitExprNode>();
-  n->dtype = source->source->dtype;
+  n->BaseExprNode::ty = source->source.ty();
   n->source = std::move(source);
   n->lower_factor = std::move(lower_factor);
   n->extent = std::move(extent);
@@ -109,7 +109,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 IterSumExpr::IterSumExpr(ffi::Array<IterSplitExpr> args, PrimExpr base) {
   auto n = ffi::make_object<IterSumExprNode>();
-  n->dtype = base->dtype;
+  n->BaseExprNode::ty = base.ty();
   n->args = std::move(args);
   n->base = std::move(base);
   data_ = std::move(n);
@@ -563,7 +563,7 @@ class IterMapRewriter : public ExprMutator {
                                                IterMapLevel check_level) {
     std::vector<bool> used(splits.size(), false);
     std::vector<IterSplitExpr> iters;
-    PrimExpr expected_lower_factor = MakeConst(mark->source->dtype, 1);
+    PrimExpr expected_lower_factor = MakeConst(mark->source.ty(), 1);
 
     for (size_t i = 0; i < splits.size(); ++i) {
       size_t j = 0;
@@ -694,7 +694,7 @@ class IterMapRewriter : public ExprMutator {
       PrimExpr iter_min = mark_offset;
       PrimExpr iter_max = iter_min + mark->extent;
       // the delta of iter_min when it is updated when the lower bound predicate is present
-      PrimExpr iter_min_delta = IntImm(iter_min.dtype(), 0);
+      PrimExpr iter_min_delta = IntImm(iter_min.ty(), 0);
       if (predicate_induced_min.defined()) {
         iter_min_delta = max(predicate_induced_min.value(), iter_min) - iter_min;
         iter_min = max(predicate_induced_min.value(), iter_min);
@@ -788,7 +788,7 @@ class IterMapRewriter : public ExprMutator {
     for (IterSplitExpr split : expr->args) {
       int64_t symbol_prod_count = 0;
       int64_t cscale = 1;
-      PrimExpr res = tirx::MakeConst(split.dtype(), 1);
+      PrimExpr res = tirx::MakeConst(split.ty(), 1);
       auto fcollect = [&](PrimExpr val) {
         if (const auto* intimm = val.as<IntImmNode>()) {
           cscale *= intimm->value;
@@ -799,7 +799,7 @@ class IterMapRewriter : public ExprMutator {
       };
       UnpackReduction<tirx::MulNode>(split->scale, fcollect);
       if (cscale != 1) {
-        res = res * tirx::MakeConst(res.dtype(), cscale);
+        res = res * tirx::MakeConst(res.ty(), cscale);
       }
       split.CopyOnWrite()->scale = res;
       items.emplace_back(Item{cscale, symbol_prod_count, split});
@@ -830,7 +830,7 @@ class IterMapRewriter : public ExprMutator {
     if (auto op = expr.as<IterSumExpr>()) {
       return op.value();
     } else if (auto op = expr.as<IterSplitExpr>()) {
-      return IterSumExpr({op.value()}, IntImm(expr->dtype, 0));
+      return IterSumExpr({op.value()}, IntImm(expr.ty(), 0));
     } else {
       TVM_FFI_ICHECK(!expr->IsInstance<IterMapExprNode>());
       return IterSumExpr({}, expr);
@@ -1103,8 +1103,8 @@ class IterMapRewriter : public ExprMutator {
     std::vector<IterSplitExpr> flattened_iters, grouped_iters;
 
     // check if it can be remapped into a fused pattern.
-    PrimExpr expected_extra_base = IntImm(expr.dtype(), 0);
-    PrimExpr tail_extent = IntImm(expr.dtype(), 0);
+    PrimExpr expected_extra_base = IntImm(expr.ty(), 0);
+    PrimExpr tail_extent = IntImm(expr.ty(), 0);
     PrimExpr expected_scale = base_scale;
     int first_possible_unit_extent_pos = FindFirstPossibleUnitExtentIndex(expr);
 
@@ -1200,10 +1200,10 @@ class IterMapRewriter : public ExprMutator {
     IterSumExpr structured_form = expr, flattened_form = expr;
     flattened_form.CopyOnWrite()->args =
         ffi::Array<IterSplitExpr>(flattened_iters.rbegin(), flattened_iters.rend());
-    flattened_form.CopyOnWrite()->base = IntImm(expr.dtype(), 0);
+    flattened_form.CopyOnWrite()->base = IntImm(expr.ty(), 0);
     structured_form.CopyOnWrite()->args =
         ffi::Array<IterSplitExpr>(grouped_iters.rbegin(), grouped_iters.rend());
-    structured_form.CopyOnWrite()->base = IntImm(expr.dtype(), 0);
+    structured_form.CopyOnWrite()->base = IntImm(expr.ty(), 0);
     auto it = sum_fuse_map_.find(flattened_form);
     if (it != sum_fuse_map_.end()) {
       // old iter
@@ -1245,7 +1245,7 @@ class IterMapRewriter : public ExprMutator {
     if (sign > 0) {
       lhs->args.push_back(rhs);
     } else {
-      rhs.CopyOnWrite()->scale = IntImm(rhs->scale.dtype(), 0) - rhs->scale;
+      rhs.CopyOnWrite()->scale = IntImm(rhs->scale.ty(), 0) - rhs->scale;
       lhs->args.push_back(rhs);
     }
   }
@@ -1332,8 +1332,10 @@ bool MatchBoundConstraints(PrimExpr pred, ffi::Map<Var, Range>* input_iters,
     PrimExpr lhs_expr = lhs.Eval();
     PrimExpr rhs_expr = rhs.Eval();
     // we only accept predicate of integers
-    if (!((lhs_expr->dtype.is_int() || lhs_expr->dtype.is_uint()) &&
-          (rhs_expr->dtype.is_int() || rhs_expr->dtype.is_uint()))) {
+    PrimType lhs_ty = lhs_expr.ty();
+    PrimType rhs_ty = rhs_expr.ty();
+    if (!((lhs_ty.code() == DLDataTypeCode::kDLInt || lhs_ty.code() == DLDataTypeCode::kDLUInt) &&
+          (rhs_ty.code() == DLDataTypeCode::kDLInt || rhs_ty.code() == DLDataTypeCode::kDLUInt))) {
       return false;
     }
     // determine iter and bound, if we can not distinguish them simply,
@@ -1563,7 +1565,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const VarNode* op) {
 }
 
 PrimExpr IterMapRewriter::VisitExpr_(const AddNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Parent::VisitExpr_(op);
   }
   PrimExpr a = this->DirectMutate(op->a);
@@ -1596,7 +1598,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const AddNode* op) {
 }
 
 PrimExpr IterMapRewriter::VisitExpr_(const SubNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Parent::VisitExpr_(op);
   }
 
@@ -1631,7 +1633,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const SubNode* op) {
 }
 
 PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Parent::VisitExpr_(op);
   }
   // normalize
@@ -1677,7 +1679,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) {
 IterSumExpr IterMapRewriter::PreprocessDividend(IterMapExpr dividend, PrimExpr original_dividend) {
   if (dividend->IsInstance<IterSplitExprNode>()) {
     auto split = dividend.as_or_throw<IterSplitExpr>();
-    return IterSumExpr({split}, IntImm(split.dtype(), 0));
+    return IterSumExpr({split}, IntImm(split.ty(), 0));
   } else if (dividend->IsInstance<IterSumExprNode>()) {
     auto sum = dividend.as_or_throw<IterSumExpr>();
     if (sum->args.empty()) {
@@ -1880,12 +1882,12 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr base, P
     } else if (CanProveDivisible(rhs, lhs->scale) && is_zero(base)) {
       // floordiv(x*c1, c1*c2) = floordiv(x, c2), c2=rhs/scale
       rhs = floordiv(rhs, lhs->scale);
-      lhs.CopyOnWrite()->scale = MakeConst(rhs->dtype, 1);
+      lhs.CopyOnWrite()->scale = MakeConst(rhs.ty(), 1);
     } else if (CanProveDivisible(rhs, lhs->scale) && CanProveDivisible(base, lhs->scale)) {
       // floordiv(x*c1 + y*c1, c1*c2) = floordiv(x+y, c2), c2=rhs/scale
       base = floordiv(base, lhs->scale);
       rhs = floordiv(rhs, lhs->scale);
-      lhs.CopyOnWrite()->scale = MakeConst(rhs->dtype, 1);
+      lhs.CopyOnWrite()->scale = MakeConst(rhs.ty(), 1);
     } else {
       // mark as unresolved.
       ErrorLogger(this) << "Cannot represent as IterMap: the numerator's scaling factor, "
@@ -1931,7 +1933,7 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr base, P
     new_split = IterSplitExpr(IterMark(padded, padded->extent),
                               /* lower_factor = */ rhs,
                               /* extent = */ analyzer_->Simplify(ceildiv(padded->extent, rhs)),
-                              /* scale = */ MakeConst(rhs->dtype, 1));
+                              /* scale = */ MakeConst(rhs.ty(), 1));
   }
 
   auto new_base = analyzer_->Simplify(floordiv(base - left_pad, rhs), 6);
@@ -1944,7 +1946,7 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr base, P
 }
 
 PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Parent::VisitExpr_(op);
   }
 
@@ -1987,13 +1989,13 @@ PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr base, P
 
   if (is_one(rhs)) {
     // floormod(x, 1) = 0
-    return IntImm(lhs->dtype, 0);
+    return IntImm(lhs.ty(), 0);
   }
 
   if (!is_one(lhs->scale)) {
     if (CanProveDivisible(lhs->scale, rhs) && CanProveDivisible(base, rhs)) {
       // floormod(x*c1*c2, c1) = 0
-      return IntImm(lhs->dtype, 0);
+      return IntImm(lhs.ty(), 0);
     } else if (CanProveDivisible(rhs, lhs->scale) && is_zero(base)) {
       // floormod(x*c1, c1*c2) = (floormod(x, c2)) * c1, where c2 = rhs/scale
       rhs = floordiv(rhs, lhs->scale);
@@ -2028,7 +2030,7 @@ PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr base, P
 }
 
 PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) {
-  if (!IsIndexType(op->dtype)) {
+  if (!IsIndexTypedExpr(op)) {
     return Parent::VisitExpr_(op);
   }
 
@@ -2113,7 +2115,7 @@ class IterMapToExprNormalizer : public ExprMutator {
       // simplify trivial iters like `vi \in [0, 1)`, which can be useful for subsequent analysis
       // like tensorization.
       if (is_one(expr->extent) && !is_one(expr->source->extent)) {
-        return IntImm(expr->extent->dtype, 0);
+        return IntImm(expr->extent.ty(), 0);
       }
       return floordiv(source, expr->lower_factor) * expr->scale;
     } else {
@@ -2255,13 +2257,13 @@ class SubspaceDivider {
     IterSplitExpr GetInnerAsSplit() const { return GetAsSplit(inner, inner_extent); }
 
     static DivisionResult Inner(const IterMapExpr& iter, const PrimExpr& extent) {
-      auto dtype = iter.dtype();
+      PrimType dtype = iter.ty();
       return DivisionResult(IterSumExpr({}, IntImm(dtype, 0)), IntImm(dtype, 1), iter, extent,
                             Kind::kInner);
     }
 
     static DivisionResult Outer(const IterMapExpr& iter, const PrimExpr& extent) {
-      auto dtype = iter.dtype();
+      PrimType dtype = iter.ty();
       return DivisionResult(iter, extent, IterSumExpr({}, IntImm(dtype, 0)), IntImm(dtype, 1),
                             Kind::kOuter);
     }
@@ -2285,7 +2287,7 @@ class SubspaceDivider {
 
   // Divide an IterSumExpr
   DivisionResult DivideIterSumExpr(const IterSumExpr& expr, const PrimExpr& mark_extent) {
-    auto dtype = expr.dtype();
+    PrimType dtype = expr.ty();
     if (expr->args.empty()) {
       // base
       return DivisionResult(IterSumExpr({}, IntImm(dtype, 0)), IntImm(dtype, 1),
@@ -2377,7 +2379,7 @@ class SubspaceDivider {
   // args are sorted from inner to outer
   static IterMark MarkFromArgsAndBase(const std::vector<IterSplitExpr>& args, PrimExpr base) {
     std::vector<IterSplitExpr> res;
-    PrimExpr extent = MakeConst(base.dtype(), 1);
+    PrimExpr extent = MakeConst(base.ty(), 1);
     for (const IterSplitExpr& it : args) {
       IterSplitExpr arg = it;
       arg.CopyOnWrite()->scale = extent;
@@ -2431,7 +2433,7 @@ class SubspaceDivider {
       bool encountered_boundary = mark_division.IsOuter();
       std::vector<bool> used(splits.size(), false);
       std::vector<IterSplitExpr> inner_iters, outer_iters;
-      PrimExpr expected_lower_factor = MakeConst(expr->source->source->dtype, 1);
+      PrimExpr expected_lower_factor = MakeConst(expr->source->source.ty(), 1);
       // find the boundary of outer and inner, like case 1 above
       for (size_t i = 0; i < splits.size(); ++i) {
         size_t j = 0;
diff --git a/src/arith/pattern_match.h b/src/arith/pattern_match.h
index bb1ebd54cca7..f6a052089842 100644
--- a/src/arith/pattern_match.h
+++ b/src/arith/pattern_match.h
@@ -71,6 +71,7 @@
 #include <tvm/tirx/expr.h>
 
 #include <cmath>
+#include <optional>
 #include <tuple>
 
 #include "const_fold.h"
@@ -199,7 +200,10 @@ class PVar : public Pattern<PVar<T>> {
   // Store PVars by reference in the expression.
   using Nested = const PVar<T>&;
 
-  void InitMatch_() const { filled_ = false; }
+  void InitMatch_() const {
+    value_.reset();
+    filled_ = false;
+  }
 
   bool Match_(const T& value) const {
     if (!filled_) {
@@ -207,7 +211,7 @@ class PVar : public Pattern<PVar<T>> {
       filled_ = true;
       return true;
     } else {
-      return PEqualChecker<T>()(value_, value);
+      return PEqualChecker<T>()(value_.value(), value);
     }
   }
 
@@ -223,14 +227,14 @@ class PVar : public Pattern<PVar<T>> {
 
   T Eval() const {
     TVM_FFI_ICHECK(filled_);
-    return value_;
+    return value_.value();
   }
 
-  T EvalOr(const T& default_value) const { return filled_ ? value_ : default_value; }
+  T EvalOr(const T& default_value) const { return filled_ ? value_.value() : default_value; }
 
  protected:
   /*! \brief The matched value */
-  mutable T value_;
+  mutable std::optional<T> value_;
   /*! \brief whether the variable has been filled */
   mutable bool filled_{false};
 };
@@ -282,7 +286,7 @@ class PVarWithDataType : public PVarWithCheck<PVarWithDataType<T, DType>, T> {
  public:
   explicit PVarWithDataType(const DType& dtype) : dtype_(dtype) {}
 
-  bool Match_(const T& value) const { return dtype_.Match_(value->dtype); }
+  bool Match_(const T& value) const { return dtype_.Match_(value.ty()); }
 
  protected:
   typename DType::Nested dtype_;
@@ -291,15 +295,15 @@ class PVarWithDataType : public PVarWithCheck<PVarWithDataType<T, DType>, T> {
 /*!
  * \brief Pattern variable container for data type with lanes.
  */
-class PVecDataType : public PVarWithCheck<PVecDataType, DataType> {
+class PVecDataType : public PVarWithCheck<PVecDataType, PrimType> {
  public:
   /*! \brief construct vector dtype placeholder with element type check */
-  explicit PVecDataType(const DataType& elem_dtype) : elem_dtype_(elem_dtype) {}
+  explicit PVecDataType(PrimType elem_dtype) : elem_dtype_(elem_dtype) {}
 
-  bool Match_(const DataType& dtype) const { return dtype.code() == elem_dtype_.code(); }
+  bool Match_(PrimType dtype) const { return dtype.code() == elem_dtype_.code(); }
 
  protected:
-  DataType elem_dtype_;
+  PrimType elem_dtype_;
 };
 
 /*!
@@ -377,7 +381,7 @@ class PConstWithTypeLike : public Pattern<PConstWithTypeLike<TA>> {
     }
   }
 
-  PrimExpr Eval() const { return tirx::MakeConst(ref_.Eval().dtype(), value_); }
+  PrimExpr Eval() const { return tirx::MakeConst(ref_.Eval().ty(), value_); }
 
  private:
   typename TA::Nested ref_;
@@ -540,7 +544,7 @@ class PCastExpr : public Pattern<PCastExpr<DType, TA>> {
 
   bool Match_(const ffi::ObjectRef& node) const {
     if (const tirx::CastNode* ptr = node.as<tirx::CastNode>()) {
-      if (!dtype_.Match_(ptr->dtype)) return false;
+      if (!dtype_.Match_(ptr->ty())) return false;
       if (!value_.Match_(ptr->value)) return false;
       return true;
     } else {
@@ -558,7 +562,7 @@ class PCastExpr : public Pattern<PCastExpr<DType, TA>> {
 /*!
  * \brief Construct a cast pattern.
  *
- * \param dtype The target data type, can be PVar<DataType> or PConst<DataType>.
+ * \param dtype The target data type, can be PVar<PrimType> or PConst<PrimType>.
  * \param value The input type.
  *
  * \return The result pattern.
@@ -780,7 +784,7 @@ class PCallExpr : public Pattern<PCallExpr<Op, TArgs...>> {
 #define TVM_PATTERN_BINARY_INTRIN(FuncName, OpName, IntrinOpName)                         \
   struct OpName {                                                                         \
     static PrimExpr Eval(ffi::Array<PrimExpr> args) {                                     \
-      return tirx::Call(args[0].dtype(), GetOp(), args);                                  \
+      return tirx::Call(args[0].ty(), GetOp(), args);                                     \
     }                                                                                     \
     static const Op& GetOp() { return tirx::builtin::IntrinOpName(); }                    \
   };                                                                                      \
@@ -799,7 +803,7 @@ TVM_PATTERN_BINARY_INTRIN(operator^, PBitwiseXorOp, bitwise_xor);
 #define TVM_PATTERN_UNARY_INTRIN(FuncName, OpName, IntrinOpName)       \
   struct OpName {                                                      \
     static PrimExpr Eval(ffi::Array<PrimExpr> args) {                  \
-      return tirx::Call(args[0].dtype(), GetOp(), args);               \
+      return tirx::Call(args[0].ty(), GetOp(), args);                  \
     }                                                                  \
     static const Op& GetOp() { return tirx::builtin::IntrinOpName(); } \
   };                                                                   \
@@ -813,7 +817,7 @@ TVM_PATTERN_UNARY_INTRIN(operator~, PBitwiseNotOp, bitwise_not);
 // if_then_else
 struct PIfThenElseOp {
   static PrimExpr Eval(ffi::Array<PrimExpr> args) {
-    return tirx::Call(args[1].dtype(), GetOp(), args);
+    return tirx::Call(args[1].ty(), GetOp(), args);
   }
   static const Op& GetOp() { return tirx::builtin::if_then_else(); }
 };
@@ -841,7 +845,7 @@ inline PCallExpr<PIfThenElseOp, TCond, TA, TB> if_then_else(const Pattern<TCond>
 
 // vscale
 struct PVscaleOp {
-  static PrimExpr Eval() { return tirx::Call(DataType::Int(32), GetOp(), {}); }
+  static PrimExpr Eval() { return tirx::Call(PrimType::Int(32), GetOp(), {}); }
   static const Op& GetOp() { return tirx::builtin::vscale(); }
 };
 
diff --git a/src/arith/product_normal_form.h b/src/arith/product_normal_form.h
index 40d02c1952b7..79e040287fa7 100644
--- a/src/arith/product_normal_form.h
+++ b/src/arith/product_normal_form.h
@@ -79,7 +79,8 @@ inline void UnpackSum(const PrimExpr& value, FLeaf fleaf, int sign = 1) {
  */
 inline PrimExpr MulAndNormalize(const PrimExpr& lhs, const PrimExpr& rhs) {
   int64_t cscale = 1;
-  PrimExpr res = tirx::MakeConst(lhs.dtype(), 1);
+  PrimType lhs_ty = lhs.ty();
+  PrimExpr res = tirx::MakeConst(lhs_ty, 1);
   auto fcollect = [&](PrimExpr val) {
     if (const auto* intimm = val.as<IntImmNode>()) {
       cscale *= intimm->value;
@@ -90,7 +91,7 @@ inline PrimExpr MulAndNormalize(const PrimExpr& lhs, const PrimExpr& rhs) {
   UnpackReduction<tirx::MulNode>(lhs, fcollect);
   UnpackReduction<tirx::MulNode>(rhs, fcollect);
   if (cscale != 1) {
-    res = res * tirx::MakeConst(res.dtype(), cscale);
+    res = res * tirx::MakeConst(res.ty(), cscale);
   }
   return res;
 }
diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index fa3ba0b519d6..07ea2c7a7778 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -425,7 +425,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AddNode* op) {
   // Pattern var for lanes in broadcast and ramp
   PVar<PrimExpr> lanes;
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(ramp(b1, s1, lanes) + ramp(b2, s2, lanes), ramp(b1 + b2, s1 + s2, lanes));
     TVM_TRY_REWRITE(ramp(b1, s1, lanes) + broadcast(x, lanes), ramp(b1 + x, s1, lanes));
     TVM_TRY_REWRITE(broadcast(x, lanes) + ramp(b1, s1, lanes), ramp(x + b1, s1, lanes));
@@ -433,7 +433,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AddNode* op) {
     TVM_TRY_REWRITE_IF(x + broadcast(c4, lanes), x, c4.Eval()->value == 0.0f);
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Index rules
     // cancelation rules
     TVM_TRY_REWRITE((x - y) + y, x);
@@ -535,7 +535,7 @@ std::function<void()> RewriteSimplifier::Impl::EnterConstraint(const PrimExpr& c
     if (SideEffect(subconstraint) <= CallEffectKind::kPure) {
       literal_constraints_.push_back(subconstraint);
       PrimExpr negation;
-      if (subconstraint.dtype().is_bool()) {
+      if (subconstraint.ty().MatchesElementType(DLDataTypeCode::kDLBool, 8)) {
         // We could apply NormalizeBooleanOperators during
         // TryMatchLiteralConstraint, but that would require
         // performing a rewrite of each expression being checked.
@@ -543,7 +543,7 @@ std::function<void()> RewriteSimplifier::Impl::EnterConstraint(const PrimExpr& c
         // applied.
         negation = NormalizeBooleanOperators(Not(subconstraint));
       } else {
-        negation = subconstraint == IntImm(subconstraint.dtype(), 0);
+        negation = subconstraint == IntImm(subconstraint.ty(), 0);
       }
       literal_constraints_.push_back(Not(negation));
     }
@@ -575,14 +575,14 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const SubNode* op) {
   PVar<PrimExpr> lanes;
 
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(ramp(b1, s1, lanes) - ramp(b2, s2, lanes), ramp(b1 - b2, s1 - s2, lanes));
     TVM_TRY_REWRITE(ramp(b1, s1, lanes) - broadcast(x, lanes), ramp(b1 - x, s1, lanes));
     TVM_TRY_REWRITE(broadcast(x, lanes) - ramp(b1, s1, lanes), ramp(x - b1, 0 - s1, lanes));
     TVM_TRY_REWRITE(broadcast(x, lanes) - broadcast(y, lanes), broadcast(x - y, lanes));
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Index rules
     // cancelation rules
     TVM_TRY_REWRITE(matches_one_of((x + y) - y, (y + x) - y), x);
@@ -765,7 +765,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MulNode* op) {
   // Pattern var for lanes in broadcast and ramp
   PVar<PrimExpr> lanes;
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(broadcast(x, lanes) * broadcast(y, lanes), broadcast(x * y, lanes));
     TVM_TRY_REWRITE(matches_one_of(ramp(b1, s1, lanes) * broadcast(x, lanes),
                                    broadcast(x, lanes) * ramp(b1, s1, lanes)),
@@ -773,7 +773,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MulNode* op) {
     TVM_TRY_REWRITE_IF(broadcast(c3, lanes) * x, broadcast(c3, lanes), c3.Eval()->value == 0.0f);
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // constant simplification rule
     TVM_TRY_REWRITE((x + c1) * c2, x * c2 + c1 * c2);
     TVM_TRY_REWRITE((x * c1) * c2, x * (c1 * c2));
@@ -803,7 +803,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) {
   PVar<PrimExpr> lanes;
 
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     // NOTE: use div as the pattern also works for float.
     TVM_TRY_REWRITE(div(broadcast(x, lanes), broadcast(y, lanes)), broadcast(div(x, y), lanes));
     // ramp / bcast
@@ -827,7 +827,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) {
     }
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Be-aware of the division rules:
     // We adopt the default C division uses truncation instead of floordiv.
     // This means most rules need to check non-negativeness of the operands.
@@ -839,7 +839,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) {
     if (truncdiv(c1, c2).Match(ret)) {
       int64_t c1val = c1.Eval()->value;
       int64_t c2val = c2.Eval()->value;
-      return MakeConst(op->dtype, truncdiv(c1val, c2val));
+      return MakeConst(op->ty(), truncdiv(c1val, c2val));
     }
 
     // while it is always true for trunc div
@@ -957,7 +957,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const ModNode* op) {
   PVar<PrimExpr> lanes;
 
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(truncmod(broadcast(x, lanes), broadcast(y, lanes)),
                     broadcast(truncmod(x, y), lanes));
 
@@ -994,7 +994,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const ModNode* op) {
     }
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Be-aware of the division rules:
     // We adopt the default C division uses truncation instead of floordiv.
     // This means most rules need to check non-negativeness of the operands.
@@ -1019,7 +1019,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const ModNode* op) {
     // canonicalization: x % c == x % (-c) for truncated division
     // NOTE: trunc div required
     TVM_TRY_RECURSIVE_REWRITE_IF(
-        truncmod(x, c1), truncmod(x, PConst<PrimExpr>(MakeConst(op->dtype, -c1.Eval()->value))),
+        truncmod(x, c1), truncmod(x, PConst<PrimExpr>(MakeConst(op->ty(), -c1.Eval()->value))),
         c1.Eval()->value < 0);
 
     // try modular analysis
@@ -1046,7 +1046,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
   PVar<PrimExpr> lanes;
 
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(floordiv(broadcast(x, lanes), broadcast(y, lanes)),
                     broadcast(floordiv(x, y), lanes));
     // ramp // bcast
@@ -1077,7 +1077,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
     }
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Be-aware of the division rules: this is floor division.
     TVM_TRY_REWRITE_IF(floordiv(floordiv(x, c1), c2), floordiv(x, c1 * c2),
                        c1.Eval()->value > 0 && c2.Eval()->value > 0);
@@ -1198,7 +1198,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
   PVar<PrimExpr> lanes;
 
   // Vector rules
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(floormod(broadcast(x, lanes), broadcast(y, lanes)),
                     broadcast(floormod(x, y), lanes));
 
@@ -1238,7 +1238,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
     }
   }
 
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     // Be-aware of the division rules: we use floordiv/floormod here
     TVM_TRY_REWRITE_IF(floormod(x * c1, c2), floormod(x * floormod(c1, c2), c2),
                        c2.Eval()->value != 0);
@@ -1314,12 +1314,12 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MinNode* op) {
   PVar<PrimExpr> lanes;
 
   // vector rule
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(min(broadcast(x, lanes), broadcast(y, lanes)), broadcast(min(x, y), lanes));
     TVM_TRY_REWRITE(min(min(x, broadcast(y, lanes)), broadcast(z, lanes)),
                     min(x, broadcast(min(y, z), lanes)));
   }
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     TVM_TRY_REWRITE(min(x, x), x);
 
     // constant int bound
@@ -1498,12 +1498,12 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MaxNode* op) {
   PVar<PrimExpr> lanes;
 
   // vector rule
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(max(broadcast(x, lanes), broadcast(y, lanes)), broadcast(max(x, y), lanes));
     TVM_TRY_REWRITE(max(max(x, broadcast(y, lanes)), broadcast(z, lanes)),
                     max(x, broadcast(max(y, z), lanes)));
   }
-  if (IsIndexType(op->dtype)) {
+  if (IsIndexTypedExpr(op)) {
     TVM_TRY_REWRITE(max(x, x), x);
 
     // constant int bound
@@ -1686,10 +1686,10 @@ ffi::Optional<PrimExpr> RewriteSimplifier::Impl::TryMatchLiteralConstraint(
   ExprDeepEqual expr_equal;
   for (const auto& constraint : literal_constraints_) {
     if (expr_equal(constraint, expr)) {
-      return MakeConst(expr->dtype, true);
+      return MakeConst(expr->ty(), true);
     }
     if (expr_equal(constraint, negation)) {
-      return MakeConst(expr->dtype, false);
+      return MakeConst(expr->ty(), false);
     }
   }
   return std::nullopt;
@@ -1715,20 +1715,20 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(EQ ret) {
   // Pattern var match IntImm
   PVar<IntImm> c1, c2;
   PVar<PrimExpr> lanes;
-  PConst<PrimExpr> ctrue(MakeConst(ret->dtype, true));
+  PConst<PrimExpr> ctrue(MakeConst(ret->ty(), true));
 
   // vector rule
-  if (ret->dtype.is_scalable_or_fixed_length_vector()) {
+  if (ret->ty().IsScalableVector() || ret->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(broadcast(x, lanes) == broadcast(y, lanes), broadcast(x == y, lanes));
   }
 
-  if (IsIndexType(ret->a.dtype())) {
+  if (IsIndexTypedExpr(ret->a)) {
     CompareResult result = TryCompare(ret->a, ret->b);
     if (result == CompareResult::kEQ) {
-      return MakeConst(ret->dtype, true);
+      return MakeConst(ret->ty(), true);
     } else if (result == CompareResult::kNE || result == CompareResult::kGT ||
                result == CompareResult::kLT) {
-      return MakeConst(ret->dtype, false);
+      return MakeConst(ret->ty(), false);
     }
     TVM_TRY_REWRITE(c1 == x, x == c1);
 
@@ -1758,13 +1758,13 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const NENode* op) {
   if (auto const_res = TryConstFold<NE>(op->a, op->b)) return const_res.value();
   if (auto match = TryMatchLiteralConstraint(ret)) return match.value();
 
-  if (IsIndexType(op->a.dtype())) {
+  if (IsIndexTypedExpr(op->a)) {
     CompareResult result = TryCompare(op->a, op->b);
     if (result == CompareResult::kNE || result == CompareResult::kGT ||
         result == CompareResult::kLT) {
-      return MakeConst(op->dtype, true);
+      return MakeConst(op->ty(), true);
     } else if (result == CompareResult::kEQ) {
-      return MakeConst(op->dtype, false);
+      return MakeConst(op->ty(), false);
     } else if (result == CompareResult::kGE) {
       // Known: a >= b
       //
@@ -1802,13 +1802,13 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LENode* op) {
   // (floordiv(A,B)<x) in these cases instead.
   ret = ApplyRewriteRules(Not(ApplyRewriteRules(LT(op->b, op->a))));
 
-  if (auto op = ret.as<LENode>(); op && IsIndexType(op->a.dtype())) {
+  if (auto op = ret.as<LENode>(); op && IsIndexTypedExpr(op->a)) {
     CompareResult result = TryCompare(op->a, op->b);
     if (result == CompareResult::kLE || result == CompareResult::kLT ||
         result == CompareResult::kEQ) {
-      return MakeConst(op->dtype, true);
+      return MakeConst(op->ty(), true);
     } else if (result == CompareResult::kGT) {
-      return MakeConst(op->dtype, false);
+      return MakeConst(op->ty(), false);
     } else if (result == CompareResult::kNE) {
       // Known: a != b
       //
@@ -1857,19 +1857,19 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(LT ret) {
   PVar<PrimExpr> lanes;
 
   // vector rule
-  if (ret->dtype.is_scalable_or_fixed_length_vector()) {
+  if (ret->ty().IsScalableVector() || ret->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(broadcast(x, lanes) < broadcast(y, lanes), broadcast(x < y, lanes));
     TVM_TRY_REWRITE(ramp(x, s1, lanes) < ramp(y, s1, lanes), broadcast(x < y, lanes));
   }
 
-  if (IsIndexType(ret->a.dtype())) {
+  if (IsIndexTypedExpr(ret->a)) {
     CompareResult result = TryCompare(ret->a, ret->b);
     if (result == CompareResult::kLT) {
-      return MakeConst(ret->dtype, true);
+      return MakeConst(ret->ty(), true);
     }
     if (result == CompareResult::kEQ || result == CompareResult::kGT ||
         result == CompareResult::kGE) {
-      return MakeConst(ret->dtype, false);
+      return MakeConst(ret->ty(), false);
     }
 
     // clang-format off
@@ -1987,9 +1987,9 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(LT ret) {
       } else if (diff == 1) {
         return lhs <= rhs;
       } else if (diff < 0 && rhs_offset != 0) {
-        return lhs + MakeConst(lhs.dtype(), -diff) < rhs;
+        return lhs + MakeConst(lhs.ty(), -diff) < rhs;
       } else if (diff > 0 && lhs_offset != 0) {
-        return lhs < rhs + MakeConst(rhs.dtype(), diff);
+        return lhs < rhs + MakeConst(rhs.ty(), diff);
       }
 
       return std::nullopt;
@@ -2024,7 +2024,7 @@ PrimExpr RewriteSimplifier::Impl::ApplyRewriteRules(Not ret) {
   // Pattern var to match any expression
   PVar<PrimExpr> x, y;
   PVar<PrimExpr> lanes;
-  if (ret->dtype.is_scalable_or_fixed_length_vector()) {
+  if (ret->ty().IsScalableVector() || ret->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(!broadcast(x, lanes), broadcast(!x, lanes));
   }
 
@@ -2100,11 +2100,11 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) {
   PVar<IntImm> c1, c2, c3;
   PVar<PrimExpr> lanes;
 
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(broadcast(x, lanes) && broadcast(y, lanes), broadcast(x && y, lanes));
   }
 
-  auto cfalse = PConst<PrimExpr>(MakeConst(op->dtype, false));
+  auto cfalse = PConst<PrimExpr>(MakeConst(op->ty(), false));
   TVM_TRY_REWRITE(x == y && x != y, cfalse);
   TVM_TRY_REWRITE(x != y && x == y, cfalse);
   TVM_TRY_REWRITE(x && !x, cfalse);
@@ -2248,11 +2248,11 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const OrNode* op) {
   PVar<IntImm> c1, c2;
   PVar<PrimExpr> lanes;
 
-  if (op->dtype.is_scalable_or_fixed_length_vector()) {
+  if (op->ty().IsScalableVector() || op->ty().IsFixedLengthVector()) {
     TVM_TRY_REWRITE(broadcast(x, lanes) || broadcast(y, lanes), broadcast(x || y, lanes));
   }
 
-  auto ctrue = PConst<PrimExpr>(MakeConst(op->dtype, true));
+  auto ctrue = PConst<PrimExpr>(MakeConst(op->ty(), true));
 
   TVM_TRY_REWRITE(x == y || x != y, ctrue);
   TVM_TRY_REWRITE(x != y || x == y, ctrue);
@@ -2319,12 +2319,14 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) {
   static const Op& ceil_op = Op::Get("tirx.ceil");
   static const Op& log2_op = Op::Get("tirx.log2");
   static const Op& clz_op = Op::Get("tirx.clz");
+  PrimType ret_ty = ffi::GetRef<PrimExpr>(op).ty();
   if (op->op.same_as(ceil_op)) {
     PrimExpr ceil_arg = op->args[0];
     if (auto arg_int = op->args[0].as<IntImmNode>()) {
-      return cast(op->dtype, IntImm(arg_int->dtype, arg_int->value));
+      return cast(ret_ty, IntImm(ffi::GetRef<PrimExpr>(arg_int).ty(), arg_int->value));
     } else if (auto arg_float = ceil_arg.as<FloatImmNode>()) {
-      return cast(op->dtype, FloatImm(arg_float->dtype, std::ceil(arg_float->value)));
+      return cast(ret_ty,
+                  FloatImm(ffi::GetRef<PrimExpr>(arg_float).ty(), std::ceil(arg_float->value)));
     } else if (auto arg_call = ceil_arg.as<CallNode>()) {
       // ceil(log2(cast(n,"float64"))) is used as the implementation of
       // topi.math.ceil_log2, and appears in iteration bounds.
@@ -2334,17 +2336,17 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) {
           // ceil(log2(n)) can be simplified, and should produce the
           // same integer result regardless of the target's rounding
           // conventions.
-          return FloatImm(op->dtype, std::ceil(std::log2(as_float->value)));
+          return FloatImm(ret_ty, std::ceil(std::log2(as_float->value)));
         }
       }
     }
   } else if (op->op.same_as(clz_op)) {
     if (const auto* arg_int = op->args[0].as<IntImmNode>()) {
-      int bits = arg_int->dtype.bits();
-      if (arg_int->value == 0) return MakeConst(op->dtype, bits);
+      int bits = arg_int->ty().bits();
+      if (arg_int->value == 0) return MakeConst(ret_ty, bits);
       for (int i = bits - 1; i >= 0; --i) {
         if ((int64_t(1) << i) & arg_int->value) {
-          return IntImm(op->dtype, bits - i - 1);
+          return IntImm(ret_ty, bits - i - 1);
         }
       }
       TVM_FFI_THROW(InternalError) << "Should not reach here";
@@ -2373,7 +2375,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) {
       // Only check constant cases to avoid recursion
       if (is_const_number(inner_else_expr) && is_const_number(else_expr) &&
           analyzer_->CanProve(inner_else_expr == else_expr)) {
-        return Call(op->dtype, op->op, {cond && inner_cond, inner_then_expr, else_expr}, op->attrs,
+        return Call(ret_ty, op->op, {cond && inner_cond, inner_then_expr, else_expr}, op->attrs,
                     op->span);
       }
     }
@@ -2384,7 +2386,9 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) {
 
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const VarNode* op) {
   Var var = ffi::GetRef<Var>(op);
-  if (op->dtype == DataType::Bool()) {
+  PrimType op_ty = op->ty();
+  if (op_ty.MatchesElementType(DLDataTypeCode::kDLBool, 8) && !op_ty.IsScalableVector() &&
+      !op_ty.IsFixedLengthVector()) {
     if (auto match = TryMatchLiteralConstraint(var)) {
       return match.value();
     }
@@ -2400,7 +2404,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const VarNode* op) {
 PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CastNode* op) {
   PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op);
   op = ret.as<CastNode>();
-  return cast(op->dtype, op->value);
+  return cast(ret.ty(), op->value);  // NOTE(review): op is used unchecked after as<>()
 }
 
 bool RewriteSimplifier::Impl::CanInlineLet(const LetNode* op) {
diff --git a/src/arith/solve_linear_equation.cc b/src/arith/solve_linear_equation.cc
index 27144c674b9f..fd507ccdd658 100644
--- a/src/arith/solve_linear_equation.cc
+++ b/src/arith/solve_linear_equation.cc
@@ -24,9 +24,9 @@
 #include <tvm/arith/analyzer.h>
 #include <tvm/arith/int_solver.h>
 #include <tvm/arith/pattern.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/logging.h>
 #include <tvm/tirx/expr.h>
 #include <tvm/tirx/op.h>
@@ -133,10 +133,10 @@ void SmithNormalFormDiag(std::vector<std::vector<int64_t>>* S, std::vector<std::
           (*S)[i][j] = new_i_j;
         }
         // We have to do the same with rhs
-        PrimExpr ea = tirx::MakeConst((*y)[index].dtype(), a);
-        PrimExpr eb = tirx::MakeConst((*y)[i].dtype(), b);
-        PrimExpr e_m_g = tirx::MakeConst((*y)[i].dtype(), m_g);
-        PrimExpr e_n_g = tirx::MakeConst((*y)[index].dtype(), n_g);
+        PrimExpr ea = tirx::MakeConst((*y)[index].ty(), a);
+        PrimExpr eb = tirx::MakeConst((*y)[i].ty(), b);
+        PrimExpr e_m_g = tirx::MakeConst((*y)[i].ty(), m_g);
+        PrimExpr e_n_g = tirx::MakeConst((*y)[index].ty(), n_g);
         PrimExpr new_index_rhs = ea * (*y)[index] + eb * (*y)[i];
         PrimExpr new_i_rhs = e_n_g * (*y)[index] - e_m_g * (*y)[i];
         (*y)[index] = new_index_rhs;
@@ -193,10 +193,10 @@ void SmithNormalFormDiag(std::vector<std::vector<int64_t>>* S, std::vector<std::
           (*V)[i][j] = new_i_j;
         }
         // And apply reverse transformations to new_to_old.
-        PrimExpr ea = tirx::MakeConst((*x)[j].dtype(), a);
-        PrimExpr eb = tirx::MakeConst((*x)[index].dtype(), b);
-        PrimExpr e_m_g = tirx::MakeConst((*x)[index].dtype(), m_g);
-        PrimExpr e_n_g = tirx::MakeConst((*x)[j].dtype(), n_g);
+        PrimExpr ea = tirx::MakeConst((*x)[j].ty(), a);
+        PrimExpr eb = tirx::MakeConst((*x)[index].ty(), b);
+        PrimExpr e_m_g = tirx::MakeConst((*x)[index].ty(), m_g);
+        PrimExpr e_n_g = tirx::MakeConst((*x)[j].ty(), n_g);
         PrimExpr new_index = e_m_g * (*x)[index] + e_n_g * (*x)[j];
         PrimExpr new_j = eb * (*x)[index] - ea * (*x)[j];
         (*x)[index] = new_index;
@@ -395,7 +395,7 @@ IntConstraintsTransform SolveLinearEquations(const IntConstraints& system_to_sol
       if (const VarNode* v_old = to_old.as<VarNode>()) {
         name_hint += "_" + v_old->name_hint;
       }
-      Var v = Var(name_hint, V_inv_x[j].dtype());
+      Var v = Var(name_hint, V_inv_x[j].ty());
       solution_for_V_inv_x.push_back(v);
       new_vars.push_back(v);
       new_to_old_map.Set(v, to_old);
@@ -403,12 +403,12 @@ IntConstraintsTransform SolveLinearEquations(const IntConstraints& system_to_sol
       // The j-th variable is just a single value, don't create a tvm variable
       // S^{-1}_{nxm} Uy_{mxn}
       if (S[j][j] >= 0) {
-        PrimExpr a = tirx::MakeConst(Uy[j].dtype(), S[j][j]);
+        PrimExpr a = tirx::MakeConst(Uy[j].ty(), S[j][j]);
         solution_for_V_inv_x.push_back(analyzer_problem->Simplify(floordiv(Uy[j], a)));
       } else {
         // This is required because some simplifiers
         // have problems with dividing by negative numbers
-        PrimExpr a = tirx::MakeConst(Uy[j].dtype(), -S[j][j]);
+        PrimExpr a = tirx::MakeConst(Uy[j].ty(), -S[j][j]);
         solution_for_V_inv_x.push_back(analyzer_problem->Simplify(floordiv(-Uy[j], a)));
       }
     }
@@ -416,9 +416,9 @@ IntConstraintsTransform SolveLinearEquations(const IntConstraints& system_to_sol
 
   // V V^{-1} x = x
   for (size_t i = 0; i < num_vars; ++i) {
-    PrimExpr e = IntImm(system_to_solve->variables[i].dtype(), 0);
+    PrimExpr e = IntImm(system_to_solve->variables[i].ty(), 0);
     for (size_t j = 0; j < num_vars; ++j) {
-      e = e + tirx::MakeConst(e.dtype(), V[i][j]) * solution_for_V_inv_x[j];
+      e = e + tirx::MakeConst(e.ty(), V[i][j]) * solution_for_V_inv_x[j];
     }
     e = analyzer_problem->Simplify(e);
     old_to_new_map.Set(system_to_solve->variables[i], e);
diff --git a/src/arith/solve_linear_inequality.cc b/src/arith/solve_linear_inequality.cc
index 80d064f71157..14b1affb9927 100644
--- a/src/arith/solve_linear_inequality.cc
+++ b/src/arith/solve_linear_inequality.cc
@@ -24,9 +24,9 @@
 #include <tvm/arith/analyzer.h>
 #include <tvm/arith/int_solver.h>
 #include <tvm/arith/pattern.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/tirx/analysis.h>
 #include <tvm/tirx/expr.h>
 #include <tvm/tirx/op.h>
@@ -91,10 +91,12 @@ class NormalizeComparisons : public ExprMutator {
   template <class T>
   PrimExpr Make(const PrimExpr& a, const PrimExpr& b) {
     // rewrite LT to LE for ints
-    if (std::is_same<T, LT>::value && (a.dtype().is_int() || a.dtype().is_uint())) {
-      return LE(analyzer_->Simplify(a - b + 1), IntImm(a.dtype(), 0));
+    PrimType a_ty = a.ty();  // fetched once; also types the zero constants below
+    if (std::is_same<T, LT>::value &&
+        (a_ty.code() == DLDataTypeCode::kDLInt || a_ty.code() == DLDataTypeCode::kDLUInt)) {
+      return LE(analyzer_->Simplify(a - b + 1), IntImm(a_ty, 0));
     }
-    return T(analyzer_->Simplify(a - b), IntImm(a.dtype(), 0));
+    return T(analyzer_->Simplify(a - b), IntImm(a_ty, 0));
   }
   arith::Analyzer analyzer_;
 };
@@ -248,11 +250,12 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
     for (const auto& pos : coef_pos) {
       for (const auto& neg : coef_neg) {
         auto first_gcd = ExtendedEuclidean(pos.first, -neg.first, &gcd_x, &gcd_y);
-        PrimExpr c_pos = MakeConst(v.dtype(), neg.first / first_gcd);
-        PrimExpr c_neg = MakeConst(v.dtype(), pos.first / first_gcd);
+        PrimType v_ty = v.ty();
+        PrimExpr c_pos = MakeConst(v_ty, neg.first / first_gcd);
+        PrimExpr c_neg = MakeConst(v_ty, pos.first / first_gcd);
         // eliminate the current variable
         PrimExpr new_lhs = c_neg * neg.second - c_pos * pos.second;
-        PrimExpr new_ineq = LE(new_lhs, IntImm(pos.second.dtype(), 0));
+        PrimExpr new_ineq = LE(new_lhs, IntImm(pos.second.ty(), 0));
         // we need rewrite_simplify -> canonical_simplify -> rewrite_simplify
         // to help simplify things like (((y + 10) - (-1*(y - 20))) <= 0) => y - 5 <= 0
         // with steps = 2 it's (y*2) - 10 <= 0
@@ -281,7 +284,7 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
     lower_bounds.reserve(coef_neg.size());
 
     for (const auto& pos : coef_pos) {
-      PrimExpr bound = MakeConst(v.dtype(), -coef_lcm / pos.first) * pos.second;
+      PrimExpr bound = MakeConst(v.ty(), -coef_lcm / pos.first) * pos.second;
       bound = analyzer->Simplify(bound, kSimplifyRewriteCanonicalRewrite);
       // Don't add if any of the existing bounds is better
       if (std::any_of(upper_bounds.begin(), upper_bounds.end(),
@@ -302,7 +305,7 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
       upper_bounds.push_back(bound);
     }
     for (const auto& neg : coef_neg) {
-      PrimExpr bound = MakeConst(v.dtype(), -coef_lcm / neg.first) * neg.second;
+      PrimExpr bound = MakeConst(v.ty(), -coef_lcm / neg.first) * neg.second;
       bound = analyzer->Simplify(bound, kSimplifyRewriteCanonicalRewrite);
       // Don't add if any of the existing bounds is better
       if (std::any_of(lower_bounds.begin(), lower_bounds.end(),
@@ -330,7 +333,7 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t
     std::sort(equal_list.begin(), equal_list.end(), ExprLess());
 
     // Write it to the result.
-    IntGroupBounds bnds(MakeConst(v.dtype(), coef_lcm),
+    IntGroupBounds bnds(MakeConst(v.ty(), coef_lcm),
                         ffi::Array<PrimExpr>(lower_bounds.begin(), lower_bounds.end()),
                         ffi::Array<PrimExpr>(equal_list.begin(), equal_list.end()),
                         ffi::Array<PrimExpr>(upper_bounds.begin(), upper_bounds.end()));
@@ -509,7 +512,7 @@ IntConstraintsTransform SolveInequalitiesDeskewRange(const IntConstraints& inequ
                            analyzer->Simplify(var - Substitute(best_range->min, res_dst_to_src)));
 
         // Add the new var to the resulting axis
-        auto range = Range(IntImm(new_var.dtype(), 0), best_range->extent);
+        auto range = Range(IntImm(new_var.ty(), 0), best_range->extent);
         res_variables.push_back(new_var);
         res_ranges.Set(new_var, range);
 
diff --git a/src/arith/transitive_comparison_analyzer.cc b/src/arith/transitive_comparison_analyzer.cc
index 20fd05169f43..e6465ad3cf93 100644
--- a/src/arith/transitive_comparison_analyzer.cc
+++ b/src/arith/transitive_comparison_analyzer.cc
@@ -615,7 +615,8 @@ CompareResult TransitiveComparisonAnalyzer::Impl::TryCompare(const PrimExpr& lhs
                                                              const PrimExpr& rhs_expr,
                                                              bool propagate_inequalities) const {
   // Currently only supports integer checks
-  if (!lhs_expr.dtype().is_int() || !rhs_expr.dtype().is_int()) {
+  if (lhs_expr.ty().code() != DLDataTypeCode::kDLInt ||
+      rhs_expr.ty().code() != DLDataTypeCode::kDLInt) {
     return CompareResult::kUnknown;
   }
 
diff --git a/src/arith/unwrap_vector_expr.cc b/src/arith/unwrap_vector_expr.cc
index e9245c48a102..dfe7a3cf404b 100644
--- a/src/arith/unwrap_vector_expr.cc
+++ b/src/arith/unwrap_vector_expr.cc
@@ -58,14 +58,16 @@ class Scalarizer : public ExprMutator {
     }
   }
   PrimExpr VisitExpr_(const LetNode* op) final {
-    if (op->value.dtype().lanes() == 1) {
+    PrimType value_ty = op->value.ty();
+    if (value_ty.lanes() == 1) {
       return ExprMutator::VisitExpr_(op);
     }
 
     auto it = let_var_remap_.find(op->var.get());
     TVM_FFI_ICHECK(it == let_var_remap_.end()) << "Duplicate binding of variable " << op->var;
 
-    Var new_var(op->var->name_hint + "_scalar", op->var.dtype().element_of());
+    PrimType var_ty = op->var.ty();
+    Var new_var(op->var->name_hint + "_scalar", var_ty.WithLanes(1));
     let_var_remap_[op->var.get()] = new_var;
 
     PrimExpr value = this->VisitExpr(op->value);
diff --git a/src/arith/z3_prover.cc b/src/arith/z3_prover.cc
index 604815c97955..9ceb156dead8 100644
--- a/src/arith/z3_prover.cc
+++ b/src/arith/z3_prover.cc
@@ -50,10 +50,10 @@
 #include <vector>
 
 #include "tvm/ffi/cast.h"
+#include "tvm/ffi/dtype.h"
 #include "tvm/ffi/object.h"
 #include "tvm/ffi/string.h"
 #include "tvm/ir/expr.h"
-#include "tvm/runtime/data_type.h"
 #include "z3++.h"
 
 namespace tvm::arith {
@@ -147,14 +147,14 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
   /// @brief Create a Free z3 expression from PrimExprNode
   z3::expr Create(const PrimExprNode* op) {
     auto ref = ffi::GetRef<PrimExpr>(op);
-    auto dtype = op->dtype;
+    PrimType dtype = op->ty();
     std::string name = ns.GetNewName(ref);
     /// TVM max_val can't handle uint64 max correctly, so we special case it here
-    if (dtype.is_bool()) {
+    if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
       return ctx->bool_const(name.c_str());
     } else {
       z3::expr e = ctx->int_const(name.c_str());
-      if (dtype.is_uint() && dtype.bits() == 64) {
+      if (dtype.MatchesCode(DLDataTypeCode::kDLUInt) && dtype.bits() == 64) {
         solver.add(ctx->int_val(0) <= e && e <= ctx->int_val((uint64_t)UINT64_MAX));
       } else {
         auto min_val = min_value(dtype).as_or_throw<IntImm>()->value;
@@ -249,7 +249,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
     // solver) must degrade to "cannot prove" instead of escaping to the caller.
     try {
       if (CheckTrivilBadCases(expr)) return false;
-      if (!IsValidDType(expr->dtype)) return false;
+      if (!IsValidType(expr.ty())) return false;
       z3::expr_vector constr(*ctx);
       constr.push_back(!ConvertBool(expr));
       auto result = solver.check(constr);
@@ -263,7 +263,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
   /// @brief Binded
   /// @brief Bind a variable to a value or a range
   void Bind(const Var& var, const PrimExpr& value, bool allow_override = false) {
-    if (!IsValidDType(var->dtype)) return;
+    if (!IsValidType(var.ty())) return;
     scope_stack_.back().push_back(Scope{Scope::BindValue, var, value});
     // we add the binding whenever the value is pure,
     // because non-pure parts are handling by creating free variables in VisitExpr
@@ -272,7 +272,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
 
   /// @brief Bind a variable to a range
   void Bind(const Var& var, const Range& range, bool allow_override = false) {
-    if (!IsValidDType(var->dtype)) return;
+    if (!IsValidType(var.ty())) return;
     scope_stack_.back().push_back(
         Scope{Scope::BindRange, var, PrimExpr(), range->min, range->extent});
     // 1. Create a placeholder for the var, and save it in the memo
@@ -427,7 +427,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
    * \return Number of satisfying values, -1 on error, -2 if min_consecutive constraint not met
    */
   int64_t CountSatisfyingValues(const Var& var, int64_t max_count, int64_t min_consecutive = 1) {
-    if (!IsValidDType(var->dtype)) {
+    if (!IsValidType(var.ty())) {
       return -1;
     }
 
@@ -550,12 +550,14 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
     }
     return e->IsInstance<CallNode>() || e->IsInstance<BufferLoadNode>() ||
            e->IsInstance<ProducerLoadNode>() || e->IsInstance<ReduceNode>() ||
-           (e->IsInstance<CastNode>() && !IsValidDType(e.as_or_throw<Cast>()->value->dtype));
+           (e->IsInstance<CastNode>() && !IsValidType(e.as_or_throw<Cast>()->value.ty()));
   }
 
   /// @brief Check if the dtype is valid for z3 integer operations
-  static bool IsValidDType(const DataType& dtype) {
-    return (dtype.is_int() || dtype.is_uint() || dtype.is_bool()) && dtype.lanes() == 1;
+  static bool IsValidType(const PrimType& dtype) {
+    return dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                             DLDataTypeCode::kDLBool) &&
+           dtype.lanes() == 1;  // single-lane types only
   }
 
   /// @brief Visit the expression and convert it into z3 integer expression
@@ -581,7 +583,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
   /// @brief Helper function to visit binary arithmetic operations
   z3::expr VisitArith(Z3BinOp signed_op, const PrimExprNode* op, const PrimExpr& a,
                       const PrimExpr& b) {
-    if (IsValidDType(a->dtype) && IsValidDType(b->dtype)) {
+    if (IsValidType(a.ty()) && IsValidType(b.ty())) {
       return signed_op(VisitInt(a), VisitInt(b));
     } else {
       return Create(op);
@@ -589,14 +591,14 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
   }
 
   z3::expr VisitExpr_(const LetNode* op) override {
-    if (IsValidDType(op->var->dtype)) {
+    if (IsValidType(op->var.ty())) {  // memoize the bound value only for accepted types
       memo_.emplace(op->var, VisitInt(op->value));
     }
     return VisitExpr(op->body);
   }
   z3::expr VisitExpr_(const CastNode* op) override {
     // if the inner dtype is valid, we just visit it
-    if (IsValidDType(op->value->dtype) && IsValidDType(op->dtype)) {
+    if (IsValidType(op->value.ty()) && IsValidType(op->ty())) {
       return VisitInt(op->value);
     } else {
       // otherwise, we create a new free z3 variable
@@ -696,7 +698,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
     } else if (op->op.same_as(tirx::builtin::shift_right())) {
       return VisitShiftOp(z3::ashr, op);
     } else if (op->op.same_as(tirx::builtin::if_then_else()) && op->args.size() == 3 &&
-               IsValidDType(op->args[1]->dtype) && IsValidDType(op->args[2]->dtype)) {
+               IsValidType(op->args[1].ty()) && IsValidType(op->args[2].ty())) {
       // tir.if_then_else(cond, a, b) is a select-like ternary.
       return z3::ite(VisitBool(op->args[0]), VisitInt(op->args[1]), VisitInt(op->args[2]));
     } else {
@@ -715,9 +717,9 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
 
     const PrimExpr& a = op->args[0];
     const PrimExpr& b = op->args[1];
-    unsigned bit_width = std::max(op->args[0].dtype().bits(), op->args[1].dtype().bits());
+    unsigned bit_width = std::max(op->args[0].ty().bits(), op->args[1].ty().bits());
 
-    if (IsValidDType(a->dtype) && IsValidDType(b->dtype)) {
+    if (IsValidType(a.ty()) && IsValidType(b.ty())) {
       return z3::bv2int(
           op_func(z3::int2bv(bit_width, VisitInt(a)), z3::int2bv(bit_width, VisitInt(b))), true);
     } else {
@@ -734,9 +736,9 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
 
     const PrimExpr& a = op->args[0];
 
-    if (IsValidDType(a->dtype)) {
+    if (IsValidType(a.ty())) {
       // Cast integer to bit-vector, apply bitwise not, then cast back.
-      unsigned bit_width = a.dtype().bits();
+      unsigned bit_width = a.ty().bits();
       z3::expr a_int = VisitInt(a);
       z3::expr a_bv = z3::int2bv(bit_width, a_int);
       return z3::bv2int(~a_bv, true);
@@ -756,7 +758,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
     const PrimExpr& b = op->args[1];
 
     // Shift operations require integer types for both operands
-    if (IsValidDType(a->dtype) && IsValidDType(b->dtype)) {
+    if (IsValidType(a.ty()) && IsValidType(b.ty())) {
       z3::expr a_expr = VisitInt(a);
       z3::expr b_expr = VisitInt(b);
 
@@ -765,7 +767,7 @@ class Z3Prover::Impl : ExprFunctor<z3::expr(const PrimExpr&)> {
       // matching push/pop in this path, so the assertion would permanently
       // poison the shared solver and make all subsequent unrelated proofs about
       // `b` unsound.
-      unsigned bit_width = std::max(a.dtype().bits(), b.dtype().bits());
+      unsigned bit_width = std::max(a.ty().bits(), b.ty().bits());
       z3::expr a_bv = z3::int2bv(bit_width, a_expr);
       z3::expr b_bv = z3::int2bv(bit_width, b_expr);
 
diff --git a/src/backend/cuda/codegen/codegen_cuda.cc b/src/backend/cuda/codegen/codegen_cuda.cc
index 0f2838014b28..0d70d9aef3fd 100644
--- a/src/backend/cuda/codegen/codegen_cuda.cc
+++ b/src/backend/cuda/codegen/codegen_cuda.cc
@@ -56,13 +56,52 @@ bool IsOp(const tirx::CallNode* call, const Op& compat_op, const char* canonical
   return op_node != nullptr && op_node->name == canonical_name;
 }
 
+/*! \brief Check whether a DLPack type code is one of the 8-bit float encodings. */
+bool IsCUDAFloat8(DLDataTypeCode code) {
+  switch (code) {
+    case DLDataTypeCode::kDLFloat8_e3m4:
+    case DLDataTypeCode::kDLFloat8_e4m3:
+    case DLDataTypeCode::kDLFloat8_e4m3b11fnuz:
+    case DLDataTypeCode::kDLFloat8_e4m3fn:
+    case DLDataTypeCode::kDLFloat8_e4m3fnuz:
+    case DLDataTypeCode::kDLFloat8_e5m2:
+    case DLDataTypeCode::kDLFloat8_e5m2fnuz:
+    case DLDataTypeCode::kDLFloat8_e8m0fnu:
+      return true;
+    default:
+      return false;
+  }
+}
+
+/*! \brief Check whether a DLPack type code is one of the 6-bit float encodings. */
+bool IsCUDAFloat6(DLDataTypeCode code) {
+  switch (code) {
+    case DLDataTypeCode::kDLFloat6_e2m3fn:
+    case DLDataTypeCode::kDLFloat6_e3m2fn:
+      return true;
+    default:
+      return false;
+  }
+}
+
+/*! \brief Check whether a DLPack type code is the 4-bit e2m1fn float encoding. */
+bool IsCUDAFloat4(DLDataTypeCode code) { return DLDataTypeCode::kDLFloat4_e2m1fn == code; }
+
+/*! \brief Check whether a type code matches any of the float8/float6/float4 predicates. */
+bool IsCUDAPackedFloat(DLDataTypeCode code) {
+  if (IsCUDAFloat8(code)) return true;
+  if (IsCUDAFloat6(code)) return true;
+  return IsCUDAFloat4(code);
+}
+
 }  // namespace
 
-std::string GetFP8Type(DataType type) {
+std::string GetFP8Type(DLDataType type) {
+  PrimType type_ty(type);
   std::stringstream stream;
-  int32_t lanes = type.lanes();
+  int32_t lanes = type_ty.lanes();
   std::string vec;
-  if (type.is_scalar()) {
+  if (type_ty.IsScalar()) {
     vec = "";
   } else if (lanes == 2) {
     vec = "x2";
@@ -78,11 +97,12 @@ std::string GetFP8Type(DataType type) {
   }
   stream << "__nv_fp8";
   std::string suffix;
-  if (type.code() == DataType::kFloat8_e4m3fn) {
+  DLDataTypeCode code = type_ty.code();
+  if (code == DLDataTypeCode::kDLFloat8_e4m3fn) {
     suffix = "_e4m3";
-  } else if (type.code() == DataType::kFloat8_e5m2) {
+  } else if (code == DLDataTypeCode::kDLFloat8_e5m2) {
     suffix = "_e5m2";
-  } else if (type.code() == DataType::kFloat8_e8m0fnu) {
+  } else if (code == DLDataTypeCode::kDLFloat8_e8m0fnu) {
     suffix = "_e8m0";
   } else {
     TVM_FFI_THROW(InternalError) << "Unsupported FP8 type in CUDA codegen";
@@ -91,11 +111,12 @@ std::string GetFP8Type(DataType type) {
   return stream.str();
 }
 
-std::string GetFP6Type(DataType type) {
+std::string GetFP6Type(DLDataType type) {
+  PrimType type_ty(type);
   std::stringstream stream;
-  int32_t lanes = type.lanes();
+  int32_t lanes = type_ty.lanes();
   std::string vec;
-  if (type.is_scalar()) {
+  if (type_ty.IsScalar()) {
     vec = "";
   } else if (lanes == 2) {
     vec = "x2";
@@ -110,9 +131,10 @@ std::string GetFP6Type(DataType type) {
   }
   stream << "__nv_fp6";
   std::string suffix;
-  if (type.code() == DataType::kFloat6_e2m3fn) {
+  DLDataTypeCode code = type_ty.code();
+  if (code == DLDataTypeCode::kDLFloat6_e2m3fn) {
     suffix = "_e2m3";
-  } else if (type.code() == DataType::kFloat6_e3m2fn) {
+  } else if (code == DLDataTypeCode::kDLFloat6_e3m2fn) {
     suffix = "_e3m2";
   } else {
     TVM_FFI_THROW(InternalError) << "Unsupported FP6 type in CUDA codegen";
@@ -121,11 +143,12 @@ std::string GetFP6Type(DataType type) {
   return stream.str();
 }
 
-std::string GetFP4Type(DataType type) {
+std::string GetFP4Type(DLDataType type) {
+  PrimType type_ty(type);
   std::stringstream stream;
-  int32_t lanes = type.lanes();
+  int32_t lanes = type_ty.lanes();
   std::string vec;
-  if (type.is_scalar()) {
+  if (type_ty.IsScalar()) {
     vec = "";
   } else if (lanes == 2) {
     vec = "x2";
@@ -140,7 +163,8 @@ std::string GetFP4Type(DataType type) {
   }
   stream << "__nv_fp4";
   std::string suffix;
-  if (type.code() == DataType::kFloat4_e2m1fn) {
+  DLDataTypeCode code = type_ty.code();
+  if (code == DLDataTypeCode::kDLFloat4_e2m1fn) {
     suffix = "_e2m1";
   } else {
     TVM_FFI_THROW(InternalError) << "Unsupported FP4 type in CUDA codegen";
@@ -299,31 +323,34 @@ void CodeGenCUDA::BindThreadIndex(const IterVar& iv) {
                                    ";\" : \"=r\"(ctaid) :);\n"
                                    "  return ctaid;\n"
                                    "}\n");
-    var_idmap_[iv->var.get()] = CastFromTo(func_name + "()", DataType::UInt(32), iv->var.dtype());
+    var_idmap_[iv->var.get()] =
+        CastFromTo(func_name + "()", DLDataType{kDLUInt, 32, 1}, iv->var.ty()->dtype);
   } else {
-    var_idmap_[iv->var.get()] = CastFromTo(iv->thread_tag, DataType::UInt(32), iv->var.dtype());
+    var_idmap_[iv->var.get()] =
+        CastFromTo(iv->thread_tag, DLDataType{kDLUInt, 32, 1}, iv->var.ty()->dtype);
   }
 }
 
-void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenCUDA::PrintType(DLDataType raw_t, std::ostream& os) {  // NOLINT(*)
+  PrimType t(raw_t);
   int lanes = t.lanes();
-  if (t.is_handle()) {
-    TVM_FFI_ICHECK(t.is_scalar()) << "do not yet support vector types";
+  if (t.IsHandle()) {
+    TVM_FFI_ICHECK(t.IsScalar()) << "do not yet support vector types";
     os << "void*";
     return;
   }
 
-  if (t.is_void()) {
+  if (t.IsVoid()) {
     os << "void";
     return;
   }
 
   bool fail = false;
-  if (t.is_float()) {
+  if (t.code() == DLDataTypeCode::kDLFloat) {
     switch (t.bits()) {
       case 16:
         codegen_tags_.insert("fp16");
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "half";
         } else if (lanes <= 8) {
           TVM_FFI_ICHECK_EQ(lanes % 2, 0) << "Only support an even number of lanes for half type";
@@ -360,15 +387,15 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         fail = true;
         break;
     }
-    if (!fail && (t.is_scalar() || t.bits() == 16)) return;
+    if (!fail && (t.IsScalar() || t.bits() == 16)) return;
     if (!fail && (lanes > 4 && lanes <= 8 && t.bits() == 32)) return;
     if (!fail && (lanes >= 2 && lanes <= 4)) {
       os << lanes;
       return;
     }
-  } else if (t.is_bfloat16()) {
+  } else if (t.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     codegen_tags_.insert("bf16");
-    if (t.is_scalar()) {
+    if (t.IsScalar()) {
       os << "nv_bfloat16";
     } else if (lanes <= 8) {
       TVM_FFI_ICHECK_EQ(lanes % 2, 0) << "only support even lane for bfloat16 type";
@@ -381,57 +408,59 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       fail = true;
     }
     if (!fail) return;
-  } else if (t.is_float8()) {
+  } else if (IsCUDAFloat8(t.code())) {
     codegen_tags_.insert("fp8");
-    if (t.lanes() <= 4) {
-      os << GetFP8Type(t);
+    if (lanes <= 4) {
+      os << GetFP8Type(raw_t);
     } else {
-      os << "uint" << t.lanes() / 4;
+      os << "uint" << lanes / 4;
     }
     return;
-  } else if (t.is_float6()) {
+  } else if (IsCUDAFloat6(t.code())) {
     codegen_tags_.insert("fp6");
-    if (t.lanes() <= 4) {
-      os << GetFP6Type(t);
+    if (lanes <= 4) {
+      os << GetFP6Type(raw_t);
     } else {
       fail = true;
     }
-    return;
+    // More than 4 lanes is unsupported: reach the error below rather than emit nothing.
+    if (!fail) return;
-  } else if (t.is_float4()) {
+  } else if (IsCUDAFloat4(t.code())) {
     codegen_tags_.insert("fp4");
-    if (t.lanes() <= 4) {
-      os << GetFP4Type(t);
+    if (lanes <= 4) {
+      os << GetFP4Type(raw_t);
     } else {
       fail = true;
     }
-    return;
+    // More than 4 lanes is unsupported: reach the error below rather than emit nothing.
+    if (!fail) return;
-  } else if (t == DataType::Bool()) {
+  } else if (raw_t == DLDataType{kDLBool, 8, 1}) {
     os << "bool";
     return;
-  } else if (t.is_vector_bool()) {
+  } else if (t.code() == DLDataTypeCode::kDLBool && lanes > 1) {
     // CUDA does not support bool vectors.
     // Use ushort vectors to represent instead.
-    int n = t.lanes();
+    int n = lanes;
     if (n <= 4) {
       os << "ushort" << n;
       return;
     }
-  } else if (t.is_uint() || t.is_int()) {
-    if (t.is_uint()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)) {
+    if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << "u";
     }
     switch (t.bits()) {
       case 1: {
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "int";
           return;
-        } else if (t.lanes() == 8) {
+        } else if (lanes == 8) {
           os << "int8_t";
           return;
-        } else if (t.lanes() == 16) {
+        } else if (lanes == 16) {
           os << "int16_t";
           return;
-        } else if (t.lanes() == 32) {
+        } else if (lanes == 32) {
           os << "int";
           return;
         } else {
@@ -439,23 +474,23 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         }
       }
       case 4: {
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "int";
           return;
-        } else if (t.lanes() == 4) {
+        } else if (lanes == 4) {
           os << "int16_t";
           return;
-        } else if (t.lanes() == 8) {
+        } else if (lanes == 8) {
           // directly 8 4-bit int in integer.
           os << "int";
           return;
-        } else if (t.lanes() == 16) {
+        } else if (lanes == 16) {
           os << "int2";
           return;
-        } else if (t.lanes() == 32) {
+        } else if (lanes == 32) {
           os << "int4";
           return;
-        } else if (t.lanes() == 64) {
+        } else if (lanes == 64) {
           os << "int8";
           return;
         } else {
@@ -463,7 +498,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         }
       }
       case 8: {
-        if (t.lanes() == 4) {
+        if (lanes == 4) {
           // directly 4 8 bit int in integer.
           codegen_tags_.insert("int8");
 
@@ -472,15 +507,15 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
           // into 32-bit data.
           os << "int";
           return;
-        } else if (t.lanes() == 8) {
+        } else if (lanes == 8) {
           codegen_tags_.insert("int8");
           os << "int2";
           return;
-        } else if (t.lanes() == 16) {
+        } else if (lanes == 16) {
           codegen_tags_.insert("int8");
           os << "int4";
           return;
-        } else if (!t.is_uint() && t.is_scalar()) {
+        } else if (!t.MatchesCode(DLDataTypeCode::kDLUInt) && t.IsScalar()) {
           os << "signed char";
           break;
         } else {
@@ -489,11 +524,11 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         }
       }
       case 16: {
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "short";
-        } else if (t.lanes() <= 4) {
+        } else if (lanes <= 4) {
           os << "short" << lanes;
-        } else if (t.lanes() <= 8) {
+        } else if (lanes <= 8) {
           // Emit CUDA code to access int16 vector elements.
           //
           // short4 is stored as int2
@@ -503,9 +538,8 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
           // s4.z is emitted as *(short2*)(&(i2.y)).x
           // s4.w is emitted as *(short2*)(&(i2.y)).y
           //
-          TVM_FFI_ICHECK_EQ(t.lanes() % 2, 0)
-              << "only support even lane for shorT type with lanes > 4";
-          os << "int" << t.lanes() / 2;
+          TVM_FFI_ICHECK_EQ(lanes % 2, 0) << "only support even lane for shorT type with lanes > 4";
+          os << "int" << lanes / 2;
         } else {
           fail = true;
         }
@@ -515,11 +549,11 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         break;
       }
       case 32: {
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "int";
-        } else if (t.lanes() <= 4) {
-          os << "int" << t.lanes();
-        } else if (t.lanes() <= 8) {
+        } else if (lanes <= 4) {
+          os << "int" << lanes;
+        } else if (lanes <= 8) {
           // Emit CUDA code to access int32 vector elements for 4 < lanes <= 8.
           //
           // int8 is stored as longlong4
@@ -538,13 +572,13 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
         break;
       }
       case 64: {
-        if (t.is_scalar()) {
+        if (t.IsScalar()) {
           os << "int64_t";
-        } else if (t.lanes() == 2) {
+        } else if (lanes == 2) {
           os << "longlong2";
-        } else if (t.lanes() == 3) {
+        } else if (lanes == 3) {
           os << "longlong3";
-        } else if (t.lanes() == 4) {
+        } else if (lanes == 4) {
           os << "longlong4";
         }
         return;
@@ -561,15 +595,16 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       return;
     }
   }
-  TVM_FFI_THROW(InternalError) << "Cannot convert type " << t << " to CUDA type";
+  TVM_FFI_THROW(InternalError) << "Cannot convert type " << ffi::DLDataTypeToString(raw_t)
+                               << " to CUDA type";
 }
 
-void CodeGenCUDA::PrintVecConstructor(DataType t, std::ostream& os) {
+void CodeGenCUDA::PrintVecConstructor(DLDataType t, std::ostream& os) {
   os << "make_";
   PrintType(t, os);
 }
 
-void CodeGenCUDA::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, PrimExpr rhs,
+void CodeGenCUDA::PrintVecBinaryOp(const std::string& op, DLDataType t, PrimExpr lhs, PrimExpr rhs,
                                    std::ostream& os) {  // NOLINT(*)
   // Declare the result.
   std::string sret = name_supply_->FreshName("_");
@@ -579,22 +614,22 @@ void CodeGenCUDA::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr l
   int ssa_scope = BeginScope();
   {
     // Unpack into individual ops.
-    std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.dtype());
-    std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.dtype());
+    std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.ty()->dtype);
+    std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.ty()->dtype);
 
-    for (int i = 0, lanes = t.lanes(); i < lanes; ++i) {
+    for (int i = 0, lanes = PrimType(t).lanes(); i < lanes; ++i) {
       std::ostringstream value_temp;
       if (isalpha(op[0])) {
         value_temp << op << "(";
-        PrintVecElemLoad(vlhs, lhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vlhs, lhs.ty()->dtype, i, value_temp);
         value_temp << ", ";
-        PrintVecElemLoad(vrhs, rhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vrhs, rhs.ty()->dtype, i, value_temp);
         value_temp << ")";
       } else {
         value_temp << "(";
-        PrintVecElemLoad(vlhs, lhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vlhs, lhs.ty()->dtype, i, value_temp);
         value_temp << op;
-        PrintVecElemLoad(vrhs, rhs.dtype(), i, value_temp);
+        PrintVecElemLoad(vrhs, rhs.ty()->dtype, i, value_temp);
         value_temp << ")";
       }
       PrintVecElemStore(sret, t, i, value_temp.str());
@@ -604,55 +639,58 @@ void CodeGenCUDA::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr l
   os << sret;
 }
 
-void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DataType t, int i,
+void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                                    std::ostream& os) {  // NOLINT(*)
-  if (t.is_scalar()) {
+  PrimType t_ty(t);
+  int lanes = t_ty.lanes();
+  if (t_ty.IsScalar()) {
     os << vec;
     return;
   }
 
   static const char access[] = {'x', 'y', 'z', 'w'};
-  TVM_FFI_ICHECK(i >= 0 && i < (t.bits() == 8 ? 16 : (t.bits() == 16 || t.bits() == 32) ? 8 : 4));
-  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
-    std::string type_name = t.is_int() ? "signed char" : "unsigned char";
-    if (t.lanes() == 2 || t.lanes() == 3) {
-      os << vec << "." << access[i % t.lanes()];
+  TVM_FFI_ICHECK(i >= 0 && i < (t.bits == 8 ? 16 : (t.bits == 16 || t.bits == 32) ? 8 : 4));
+  if (t.bits == 8 && (t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))) {
+    std::string type_name =
+        t_ty.MatchesCode(DLDataTypeCode::kDLInt) ? "signed char" : "unsigned char";
+    if (lanes == 2 || lanes == 3) {
+      os << vec << "." << access[i % lanes];
     } else {
-      std::string ac = t.lanes() == 4 ? vec : (vec + "." + access[i / 4]);
+      std::string ac = lanes == 4 ? vec : (vec + "." + access[i / 4]);
       os << "(reinterpret_cast<const " << type_name << "*>(&(" << ac << "))[" << (i % 4) << "])";
     }
-  } else if (t.is_float16()) {
-    if (t.lanes() <= 4) {
+  } else if (t_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
+    if (lanes <= 4) {
       os << vec << "." << access[i];
     } else {
       os << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
     }
-  } else if (t.is_bfloat16()) {
-    if (t.lanes() <= 4) {
+  } else if (t_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
+    if (lanes <= 4) {
       os << vec << "." << access[i];
     } else {
       os << "((nv_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
     }
-  } else if (t.lanes() > 4 && t.lanes() <= 8) {
+  } else if (lanes > 4 && lanes <= 8) {
     std::string type_name;
-    if (t.bits() == 16) {
-      if (t.is_int()) {
+    if (t.bits == 16) {
+      if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
         type_name = "short";
-      } else if (t.is_uint()) {
+      } else if (t_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
         type_name = "ushort";
       }
-    } else if (t.bits() == 32) {
-      if (t.is_int()) {
+    } else if (t.bits == 32) {
+      if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
         type_name = "int";
-      } else if (t.is_uint()) {
+      } else if (t_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
         type_name = "uint";
-      } else if (t.is_float()) {
+      } else if (t_ty.code() == DLDataTypeCode::kDLFloat) {
         type_name = "float";
       }
     }
     TVM_FFI_ICHECK(!type_name.empty());
     os << "((" << type_name << "2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2];
-  } else if (t.is_float4_e2m1fn()) {
+  } else if (t_ty.code() == DLDataTypeCode::kDLFloat4_e2m1fn) {
     os << "([](__nv_fp4_storage_t v) { __nv_fp4_e2m1 t; t.__x = v; return t; })((" << vec
        << ".__x >> " << i * 4 << ") & 0xF)";
   } else {
@@ -660,50 +698,53 @@ void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DataType t, int i,
   }
 }
 
-void CodeGenCUDA::PrintVecElemStore(const std::string& vec, DataType t, int i,
+void CodeGenCUDA::PrintVecElemStore(const std::string& vec, DLDataType t, int i,
                                     const std::string& value) {
+  PrimType t_ty(t);
+  int lanes = t_ty.lanes();
   this->PrintIndent();
   static const char access[] = {'x', 'y', 'z', 'w'};
-  TVM_FFI_ICHECK(i >= 0 && i < (t.bits() == 8 ? 16 : (t.bits() == 16 || t.bits() == 32) ? 8 : 4));
-  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
-    if (t.lanes() == 2 || t.lanes() == 3) {
-      stream << vec << '.' << access[i % t.lanes()] << "="
+  TVM_FFI_ICHECK(i >= 0 && i < (t.bits == 8 ? 16 : (t.bits == 16 || t.bits == 32) ? 8 : 4));
+  if (t.bits == 8 && (t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))) {
+    if (lanes == 2 || lanes == 3) {
+      stream << vec << '.' << access[i % lanes] << "="
              << "(" << value << ");\n";
     } else {
-      std::string ac = t.lanes() == 4 ? vec : (vec + "." + access[i / 4]);
-      std::string type_name = t.is_int() ? "signed char" : "unsigned char";
+      std::string ac = lanes == 4 ? vec : (vec + "." + access[i / 4]);
+      std::string type_name =
+          t_ty.MatchesCode(DLDataTypeCode::kDLInt) ? "signed char" : "unsigned char";
       stream << "reinterpret_cast<" << type_name << "*>(&(" << ac << "))[" << (i % 4) << "] = ("
              << type_name << ")(" << value << ");\n";
     }
-  } else if (t.is_float16()) {
-    if (t.lanes() <= 4) {
+  } else if (t_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
+    if (lanes <= 4) {
       stream << vec << "." << access[i] << " = " << value << ";\n";
     } else {
       stream << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2] << " = "
              << value << ";\n";
     }
 
-  } else if (t.is_bfloat16()) {
-    if (t.lanes() <= 4) {
+  } else if (t_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
+    if (lanes <= 4) {
       stream << vec << "." << access[i] << " = " << value << ";\n";
     } else {
       stream << "((nv_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2]
              << " = " << value << ";\n";
     }
-  } else if (t.lanes() > 4 && t.lanes() <= 8) {
+  } else if (lanes > 4 && lanes <= 8) {
     std::string type_name;
-    if (t.bits() == 16) {
-      if (t.is_int()) {
+    if (t.bits == 16) {
+      if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
         type_name = "short";
-      } else if (t.is_uint()) {
+      } else if (t_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
         type_name = "ushort";
       }
-    } else if (t.bits() == 32) {
-      if (t.is_int()) {
+    } else if (t.bits == 32) {
+      if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
         type_name = "int";
-      } else if (t.is_uint()) {
+      } else if (t_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
         type_name = "uint";
-      } else if (t.is_float()) {
+      } else if (t_ty.code() == DLDataTypeCode::kDLFloat) {
         type_name = "float";
       }
     }
@@ -766,15 +807,19 @@ void CodeGenCUDA::PrintStorageScope(const std::string& scope, std::ostream& os)
   }
 }
 
-std::string CodeGenCUDA::CastFromTo(std::string value, DataType from, DataType target) {
+std::string CodeGenCUDA::CastFromTo(std::string value, DLDataType from, DLDataType target) {
   if (from == target) return value;
+  PrimType from_ty(from);
+  PrimType target_ty(target);
   std::ostringstream os;
   os << "((";
   this->PrintType(target, os);
   os << ")";
-  if (from.is_float16() && (target.is_int() || target.is_uint()) && target.bits() == 8) {
+  if (from_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) &&
+      (target_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) &&
+      target.bits == 8) {
     os << "(";
-    if (target.is_uint()) {
+    if (target_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << "u";
     }
     os << "int)";
@@ -794,33 +839,22 @@ void CodeGenCUDA::AddUtilFunction(const std::string& func_name, const std::strin
 }
 
 void CodeGenCUDA::VisitExpr_(const CastNode* op, std::ostream& os) {
-  DataType from_ty = op->value.dtype();
-  DataType target_ty = op->dtype;
+  DLDataType from_dtype = op->value.ty()->dtype;
+  DLDataType target_dtype = op->ty()->dtype;
+  PrimType from_ty(from_dtype);
+  PrimType target_ty(target_dtype);
   TVM_FFI_ICHECK_EQ(target_ty.lanes(), from_ty.lanes());
 
   // Emit simple C-style type conversion.
-  if (from_ty.is_scalar()) return CodeGenC::VisitExpr_(op, os);
-
-  if (target_ty.code() == DataType::kFloat8_e3m4 || target_ty.code() == DataType::kFloat8_e4m3 ||
-      target_ty.code() == DataType::kFloat8_e4m3b11fnuz ||
-      target_ty.code() == DataType::kFloat8_e4m3fn ||
-      target_ty.code() == DataType::kFloat8_e4m3fnuz ||
-      target_ty.code() == DataType::kFloat8_e5m2 ||
-      target_ty.code() == DataType::kFloat8_e5m2fnuz ||
-      target_ty.code() == DataType::kFloat8_e8m0fnu ||
-      target_ty.code() == DataType::kFloat4_e2m1fn ||
-
-      from_ty.code() == DataType::kFloat8_e3m4 || from_ty.code() == DataType::kFloat8_e4m3 ||
-      from_ty.code() == DataType::kFloat8_e4m3b11fnuz ||
-      from_ty.code() == DataType::kFloat8_e4m3fn || from_ty.code() == DataType::kFloat8_e4m3fnuz ||
-      from_ty.code() == DataType::kFloat8_e5m2 || from_ty.code() == DataType::kFloat8_e5m2fnuz ||
-      from_ty.code() == DataType::kFloat8_e8m0fnu || from_ty.code() == DataType::kFloat4_e2m1fn) {
+  if (from_ty.IsScalar()) return CodeGenC::VisitExpr_(op, os);
+
+  if (IsCUDAPackedFloat(target_ty.code()) || IsCUDAPackedFloat(from_ty.code())) {
     std::ostringstream val;
-    if (target_ty.code() == DataType::kBFloat && target_ty.lanes() == 2) {
+    if (target_ty.code() == DLDataTypeCode::kDLBfloat && target_ty.lanes() == 2) {
       val << "cast_to_nv_bfloat162(" << PrintExpr(op->value) << ")";
     } else {
       val << "(";
-      PrintType(target_ty, val);
+      PrintType(target_dtype, val);
       val << ")(" << PrintExpr(op->value) << ")";
     }
     os << val.str();
@@ -831,18 +865,18 @@ void CodeGenCUDA::VisitExpr_(const CastNode* op, std::ostream& os) {
   // too compact to read. Emit this as vectorized unary ops.
   std::string sret = name_supply_->FreshName("_");
   this->PrintIndent();
-  this->PrintType(target_ty, stream);
+  this->PrintType(target_dtype, stream);
   stream << ' ' << sret << ";\n";
   {
-    std::string src = SSAGetID(PrintExpr(op->value), from_ty);
+    std::string src = SSAGetID(PrintExpr(op->value), from_dtype);
     for (int i = 0, lanes = from_ty.lanes(); i < lanes; ++i) {
       std::ostringstream val;
       val << "(";
-      PrintType(target_ty.element_of(), val);
+      PrintType(DLDataType{target_dtype.code, target_dtype.bits, 1}, val);
       val << ")(";
-      PrintVecElemLoad(src, from_ty, i, val);
+      PrintVecElemLoad(src, from_dtype, i, val);
       val << ")";
-      PrintVecElemStore(sret, target_ty, i, val.str());
+      PrintVecElemStore(sret, target_dtype, i, val.str());
     }
   }
   os << sret;
@@ -851,8 +885,9 @@ void CodeGenCUDA::VisitExpr_(const CastNode* op, std::ostream& os) {
 void CodeGenCUDA::PrintCallExtern(Type ret_type, ffi::String global_symbol,
                                   const ffi::Array<PrimExpr>& args, bool skip_first_arg,
                                   std::ostream& os) {  // NOLINT(*)
-  DataType ret_dtype = GetRuntimeDataType(ret_type);
-  if (ret_dtype.is_fixed_length_vector()) {
+  DLDataType ret_dtype = GetRuntimeDataType(ret_type);
+  PrimType ret_ty(ret_dtype);
+  if (ret_ty.IsFixedLengthVector()) {
     //
     // Emit an unsupported vector call
     //
@@ -881,17 +916,17 @@ void CodeGenCUDA::PrintCallExtern(Type ret_type, ffi::String global_symbol,
       std::vector<std::string> sargs;
       size_t arg_begin = static_cast<size_t>(skip_first_arg);
       for (size_t i = arg_begin; i < args.size(); ++i) {
-        std::string val = SSAGetID(PrintExpr(args[i]), args[i].dtype());
+        std::string val = SSAGetID(PrintExpr(args[i]), args[i].ty()->dtype);
         sargs.push_back(std::move(val));
       }
 
       // Emit a scalar call for each lane.
-      for (int i = 0; i < ret_dtype.lanes(); ++i) {
+      for (int i = 0; i < ret_ty.lanes(); ++i) {
         std::ostringstream scall;
         scall << global_symbol << "(";
         for (size_t j = 0; j < sargs.size(); ++j) {
           if (j > 0) scall << ", ";
-          PrintVecElemLoad(sargs[j], args[arg_begin + j].dtype(), i, scall);
+          PrintVecElemLoad(sargs[j], args[arg_begin + j].ty()->dtype, i, scall);
         }
         scall << ")";
         PrintVecElemStore(sret, ret_dtype, i, scall.str());
@@ -1196,7 +1231,7 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
     std::string local_ptr = this->PrintExpr(op->args[3]);
     std::string local_offset = this->PrintExpr(op->args[4]);
     std::string smem_ptr = this->PrintExpr(op->args[5]);
-    if (trans && op->dtype.bits() == 8) {
+    if (trans && op->ty()->dtype.bits == 8) {
       // ldmatrix can't transpose 8-bit elements (it assumes 16-bit), so
       // synthesize the equivalent manual gather loop. args[6] is the
       // shared-memory stride for this fallback.
@@ -1317,39 +1352,46 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
            << guard << ")\n";
     stream << ");\n";
   } else if (op->op.same_as(builtin::reinterpret())) {
-    DataType tgt_dtype = op->dtype;
-    DataType src_dtype = op->args[0]->dtype;
+    DLDataType tgt_dtype = op->ty()->dtype;
+    DLDataType src_dtype = op->args[0].ty()->dtype;
+    PrimType tgt_ty(tgt_dtype);
+    PrimType src_ty(src_dtype);
     PrimExpr value = op->args[0];
 
-    if (src_dtype.is_handle() && tgt_dtype.is_scalar() &&
-        (tgt_dtype.is_uint() || tgt_dtype.is_int()) && tgt_dtype.bits() == 64) {
+    if (src_ty.IsHandle() && tgt_ty.IsScalar() &&
+        tgt_ty.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt) &&
+        tgt_dtype.bits == 64) {
       os << "reinterpret_cast<";
       this->PrintType(tgt_dtype, os);
       os << ">(" << PrintExpr(value) << ")";
       return;
     }
-    if (tgt_dtype.is_handle() && src_dtype.is_scalar() &&
-        (src_dtype.is_uint() || src_dtype.is_int()) && src_dtype.bits() == 64) {
+    if (tgt_ty.IsHandle() && src_ty.IsScalar() &&
+        src_ty.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt) &&
+        src_dtype.bits == 64) {
       os << "reinterpret_cast<void*>(" << PrintExpr(value) << ")";
       return;
     }
 
     // Handle float4_e2m1fn reinterpret
-    if (!src_dtype.is_float4_e2m1fn() && !tgt_dtype.is_float4_e2m1fn()) {
+    if (!IsCUDAFloat4(src_ty.code()) && !IsCUDAFloat4(tgt_ty.code())) {
       return CodeGenC::VisitExpr_(op, os);
     }
     if (src_dtype == tgt_dtype ||
-        tgt_dtype.lanes() * tgt_dtype.bits() == src_dtype.lanes() * src_dtype.bits()) {
+        tgt_ty.lanes() * tgt_dtype.bits == src_ty.lanes() * src_dtype.bits) {
       return CodeGenC::VisitExpr_(op, os);
     }
-    TVM_FFI_ICHECK_EQ(tgt_dtype.lanes(), src_dtype.lanes())
+    TVM_FFI_ICHECK_EQ(tgt_ty.lanes(), src_ty.lanes())
         << "E2M1 float4 reinterpret expects source and target to have the same number of lanes. "
-        << "Source dtype: " << src_dtype << ", Target dtype: " << tgt_dtype;
-    TVM_FFI_ICHECK_EQ(tgt_dtype.bytes(), src_dtype.bytes())
+        << "Source dtype: " << ffi::DLDataTypeToString(src_dtype)
+        << ", Target dtype: " << ffi::DLDataTypeToString(tgt_dtype);
+    // Compare per-element storage bytes; total vector widths legitimately differ here.
+    TVM_FFI_ICHECK_EQ((tgt_dtype.bits + 7) / 8, (src_dtype.bits + 7) / 8)
         << "E2M1 float4 reinterpret expects source and target to have the same number of bytes. "
-        << "Source dtype: " << src_dtype << ", Target dtype: " << tgt_dtype;
+        << "Source dtype: " << ffi::DLDataTypeToString(src_dtype)
+        << ", Target dtype: " << ffi::DLDataTypeToString(tgt_dtype);
 
-    int lanes = tgt_dtype.lanes();
+    int lanes = tgt_ty.lanes();
 
     int ssa_scope = BeginScope();
     if (lanes == 1) {
@@ -1360,47 +1402,47 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
       this->PrintType(tgt_dtype, os);
       os << " *)(&(" << rhs << ")))";
     } else if (lanes == 2) {
-      if (tgt_dtype.is_float4_e2m1fn()) {
+      if (IsCUDAFloat4(tgt_ty.code())) {
         // We view the source as an uint16, and then extract bits of two fp4 numbers,
         // and finally reinterpret the result as fp4x2.
-        value = tirx::Call(DataType::UInt(16), tirx::builtin::reinterpret(), {value});
-        tirx::Var temp_var("temp_var", DataType::UInt(16));
+        value = tirx::Call(PrimType::UInt(16), tirx::builtin::reinterpret(), {value});
+        tirx::Var temp_var("temp_var", PrimType::UInt(16));
         value = tirx::Let(temp_var, value,
-                          tirx::Cast(DataType::UInt(8),
-                                     (temp_var & IntImm(DataType::UInt(16), 0xF)) |
-                                         ((temp_var >> 4) & IntImm(DataType::UInt(16), 0xF0))));
+                          tirx::Cast(PrimType::UInt(8),
+                                     (temp_var & IntImm(PrimType::UInt(16), 0xF)) |
+                                         ((temp_var >> 4) & IntImm(PrimType::UInt(16), 0xF0))));
       } else {
-        value = tirx::Cast(DataType::UInt(16),
-                           tirx::Call(DataType::UInt(8), tirx::builtin::reinterpret(), {value}));
-        tirx::Var temp_var("temp_var", DataType::UInt(16));
+        value = tirx::Cast(PrimType::UInt(16),
+                           tirx::Call(PrimType::UInt(8), tirx::builtin::reinterpret(), {value}));
+        tirx::Var temp_var("temp_var", PrimType::UInt(16));
         value = tirx::Let(temp_var, value,
-                          (temp_var & IntImm(DataType::UInt(16), 0xF)) |
-                              ((temp_var & IntImm(DataType::UInt(16), 0xF0)) << 4));
+                          (temp_var & IntImm(PrimType::UInt(16), 0xF)) |
+                              ((temp_var & IntImm(PrimType::UInt(16), 0xF0)) << 4));
       }
-      os << PrintExpr(tirx::Call(tgt_dtype, tirx::builtin::reinterpret(), {value}));
+      os << PrintExpr(tirx::Call(PrimType(tgt_dtype), tirx::builtin::reinterpret(), {value}));
     } else if (lanes == 4) {
-      if (tgt_dtype.is_float4_e2m1fn()) {
+      if (IsCUDAFloat4(tgt_ty.code())) {
         // We view the source as an uint32, and then extract bits of four fp4 numbers,
         // and finally reinterpret the result as fp4x4.
-        value = tirx::Call(DataType::UInt(32), tirx::builtin::reinterpret(), {value});
-        tirx::Var temp_var("temp_var", DataType::UInt(32));
+        value = tirx::Call(PrimType::UInt(32), tirx::builtin::reinterpret(), {value});
+        tirx::Var temp_var("temp_var", PrimType::UInt(32));
         value = tirx::Let(temp_var, value,
-                          tirx::Cast(DataType::UInt(16),
-                                     (temp_var & IntImm(DataType::UInt(32), 0xF)) |
-                                         ((temp_var >> 4) & IntImm(DataType::UInt(32), 0xF0)) |
-                                         ((temp_var >> 8) & IntImm(DataType::UInt(32), 0xF00)) |
-                                         ((temp_var >> 12) & IntImm(DataType::UInt(32), 0xF000))));
+                          tirx::Cast(PrimType::UInt(16),
+                                     (temp_var & IntImm(PrimType::UInt(32), 0xF)) |
+                                         ((temp_var >> 4) & IntImm(PrimType::UInt(32), 0xF0)) |
+                                         ((temp_var >> 8) & IntImm(PrimType::UInt(32), 0xF00)) |
+                                         ((temp_var >> 12) & IntImm(PrimType::UInt(32), 0xF000))));
       } else {
-        value = tirx::Cast(DataType::UInt(32),
-                           tirx::Call(DataType::UInt(16), tirx::builtin::reinterpret(), {value}));
-        tirx::Var temp_var("temp_var", DataType::UInt(32));
+        value = tirx::Cast(PrimType::UInt(32),
+                           tirx::Call(PrimType::UInt(16), tirx::builtin::reinterpret(), {value}));
+        tirx::Var temp_var("temp_var", PrimType::UInt(32));
         value = tirx::Let(temp_var, value,
-                          (temp_var & IntImm(DataType::UInt(32), 0xF)) |
-                              ((temp_var & IntImm(DataType::UInt(32), 0xF0)) << 4) |
-                              ((temp_var & IntImm(DataType::UInt(32), 0xF00)) << 8) |
-                              ((temp_var & IntImm(DataType::UInt(32), 0xF000)) << 12));
+                          (temp_var & IntImm(PrimType::UInt(32), 0xF)) |
+                              ((temp_var & IntImm(PrimType::UInt(32), 0xF0)) << 4) |
+                              ((temp_var & IntImm(PrimType::UInt(32), 0xF00)) << 8) |
+                              ((temp_var & IntImm(PrimType::UInt(32), 0xF000)) << 12));
       }
-      os << PrintExpr(tirx::Call(tgt_dtype, tirx::builtin::reinterpret(), {value}));
+      os << PrintExpr(tirx::Call(PrimType(tgt_dtype), tirx::builtin::reinterpret(), {value}));
     } else {
       TVM_FFI_THROW(InternalError)
           << "Invalid number of lanes for float4_e2m1fn reinterpret: " << lanes;
@@ -1411,7 +1453,8 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
 
     const PrimExpr& arg = op->args[0];
     const auto* var_node = arg.as<VarNode>();
-    DataType dtype = op->dtype;
+    DLDataType dtype = op->ty()->dtype;
+    PrimType dtype_ty(dtype);
     bool is_string = op->args[2].as<IntImmNode>()->value;
     bool is_scalar = op->args[3].as<IntImmNode>()->value;
     int num_dims = op->args[4].as<IntImmNode>()->value;
@@ -1432,22 +1475,23 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
     if (is_scalar) {
       // Scalar printing logic
       std::string format_specifier;
-      bool is_float16 = dtype.is_float() && dtype.bits() == 16;
-      if (dtype.is_float())
+      bool is_float16 = dtype_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16);
+      if (dtype_ty.code() == DLDataTypeCode::kDLFloat)
         format_specifier = "%f";
-      else if (dtype.is_int())
+      else if (dtype_ty.MatchesCode(DLDataTypeCode::kDLInt))
         format_specifier = "%d";
-      else if (dtype.is_uint())
+      else if (dtype_ty.MatchesCode(DLDataTypeCode::kDLUInt))
         format_specifier = "%u";
       else
-        TVM_FFI_THROW(InternalError) << "Unsupported data type for scalar print: " << dtype;
+        TVM_FFI_THROW(InternalError)
+            << "Unsupported data type for scalar print: " << ffi::DLDataTypeToString(dtype);
 
       std::string print_arg = var_node ? ("*" + GetVarID(var_node)) : PrintExpr(arg);
       os << "// print_buffer starts (scalar)\n"
          << "if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {\n"
-         << "  printf(\"Scalar (dtype: " << dtype << "): " << format_specifier << "\\n\\n\", "
-         << (is_float16 ? "static_cast<float>(" : "") << print_arg << (is_float16 ? ")" : "")
-         << ");\n"
+         << "  printf(\"Scalar (dtype: " << ffi::DLDataTypeToString(dtype)
+         << "): " << format_specifier << "\\n\\n\", " << (is_float16 ? "static_cast<float>(" : "")
+         << print_arg << (is_float16 ? ")" : "") << ");\n"
          << "}\n"
          << "// print_buffer ends\n";
       return;
@@ -1460,19 +1504,20 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
 
     std::string format_specifier;
     bool is_float16 = false;
-    if (dtype.is_float()) {
-      if (dtype.bits() == 16) {
+    if (dtype_ty.code() == DLDataTypeCode::kDLFloat) {
+      if (dtype.bits == 16) {
         format_specifier = "%f";
         is_float16 = true;
       } else {
         format_specifier = "%f";
       }
-    } else if (dtype.is_int()) {
+    } else if (dtype_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
       format_specifier = "%d";
-    } else if (dtype.is_uint()) {
+    } else if (dtype_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
       format_specifier = "%u";
     } else {
-      TVM_FFI_THROW(InternalError) << "Unsupported data type for print: " << dtype;
+      TVM_FFI_THROW(InternalError)
+          << "Unsupported data type for print: " << ffi::DLDataTypeToString(dtype);
     }
 
     TVM_FFI_ICHECK(var_node) << "Formatted print is only supported for buffer variables.";
@@ -1485,7 +1530,7 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
     for (int i = 0; i < num_dims; ++i) {
       os << PrintExpr(shape[i]) << (i < num_dims - 1 ? "," : "");
     }
-    os << "), dtype=" << dtype << "):\\n\");\n";
+    os << "), dtype=" << ffi::DLDataTypeToString(dtype) << "):\\n\");\n";
 
     std::vector<std::string> loop_vars;
     for (int i = 0; i < num_dims; ++i) {
@@ -1572,7 +1617,7 @@ void CodeGenCUDA::VisitStmt_(const AttrStmtNode* op) {
         << "For CUDA, the index of an async queue must be 0.";
     this->VisitStmt(op->body);
     static const Op& ptx_cp_async_commit_group_op = Op::Get("tirx.ptx.cp_async_commit_group");
-    auto commit_group = Call(DataType::Void(), ptx_cp_async_commit_group_op, {});
+    auto commit_group = Call(PrimType::Void(), ptx_cp_async_commit_group_op, {});
     this->PrintIndent();
     this->VisitExpr(commit_group, this->stream);
     this->stream << ";\n";
@@ -1584,7 +1629,7 @@ void CodeGenCUDA::VisitStmt_(const AttrStmtNode* op) {
         << "For CUDA, the index of an async queue must be 0.";
     auto wait_cnt = wait_attrs.second;
     static const Op& ptx_cp_async_wait_group_op = Op::Get("tirx.ptx.cp_async_wait_group");
-    auto wait_group = Call(DataType::Void(), ptx_cp_async_wait_group_op, {wait_cnt});
+    auto wait_group = Call(PrimType::Void(), ptx_cp_async_wait_group_op, {wait_cnt});
     this->PrintIndent();
     this->VisitExpr(wait_group, this->stream);
     this->stream << ";\n";
@@ -1614,19 +1659,23 @@ void CodeGenCUDA::VisitStmt_(const AllocBufferNode* op) {
   this->PrintIndent();
   std::string scope = GetPtrStorageScope(op->buffer->data);
   const VarNode* buffer = op->buffer->data.as<VarNode>();
-  DataType dtype = op->buffer->dtype;
+  DLDataType dtype = op->buffer->dtype->dtype;
 
   if (scope.find("wmma.") == 0) {
     if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") {
-      TVM_FFI_ICHECK(dtype == DataType::Float(16) || dtype == DataType::Int(8) ||
-                     dtype == DataType::UInt(8) || dtype == DataType::Int(4) ||
-                     dtype == DataType::UInt(4) || dtype == DataType::Int(1) ||
-                     dtype == DataType::BFloat(16))
+      bool supported_wmma_input_dtype =
+          dtype == DLDataType{kDLFloat, 16, 1} || dtype == DLDataType{kDLInt, 8, 1} ||
+          dtype == DLDataType{kDLUInt, 8, 1} || dtype == DLDataType{kDLInt, 4, 1} ||
+          dtype == DLDataType{kDLUInt, 4, 1} || dtype == DLDataType{kDLInt, 1, 1} ||
+          dtype == DLDataType{kDLBfloat, 16, 1};
+      TVM_FFI_ICHECK(supported_wmma_input_dtype)
           << "Matrix_a and matrix_b only support half or char or unsigned char "
           << "or uint4 or int4 or int1 type for now";
     } else {
-      TVM_FFI_ICHECK(dtype == DataType::Float(16) || dtype == DataType::Float(32) ||
-                     dtype == DataType::Int(32))
+      bool supported_wmma_accumulator_dtype = dtype == DLDataType{kDLFloat, 16, 1} ||
+                                              dtype == DLDataType{kDLFloat, 32, 1} ||
+                                              dtype == DLDataType{kDLInt, 32, 1};
+      TVM_FFI_ICHECK(supported_wmma_accumulator_dtype)
           << "Accumulator only support half, float and int type for now";
     }
     PrintWmmaScope(scope, dtype, buffer, stream);
@@ -1662,9 +1711,11 @@ void CodeGenCUDA::VisitStmt_(const AllocBufferNode* op) {
     if (scope.find("wmma.") == 0) {
       constant_size = GetWmmaFragmentSize(scope, buffer, constant_size);
     }
-    if ((dtype == DataType::Int(4) || dtype == DataType::UInt(4) || dtype == DataType::Int(1)) &&
-        scope == "shared") {
-      constant_size = constant_size / (32 / dtype.bits());
+    bool is_packed_integer_dtype = dtype == DLDataType{kDLInt, 4, 1} ||
+                                   dtype == DLDataType{kDLUInt, 4, 1} ||
+                                   dtype == DLDataType{kDLInt, 1, 1};
+    if (is_packed_integer_dtype && scope == "shared") {
+      constant_size = constant_size / (32 / dtype.bits);
     }
     stream << ' ' << vid << '[' << constant_size << "];\n";
   }
@@ -1693,9 +1744,10 @@ void CodeGenCUDA::VisitStmt_(const EvaluateNode* op) {
 }
 
 void CodeGenCUDA::VisitExpr_(const RampNode* op, std::ostream& os) {
-  int lanes = op->dtype.lanes();
+  PrimType op_ty = op->ty();
+  int lanes = op_ty.lanes();
   if (lanes <= 4) {
-    PrintVecConstructor(op->dtype, os);
+    PrintVecConstructor(op->ty()->dtype, os);
     os << "(";
     for (int i = 0; i < lanes; i++) {
       os << "(" << PrintExpr(op->base) << ")"
@@ -1710,16 +1762,16 @@ void CodeGenCUDA::VisitExpr_(const RampNode* op, std::ostream& os) {
   // constructor argument layout does not match TIR vector lane layout.
   std::string sret = name_supply_->FreshName("_");
   this->PrintIndent();
-  this->PrintType(op->dtype, stream);
+  this->PrintType(op->ty()->dtype, stream);
   stream << ' ' << sret << ";\n";
   int ssa_scope = BeginScope();
   {
-    std::string vbase = SSAGetID(PrintExpr(op->base), op->base.dtype());
-    std::string vstride = SSAGetID(PrintExpr(op->stride), op->stride.dtype());
+    std::string vbase = SSAGetID(PrintExpr(op->base), op->base.ty()->dtype);
+    std::string vstride = SSAGetID(PrintExpr(op->stride), op->stride.ty()->dtype);
     for (int i = 0; i < lanes; ++i) {
       std::ostringstream value_temp;
       value_temp << "(" << vbase << ")+(" << vstride << "*" << i << ")";
-      PrintVecElemStore(sret, op->dtype, i, value_temp.str());
+      PrintVecElemStore(sret, op->ty()->dtype, i, value_temp.str());
     }
   }
   EndScope(ssa_scope);
@@ -1727,14 +1779,16 @@ void CodeGenCUDA::VisitExpr_(const RampNode* op, std::ostream& os) {
 }
 
 void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NOLINT(*)
-  int lanes = op->dtype.lanes();
-  if ((op->dtype.is_int() || op->dtype.is_uint()) && op->dtype.bits() == 8 && lanes == 4) {
+  PrimType op_ty = op->ty();
+  int lanes = op_ty.lanes();
+  if ((op_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) && op_ty.bits() == 8 &&
+      lanes == 4) {
     // make_int8x4
     const int64_t* p = as_const_int(op->value);
     TVM_FFI_ICHECK(p);
     int64_t v = *p & 0xFF;
     v = (v << 24) | (v << 16) | (v << 8) | v;
-    if (op->dtype.is_uint()) {
+    if (op_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << "(uint)" << v;
     } else {
       os << "(int)" << v;
@@ -1742,9 +1796,9 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
     return;
   }
 
-  if (op->dtype.is_float16()) {
+  if (op_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
     std::string v = PrintExpr(op->value);
-    PrintVecConstructor(op->dtype, os);
+    PrintVecConstructor(op->ty()->dtype, os);
     os << '(';
     if (lanes <= 4) {
       for (int i = 0; i < lanes / 2; ++i) {
@@ -1761,9 +1815,9 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
     return;
   }
 
-  if (op->dtype.is_bfloat16()) {
+  if (op_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     std::string v = PrintExpr(op->value);
-    PrintVecConstructor(op->dtype, os);
+    PrintVecConstructor(op->ty()->dtype, os);
     os << '(';
     if (lanes > 4) {
       for (int i = 0; i < lanes / 2; ++i) {
@@ -1780,12 +1834,11 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
     return;
   }
 
-  if (op->dtype.is_float8() || op->dtype.is_float4()) {
-    int lanes = op->dtype.lanes();
+  if (IsCUDAFloat8(op_ty.code()) || IsCUDAFloat4(op_ty.code())) {
     TVM_FFI_ICHECK(lanes == 1 || lanes == 2 || lanes == 4);
     std::string v = PrintExpr(op->value);
     // Implicit conversion from float back to fp8
-    PrintType(op->dtype, os);
+    PrintType(op->ty()->dtype, os);
     os << "(make_float" << lanes << "(";
     for (int i = 0; i < lanes; ++i) {
       if (i != 0) os << ", ";
@@ -1795,7 +1848,7 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
     return;
   }
 
-  if ((op->dtype.is_int() || op->dtype.is_uint()) && op->dtype.bits() == 4) {
+  if ((op_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) && op_ty.bits() == 4) {
     bool fail = false;
     const int64_t* p = as_const_int(op->value);
     TVM_FFI_ICHECK(p);
@@ -1803,7 +1856,7 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
 
     if (lanes == 4) {
       v = (v << 12) | (v << 8) | (v << 4) | v;
-      if (op->dtype.is_uint()) {
+      if (op_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
         os << "(uint16_t)" << v;
       } else {
         os << "(int16_t)" << v;
@@ -1811,17 +1864,17 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
     } else {
       v = (v << 28) | (v << 24) | (v << 20) | (v << 16) | (v << 12) | (v << 8) | (v << 4) | v;
       if (lanes == 8) {
-        if (op->dtype.is_uint()) {
+        if (op_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
           os << "(uint)" << v;
         } else {
           os << "(int)" << v;
         }
       } else if (lanes == 16 || lanes == 32) {
-        PrintVecConstructor(op->dtype, os);
+        PrintVecConstructor(op->ty()->dtype, os);
         os << '(';
         for (int i = 0; i < lanes / 8; ++i) {
           if (i != 0) os << ", ";
-          if (op->dtype.is_uint()) {
+          if (op_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
             os << "(uint)" << v;
           } else {
             os << "(int)" << v;
@@ -1839,7 +1892,7 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
   }
 
   std::string v = PrintExpr(op->value);
-  PrintVecConstructor(op->dtype, os);
+  PrintVecConstructor(op->ty()->dtype, os);
   os << '(';
   for (int i = 0; i < lanes; ++i) {
     if (i != 0) os << ", ";
@@ -1849,47 +1902,49 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NO
 }
 
 void CodeGenCUDA::VisitExpr_(const SelectNode* op, std::ostream& os) {
+  PrimType op_ty = op->ty();  // fetched once; every dtype query below reuses it
   // Non-vector cases.
-  if (!op->dtype.is_fixed_length_vector()) {
+  if (!op_ty.IsFixedLengthVector()) {
     CodeGenC::VisitExpr_(op, os);
     return;
   }
 
   // Codegen vector condition case by serializing the select op.
-  TVM_FFI_ICHECK(op->false_value->dtype == op->dtype && op->true_value->dtype == op->dtype &&
-                 op->dtype.lanes() == op->condition.dtype().lanes());
+  TVM_FFI_ICHECK(op->false_value.ty() == op_ty && op->true_value.ty() == op_ty &&
+                 op_ty.lanes() == op->condition.ty().lanes());
 
   std::string r_var = name_supply_->FreshName("_");
   this->PrintIndent();
-  this->PrintType(op->dtype, stream);
+  this->PrintType(op_ty->dtype, stream);
   stream << ' ' << r_var << ";\n";
   {
-    std::string c_var = SSAGetID(PrintExpr(op->condition), op->dtype);
-    std::string t_var = SSAGetID(PrintExpr(op->true_value), op->dtype);
-    std::string f_var = SSAGetID(PrintExpr(op->false_value), op->dtype);
+    std::string c_var = SSAGetID(PrintExpr(op->condition), op_ty->dtype);
+    std::string t_var = SSAGetID(PrintExpr(op->true_value), op_ty->dtype);
+    std::string f_var = SSAGetID(PrintExpr(op->false_value), op_ty->dtype);
 
     // The condition is stored as an ushort vector.
-    int lanes = op->dtype.lanes();
-    DataType memory_ty(DataType::TypeCode::kUInt, 16, lanes);
+    int lanes = op_ty.lanes();
+    DLDataType memory_dtype{kDLUInt, 16, static_cast<uint16_t>(lanes)};
 
     for (int i = 0; i < lanes; ++i) {
       std::ostringstream item;
       item << "(bool(";
-      PrintVecElemLoad(c_var, memory_ty, i, item);
+      PrintVecElemLoad(c_var, memory_dtype, i, item);
       item << ")?";
-      PrintVecElemLoad(t_var, op->dtype, i, item);
+      PrintVecElemLoad(t_var, op_ty->dtype, i, item);
       item << ':';
-      PrintVecElemLoad(f_var, op->dtype, i, item);
+      PrintVecElemLoad(f_var, op_ty->dtype, i, item);
       item << ')';
-      PrintVecElemStore(r_var, op->dtype, i, item.str());
+      PrintVecElemStore(r_var, op_ty->dtype, i, item.str());
     }
   }
   os << r_var;
 }
 
 inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p) {  // NOLINT(*)
+  PrimType op_ty = op->ty();
   // Type code is kBFloat
-  if (op->dtype.is_bfloat16()) {
+  if (op_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     os << "__float2bfloat16_rn";
     os << '(' << std::hexfloat << op->value << 'f';
     os << "/*" << std::scientific << op->value << "*/";
@@ -1897,15 +1952,15 @@ inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p)
     return;
   }
   // Type code is kFloat8_e5m2 or kE4M4Float
-  if (op->dtype.is_float8() || op->dtype.is_float4()) {
-    p->PrintType(op->dtype, os);
+  if (IsCUDAFloat8(op_ty.code()) || IsCUDAFloat4(op_ty.code())) {
+    p->PrintType(op->ty()->dtype, os);
     os << '(' << std::hexfloat << op->value << 'f';
     os << "/*" << std::scientific << op->value << "*/";
     os << ')';
     return;
   }
   // Type code is kFloat
-  switch (op->dtype.bits()) {
+  switch (op_ty.bits()) {
     case 64: {
       std::ostringstream temp;
       if (std::isinf(op->value)) {
@@ -1945,13 +2000,14 @@ inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p)
     }
     case 16: {
       os << "__float2half_rn" << '(';
-      FloatImm const_f32 = FloatImm(DataType::Float(32), op->value);
+      FloatImm const_f32 = FloatImm(PrimType::Float(32), op->value);
       PrintConst(const_f32.get(), os, p);
       os << ')';
       break;
     }
     default:
-      TVM_FFI_THROW(InternalError) << "Bad bit-width for float: " << op->dtype << "\n";
+      TVM_FFI_THROW(InternalError)
+          << "Bad bit-width for float: " << ffi::DLDataTypeToString(op->ty()->dtype) << "\n";
   }
 }
 
@@ -1959,25 +2015,27 @@ void CodeGenCUDA::VisitExpr_(const FloatImmNode* op, std::ostream& os) {  // NOL
   PrintConst(op, os, this);
 }
 
-void CodeGenCUDA::PrintWmmaScope(const std::string& scope, DataType t, const VarNode* variable,
+void CodeGenCUDA::PrintWmmaScope(const std::string& scope, DLDataType t, const VarNode* variable,
                                  std::ostream& os) {
+  PrimType t_ty(t);
   std::stringstream type;
   PrintType(t, type);
   TVM_FFI_ICHECK(fragment_shapes.count(variable))
       << "Cannot find shape of the wmma fragment " << variable->name_hint;
   std::string shape_str = fragment_shapes.at(variable);
-  if ((t.is_int() || t.is_uint()) && t.bits() < 8 && t.lanes() == 1) {
+  if ((t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) && t.bits < 8 &&
+      t_ty.lanes() == 1) {
     type.str(std::string());
-    if (t.is_int()) {
-      if (t.bits() == 4) {
+    if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
+      if (t.bits == 4) {
         type << "nvcuda::wmma::experimental::precision::s4";
-      } else if (t.bits() == 1) {
+      } else if (t.bits == 1) {
         type << "nvcuda::wmma::experimental::precision::b1";
       } else {
         TVM_FFI_THROW(InternalError) << "Unhandled interger type for wmma fragment!";
       }
-    } else if (t.is_uint()) {
-      if (t.bits() == 4) {
+    } else if (t_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
+      if (t.bits == 4) {
         type << "nvcuda::wmma::experimental::precision::u4";
       } else {
         TVM_FFI_THROW(InternalError) << "Unhandled interger type for wmma fragment!";
@@ -2029,20 +2087,25 @@ void CodeGenCUDA::HandleVolatileLoads(const std::string& value, const BufferLoad
   // Cast away volatile qualifier for fp16 types. That is, only loads and
   // stores are volatile. The loaded objects are not marked as volatile.
   //
-  if ((op->dtype.is_float16() || op->dtype.is_bfloat16()) && IsVolatile(op->buffer->data.get())) {
+  PrimType op_ty = op->ty();
+  if ((op_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) ||
+       op_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) &&
+      IsVolatile(op->buffer->data.get())) {
     os << "(";
-    PrintType(op->dtype, os);
+    PrintType(op->ty()->dtype, os);
     os << ")(" << value << ")";
   } else {
     os << value;
   }
 }
 
-void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& value,
+void CodeGenCUDA::PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value,
                                        std::ostream& os) {
-  TVM_FFI_ICHECK_GT(t.lanes(), 1);
-  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
-    if (!(t.lanes() == 2 || t.lanes() == 3)) {
+  PrimType t_ty(t);
+  int lanes = t_ty.lanes();
+  TVM_FFI_ICHECK_GT(lanes, 1);
+  if (t.bits == 8 && (t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))) {
+    if (!(lanes == 2 || lanes == 3)) {
       if (i != 0) {
         os << "|";
       }
@@ -2051,12 +2114,12 @@ void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& val
     }
   }
 
-  if (t.is_float16()) {
+  if (t_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
     if (i == 0) {
       PrintVecConstructor(t, os);
       os << '(';
     }
-    if (i == t.lanes() - 1) {
+    if (i == lanes - 1) {
       os << value << ")";
     } else {
       os << value << ",";
@@ -2064,12 +2127,12 @@ void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& val
     return;
   }
 
-  if (t.is_bfloat16()) {
+  if (t_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     if (i == 0) {
       PrintVecConstructor(t, os);
       os << '(';
     }
-    if (i == t.lanes() - 1) {
+    if (i == lanes - 1) {
       os << value << ")";
     } else {
       os << value << ",";
@@ -2082,7 +2145,7 @@ void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& val
     os << "(";
   }
   os << value;
-  if (i != t.lanes() - 1) {
+  if (i != lanes - 1) {
     os << ",";
   } else {
     os << ")";
diff --git a/src/backend/cuda/codegen/codegen_cuda.h b/src/backend/cuda/codegen/codegen_cuda.h
index 92ca3cab34a4..94f86614e45e 100644
--- a/src/backend/cuda/codegen/codegen_cuda.h
+++ b/src/backend/cuda/codegen/codegen_cuda.h
@@ -56,16 +56,17 @@ class CodeGenCUDA final : public CodeGenC {
   void VisitStmt_(const WhileNode* op) final;
   void PrintStorageSync(const CallNode* op) final;
   void PrintStorageScope(const std::string& scope, std::ostream& os) final;  // NOLINT(*)
-  void PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, PrimExpr rhs,
-                        std::ostream& os) final;       // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;  // NOLINT(*)
-  void PrintVecConstructor(DataType t, std::ostream& os) final;
-  void PrintVecElemLoad(const std::string& vec, DataType t, int i,
+  void PrintVecBinaryOp(const std::string& op, DLDataType t, PrimExpr lhs, PrimExpr rhs,
+                        std::ostream& os) final;         // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;  // NOLINT(*)
+  void PrintVecConstructor(DLDataType t, std::ostream& os) final;
+  void PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                         std::ostream& os) final;  // NOLINT(*)
-  void PrintVecElemStore(const std::string& vec, DataType t, int i, const std::string& value) final;
+  void PrintVecElemStore(const std::string& vec, DLDataType t, int i,
+                         const std::string& value) final;
   void BindThreadIndex(const IterVar& iv) final;  // NOLINT(*)
-  void PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os) final;
-  std::string CastFromTo(std::string value, DataType from, DataType target) final;
+  void PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value, std::ostream& os) final;
+  std::string CastFromTo(std::string value, DLDataType from, DLDataType target) final;
   void AddUtilFunction(const std::string& name, const std::string& code);
   // overload visitor
   void VisitExpr_(const RampNode* op, std::ostream& os) final;       // NOLINT(*)
@@ -129,7 +130,7 @@ class CodeGenCUDA final : public CodeGenC {
   std::unordered_map<const VarNode*, std::string> fragment_shapes;
   std::unordered_map<const VarNode*, std::string> fragment_layouts;
   friend void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p);
-  void PrintWmmaScope(const std::string& scope, DataType t, const VarNode* variable,
+  void PrintWmmaScope(const std::string& scope, DLDataType t, const VarNode* variable,
                       std::ostream& os);
   int32_t GetWmmaFragmentSize(const std::string& scope, const VarNode* variable, int32_t size);
 };
diff --git a/src/backend/cuda/codegen/intrin_rule_cuda.cc b/src/backend/cuda/codegen/intrin_rule_cuda.cc
index dc8d4a020e1e..ea2d0abfa80e 100644
--- a/src/backend/cuda/codegen/intrin_rule_cuda.cc
+++ b/src/backend/cuda/codegen/intrin_rule_cuda.cc
@@ -34,8 +34,8 @@ namespace intrin {
 using tirx::FLowerIntrinsic;
 
 struct CUDAMath {
-  std::string operator()(DataType t, std::string name) const {
-    if (t.is_float()) {
+  std::string operator()(PrimType t, std::string name) const {
+    if (t.code() == DLDataTypeCode::kDLFloat) {
       switch (t.bits()) {
         case 64:
           // Use nearbyint (ties-to-even) for round to match constant-folding semantics.
@@ -56,7 +56,7 @@ struct CUDAMath {
         default:
           return "";
       }
-    } else if (t.is_bfloat16()) {
+    } else if (t.code() == DLDataTypeCode::kDLBfloat && t.bits() == 16) {
       if (name == "fabs") {
         return "__habs";
       } else if (name == "round") {
@@ -64,7 +64,7 @@ struct CUDAMath {
       } else {
         return "h" + name;
       }
-    } else if (t.is_int() || t.is_uint()) {
+    } else if (t.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
       switch (t.bits()) {
         case 32:
           return "__" + name;
@@ -79,8 +79,8 @@ struct CUDAMath {
 };
 
 struct CUDAFastMath : public CUDAMath {
-  std::string operator()(DataType t, std::string name) const {
-    if (t.is_float() && t.bits() == 32) {
+  std::string operator()(PrimType t, std::string name) const {
+    if (t.code() == DLDataTypeCode::kDLFloat && t.bits() == 32) {
       return "__" + name + 'f';
     } else {
       return CUDAMath::operator()(t, name);
@@ -90,8 +90,8 @@ struct CUDAFastMath : public CUDAMath {
 };
 
 struct CUDAFastMathTan : public CUDAMath {
-  std::string operator()(DataType t, std::string name) const {
-    if (t.is_float()) {
+  std::string operator()(PrimType t, std::string name) const {
+    if (t.code() == DLDataTypeCode::kDLFloat) {
       switch (t.bits()) {
         case 64:
           return name;
@@ -110,8 +110,8 @@ struct CUDAFastMathTan : public CUDAMath {
 };
 
 struct CUDAPopcount {
-  std::string operator()(DataType t, std::string name) const {
-    if (t.is_uint()) {
+  std::string operator()(PrimType t, std::string name) const {
+    if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
       switch (t.bits()) {
         case 32:
           return "__popc";
@@ -126,7 +126,7 @@ struct CUDAPopcount {
 };
 
 struct CUDAWarpIntrinsic {
-  const Op operator()(DataType t, const Op& orig_op) const {
+  const Op operator()(PrimType t, const Op& orig_op) const {
     if (orig_op.same_as(builtin::tvm_warp_shuffle())) {
       static const Op& cuda_shfl_sync_op = Op::Get("tirx.cuda.__shfl_sync");
       return cuda_shfl_sync_op;
@@ -147,7 +147,7 @@ struct CUDAWarpIntrinsic {
 static PrimExpr DispatchCUDAWarpActiveMask(const PrimExpr& e) {
   const CallNode* call = e.as<CallNode>();
   static const Op& cuda_active_mask_op = Op::Get("tirx.cuda.__activemask");
-  return Call(call->dtype, cuda_active_mask_op, call->args);
+  return Call(e.ty(), cuda_active_mask_op, call->args);
 }
 
 template <typename T>
@@ -156,7 +156,7 @@ static PrimExpr DispatchCUDAShuffle(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   TVM_FFI_ICHECK_EQ(call->args.size(), 5);  // mask, value, warp_id, width, warp_size
   ffi::Array<PrimExpr> cuda_args{{call->args[0], call->args[1], call->args[2], call->args[3]}};
-  return Call(call->dtype, T()(call->dtype, call->op.as_or_throw<Op>()), cuda_args);
+  return Call(e.ty(), T()(e.ty(), call->op.as_or_throw<Op>()), cuda_args);
 }
 
 void RegisterCudaIntrinRules() {
diff --git a/src/backend/cuda/codegen/llvm/codegen_nvptx.cc b/src/backend/cuda/codegen/llvm/codegen_nvptx.cc
index e523e2b22aab..eb84f10fda10 100644
--- a/src/backend/cuda/codegen/llvm/codegen_nvptx.cc
+++ b/src/backend/cuda/codegen/llvm/codegen_nvptx.cc
@@ -87,7 +87,7 @@ class CodeGenNVPTX : public CodeGenLLVM {
     }
 
     auto storage_scope = runtime::StorageScope::Create(GetPtrStorageScope(op->buffer->data));
-    DataType dtype = op->buffer->dtype;
+    PrimType dtype = op->buffer->dtype;
 
     if (storage_scope.rank == runtime::StorageRank::kShared && storage_scope.tag == ".dyn") {
       // Shared memory: address space == 3
@@ -230,7 +230,8 @@ class CodeGenNVPTX : public CodeGenLLVM {
 // corresponding nvvm intrinsic. Return true if the match is successful.
 static bool GetWarpShuffleIntrinsic(const CallNode* op, llvm::Intrinsic::ID* id) {
   // Only 32 bit data type is supported.
-  if (op->dtype.is_fixed_length_vector() || op->dtype.bits() != 32) {
+  PrimType op_ty = op->ty();
+  if (op_ty.IsFixedLengthVector() || op_ty.bits() != 32) {
     return false;
   }
 
@@ -253,7 +254,7 @@ static bool GetWarpShuffleIntrinsic(const CallNode* op, llvm::Intrinsic::ID* id)
     return false;
   }
 
-  *id = ids[offset + op->dtype.is_float()];
+  *id = ids[offset + (op_ty.code() == DLDataTypeCode::kDLFloat)];
   return true;
 }
 
@@ -279,10 +280,11 @@ llvm::Value* CodeGenNVPTX::CreateIntrinsic(const CallNode* op) {
     auto val = llvm::InlineAsm::get(fty, "activemask.b32 %0", "=r", true);
     return builder_->CreateCall(val);
   } else if (op->op.same_as(builtin::atomic_add())) {
-    TVM_FFI_ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now";
+    PrimType value_ty = op->args[1].ty();
+    TVM_FFI_ICHECK(value_ty.bits() == 32) << "Only supports 32 bit atomic for now";
     llvm::Value* v0 = MakeValue(op->args[0]);
     llvm::Value* v1 = MakeValue(op->args[1]);
-    if (op->args[1]->dtype.is_float()) {
+    if (value_ty.code() == DLDataTypeCode::kDLFloat) {
       return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, llvm::MaybeAlign(),
                                        llvm::AtomicOrdering::Monotonic);
     }
diff --git a/src/backend/cuda/codegen/llvm/intrin_rule_nvptx.cc b/src/backend/cuda/codegen/llvm/intrin_rule_nvptx.cc
index d8706a94b181..13d6f7d95a3b 100644
--- a/src/backend/cuda/codegen/llvm/intrin_rule_nvptx.cc
+++ b/src/backend/cuda/codegen/llvm/intrin_rule_nvptx.cc
@@ -38,7 +38,8 @@ inline PrimExpr DispatchPureExternLibDevice(const PrimExpr& e) {
   using namespace tirx;
   const CallNode* call = e.as<CallNode>();
   TVM_FFI_ICHECK(call != nullptr);
-  TVM_FFI_ICHECK(call->dtype.bits() == 32 || call->dtype.bits() == 64)
+  PrimType call_ty = call->ty();
+  TVM_FFI_ICHECK(call_ty.bits() == 32 || call_ty.bits() == 64)
       << "Only support float32 or float64.";
 
   const OpNode* op = call->op.as<OpNode>();
@@ -48,13 +49,13 @@ inline PrimExpr DispatchPureExternLibDevice(const PrimExpr& e) {
 
   std::ostringstream intrinsic_name;
   intrinsic_name << "__nv_" << name.substr(5);
-  if (call->dtype.bits() == 32) intrinsic_name << "f";
+  if (call_ty.bits() == 32) intrinsic_name << "f";
 
   ffi::Array<PrimExpr> new_args = {StringImm(intrinsic_name.str())};
   for (auto arg : call->args) {
     new_args.push_back(arg);
   }
-  return Call(call->dtype, builtin::call_pure_extern(), new_args);
+  return Call(call_ty, builtin::call_pure_extern(), new_args);
 }
 
 namespace llvm {
@@ -73,7 +74,7 @@ TVM_REGISTER_OP("tirx.round")
       const CallNode* call = e.as<CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       static const Op& nearbyint_op = Op::Get("tirx.nearbyint");
-      auto new_call = Call(call->dtype, nearbyint_op, call->args);
+      auto new_call = Call(call->ty(), nearbyint_op, call->args);
       return DispatchPureExternLibDevice(new_call);
     });
 
diff --git a/src/backend/cuda/runtime/cuda_device_api.cc b/src/backend/cuda/runtime/cuda_device_api.cc
index 68ae39de56bf..6e30df29aa91 100644
--- a/src/backend/cuda/runtime/cuda_device_api.cc
+++ b/src/backend/cuda/runtime/cuda_device_api.cc
@@ -426,7 +426,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
     TVM_FFI_ICHECK_GE(args.size(), 4) << "init_cuTensorMap expects at least 4 arguments";
     size_t arg_cnt = 0;
     CUtensorMap* tensor_map = static_cast<CUtensorMap*>(args[arg_cnt++].cast<void*>());
-    runtime::DataType tensor_dtype = args[arg_cnt++].cast<runtime::DataType>();
+    DLDataType tensor_dtype = args[arg_cnt++].cast<DLDataType>();
     int32_t raw_tensor_rank = args[arg_cnt++].cast<int32_t>();
     TVM_FFI_ICHECK_GT(raw_tensor_rank, 0) << "tensorRank must be non-zero";
     TVM_FFI_ICHECK_LE(raw_tensor_rank, 5)
@@ -482,7 +482,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         << "Expect tensor_dtype to have lanes=1, but get " << tensor_dtype;
     CUtensorMapDataType cu_dtype;
     switch (tensor_dtype.code()) {
-      case DataType::kInt:
+      case kDLInt:
         // int
         switch (tensor_dtype.bits()) {
           case 8:
@@ -499,7 +499,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                 << "Unsupported data type " << ffi::DLDataTypeToString(tensor_dtype);
         }
         break;
-      case DataType::kUInt:
+      case kDLUInt:
         // unsigned int
         switch (tensor_dtype.bits()) {
           case 8:
@@ -519,7 +519,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                 << "Unsupported data type " << ffi::DLDataTypeToString(tensor_dtype);
         }
         break;
-      case DataType::kFloat:
+      case kDLFloat:
         // float
         switch (tensor_dtype.bits()) {
           case 16:
@@ -536,7 +536,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                 << "Unsupported data type " << ffi::DLDataTypeToString(tensor_dtype);
         }
         break;
-      case DataType::kBFloat:
+      case kDLBfloat:
         // bfloat
         switch (tensor_dtype.bits()) {
           case 16:
@@ -547,15 +547,15 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                 << "Unsupported data type " << ffi::DLDataTypeToString(tensor_dtype);
         }
         break;
-      case DataType::kFloat8_e4m3fn:
+      case kDLFloat8_e4m3fn:
         // NV float8 e4m3
         cu_dtype = CU_TENSOR_MAP_DATA_TYPE_UINT8;
         break;
-      case DataType::kFloat8_e5m2:
+      case kDLFloat8_e5m2:
         // NV float8 e5m2
         cu_dtype = CU_TENSOR_MAP_DATA_TYPE_UINT8;
         break;
-      case DataType::kFloat4_e2m1fn:
+      case kDLFloat4_e2m1fn:
 #if (CUDA_VERSION >= 12080)
         // Packed FP4 in GMEM, unpacked into SMEM/TMEM-facing tiles.
         cu_dtype = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B;
diff --git a/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc b/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc
index 017796918444..60959f2aa9fe 100644
--- a/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc
+++ b/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc
@@ -66,6 +66,11 @@
 namespace tvm {
 namespace codegen {
 
+TVM_FFI_INLINE int GetVectorBytes(const PrimType& dtype) {
+  TVM_FFI_ICHECK(dtype.IsFixedLengthVector() || dtype.IsScalar());
+  return dtype.bits() * dtype.lanes() / 8;
+}
+
 // Hexagon code generation
 class CodeGenHexagon final : public CodeGenCPU {
  public:
@@ -97,12 +102,12 @@ class CodeGenHexagon final : public CodeGenCPU {
   void CreatePrintf(const std::string& format, llvm::ArrayRef<llvm::Value*> format_args) final;
 
  private:
-  TypedPointer CreateBufferPtr(llvm::Value* buffer_ptr, DataType buffer_element_dtype,
-                               llvm::ArrayRef<llvm::Value*> indices, DataType value_dtype) final;
+  TypedPointer CreateBufferPtr(llvm::Value* buffer_ptr, PrimType buffer_element_dtype,
+                               llvm::ArrayRef<llvm::Value*> indices, PrimType value_dtype) final;
 
   bool IsQHLFunction(const std::string& func);
 
-  llvm::Value* VectorLookupLoad(Buffer buffer, DataType buffer_type, ffi::Array<PrimExpr> indices);
+  llvm::Value* VectorLookupLoad(Buffer buffer, PrimType buffer_type, ffi::Array<PrimExpr> indices);
   llvm::Value* Intrinsic(llvm::Intrinsic::ID, llvm::ArrayRef<llvm::Value*> args);
   std::vector<std::string> fqhl_list_ = {
       "tvm_vect_qhmath_hvx_cos_ahf",     "tvm_vect_qhmath_hvx_tanh_ahf",
@@ -149,8 +154,9 @@ void CodeGenHexagon::InitTarget() {
 llvm::Value* CodeGenHexagon::CreateCallExternQHL(Type ret_type, ffi::String global_symbol,
                                                  const ffi::Array<PrimExpr>& args,
                                                  bool skip_first_arg) {
-  int num_lanes = args[1].dtype().lanes();
-  int vector_length = native_vector_bits_ / args[1].dtype().bits();
+  PrimType arg_ty = args[1].ty();
+  int num_lanes = arg_ty.lanes();
+  int vector_length = native_vector_bits_ / arg_ty.bits();
   num_lanes = ((num_lanes + vector_length - 1) / vector_length) * vector_length;
   std::vector<llvm::Value*> vect_split;
   for (int i = 0; i < num_lanes / vector_length; ++i) {
@@ -181,8 +187,9 @@ bool CodeGenHexagon::IsQHLFunction(const std::string& func) {
 llvm::Value* CodeGenHexagon::CreateCallExtern(Type ret_type, ffi::String global_symbol,
                                               const ffi::Array<PrimExpr>& args,
                                               bool skip_first_arg) {
-  int num_lanes = args[1].dtype().lanes();
-  int vector_length = native_vector_bits_ / args[1].dtype().bits();
+  PrimType arg_ty = args[1].ty();
+  int num_lanes = arg_ty.lanes();
+  int vector_length = native_vector_bits_ / arg_ty.bits();
   if (IsQHLFunction(global_symbol) && (num_lanes > vector_length))
     return CreateCallExternQHL(ret_type, global_symbol, args, skip_first_arg);
   return CodeGenCPU::CreateCallExtern(ret_type, global_symbol, args, skip_first_arg);
@@ -192,7 +199,7 @@ llvm::Value* CodeGenHexagon::VisitExpr_(const BufferLoadNode* op) {
   if (!op->buffer.same_as(op->buffer->data)) {
     // Check if we can generate a vector lookup.
     if (!op->indices[0].as<RampNode>()) {
-      if (auto* vlut = VectorLookupLoad(op->buffer, op->dtype, op->indices)) {
+      if (auto* vlut = VectorLookupLoad(op->buffer, op->ty(), op->indices)) {
         return vlut;
       }
     }
@@ -261,9 +268,9 @@ void CodeGenHexagon::CreatePrintf(const std::string& format,
 }
 
 CodeGenLLVM::TypedPointer CodeGenHexagon::CreateBufferPtr(llvm::Value* buffer_ptr,
-                                                          DataType buffer_element_dtype,
+                                                          PrimType buffer_element_dtype,
                                                           llvm::ArrayRef<llvm::Value*> indices,
-                                                          DataType value_dtype) {
+                                                          PrimType value_dtype) {
   // Flat indices get delegated to the LLVM codegen.
   if (indices.size() == 1) {
     return CodeGenCPU::CreateBufferPtr(buffer_ptr, buffer_element_dtype, indices, value_dtype);
@@ -274,7 +281,7 @@ CodeGenLLVM::TypedPointer CodeGenHexagon::CreateBufferPtr(llvm::Value* buffer_pt
       << "-d buffer indices";
 
   // Use the first index to identify the pointer.
-  DataType dtype_void_ptr = DataType::Handle();
+  PrimType dtype_void_ptr = PrimType::Handle();
   CodeGenLLVM::TypedPointer buffer_chunk_ptr_ptr =
       CodeGenCPU::CreateBufferPtr(buffer_ptr, dtype_void_ptr, {indices[0]}, dtype_void_ptr);
   llvm::Value* buffer_chunk_ptr =
@@ -317,10 +324,11 @@ llvm::Value* CodeGenHexagon::Intrinsic(llvm::Intrinsic::ID IntID,
   return builder_->CreateCall(intf_callee, conv_args);
 }
 
-llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_type,
+llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, PrimType buffer_type,
                                               ffi::Array<PrimExpr> indices) {
   PrimExpr index = indices[0];
-  if (!index.dtype().is_fixed_length_vector()) {
+  PrimType index_ty = index.ty();
+  if (!index_ty.IsFixedLengthVector()) {
     return nullptr;
   }
 
@@ -329,16 +337,16 @@ llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_typ
   int table_elem_count = arith::Analyzer()->Simplify(buffer->shape[0]).as<IntImmNode>()->value;
   if (table_elem_count <= 0 || table_elem_count > 256) return nullptr;
 
-  auto int32 = DataType::Int(32);
+  auto int32 = PrimType::Int(32);
   auto native_vector_bytes = native_vector_bits_ / 8;
 
   // Indexes
-  llvm::Value* trunc = MakeValue(Cast(index.dtype().with_bits(8), index));
+  llvm::Value* trunc = MakeValue(Cast(index_ty.WithBits(8), index));
   llvm::Value* index_pad = CreateVecPad(trunc, native_vector_bytes);
 
   // Values
   std::vector<llvm::Value*> vloads;
-  DataType table_type = buffer_type.with_lanes(table_elem_count);
+  PrimType table_type = buffer_type.WithLanes(table_elem_count);
 
   auto table_all =
       MakeValue(BufferLoad(buffer, {
@@ -347,7 +355,9 @@ llvm::Value* CodeGenHexagon::VectorLookupLoad(Buffer buffer, DataType buffer_typ
 
   // The number of value vectors should be a power of 2.
   int table_vec_count = llvm::PowerOf2Ceil(GetVectorBytes(table_type) / native_vector_bytes);
-  int table_vec_length = native_vector_bytes / buffer_type.bytes();
+  // Elements per native vector: divide by the byte size of ONE element. buffer_type is the type
+  // of the whole load, and GetVectorBytes(buffer_type) would also multiply by its lanes.
+  int table_vec_length = native_vector_bytes / ((buffer_type.bits() + 7) / 8);
   for (int i = 0; i != table_vec_count; ++i) {
     // CreateVecSlice will generate undefs for elements outside the source vector.
     vloads.push_back(CreateVecSlice(table_all, i * table_vec_length, table_vec_length));
diff --git a/src/backend/hexagon/codegen/llvm/intrin_rule_hexagon.cc b/src/backend/hexagon/codegen/llvm/intrin_rule_hexagon.cc
index 3e46e322a881..928df03f38aa 100644
--- a/src/backend/hexagon/codegen/llvm/intrin_rule_hexagon.cc
+++ b/src/backend/hexagon/codegen/llvm/intrin_rule_hexagon.cc
@@ -50,7 +50,7 @@ inline PrimExpr TVMExternCall(const tirx::CallNode* call, const std::string& fna
   for (PrimExpr arg : call->args) {
     new_args.push_back(arg);
   }
-  return tirx::Call(call->dtype, tirx::builtin::call_pure_extern(), new_args);
+  return tirx::Call(call->ty(), tirx::builtin::call_pure_extern(), new_args);
 }
 
 template <std::string& tvm_wrapper, unsigned id, int num_sign>
@@ -72,14 +72,16 @@ inline PrimExpr DispatchTVMQHLWrapperFp16(const PrimExpr& e) {
 
   // Enable QHL library for FP16 data type
   const PrimExpr& x = call->args[0];
-  if (x->dtype.is_float16() && x->dtype.is_vector() && useqhl) {
+  PrimType x_ty = x.ty();
+  if (x_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) &&
+      (x_ty.IsFixedLengthVector() || x_ty.IsScalableVector()) && useqhl) {
     return TVMExternCall(call, tvm_wrapper);
   }
 #endif
-  new_args.push_back(IntImm(DataType::UInt(32), id));
-  new_args.push_back(IntImm(DataType::UInt(32), num_sign));
+  new_args.push_back(IntImm(PrimType::UInt(32), id));
+  new_args.push_back(IntImm(PrimType::UInt(32), num_sign));
   new_args.insert(new_args.end(), call->args.begin(), call->args.end());
-  return tirx::Call(call->dtype, tirx::builtin::call_llvm_pure_intrin(), new_args);
+  return tirx::Call(call->ty(), tirx::builtin::call_llvm_pure_intrin(), new_args);
 }
 
 void RegisterHexagonIntrinRules() {
@@ -117,6 +119,7 @@ TVM_REGISTER_OP("tirx.tanh")
       const tirx::CallNode* call = e.as<tirx::CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       const PrimExpr& x = call->args[0];
+      PrimType x_ty = x.ty();
 
 #if ENABLE_QHL
       // Check target for qfloat enablement
@@ -130,14 +133,15 @@ TVM_REGISTER_OP("tirx.tanh")
       }
 
       // Enable QHL library for FP16 data type
-      if (x->dtype.is_float16() && x->dtype.is_vector() && useqhl) {
+      if (x_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) &&
+          (x_ty.IsFixedLengthVector() || x_ty.IsScalableVector()) && useqhl) {
         std::string tvm_wrapper("tvm_vect_qhmath_hvx_tanh_ahf");
         return TVMExternCall(call, tvm_wrapper);
       }
 #endif
-      PrimExpr one = tirx::MakeConst(x.dtype(), 1);
-      PrimExpr two = tirx::MakeConst(x.dtype(), 2);
-      PrimExpr neg_two = tirx::MakeConst(x.dtype(), -2);
+      PrimExpr one = tirx::MakeConst(x_ty, 1);
+      PrimExpr two = tirx::MakeConst(x_ty, 2);
+      PrimExpr neg_two = tirx::MakeConst(x_ty, -2);
 
       PrimExpr exp_neg2x = exp(neg_two * x);
       PrimExpr exp_pos2x = exp(two * x);
@@ -145,7 +149,7 @@ TVM_REGISTER_OP("tirx.tanh")
       PrimExpr tanh_pos = (one - exp_neg2x) / (one + exp_neg2x);
       PrimExpr tanh_neg = (exp_pos2x - one) / (exp_pos2x + one);
       // MakeConst can handle both vector and scalar types.
-      PrimExpr tanh_x = tirx::Select(x >= tirx::MakeConst(x.dtype(), 0), tanh_pos, tanh_neg);
+      PrimExpr tanh_x = tirx::Select(x >= tirx::MakeConst(x_ty, 0), tanh_pos, tanh_neg);
       return tanh_x;
     });
 
@@ -154,6 +158,7 @@ TVM_REGISTER_OP("tirx.tan")
       const tirx::CallNode* call = e.as<tirx::CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       const PrimExpr& x = call->args[0];
+      PrimType x_ty = x.ty();
 #if ENABLE_QHL
       // Check target for qfloat enablement
       const auto f = tvm::ffi::Function::GetGlobal("target.TargetCurrent");
@@ -166,7 +171,8 @@ TVM_REGISTER_OP("tirx.tan")
       }
 
       // Enable QHL library for FP16 data type
-      if (x->dtype.is_float16() && x->dtype.is_vector() && useqhl) {
+      if (x_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) &&
+          (x_ty.IsFixedLengthVector() || x_ty.IsScalableVector()) && useqhl) {
         std::string tvm_wrapper("tvm_vect_qhmath_hvx_tan_ahf");
         return TVMExternCall(call, tvm_wrapper);
       }
@@ -184,6 +190,7 @@ TVM_REGISTER_OP("tirx.sigmoid")
       const tirx::CallNode* call = e.as<tirx::CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       const PrimExpr& x = call->args[0];
+      PrimType x_ty = x.ty();
 #if ENABLE_QHL
       // Check target for qfloat enablement
       const auto f = tvm::ffi::Function::GetGlobal("target.TargetCurrent");
@@ -195,21 +202,22 @@ TVM_REGISTER_OP("tirx.sigmoid")
         useqhl = tstring.find("+hvx-qfloat") != std::string::npos;
       }
 
-      PrimExpr MinBound = tirx::MakeConst(x.dtype(), -8);
-      PrimExpr MaxBound = tirx::MakeConst(x.dtype(), 8);
+      PrimExpr MinBound = tirx::MakeConst(x_ty, -8);
+      PrimExpr MaxBound = tirx::MakeConst(x_ty, 8);
       const PrimExpr v1 = tirx::Max(x, MinBound);
       const PrimExpr v2 = tirx::Min(v1, MaxBound);
 
       ffi::Array<tvm::PrimExpr> new_args = {v2};
-      const tirx::Call new_call = tirx::Call(call->dtype, call->op, new_args);
+      const tirx::Call new_call = tirx::Call(call->ty(), call->op, new_args);
 
       // Enable QHL library for FP16 data type
-      if (x->dtype.is_float16() && x->dtype.is_vector() && useqhl) {
+      if (x_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16) &&
+          (x_ty.IsFixedLengthVector() || x_ty.IsScalableVector()) && useqhl) {
         std::string tvm_wrapper("tvm_vect_qhmath_hvx_sigmoid_ahf");
         return TVMExternCall(new_call.get(), tvm_wrapper);
       }
 #endif
-      PrimExpr one = tirx::MakeConst(x.dtype(), 1);
+      PrimExpr one = tirx::MakeConst(x_ty, 1);
       return one / (one + exp(-x));
     });
 
diff --git a/src/backend/hexagon/runtime/ops/conv2d_fp16_hvx.cc b/src/backend/hexagon/runtime/ops/conv2d_fp16_hvx.cc
index d555fb77cfae..c063ae62b1bd 100644
--- a/src/backend/hexagon/runtime/ops/conv2d_fp16_hvx.cc
+++ b/src/backend/hexagon/runtime/ops/conv2d_fp16_hvx.cc
@@ -21,8 +21,8 @@
 #include <hexagon_types.h>
 #include <hvx_hexagon_protos.h>
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/runtime/base.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
 
 #include <algorithm>
@@ -469,7 +469,7 @@ int conv2d_packed_fp16(void*, TVMFFIAny* args, int num_args, TVMFFIAny* out_val)
   // Prepare zero_block
   int64_t block_nbytes = 2048;
   void* zero_block = device_api->AllocDataSpace(conv_utils::hexagon_device, 1, &block_nbytes,
-                                                tvm::runtime::DataType::UInt(8), vtcm_scope);
+                                                DLDataType{kDLUInt, 8, 1}, vtcm_scope);
   memset(zero_block, 0, 2048);
 
   // FIXME: Setting bias to zero_block: this works for up to 256 output channels.
diff --git a/src/backend/metal/codegen/codegen_metal.cc b/src/backend/metal/codegen/codegen_metal.cc
index 3f483f79aaed..e6ef1647e5bf 100644
--- a/src/backend/metal/codegen/codegen_metal.cc
+++ b/src/backend/metal/codegen/codegen_metal.cc
@@ -46,7 +46,7 @@ void CodeGenMetal::InitFuncState(const PrimFunc& f) {
   CodeGenC::InitFuncState(f);
   // analyze the data;
   for (Var arg : f->params) {
-    if (arg.dtype().is_handle()) {
+    if (arg.ty().IsHandle()) {
       alloc_storage_scope_[arg.get()] = "global";
     }
   }
@@ -97,7 +97,7 @@ void CodeGenMetal::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
   }
   for (size_t i = 0; i < func->params.size(); ++i, ++num_buffer) {
     Var v = func->params[i];
-    if (!v.dtype().is_handle()) break;
+    if (!v.ty().IsHandle()) break;
     this->stream << "  ";
     std::string vid = AllocVarID(v.get());
     auto it = alloc_storage_scope_.find(v.get());
@@ -126,24 +126,24 @@ void CodeGenMetal::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
     decl_stream << "struct " << arg_buf_type << " {\n";
     for (size_t i = num_buffer; i < func->params.size(); ++i) {
       Var v = func->params[i];
-      TVM_FFI_ICHECK(!v.dtype().is_handle());
+      TVM_FFI_ICHECK(!v.ty().IsHandle());
       std::string vid = AllocVarID(v.get());
       std::ostringstream vref;
-      if (v.dtype().bits() == 32) {
+      if (v.ty().bits() == 32) {
         decl_stream << "  ";
-        PrintType(v.dtype(), decl_stream);
+        PrintType(v.ty()->dtype, decl_stream);
         decl_stream << " " << vid << "[2];\n";
         vref << varg << "." << vid << "[0]";
-      } else if (v.dtype().bits() == 64) {
+      } else if (v.ty().bits() == 64) {
         decl_stream << "  ";
-        PrintType(v.dtype(), decl_stream);
+        PrintType(v.ty()->dtype, decl_stream);
         decl_stream << " " << vid << ";\n";
         vref << varg << "." << vid;
       } else {
         // For non 32bit type, ref through arg union.
         decl_stream << "  __TVMArgUnion " << vid << ";\n";
         vref << varg << "." << vid << ".v_";
-        PrintType(v.dtype(), vref);
+        PrintType(v.ty()->dtype, vref);
       }
       var_idmap_[v.get()] = vref.str();
     }
@@ -165,10 +165,14 @@ void CodeGenMetal::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
   if (work_dim != 0) {
     // use ushort by default for now
     stream << "  ";
-    PrintType(DataType::UInt(thread_index_bits_, work_dim), stream);
+    PrintType(DLDataType{kDLUInt, static_cast<uint8_t>(thread_index_bits_),
+                         static_cast<uint16_t>(work_dim)},
+              stream);
     stream << " blockIdx [[threadgroup_position_in_grid]],\n";
     stream << "  ";
-    PrintType(DataType::UInt(thread_index_bits_, work_dim), stream);
+    PrintType(DLDataType{kDLUInt, static_cast<uint8_t>(thread_index_bits_),
+                         static_cast<uint16_t>(work_dim)},
+              stream);
     stream << " threadIdx [[thread_position_in_threadgroup]]\n";
   }
   thread_work_dim_ = work_dim;
@@ -190,28 +194,29 @@ void CodeGenMetal::BindThreadIndex(const IterVar& iv) {
   if (thread_work_dim_ <= 1) {
     vname = vname.substr(0, iv->thread_tag.length() - 2);
   }
-  var_idmap_[iv->var.get()] =
-      CastFromTo(vname, DataType::UInt(thread_index_bits_), iv->var.dtype());
+  var_idmap_[iv->var.get()] = CastFromTo(
+      vname, DLDataType{kDLUInt, static_cast<uint8_t>(thread_index_bits_), 1}, iv->var.ty()->dtype);
 }
 
-void CodeGenMetal::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenMetal::PrintType(DLDataType raw_t, std::ostream& os) {  // NOLINT(*)
+  PrimType t(raw_t);
   int lanes = t.lanes();
-  if (t.is_handle()) {
+  if (t.IsHandle()) {
     TVM_FFI_ICHECK_EQ(lanes, 1) << "do not yet support vector types";
     os << "void*";
     return;
   }
 
-  if (t.is_void()) {
+  if (t.IsVoid()) {
     os << "void";
     return;
   }
-  if (t == DataType::Bool()) {
+  if (raw_t == DLDataType{kDLBool, 8, 1}) {
     os << "bool";
     return;
   }
   bool fail = false;
-  if (t.is_float()) {
+  if (t.code() == DLDataTypeCode::kDLFloat) {
     // Need to care about sizes and alignment of half3/float3 because tirx representation might not
     // be aware of Metal half3/float3 details and can treat them as just three elements,
     // while sizes and alignmnents of half3/float3 are one element more (half3-8 bytes/
@@ -239,8 +244,8 @@ void CodeGenMetal::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       os << lanes;
       return;
     }
-  } else if (t.is_uint() || t.is_int()) {
-    if (t.is_uint()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)) {
+    if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << 'u';
     }
     switch (t.bits()) {
@@ -268,11 +273,12 @@ void CodeGenMetal::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       os << lanes;
       return;
     }
-  } else if (t.is_bfloat16()) {
+  } else if (t.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
     os << "bfloat";
     return;
   }
-  TVM_FFI_THROW(InternalError) << "Cannot convert type " << t << " to Metal type";
+  TVM_FFI_THROW(InternalError) << "Cannot convert type " << ffi::DLDataTypeToString(raw_t)
+                               << " to Metal type";
 }
 
 void CodeGenMetal::PrintStorageSync(const CallNode* op) {
@@ -288,12 +294,12 @@ void CodeGenMetal::PrintStorageSync(const CallNode* op) {
   }
 }
 
-void CodeGenMetal::PrintVecElemLoad(const std::string& vec, DataType t, int i,
+void CodeGenMetal::PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                                     std::ostream& os) {  // NOLINT(*)
   os << vec << "[" << i << "]";
 }
 
-void CodeGenMetal::PrintVecElemStore(const std::string& vec, DataType t, int i,
+void CodeGenMetal::PrintVecElemStore(const std::string& vec, DLDataType t, int i,
                                      const std::string& value) {
   this->PrintIndent();
   stream << vec << "[" << i << "]"
@@ -328,11 +334,14 @@ void CodeGenMetal::VisitStmt_(const AllocBufferNode* op) {
 
   auto scope = GetPtrStorageScope(op->buffer->data);
   alloc_storage_scope_[op->buffer->data.get()] = scope;
-  DataType dtype = op->buffer->dtype;
+  DLDataType dtype = op->buffer->dtype->dtype;
   if (scope == "metal.simdgroup") {
-    TVM_FFI_ICHECK(dtype == DataType::Float(16) || dtype == DataType::Float(32) ||
-                   dtype == DataType::BFloat(16))
-        << "Only float16, float32, and bfloat16 are supported, but got " << dtype;
+    bool supported_simdgroup_dtype = dtype == DLDataType{kDLFloat, 16, 1} ||
+                                     dtype == DLDataType{kDLFloat, 32, 1} ||
+                                     dtype == DLDataType{kDLBfloat, 16, 1};
+    TVM_FFI_ICHECK(supported_simdgroup_dtype)
+        << "Only float16, float32, and bfloat16 are supported, but got "
+        << ffi::DLDataTypeToString(dtype);
     TVM_FFI_ICHECK(constant_size % 64 == 0)
         << "Only 8x8 matrix is supported, but got " << constant_size << " bytes\n";
 
@@ -360,8 +369,8 @@ void CodeGenMetal::VisitExpr_(const SelectNode* op, std::ostream& os) {  // NOLI
 
 void CodeGenMetal::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NOLINT(*)
   std::string v = PrintExpr(op->value);
-  int lanes = op->dtype.lanes();
-  PrintType(op->dtype, os);
+  int lanes = op->ty().lanes();
+  PrintType(op->ty()->dtype, os);
   os << "(";
   for (int i = 0; i < lanes; ++i) {
     if (i != 0) os << ", ";
@@ -422,7 +431,7 @@ void CodeGenMetal::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT
   } else if (op->op.same_as(builtin::reinterpret())) {
     // generate as_type<TYPE>(ARG)
     os << "(as_type<";
-    this->PrintType(op->dtype, os);
+    this->PrintType(op->ty()->dtype, os);
     os << ">(";
     this->PrintExpr(op->args[0], os);
     os << "))";
@@ -442,9 +451,9 @@ void CodeGenMetal::VisitExpr_(const FloatImmNode* op, std::ostream& os) {  // NO
     temp << "NAN";
   } else {
     temp << std::scientific << op->value;
-    if (op->dtype.bits() == 32)
+    if (op->ty().bits() == 32)
       temp << 'f';
-    else if (op->dtype.bits() == 16)
+    else if (op->ty().bits() == 16)
       temp << 'h';
   }
   MarkConst(temp.str());
diff --git a/src/backend/metal/codegen/codegen_metal.h b/src/backend/metal/codegen/codegen_metal.h
index b92608aecfa1..ffa9a321aa43 100644
--- a/src/backend/metal/codegen/codegen_metal.h
+++ b/src/backend/metal/codegen/codegen_metal.h
@@ -43,13 +43,14 @@ class CodeGenMetal final : public CodeGenC {
   void InitFuncState(const PrimFunc& f) final;
   void PrintStorageScope(const std::string& scope, std::ostream& os) final;  // NOLINT(*)
   void PrintStorageSync(const CallNode* op) final;                           // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;                        // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;                      // NOLINT(*)
   void BindThreadIndex(const IterVar& iv) final;                             // NOLINT(*)
   // print load of single element
-  void PrintVecElemLoad(const std::string& vec, DataType t, int i,
+  void PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                         std::ostream& os) final;  // NOLINT(*)
   // print store of single element.
-  void PrintVecElemStore(const std::string& vec, DataType t, int i, const std::string& value) final;
+  void PrintVecElemStore(const std::string& vec, DLDataType t, int i,
+                         const std::string& value) final;
   // overload visitor
   void VisitStmt_(const AllocBufferNode* op) final;                  // NOLINT(*)
   void VisitExpr_(const SelectNode* op, std::ostream& os) final;     // NOLINT(*)
diff --git a/src/backend/metal/codegen/intrin_rule_metal.cc b/src/backend/metal/codegen/intrin_rule_metal.cc
index c807ac4c2e8a..999fe526f04e 100644
--- a/src/backend/metal/codegen/intrin_rule_metal.cc
+++ b/src/backend/metal/codegen/intrin_rule_metal.cc
@@ -31,7 +31,7 @@ namespace intrin {
 using tirx::FLowerIntrinsic;
 
 struct MetalWarpIntrinsic {
-  const Op operator()(DataType t, const Op& orig_op) const {
+  const Op operator()(PrimType t, const Op& orig_op) const {
     if (orig_op.same_as(builtin::tvm_warp_shuffle())) {
       static const Op& metal_simd_shuffle_op = Op::Get("tirx.metal.simd_shuffle");
       return metal_simd_shuffle_op;
@@ -52,7 +52,7 @@ static PrimExpr DispatchMetalShuffle(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   TVM_FFI_ICHECK_EQ(call->args.size(), 5);  // mask, value, warp_id, width, warp_size
   ffi::Array<PrimExpr> metal_args{{call->args[1], call->args[2]}};
-  return Call(call->dtype, T()(call->dtype, call->op.as_or_throw<Op>()), metal_args);
+  return Call(e.ty(), T()(e.ty(), call->op.as_or_throw<Op>()), metal_args);
 }
 
 void RegisterMetalIntrinRules() {
@@ -81,7 +81,7 @@ TVM_REGISTER_OP("tirx.round")
       for (auto arg : call->args) {
         new_args.push_back(arg);
       }
-      return tirx::Call(call->dtype, tirx::builtin::call_pure_extern(), new_args);
+      return tirx::Call(e.ty(), tirx::builtin::call_pure_extern(), new_args);
     });
 
 TVM_REGISTER_OP("tirx.nearbyint")
diff --git a/src/backend/opencl/codegen/codegen_opencl.cc b/src/backend/opencl/codegen/codegen_opencl.cc
index 51719785195b..001d4a33b081 100644
--- a/src/backend/opencl/codegen/codegen_opencl.cc
+++ b/src/backend/opencl/codegen/codegen_opencl.cc
@@ -84,7 +84,7 @@ void CodeGenOpenCL::InitFuncState(const PrimFunc& f) {
       // Storage scope qualifiers for textures are inferred
       // and set prior to function codegen.
       continue;
-    } else if (arg.dtype().is_handle()) {
+    } else if (arg.ty().IsHandle()) {
       alloc_storage_scope_[arg.get()] = "global";
     }
   }
@@ -189,26 +189,27 @@ void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) {
   } else {
     os << "get_group_id(" << ts.dim_index << ")";
   }
-  var_idmap_[iv->var.get()] = CastFromTo(os.str(), DataType::UInt(64), iv->var.dtype());
+  var_idmap_[iv->var.get()] = CastFromTo(os.str(), DLDataType{kDLUInt, 64, 1}, iv->var.ty()->dtype);
 }
 
-void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenOpenCL::PrintType(DLDataType raw_t, std::ostream& os) {  // NOLINT(*)
+  PrimType t(raw_t);
   int lanes = t.lanes();
-  if (t.is_handle()) {
+  if (t.IsHandle()) {
     TVM_FFI_ICHECK_EQ(lanes, 1) << "do not yet support vector types";
     os << "void*";
     return;
   }
-  if (t.is_void()) {
+  if (t.IsVoid()) {
     os << "void";
     return;
   }
-  if (t == DataType::Bool()) {
+  if (raw_t == DLDataType{kDLBool, 8, 1}) {
     os << "bool";
     return;
   }
   bool fail = false;
-  if (t.is_float()) {
+  if (t.code() == DLDataTypeCode::kDLFloat) {
     switch (t.bits()) {
       case 16:
         os << "half";
@@ -230,14 +231,14 @@ void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       os << lanes;
       return;
     }
-  } else if (t.is_bool()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLBool)) {
     os << "uint";
     if (!fail && ((lanes >= 2 && lanes <= 4) || lanes == 8 || lanes == 16)) {
       os << lanes;
       return;
     }
-  } else if (t.is_uint() || t.is_int()) {
-    if (t.is_uint()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)) {
+    if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << 'u';
     }
     switch (t.bits()) {
@@ -266,7 +267,8 @@ void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       return;
     }
   }
-  TVM_FFI_THROW(InternalError) << "Cannot convert type " << t << " to OpenCL type";
+  TVM_FFI_THROW(InternalError) << "Cannot convert type " << ffi::DLDataTypeToString(raw_t)
+                               << " to OpenCL type";
 }
 
 void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) {  // NOLINT(*)
@@ -286,41 +288,44 @@ void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) {  // NOLINT(*
   }
 }
 
-void CodeGenOpenCL::PrintVecAddr(const BufferNode* buffer, DataType t, PrimExpr base,
+void CodeGenOpenCL::PrintVecAddr(const BufferNode* buffer, DLDataType t, PrimExpr base,
                                  std::ostream& os) {  // NOLINT(*)
   const VarNode* buffer_var = buffer->data.get();
-  if (!HandleTypeMatch(buffer_var, t.element_of())) {
+  DLDataType elem_type{t.code, t.bits, 1};
+  if (!HandleTypeMatch(buffer_var, elem_type)) {
     os << '(';
     auto it = alloc_storage_scope_.find(buffer_var);
     if (it != alloc_storage_scope_.end()) {
       PrintStorageScope(it->second, os);
     }
-    PrintType(t.element_of(), os);
+    PrintType(elem_type, os);
     os << "*)";
   }
   os << GetVarID(buffer_var) << " + ";
   PrintExpr(base, os);
 }
-std::string CodeGenOpenCL::GetVecLoad(DataType t, const BufferNode* buffer, PrimExpr base) {
+std::string CodeGenOpenCL::GetVecLoad(DLDataType t, const BufferNode* buffer, PrimExpr base) {
   std::ostringstream os;
-  os << "vload" << t.lanes() << "(0, ";
+  os << "vload" << PrimType(t).lanes() << "(0, ";
   PrintVecAddr(buffer, t, base, os);
   os << ")";
   return os.str();
 }
 
-void CodeGenOpenCL::PrintVecStore(const BufferNode* buffer, DataType t, PrimExpr base,
+void CodeGenOpenCL::PrintVecStore(const BufferNode* buffer, DLDataType t, PrimExpr base,
                                   const std::string& value) {
   this->PrintIndent();
-  stream << "vstore" << t.lanes() << "(" << value << ", 0, ";
+  stream << "vstore" << PrimType(t).lanes() << "(" << value << ", 0, ";
   PrintVecAddr(buffer, t, base, stream);
   stream << ");\n";
 }
 
-void CodeGenOpenCL::PrintVecElemLoadExpr(DataType t, int i, const std::string& value,
+void CodeGenOpenCL::PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value,
                                          std::ostream& os) {  // NOLINT(*)
-  TVM_FFI_ICHECK_GT(t.lanes(), 1);
-  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
+  PrimType t_ty(t);
+  int lanes = t_ty.lanes();
+  TVM_FFI_ICHECK_GT(lanes, 1);
+  if (t.bits == 8 && (t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))) {
     if (i != 0) {
       os << "|";
     }
@@ -334,7 +339,7 @@ void CodeGenOpenCL::PrintVecElemLoadExpr(DataType t, int i, const std::string& v
     os << ")(";
   }
   os << value;
-  if (i != t.lanes() - 1) {
+  if (i != lanes - 1) {
     os << ",";
   } else {
     os << "))";
@@ -376,14 +381,14 @@ void CodeGenOpenCL::PrintRestrict(const Var& v, std::ostream& os) {
   }
 }
 
-std::string CodeGenOpenCL::CastFromTo(std::string value, DataType from, DataType target) {
+std::string CodeGenOpenCL::CastFromTo(std::string value, DLDataType from, DLDataType target) {
   if (from == target) return value;
   return CastTo(value, target);
 }
 
-std::string CodeGenOpenCL::CastTo(std::string value, DataType target) {
+std::string CodeGenOpenCL::CastTo(std::string value, DLDataType target) {
   std::ostringstream os;
-  if (target == DataType::Bool()) {
+  if (target == DLDataType{kDLBool, 8, 1}) {
     os << "(";
     os << "(";
     this->PrintType(target, os);
@@ -422,7 +427,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
     if (it != alloc_storage_scope_.end()) {
       PrintStorageScope(it->second, os);
     }
-    this->PrintType(load->dtype.element_of(), os);
+    this->PrintType(DLDataType{load->ty()->dtype.code, load->ty()->dtype.bits, 1}, os);
     os << " *)" << this->GetVarID(load->buffer->data.get()) << " + ";
     this->PrintExpr(load->indices[0], os);
     os << ')';
@@ -434,13 +439,14 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
     const int channel_size = op->args[4].as_or_throw<IntImm>()->value;
     TVM_FFI_ICHECK(channel_size == 64 || channel_size == 128)
         << "Unsupported Channel Size: " << channel_size;
-    DataType channel_type = runtime::GetChannelType(channel_size);
+    DLDataType channel_type = runtime::GetChannelType(channel_size);
 
-    DataType buffer_type = ptr_type->element_type.as<PrimTypeNode>()->dtype;
+    DLDataType buffer_type = ptr_type->element_type.as<PrimTypeNode>()->dtype;
     std::stringstream ss;
     this->PrintExpr(op->args[5], ss);
     std::string value;
-    value = this->SSAGetID(ss.str(), buffer_type.with_lanes(channel_size / buffer_type.bits()));
+    value = this->SSAGetID(ss.str(),
+                           PrimType(buffer_type).WithLanes(channel_size / buffer_type.bits)->dtype);
     if (channel_size == 64) {
       os << "write_imageh(";
     } else if (channel_size == 128) {
@@ -467,11 +473,11 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
     enable_compliant_texture_reads_ = true;
     std::stringstream ss;
     const int channel_size = op->args[4].as_or_throw<IntImm>()->value;
-    const int data_lanes = channel_size / op->dtype.bits();
+    const int data_lanes = channel_size / op->ty().bits();
     TVM_FFI_ICHECK(channel_size == 64 || channel_size == 128)
         << "Unsupported Channel Size: " << channel_size;
     ss << "as_";
-    this->PrintType(op->dtype.with_lanes(data_lanes), ss);
+    this->PrintType(op->ty().WithLanes(data_lanes)->dtype, ss);
     ss << "(";
     if (channel_size == 64) {
       ss << "READ_IMAGEH(";
@@ -493,7 +499,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
     this->PrintExpr(IntImm::Int32(0), ss);
     ss << "))))";
 
-    std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(data_lanes));
+    std::string rhs = SSAGetID(ss.str(), op->ty().WithLanes(data_lanes)->dtype);
     if (auto ramp = op->args.back().as<RampNode>()) {
       if (ramp->base.as<IntImmNode>() && *tirx::as_const_int(ramp->base) == 0 &&
           *tirx::as_const_int(ramp->lanes) == data_lanes &&
@@ -501,10 +507,10 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
         os << rhs;
       } else if (*tirx::as_const_int(ramp->stride) == 1) {
         os << "(*(";
-        this->PrintType(op->dtype.with_lanes(*tirx::as_const_int(ramp->lanes)), os);
+        this->PrintType(op->ty().WithLanes(*tirx::as_const_int(ramp->lanes))->dtype, os);
         os << "*)";
         os << "((";
-        this->PrintType(op->dtype.with_lanes(1), os);
+        this->PrintType(op->ty().WithLanes(1)->dtype, os);
         os << "*)&" << rhs << " + ";
         this->PrintExpr(ramp->base, os);
         os << "))";
@@ -513,7 +519,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
       }
     } else {
       os << "((";
-      this->PrintType(op->dtype.with_lanes(1), os);
+      this->PrintType(op->ty().WithLanes(1)->dtype, os);
       os << "*)&" << rhs << ")[";
       this->PrintExpr(op->args.back(), os);
       os << "]";
@@ -521,7 +527,7 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
   } else if (op->op.same_as(builtin_call_extern_) || op->op.same_as(builtin_call_pure_extern_)) {
     auto func = op->args[0].as_or_throw<StringImm>();
     // Enable atomics extension if used.
-    if (func->value == "atomic_add" && op->dtype.is_float()) {
+    if (func->value == "atomic_add" && op->ty().code() == DLDataTypeCode::kDLFloat) {
       enable_atomics_ = true;
       this->PrintCallExtern(GetType(ffi::GetRef<PrimExpr>(op)), "atomic_add_float_emu", op->args,
                             true, os);
@@ -540,9 +546,9 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) {
 
 void CodeGenOpenCL::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NOLINT(*)
   std::string v = PrintExpr(op->value);
-  int lanes = op->dtype.lanes();
+  int lanes = op->ty().lanes();
   os << "((";
-  PrintType(op->dtype, os);
+  PrintType(op->ty()->dtype, os);
   os << ")(";
   for (int i = 0; i < lanes; ++i) {
     if (i != 0) os << ", ";
@@ -553,9 +559,9 @@ void CodeGenOpenCL::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  //
 
 void CodeGenOpenCL::VisitExpr_(const RampNode* op, std::ostream& os) {  // NOLINT(*)
   os << "((";
-  PrintType(op->dtype, os);
+  PrintType(op->ty()->dtype, os);
   os << ")(";
-  int lanes = op->dtype.lanes();
+  int lanes = op->ty().lanes();
   for (int i = 0; i < lanes; i++) {
     os << "(" << PrintExpr(op->base) << ")"
        << "+(" << PrintExpr(op->stride) << "*" << i << ")";
@@ -579,18 +585,18 @@ void CodeGenOpenCL::VisitExpr_(const FloatImmNode* op, std::ostream& os) {  // N
 
 template <typename T>
 inline void PrintBinaryExpr(const T* op, const char* opstr, std::ostream& os, CodeGenOpenCL* p) {
-  if (op->dtype.lanes() == 1) {
+  if (op->ty().lanes() == 1) {
     os << opstr << "((";
-    p->PrintType(op->a->dtype, os);
+    p->PrintType(op->a.ty()->dtype, os);
     os << ")";
     p->PrintExpr(op->a, os);
     os << ", (";
-    p->PrintType(op->b->dtype, os);
+    p->PrintType(op->b.ty()->dtype, os);
     os << ")";
     p->PrintExpr(op->b, os);
     os << ')';
   } else {
-    p->PrintVecBinaryOp(opstr, op->dtype, op->a, op->b, os);
+    p->PrintVecBinaryOp(opstr, op->ty()->dtype, op->a, op->b, os);
   }
 }
 
@@ -604,14 +610,16 @@ void CodeGenOpenCL::VisitExpr_(const MaxNode* op, std::ostream& os) {
 
 void CodeGenOpenCL::VisitExpr_(const ModNode* op, std::ostream& os) {  // NOLINT(*)
   std::string opstr;
-  if (op->dtype.is_int() || op->dtype.is_uint()) {
+  PrimType op_ty = op->ty();
+  if (op_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     opstr = "%";
   } else {
-    TVM_FFI_ICHECK(op->dtype.is_float())
-        << "Expected floating point or integer dtype in Mod, but got " << op->dtype;
+    TVM_FFI_ICHECK(op_ty.code() == DLDataTypeCode::kDLFloat)
+        << "Expected floating point or integer dtype in Mod, but got "
+        << ffi::DLDataTypeToString(op_ty->dtype);
     opstr = "fmod";
   }
-  if (op->dtype.lanes() == 1) {
+  if (op_ty.lanes() == 1) {
     if (isalpha(opstr.c_str()[0])) {
       os << opstr.c_str() << '(';
       this->PrintExpr(op->a, os);
@@ -626,7 +634,7 @@ void CodeGenOpenCL::VisitExpr_(const ModNode* op, std::ostream& os) {  // NOLINT
       os << ')';
     }
   } else {
-    this->PrintVecBinaryOp(opstr.c_str(), op->dtype, op->a, op->b, os);
+    this->PrintVecBinaryOp(opstr.c_str(), op->ty()->dtype, op->a, op->b, os);
   }
 }
 
@@ -634,11 +642,11 @@ void CodeGenOpenCL::VisitExpr_(const AndNode* op, std::ostream& os) {
   std::ostringstream oss;
   os << "(";
   this->PrintExpr(op->a, oss);
-  os << CastTo(oss.str(), op->dtype);
+  os << CastTo(oss.str(), op->ty()->dtype);
   oss.str("");
   os << " && ";
   this->PrintExpr(op->b, oss);
-  os << CastTo(oss.str(), op->dtype);
+  os << CastTo(oss.str(), op->ty()->dtype);
   os << ")";
 }
 
@@ -646,11 +654,11 @@ void CodeGenOpenCL::VisitExpr_(const OrNode* op, std::ostream& os) {
   std::ostringstream oss;
   os << "(";
   this->PrintExpr(op->a, oss);
-  os << CastTo(oss.str(), op->dtype);
+  os << CastTo(oss.str(), op->ty()->dtype);
   oss.str("");
   os << " || ";
   this->PrintExpr(op->b, oss);
-  os << CastTo(oss.str(), op->dtype);
+  os << CastTo(oss.str(), op->ty()->dtype);
   os << ")";
 }
 
@@ -658,18 +666,19 @@ void CodeGenOpenCL::VisitExpr_(const SelectNode* op, std::ostream& os) {
   std::ostringstream oss;
   os << "select(";
   PrintExpr(op->false_value, oss);
-  os << CastFromTo(oss.str(), op->false_value.dtype(), op->dtype);
+  os << CastFromTo(oss.str(), op->false_value.ty()->dtype, op->ty()->dtype);
   oss.str("");
   os << ", ";
   PrintExpr(op->true_value, oss);
-  os << CastFromTo(oss.str(), op->true_value.dtype(), op->dtype);
+  os << CastFromTo(oss.str(), op->true_value.ty()->dtype, op->ty()->dtype);
   oss.str("");
   os << ", ";
   PrintExpr(op->condition, oss);
-  if (op->dtype.is_float()) {
-    os << CastTo(oss.str(), DataType::Int(op->dtype.bits(), op->dtype.lanes()));
+  if (op->ty().code() == DLDataTypeCode::kDLFloat) {
+    os << CastTo(oss.str(), DLDataType{kDLInt, static_cast<uint8_t>(op->ty().bits()),
+                                       static_cast<uint16_t>(op->ty().lanes())});
   } else {
-    os << CastFromTo(oss.str(), op->condition.dtype(), op->dtype);
+    os << CastFromTo(oss.str(), op->condition.ty()->dtype, op->ty()->dtype);
   }
   os << ")";
 }
diff --git a/src/backend/opencl/codegen/codegen_opencl.h b/src/backend/opencl/codegen/codegen_opencl.h
index d588a18c2029..47667e30663a 100644
--- a/src/backend/opencl/codegen/codegen_opencl.h
+++ b/src/backend/opencl/codegen/codegen_opencl.h
@@ -46,20 +46,20 @@ class CodeGenOpenCL final : public CodeGenC {
   void BindThreadIndex(const IterVar& iv) final;                             // NOLINT(*)
   void PrintStorageScope(const std::string& scope, std::ostream& os) final;  // NOLINT(*)
   void PrintStorageSync(const CallNode* op) final;                           // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;                        // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;                      // NOLINT(*)
   void PrintType(const Type& type, std::ostream& os) final;                  // NOLINT(*)
-  std::string GetVecLoad(DataType t, const BufferNode* buffer, PrimExpr base) final;
-  void PrintVecStore(const BufferNode* buffer, DataType t, PrimExpr base,
+  std::string GetVecLoad(DLDataType t, const BufferNode* buffer, PrimExpr base) final;
+  void PrintVecStore(const BufferNode* buffer, DLDataType t, PrimExpr base,
                      const std::string& value) final;  // NOLINT(*)
-  void PrintVecElemLoadExpr(DataType t, int i, const std::string& value,
+  void PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value,
                             std::ostream& os) final;  // NOLINT(*)
   // the address of load/store
-  void PrintVecAddr(const BufferNode* buffer, DataType t, PrimExpr base,
-                    std::ostream& os);                                           // NOLINT(*)
-  void PrintRestrict(const Var& v, std::ostream& os) final;                      // NOLINT(*)
-  std::string CastFromTo(std::string value, DataType from, DataType target);     // NOLINT(*)
-  std::string CastTo(std::string value, DataType target);                        // NOLINT(*)
-  void SetTextureScope(const std::unordered_map<const VarNode*, std::string>&);  // NOLINT(*)
+  void PrintVecAddr(const BufferNode* buffer, DLDataType t, PrimExpr base,
+                    std::ostream& os);                                            // NOLINT(*)
+  void PrintRestrict(const Var& v, std::ostream& os) final;                       // NOLINT(*)
+  std::string CastFromTo(std::string value, DLDataType from, DLDataType target);  // NOLINT(*)
+  std::string CastTo(std::string value, DLDataType target);                       // NOLINT(*)
+  void SetTextureScope(const std::unordered_map<const VarNode*, std::string>&);   // NOLINT(*)
 
   // overload visitor
   void VisitStmt_(const AllocBufferNode* op) final;                  // NOLINT(*)
diff --git a/src/backend/opencl/codegen/intrin_rule_opencl.cc b/src/backend/opencl/codegen/intrin_rule_opencl.cc
index f0f58be84d10..669fd1863b39 100644
--- a/src/backend/opencl/codegen/intrin_rule_opencl.cc
+++ b/src/backend/opencl/codegen/intrin_rule_opencl.cc
@@ -42,7 +42,7 @@ static PrimExpr DispatchIntelShuffle(const PrimExpr& e) {
       << "Intel warp shuffle dose not support width != warp_size";
   ffi::Array<PrimExpr> opencl_args{
       {StringImm("intel_sub_group_shuffle"), call->args[1], call->args[2]}};
-  return Call(call->dtype, builtin::call_pure_extern(), opencl_args);
+  return Call(e.ty(), builtin::call_pure_extern(), opencl_args);
 }
 
 void RegisterOpenCLIntrinRules() {
@@ -75,7 +75,7 @@ TVM_REGISTER_OP("tirx.round")
       for (auto arg : call->args) {
         new_args.push_back(arg);
       }
-      return tirx::Call(call->dtype, tirx::builtin::call_pure_extern(), new_args);
+      return tirx::Call(e.ty(), tirx::builtin::call_pure_extern(), new_args);
     });
 
 TVM_REGISTER_OP("tirx.nearbyint")
diff --git a/src/backend/opencl/runtime/opencl_common.h b/src/backend/opencl/runtime/opencl_common.h
index 3b99fa166def..4fc7ce85e383 100644
--- a/src/backend/opencl/runtime/opencl_common.h
+++ b/src/backend/opencl/runtime/opencl_common.h
@@ -186,24 +186,25 @@ inline const char* CLGetErrorString(cl_int error) {
 }
 
 inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) {
-  DataType dtype(data_type);
-  dtype = dtype.with_lanes(1);
+  DLDataType dtype = data_type;
+  // OpenCL image channel type depends on the scalar element type, not vector lanes.
+  dtype.lanes = 1;
 
-  if (dtype == DataType::Float(32)) {
+  if (dtype == DLDataType{kDLFloat, 32, 1}) {
     return CL_FLOAT;
-  } else if (dtype == DataType::Float(16)) {
+  } else if (dtype == DLDataType{kDLFloat, 16, 1}) {
     return CL_HALF_FLOAT;
-  } else if (dtype == DataType::Int(8)) {
+  } else if (dtype == DLDataType{kDLInt, 8, 1}) {
     return CL_SIGNED_INT8;
-  } else if (dtype == DataType::Int(16)) {
+  } else if (dtype == DLDataType{kDLInt, 16, 1}) {
     return CL_SIGNED_INT16;
-  } else if (dtype == DataType::Int(32)) {
+  } else if (dtype == DLDataType{kDLInt, 32, 1}) {
     return CL_SIGNED_INT32;
-  } else if (dtype == DataType::UInt(8)) {
+  } else if (dtype == DLDataType{kDLUInt, 8, 1}) {
     return CL_UNSIGNED_INT8;
-  } else if (dtype == DataType::UInt(16)) {
+  } else if (dtype == DLDataType{kDLUInt, 16, 1}) {
     return CL_UNSIGNED_INT16;
-  } else if (dtype == DataType::UInt(32)) {
+  } else if (dtype == DLDataType{kDLUInt, 32, 1}) {
     return CL_UNSIGNED_INT32;
   }
   TVM_FFI_THROW(InternalError) << "data type is not supported in OpenCL runtime yet: " << dtype;
diff --git a/src/backend/opencl/runtime/opencl_device_api.cc b/src/backend/opencl/runtime/opencl_device_api.cc
index eeb8e95ad543..0b53a1915192 100644
--- a/src/backend/opencl/runtime/opencl_device_api.cc
+++ b/src/backend/opencl/runtime/opencl_device_api.cc
@@ -779,14 +779,11 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                     int64_t height = shape[1];
                     int64_t depth = shape[2];
                     int64_t channel_size = args[7].cast<int64_t>();
-                    DataType channel_type = GetChannelType(channel_size);
+                    DLDataType channel_type = GetChannelType(channel_size);
                     Device dev;
                     dev.device_type = static_cast<DLDeviceType>(device_type);
                     dev.device_id = device_id;
-                    DLDataType type_hint;
-                    type_hint.code = channel_type.code();
-                    type_hint.bits = channel_type.bits();
-                    type_hint.lanes = channel_type.lanes();
+                    DLDataType type_hint = channel_type;
 
                     *rv = OpenCLWorkspace::Global()->AllocDataSpace(
                         dev, static_cast<size_t>(width), static_cast<size_t>(height),
diff --git a/src/backend/opencl/runtime/texture.h b/src/backend/opencl/runtime/texture.h
index a8711805cbfa..3aa2d3681142 100644
--- a/src/backend/opencl/runtime/texture.h
+++ b/src/backend/opencl/runtime/texture.h
@@ -120,15 +120,13 @@ size_t GetTextureMemorySize(T shape, int bits, int lanes, std::string mem_scope,
 /*!
  * \brief Returns the standard channel datatype for any given type.
  * \param channel_size The Number of bits in a Channel
- * \return DataType to be used in the codegen.
+ * \return DLDataType to be used in the codegen.
  */
-inline DataType GetChannelType(size_t channel_size) {
-  DataType channel_type;
-
+inline DLDataType GetChannelType(size_t channel_size) {
   if (channel_size == 128)
-    return DataType::Float(32, 4);
+    return DLDataType{kDLFloat, 32, 4};
   else if (channel_size == 64)
-    return DataType::Float(16, 4);
+    return DLDataType{kDLFloat, 16, 4};
 
   TVM_FFI_THROW(InternalError) << "Unsupported Channel Size: " << channel_size;
 }
diff --git a/src/backend/rocm/codegen/llvm/codegen_amdgpu.cc b/src/backend/rocm/codegen/llvm/codegen_amdgpu.cc
index 22ce75cddade..6f70343f46a4 100644
--- a/src/backend/rocm/codegen/llvm/codegen_amdgpu.cc
+++ b/src/backend/rocm/codegen/llvm/codegen_amdgpu.cc
@@ -100,7 +100,7 @@ class CodeGenAMDGPU : public CodeGenLLVM {
     llvm::Value* buf = nullptr;
     StorageInfo& info = alloc_storage_info_[op->buffer->data.get()];
     auto storage_scope = runtime::StorageScope::Create(GetPtrStorageScope(op->buffer->data));
-    DataType dtype = op->buffer->dtype;
+    PrimType dtype = op->buffer->dtype;
 
     if (storage_scope.rank == runtime::StorageRank::kShared && storage_scope.tag == ".dyn") {
       LOG(WARNING) << "Dynamic shared memory support for rocm is experimental.";
@@ -188,7 +188,7 @@ class CodeGenAMDGPU : public CodeGenLLVM {
     llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), intrin_id);
 #endif
     llvm::Value* result = builder_->CreateCall(f, {});
-    return this->CreateCast(DataType::Int(32), iv->var->dtype, result);
+    return this->CreateCast(PrimType::Int(32), iv->var.ty(), result);
   }
 
   llvm::Value* CreateStorageSync(const CallNode* op) final {
@@ -220,10 +220,11 @@ class CodeGenAMDGPU : public CodeGenLLVM {
 
   llvm::Value* CreateIntrinsic(const CallNode* op) final {
     if (op->op.same_as(builtin::atomic_add())) {
-      TVM_FFI_ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now";
+      PrimType value_ty = op->args[1].ty();
+      TVM_FFI_ICHECK(value_ty.bits() == 32) << "Only supports 32 bit atomic for now";
       llvm::Value* v0 = MakeValue(op->args[0]);
       llvm::Value* v1 = MakeValue(op->args[1]);
-      if (op->args[1]->dtype.is_float()) {
+      if (value_ty.MatchesCode(DLDataTypeCode::kDLFloat)) {
         return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, llvm::MaybeAlign(),
                                          llvm::AtomicOrdering::Monotonic);
       }
diff --git a/src/backend/rocm/codegen/llvm/intrin_rule_rocm.cc b/src/backend/rocm/codegen/llvm/intrin_rule_rocm.cc
index 4859fd5f4a24..db0f113b9c8b 100644
--- a/src/backend/rocm/codegen/llvm/intrin_rule_rocm.cc
+++ b/src/backend/rocm/codegen/llvm/intrin_rule_rocm.cc
@@ -50,14 +50,14 @@ inline PrimExpr DispatchPureExternOCML(const PrimExpr& e) {
   TVM_FFI_ICHECK_EQ(name.substr(0, 5), "tirx.");
 
   std::ostringstream intrinsic_name;
-  intrinsic_name << "__ocml_" << name.substr(5) << "_f" << call->dtype.bits();
+  intrinsic_name << "__ocml_" << name.substr(5) << "_f" << call->ty().bits();
 
   ffi::Array<PrimExpr> new_args = {StringImm(intrinsic_name.str())};
   for (auto arg : call->args) {
     new_args.push_back(arg);
   }
 
-  return Call(call->dtype, builtin::call_pure_extern(), new_args);
+  return Call(call->ty(), builtin::call_pure_extern(), new_args);
 }
 
 inline PrimExpr DispatchShuffle(const PrimExpr& e) {
@@ -66,15 +66,17 @@ inline PrimExpr DispatchShuffle(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   TVM_FFI_ICHECK_EQ(call->args.size(), 5);  // mask, value, warp_id, width, warp_size
   PrimExpr var = call->args[1];
-  TVM_FFI_ICHECK_EQ(var.dtype().bits(), 32);
+  PrimType var_ty = var.ty();
+  TVM_FFI_ICHECK_EQ(var_ty.bits(), 32);
 
   // get own lane in self (__lane_id)
   PrimExpr minus_one = IntImm::Int32(-1);
   PrimExpr zero = IntImm::Int32(0);
-  PrimExpr lo = Call(DataType::Int(32), builtin::call_pure_extern(),
+  PrimType i32_ty = PrimType::Int(32);
+  PrimExpr lo = Call(i32_ty, builtin::call_pure_extern(),
                      {StringImm("llvm.amdgcn.mbcnt.lo"), minus_one, zero});
-  PrimExpr self = Call(DataType::Int(32), builtin::call_pure_extern(),
-                       {StringImm("llvm.amdgcn.mbcnt.hi"), minus_one, lo});
+  PrimExpr self =
+      Call(i32_ty, builtin::call_pure_extern(), {StringImm("llvm.amdgcn.mbcnt.hi"), minus_one, lo});
 
   // compute lane to get from
   PrimExpr width = call->args[3];
@@ -93,12 +95,12 @@ inline PrimExpr DispatchShuffle(const PrimExpr& e) {
     index = Select((self & (width - 1)) + delta >= width, self, index);
   }
   // reinterprete var as int32
-  bool is_int32 = var.dtype().is_int() && var.dtype().bits() == 32;
-  PrimExpr source = is_int32 ? var : reinterpret(DataType::Int(32), var);
-  PrimExpr res = Call(DataType::Int(32), builtin::call_pure_extern(),
+  bool is_int32 = var_ty.MatchesElementType(DLDataTypeCode::kDLInt, 32);
+  PrimExpr source = is_int32 ? var : reinterpret(i32_ty, var);
+  PrimExpr res = Call(i32_ty, builtin::call_pure_extern(),
                       {StringImm("llvm.amdgcn.ds.bpermute"), index << 2, source});
   if (!is_int32) {
-    res = reinterpret(var.dtype(), res);
+    res = reinterpret(var_ty, res);
   }
   return res;
 }
diff --git a/src/backend/trn/codegen/codegen_trn.cc b/src/backend/trn/codegen/codegen_trn.cc
index eb9d7ca4b437..631df21f8b08 100644
--- a/src/backend/trn/codegen/codegen_trn.cc
+++ b/src/backend/trn/codegen/codegen_trn.cc
@@ -110,7 +110,7 @@ void CodeGenTrainium::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
   size_t num_buffer = 0;
   for (size_t i = 0; i < func->params.size(); ++i, ++num_buffer) {
     Var v = func->params[i];
-    if (!v.dtype().is_handle()) {
+    if (!v.ty().IsHandle()) {
       LOG(FATAL) << "Trainium codegen currently only support buffer arguments";
     };
     std::string vid = AllocVarID(v.get());
@@ -137,16 +137,17 @@ void CodeGenTrainium::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
   this->EndScope(func_scope);
 }
 
-void CodeGenTrainium::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenTrainium::PrintType(DLDataType raw_t, std::ostream& os) {  // NOLINT(*)
+  PrimType t(raw_t);
   int lanes = t.lanes();
   TVM_FFI_ICHECK(lanes == 1) << "Trainium codegen does not support vector types";
-  TVM_FFI_ICHECK(!t.is_handle()) << "Trainium codegen does not support handle type";
-  TVM_FFI_ICHECK(!t.is_void()) << "Trainium codegen does not support void type";
-  if (t == DataType::Bool()) {
+  TVM_FFI_ICHECK(!t.IsHandle()) << "Trainium codegen does not support handle type";
+  TVM_FFI_ICHECK(!t.IsVoid()) << "Trainium codegen does not support void type";
+  if (t.MatchesCode(DLDataTypeCode::kDLBool)) {
     os << "np.bool";
     return;
   }
-  if (t.is_float()) {
+  if (t.code() == DLDataTypeCode::kDLFloat) {
     switch (t.bits()) {
       case 16:
         os << "np.float16";
@@ -160,13 +161,13 @@ void CodeGenTrainium::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
     }
     return;
   }
-  if (t.is_uint() || t.is_int()) {
+  if (t.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)) {
     if (t.bits() == 1) {
       os << "np.bool";
       return;
     }
     os << "np.";
-    if (t.is_uint()) {
+    if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
       os << 'u';
     }
     switch (t.bits()) {
@@ -188,11 +189,11 @@ void CodeGenTrainium::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
     }
     return;
   }
-  if (t.is_bfloat16()) {
+  if (t.code() == DLDataTypeCode::kDLBfloat && t.bits() == 16) {
     os << "nl.bfloat16";
     return;
   }
-  LOG(FATAL) << "Cannot convert type " << t << " to Trainium type";
+  LOG(FATAL) << "Cannot convert type " << raw_t << " to Trainium type";
 }
 
 std::string CodeGenTrainium::GetStorageScopeStr(const std::string& scope) {  // NOLINT(*)
@@ -215,7 +216,7 @@ void CodeGenTrainium::VisitStmt_(const AllocBufferNode* op) {
   this->PrintIndent();
   auto scope = GetPtrStorageScope(op->buffer->data);
   std::ostringstream dtype_os;
-  PrintType(op->buffer->dtype, dtype_os);
+  PrintType(op->buffer->dtype->dtype, dtype_os);
   std::string dtype_str = dtype_os.str();
   if (scope == "trn.psum") {
     stream << vid << " = nl.ndarray(shape=[";
@@ -589,7 +590,7 @@ void CodeGenTrainium::VisitExpr_(const VarNode* op, std::ostream& os) {  // NOLI
 }
 
 void CodeGenTrainium::VisitExpr_(const CastNode* op, std::ostream& os) {
-  ctx_.dst_dtype = op->dtype;
+  ctx_.dst_dtype = op->ty();
   CodeGenTrainium::VisitExpr(op->value, os);
 }
 
diff --git a/src/backend/trn/codegen/codegen_trn.h b/src/backend/trn/codegen/codegen_trn.h
index 2c3b5fd37393..ec4eaad29cce 100644
--- a/src/backend/trn/codegen/codegen_trn.h
+++ b/src/backend/trn/codegen/codegen_trn.h
@@ -41,7 +41,7 @@ struct NKIInstructionCtx {
   bool is_matmul_input = false;
   int buffer_index = -1;
   int used_var_cnt = 0;
-  DataType dst_dtype;
+  PrimType dst_dtype = PrimType::Void();
   PrimExpr mask;
   bool tensorizing = false;
 };
@@ -57,7 +57,7 @@ class CodeGenTrainium final : public CodeGenC {
   void InitFuncState(const PrimFunc& f) final;
   std::string GetStorageScopeStr(const std::string& scope);           // NOLINT(*)
   void VisitExpr_(const VarNode* op, std::ostream& os) final;         // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;                 // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;               // NOLINT(*)
   void VisitStmt_(const AllocBufferNode* op) final;                   // NOLINT(*)
   void VisitStmt_(const AttrStmtNode* op) final;                      // NOLINT(*)
   void VisitStmt_(const ForNode* op) final;                           // NOLINT(*)
diff --git a/src/backend/trn/transform/lower_trainium_layout.cc b/src/backend/trn/transform/lower_trainium_layout.cc
index ad4b206a48b2..fb1d92c5215d 100644
--- a/src/backend/trn/transform/lower_trainium_layout.cc
+++ b/src/backend/trn/transform/lower_trainium_layout.cc
@@ -176,8 +176,8 @@ class TrainiumLayoutApplier : public arith::IRMutatorWithAnalyzer {
       flattened = buf.GetFlattenedBuffer();
       writer = flattened.CopyOnWrite();
     }
-    if (flattened->dtype == DataType::Bool()) {
-      writer->dtype = DataType::Int(8);
+    if (flattened->dtype->dtype == DLDataType{kDLBool, 8, 1}) {
+      writer->dtype = PrimType::Int(8);
     }
     for (size_t i = 0; i < flattened->shape.size(); ++i) {
       writer->shape.Set(i, analyzer_->canonical_simplify(flattened->shape[i]));
@@ -191,28 +191,30 @@ class TrainiumLayoutApplier : public arith::IRMutatorWithAnalyzer {
 
   Stmt VisitStmt_(const BufferStoreNode* op) final {
     BufferStore store = StmtExprMutator::VisitStmt_(op).as_or_throw<BufferStore>();
-    bool store_returns_bool = (op->value.dtype() == DataType::Bool());
+    PrimType store_value_ty = op->value.ty();
+    bool store_returns_bool = (store_value_ty == PrimType::Bool());  // scalar bool only
     store = VisitBufferAccess(store);
 
     if (store_returns_bool) {
-      TVM_FFI_ICHECK_EQ(store->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(store->buffer->dtype->dtype, (DLDataType{kDLInt, 8, 1}))
           << "Expected int8 backing array for boolean tensor";
       auto writer = store.CopyOnWrite();
-      writer->value = tvm::cast(DataType::Int(8), store->value);
+      writer->value = tvm::cast(PrimType::Int(8), store->value);
       return std::move(store);
     }
     return std::move(store);
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
-    bool load_returns_bool = (op->dtype == DataType::Bool());
+    PrimType load_ty = op->ty();
+    bool load_returns_bool = (load_ty == PrimType::Bool());  // scalar bool only
     BufferLoad load = StmtExprMutator::VisitExpr_(op).as_or_throw<BufferLoad>();
     load = VisitBufferAccess(load);
     if (load_returns_bool) {
-      TVM_FFI_ICHECK_EQ(load->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(load->buffer->dtype->dtype, (DLDataType{kDLInt, 8, 1}))
           << "Expected int8 backing array for boolean tensor";
-      load.CopyOnWrite()->dtype = DataType::Int(8);
-      return tvm::cast(DataType::Bool(), load);
+      load.CopyOnWrite()->BaseExprNode::ty = PrimType::Int(8);
+      return tvm::cast(PrimType::Bool(), load);
     } else {
       return std::move(load);
     }
diff --git a/src/backend/vulkan/codegen/codegen_spirv.cc b/src/backend/vulkan/codegen/codegen_spirv.cc
index 5737c60da9dc..094e31370481 100644
--- a/src/backend/vulkan/codegen/codegen_spirv.cc
+++ b/src/backend/vulkan/codegen/codegen_spirv.cc
@@ -52,8 +52,8 @@ runtime::SPIRVShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std::s
   const uint32_t descriptor_set = 0;
 
   for (Var arg : f->params) {
-    DataType t = arg.dtype();
-    if (t.is_handle()) {
+    PrimType t = PrimType(arg.ty()->dtype);
+    if (t.IsHandle()) {
       auto* ptr = arg->type_annotation.as<PointerTypeNode>();
       TVM_FFI_ICHECK(ptr)
           << "All handles passed to the Vulkan codegen must have a type_annotation as a "
@@ -64,11 +64,11 @@ runtime::SPIRVShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std::s
           << "All handles passed to the Vulkan codegen must have a type_annotation as a "
              "PointerType, "
           << "and must point to a PrimType";
-      DataType value_storage_type = prim->dtype;
-      if (value_storage_type == DataType::Bool()) {
+      PrimType value_storage_type(prim->dtype);
+      if (value_storage_type == PrimType::Bool()) {
         // We need a physically addressable buffer type to support boolean tensors.
         // The loaded byte is cast to bool inside the LoadNode visitor below.
-        value_storage_type = boolean_storage_type_.with_lanes(value_storage_type.lanes());
+        value_storage_type = boolean_storage_type_.WithLanes(value_storage_type.lanes());
       }
       spirv::Value arg_value = builder_->BufferArgument(builder_->GetSType(value_storage_type),
                                                         descriptor_set, i_buffer++);
@@ -87,7 +87,7 @@ runtime::SPIRVShader CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std::s
   if (pod_args.size() != 0) {
     std::vector<spirv::SType> value_types;
     for (size_t i = 0; i < pod_args.size(); ++i) {
-      value_types.push_back(builder_->GetSType(pod_args[i].dtype()));
+      value_types.push_back(builder_->GetSType(PrimType(pod_args[i].ty()->dtype)));
     }
     if (pod_args.size() * sizeof(runtime::ArgUnion64) <= runtime::vulkan::kMaxPushConstantsBytes) {
       spirv::Value ptr = builder_->DeclarePushConstant(value_types);
@@ -150,7 +150,7 @@ spirv::Value CodeGenSPIRV::GetThreadIndex(const IterVar& iv, const PrimExpr& ext
   } else {
     v = builder_->GetWorkgroupID(ts.dim_index);
   }
-  return builder_->Cast(builder_->GetSType(iv->var.dtype()), v);
+  return builder_->Cast(builder_->GetSType(PrimType(iv->var.ty()->dtype)), v);
 }
 
 spirv::Value CodeGenSPIRV::CreateStorageSync(const CallNode* op) {
@@ -179,7 +179,7 @@ spirv::Value CodeGenSPIRV::CreateStorageSync(const CallNode* op) {
     TVM_FFI_THROW(InternalError) << "Do not support sync " << sync;
   }
 
-  auto type_int = builder_->GetSType(DataType::Int(32));
+  auto type_int = builder_->GetSType(PrimType::Int(32));
   builder_->MakeInst(spv::OpControlBarrier, builder_->IntImm(type_int, sync_scope),
                      builder_->IntImm(type_int, sync_scope),
                      builder_->IntImm(type_int, memory_semantics));
@@ -194,11 +194,11 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const VarNode* op) {
 }
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const IntImmNode* op) {
-  return builder_->IntImm(builder_->GetSType(op->dtype), op->value);
+  return builder_->IntImm(builder_->GetSType(PrimType(op->ty()->dtype)), op->value);
 }
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const FloatImmNode* op) {
-  return builder_->FloatImm(builder_->GetSType(op->dtype), op->value);
+  return builder_->FloatImm(builder_->GetSType(PrimType(op->ty()->dtype)), op->value);
 }
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const StringImmNode* op) {
@@ -206,7 +206,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const StringImmNode* op) {
 }
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const CastNode* op) {
-  return builder_->Cast(builder_->GetSType(op->dtype), MakeValue(op->value));
+  return builder_->Cast(builder_->GetSType(PrimType(op->ty()->dtype)), MakeValue(op->value));
 }
 
 spirv::Value CodeGenSPIRV::VisitExpr_(const AddNode* op) {
@@ -308,7 +308,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     for (size_t i = 1; i < op->args.size(); ++i) {
       values.push_back(MakeValue(op->args[i]));
     }
-    return builder_->CallGLSL450(builder_->GetSType(op->dtype), inst_id, values);
+    return builder_->CallGLSL450(builder_->GetSType(PrimType(op->ty()->dtype)), inst_id, values);
   } else if (op->op.same_as(builtin::bitwise_and())) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 2U);
     spirv::Value a = MakeValue(op->args[0]);
@@ -337,20 +337,20 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 2U);
     spirv::Value a = MakeValue(op->args[0]);
     spirv::Value b = MakeValue(op->args[1]);
-    if (op->args[0].dtype().is_int()) {
+    if (PrimType(op->args[0].ty()->dtype).MatchesCode(DLDataTypeCode::kDLInt)) {
       return builder_->MakeValue(spv::OpShiftRightArithmetic, a.stype, a, b);
     } else {
       return builder_->MakeValue(spv::OpShiftRightLogical, a.stype, a, b);
     }
   } else if (op->op.same_as(builtin::reinterpret())) {
-    return builder_->MakeValue(spv::OpBitcast, builder_->GetSType(op->dtype),
+    return builder_->MakeValue(spv::OpBitcast, builder_->GetSType(PrimType(op->ty()->dtype)),
                                MakeValue(op->args[0]));
   } else if (op->op.same_as(builtin::large_uint_imm())) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 2U);
     uint64_t low = static_cast<uint64_t>(op->args[0].as_or_throw<IntImm>()->value);
     uint64_t high = static_cast<uint64_t>(op->args[1].as_or_throw<IntImm>()->value);
     uint64_t val = (high << 32U) | low;
-    return builder_->UIntImm(builder_->GetSType(op->dtype), val);
+    return builder_->UIntImm(builder_->GetSType(PrimType(op->ty()->dtype)), val);
   } else if (op->op.same_as(builtin::tvm_storage_sync())) {
     return this->CreateStorageSync(op);
   } else if (op->op.same_as(builtin::if_then_else())) {
@@ -378,7 +378,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     phi.SetIncoming(1, else_value, else_value_label);
     return phi;
   } else if (op->op.same_as(builtin::popcount())) {
-    return builder_->MakeValue(spv::OpBitCount, builder_->GetSType(op->dtype),
+    return builder_->MakeValue(spv::OpBitCount, builder_->GetSType(PrimType(op->ty()->dtype)),
                                MakeValue(op->args[0]));
   } else if (op->op.same_as(builtin::call_pure_extern())) {
     TVM_FFI_ICHECK_GE(op->args.size(), 1U);
@@ -388,7 +388,8 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
       for (size_t i = 1; i < op->args.size(); ++i) {
         values.push_back(MakeValue(op->args[i]));
       }
-      return builder_->CallKHRIntegerDotProduct(builder_->GetSType(op->dtype), values, op->dtype);
+      PrimType op_dtype(op->ty()->dtype);
+      return builder_->CallKHRIntegerDotProduct(builder_->GetSType(op_dtype), values, op_dtype);
     } else {
       TVM_FFI_THROW(InternalError)
           << "SPIR-V shader cannot make extern calls.  Graph contains extern \""
@@ -412,8 +413,9 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 6U);
     const VarNode* buffer_node = op->args[0].as<VarNode>();
     TVM_FFI_ICHECK(buffer_node && fragment_info_.count(buffer_node));
-    DataType ele_dtype = GetElementDataType(buffer_node);
-    TVM_FFI_ICHECK(ele_dtype.is_float()) << "Only floating point fragment accumulator is supported";
+    PrimType ele_dtype = GetElementDataType(buffer_node);
+    TVM_FFI_ICHECK(ele_dtype.MatchesCode(DLDataTypeCode::kDLFloat))
+        << "Only floating point fragment accumulator is supported";
     spirv::SType ele_stype = builder_->GetSType(ele_dtype);
     spirv::SType& fragment_type = fragment_info_[buffer_node].stype;
     double init = static_cast<uint64_t>(op->args[5].as_or_throw<FloatImm>()->value);
@@ -435,7 +437,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     PrimExpr dst_index = op->args[4];
     PrimExpr src_ptr_expr = op->args[5];
     int stride = static_cast<int>(op->args[6].as_or_throw<IntImm>()->value);
-    auto type_int = builder_->GetSType(DataType::Int(32));
+    auto type_int = builder_->GetSType(PrimType::Int(32));
     spirv::Value stride_val = builder_->IntImm(type_int, stride);
     std::string layout = (op->args[7].as<StringImmNode>())->value;
     spirv::SType dst_ptr_type =
@@ -443,7 +445,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     spirv::Value dst_ptr =
         builder_->StructArrayAccess(dst_ptr_type, var_map_[buffer_node], MakeValue(dst_index));
     spirv::Value src_ptr = VisitExpr(op->args[5]);
-    spirv::SType type_bool = builder_->GetSType(DataType::Bool());
+    spirv::SType type_bool = builder_->GetSType(PrimType::Bool());
     spirv::Value t_val = builder_->UIntImm(type_bool, 1);
     spirv::Value f_val = builder_->UIntImm(type_bool, 0);
     spirv::Value loaded =
@@ -494,7 +496,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     PrimExpr index = op->args[4];
     PrimExpr buffer_ptr = op->args[5];
     int stride = static_cast<int>(op->args[6].as_or_throw<IntImm>()->value);
-    auto type_int = builder_->GetSType(DataType::Int(32));
+    auto type_int = builder_->GetSType(PrimType::Int(32));
     spirv::Value stride_val = builder_->IntImm(type_int, stride);
     std::string layout = (op->args[7].as<StringImmNode>())->value;
     spirv::Value dst_ptr = VisitExpr(op->args[5]);
@@ -505,7 +507,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
         builder_->StructArrayAccess(ptr_type, var_map_[buffer_node], MakeValue(index));
     uint32_t mask = spv::MemoryAccessMaskNone;
     spirv::Value loaded = builder_->MakeValue(spv::OpLoad, fragment_type, ptr, mask);
-    spirv::SType type_bool = builder_->GetSType(DataType::Bool());
+    spirv::SType type_bool = builder_->GetSType(PrimType::Bool());
     spirv::Value t_val = builder_->UIntImm(type_bool, 1);
     spirv::Value f_val = builder_->UIntImm(type_bool, 0);
     builder_->MakeInst(spv::OpCooperativeMatrixStoreNV, dst_ptr, loaded, stride_val,
@@ -516,7 +518,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
     Var buffer_var = load->buffer->data;
     const VarNode* buffer_node = buffer_var.get();
     PrimExpr index = load->indices[0];
-    DataType ele_dtype = GetElementDataType(buffer_node);
+    PrimType ele_dtype = GetElementDataType(buffer_node);
     spirv::SType ele_stype = builder_->GetSType(ele_dtype);
     spirv::Value buffer_val = MakeValue(buffer_var);
     spirv::SType ptr_type = builder_->GetPointerType(ele_stype, buffer_val.stype.storage_class);
@@ -532,11 +534,11 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const CallNode* op) {
 spirv::Value CodeGenSPIRV::VisitExpr_(const RampNode* op) {
   std::vector<spirv::Value> values;
   spirv::Value base = MakeValue(op->base);
-  int lanes = op->dtype.lanes();
+  int lanes = op->ty().lanes();
   for (int i = 0; i < lanes; ++i) {
     spirv::Value v = base;
     if (i != 0) {
-      spirv::Value offset = MakeValue(MakeConst(op->stride.dtype(), i) * op->stride);
+      spirv::Value offset = MakeValue(MakeConst(op->stride.ty(), i) * op->stride);
       v = builder_->Add(v, offset);
     }
     values.push_back(v);
@@ -547,7 +549,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const RampNode* op) {
 spirv::Value CodeGenSPIRV::VisitExpr_(const BroadcastNode* op) {
   std::vector<spirv::Value> values;
   spirv::Value v = MakeValue(op->value);
-  int lanes = op->dtype.lanes();
+  int lanes = op->ty().lanes();
   for (int i = 0; i < lanes; i++) {
     values.push_back(v);
   }
@@ -560,15 +562,15 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const BufferLoadNode* op) {
   Var buffer_var = op->buffer->data;
   PrimExpr prim_index = op->indices[0];
 
-  DataType desired_read_type = op->dtype;
-  if (desired_read_type == DataType::Bool()) {
-    desired_read_type = boolean_storage_type_.with_lanes(desired_read_type.lanes());
+  PrimType desired_read_type(op->ty()->dtype);
+  if (desired_read_type == PrimType::Bool()) {
+    desired_read_type = boolean_storage_type_.WithLanes(desired_read_type.lanes());
   }
 
   auto it = storage_info_.find(buffer_var.get());
   TVM_FFI_ICHECK(it != storage_info_.end());
   StorageInfo& info = it->second;
-  info.CheckContentType(desired_read_type, prim_index.dtype().lanes());
+  info.CheckContentType(desired_read_type, PrimType(prim_index.ty()->dtype).lanes());
 
   spirv::SType content_type = builder_->GetSType(info.element_type);
   spirv::Value buffer = MakeValue(buffer_var);
@@ -588,13 +590,13 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const BufferLoadNode* op) {
     spirv::Value loaded = builder_->MakeValue(spv::OpLoad, content_type, ptr, mask);
     // OpTypeBool have no physical address/storage.  Here, cast from
     // the storage type to an OpTypeBool.
-    if (op->dtype == DataType::Bool()) {
-      auto spirv_bool = builder_->GetSType(DataType::Bool());
+    if (PrimType(op->ty()->dtype) == PrimType::Bool()) {
+      auto spirv_bool = builder_->GetSType(PrimType::Bool());
       loaded = builder_->Cast(spirv_bool, loaded);
     }
     return loaded;
 
-  } else if (desired_read_type.element_of() == info.element_type) {
+  } else if (desired_read_type.WithLanes(1) == info.element_type) {
     // Requested several elements returned as an array.  Read out each
     // element and concatenate into the result.
     std::vector<spirv::Value> values;
@@ -609,21 +611,22 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const BufferLoadNode* op) {
     TVM_FFI_THROW(InternalError) << "Cannot perform buffer access of buffer variable '"
                                  << buffer_var->name_hint << "' with element type "
                                  << info.element_type << " using index of type "
-                                 << prim_index->dtype << " to produce output of type " << op->dtype;
+                                 << PrimType(prim_index.ty()->dtype)
+                                 << " to produce output of type " << PrimType(op->ty()->dtype);
     return spirv::Value();
   }
 }
 
 void CodeGenSPIRV::Scalarize(const PrimExpr& e, std::function<void(int i, spirv::Value v)> f) {
   if (const RampNode* ramp = e.as<RampNode>()) {
-    for (int i = 0; i < ramp->dtype.lanes(); ++i) {
+    for (int i = 0; i < ramp->ty().lanes(); ++i) {
       PrimExpr offset = ramp->base + ramp->stride * i;
       f(i, MakeValue(offset));
     }
   } else {
-    spirv::SType etype = builder_->GetSType(e.dtype().element_of());
+    spirv::SType etype = builder_->GetSType(PrimType(e.ty()->dtype).WithLanes(1));
     spirv::Value value = MakeValue(e);
-    for (int i = 0; i < e.dtype().lanes(); ++i) {
+    for (int i = 0; i < PrimType(e.ty()->dtype).lanes(); ++i) {
       f(i, builder_->MakeValue(spv::OpCompositeExtract, etype, value, i));
     }
   }
@@ -635,7 +638,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const ShuffleNode* op) {
       << "of one vector with one index";
   spirv::Value vector = MakeValue(op->vectors[0]);
   int index = op->indices[0].as_or_throw<IntImm>()->value;
-  spirv::SType etype = builder_->GetSType(op->dtype);
+  spirv::SType etype = builder_->GetSType(PrimType(op->ty()->dtype));
   spirv::Value element = builder_->MakeValue(spv::OpCompositeExtract, etype, vector, index);
   return element;
 }
@@ -649,7 +652,7 @@ void CodeGenSPIRV::VisitStmt_(const BufferStoreNode* op) {
   auto it = storage_info_.find(buffer_var.get());
   TVM_FFI_ICHECK(it != storage_info_.end());
   StorageInfo& info = it->second;
-  info.CheckContentType(op->value.dtype(), prim_index.dtype().lanes());
+  info.CheckContentType(PrimType(op->value.ty()->dtype), PrimType(prim_index.ty()->dtype).lanes());
 
   spirv::SType content_type = builder_->GetSType(info.element_type);
   spirv::Value buffer = MakeValue(buffer_var);
@@ -661,16 +664,16 @@ void CodeGenSPIRV::VisitStmt_(const BufferStoreNode* op) {
     mask |= spv::MemoryAccessVolatileMask;
   }
 
-  if (op->value.dtype() == info.element_type) {
+  if (PrimType(op->value.ty()->dtype) == info.element_type) {
     // Requested store of a single value.  This may be a scalar store
     // or a vectorized store, based on the array element type.
-    TVM_FFI_ICHECK_EQ(info.element_type, op->value.dtype())
+    TVM_FFI_ICHECK_EQ(info.element_type, PrimType(op->value.ty()->dtype))
         << "Vulkan only allow one type access to the same buffer";
     spirv::Value index = MakeValue(prim_index);
     spirv::Value ptr = builder_->StructArrayAccess(ptr_type, buffer, index);
     builder_->MakeInst(spv::OpStore, ptr, value, mask);
 
-  } else if (op->value.dtype().element_of() == info.element_type) {
+  } else if (PrimType(op->value.ty()->dtype).WithLanes(1) == info.element_type) {
     // Requested store of several arbitrarily located values.  Extract
     // each value from the composite, then assign to the buffer.
     auto f = [&](int i, spirv::Value index) {
@@ -681,10 +684,10 @@ void CodeGenSPIRV::VisitStmt_(const BufferStoreNode* op) {
     this->Scalarize(prim_index, f);
 
   } else {
-    TVM_FFI_THROW(InternalError) << "Cannot store value of type " << op->value.dtype()
+    TVM_FFI_THROW(InternalError) << "Cannot store value of type " << PrimType(op->value.ty()->dtype)
                                  << " into buffer variable '" << buffer_var->name_hint
                                  << "' with element type " << info.element_type
-                                 << " using index of type " << prim_index->dtype;
+                                 << " using index of type " << PrimType(prim_index.ty()->dtype);
   }
 }
 
@@ -697,10 +700,11 @@ void CodeGenSPIRV::VisitStmt_(const ForNode* op) {
   // loop step
   spirv::Value step;
   if (op->HasTrivialStep()) {
-    step = op->loop_var.dtype().is_int() ? builder_->IntImm(init_value.stype, 1)
-                                         : builder_->UIntImm(init_value.stype, 1);
+    step = PrimType(op->loop_var.ty()->dtype).MatchesCode(DLDataTypeCode::kDLInt)
+               ? builder_->IntImm(init_value.stype, 1)
+               : builder_->UIntImm(init_value.stype, 1);
   } else {
-    step = MakeValue(tvm::cast(end->dtype, *op->step));
+    step = MakeValue(tvm::cast(end.ty(), *op->step));
   }
 
   // Must get init label after making value(to make sure they are correct)
@@ -807,7 +811,7 @@ void CodeGenSPIRV::VisitStmt_(const IfThenElseNode* op) {
 }
 
 void CodeGenSPIRV::VisitStmt_(const AllocBufferNode* op) {
-  TVM_FFI_ICHECK(!op->buffer->dtype.is_handle());
+  TVM_FFI_ICHECK(!op->buffer->dtype.IsHandle());
   const IntImmNode* dim_imm = op->buffer->shape[0].as<IntImmNode>();
   TVM_FFI_ICHECK(dim_imm) << "Can only handle constant size stack allocation in GPU";
   size_t constant_size = static_cast<size_t>(dim_imm->value);
@@ -848,7 +852,7 @@ void CodeGenSPIRV::VisitStmt_(const AllocBufferNode* op) {
       int32_t aligned_constant_size = ((constant_size + 3) & ~0x3);
       buf = builder_->Allocate(etype, static_cast<uint32_t>(aligned_constant_size), storage_class);
 
-      size_t num_bytes = op->buffer->dtype.bytes() * op->buffer->dtype.lanes() *
+      size_t num_bytes = ((op->buffer->dtype.bits() + 7) / 8) * op->buffer->dtype.lanes() *
                          static_cast<uint32_t>(aligned_constant_size);
       shared_memory_bytes_used_ += num_bytes;
     } break;
@@ -897,7 +901,7 @@ void CodeGenSPIRV::VisitStmt_(const AssertStmtNode* op) {
 
 void CodeGenSPIRV::VisitStmt_(const BindNode* op) {
   TVM_FFI_ICHECK(!var_map_.count(op->var.get()));
-  TVM_FFI_ICHECK(!op->var.dtype().is_handle());
+  TVM_FFI_ICHECK(!PrimType(op->var.ty()->dtype).IsHandle());
   var_map_[op->var.get()] = MakeValue(op->value);
   analyzer_->Bind(op->var, op->value);
 }
@@ -910,18 +914,18 @@ void CodeGenSPIRV::VisitStmt_(const SeqStmtNode* op) {
 
 void CodeGenSPIRV::VisitStmt_(const EvaluateNode* op) { MakeValue(op->value); }
 
-spirv::SType CodeGenSPIRV::GetFragmentSType(const VarNode* buffer, const DataType& dtype) {
+spirv::SType CodeGenSPIRV::GetFragmentSType(const VarNode* buffer, const PrimType& dtype) {
   TVM_FFI_ICHECK(fragment_info_.count(buffer));
   const std::string& scope = fragment_info_[buffer].scope;
   const std::string& shape_str = fragment_info_.at(buffer).shape;
   std::pair<int32_t, int32_t> dim = GetWmmaFragmentDimSize(shape_str, scope);
   int64_t size = dim.first * dim.second;
-  spirv::SType stype = builder_->GetSType(dtype.with_lanes(size), dim.first, dim.second);
+  spirv::SType stype = builder_->GetSType(dtype.WithLanes(size), dim.first, dim.second);
   fragment_info_[buffer].stype = stype;
   return stype;
 }
 
-DataType CodeGenSPIRV::GetElementDataType(const VarNode* buffer) {
+PrimType CodeGenSPIRV::GetElementDataType(const VarNode* buffer) {
   auto it = storage_info_.find(buffer);
   TVM_FFI_ICHECK(it != storage_info_.end());
   return it->second.element_type;
diff --git a/src/backend/vulkan/codegen/codegen_spirv.h b/src/backend/vulkan/codegen/codegen_spirv.h
index 46fbcb696b6f..5ade6e383908 100644
--- a/src/backend/vulkan/codegen/codegen_spirv.h
+++ b/src/backend/vulkan/codegen/codegen_spirv.h
@@ -142,7 +142,7 @@ class CodeGenSPIRV : public ExprFunctor<spirv::Value(const PrimExpr&)>,
      * buffer variable (AllocBufferNode) or of the parameter (shader
      * arguments).
      */
-    DataType element_type{DataType()};
+    PrimType element_type{PrimType::Void()};
 
     /* \brief Check that the access type matches the known type
      *
@@ -156,10 +156,10 @@ class CodeGenSPIRV : public ExprFunctor<spirv::Value(const PrimExpr&)>,
      * product of the number of lanes of the buffer element type and
      * the number of lanes of the index.
      */
-    void CheckContentType(DataType type, int index_lanes = 1) const {
+    void CheckContentType(PrimType type, int index_lanes = 1) const {
       TVM_FFI_ICHECK(element_type_known) << "Cannot check element type of buffer " << name_hint
                                          << " no previous element type defined";
-      DataType expected_type = element_type.with_lanes(index_lanes * element_type.lanes());
+      PrimType expected_type = element_type.WithLanes(index_lanes * element_type.lanes());
       TVM_FFI_ICHECK_EQ(type, expected_type)
           << "Attempted to access buffer " << name_hint << " as element type " << type
           << " using an index of size " << index_lanes << " when the element type is "
@@ -167,7 +167,7 @@ class CodeGenSPIRV : public ExprFunctor<spirv::Value(const PrimExpr&)>,
     }
 
     // Update content type if it hasn't been updated.
-    void SetContentType(DataType type, std::string name_hint) {
+    void SetContentType(PrimType type, std::string name_hint) {
       TVM_FFI_ICHECK(!element_type_known)
           << "Cannot set element type of buffer " << name_hint << " a second time.";
       this->element_type = type;
@@ -191,8 +191,8 @@ class CodeGenSPIRV : public ExprFunctor<spirv::Value(const PrimExpr&)>,
   spirv::Value CreateStorageSync(const CallNode* op);
   void Scalarize(const PrimExpr& e, std::function<void(int i, spirv::Value v)> f);
 
-  spirv::SType GetFragmentSType(const VarNode* buffer, const DataType& dtype);
-  DataType GetElementDataType(const VarNode* buffer);
+  spirv::SType GetFragmentSType(const VarNode* buffer, const PrimType& dtype);
+  PrimType GetElementDataType(const VarNode* buffer);
 
   // SPIRV-related capabilities of the target
   SPIRVSupport spirv_support_;
@@ -213,7 +213,7 @@ class CodeGenSPIRV : public ExprFunctor<spirv::Value(const PrimExpr&)>,
    * integer type supported by the device, as not all Vulkan
    * implementations support int8.
    */
-  DataType boolean_storage_type_{DataType::Int(8)};
+  PrimType boolean_storage_type_{PrimType::Int(8)};
 
   // the storage scope of allocation
   std::unordered_map<const VarNode*, StorageInfo> storage_info_;
diff --git a/src/backend/vulkan/codegen/intrin_rule_spirv.cc b/src/backend/vulkan/codegen/intrin_rule_spirv.cc
index 14287562d9e4..6deb6e0a9b61 100644
--- a/src/backend/vulkan/codegen/intrin_rule_spirv.cc
+++ b/src/backend/vulkan/codegen/intrin_rule_spirv.cc
@@ -39,12 +39,12 @@ PrimExpr CallGLSLIntrin(PrimExpr e, const ffi::Array<PrimExpr>& args) {
   TVM_FFI_ICHECK(call != nullptr);
   ffi::Array<PrimExpr> cargs;
   // intrin id.
-  cargs.push_back(IntImm(DataType::UInt(32), id));
+  cargs.push_back(IntImm(PrimType::UInt(32), id));
 
   for (PrimExpr arg : args) {
     cargs.push_back(arg);
   }
-  return tirx::Call(call->dtype, tirx::builtin::call_spirv_pure_glsl450(), cargs);
+  return tirx::Call(call->ty(), tirx::builtin::call_spirv_pure_glsl450(), cargs);
 }
 
 template <unsigned id>
@@ -166,21 +166,22 @@ TVM_REGISTER_OP("tirx.clz")
       TVM_FFI_ICHECK(call != nullptr);
       TVM_FFI_ICHECK_EQ(call->args.size(), 1);
       PrimExpr arg = call->args[0];
+      PrimType arg_ty = arg.ty();
       PrimExpr msb;
-      if (arg.dtype().bits() == 64) {
+      if (arg_ty.bits() == 64) {
         // SPIR-V FindUMsb intrinsic only supports 32 bit input
-        auto int32 = DataType::Int(32);
+        auto int32 = PrimType::Int(32);
         PrimExpr arg_hi32 = tvm::tirx::Cast(int32, arg >> 32);
         PrimExpr arg_lo32 = tvm::tirx::Cast(int32, arg);
         PrimExpr msb_hi = CallGLSLIntrin<GLSLstd450FindUMsb>(e, {arg_hi32});
         PrimExpr msb_lo = CallGLSLIntrin<GLSLstd450FindUMsb>(e, {arg_lo32});
         msb = tvm::if_then_else(arg_hi32 == 0, msb_lo, msb_hi + 32);
-      } else if (arg.dtype().bits() == 32) {
+      } else if (arg_ty.bits() == 32) {
         msb = CallGLSLIntrin<GLSLstd450FindUMsb>(e);
       } else {
         TVM_FFI_THROW(InternalError) << "SPIR-V clz only supports a 32 bit or 64 bit integer.";
       }
-      return PrimExpr(arg.dtype().bits() - 1) - msb;
+      return PrimExpr(arg_ty.bits() - 1) - msb;
     });
   // clang-format on
 }
diff --git a/src/backend/vulkan/codegen/ir_builder.cc b/src/backend/vulkan/codegen/ir_builder.cc
index f912e482761c..ca82b06b0554 100644
--- a/src/backend/vulkan/codegen/ir_builder.cc
+++ b/src/backend/vulkan/codegen/ir_builder.cc
@@ -74,10 +74,10 @@ void IRBuilder::InitHeader() {
 
 void IRBuilder::InitPreDefs() {
   ext_glsl450_ = ExtInstImport("GLSL.std.450");
-  t_int32_ = DeclareType(DataType::Int(32));
-  t_uint32_ = DeclareType(DataType::UInt(32));
-  t_bool_ = DeclareType(DataType::Bool());
-  t_fp32_ = DeclareType(DataType::Float(32));
+  t_int32_ = DeclareType(PrimType::Int(32));
+  t_uint32_ = DeclareType(PrimType::UInt(32));
+  t_bool_ = DeclareType(PrimType::Bool());
+  t_fp32_ = DeclareType(PrimType::Float(32));
   const_i32_zero_ = IntImm(t_int32_, 0);
 
   // declare void, and void functions
@@ -112,14 +112,14 @@ std::vector<uint32_t> IRBuilder::Finalize() {
   return data;
 }
 
-SType IRBuilder::GetSType(const DataType& dtype, uint32_t row, uint32_t col) {
-  if (dtype == DataType::Int(32)) {
+SType IRBuilder::GetSType(const PrimType& dtype, uint32_t row, uint32_t col) {
+  if (dtype == PrimType::Int(32)) {
     return t_int32_;
-  } else if (dtype == DataType::Bool()) {
+  } else if (dtype == PrimType::Bool()) {
     return t_bool_;
-  } else if (dtype == DataType::Float(32)) {
+  } else if (dtype == PrimType::Float(32)) {
     return t_fp32_;
-  } else if (dtype == DataType::UInt(32)) {
+  } else if (dtype == PrimType::UInt(32)) {
     return t_uint32_;
   }
   uint64_t type_key;
@@ -151,7 +151,7 @@ SType IRBuilder::GetPointerType(const SType& value_type, spv::StorageClass stora
   }
   SType t;
   t.id = id_counter_++;
-  t.type = DataType::Handle();
+  t.type = PrimType::Handle();
   t.element_type_id = value_type.id;
   t.storage_class = storage_class;
   ib_.Begin(spv::OpTypePointer).AddSeq(t, storage_class, value_type).Commit(&global_);
@@ -169,11 +169,11 @@ SType IRBuilder::GetStructArrayType(const SType& value_type, uint32_t num_elems,
 
   SType arr_type;
   arr_type.id = id_counter_++;
-  arr_type.type = DataType::Handle();
+  arr_type.type = PrimType::Handle();
   arr_type.element_type_id = value_type.id;
 
   if (num_elems != 0) {
-    Value length = UIntImm(GetSType(DataType::UInt(32)), num_elems);
+    Value length = UIntImm(GetSType(PrimType::UInt(32)), num_elems);
     ib_.Begin(spv::OpTypeArray).AddSeq(arr_type, value_type, length).Commit(&global_);
   } else {
     ib_.Begin(spv::OpTypeRuntimeArray).AddSeq(arr_type, value_type).Commit(&global_);
@@ -188,7 +188,7 @@ SType IRBuilder::GetStructArrayType(const SType& value_type, uint32_t num_elems,
   // declare struct of array
   SType struct_type;
   struct_type.id = id_counter_++;
-  struct_type.type = DataType::Handle();
+  struct_type.type = PrimType::Handle();
   struct_type.element_type_id = value_type.id;
   ib_.Begin(spv::OpTypeStruct).AddSeq(struct_type, arr_type).Commit(&global_);
 
@@ -241,7 +241,7 @@ Value IRBuilder::FloatImm(const SType& dtype, double value) {
     if (data == 0)
       return GetConst_(dtype, &data);
     else
-      return Cast(dtype, FloatImm(GetSType(DataType::Float(32)), value));
+      return Cast(dtype, FloatImm(GetSType(PrimType::Float(32)), value));
   }
 }
 
@@ -270,7 +270,7 @@ Value IRBuilder::DeclareStorageVariable(const std::vector<SType>& value_types,
                                         spv::StorageClass storage_class, ValueKind kind) {
   SType struct_type;
   struct_type.id = id_counter_++;
-  struct_type.type = DataType::Handle();
+  struct_type.type = PrimType::Handle();
   ib_.Begin(spv::OpTypeStruct).Add(struct_type);
   for (const SType& vtype : value_types) {
     ib_.Add(vtype);
@@ -282,7 +282,7 @@ Value IRBuilder::DeclareStorageVariable(const std::vector<SType>& value_types,
     ib_.Begin(spv::OpMemberDecorate)
         .AddSeq(struct_type, i, spv::DecorationOffset, offset)
         .Commit(&decorate_);
-    DataType t = value_types[i].type;
+    PrimType t = value_types[i].type;
     uint32_t nbits = t.bits() * t.lanes();
     TVM_FFI_ICHECK_EQ(nbits % 8, 0);
     uint32_t bytes = (nbits / 8);
@@ -394,13 +394,13 @@ Value IRBuilder::GetBuiltInValue(spv::BuiltIn built_in, uint32_t index, const st
     }
   }
 
-  DataType data_type;
-  DataType global_arr_type;
+  PrimType data_type;
+  PrimType global_arr_type;
   switch (built_in) {
     case spv::BuiltInLocalInvocationId:
     case spv::BuiltInWorkgroupId:
-      data_type = DataType::Int(32);
-      global_arr_type = data_type.with_lanes(3);
+      data_type = PrimType::Int(32);
+      global_arr_type = data_type.WithLanes(3);
       break;
 
     default:
@@ -468,7 +468,7 @@ Value IRBuilder::GetConst_(const SType& dtype, const uint64_t* pvalue) {
   }
   TVM_FFI_ICHECK_LE(dtype.type.bits(), 64);
   Value ret = NewValue(dtype, kConstant);
-  if (dtype.type == DataType::Bool()) {
+  if (dtype.type == PrimType::Bool()) {
     // bool types.
     if (*pvalue) {
       ib_.Begin(spv::OpConstantTrue).AddSeq(dtype, ret);
@@ -481,7 +481,7 @@ Value IRBuilder::GetConst_(const SType& dtype, const uint64_t* pvalue) {
     uint64_t mask = 0xFFFFFFFFUL;
     ib_.Add(static_cast<uint32_t>(pvalue[0] & mask));
     if (dtype.type.bits() > 32) {
-      if (dtype.type.is_int()) {
+      if (dtype.type.MatchesCode(DLDataTypeCode::kDLInt)) {
         int64_t sign_mask = 0xFFFFFFFFL;
         const int64_t* sign_ptr = reinterpret_cast<const int64_t*>(pvalue);
         ib_.Add(static_cast<uint32_t>((sign_ptr[0] >> 32L) & sign_mask));
@@ -495,20 +495,20 @@ Value IRBuilder::GetConst_(const SType& dtype, const uint64_t* pvalue) {
   return ret;
 }
 
-SType IRBuilder::DeclareType(const DataType& dtype, uint32_t row, uint32_t col) {
+SType IRBuilder::DeclareType(const PrimType& dtype, uint32_t row, uint32_t col) {
   AddCapabilityFor(dtype);
 
   if (dtype.lanes() == 1) {
     SType t;
     t.id = id_counter_++;
     t.type = dtype;
-    if (dtype.is_bool()) {
+    if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
       ib_.Begin(spv::OpTypeBool).Add(t).Commit(&global_);
-    } else if (dtype.is_int()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
       ib_.Begin(spv::OpTypeInt).AddSeq(t, dtype.bits(), 1).Commit(&global_);
-    } else if (dtype.is_uint()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
       ib_.Begin(spv::OpTypeInt).AddSeq(t, dtype.bits(), 0).Commit(&global_);
-    } else if (dtype.is_float()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
       ib_.Begin(spv::OpTypeFloat).AddSeq(t, dtype.bits()).Commit(&global_);
     } else {
       TVM_FFI_THROW(InternalError) << "declare type do not support handle";
@@ -518,15 +518,15 @@ SType IRBuilder::DeclareType(const DataType& dtype, uint32_t row, uint32_t col)
     SType t;
     t.id = id_counter_++;
     t.type = dtype;
-    SType base_type = GetSType(dtype.element_of());
+    SType base_type = GetSType(dtype.WithLanes(1));
 
     if (row * col == 0) {
       TVM_FFI_ICHECK((row == 0) && (col == 0));
       ib_.Begin(spv::OpTypeVector).AddSeq(t, base_type, dtype.lanes()).Commit(&global_);
     } else {
-      Value v_row = GetSpecConst(GetSType(DataType::UInt(32)), row);
-      Value v_col = GetSpecConst(GetSType(DataType::UInt(32)), col);
-      Value scope = UIntImm(GetSType(DataType::UInt(32)), spv::ScopeSubgroup);
+      Value v_row = GetSpecConst(GetSType(PrimType::UInt(32)), row);
+      Value v_col = GetSpecConst(GetSType(PrimType::UInt(32)), col);
+      Value scope = UIntImm(GetSType(PrimType::UInt(32)), spv::ScopeSubgroup);
       ib_.Begin(spv::OpTypeCooperativeMatrixNV)
           .AddSeq(t, base_type, scope, v_row, v_col)
           .Commit(&global_);
@@ -535,9 +535,9 @@ SType IRBuilder::DeclareType(const DataType& dtype, uint32_t row, uint32_t col)
   }
 }
 
-void IRBuilder::AddCapabilityFor(const DataType& dtype) {
+void IRBuilder::AddCapabilityFor(const PrimType& dtype) {
   // Declare appropriate capabilities for int/float types
-  if (dtype.is_int() || dtype.is_uint()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     if (dtype.bits() == 8) {
       TVM_FFI_ICHECK(spirv_support_.supports_int8)
           << "Vulkan target does not support Int8 capability.  "
@@ -561,7 +561,7 @@ void IRBuilder::AddCapabilityFor(const DataType& dtype) {
       capabilities_used_.insert(spv::CapabilityInt64);
     }
 
-  } else if (dtype.is_float()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     if (dtype.bits() == 16) {
       TVM_FFI_ICHECK(spirv_support_.supports_float16)
           << "Vulkan target does not support Float16 capability.  "
@@ -584,7 +584,7 @@ void IRBuilder::AddCapabilityFor(const DataType& dtype) {
   // future.  Requiring StorageBuffer8BitAccess in order to declare an
   // Int8 prevents use of an 8-bit loop iterator on a device that
   // supports Int8 but doesn't support 8-bit buffer access.
-  if (dtype.bits() == 8 && !dtype.is_bool()) {
+  if (dtype.bits() == 8 && !dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
     TVM_FFI_ICHECK(spirv_support_.supports_storage_buffer_8bit_access)
         << "Vulkan target does not support StorageBuffer8BitAccess.  "
         << "If your device supports 8-bit buffer access, "
@@ -642,7 +642,7 @@ Value IRBuilder::CallGLSL450(const SType& ret_type, uint32_t inst_id,
 }
 
 Value IRBuilder::CallKHRIntegerDotProduct(const SType& ret_type, const std::vector<Value>& args,
-                                          const DataType& dtype) {
+                                          const PrimType& dtype) {
   if (args.size() != 3) {
     TVM_FFI_THROW(InternalError) << "Unresolved arguments in SPIRV_KHR_integer_dot_product";
   }
@@ -653,9 +653,9 @@ Value IRBuilder::CallKHRIntegerDotProduct(const SType& ret_type, const std::vect
       << "If your device supports integer dot product operations, "
       << "please either add -mattr=+dotprod to the target, "
       << "or query all device parameters by adding -from_device=0.";
-  if (dtype.is_int()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     ib_.Begin(spv::OpSDotAccSatKHR).AddSeq(ret_type, val);
-  } else if (dtype.is_uint()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
     ib_.Begin(spv::OpUDotAccSatKHR).AddSeq(ret_type, val);
   } else {
     TVM_FFI_THROW(InternalError) << "Unsupported type";
@@ -674,15 +674,15 @@ Value IRBuilder::CallKHRIntegerDotProduct(const SType& ret_type, const std::vect
 
 Value IRBuilder::Concat(const std::vector<Value>& vec) {
   bool is_const = vec[0].flag == kConstant;
-  DataType etype = vec[0].stype.type;
+  PrimType etype = vec[0].stype.type;
   int lanes = etype.lanes();
   for (size_t i = 1; i < vec.size(); ++i) {
-    TVM_FFI_ICHECK_EQ(etype, vec[i].stype.type.element_of())
+    TVM_FFI_ICHECK_EQ(etype, vec[i].stype.type.WithLanes(1))
         << "Cannot concat vector of different element type";
     lanes += vec[i].stype.type.lanes();
     is_const = is_const && (vec[i].flag == kConstant);
   }
-  Value ret = NewValue(GetSType(etype.with_lanes(lanes)), kNormal);
+  Value ret = NewValue(GetSType(etype.WithLanes(lanes)), kNormal);
   if (is_const && vec.size() == static_cast<size_t>(lanes)) {
     ib_.Begin(spv::OpConstantComposite);
     ib_.AddSeq(ret.stype, ret);
@@ -704,53 +704,56 @@ Value IRBuilder::Concat(const std::vector<Value>& vec) {
 Value IRBuilder::Cast(const SType& dst_type, spirv::Value value) {
   TVM_FFI_ICHECK_NE(value.stype.id, 0U);
   if (value.stype.id == dst_type.id) return value;
-  const tvm::DataType& from = value.stype.type;
-  const tvm::DataType& to = dst_type.type;
+  const tvm::PrimType& from = value.stype.type;
+  const tvm::PrimType& to = dst_type.type;
   TVM_FFI_ICHECK_EQ(from.lanes(), to.lanes());
-  if (from == DataType::Bool()) {
-    if (to.is_int()) {
+  if (from == PrimType::Bool()) {
+    if (to.MatchesCode(DLDataTypeCode::kDLInt)) {
       return Select(value, IntImm(dst_type, 1), IntImm(dst_type, 0));
-    } else if (to.is_uint()) {
+    } else if (to.MatchesCode(DLDataTypeCode::kDLUInt)) {
       return Select(value, UIntImm(dst_type, 1), UIntImm(dst_type, 0));
-    } else if (to.is_float()) {
+    } else if (to.MatchesCode(DLDataTypeCode::kDLFloat)) {
       return MakeValue(spv::OpConvertUToF, dst_type,
                        Select(value, UIntImm(t_uint32_, 1), UIntImm(t_uint32_, 0)));
     } else {
       TVM_FFI_THROW(InternalError) << "cannot cast from " << from << " to " << to;
       return Value();
     }
-  } else if (to == DataType::Bool()) {
-    if (from.is_int()) {
+  } else if (to == PrimType::Bool()) {
+    if (from.MatchesCode(DLDataTypeCode::kDLInt)) {
       return NE(value, IntImm(value.stype, 0));
-    } else if (to.is_uint()) {
+    } else if (from.MatchesCode(DLDataTypeCode::kDLUInt)) {
       return NE(value, UIntImm(value.stype, 0));
     } else {
       TVM_FFI_THROW(InternalError) << "cannot cast from " << from << " to " << to;
       return Value();
     }
-  } else if (from.is_int() && to.is_int()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLInt) && to.MatchesCode(DLDataTypeCode::kDLInt)) {
     return MakeValue(spv::OpSConvert, dst_type, value);
-  } else if (from.is_uint() && to.is_uint()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLUInt) && to.MatchesCode(DLDataTypeCode::kDLUInt)) {
     return MakeValue(spv::OpUConvert, dst_type, value);
-  } else if (from.is_uint() && to.is_int()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLUInt) && to.MatchesCode(DLDataTypeCode::kDLInt)) {
     if (from.bits() != to.bits()) {
-      value = MakeValue(spv::OpUConvert, GetSType(from.with_bits(to.bits())), value);
+      value = MakeValue(spv::OpUConvert, GetSType(from.WithBits(to.bits())), value);
     }
     return MakeValue(spv::OpBitcast, dst_type, value);
-  } else if (from.is_int() && to.is_uint()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLInt) && to.MatchesCode(DLDataTypeCode::kDLUInt)) {
     if (from.bits() != to.bits()) {
-      value = MakeValue(spv::OpSConvert, GetSType(from.with_bits(to.bits())), value);
+      value = MakeValue(spv::OpSConvert, GetSType(from.WithBits(to.bits())), value);
     }
     return MakeValue(spv::OpBitcast, dst_type, value);
-  } else if (from.is_float() && to.is_int()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLFloat) && to.MatchesCode(DLDataTypeCode::kDLInt)) {
     return MakeValue(spv::OpConvertFToS, dst_type, value);
-  } else if (from.is_float() && to.is_uint()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLFloat) &&
+             to.MatchesCode(DLDataTypeCode::kDLUInt)) {
     return MakeValue(spv::OpConvertFToU, dst_type, value);
-  } else if (from.is_int() && to.is_float()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLInt) && to.MatchesCode(DLDataTypeCode::kDLFloat)) {
     return MakeValue(spv::OpConvertSToF, dst_type, value);
-  } else if (from.is_uint() && to.is_float()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLUInt) &&
+             to.MatchesCode(DLDataTypeCode::kDLFloat)) {
     return MakeValue(spv::OpConvertUToF, dst_type, value);
-  } else if (from.is_float() && to.is_float()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLFloat) &&
+             to.MatchesCode(DLDataTypeCode::kDLFloat)) {
     return MakeValue(spv::OpFConvert, dst_type, value);
   } else {
     TVM_FFI_THROW(InternalError) << "do not support type cast from " << from << " to " << to;
@@ -782,28 +785,28 @@ Value IRBuilder::GetSpecConst(const SType& dtype, uint64_t value) {
   return ret;
 }
 
-#define DEFINE_BUILDER_BINARY_USIGN_OP(_OpName, _Op)       \
-  Value IRBuilder::_OpName(Value a, Value b) {             \
-    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);             \
-    if (a.stype.type.is_int() || a.stype.type.is_uint()) { \
-      return MakeValue(spv::OpI##_Op, a.stype, a, b);      \
-    } else {                                               \
-      TVM_FFI_ICHECK(a.stype.type.is_float());             \
-      return MakeValue(spv::OpF##_Op, a.stype, a, b);      \
-    }                                                      \
+#define DEFINE_BUILDER_BINARY_USIGN_OP(_OpName, _Op)                                 \
+  Value IRBuilder::_OpName(Value a, Value b) {                                       \
+    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                                       \
+    if (a.stype.type.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) { \
+      return MakeValue(spv::OpI##_Op, a.stype, a, b);                                \
+    } else {                                                                         \
+      TVM_FFI_ICHECK(a.stype.type.MatchesCode(DLDataTypeCode::kDLFloat));            \
+      return MakeValue(spv::OpF##_Op, a.stype, a, b);                                \
+    }                                                                                \
   }
 
-#define DEFINE_BUILDER_BINARY_SIGN_OP(_OpName, _Op)   \
-  Value IRBuilder::_OpName(Value a, Value b) {        \
-    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);        \
-    if (a.stype.type.is_int()) {                      \
-      return MakeValue(spv::OpS##_Op, a.stype, a, b); \
-    } else if (a.stype.type.is_uint()) {              \
-      return MakeValue(spv::OpU##_Op, a.stype, a, b); \
-    } else {                                          \
-      TVM_FFI_ICHECK(a.stype.type.is_float());        \
-      return MakeValue(spv::OpF##_Op, a.stype, a, b); \
-    }                                                 \
+#define DEFINE_BUILDER_BINARY_SIGN_OP(_OpName, _Op)                       \
+  Value IRBuilder::_OpName(Value a, Value b) {                            \
+    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                            \
+    if (a.stype.type.MatchesCode(DLDataTypeCode::kDLInt)) {               \
+      return MakeValue(spv::OpS##_Op, a.stype, a, b);                     \
+    } else if (a.stype.type.MatchesCode(DLDataTypeCode::kDLUInt)) {       \
+      return MakeValue(spv::OpU##_Op, a.stype, a, b);                     \
+    } else {                                                              \
+      TVM_FFI_ICHECK(a.stype.type.MatchesCode(DLDataTypeCode::kDLFloat)); \
+      return MakeValue(spv::OpF##_Op, a.stype, a, b);                     \
+    }                                                                     \
   }
 
 DEFINE_BUILDER_BINARY_USIGN_OP(Add, Add);
@@ -813,29 +816,29 @@ DEFINE_BUILDER_BINARY_SIGN_OP(Div, Div);
 
 Value IRBuilder::Mod(Value a, Value b) {
   TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);
-  if (a.stype.type.is_int()) {
+  if (a.stype.type.MatchesCode(DLDataTypeCode::kDLInt)) {
     return MakeValue(spv::OpSRem, a.stype, a, b);
-  } else if (a.stype.type.is_uint()) {
+  } else if (a.stype.type.MatchesCode(DLDataTypeCode::kDLUInt)) {
     return MakeValue(spv::OpUMod, a.stype, a, b);
   } else {
-    TVM_FFI_ICHECK(a.stype.type.is_float());
+    TVM_FFI_ICHECK(a.stype.type.MatchesCode(DLDataTypeCode::kDLFloat));
     return MakeValue(spv::OpFRem, a.stype, a, b);
   }
 }
 
-#define DEFINE_BUILDER_CMP_OP(_OpName, _Op)                                                    \
-  Value IRBuilder::_OpName(Value a, Value b) {                                                 \
-    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                                                 \
-    TVM_FFI_ICHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes());                             \
-    const auto& bool_type = this->GetSType(DataType::Bool().with_lanes(a.stype.type.lanes())); \
-    if (a.stype.type.is_int()) {                                                               \
-      return MakeValue(spv::OpS##_Op, bool_type, a, b);                                        \
-    } else if (a.stype.type.is_uint()) {                                                       \
-      return MakeValue(spv::OpU##_Op, bool_type, a, b);                                        \
-    } else {                                                                                   \
-      TVM_FFI_ICHECK(a.stype.type.is_float());                                                 \
-      return MakeValue(spv::OpFOrd##_Op, bool_type, a, b);                                     \
-    }                                                                                          \
+#define DEFINE_BUILDER_CMP_OP(_OpName, _Op)                                                   \
+  Value IRBuilder::_OpName(Value a, Value b) {                                                \
+    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                                                \
+    TVM_FFI_ICHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes());                            \
+    const auto& bool_type = this->GetSType(PrimType::Bool().WithLanes(a.stype.type.lanes())); \
+    if (a.stype.type.MatchesCode(DLDataTypeCode::kDLInt)) {                                   \
+      return MakeValue(spv::OpS##_Op, bool_type, a, b);                                       \
+    } else if (a.stype.type.MatchesCode(DLDataTypeCode::kDLUInt)) {                           \
+      return MakeValue(spv::OpU##_Op, bool_type, a, b);                                       \
+    } else {                                                                                  \
+      TVM_FFI_ICHECK(a.stype.type.MatchesCode(DLDataTypeCode::kDLFloat));                     \
+      return MakeValue(spv::OpFOrd##_Op, bool_type, a, b);                                    \
+    }                                                                                         \
   }
 
 DEFINE_BUILDER_CMP_OP(LT, LessThan);
@@ -843,17 +846,17 @@ DEFINE_BUILDER_CMP_OP(LE, LessThanEqual);
 DEFINE_BUILDER_CMP_OP(GT, GreaterThan);
 DEFINE_BUILDER_CMP_OP(GE, GreaterThanEqual);
 
-#define DEFINE_BUILDER_CMP_UOP(_OpName, _Op)                                                   \
-  Value IRBuilder::_OpName(Value a, Value b) {                                                 \
-    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                                                 \
-    TVM_FFI_ICHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes());                             \
-    const auto& bool_type = this->GetSType(DataType::Bool().with_lanes(a.stype.type.lanes())); \
-    if (a.stype.type.is_int() || a.stype.type.is_uint()) {                                     \
-      return MakeValue(spv::OpI##_Op, bool_type, a, b);                                        \
-    } else {                                                                                   \
-      TVM_FFI_ICHECK(a.stype.type.is_float());                                                 \
-      return MakeValue(spv::OpFOrd##_Op, bool_type, a, b);                                     \
-    }                                                                                          \
+#define DEFINE_BUILDER_CMP_UOP(_OpName, _Op)                                                  \
+  Value IRBuilder::_OpName(Value a, Value b) {                                                \
+    TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);                                                \
+    TVM_FFI_ICHECK_EQ(a.stype.type.lanes(), b.stype.type.lanes());                            \
+    const auto& bool_type = this->GetSType(PrimType::Bool().WithLanes(a.stype.type.lanes())); \
+    if (a.stype.type.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {          \
+      return MakeValue(spv::OpI##_Op, bool_type, a, b);                                       \
+    } else {                                                                                  \
+      TVM_FFI_ICHECK(a.stype.type.MatchesCode(DLDataTypeCode::kDLFloat));                     \
+      return MakeValue(spv::OpFOrd##_Op, bool_type, a, b);                                    \
+    }                                                                                         \
   }
 
 DEFINE_BUILDER_CMP_UOP(EQ, Equal);
@@ -861,7 +864,7 @@ DEFINE_BUILDER_CMP_UOP(NE, NotEqual);
 
 Value IRBuilder::Select(Value cond, Value a, Value b) {
   TVM_FFI_ICHECK_EQ(a.stype.id, b.stype.id);
-  TVM_FFI_ICHECK_EQ(cond.stype.type.element_of(), DataType::Bool());
+  TVM_FFI_ICHECK_EQ(cond.stype.type.WithLanes(1), PrimType::Bool());
   return MakeValue(spv::OpSelect, a.stype, cond, a, b);
 }
 
diff --git a/src/backend/vulkan/codegen/ir_builder.h b/src/backend/vulkan/codegen/ir_builder.h
index 3cca1b4cfe33..85dbdc00cff4 100644
--- a/src/backend/vulkan/codegen/ir_builder.h
+++ b/src/backend/vulkan/codegen/ir_builder.h
@@ -50,7 +50,7 @@ struct SType {
   /*! \brief The Id to represent type */
   uint32_t id{0};
   /*! \brief corresponding TVM type */
-  tvm::DataType type;
+  tvm::PrimType type;
   /*! \brief content type id if it is a pointer/struct-array class */
   uint32_t element_type_id{0};
   /*! \brief The storage class, if it is a pointer */
@@ -430,7 +430,7 @@ class IRBuilder {
    * \return The result value.
    */
   Value CallKHRIntegerDotProduct(const SType& ret_type, const std::vector<Value>& args,
-                                 const DataType& dtype);
+                                 const PrimType& dtype);
 
   /*!
    * \brief Build vector by concatenating components
@@ -444,7 +444,7 @@ class IRBuilder {
    * \param dtype The data type.
    * \return The corresponding spirv type.
    */
-  SType GetSType(const tvm::DataType& dtype, uint32_t row = 0, uint32_t col = 0);
+  SType GetSType(const tvm::PrimType& dtype, uint32_t row = 0, uint32_t col = 0);
   /*!
    * \brief Get the pointer type that points to value_type
    * \param value_type.
@@ -656,11 +656,11 @@ class IRBuilder {
   Value GetConst_(const SType& dtype, const uint64_t* pvalue);
 
   // declare type
-  SType DeclareType(const DataType& dtype, uint32_t row = 0, uint32_t col = 0);
+  SType DeclareType(const PrimType& dtype, uint32_t row = 0, uint32_t col = 0);
 
   // Declare the appropriate SPIR-V capabilities and extensions to use
   // this data type.
-  void AddCapabilityFor(const DataType& dtype);
+  void AddCapabilityFor(const PrimType& dtype);
 
   /*! \brief SPIRV-related capabilities of the target
    *
diff --git a/src/backend/webgpu/codegen/codegen_webgpu.cc b/src/backend/webgpu/codegen/codegen_webgpu.cc
index 440f1f04b95e..7129aa23d2ee 100644
--- a/src/backend/webgpu/codegen/codegen_webgpu.cc
+++ b/src/backend/webgpu/codegen/codegen_webgpu.cc
@@ -68,7 +68,7 @@ class WebGPUWorkgroupInfoCollector : public StmtExprVisitor {
   void VisitExpr_(const VarNode* op) final {
     StmtExprVisitor::VisitExpr_(op);
     Var buffer_var = ffi::GetRef<Var>(op);
-    if (buffer_var.dtype().is_handle()) {
+    if (buffer_var.ty().IsHandle()) {
       info_.write_access_set.insert(buffer_var);
     }
   }
@@ -119,7 +119,7 @@ void CodeGenWebGPU::InitFuncState(const PrimFunc& f) {
   CodeGenC::InitFuncState(f);
   // analyze the data;
   for (Var arg : f->params) {
-    if (arg.dtype().is_handle()) {
+    if (arg.ty().IsHandle()) {
       alloc_storage_scope_[arg.get()] = "global";
     }
   }
@@ -174,10 +174,10 @@ runtime::FunctionInfo CodeGenWebGPU::AddFunction(const PrimFunc& f, bool skip_re
   os_param_access << "paramWriteAccess:[";
   // setup buffer argumemts
   for (Var arg : f->params) {
-    DataType t = arg.dtype();
-    func_arg_types.push_back(t);
+    PrimType t = arg.ty();
+    func_arg_types.push_back(t->dtype);
 
-    if (t.is_handle()) {
+    if (t.IsHandle()) {
       auto* ptr = arg->type_annotation.as<PointerTypeNode>();
       TVM_FFI_ICHECK(ptr)
           << "All handles passed to the CodeGenWebGPU must have a type_annotation as a "
@@ -188,11 +188,11 @@ runtime::FunctionInfo CodeGenWebGPU::AddFunction(const PrimFunc& f, bool skip_re
           << "All handles passed to the CodeGenWebGPU must have a type_annotation as a "
              "PointerType, "
           << "and must point to a PrimType";
-      DataType value_storage_type = prim->dtype;
-      if (value_storage_type == DataType::Bool()) {
+      PrimType value_storage_type(prim->dtype);
+      if (value_storage_type.MatchesCode(DLDataTypeCode::kDLBool)) {
         // We need a physically addressable buffer type to support boolean tensors.
         // The loaded byte is cast to bool inside the LoadNode visitor below.
-        value_storage_type = boolean_storage_type_.with_lanes(value_storage_type.lanes());
+        value_storage_type = boolean_storage_type_.WithLanes(value_storage_type.lanes());
       }
       std::string vid = AllocVarID(arg.get());
       std::string access_mode;
@@ -209,7 +209,7 @@ runtime::FunctionInfo CodeGenWebGPU::AddFunction(const PrimFunc& f, bool skip_re
       // add extra access mode info to launch params
       this->decl_stream << "@group(0) @binding(" << num_buffer++ << ") "
                         << "var<storage, " << access_mode << "> " << vid << " : array<";
-      this->PrintType(value_storage_type, this->decl_stream);
+      this->PrintType(value_storage_type->dtype, this->decl_stream);
       this->decl_stream << ">;\n";
     } else {
       pod_args.push_back(arg);
@@ -228,17 +228,17 @@ runtime::FunctionInfo CodeGenWebGPU::AddFunction(const PrimFunc& f, bool skip_re
 
   for (size_t i = 0; i < pod_args.size(); ++i) {
     Var v = pod_args[i];
-    TVM_FFI_ICHECK(!v.dtype().is_handle());
+    TVM_FFI_ICHECK(!v.ty().IsHandle());
     std::string vid = AllocVarID(v.get());
 
-    if (v.dtype() == DataType::Int(32)) {
+    if (v.ty() == PrimType::Int(32)) {
       this->decl_stream << "  " << vid << ": i32";
-    } else if (v.dtype() == DataType::UInt(32)) {
+    } else if (v.ty() == PrimType::UInt(32)) {
       this->decl_stream << "  " << vid << ": u32";
-    } else if (v.dtype() == DataType::Float(32)) {
+    } else if (v.ty() == PrimType::Float(32)) {
       this->decl_stream << "  " << vid << ": f32";
     } else {
-      TVM_FFI_THROW(InternalError) << "Do not support pod argument type " << v.dtype();
+      TVM_FFI_THROW(InternalError) << "Do not support pod argument type " << v.ty()->dtype;
     }
     this->decl_stream << ",\n";
     // value ref
@@ -289,13 +289,13 @@ runtime::FunctionInfo CodeGenWebGPU::AddFunction(const PrimFunc& f, bool skip_re
 void CodeGenWebGPU::BindThreadIndex(const IterVar& iv) {
   TVM_FFI_ICHECK(!var_idmap_.count(iv->var.get()));
   std::ostringstream os;
-  PrintType(iv->var.dtype(), os);
+  PrintType(iv->var.ty()->dtype, os);
   if (iv->thread_tag == "blockIdx.x") {
     // WebGPU have restriction to limit the maximum size of blockId.x to be 65535
     // We allow runtime to spread the load out to blockIdx.z so it can be a large number.
     os << "(blockIdx.z * gridDim.x + blockIdx.x)";
     std::string tidx = os.str();
-    std::string aggregated_bidx = SSAGetID(os.str(), iv->var.dtype());
+    std::string aggregated_bidx = SSAGetID(os.str(), iv->var.ty()->dtype);
     var_idmap_[iv->var.get()] = aggregated_bidx;
   } else {
     os << "(" << iv->thread_tag << ")";
@@ -305,16 +305,17 @@ void CodeGenWebGPU::BindThreadIndex(const IterVar& iv) {
   }
 }
 
-void CodeGenWebGPU::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenWebGPU::PrintType(DLDataType raw_t, std::ostream& os) {  // NOLINT(*)
+  PrimType t(raw_t);
   int lanes = t.lanes();
-  if (t.is_handle()) {
+  if (t.IsHandle()) {
     TVM_FFI_THROW(InternalError) << "Cannot print handle type in WebGPU";
   }
-  if (t.is_void()) {
+  if (t.IsVoid()) {
     os << "void";
     return;
   }
-  if (t == DataType::Bool()) {
+  if (raw_t == DLDataType{kDLBool, 8, 1}) {
     os << "bool";
     return;
   }
@@ -323,28 +324,29 @@ void CodeGenWebGPU::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
     TVM_FFI_ICHECK(lanes >= 2 && lanes <= 4)
         << "CodeGenWebGPU: only allows vector with lanes in {2, 3, 4}";
     // Currently WebGPU doesn't support `i8` and an `int8x4` is represented as a `u32`.
-    if (t.is_int() && t.bits() == 8 && lanes == 4) {
+    if (t.MatchesCode(DLDataTypeCode::kDLInt) && t.bits() == 8 && lanes == 4) {
       os << "u32";
       return;
     }
     os << "vec" << lanes << "<";
   }
 
-  if (t.is_float()) {
+  if (t.code() == DLDataTypeCode::kDLFloat) {
     TVM_FFI_ICHECK(t.bits() == 16 || t.bits() == 32) << "CodeGenWebGPU: only support f16 or f32";
     if (t.bits() == 16) {
       // Using f16 requires enable directive
       enable_fp16_ = true;
     }
     os << "f" << t.bits();
-  } else if (t.is_uint()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {
     TVM_FFI_ICHECK(t.bits() != 64) << "CodeGenWebGPU: do not support u64";
     os << "u" << t.bits();
-  } else if (t.is_int()) {
+  } else if (t.MatchesCode(DLDataTypeCode::kDLInt)) {
     TVM_FFI_ICHECK(t.bits() != 64) << "CodeGenWebGPU: do not support i64";
     os << "i" << t.bits();
   } else {
-    TVM_FFI_THROW(InternalError) << "CodeGenWebGPU: Cannot convert type " << t << " to WebGPU type";
+    TVM_FFI_THROW(InternalError) << "CodeGenWebGPU: Cannot convert type "
+                                 << ffi::DLDataTypeToString(raw_t) << " to WebGPU type";
   }
   if (lanes != 1) {
     os << ">";
@@ -365,18 +367,18 @@ void CodeGenWebGPU::PrintStorageSync(const CallNode* op) {
 }
 
 void CodeGenWebGPU::PrintSSAAssign(const std::string& target, const std::string& src,
-                                   DataType type) {
+                                   PrimType type) {
   stream << "let " << target << " : ";
-  PrintType(type, stream);
+  PrintType(type->dtype, stream);
   stream << " = " << src << ";\n";
 }
 
-void CodeGenWebGPU::PrintVecElemLoad(const std::string& vec, DataType t, int i,
+void CodeGenWebGPU::PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                                      std::ostream& os) {  // NOLINT(*)
   os << vec << "[" << i << "]";
 }
 
-void CodeGenWebGPU::PrintVecElemStore(const std::string& vec, DataType t, int i,
+void CodeGenWebGPU::PrintVecElemStore(const std::string& vec, DLDataType t, int i,
                                       const std::string& value) {
   this->PrintIndent();
   stream << vec << "[" << i << "] = " << value << ";\n";
@@ -384,8 +386,8 @@ void CodeGenWebGPU::PrintVecElemStore(const std::string& vec, DataType t, int i,
 
 void CodeGenWebGPU::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NOLINT(*)
   std::string v = PrintExpr(op->value);
-  int lanes = op->dtype.lanes();
-  PrintType(op->dtype, os);
+  int lanes = op->ty().lanes();
+  PrintType(op->ty()->dtype, os);
   os << "(";
   for (int i = 0; i < lanes; ++i) {
     if (i != 0) os << ", ";
@@ -395,14 +397,14 @@ void CodeGenWebGPU::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  //
 }
 
 PrimExpr CodeGenWebGPU::EnforceU32(PrimExpr value) {
-  return cast(DataType::UInt(32, value.dtype().lanes()), value);
+  return cast(PrimType::UInt(32, value.ty().lanes()), value);
 }
 
 void CodeGenWebGPU::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
   if (op->op.same_as(builtin::reinterpret())) {
     // generate bitcast<TYPE>(ARG)
     os << "bitcast<";
-    this->PrintType(op->dtype, os);
+    this->PrintType(op->ty()->dtype, os);
     os << ">(";
     this->PrintExpr(op->args[0], os);
     os << ")";
@@ -426,7 +428,7 @@ void CodeGenWebGPU::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLIN
     std::string cond = PrintExpr(op->args[0]);
     this->PrintIndent();
     this->stream << "var " << result << " : ";
-    PrintType(op->dtype, this->stream);
+    PrintType(op->ty()->dtype, this->stream);
     this->stream << ";\n";
     this->PrintIndent();
     this->stream << "if (" << cond << ") {\n";
@@ -459,7 +461,7 @@ void CodeGenWebGPU::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLIN
 }
 
 void CodeGenWebGPU::VisitExpr_(const CastNode* op, std::ostream& os) {  // NOLINT(*)
-  PrintType(op->dtype, os);
+  PrintType(op->ty()->dtype, os);
   os << "(" << PrintExpr(op->value) << ")";
 }
 
@@ -478,7 +480,7 @@ void CodeGenWebGPU::VisitExpr_(const LetNode* op, std::ostream& os) {  // NOLINT
     PrintIndent();
     std::string value = PrintExpr(op->value);
     this->stream << "let " << AllocVarID(op->var.get()) << " : ";
-    PrintType(op->var.dtype(), this->stream);
+    PrintType(op->var.ty()->dtype, this->stream);
     this->stream << " = " << value << ";\n";
   }
   os << PrintExpr(op->body);
@@ -490,18 +492,18 @@ void CodeGenWebGPU::VisitExpr_(const LetNode* op, std::ostream& os) {  // NOLINT
 }
 
 void CodeGenWebGPU::VisitExpr_(const IntImmNode* op, std::ostream& os) {  // NOLINT(*)
-  if (op->dtype.bits() == 32) {
+  if (op->ty().bits() == 32) {
     std::ostringstream temp;
-    if (op->dtype.is_int()) {
+    if (op->ty().MatchesCode(DLDataTypeCode::kDLInt)) {
       temp << op->value << "i";
     } else {
-      TVM_FFI_ICHECK(op->dtype.is_uint());
+      TVM_FFI_ICHECK(op->ty().MatchesCode(DLDataTypeCode::kDLUInt));
       temp << op->value << "u";
     }
     this->MarkConst(temp.str());
     os << temp.str();
   } else {
-    this->PrintType(op->dtype, os);
+    this->PrintType(op->ty()->dtype, os);
     os << "(" << op->value << ")";
   }
 }
@@ -509,14 +511,14 @@ void CodeGenWebGPU::VisitExpr_(const IntImmNode* op, std::ostream& os) {  // NOL
 void CodeGenWebGPU::VisitExpr_(const FloatImmNode* op, std::ostream& os) {  // NOLINT(*)
   std::ostringstream temp;
   temp << std::scientific << op->value;
-  if (op->dtype.bits() == 32) {
+  if (op->ty().bits() == 32) {
     temp << 'f';
-  } else if (op->dtype.bits() == 16) {
+  } else if (op->ty().bits() == 16) {
     // Using f16 requires enable directive
     enable_fp16_ = true;
     temp << 'h';
   } else {
-    TVM_FFI_THROW(InternalError) << "Unsupported floating point bits " << op->dtype.bits();
+    TVM_FFI_THROW(InternalError) << "Unsupported floating point bits " << op->ty().bits();
   }
   MarkConst(temp.str());
   os << temp.str();
@@ -530,39 +532,42 @@ void CodeGenWebGPU::VisitExpr_(const BufferLoadNode* op, std::ostream& os) {  //
   TVM_FFI_ICHECK_EQ(op->indices.size(), 1) << "Load from non-flat memory not supported.";
   TVM_FFI_ICHECK(!op->predicate.defined()) << "Predicated buffer load is not supported.";
 
-  DataType value_dtype = op->dtype;
+  DLDataType value_dtype = op->ty()->dtype;
+  PrimType value_ty(value_dtype);
   PrimExpr index = op->indices[0];
   Var buffer_var = op->buffer->data;
-  DataType element_dtype = op->buffer->dtype;
+  DLDataType element_dtype = op->buffer->dtype->dtype;
+  PrimType element_ty(element_dtype);
 
-  int lanes = op->dtype.lanes();
+  int lanes = value_ty.lanes();
   std::string buffer_vid = GetVarID(buffer_var.get());
 
-  if (value_dtype.lanes() == element_dtype.lanes()) {
+  if (value_ty.lanes() == element_ty.lanes()) {
     // Direct buffer loading
     // Special handle bool loading
-    if (value_dtype == DataType::Bool()) {
+    if (value_dtype == DLDataType{kDLBool, 8, 1}) {
       this->PrintType(value_dtype, os);
       os << "(";
     } else {
       TVM_FFI_ICHECK(value_dtype == element_dtype);
     }
-    TVM_FFI_ICHECK_EQ(index.dtype().lanes(), 1);
+    TVM_FFI_ICHECK_EQ(index.ty().lanes(), 1);
     os << buffer_vid << "[" << this->PrintExpr(index) << "]";
     // Special handle bool loading
-    if (value_dtype == DataType::Bool()) {
+    if (value_dtype == DLDataType{kDLBool, 8, 1}) {
       os << ")";
     }
   } else {
     // Vector load from scalar buffer
-    TVM_FFI_ICHECK_EQ(element_dtype.lanes(), 1) << "Can only vector load scalar array";
-    TVM_FFI_ICHECK(value_dtype.element_of() == element_dtype)
+    TVM_FFI_ICHECK_EQ(element_ty.lanes(), 1) << "Can only vector load scalar array";
+    DLDataType value_element_dtype{value_dtype.code, value_dtype.bits, 1};
+    TVM_FFI_ICHECK(value_element_dtype == element_dtype)
         << "WebGPU vector loading requires base type to match";
     arith::PVar<PrimExpr> base;
-    if (arith::ramp(base, 1, op->dtype.lanes()).Match(index)) {
+    if (arith::ramp(base, 1, value_ty.lanes()).Match(index)) {
       // vec3<f32>(buf[base + 0], buf[base + 1], buf[base + 2]);
-      std::string base_vid = SSAGetID(PrintExpr(base.Eval()), base.Eval().dtype());
-      PrintType(element_dtype.with_lanes(value_dtype.lanes()), os);
+      std::string base_vid = SSAGetID(PrintExpr(base.Eval()), base.Eval().ty()->dtype);
+      PrintType(element_ty.WithLanes(value_ty.lanes())->dtype, os);
       os << "(";
       for (int i = 0; i < lanes; ++i) {
         if (i != 0) os << ", ";
@@ -571,8 +576,8 @@ void CodeGenWebGPU::VisitExpr_(const BufferLoadNode* op, std::ostream& os) {  //
       os << ")";
     } else {
       // vec3<f32>(buf[index[0]], buf[index[1]], buf[index[2]]);
-      std::string index_vid = SSAGetID(PrintExpr(index), index.dtype());
-      PrintType(element_dtype.with_lanes(value_dtype.lanes()), os);
+      std::string index_vid = SSAGetID(PrintExpr(index), index.ty()->dtype);
+      PrintType(element_ty.WithLanes(value_ty.lanes())->dtype, os);
       os << "(";
       for (int i = 0; i < lanes; ++i) {
         if (i != 0) os << ", ";
@@ -593,7 +598,7 @@ void CodeGenWebGPU::VisitStmt_(const BindNode* op) {
     PrintIndent();
     std::string value = PrintExpr(op->value);
     this->stream << "let " << AllocVarID(op->var.get()) << " : ";
-    PrintType(op->var.dtype(), this->stream);
+    PrintType(op->var.ty()->dtype, this->stream);
     this->stream << " = " << value << ";\n";
   }
 }
@@ -602,14 +607,16 @@ void CodeGenWebGPU::VisitStmt_(const BufferStoreNode* op) {
   TVM_FFI_ICHECK_EQ(op->indices.size(), 1) << "Store to non-flat memory not supported.";
   TVM_FFI_ICHECK(!op->predicate.defined()) << "Predicated buffer store is not supported.";
 
-  DataType value_dtype = op->value.dtype();
-  DataType element_dtype = op->buffer->dtype;
+  DLDataType value_dtype = op->value.ty()->dtype;
+  PrimType value_ty(value_dtype);
+  DLDataType element_dtype = op->buffer->dtype->dtype;
+  PrimType element_ty(element_dtype);
   PrimExpr index = op->indices[0];
   Var buffer_var = op->buffer->data;
 
   std::string buffer_vid = GetVarID(buffer_var.get());
 
-  if (value_dtype.lanes() == element_dtype.lanes()) {
+  if (value_ty.lanes() == element_ty.lanes()) {
     // must execute print expr first
     // so we won't have recursive append to stream
     std::string index_vid = PrintExpr(index);
@@ -618,7 +625,7 @@ void CodeGenWebGPU::VisitStmt_(const BufferStoreNode* op) {
     this->PrintIndent();
     stream << buffer_vid << "[" << index_vid << "] = ";
     // special explicit conversion of bool
-    if (value_dtype == DataType::Bool()) {
+    if (value_dtype == DLDataType{kDLBool, 8, 1}) {
       PrintType(element_dtype, stream);
       stream << "(";
     } else {
@@ -626,22 +633,23 @@ void CodeGenWebGPU::VisitStmt_(const BufferStoreNode* op) {
     }
     stream << value_vid;
     // Special handle bool store
-    if (value_dtype == DataType::Bool()) {
+    if (value_dtype == DLDataType{kDLBool, 8, 1}) {
       stream << ")";
     }
     stream << ";\n";
   } else {
     // Vector store into scalar buffer
-    TVM_FFI_ICHECK_EQ(element_dtype.lanes(), 1) << "Can only vector load scalar array";
-    TVM_FFI_ICHECK(value_dtype.element_of() == element_dtype)
+    TVM_FFI_ICHECK_EQ(element_ty.lanes(), 1) << "Can only vector load scalar array";
+    DLDataType value_element_dtype{value_dtype.code, value_dtype.bits, 1};
+    TVM_FFI_ICHECK(value_element_dtype == element_dtype)
         << "WebGPU vector stire requires base type to match";
     std::string value_vid = PrintExpr(op->value);
     arith::PVar<PrimExpr> base;
-    if (arith::ramp(base, 1, value_dtype.lanes()).Match(index)) {
+    if (arith::ramp(base, 1, value_ty.lanes()).Match(index)) {
       // buf[base + 0] = value[0]
       // buf[base + 1] = value[1]
-      std::string base_vid = SSAGetID(PrintExpr(base.Eval()), base.Eval().dtype());
-      for (int i = 0; i < value_dtype.lanes(); ++i) {
+      std::string base_vid = SSAGetID(PrintExpr(base.Eval()), base.Eval().ty()->dtype);
+      for (int i = 0; i < value_ty.lanes(); ++i) {
         this->PrintIndent();
         stream << buffer_vid << "[" << base_vid << " + " << i << "] = " << value_vid << "[" << i
                << "];\n";
@@ -649,8 +657,8 @@ void CodeGenWebGPU::VisitStmt_(const BufferStoreNode* op) {
     } else {
       // buf[index[0]] = value[0]
       // buf[index[1]] = value[1]
-      std::string index_vid = SSAGetID(PrintExpr(index), index.dtype());
-      for (int i = 0; i < value_dtype.lanes(); ++i) {
+      std::string index_vid = SSAGetID(PrintExpr(index), index.ty()->dtype);
+      for (int i = 0; i < value_ty.lanes(); ++i) {
         this->PrintIndent();
         stream << buffer_vid << "[" << index_vid << "[" << i << "]] = " << value_vid << "[" << i
                << "];\n";
@@ -673,12 +681,12 @@ void CodeGenWebGPU::VisitStmt_(const AllocBufferNode* op) {
 
   if (storage_scope.rank == runtime::StorageRank::kShared) {
     this->decl_stream << "var<workgroup> " << vid << " : array<";
-    PrintType(op->buffer->dtype, this->decl_stream);
+    PrintType(op->buffer->dtype->dtype, this->decl_stream);
     this->decl_stream << ", " << constant_size << ">;\n";
   } else if (storage_scope.rank == runtime::StorageRank::kLocal) {
     this->PrintIndent();
     this->stream << "var " << vid << " : array<";
-    PrintType(op->buffer->dtype, this->stream);
+    PrintType(op->buffer->dtype->dtype, this->stream);
     this->stream << ", " << constant_size << ">;\n";
   } else {
     TVM_FFI_THROW(InternalError) << "WebGPU: Do not support storage scope: "
@@ -694,7 +702,7 @@ void CodeGenWebGPU::VisitStmt_(const ForNode* op) {
   std::string vid = AllocVarID(op->loop_var.get());
   PrintIndent();
   stream << "for (var " << vid << " : ";
-  PrintType(op->loop_var.dtype(), stream);
+  PrintType(op->loop_var.ty()->dtype, stream);
   stream << " = " << begin_str << "; " << vid << " < " << end_str << "; " << vid;
   if (step_str.empty()) {
     stream << "++";
diff --git a/src/backend/webgpu/codegen/codegen_webgpu.h b/src/backend/webgpu/codegen/codegen_webgpu.h
index 4c873ac3db18..c2179c5c48aa 100644
--- a/src/backend/webgpu/codegen/codegen_webgpu.h
+++ b/src/backend/webgpu/codegen/codegen_webgpu.h
@@ -51,16 +51,17 @@ class CodeGenWebGPU final : public CodeGenC {
   using CodeGenC::AddFunction;
   runtime::FunctionInfo AddFunction(const PrimFunc& f, bool skip_readonly_decl);  // NOLINT(*)
   void InitFuncState(const PrimFunc& f) final;
-  void PrintStorageSync(const CallNode* op) final;     // NOLINT(*)
-  void PrintType(DataType t, std::ostream& os) final;  // NOLINT(*)
-  void BindThreadIndex(const IterVar& iv) final;       // NOLINT(*)
+  void PrintStorageSync(const CallNode* op) final;       // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;  // NOLINT(*)
+  void BindThreadIndex(const IterVar& iv) final;         // NOLINT(*)
 
   // assignment printing
-  void PrintSSAAssign(const std::string& target, const std::string& src, DataType type) final;
+  void PrintSSAAssign(const std::string& target, const std::string& src, PrimType type) final;
 
   // overload printing vector element load/store
-  void PrintVecElemLoad(const std::string& vec, DataType t, int i, std::ostream& os) final;
-  void PrintVecElemStore(const std::string& vec, DataType t, int i, const std::string& value) final;
+  void PrintVecElemLoad(const std::string& vec, DLDataType t, int i, std::ostream& os) final;
+  void PrintVecElemStore(const std::string& vec, DLDataType t, int i,
+                         const std::string& value) final;
 
   // overload visitor
   void VisitExpr_(const BroadcastNode* op, std::ostream& os) final;   // NOLINT(*)
@@ -90,7 +91,7 @@ class CodeGenWebGPU final : public CodeGenC {
   /*!
    * \brief Storage type of bool values.
    */
-  DataType boolean_storage_type_{DataType::Int(8)};
+  PrimType boolean_storage_type_{PrimType::Int(8)};
 
   // whether enable fp16
   bool enable_fp16_{false};
diff --git a/src/backend/webgpu/codegen/intrin_rule_webgpu.cc b/src/backend/webgpu/codegen/intrin_rule_webgpu.cc
index 1c172fcd141b..7992fa9915c0 100644
--- a/src/backend/webgpu/codegen/intrin_rule_webgpu.cc
+++ b/src/backend/webgpu/codegen/intrin_rule_webgpu.cc
@@ -34,7 +34,7 @@ using tirx::FLowerIntrinsic;
 
 // warp-level primitives. Follows implementation in intrin_rule_metal.cc
 struct WebGPUWarpIntrinsic {
-  const Op operator()(DataType t, const Op& orig_op) const {
+  const Op operator()(PrimType t, const Op& orig_op) const {
     if (orig_op.same_as(builtin::tvm_warp_shuffle())) {
       static const Op& webgpu_subgroup_shuffle_op = Op::Get("tirx.webgpu.subgroup_shuffle");
       return webgpu_subgroup_shuffle_op;
@@ -55,9 +55,9 @@ static PrimExpr DispatchWebGPUShuffle(const PrimExpr& e) {
   const CallNode* call = e.as<CallNode>();
   TVM_FFI_ICHECK(call != nullptr);
   TVM_FFI_ICHECK_EQ(call->args.size(), 5);  // mask, value, warp_id, width, warp_size
-  PrimExpr lane_or_delta = Cast(DataType::UInt(32, call->args[2].dtype().lanes()), call->args[2]);
+  PrimExpr lane_or_delta = Cast(PrimType::UInt(32, call->args[2].ty().lanes()), call->args[2]);
   ffi::Array<PrimExpr> webgpu_args{{call->args[1], lane_or_delta}};
-  return Call(call->dtype, T()(call->dtype, call->op.as_or_throw<Op>()), webgpu_args);
+  return Call(e.ty(), T()(e.ty(), call->op.as_or_throw<Op>()), webgpu_args);
 }
 
 void RegisterWebGPUIntrinRules() {
@@ -69,7 +69,7 @@ void RegisterWebGPUIntrinRules() {
 // See full list of builtin: https://www.w3.org/TR/WGSL/#builtin-functions
 
 struct ReturnAbs {
-  std::string operator()(DataType t, std::string name) const { return "abs"; }
+  std::string operator()(PrimType t, std::string name) const { return "abs"; }
 };
 
 TVM_REGISTER_OP("tirx.fabs")
@@ -124,7 +124,7 @@ TVM_REGISTER_OP("tirx.pow")
     .set_attr<FLowerIntrinsic>("webgpu.FLowerIntrinsic", DispatchPureExtern<Direct>);
 
 struct ReturnRound {
-  std::string operator()(DataType t, std::string name) const { return "round"; }
+  std::string operator()(PrimType t, std::string name) const { return "round"; }
 };
 
 // WGSL round() uses ties-to-even (banker's rounding), matching IEEE 754 and ONNX Round spec.
diff --git a/src/ir/expr.cc b/src/ir/expr.cc
index ef6ea0ed6dca..f73cd6ae3913 100644
--- a/src/ir/expr.cc
+++ b/src/ir/expr.cc
@@ -26,6 +26,7 @@
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/expr.h>
 #include <tvm/ir/function.h>
+#include <tvm/ir/type.h>
 #include <tvm/te/tensor.h>
 #include <tvm/tirx/expr.h>
 
@@ -48,33 +49,39 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 PrimExpr::PrimExpr(int32_t value) : PrimExpr(IntImm::Int32(value)) {}
 
-PrimExpr::PrimExpr(float value) : PrimExpr(FloatImm(DataType::Float(32), value)) {}
+PrimExpr::PrimExpr(float value) : PrimExpr(FloatImm(PrimType::Float(32), value)) {}
 
 PrimExpr PrimExpr::ConvertFallbackValue(ffi::String value) { return tirx::StringImm(value); }
 
-IntImm::IntImm(DataType dtype, int64_t value, Span span) {
-  TVM_FFI_CHECK(dtype.is_scalar(), ValueError)
-      << "IntImm can only take scalar, but " << dtype << " was supplied.";
-  TVM_FFI_CHECK(dtype.is_int() || dtype.is_uint() || dtype.is_bool(), ValueError)
-      << "IntImm supports only int or uint or bool type, but " << dtype << " was supplied.";
-  if (dtype.is_uint()) {
+IntImm::IntImm(PrimType value_ty, int64_t value, Span span) {
+  DLDataType runtime_dtype = value_ty->dtype;
+  DLDataTypeCode code = value_ty.code();
+  int32_t bits = value_ty.bits();
+  TVM_FFI_CHECK(!value_ty.IsScalableVector() && !value_ty.IsFixedLengthVector(), ValueError)
+      << "IntImm can only take scalar, but " << runtime_dtype << " was supplied.";
+  TVM_FFI_CHECK(value_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                                     DLDataTypeCode::kDLBool),
+                ValueError)
+      << "IntImm supports only int or uint or bool type, but " << runtime_dtype << " was supplied.";
+  if (code == DLDataTypeCode::kDLUInt) {
     TVM_FFI_CHECK_GE(value, 0U, ValueError)
-        << "Literal value " << value << " is negative for unsigned integer type " << dtype;
-    if (dtype.bits() < 64) {
-      TVM_FFI_CHECK_LT(value, 1LL << dtype.bits(), ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
+        << "Literal value " << value << " is negative for unsigned integer type " << runtime_dtype;
+    if (bits < 64) {
+      TVM_FFI_CHECK_LT(value, 1LL << bits, ValueError)
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
     }
-  } else if (dtype.bits() == 1 || dtype.is_bool()) {
+  } else if (bits == 1 || code == DLDataTypeCode::kDLBool) {
     // int(1)
-    TVM_FFI_CHECK(value == 0 || value == 1, ValueError) << value << " exceeds range of " << dtype;
-  } else if (dtype.bits() < 64) {
-    TVM_FFI_CHECK_GE(value, -(1LL << (dtype.bits() - 1)), ValueError)
-        << "Literal value " << value << " exceeds minimum of " << dtype;
-    TVM_FFI_CHECK_LT(value, 1LL << (dtype.bits() - 1), ValueError)
-        << "Literal value " << value << " exceeds maximum of " << dtype;
+    TVM_FFI_CHECK(value == 0 || value == 1, ValueError)
+        << value << " exceeds range of " << runtime_dtype;
+  } else if (bits < 64) {
+    TVM_FFI_CHECK_GE(value, -(1LL << (bits - 1)), ValueError)
+        << "Literal value " << value << " exceeds minimum of " << runtime_dtype;
+    TVM_FFI_CHECK_LT(value, 1LL << (bits - 1), ValueError)
+        << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
   }
   ffi::ObjectPtr<IntImmNode> node = ffi::make_object<IntImmNode>();
-  node->dtype = dtype;
+  node->BaseExprNode::ty = std::move(value_ty);
   node->value = value;
   node->span = span;
   data_ = std::move(node);
@@ -82,103 +89,118 @@ IntImm::IntImm(DataType dtype, int64_t value, Span span) {
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("ir.IntImm", [](DataType dtype, int64_t value, Span span) {
-    return IntImm(dtype, value, span);
+  refl::GlobalDef().def("ir.IntImm", [](DLDataType dtype, int64_t value, Span span) {
+    return IntImm(PrimType(dtype), value, span);
   });
 }
 
-FloatImm::FloatImm(DataType dtype, double value, Span span) {
-  TVM_FFI_CHECK_EQ(dtype.lanes(), 1, ValueError) << "FloatImm can only take scalar.";
+FloatImm::FloatImm(PrimType value_ty, double value, Span span) {
+  DLDataType runtime_dtype = value_ty->dtype;
+  DLDataTypeCode code = value_ty.code();
+  int32_t bits = value_ty.bits();
+  TVM_FFI_CHECK(!value_ty.IsScalableVector() && !value_ty.IsFixedLengthVector(), ValueError)
+      << "FloatImm can only take scalar.";
 
-  TVM_FFI_CHECK(dtype.is_float() || dtype.is_bfloat16() || dtype.is_float8() || dtype.is_float6() ||
-                    dtype.is_float4() || dtype.code() >= DataType::kCustomBegin,
-                ValueError)
-      << "FloatImm supports only float, but " << dtype << " was supplied.";
+  TVM_FFI_CHECK(
+      value_ty.MatchesCode(DLDataTypeCode::kDLFloat, DLDataTypeCode::kDLFloat8_e3m4,
+                           DLDataTypeCode::kDLFloat8_e4m3, DLDataTypeCode::kDLFloat8_e4m3b11fnuz,
+                           DLDataTypeCode::kDLFloat8_e4m3fn, DLDataTypeCode::kDLFloat8_e4m3fnuz,
+                           DLDataTypeCode::kDLFloat8_e5m2, DLDataTypeCode::kDLFloat8_e5m2fnuz,
+                           DLDataTypeCode::kDLFloat8_e8m0fnu, DLDataTypeCode::kDLFloat6_e2m3fn,
+                           DLDataTypeCode::kDLFloat6_e3m2fn) ||
+          value_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16) ||
+          value_ty.MatchesElementType(DLDataTypeCode::kDLFloat4_e2m1fn, 4) ||
+          static_cast<int>(code) >= static_cast<int>(ffi::DLExtDataTypeCode::kDLExtCustomBegin),
+      ValueError)
+      << "FloatImm supports only float, but " << runtime_dtype << " was supplied.";
 
   // check range for float32 and float16 since they have specified range.
   if (!std::isinf(value) && !std::isnan(value)) {
-    if (dtype.bits() == 32) {
+    if (bits == 32) {
       TVM_FFI_CHECK_GE(value, std::numeric_limits<float>::lowest(), ValueError)
-          << "Literal value " << value << " exceeds minimum of " << dtype;
+          << "Literal value " << value << " exceeds minimum of " << runtime_dtype;
       TVM_FFI_CHECK_LE(value, std::numeric_limits<float>::max(), ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
-    } else if (dtype.is_float16()) {
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
+    } else if (value_ty.MatchesElementType(DLDataTypeCode::kDLFloat, 16)) {
       TVM_FFI_CHECK_GE(value, -support::kMaxFloat16, ValueError)
-          << "Literal value " << value << " exceeds minimum of " << dtype;
+          << "Literal value " << value << " exceeds minimum of " << runtime_dtype;
       TVM_FFI_CHECK_LE(value, support::kMaxFloat16, ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
-    } else if (dtype.is_bfloat16()) {
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
+    } else if (value_ty.MatchesElementType(DLDataTypeCode::kDLBfloat, 16)) {
       TVM_FFI_CHECK_GE(value, -support::kMaxBFloat16, ValueError)
-          << "Literal value " << value << " exceeds minimum of " << dtype;
+          << "Literal value " << value << " exceeds minimum of " << runtime_dtype;
       TVM_FFI_CHECK_LE(value, support::kMaxBFloat16, ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
-    } else if (dtype.is_float8_e3m4() || dtype.is_float8_e4m3() || dtype.is_float8_e4m3b11fnuz() ||
-               dtype.is_float8_e4m3fn() || dtype.is_float8_e4m3fnuz() || dtype.is_float8_e5m2() ||
-               dtype.is_float8_e5m2fnuz() || dtype.is_float8_e8m0fnu()) {
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
+    } else if (value_ty.MatchesCode(
+                   DLDataTypeCode::kDLFloat8_e3m4, DLDataTypeCode::kDLFloat8_e4m3,
+                   DLDataTypeCode::kDLFloat8_e4m3b11fnuz, DLDataTypeCode::kDLFloat8_e4m3fn,
+                   DLDataTypeCode::kDLFloat8_e4m3fnuz, DLDataTypeCode::kDLFloat8_e5m2,
+                   DLDataTypeCode::kDLFloat8_e5m2fnuz, DLDataTypeCode::kDLFloat8_e8m0fnu)) {
       double bound = 0.0;
       bool nonneg = false;
 
-      switch (dtype.code()) {
-        case DataType::TypeCode::kFloat8_e3m4:
+      switch (code) {
+        case DLDataTypeCode::kDLFloat8_e3m4:
           bound = support::kMaxE3M4;
           break;
-        case DataType::TypeCode::kFloat8_e4m3:
+        case DLDataTypeCode::kDLFloat8_e4m3:
           bound = support::kMaxE4M3;
           break;
-        case DataType::TypeCode::kFloat8_e4m3b11fnuz:
+        case DLDataTypeCode::kDLFloat8_e4m3b11fnuz:
           bound = support::kMaxE4M3B11FNUZ;
           nonneg = true;
           break;
-        case DataType::TypeCode::kFloat8_e4m3fn:
+        case DLDataTypeCode::kDLFloat8_e4m3fn:
           bound = support::kMaxE4M3FN;
           break;
-        case DataType::TypeCode::kFloat8_e4m3fnuz:
+        case DLDataTypeCode::kDLFloat8_e4m3fnuz:
           bound = support::kMaxE4M3FNUZ;
           nonneg = true;
           break;
-        case DataType::TypeCode::kFloat8_e5m2:
+        case DLDataTypeCode::kDLFloat8_e5m2:
           bound = support::kMaxE5M2;
           break;
-        case DataType::TypeCode::kFloat8_e5m2fnuz:
+        case DLDataTypeCode::kDLFloat8_e5m2fnuz:
           bound = support::kMaxE5M2FNUZ;
           nonneg = true;
           break;
-        case DataType::TypeCode::kFloat8_e8m0fnu:
+        case DLDataTypeCode::kDLFloat8_e8m0fnu:
           bound = support::kMaxE8M0FNU;
           nonneg = true;
           break;
         default:
-          TVM_FFI_THROW(InternalError) << "Unhandled float8 type: " << dtype;
+          TVM_FFI_THROW(InternalError) << "Unhandled float8 type: " << runtime_dtype;
       }
 
       if (nonneg) {
         TVM_FFI_CHECK_GE(value, 0, ValueError)
-            << "Literal value " << value << " below zero for unsigned " << dtype;
+            << "Literal value " << value << " below zero for unsigned " << runtime_dtype;
       } else {
         TVM_FFI_CHECK_GE(value, -bound, ValueError)
-            << "Literal value " << value << " below minimum of " << dtype;
+            << "Literal value " << value << " below minimum of " << runtime_dtype;
       }
       TVM_FFI_CHECK_LE(value, bound, ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
 
-    } else if (dtype.is_float6_e2m3fn() || dtype.is_float6_e3m2fn()) {
-      double bound = (dtype.code() == DataType::TypeCode::kFloat6_e2m3fn) ? support::kMaxE2M3FN
-                                                                          : support::kMaxE3M2FN;
+    } else if (value_ty.MatchesCode(DLDataTypeCode::kDLFloat6_e2m3fn,
+                                    DLDataTypeCode::kDLFloat6_e3m2fn)) {
+      double bound =
+          (code == DLDataTypeCode::kDLFloat6_e2m3fn) ? support::kMaxE2M3FN : support::kMaxE3M2FN;
       TVM_FFI_CHECK_GE(value, -bound, ValueError)
-          << "Literal value " << value << " below minimum of " << dtype;
+          << "Literal value " << value << " below minimum of " << runtime_dtype;
       TVM_FFI_CHECK_LE(value, bound, ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
 
-    } else if (dtype.is_float4_e2m1fn()) {
+    } else if (code == DLDataTypeCode::kDLFloat4_e2m1fn) {
       double bound = support::kMaxE2M1FN;
       TVM_FFI_CHECK_GE(value, -bound, ValueError)
-          << "Literal value " << value << " below minimum of " << dtype;
+          << "Literal value " << value << " below minimum of " << runtime_dtype;
       TVM_FFI_CHECK_LE(value, bound, ValueError)
-          << "Literal value " << value << " exceeds maximum of " << dtype;
+          << "Literal value " << value << " exceeds maximum of " << runtime_dtype;
     }
   }
   ffi::ObjectPtr<FloatImmNode> node = ffi::make_object<FloatImmNode>();
-  node->dtype = dtype;
+  node->BaseExprNode::ty = std::move(value_ty);
   node->value = value;
   node->span = span;
   data_ = std::move(node);
@@ -186,8 +208,8 @@ FloatImm::FloatImm(DataType dtype, double value, Span span) {
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("ir.FloatImm", [](DataType dtype, double value, Span span) {
-    return FloatImm(dtype, value, span);
+  refl::GlobalDef().def("ir.FloatImm", [](DLDataType dtype, double value, Span span) {
+    return FloatImm(PrimType(dtype), value, span);
   });
 }
 
@@ -206,7 +228,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         if (end.defined()) {
           return Range(begin, end.value(), span);
         } else {
-          return Range(IntImm(begin->dtype, 0), begin, span);
+          return Range(IntImm(begin.ty(), 0), begin, span);
         }
       });
 }
diff --git a/src/ir/type.cc b/src/ir/type.cc
index d6d059dba079..2464f6faa659 100644
--- a/src/ir/type.cc
+++ b/src/ir/type.cc
@@ -24,27 +24,120 @@
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/type.h>
+
+#include <cstdint>
+#include <unordered_map>
+
 namespace tvm {
 
+namespace {
+// Scalable-vector dtype; the vscale factor is stored negated in the uint16 lanes field.
+DLDataType ScalableVectorDType(DLDataTypeCode code, int bits, int lanes) {
+  TVM_FFI_ICHECK_GT(lanes, 1) << "Invalid value for vscale factor " << lanes;
+  TVM_FFI_ICHECK_LT(lanes, 32768);  // presumably keeps -lanes within int16 range
+  return DLDataType{static_cast<uint8_t>(code), static_cast<uint8_t>(bits),
+                    static_cast<uint16_t>(-lanes)};
+}
+// 32-bit key layout: code in bits 31..24, bit width in 23..16, lanes in 15..0.
+uint32_t PackDataTypeKey(DLDataType dtype) {
+  return (static_cast<uint32_t>(dtype.code) << 24) | (static_cast<uint32_t>(dtype.bits) << 16) |
+         static_cast<uint32_t>(dtype.lanes);
+}
+// kAnyHash callback for PrimTypeNode: returns the packed dtype key as the hash.
+int64_t PrimTypeAnyHash(const ffi::Any& src) {
+  return static_cast<int64_t>(PackDataTypeKey(src.cast<PrimType>()->dtype));
+}
+// kAnyEqual callback for PrimTypeNode: compares the two dtypes with operator==.
+bool PrimTypeAnyEqual(const ffi::Any& lhs, const ffi::Any& rhs) {
+  return lhs.cast<PrimType>()->dtype == rhs.cast<PrimType>()->dtype;
+}
+// Returns this thread's cached node for `dtype`, creating and inserting it on first use.
+ffi::ObjectPtr<PrimTypeNode> GetCachedPrimTypeNode(DLDataType dtype) {
+  thread_local std::unordered_map<uint32_t, ffi::ObjectPtr<PrimTypeNode>> cache;
+  uint32_t key = PackDataTypeKey(dtype);
+  auto it = cache.find(key);
+  if (it != cache.end()) {
+    return it->second;
+  }
+
+  ffi::ObjectPtr<PrimTypeNode> node = ffi::make_object<PrimTypeNode>();
+  node->dtype = dtype;
+  return cache.emplace(key, std::move(node)).first->second;
+}
+
+}  // namespace
+
 TVM_FFI_STATIC_INIT_BLOCK() {
+  namespace refl = tvm::ffi::reflection;
   TypeNode::RegisterReflection();
   PrimTypeNode::RegisterReflection();
+  refl::TypeAttrDef<PrimTypeNode>()  // install dtype-value hash/equality callbacks
+      .attr(refl::type_attr::kAnyHash, reinterpret_cast<void*>(&PrimTypeAnyHash))
+      .attr(refl::type_attr::kAnyEqual, reinterpret_cast<void*>(&PrimTypeAnyEqual));
   PointerTypeNode::RegisterReflection();
   TupleTypeNode::RegisterReflection();
   FuncTypeNode::RegisterReflection();
   TensorMapTypeNode::RegisterReflection();
 }
 
-PrimType::PrimType(runtime::DataType dtype, Span span) {
-  ffi::ObjectPtr<PrimTypeNode> n = ffi::make_object<PrimTypeNode>();
-  n->dtype = dtype;
-  n->span = std::move(span);
-  data_ = std::move(n);
+PrimType::PrimType(DLDataType dtype) { data_ = GetCachedPrimTypeNode(dtype); }  // cached node
+// Narrows code/bits/lanes to 8/8/16 bits and delegates to the DLDataType constructor.
+PrimType::PrimType(DLDataTypeCode code, int bits, int lanes)
+    : PrimType(DLDataType{static_cast<uint8_t>(code), static_cast<uint8_t>(bits),
+                          static_cast<uint16_t>(lanes)}) {}
+// kDLInt type; scalar int32 and int64 are each built once as function-local statics.
+PrimType PrimType::Int(int bits, int lanes) {
+  if (lanes == 1) {
+    if (bits == 32) {
+      static const PrimType i32_ty(DLDataType{kDLInt, 32, 1});
+      return i32_ty;
+    }
+    if (bits == 64) {
+      static const PrimType i64_ty(DLDataType{kDLInt, 64, 1});
+      return i64_ty;
+    }
+  }
+  return PrimType(DLDataType{kDLInt, static_cast<uint8_t>(bits), static_cast<uint16_t>(lanes)});
+}
+// kDLUInt type with the given bit width and lane count.
+PrimType PrimType::UInt(int bits, int lanes) {
+  return PrimType(DLDataType{kDLUInt, static_cast<uint8_t>(bits), static_cast<uint16_t>(lanes)});
+}
+// kDLFloat type; scalar float32 is built once as a function-local static.
+PrimType PrimType::Float(int bits, int lanes) {
+  if (bits == 32 && lanes == 1) {
+    static const PrimType f32_ty(DLDataType{kDLFloat, 32, 1});
+    return f32_ty;
+  }
+  return PrimType(DLDataType{kDLFloat, static_cast<uint8_t>(bits), static_cast<uint16_t>(lanes)});
+}
+// kDLBfloat type with the given bit width and lane count.
+PrimType PrimType::BFloat(int bits, int lanes) {
+  return PrimType(DLDataType{kDLBfloat, static_cast<uint8_t>(bits), static_cast<uint16_t>(lanes)});
+}
+// kDLBool type, always 8 bits; the scalar form is built once as a function-local static.
+PrimType PrimType::Bool(int lanes) {
+  if (lanes == 1) {
+    static const PrimType bool_ty(DLDataType{kDLBool, 8, 1});
+    return bool_ty;
+  }
+  return PrimType(DLDataType{kDLBool, 8, static_cast<uint16_t>(lanes)});
+}
+// kDLOpaqueHandle type with the given bit width and lane count.
+PrimType PrimType::Handle(int bits, int lanes) {
+  return PrimType(
+      DLDataType{kDLOpaqueHandle, static_cast<uint8_t>(bits), static_cast<uint16_t>(lanes)});
+}
+// Void is encoded as kDLOpaqueHandle with zero bits and zero lanes.
+PrimType PrimType::Void() { return PrimType(DLDataType{kDLOpaqueHandle, 0, 0}); }
+// Scalable vector type; `lanes` is the vscale factor checked by ScalableVectorDType.
+PrimType PrimType::ScalableVector(DLDataTypeCode code, int bits, int lanes) {
+  return PrimType(ScalableVectorDType(code, bits, lanes));
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("ir.PrimType", [](runtime::DataType dtype) { return PrimType(dtype); });
+  refl::GlobalDef().def("ir.PrimType", [](DLDataType dtype) { return PrimType(dtype); });
 }
 
 PointerType::PointerType(Type element_type, ffi::String storage_scope) {
diff --git a/src/relax/analysis/tir_op_pattern_kind.cc b/src/relax/analysis/tir_op_pattern_kind.cc
index 369f5793d9b5..0bfb48cca94c 100644
--- a/src/relax/analysis/tir_op_pattern_kind.cc
+++ b/src/relax/analysis/tir_op_pattern_kind.cc
@@ -478,8 +478,8 @@ bool HasReshapePattern(const PrimFunc& func) {
       }
 
       if (nontrivial_indices.defined()) {
-        DataType dtype =
-            !block->iter_vars.empty() ? block->iter_vars[0]->var->dtype : DataType::Int(64);
+        PrimType dtype =
+            !block->iter_vars.empty() ? block->iter_vars[0]->var.ty() : PrimType::Int(64);
         tirx::Var fused_var("fused", dtype);
         ffi::Map<tirx::Var, PrimExpr> inverse_indices_map;
         PrimExpr stride = IntImm(dtype, /*value=*/1);
@@ -494,7 +494,8 @@ bool HasReshapePattern(const PrimFunc& func) {
 
         ffi::Array<PrimExpr> simplify_res = arith::IterMapSimplify(
             /*indices=*/{flattened_idx},
-            /*input_iters=*/{{fused_var, Range(IntImm(dtype, /*value=*/0), stride)}},
+            /*input_iters=*/
+            ffi::Map<tirx::Var, Range>{{fused_var, Range(IntImm(dtype, /*value=*/0), stride)}},
             /*input_pred=*/IntImm::Bool(true),
             /*check_level=*/arith::IterMapLevel::Surjective,
             /*analyzer=*/this->ana_,
diff --git a/src/relax/analysis/type_analysis.cc b/src/relax/analysis/type_analysis.cc
index 33070051ae63..34f5a4de6216 100644
--- a/src/relax/analysis/type_analysis.cc
+++ b/src/relax/analysis/type_analysis.cc
@@ -43,7 +43,7 @@ class StaticTypeDeriver : public TypeFunctor<Type(const Type&)> {
  public:
   Type VisitType_(const ObjectTypeNode* op) final { return ObjectType(op->span); }
 
-  Type VisitType_(const PrimTypeNode* op) final { return PrimType(op->dtype, op->span); }
+  Type VisitType_(const PrimTypeNode* op) final { return tvm::PrimType(op->dtype); }
 
   Type VisitType_(const ShapeTypeNode* op) final { return ShapeType(op->ndim, op->span); }
 
@@ -86,7 +86,9 @@ Type TypeFromStaticType(const Type& type) {
   if (type.as<ObjectTypeNode>()) {
     return ObjectType(type->span);
   } else if (const PrimTypeNode* prim_type = type.as<PrimTypeNode>()) {
-    return PrimType(prim_type->dtype, prim_type->span);
+    // Only the dtype is carried over: the PrimType is rebuilt from the raw dtype value and the
+    // span of the input type is not forwarded.
+    return tvm::PrimType(prim_type->dtype);
   } else if (const ShapeTypeNode* shape_type = type.as<ShapeTypeNode>()) {
     return ShapeType(shape_type->ndim, type->span);
   } else if (const TensorTypeNode* tensor_type = type.as<TensorTypeNode>()) {
@@ -221,9 +223,9 @@ class WellDefinedEraser : public TypeMutator, public ExprMutatorBase, public tir
     if (ret.defined()) {
       PrimExpr value = ret.value();
       if (value->IsInstance<IntImmNode>()) {
-        return tvm::cast(DataType::Int(64), value);
+        return tvm::cast(PrimType::Int(64), value);
       }
-      TVM_FFI_ICHECK(value.dtype() == DataType::Int(64))
+      TVM_FFI_ICHECK(value.ty().MatchesElementType(DLDataTypeCode::kDLInt, 64))
           << "Can only provide i64 expressions in shape";
       return value;
     } else {
@@ -1015,7 +1017,9 @@ class TypeLCAFinder : public TypeFunctor<Type(const Type&, const Type&)> {
     if (rhs == nullptr) return ObjectType(lhs->span);
 
     // find the target dtype, ndim, and vdevice.
-    DataType dtype = lhs->dtype == rhs->dtype ? lhs->dtype : DataType::Void();
+    // A dtype mismatch degrades to the void dtype; a shared dtype is kept.
+    PrimType dtype = lhs->dtype->dtype == rhs->dtype->dtype ? PrimType(lhs->dtype->dtype)
+                                                            : PrimType::Void();
     int ndim = lhs->ndim == rhs->ndim ? lhs->ndim : kUnknownNDim;
     VDevice vdev = VDevice();
     if (lhs->vdevice.defined() && rhs->vdevice.defined() &&
@@ -1028,7 +1032,7 @@ class TypeLCAFinder : public TypeFunctor<Type(const Type&, const Type&)> {
         !CanProveShapeEqual(lhs->shape.value(), rhs->shape.value(),
                             ffi::GetRef<arith::Analyzer>(analyzer_))) {
       // reuse lhs when possible
-      if (!lhs->shape.defined() && lhs->dtype == dtype && lhs->ndim == ndim &&
+      if (!lhs->shape.defined() && lhs->dtype->dtype == dtype->dtype && lhs->ndim == ndim &&
           (!lhs->vdevice.defined() || vdev.defined())) {
         return ffi::GetRef<Type>(lhs);
       } else {
@@ -1036,7 +1040,7 @@ class TypeLCAFinder : public TypeFunctor<Type(const Type&, const Type&)> {
       }
     }
     // symbolic shape and vdevice match but dtype mismatch
-    if (lhs->dtype != dtype || (lhs->vdevice.defined() && !vdev.defined())) {
+    if (lhs->dtype->dtype != dtype->dtype || (lhs->vdevice.defined() && !vdev.defined())) {
       return TensorType(lhs->shape.value(), dtype, vdev, lhs->span);
     } else {
       return ffi::GetRef<Type>(lhs);
diff --git a/src/relax/analysis/well_formed.cc b/src/relax/analysis/well_formed.cc
index 5c3547249c5e..52e974be75f0 100644
--- a/src/relax/analysis/well_formed.cc
+++ b/src/relax/analysis/well_formed.cc
@@ -457,9 +457,9 @@ class WellFormedChecker : public relax::ExprVisitor,
     for (PrimExpr expr : op->values) {
       // check if the symbolic vars in the expr are defined, e.g, 2 * m
       tirx::ExprVisitor::VisitExpr(expr);
-      if (!expr.dtype().is_int()) {
+      if (expr.ty().code() != DLDataTypeCode::kDLInt) {
         TVM_FFI_VISIT_THROW(TypeError, expr)
-            << "Shape expressions must be of integer type, but got " << expr.dtype();
+            << "Shape expressions must be of integer type, but got " << expr.ty()->dtype;
       }
     }
     CheckType(op);
diff --git a/src/relax/backend/contrib/codegen_c/codegen_c.h b/src/relax/backend/contrib/codegen_c/codegen_c.h
index 1a5fb1dd801e..0c36b04812c8 100644
--- a/src/relax/backend/contrib/codegen_c/codegen_c.h
+++ b/src/relax/backend/contrib/codegen_c/codegen_c.h
@@ -347,19 +347,20 @@ class CodegenCBase {
    */
   std::string GetDtypeString(const TensorTypeNode* tensor_ty) {
     std::string dtype;
-    if (runtime::TypeMatch(tensor_ty->dtype, kDLFloat, 32)) {
+    DLDataType raw_dtype = tensor_ty->dtype->dtype;
+    if (raw_dtype == DLDataType{kDLFloat, 32, 1}) {
       dtype = "float";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLFloat, 16)) {
+    } else if (raw_dtype == DLDataType{kDLFloat, 16, 1}) {
       dtype = "half";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLBfloat, 16)) {
+    } else if (raw_dtype == DLDataType{kDLBfloat, 16, 1}) {
       dtype = "bfloat";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLInt, 32)) {
+    } else if (raw_dtype == DLDataType{kDLInt, 32, 1}) {
       dtype = "int";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLInt, 64)) {
+    } else if (raw_dtype == DLDataType{kDLInt, 64, 1}) {
       dtype = "int64_t";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLInt, 8)) {
+    } else if (raw_dtype == DLDataType{kDLInt, 8, 1}) {
       dtype = "int8_t";
-    } else if (runtime::TypeMatch(tensor_ty->dtype, kDLUInt, 8)) {
+    } else if (raw_dtype == DLDataType{kDLUInt, 8, 1}) {
       dtype = "uint8_t";
     } else {
       TVM_FFI_THROW(InternalError) << "Unsupported dtype " << tensor_ty->dtype;
diff --git a/src/relax/backend/contrib/cublas/codegen.cc b/src/relax/backend/contrib/cublas/codegen.cc
index 5284de94f622..f2999b172136 100644
--- a/src/relax/backend/contrib/cublas/codegen.cc
+++ b/src/relax/backend/contrib/cublas/codegen.cc
@@ -86,11 +86,11 @@ class CublasJSONSerializer : public JSONSerializer {
         const auto* const_expr = dequantize_call->args[1].as<ConstantNode>();
         auto ty = const_expr->ty.as_or_throw<TensorType>();
         float alpha = 1.0;
-        if (ty->dtype == DataType::Float(16)) {
+        if (ty->dtype == PrimType::Float(16)) {
           alpha = __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(
               static_cast<uint16_t*>(const_expr->data->data)[0]);
         } else {
-          TVM_FFI_ICHECK(ty->dtype == DataType::Float(32));
+          TVM_FFI_ICHECK(ty->dtype == PrimType::Float(32));
           alpha = static_cast<float*>(const_expr->data->data)[0];
         }
 
diff --git a/src/relax/backend/contrib/utils.h b/src/relax/backend/contrib/utils.h
index 93916bf23236..6147a6eb2199 100644
--- a/src/relax/backend/contrib/utils.h
+++ b/src/relax/backend/contrib/utils.h
@@ -59,9 +59,7 @@ inline std::vector<int64_t> GetIntShape(const ffi::Array<PrimExpr>& shape) {
  * \param typ
  * \return std::string string format of type
  */
-inline std::string DType2String(const tvm::DataType dtype) {
-  return tvm::ffi::DLDataTypeToString(dtype);
-}
+inline std::string DType2String(DLDataType dtype) { return tvm::ffi::DLDataTypeToString(dtype); }
 
 /*!
  * \brief Check if a call node is calling an op with the given name
diff --git a/src/relax/backend/vm/codegen_vm_tir.cc b/src/relax/backend/vm/codegen_vm_tir.cc
index c1e9af85511c..3e2ac365d4fb 100644
--- a/src/relax/backend/vm/codegen_vm_tir.cc
+++ b/src/relax/backend/vm/codegen_vm_tir.cc
@@ -88,19 +88,19 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
 
   PrimExpr RegListGet(int64_t slot) const {
     // use 128 bits to represent any
-    return tirx::Call(DataType::Handle(), tirx::builtin::anylist_getitem(),
+    return tirx::Call(tvm::PrimType::Handle(), tirx::builtin::anylist_getitem(),
                       {reg_anylist_handle_, ConstInt32(slot)});
   }
 
   PrimExpr ConstListGet(int64_t slot) const {
     // use 128 bits to represent any
-    return tirx::Call(DataType::Handle(), tirx::builtin::anylist_getitem(),
+    return tirx::Call(tvm::PrimType::Handle(), tirx::builtin::anylist_getitem(),
                       {const_anylist_handle_, ConstInt32(slot)});
   }
 
   PrimExpr FuncListGet(int64_t slot) const {
     // use 128 bits to represent any
-    return tirx::Call(DataType::Handle(), tirx::builtin::anylist_getitem(),
+    return tirx::Call(tvm::PrimType::Handle(), tirx::builtin::anylist_getitem(),
                       {func_anylist_handle_, ConstInt32(slot)});
   }
 
@@ -121,11 +121,11 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
       all_args.push_back(arg);
     }
     if (dst_anylist_slot >= 0) {
-      this->EmitStmt(tirx::Evaluate(
-          tirx::Call(DataType::Int(32), tirx::builtin::anylist_setitem_call_packed(), all_args)));
+      this->EmitStmt(tirx::Evaluate(tirx::Call(
+          tvm::PrimType::Int(32), tirx::builtin::anylist_setitem_call_packed(), all_args)));
     } else {
       this->EmitStmt(tirx::Evaluate(
-          tirx::Call(DataType::Int(32), tirx::builtin::tvm_call_packed(), all_args)));
+          tirx::Call(tvm::PrimType::Int(32), tirx::builtin::tvm_call_packed(), all_args)));
     }
   }
 
@@ -143,11 +143,11 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
       all_args.push_back(arg);
     }
     if (dst_anylist_slot >= 0) {
-      this->EmitStmt(tirx::Evaluate(
-          tirx::Call(DataType::Int(32), tirx::builtin::anylist_setitem_call_cpacked(), all_args)));
+      this->EmitStmt(tirx::Evaluate(tirx::Call(
+          tvm::PrimType::Int(32), tirx::builtin::anylist_setitem_call_cpacked(), all_args)));
     } else {
       this->EmitStmt(tirx::Evaluate(
-          tirx::Call(DataType::Int(32), tirx::builtin::tvm_call_cpacked(), all_args)));
+          tirx::Call(tvm::PrimType::Int(32), tirx::builtin::tvm_call_cpacked(), all_args)));
     }
   }
 
@@ -160,10 +160,10 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
     stmt_stack_ = {};
     registers_num_ = 0;
     var_map_.clear();
-    ctx_ptr_ = tirx::Var("ctx_ptr", DataType::Handle());
-    reg_anylist_handle_ = tirx::Var("r", DataType::Handle());
-    func_anylist_handle_ = tirx::Var("f", DataType::Handle());
-    const_anylist_handle_ = tirx::Var("c", DataType::Handle());
+    ctx_ptr_ = tirx::Var("ctx_ptr", PrimType::Handle());
+    reg_anylist_handle_ = tirx::Var("r", PrimType::Handle());
+    func_anylist_handle_ = tirx::Var("f", PrimType::Handle());
+    const_anylist_handle_ = tirx::Var("c", PrimType::Handle());
 
     ffi::Array<ffi::String> param_names;
     for (Var param : func->params) {
@@ -231,7 +231,7 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
     Call call = ffi::GetRef<Call>(call_node);
 
     if (call_node->op == null_value_op_) {
-      return tirx::Call(DataType::Handle(), tirx::builtin::reinterpret(), {IntImm::Int64(0)});
+      return tirx::Call(tvm::PrimType::Handle(), tirx::builtin::reinterpret(), {IntImm::Int64(0)});
     }
     int64_t dst_reg = HasVoidType(call) ? -1 : NewRegister();
     if (call->op.as<OpNode>()) {
@@ -264,7 +264,7 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
     size_t merge_register = NewRegister();
     PrimExpr cond_value = this->VisitExpr(op->cond).value();
 
-    cond_value = tirx::Call(DataType::Bool(), tirx::builtin::tvm_call_packed(),
+    cond_value = tirx::Call(tvm::PrimType::Bool(), tirx::builtin::tvm_call_packed(),
                             {tirx::StringImm("vm.builtin.read_if_cond"), cond_value});
 
     tirx::Stmt true_branch = WithNewScope([&]() {
@@ -438,7 +438,7 @@ class CodeGenVMTIR : public ExprFunctor<ffi::Optional<PrimExpr>(const Expr&)> {
     TVM_FFI_ICHECK(tir_call->args[0].same_as(reg_anylist_handle_));
     const auto* p_dst_reg = tir_call->args[1].as<tirx::IntImmNode>();
     TVM_FFI_ICHECK(p_dst_reg != nullptr);
-    TVM_FFI_ICHECK(p_dst_reg->dtype == DataType::Int(32));
+    TVM_FFI_ICHECK(p_dst_reg->ty().MatchesElementType(DLDataTypeCode::kDLInt, 32));
 
     int64_t dst_reg = p_dst_reg->value;
     this->EmitCallPacked("vm.builtin.null_value", {}, dst_reg);
diff --git a/src/relax/backend/vm/lower_runtime_builtin.cc b/src/relax/backend/vm/lower_runtime_builtin.cc
index 344fc6a67e65..4a32efd81e5a 100644
--- a/src/relax/backend/vm/lower_runtime_builtin.cc
+++ b/src/relax/backend/vm/lower_runtime_builtin.cc
@@ -21,6 +21,7 @@
  * \brief Lowers most builtin functions and packed calls.
  */
 #include <tvm/ffi/cast.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/relax/analysis.h>
 #include <tvm/relax/attrs/op.h>
@@ -29,7 +30,6 @@
 #include <tvm/relax/expr_functor.h>
 #include <tvm/relax/op_attr_types.h>
 #include <tvm/relax/type.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/tirx/op.h>
 
 namespace tvm {
@@ -85,7 +85,7 @@ class LowerRuntimeBuiltinMutator : public ExprMutator {
   Expr MakeMemAllocStorage(const Call& call) {
     PrimValue runtime_device_index = call->args[1].as_or_throw<PrimValue>();
     StringImm storage_scope = call->args[2].as_or_throw<StringImm>();
-    DataTypeImm output_dtype = DataTypeImm(DataType::UInt(8));
+    DataTypeImm output_dtype = DataTypeImm(DLDataType{kDLUInt, 8, 1});
     return Call(vm_alloc_storage_op_,
                 {call->args[0], runtime_device_index, output_dtype, storage_scope}, Attrs());
   }
diff --git a/src/relax/backend/vm/vm_shape_lower.cc b/src/relax/backend/vm/vm_shape_lower.cc
index 3d895349bbc3..6784489c5b32 100644
--- a/src/relax/backend/vm/vm_shape_lower.cc
+++ b/src/relax/backend/vm/vm_shape_lower.cc
@@ -229,7 +229,7 @@ class VMShapeLowerMutator
     slot_map_.clear();
     current_gvar_ = gvar;
     PrimExprSlotCollector::Collect(func, &slot_vec_, &slot_map_);
-    heap_size_ = IntImm(ShapeDType(), static_cast<int64_t>(slot_vec_.size()));
+    heap_size_ = IntImm(tvm::PrimType(ShapeDType()), static_cast<int64_t>(slot_vec_.size()));
     VarBinding shape_heap_binding = this->AllocShapeHeapBinding(heap_size_);
     shape_heap_ = shape_heap_binding->var;
 
@@ -298,7 +298,7 @@ class VMShapeLowerMutator
   //-------------------------------------------------------
   // PrimExpr slot handling
   //-------------------------------------------------------
-  static DataType ShapeDType() { return DataType::Int(64); }
+  static DLDataType ShapeDType() { return DLDataType{kDLInt, 64, 1}; }
 
   /*! \brief populate additional information in the slot. */
   void PopulateSlotInfo() {
@@ -329,7 +329,7 @@ class VMShapeLowerMutator
 
   VarBinding AllocShapeHeapBinding(IntImm heap_size) {
     if (heap_size->value > 0) {
-      TensorType heap_ty(ShapeDType(), 1);
+      TensorType heap_ty(PrimType(ShapeDType()), 1);
       Var var("shape_heap", heap_ty);
       // set up the builtin func.
       Call call(call_builtin_with_ctx_op_,
@@ -566,7 +566,7 @@ class VMShapeLowerMutator
     if (to_compute.size() == 0) return 0;
     TVM_FFI_ICHECK_GT(heap_size_->value, 0);
     // construct a PrimFunc that compute the shape.
-    tirx::Var heap("heap", DataType::Handle());
+    tirx::Var heap("heap", PrimType::Handle());
     ffi::Array<PrimExpr> buffer_shape{heap_size_};
     tirx::Buffer buffer = tirx::decl_buffer(buffer_shape, ShapeDType(), "H", "global");
     ffi::Map<tirx::Var, tirx::Buffer> buffer_map;
@@ -575,7 +575,8 @@ class VMShapeLowerMutator
     auto var_map = [&](const tirx::Var& var) -> ffi::Optional<PrimExpr> {
       auto it = slot_map_.find(var);
       TVM_FFI_ICHECK(it != slot_map_.end());
-      return tirx::BufferLoad(buffer, {IntImm(ShapeDType(), it->second->index)});
+      return tirx::BufferLoad(
+          buffer, ffi::Array<PrimExpr>{IntImm(tvm::PrimType(ShapeDType()), it->second->index)});
     };
 
     ffi::Array<tirx::Stmt> seq;
@@ -583,7 +584,8 @@ class VMShapeLowerMutator
       TVM_FFI_ICHECK(!slot->value_computed);
       slot->value_computed = true;
       PrimExpr value = tirx::Substitute(slot->expr, var_map);
-      seq.push_back(tirx::BufferStore(buffer, value, {IntImm(ShapeDType(), slot->index)}));
+      seq.push_back(
+          tirx::BufferStore(buffer, value, {IntImm(tvm::PrimType(ShapeDType()), slot->index)}));
     }
 
     tirx::Stmt body = tirx::SeqStmt::Flatten(seq);
@@ -678,10 +680,11 @@ class VMShapeLowerMutator
       // if we only check dynamic shapes, and the shape is static, we can skip.
       return;
     }
-    if (always_check || !IsBaseOf(TensorType(op->dtype, op->ndim), GetType(value))) {
+    if (always_check || !IsBaseOf(TensorType(PrimType(op->dtype), op->ndim), GetType(value))) {
       // check_tensor_info(value, ndim, dtype, err_ctx)
       Call call(builtin_check_tensor_info_,
-                {value, PrimValue::Int64(op->ndim), DataTypeImm(op->dtype), GetErrContext(err_ctx)},
+                {value, PrimValue::Int64(op->ndim), DataTypeImm(op->dtype->dtype),
+                 GetErrContext(err_ctx)},
                 Attrs(), {void_ty_});
       builder_->Emit(call, "_");
     }
diff --git a/src/relax/ir/dataflow_expr_rewriter.cc b/src/relax/ir/dataflow_expr_rewriter.cc
index 7b14a1f7e7e9..10fd67de1740 100644
--- a/src/relax/ir/dataflow_expr_rewriter.cc
+++ b/src/relax/ir/dataflow_expr_rewriter.cc
@@ -736,7 +736,7 @@ PatternMatchingRewriter PatternMatchingRewriter::FromModule(IRModule mod) {
       return ExternFuncPattern(func->global_symbol);
 
     } else if (auto prim = expr.as<PrimValueNode>()) {
-      return TypePattern(WildcardPattern(), PrimType(prim->value.dtype()));
+      return TypePattern(WildcardPattern(), PrimType(prim->value.ty()));
 
     } else {
       TVM_FFI_THROW(TypeError) << "Cannot convert Relax expression of type " << expr->GetTypeKey()
diff --git a/src/relax/ir/dataflow_matcher.cc b/src/relax/ir/dataflow_matcher.cc
index 08689bd10f0b..f75c540a96cd 100644
--- a/src/relax/ir/dataflow_matcher.cc
+++ b/src/relax/ir/dataflow_matcher.cc
@@ -573,8 +573,7 @@ bool DFPatternMatcher::VisitDFPattern_(const DataTypePatternNode* op, const Expr
   // no need to jump, as var.dtype == value.dtype
   auto expr_ty = expr.as<ExprNode>()->ty;
   if (const TensorTypeNode* tensor_ty = expr_ty.as<TensorTypeNode>()) {
-    return (ffi::StructuralEqual()(op->dtype, tensor_ty->dtype)) &&
-           VisitDFPattern(op->pattern, expr);
+    return op->dtype == tensor_ty->dtype->dtype && VisitDFPattern(op->pattern, expr);
   }
   return false;
 }
diff --git a/src/relax/ir/dataflow_pattern.cc b/src/relax/ir/dataflow_pattern.cc
index 5cb5352ec6c2..6302ee85049a 100644
--- a/src/relax/ir/dataflow_pattern.cc
+++ b/src/relax/ir/dataflow_pattern.cc
@@ -369,15 +369,15 @@ RELAX_PATTERN_PRINTER_DEF(SameShapeConstraintNode, [](auto p, auto node) {
   p->stream << ")";
 });
 
-DataTypePattern::DataTypePattern(DFPattern pattern, DataType dtype) {
+DataTypePattern::DataTypePattern(DFPattern pattern, DLDataType dtype) {
   ffi::ObjectPtr<DataTypePatternNode> n = ffi::make_object<DataTypePatternNode>();
   n->pattern = std::move(pattern);
-  n->dtype = std::move(dtype);
+  n->dtype = dtype;
   data_ = std::move(n);
 }
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("relax.dpl.DataTypePattern", [](DFPattern pattern, DataType dtype) {
+  refl::GlobalDef().def("relax.dpl.DataTypePattern", [](DFPattern pattern, DLDataType dtype) {
     return DataTypePattern(pattern, dtype);
   });
 }
@@ -474,11 +474,11 @@ AttrPattern DFPattern::HasAttr(const ffi::Map<ffi::String, Any>& attrs) const {
   return AttrPattern(*this, DictAttrs(attrs));
 }
 TypePattern DFPattern::HasType(const Type& ty) const { return TypePattern(*this, ty); }
-DataTypePattern DFPattern::HasDtype(const DataType& dtype) const {
+DataTypePattern DFPattern::HasDtype(DLDataType dtype) const {
   return DataTypePattern(*this, dtype);
 }
 DataTypePattern DFPattern::HasDtype(const std::string& dtype) const {
-  return HasDtype(DataType(ffi::StringToDLDataType(dtype)));
+  return HasDtype(ffi::StringToDLDataType(dtype));
 }
 ShapePattern DFPattern::HasShape(const ffi::Array<PrimExpr>& shape) const {
   return ShapePattern(*this, shape);
diff --git a/src/relax/ir/dependent_type.cc b/src/relax/ir/dependent_type.cc
index 6a2034ccc2a8..d95ebb1534e7 100644
--- a/src/relax/ir/dependent_type.cc
+++ b/src/relax/ir/dependent_type.cc
@@ -54,9 +54,9 @@ ShapeType::ShapeType(ffi::Array<PrimExpr> values, Span span) {
   n->ndim = static_cast<int>(values.size());
   n->values = values.Map([](PrimExpr value) {
     if (value->IsInstance<IntImmNode>()) {
-      return tvm::cast(DataType::Int(64), value);
+      return tvm::cast(PrimType::Int(64), value);
     }
-    TVM_FFI_ICHECK(value.dtype() == DataType::Int(64))
+    TVM_FFI_ICHECK(value.ty().MatchesElementType(DLDataTypeCode::kDLInt, 64))
         << "the value in ShapeType can only have dtype of int64";
     return value;
   });
@@ -86,7 +86,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 }
 
 // Tensor
-TensorType::TensorType(Expr shape, DataType dtype, ffi::Optional<VDevice> vdevice, Span span) {
+TensorType::TensorType(Expr shape, PrimType dtype, ffi::Optional<VDevice> vdevice, Span span) {
   ffi::ObjectPtr<TensorTypeNode> n = ffi::make_object<TensorTypeNode>();
   // assign ndim before move
   TVM_FFI_ICHECK(shape.defined()) << "Must provide a shape in this constructor";
@@ -103,7 +103,7 @@ TensorType::TensorType(Expr shape, DataType dtype, ffi::Optional<VDevice> vdevic
   data_ = std::move(n);
 }
 
-TensorType::TensorType(DataType dtype, int ndim, ffi::Optional<VDevice> vdevice, Span span) {
+TensorType::TensorType(PrimType dtype, int ndim, ffi::Optional<VDevice> vdevice, Span span) {
   ffi::ObjectPtr<TensorTypeNode> n = ffi::make_object<TensorTypeNode>();
   TVM_FFI_ICHECK(ndim >= -1) << "ndim of TensorType must be >= -1, but got " << ndim;
   n->ndim = ndim;
@@ -116,13 +116,14 @@ TensorType::TensorType(DataType dtype, int ndim, ffi::Optional<VDevice> vdevice,
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def(
-      "relax.TensorType", [](ffi::Optional<Expr> shape, ffi::Optional<DataType> dtype, int ndim,
+      "relax.TensorType", [](ffi::Optional<Expr> shape, ffi::Optional<PrimType> dtype, int ndim,
                              VDevice vdevice, Span span) {
+        PrimType resolved_dtype = dtype.value_or(PrimType::Void());  // used when dtype is unset
         if (shape.defined()) {
           TVM_FFI_CHECK_EQ(ndim, kUnknownNDim, ValueError) << "Cannot both specify shape and ndim";
-          return TensorType(shape.value(), dtype.value_or(DataType::Void()), vdevice, span);
+          return TensorType(shape.value(), resolved_dtype, vdevice, span);
         } else {
-          return TensorType(dtype.value_or(DataType::Void()), ndim, vdevice, span);
+          return TensorType(resolved_dtype, ndim, vdevice, span);
         }
       });
 }
diff --git a/src/relax/ir/emit_te.cc b/src/relax/ir/emit_te.cc
index 304911c1dca2..68e48eaf93b6 100644
--- a/src/relax/ir/emit_te.cc
+++ b/src/relax/ir/emit_te.cc
@@ -42,7 +42,7 @@ te::Tensor TETensor(Expr value, ffi::Map<tirx::Var, PrimExpr> tir_var_map, std::
   // checked-type might not be properly set. In this case we set the shape and dtype of the returned
   // TE tensor.
   if (const auto* constant = value.as<ConstantNode>()) {
-    n->dtype = DataType(constant->data->dtype);
+    n->dtype = PrimType(constant->data->dtype);
 
     int ndim = constant->data->ndim;
     ffi::Shape shape_tuple = constant->data.Shape();
diff --git a/src/relax/ir/expr.cc b/src/relax/ir/expr.cc
index 11e80135500a..b4c4486f0dd4 100644
--- a/src/relax/ir/expr.cc
+++ b/src/relax/ir/expr.cc
@@ -257,9 +257,9 @@ ShapeExpr::ShapeExpr(ffi::Array<PrimExpr> values, Span span) {
 
   n->values = values.Map([](PrimExpr value) {
     if (value->IsInstance<IntImmNode>()) {
-      return tvm::cast(DataType::Int(64), value);
+      return tvm::cast(PrimType::Int(64), value);
     }
-    TVM_FFI_ICHECK(value.dtype() == DataType::Int(64))
+    TVM_FFI_ICHECK(value.ty().MatchesElementType(DLDataTypeCode::kDLInt, 64))
         << "the value in ShapeType can only have dtype of int64";
     return value;
   });
@@ -350,7 +350,7 @@ Constant::Constant(runtime::Tensor data, ffi::Optional<Type> ty_annotation, Span
   if (ty_annotation.defined()) {
     n->ty = ty_annotation.value();
   } else {
-    TensorType tinfo(ShapeExpr(values), n->data.DataType(), VDevice(), span);
+    TensorType tinfo(ShapeExpr(values), PrimType(n->data.DataType()), VDevice(), span);
     n->ty = tinfo;
   }
 
@@ -366,7 +366,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 PrimValue::PrimValue(PrimExpr value, Span span) {
   ffi::ObjectPtr<PrimValueNode> n = ffi::make_object<PrimValueNode>();
-  n->ty = PrimType(value.dtype());
+  n->ty = PrimType(value.ty());
   n->value = std::move(value);
   n->span = std::move(span);
   data_ = std::move(n);
@@ -396,9 +396,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                         [](ffi::String value, Span span) { return StringImm(value, span); });
 }
 
-DataTypeImm::DataTypeImm(DataType value, Span span) {
+DataTypeImm::DataTypeImm(DLDataType value, Span span) {
   ffi::ObjectPtr<DataTypeImmNode> n = ffi::make_object<DataTypeImmNode>();
-  n->value = std::move(value);
+  n->value = value;
   n->span = std::move(span);
   n->ty = ObjectType();
   data_ = std::move(n);
@@ -407,7 +407,7 @@ DataTypeImm::DataTypeImm(DataType value, Span span) {
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def("relax.DataTypeImm",
-                        [](DataType value, Span span) { return DataTypeImm(value, span); });
+                        [](DLDataType value, Span span) { return DataTypeImm(value, span); });
 }
 
 MatchCast::MatchCast(Var var, Expr value, Type ty, Span span) {
diff --git a/src/relax/op/ccl/ccl.cc b/src/relax/op/ccl/ccl.cc
index dd67f65dea09..15b8064d2b6f 100644
--- a/src/relax/op/ccl/ccl.cc
+++ b/src/relax/op/ccl/ccl.cc
@@ -85,7 +85,7 @@ Type InferTypeAllGather(const Call& call, const BlockBuilder& ctx) {
   const auto* attrs = call->attrs.as<AllGatherAttrs>();
   int num_workers = attrs->num_workers;
 
-  DataType output_dtype = input_ty->dtype;
+  PrimType output_dtype = input_ty->dtype;
   auto input_shape = input_ty->GetShape();
   if (!input_shape.defined()) {
     return input_ty;
@@ -143,7 +143,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 Type InferTypeScatter(const Call& call, const BlockBuilder& ctx) {
   TensorType input_ty = GetUnaryInputTensorType(call, ctx);
-  DataType output_dtype = input_ty->dtype;
+  PrimType output_dtype = input_ty->dtype;
 
   const auto* attrs = call->attrs.as<ScatterCollectiveAttrs>();
   int num_workers = attrs->num_workers;
diff --git a/src/relax/op/distributed/binary.cc b/src/relax/op/distributed/binary.cc
index 766d60edb86f..daaacff4121b 100644
--- a/src/relax/op/distributed/binary.cc
+++ b/src/relax/op/distributed/binary.cc
@@ -31,7 +31,7 @@ Type InferDistTypeBroadcastCMP(const Call& call, const BlockBuilder& ctx) {
   return InferDistTypeBroadcast(
       call, ctx,
       [](const Call& call, const BlockBuilder& ctx, const TensorType& x1_ty,
-         const TensorType& x2_ty) { return DataType::Bool(); });
+         const TensorType& x2_ty) { return DLDataType{kDLBool, 8, 1}; });
 }
 
 /***************** Arithmetic operators *****************/
diff --git a/src/relax/op/distributed/binary.h b/src/relax/op/distributed/binary.h
index 5fd39b50f364..a6d3fd9ba124 100644
--- a/src/relax/op/distributed/binary.h
+++ b/src/relax/op/distributed/binary.h
@@ -41,8 +41,8 @@ Type InferDistTypeBroadcast(const Call& call, const BlockBuilder& ctx, FType f_c
   TensorType x1_ty = input_dtensor_tys[0]->tensor_ty;
   TensorType x2_ty = input_dtensor_tys[1]->tensor_ty;
 
-  // DateType
-  DataType output_dtype = f_compute_out_dtype(call, ctx, x1_ty, x2_ty);
+  // Dtype
+  PrimType output_dtype(f_compute_out_dtype(call, ctx, x1_ty, x2_ty));
 
   // ndims
   TVM_FFI_ICHECK(!x1_ty->IsUnknownNdim() && !x2_ty->IsUnknownNdim())
diff --git a/src/relax/op/distributed/distributed.cc b/src/relax/op/distributed/distributed.cc
index b009630070cd..ff5bc986c0c7 100644
--- a/src/relax/op/distributed/distributed.cc
+++ b/src/relax/op/distributed/distributed.cc
@@ -154,7 +154,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 Type InferTypeRtoS(const Call& call, const BlockBuilder& ctx) {
   TensorType input_ty = GetUnaryInputTensorType(call, ctx);
-  DataType output_dtype = input_ty->dtype;
+  PrimType output_dtype = input_ty->dtype;
 
   const auto* attrs = call->attrs.as<ScatterCollectiveAttrs>();
   int num_workers = attrs->num_workers;
diff --git a/src/relax/op/distributed/linear_algebra.cc b/src/relax/op/distributed/linear_algebra.cc
index 80fccbe115a9..b498f1a4a953 100644
--- a/src/relax/op/distributed/linear_algebra.cc
+++ b/src/relax/op/distributed/linear_algebra.cc
@@ -32,9 +32,9 @@ Type InferDistTypeMatmul(const Call& call, const BlockBuilder& ctx) {
   TensorType x2_ty = input_dtensor_tys[1]->tensor_ty;
 
   const auto* attrs = call->attrs.as<MatmulAttrs>();
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty)
+                                    : attrs->out_dtype);
 
   if (x1_ty->IsUnknownNdim() || x2_ty->IsUnknownNdim()) {
     TVM_FFI_VISIT_THROW(ValueError, call)
diff --git a/src/relax/op/distributed/nn.cc b/src/relax/op/distributed/nn.cc
index 1339a18e72d0..386401521974 100644
--- a/src/relax/op/distributed/nn.cc
+++ b/src/relax/op/distributed/nn.cc
@@ -33,7 +33,9 @@ Type InferDistTypeSoftmax(const Call& call, const BlockBuilder& ctx) {
   if (input_tensor_ty->IsUnknownNdim()) {
     TVM_FFI_VISIT_THROW(ValueError, call) << "Input of distributed operator must have known ndim";
   }
-  if (!input_tensor_ty->IsUnknownDtype() && !input_tensor_ty->dtype.is_float()) {
+  PrimType input_dtype = input_tensor_ty->dtype;
+  // Softmax validation preserves the old float-kind check; lanes do not affect this policy.
+  if (!input_tensor_ty->IsUnknownDtype() && input_dtype.code() != DLDataTypeCode::kDLFloat) {
     TVM_FFI_VISIT_THROW(TypeError, call) << "Softmax requires the input tensor to have float "
                                             "dtype. However, the given input dtype is "
                                          << input_tensor_ty->dtype;
diff --git a/src/relax/op/distributed/unary.cc b/src/relax/op/distributed/unary.cc
index 4356b403c6d9..8e4ccce23a9c 100644
--- a/src/relax/op/distributed/unary.cc
+++ b/src/relax/op/distributed/unary.cc
@@ -25,7 +25,7 @@ namespace distributed {
 
 Type InferDistTypeUnaryCheck(const Call& call, const BlockBuilder& ctx) {
   return InferDistTypeUnary<false>(call, ctx,
-                                   [](const TensorType& input_ty) { return DataType::Bool(); });
+                                   [](const TensorType& input_ty) { return PrimType::Bool(); });
 }
 
 RELAX_REGISTER_UNARY_ARITH_DIST_INFER_TYPE(abs, /*require_float_dtype=*/false);
diff --git a/src/relax/op/distributed/unary.h b/src/relax/op/distributed/unary.h
index 92c719ad0b98..be7ca27d3ade 100644
--- a/src/relax/op/distributed/unary.h
+++ b/src/relax/op/distributed/unary.h
@@ -40,15 +40,22 @@ Type InferDistTypeUnary(const Call& call, const BlockBuilder& ctx, FType f_compu
   distributed::DTensorType input_dtensor_ty = input_dtensor_tys[0];
   TensorType input_tensor_ty = input_dtensor_ty->tensor_ty;
 
+  PrimType input_dtype = input_tensor_ty->dtype;
+  // Unary op validation preserves the old float-kind check; lanes do not affect this policy.
   if (require_float_dtype && !input_tensor_ty->IsUnknownDtype() &&
-      !input_tensor_ty->dtype.is_float()) {
+      input_dtype.code() != DLDataTypeCode::kDLFloat) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << call->op
         << " requires the input tensor to have float dtype. However, the given input dtype is "
         << input_tensor_ty->dtype;
   }
   auto output_ty = ffi::make_object<TensorTypeNode>(*input_tensor_ty.get());
-  output_ty->dtype = f_compute_out_dtype(input_tensor_ty);
+  auto computed_dtype = f_compute_out_dtype(input_tensor_ty);
+  if constexpr (std::is_same_v<std::decay_t<decltype(computed_dtype)>, PrimType>) {
+    output_ty->dtype = computed_dtype;
+  } else {
+    output_ty->dtype = PrimType(computed_dtype);
+  }
   TensorType out_tensor_ty(output_ty);
   return distributed::DTensorType(out_tensor_ty, input_dtensor_ty->device_mesh,
                                   input_dtensor_ty->placement);
diff --git a/src/relax/op/image/resize.cc b/src/relax/op/image/resize.cc
index b92167e031f1..82b12c0fe26f 100644
--- a/src/relax/op/image/resize.cc
+++ b/src/relax/op/image/resize.cc
@@ -41,7 +41,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { Resize3DAttrs::RegisterReflection(); }
 Expr resize2d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout,
               ffi::String method, ffi::String coordinate_transformation_mode,
               ffi::String rounding_method, double cubic_alpha, int cubic_exclude,
-              double extrapolation_value, ffi::Optional<DataType> out_dtype) {
+              double extrapolation_value, ffi::Optional<DLDataType> out_dtype) {
   ffi::ObjectPtr<Resize2DAttrs> attrs = ffi::make_object<Resize2DAttrs>();
   attrs->roi = std::move(roi);
   attrs->layout = std::move(layout);
@@ -51,7 +51,7 @@ Expr resize2d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout
   attrs->cubic_alpha = cubic_alpha;
   attrs->cubic_exclude = cubic_exclude;
   attrs->extrapolation_value = extrapolation_value;
-  attrs->out_dtype = out_dtype.value_or(DataType::Void());
+  attrs->out_dtype = out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
 
   static const Op& op = Op::Get("relax.image.resize2d");
   return Call(op, {std::move(data), std::move(size)}, Attrs(attrs), {});
@@ -93,7 +93,9 @@ Type InferTypeResize2D(const Call& call, const BlockBuilder& ctx) {
                                                     /*tgt_layout=*/"NCHW",     //
                                                     /*tensor_name=*/"data");
 
-  DataType out_dtype = attrs->out_dtype.is_void() ? data_ty->dtype : attrs->out_dtype;
+  PrimType out_dtype = attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                           ? data_ty->dtype
+                           : PrimType(attrs->out_dtype);
 
   ffi::Optional<ShapeExpr> data_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, ffi::GetRef<TensorType>(data_ty), data_layout);
@@ -155,7 +157,7 @@ TVM_REGISTER_OP("relax.image.resize2d")
 Expr resize3d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout,
               ffi::String method, ffi::String coordinate_transformation_mode,
               ffi::String rounding_method, double cubic_alpha, int cubic_exclude,
-              double extrapolation_value, ffi::Optional<DataType> out_dtype) {
+              double extrapolation_value, ffi::Optional<DLDataType> out_dtype) {
   ffi::ObjectPtr<Resize3DAttrs> attrs = ffi::make_object<Resize3DAttrs>();
   attrs->roi = std::move(roi);
   attrs->layout = std::move(layout);
@@ -165,7 +167,7 @@ Expr resize3d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout
   attrs->cubic_alpha = cubic_alpha;
   attrs->cubic_exclude = cubic_exclude;
   attrs->extrapolation_value = extrapolation_value;
-  attrs->out_dtype = out_dtype.value_or(DataType::Void());
+  attrs->out_dtype = out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
 
   static const Op& op = Op::Get("relax.image.resize3d");
   return Call(op, {std::move(data), std::move(size)}, Attrs(attrs), {});
@@ -207,7 +209,9 @@ Type InferTypeResize3D(const Call& call, const BlockBuilder& ctx) {
                                                      /*tgt_layout=*/"NCDHW",    //
                                                      /*tensor_name=*/"data");
 
-  DataType out_dtype = attrs->out_dtype.is_void() ? data_ty->dtype : attrs->out_dtype;
+  PrimType out_dtype = attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                           ? data_ty->dtype
+                           : PrimType(attrs->out_dtype);
 
   ffi::Optional<ShapeExpr> data_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, ffi::GetRef<TensorType>(data_ty), data_layout);
@@ -315,7 +319,7 @@ Type InferTypeGridSample(const Call& call, const BlockBuilder& ctx) {
                                                    /*tgt_layout=*/is_ncdhw ? "NCDHW" : "NCHW",
                                                    /*tensor_name=*/"data");
 
-  DataType out_dtype = data_ty->dtype;
+  PrimType out_dtype = data_ty->dtype;
 
   ffi::Optional<ShapeExpr> data_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, ffi::GetRef<TensorType>(data_ty), data_layout);
@@ -422,7 +426,7 @@ Type InferTypeAffineGrid(const Call& call, const BlockBuilder& ctx) {
     }
   }
 
-  DataType out_dtype = data_ty->dtype;
+  PrimType out_dtype = data_ty->dtype;
 
   if (data_shape == nullptr || size_value == nullptr) {
     return TensorType(out_dtype, /*ndim=*/4, data_ty->vdevice);
diff --git a/src/relax/op/image/resize.h b/src/relax/op/image/resize.h
index 382a3a162be2..1aaed69f9146 100644
--- a/src/relax/op/image/resize.h
+++ b/src/relax/op/image/resize.h
@@ -36,13 +36,13 @@ namespace relax {
 Expr resize2d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout,
               ffi::String method, ffi::String coordinate_transformation_mode,
               ffi::String rounding_method, double cubic_alpha, int cubic_exclude,
-              double extrapolation_value, ffi::Optional<DataType> out_dtype);
+              double extrapolation_value, ffi::Optional<DLDataType> out_dtype);
 
 /*! \brief Image resize3d operator. */
 Expr resize3d(Expr data, Expr size, ffi::Array<FloatImm> roi, ffi::String layout,
               ffi::String method, ffi::String coordinate_transformation_mode,
               ffi::String rounding_method, double cubic_alpha, int cubic_exclude,
-              double extrapolation_value, ffi::Optional<DataType> out_dtype);
+              double extrapolation_value, ffi::Optional<DLDataType> out_dtype);
 
 /*! \brief Image grid_sample operator. */
 Expr grid_sample(Expr data, Expr grid, ffi::String method, ffi::String layout,
diff --git a/src/relax/op/memory/view.cc b/src/relax/op/memory/view.cc
index 25ad9aa66d8e..f2c5b7da8614 100644
--- a/src/relax/op/memory/view.cc
+++ b/src/relax/op/memory/view.cc
@@ -87,7 +87,7 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
     }
   }();
 
-  auto view_dtype = [&]() -> std::optional<DataType> {
+  auto view_dtype = [&]() -> std::optional<DLDataType> {
     Type ty = GetType(arg_dtype);
 
     if (HasVoidType(arg_dtype)) {
@@ -116,7 +116,7 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
     } else if (ty.as<ObjectTypeNode>()) {
       // The view changes the datatype, but we don't know what it is
       // being changed into.
-      return DataType::Void();
+      return DLDataType{kDLOpaqueHandle, 0, 0};
     } else {
       TVM_FFI_THROW(TypeError) << "Operator " << call->op
                                << " expects the dtype argument to be a relax::DataTypeImm, "
@@ -131,7 +131,7 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
       // No byte offset is specified, so no change is applied.
       return IntImm::Int64(0);
     } else if (auto prim_ty = ty.as<PrimTypeNode>()) {
-      TVM_FFI_CHECK_EQ(prim_ty->dtype, DataType::Int(64), TypeError)
+      TVM_FFI_CHECK_EQ(prim_ty->dtype, (DLDataType{kDLInt, 64, 1}), TypeError)
           << "Operator " << call->op
           << " expects the relative_byte_offset to be a 64-bit integer, but received "
           << arg_relative_byte_offset << ", which has type " << ty;
@@ -167,16 +167,15 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
     output_ndim = data_ty->ndim;
   }
 
-  DataType output_dtype = view_dtype.value_or(data_ty->dtype);
+  DLDataType output_raw_dtype = view_dtype.value_or(data_ty->dtype->dtype);
+  PrimType output_dtype(output_raw_dtype);
 
-  // Helper function, returns the number of bytes per vectorized
-  // element.  Cannot use `DataType::bytes`, as it returns the
-  // number of bytes per scalar element.
-  auto get_size_bytes = [](const DataType& dtype) -> ffi::Optional<IntImm> {
-    if (dtype.is_void()) {
+  // Returns the byte size of one vectorized element, or nullopt when the dtype is void.
+  auto get_size_bytes = [](DLDataType dtype) -> ffi::Optional<IntImm> {
+    if ((((dtype).code == kDLOpaqueHandle) && ((dtype).bits == 0) && ((dtype).lanes == 0))) {
       return std::nullopt;
     } else {
-      auto size_bits = dtype.bits() * dtype.lanes();
+      auto size_bits = ((dtype).bits) * static_cast<int16_t>((dtype).lanes);
       return IntImm::Int64((size_bits + 7) / 8);
     }
   };
@@ -199,8 +198,8 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<PrimExpr> input_nelements = get_num_elements(input_shape);
   ffi::Optional<PrimExpr> output_nelements = get_num_elements(output_shape);
 
-  ffi::Optional<IntImm> input_element_size = get_size_bytes(data_ty->dtype);
-  ffi::Optional<IntImm> output_element_size = get_size_bytes(output_dtype);
+  ffi::Optional<IntImm> input_element_size = get_size_bytes(data_ty->dtype->dtype);
+  ffi::Optional<IntImm> output_element_size = get_size_bytes(output_raw_dtype);
 
   if (input_nelements && output_nelements && input_element_size && output_element_size &&
       view_relative_byte_offset) {
@@ -329,8 +328,9 @@ Expr LowerBuiltinView(const BlockBuilder& bb, const Call& call) {
   }
 
   if (HasVoidType(dtype)) {
-    auto data_dtype = data->ty.as<TensorType>().value()->dtype;
-    TVM_FFI_ICHECK(!data_dtype.is_void())
+    DLDataType data_dtype = data->ty.as<TensorType>().value()->dtype->dtype;
+    TVM_FFI_ICHECK(!(((data_dtype).code == kDLOpaqueHandle) && ((data_dtype).bits == 0) &&
+                     ((data_dtype).lanes == 0)))
         << "Legalization of " << call->op
         << " requires that either the output dtype be explicitly specified, "
         << "or the input dtype is known.  "
diff --git a/src/relax/op/nn/attention.cc b/src/relax/op/nn/attention.cc
index 83080537c1d0..62e7d2959346 100644
--- a/src/relax/op/nn/attention.cc
+++ b/src/relax/op/nn/attention.cc
@@ -143,7 +143,7 @@ Type InferTypeAttention(const Call& call, const BlockBuilder& ctx) {
   return TensorType(ShapeExpr(output_shape), q_ty->dtype, q_ty->vdevice);
 }
 
-Call InferMixedPrecisionAttention(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionAttention(const Call& call, DLDataType out_dtype) {
   return attention(call->args[0], call->args[1], call->args[2], std::nullopt, std::nullopt,
                    std::nullopt, std::nullopt)
       .as_or_throw<Call>();
diff --git a/src/relax/op/nn/convolution.cc b/src/relax/op/nn/convolution.cc
index 1fa9b9b1ae94..90d58a9e662d 100644
--- a/src/relax/op/nn/convolution.cc
+++ b/src/relax/op/nn/convolution.cc
@@ -47,7 +47,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Expr conv1d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype) {
+            ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding1D(std::move(padding));
 
   TVM_FFI_ICHECK_GT(groups, 0)
@@ -62,7 +62,8 @@ Expr conv1d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int6
   return MakeConv<Conv1DAttrs>(std::move(data), std::move(weight), std::move(strides),
                                std::move(padding), std::move(dilation), groups, data_layout,
                                std::move(kernel_layout), out_layout.value_or(data_layout),
-                               out_dtype.value_or(DataType::Void()), /*op_name=*/"relax.nn.conv1d");
+                               out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0})),
+                               /*op_name=*/"relax.nn.conv1d");
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -91,9 +92,9 @@ Type InferTypeConv1d(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -186,7 +187,7 @@ InferLayoutOutput InferLayoutConv1d(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv1d(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv1d(const Call& call, DLDataType out_dtype) {
   const auto* conv1d_attrs = call->attrs.as<Conv1DAttrs>();
   return conv1d(call->args[0], call->args[1], conv1d_attrs->strides, conv1d_attrs->padding,
                 conv1d_attrs->dilation, conv1d_attrs->groups, conv1d_attrs->data_layout,
@@ -210,7 +211,7 @@ TVM_REGISTER_OP("relax.nn.conv1d")
 Expr conv2d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype) {
+            ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding2D(std::move(padding));
   if (strides.size() == 1) {
     strides.push_back(strides[0]);
@@ -231,7 +232,8 @@ Expr conv2d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int6
   return MakeConv<Conv2DAttrs>(std::move(data), std::move(weight), std::move(strides),
                                std::move(padding), std::move(dilation), groups, data_layout,
                                std::move(kernel_layout), out_layout.value_or(data_layout),
-                               out_dtype.value_or(DataType::Void()), /*op_name=*/"relax.nn.conv2d");
+                               out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0})),
+                               /*op_name=*/"relax.nn.conv2d");
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -260,9 +262,9 @@ Type InferTypeConv2d(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -336,9 +338,10 @@ InferLayoutOutput InferLayoutConv2d(
     SLayout desired_data_layout = (*it).second[0];
     SLayout desired_weight_layout = (*it).second[1];
     SLayout desired_output_layout = (*it).second.size() == 3 ? (*it).second[2] : (*it).second[0];
-    tirx::SLayout input_layout(attrs->data_layout, DataType::Int(64));
-    tirx::SLayout kernel_layout(attrs->kernel_layout, DataType::Int(64));
-    tirx::SLayout out_layout(attrs->out_layout, DataType::Int(64));
+    tvm::PrimType i64_ty = tvm::PrimType::Int(64);
+    tirx::SLayout input_layout(attrs->data_layout, i64_ty);
+    tirx::SLayout kernel_layout(attrs->kernel_layout, i64_ty);
+    tirx::SLayout out_layout(attrs->out_layout, i64_ty);
 
     if ((desired_data_layout.ndim() == input_layout.ndim()) &&
         (desired_weight_layout.ndim() == kernel_layout.ndim()) &&
@@ -396,7 +399,7 @@ InferLayoutOutput InferLayoutConv2d(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv2d(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv2d(const Call& call, DLDataType out_dtype) {
   const auto* conv2d_attrs = call->attrs.as<Conv2DAttrs>();
   return conv2d(call->args[0], call->args[1], conv2d_attrs->strides, conv2d_attrs->padding,
                 conv2d_attrs->dilation, conv2d_attrs->groups, conv2d_attrs->data_layout,
@@ -420,7 +423,7 @@ TVM_REGISTER_OP("relax.nn.conv2d")
 Expr conv3d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype) {
+            ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding3D(std::move(padding));
   if (strides.size() == 1) {
     strides.push_back(strides[0]);
@@ -443,7 +446,8 @@ Expr conv3d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int6
   return MakeConv<Conv3DAttrs>(std::move(data), std::move(weight), std::move(strides),
                                std::move(padding), std::move(dilation), groups, data_layout,
                                std::move(kernel_layout), out_layout.value_or(data_layout),
-                               out_dtype.value_or(DataType::Void()), /*op_name=*/"relax.nn.conv3d");
+                               out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0})),
+                               /*op_name=*/"relax.nn.conv3d");
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -472,9 +476,9 @@ Type InferTypeConv3d(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -581,7 +585,7 @@ InferLayoutOutput InferLayoutConv3d(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv3d(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv3d(const Call& call, DLDataType out_dtype) {
   const auto* conv3d_attrs = call->attrs.as<Conv3DAttrs>();
   return conv3d(call->args[0], call->args[1], conv3d_attrs->strides, conv3d_attrs->padding,
                 conv3d_attrs->dilation, conv3d_attrs->groups, conv3d_attrs->data_layout,
@@ -604,7 +608,7 @@ Expr conv1d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype) {
+                      ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding1D(std::move(padding));
 
   TVM_FFI_ICHECK_GT(groups, 0)
@@ -630,7 +634,7 @@ Expr conv1d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
   attrs->data_layout = data_layout;
   attrs->kernel_layout = std::move(kernel_layout);
   attrs->out_layout = out_layout.value_or(data_layout);
-  attrs->out_dtype = std::move(out_dtype.value_or(DataType::Void()));
+  attrs->out_dtype = out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   const Op& op = Op::Get("relax.nn.conv1d_transpose");
   return Call(op, {data, weight}, Attrs(attrs), {});
 }
@@ -660,9 +664,9 @@ Type InferTypeConv1dTranspose(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -758,7 +762,7 @@ InferLayoutOutput InferLayoutConv1dTranspose(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv1dTranspose(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv1dTranspose(const Call& call, DLDataType out_dtype) {
   const auto* conv1d_transpose_attrs = call->attrs.as<Conv1DTransposeAttrs>();
   return conv1d_transpose(call->args[0], call->args[1], conv1d_transpose_attrs->strides,
                           conv1d_transpose_attrs->padding, conv1d_transpose_attrs->output_padding,
@@ -786,7 +790,7 @@ Expr conv2d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype) {
+                      ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding2D(std::move(padding));
   if (output_padding.size() == 1) {
     output_padding.push_back(output_padding[0]);
@@ -821,7 +825,7 @@ Expr conv2d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
   attrs->data_layout = data_layout;
   attrs->kernel_layout = std::move(kernel_layout);
   attrs->out_layout = out_layout.value_or(data_layout);
-  attrs->out_dtype = std::move(out_dtype.value_or(DataType::Void()));
+  attrs->out_dtype = out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   const Op& op = Op::Get("relax.nn.conv2d_transpose");
   return Call(op, {data, weight}, Attrs(attrs), {});
 }
@@ -852,9 +856,9 @@ Type InferTypeConv2dTranspose(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -987,7 +991,7 @@ InferLayoutOutput InferLayoutConv2dTranspose(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv2dTranspose(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv2dTranspose(const Call& call, DLDataType out_dtype) {
   const auto* conv2d_transpose_attrs = call->attrs.as<Conv2DTransposeAttrs>();
   return conv2d_transpose(call->args[0], call->args[1], conv2d_transpose_attrs->strides,
                           conv2d_transpose_attrs->padding, conv2d_transpose_attrs->output_padding,
@@ -1015,7 +1019,7 @@ Expr conv3d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype) {
+                      ffi::Optional<DLDataType> out_dtype) {
   padding = GetCompletePadding3D(std::move(padding));
   if (output_padding.size() == 1) {
     output_padding.push_back(output_padding[0]);
@@ -1053,7 +1057,7 @@ Expr conv3d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
   attrs->data_layout = data_layout;
   attrs->kernel_layout = std::move(kernel_layout);
   attrs->out_layout = out_layout.value_or(data_layout);
-  attrs->out_dtype = std::move(out_dtype.value_or(DataType::Void()));
+  attrs->out_dtype = out_dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   const Op& op = Op::Get("relax.nn.conv3d_transpose");
   return Call(op, {data, weight}, Attrs(attrs), {});
 }
@@ -1084,9 +1088,9 @@ Type InferTypeConv3dTranspose(const Call& call, const BlockBuilder& ctx) {
   ffi::Optional<ShapeExpr> weight_shape =
       CheckNdimPerLayoutAndGetShape(call, ctx, weight_ty, weight_layout);
 
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, data_ty, weight_ty)
+                                    : attrs->out_dtype);
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, data_ty, weight_ty);
   if (!data_shape.defined() || !weight_shape.defined()) {
     return TensorType(out_dtype, out_layout.ndim(), vdevice);
@@ -1227,7 +1231,7 @@ InferLayoutOutput InferLayoutConv3dTranspose(
   return InferLayoutOutput({data_layout, weight_layout}, {output_layout}, Attrs(new_attrs));
 }
 
-Call InferMixedPrecisionConv3dTranspose(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionConv3dTranspose(const Call& call, DLDataType out_dtype) {
   const auto* conv3d_transpose_attrs = call->attrs.as<Conv3DTransposeAttrs>();
   return conv3d_transpose(call->args[0], call->args[1], conv3d_transpose_attrs->strides,
                           conv3d_transpose_attrs->padding, conv3d_transpose_attrs->output_padding,
diff --git a/src/relax/op/nn/convolution.h b/src/relax/op/nn/convolution.h
index b08eb8a83ff8..b33a19f07057 100644
--- a/src/relax/op/nn/convolution.h
+++ b/src/relax/op/nn/convolution.h
@@ -39,7 +39,7 @@ template <typename T>
 inline Expr MakeConv(Expr data, Expr weight, ffi::Array<int64_t> strides,
                      ffi::Array<int64_t> padding, ffi::Array<int64_t> dilation, int groups,
                      ffi::String data_layout, ffi::String kernel_layout, ffi::String out_layout,
-                     DataType out_dtype, std::string op_name) {
+                     DLDataType out_dtype, std::string op_name) {
   auto attrs = ffi::make_object<T>();
   attrs->strides = std::move(strides);
   attrs->padding = std::move(padding);
@@ -48,7 +48,7 @@ inline Expr MakeConv(Expr data, Expr weight, ffi::Array<int64_t> strides,
   attrs->data_layout = std::move(data_layout);
   attrs->kernel_layout = std::move(kernel_layout);
   attrs->out_layout = std::move(out_layout);
-  attrs->out_dtype = std::move(out_dtype);
+  attrs->out_dtype = out_dtype;
   const Op& op = Op::Get(op_name);
   return Call(op, {data, weight}, Attrs(attrs), {});
 }
@@ -57,19 +57,19 @@ inline Expr MakeConv(Expr data, Expr weight, ffi::Array<int64_t> strides,
 Expr conv1d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype);
+            ffi::Optional<DLDataType> out_dtype);
 
 /*! \brief 2D convolution */
 Expr conv2d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype);
+            ffi::Optional<DLDataType> out_dtype);
 
 /*! \brief 3D convolution */
 Expr conv3d(Expr data, Expr weight, ffi::Array<int64_t> strides, ffi::Array<int64_t> padding,
             ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
             ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-            ffi::Optional<DataType> out_dtype);
+            ffi::Optional<DLDataType> out_dtype);
 
 /*!
  * \brief One dimensional transposed convolution operator.
@@ -81,7 +81,7 @@ Expr conv1d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype);
+                      ffi::Optional<DLDataType> out_dtype);
 
 /*!
  * \brief Two dimensional transposed convolution operator.
@@ -93,7 +93,7 @@ Expr conv2d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype);
+                      ffi::Optional<DLDataType> out_dtype);
 
 /*!
  * \brief Three dimensional transposed convolution operator.
@@ -105,7 +105,7 @@ Expr conv3d_transpose(Expr data, Expr weight, ffi::Array<int64_t> strides,
                       ffi::Array<int64_t> padding, ffi::Array<int64_t> output_padding,
                       ffi::Array<int64_t> dilation, int groups, ffi::String data_layout,
                       ffi::String kernel_layout, ffi::Optional<ffi::String> out_layout,
-                      ffi::Optional<DataType> out_dtype);
+                      ffi::Optional<DLDataType> out_dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/op/nn/nn.cc b/src/relax/op/nn/nn.cc
index b24f81c72d49..c34f7afbc79d 100644
--- a/src/relax/op/nn/nn.cc
+++ b/src/relax/op/nn/nn.cc
@@ -122,7 +122,9 @@ Type InferTypePRelu(const Call& call, const BlockBuilder& ctx) {
   if (data_ty->IsUnknownNdim()) {
     return data_ty;
   }
-  if (!data_ty->IsUnknownDtype() && !data_ty->dtype.is_float()) {
+  PrimType data_dtype = data_ty->dtype;
+  // PRelu accepts only the kDLFloat type code (bfloat is rejected); lanes are not inspected.
+  if (!data_ty->IsUnknownDtype() && data_dtype.code() != DLDataTypeCode::kDLFloat) {
     TVM_FFI_VISIT_THROW(TypeError, call) << "Prelu requires the input tensor to have float "
                                             "dtype. However, the given input dtype is "
                                          << data_ty->dtype;
@@ -186,10 +188,14 @@ Type InferTypeSoftmax(const Call& call, const BlockBuilder& ctx) {
   if (data_ty->IsUnknownNdim()) {
     return data_ty;
   }
-  if (!data_ty->IsUnknownDtype() && !data_ty->dtype.is_float() && !data_ty->dtype.is_bfloat()) {
-    TVM_FFI_VISIT_THROW(TypeError, call) << "Softmax requires the input tensor to have float "
-                                            "dtype. However, the given input dtype is "
-                                         << data_ty->dtype;
+  if (!data_ty->IsUnknownDtype()) {
+    PrimType data_dtype = data_ty->dtype;
+    // Softmax only requires a floating element kind; lane encoding is irrelevant to the check.
+    if (data_dtype.code() != kDLFloat && data_dtype.code() != kDLBfloat) {
+      TVM_FFI_VISIT_THROW(TypeError, call) << "Softmax requires the input tensor to have float "
+                                              "dtype. However, the given input dtype is "
+                                           << data_ty->dtype;
+    }
   }
   const auto* attrs = call->attrs.as<SoftmaxAttrs>();
   NormalizeAxis(call, ctx, data_ty->ndim, attrs->axis);
@@ -380,10 +386,14 @@ bool NormCheckDtypeAndShape(const Call& call, const BlockBuilder& ctx,
     axes_non_neg = NormalizeAxes(call, ctx, data_ty->ndim, axes);
   }
   int n_axis = axes.size();
-  if (!data_ty->IsUnknownDtype() && (!data_ty->dtype.is_float() && !data_ty->dtype.is_bfloat())) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << op << " requires the input data to have float dtype. However, the given data dtype is "
-        << data_ty->dtype;
+  if (!data_ty->IsUnknownDtype()) {
+    PrimType data_dtype = data_ty->dtype;
+    // Norm ops only require a floating element kind; lane encoding is irrelevant to the check.
+    if (data_dtype.code() != kDLFloat && data_dtype.code() != kDLBfloat) {
+      TVM_FFI_VISIT_THROW(TypeError, call)
+          << op << " requires the input data to have float dtype. However, the given data dtype is "
+          << data_ty->dtype;
+    }
   }
   for (int i = 1; i < n_input; ++i) {
     if (input_ty[i]->dtype != data_ty->dtype) {
@@ -462,7 +472,7 @@ Type InferTypeBatchNorm(const Call& call, const BlockBuilder& ctx) {
   const auto* attrs = call->attrs.as<BatchNormAttrs>();
   bool unknown_shape = NormCheckDtypeAndShape(call, ctx, input_ty, {attrs->axis});
 
-  DataType dtype = input_ty[0]->dtype;
+  PrimType dtype = input_ty[0]->dtype;
   if (unknown_shape) {
     auto vdev = input_ty[0]->vdevice;
     return TupleType({TensorType(dtype, input_ty[0]->ndim, vdev),
@@ -620,7 +630,9 @@ Type InferTypeGroupNorm(const Call& call, const BlockBuilder& ctx) {
           << channel_axis << ", axes: " << attrs->axes;
     }
   }
-  if (!data_ty->IsUnknownDtype() && !data_ty->dtype.is_float()) {
+  PrimType data_dtype = data_ty->dtype;
+  // GroupNorm accepts only the kDLFloat type code (bfloat is rejected); lanes are not inspected.
+  if (!data_ty->IsUnknownDtype() && data_dtype.code() != DLDataTypeCode::kDLFloat) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << op << " expects that data must be float, but got " << data_ty->dtype;
   }
@@ -890,7 +902,7 @@ Type InferTypeCrossEntropy(const Call& call, const BlockBuilder& ctx) {
   TensorType label_ty = input_ty[1];
 
   // infer dtype
-  DataType dtype = InferBinaryArithOpOutDtype(call, ctx, pred_ty, label_ty);
+  PrimType dtype(InferBinaryArithOpOutDtype(call, ctx, pred_ty, label_ty));
 
   // infer vdevice
   ffi::Optional<VDevice> vdevice = InferBinaryArithOpOutVDevice(call, ctx, pred_ty, label_ty);
@@ -1002,23 +1014,26 @@ Type InferTypeNLLLoss(const Call& call, const BlockBuilder& ctx) {
   }
 
   // infer dtype, vdevice
-  DataType output_dtype;
-  ffi::Optional<VDevice> vdevice;
-  if (wgt_ty != nullptr) {
-    output_dtype = InferBinaryArithOpOutDtype(call, ctx, ffi::GetRef<TensorType>(pred_ty),
-                                              ffi::GetRef<TensorType>(wgt_ty));
-    vdevice = InferBinaryArithOpOutVDevice(call, ctx, ffi::GetRef<TensorType>(pred_ty),
-                                           ffi::GetRef<TensorType>(wgt_ty));
-  } else {
-    output_dtype = pred_ty->dtype;
-    vdevice = pred_ty->vdevice;
-  }
+  PrimType output_dtype =
+      wgt_ty != nullptr
+          ? PrimType(InferBinaryArithOpOutDtype(call, ctx, ffi::GetRef<TensorType>(pred_ty),
+                                                ffi::GetRef<TensorType>(wgt_ty)))
+          : pred_ty->dtype;
+  ffi::Optional<VDevice> vdevice =
+      wgt_ty != nullptr ? InferBinaryArithOpOutVDevice(call, ctx, ffi::GetRef<TensorType>(pred_ty),
+                                                       ffi::GetRef<TensorType>(wgt_ty))
+                        : pred_ty->vdevice;
 
   // the type of targets must be int/uint.
-  if (!tgt_ty->IsUnknownDtype() && !tgt_ty->dtype.is_int() && !tgt_ty->dtype.is_uint()) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << "NLLLoss expects the dtype of targets to be int/uint. However, the dtype of targets is "
-        << tgt_ty->dtype;
+  if (!tgt_ty->IsUnknownDtype()) {
+    PrimType target_dtype = tgt_ty->dtype;
+    // NLLLoss only needs the target element kind; vector lanes do not affect target indexing.
+    if (!target_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+        !target_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+      TVM_FFI_VISIT_THROW(TypeError, call) << "NLLLoss expects the dtype of targets to be "
+                                              "int/uint. However, the dtype of targets is "
+                                           << tgt_ty->dtype;
+    }
   }
 
   // infer ndim
diff --git a/src/relax/op/nn/pooling.cc b/src/relax/op/nn/pooling.cc
index 856cd75c5902..84f994bc612f 100644
--- a/src/relax/op/nn/pooling.cc
+++ b/src/relax/op/nn/pooling.cc
@@ -275,7 +275,8 @@ InferLayoutOutput InferLayoutPool2d(
   ffi::ObjectPtr<Pool2DAttrs> new_attrs = ffi::make_object<Pool2DAttrs>(*attrs);
 
   if (layout->layout.ndim() != layout->layout.ndim_primal()) {
-    tirx::SLayout in_layout(attrs->layout, DataType::Int(64));
+    tvm::PrimType i64_ty = tvm::PrimType::Int(64);
+    tirx::SLayout in_layout(attrs->layout, i64_ty);
     auto desired_layout = TransposeSubLayoutLike(attrs->layout, InitialLayout(4), layout->layout);
     auto data_si = GetType(call->args[0]);
     TensorType data_ty = data_si.as<TensorType>().value();
@@ -675,7 +676,8 @@ InferLayoutOutput InferLayoutAdaptiveAvgPool2D(
   LayoutDecision layout = GetLayoutDecision(var_layout_map, call->args[0]);
   ffi::ObjectPtr<AdaptivePool2DAttrs> new_attrs = ffi::make_object<AdaptivePool2DAttrs>(*attrs);
   if (layout->layout.ndim() != layout->layout.ndim_primal()) {
-    tirx::SLayout in_layout(attrs->layout, DataType::Int(64));
+    tvm::PrimType i64_ty = tvm::PrimType::Int(64);
+    tirx::SLayout in_layout(attrs->layout, i64_ty);
     auto desired_layout = TransposeSubLayoutLike(attrs->layout, InitialLayout(4), layout->layout);
     auto data_si = GetType(call->args[0]);
     TensorType data_ty = data_si.as<TensorType>().value();
diff --git a/src/relax/op/op.cc b/src/relax/op/op.cc
index 9c58ab769950..16e5d5f20d0e 100644
--- a/src/relax/op/op.cc
+++ b/src/relax/op/op.cc
@@ -409,9 +409,9 @@ static ffi::Optional<Type> InferCallTIROutputTypeFromArguments(
       TVM_FFI_ICHECK(packed_tuple_ty);
       PrimType dummy_arg_ty = [&]() {
         if (packed_tuple_ty->values) {
-          return PrimType(packed_tuple_ty->values.value()[i].dtype());
+          return PrimType(packed_tuple_ty->values.value()[i].ty());
         } else {
-          return PrimType(DataType::Int(64));
+          return PrimType::Int(64);
         }
       }();
       dummy_args.push_back(Var("dummy_trailing_arg", dummy_arg_ty));
@@ -1119,7 +1119,7 @@ Type InferTypeSize(const Call& call, const BlockBuilder& ctx) {
   auto* tensor_ty = GetType(call->args[0]).as<TensorTypeNode>();
   TVM_FFI_ICHECK(tensor_ty) << "size expects a tensor input, but received " << arg_ty
                             << "; use MatchCast if necessary";
-  return TensorType(ShapeExpr(ffi::Array<PrimExpr>{}), DataType::Int(64));
+  return TensorType(ShapeExpr(ffi::Array<PrimExpr>{}), PrimType::Int(64));
 }
 
 TVM_REGISTER_OP("relax.size")
@@ -1182,7 +1182,7 @@ Type ReturnShapeToTensorType(const Call& call, const BlockBuilder& ctx) {
   const auto* ty = GetTypeAs<ShapeTypeNode>(call->args[0]);
   TVM_FFI_ICHECK(ty);
   int32_t ndim = ty->ndim;
-  return TensorType(ShapeExpr({PrimExpr(ndim)}), DataType::Int(64));
+  return TensorType(ShapeExpr({PrimExpr(ndim)}), PrimType::Int(64));
 }
 
 TVM_REGISTER_OP("relax.shape_to_tensor")
@@ -1209,10 +1209,10 @@ Type InferTypeAllocateTensor(const Call& call, const BlockBuilder& ctx) {
       << "must be ShapeExpr, but got " << call->args[0]->GetTypeKey();
   TVM_FFI_ICHECK(call->args[1].as<DataTypeImmNode>())
       << "must be DataTypeImm, but got " << call->args[1]->GetTypeKey();
-  DataType out_dtype;
+  PrimType out_dtype = PrimType::Void();
   if (const auto* dtype_node = call->args[1].as<DataTypeImmNode>()) {
     const DataTypeImm dtype_imm = ffi::GetRef<DataTypeImm>(dtype_node);
-    out_dtype = dtype_imm->value;
+    out_dtype = PrimType(dtype_imm->value);
   }
   int64_t vdevice_index = -1;
   if (auto* prim_value_node = call->args[2].as<PrimValueNode>()) {
@@ -1284,10 +1284,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Type InferTypeMemAllocTensor(const Call& call, const BlockBuilder& ctx) {
   TVM_FFI_ICHECK(GetTypeAs<ShapeTypeNode>(call->args[2]))
       << "must be a Expr of ShapeType, but got " << call->args[1]->GetTypeKey();
-  DataType out_dtype;
+  PrimType out_dtype = PrimType::Void();
   if (const auto* dtype_node = call->args[3].as<DataTypeImmNode>()) {
     const DataTypeImm dtype_imm = ffi::GetRef<DataTypeImm>(dtype_node);
-    out_dtype = dtype_imm->value;
+    out_dtype = PrimType(dtype_imm->value);
   }
 
   if (call->args.size() == 5) {
@@ -1408,10 +1408,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // vm alloc_tensor
 
 Type InferTypeVMAllocTensor(const Call& call, const BlockBuilder& ctx) {
-  DataType out_dtype;
+  PrimType out_dtype = PrimType::Void();
   if (const auto* dtype_node = call->args[3].as<DataTypeImmNode>()) {
     const DataTypeImm dtype_imm = ffi::GetRef<DataTypeImm>(dtype_node);
-    out_dtype = dtype_imm->value;
+    out_dtype = PrimType(dtype_imm->value);
   }
   int64_t vdevice_index = -1;
   if (auto* prim_value_node = call->args[4].as<PrimValueNode>()) {
diff --git a/src/relax/op/op_common.h b/src/relax/op/op_common.h
index cb0d6034e2d1..a19f59d4d56a 100644
--- a/src/relax/op/op_common.h
+++ b/src/relax/op/op_common.h
@@ -33,6 +33,7 @@
 
 #include <optional>
 #include <tuple>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -184,14 +185,12 @@ std::tuple<ArgTypes...> GetArgType(const Call& call, const BlockBuilder& ctx) {
     tvm::ffi::reflection::GlobalDef().def("relax.op." OpRegName, OpName); \
   }
 
-/************ Utilities ************/
-
 /*!
  * \brief Infer the type for unary elementwise ops.
  * \param call The context Call to the operator.
  * \param ctx The error reporting context.
  * \param f_compute_out_dtype The function to compute the output dtype, with
- * signature DataType f_compute_out_dtype(const TensorType& input_ty).
+ * signature R f_compute_out_dtype(const TensorType& input_ty), where R is DLDataType or PrimType.
  * \tparam require_float_dtype whether this op requires the input dtype to be float
  * \tparam Ftype the type of f_compute_out_dtype
  * \return The inferred type.
@@ -199,15 +198,21 @@ std::tuple<ArgTypes...> GetArgType(const Call& call, const BlockBuilder& ctx) {
 template <bool require_float_dtype, typename FType>
 inline Type InferTypeUnary(const Call& call, const BlockBuilder& ctx, FType f_compute_out_dtype) {
   TensorType input_ty = GetUnaryInputTensorType(call, ctx);
+  DLDataType input_dtype = input_ty->dtype->dtype;
   if (require_float_dtype && !input_ty->IsUnknownDtype() &&
-      (!input_ty->dtype.is_float() && !input_ty->dtype.is_bfloat())) {
+      (input_dtype.code != kDLFloat && input_dtype.code != kDLBfloat)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << call->op
         << " requires the input tensor to have float dtype. However, the given input dtype is "
         << input_ty->dtype;
   }
   auto output_ty = ffi::make_object<TensorTypeNode>(*input_ty.get());
-  output_ty->dtype = f_compute_out_dtype(input_ty);
+  auto computed_dtype = f_compute_out_dtype(input_ty);
+  if constexpr (std::is_same_v<std::decay_t<decltype(computed_dtype)>, PrimType>) {
+    output_ty->dtype = computed_dtype;
+  } else {
+    output_ty->dtype = PrimType(computed_dtype);
+  }
   if (call->ty_args.size() > 0) {
     auto defined_ty = call->ty_args[0].as<TensorTypeNode>();
     TVM_FFI_ICHECK(defined_ty);
@@ -274,9 +279,9 @@ InferLayoutOutput InferLayoutUnaryEwise(
  * \return The inferred element dtype.
  * \throw Throw exception if the Type doesn't have an element type.
  */
-inline std::optional<DataType> GetElementDType(const Type& ty) {
+inline std::optional<PrimType> GetElementDType(const Type& ty) {
   if (const auto* prim = ty.as<PrimTypeNode>()) {
-    return prim->dtype;
+    return ffi::GetRef<PrimType>(prim);
   } else if (const auto* tensor = ty.as<TensorTypeNode>()) {
     return tensor->dtype;
   } else {
@@ -296,8 +301,8 @@ inline std::optional<DataType> GetElementDType(const Type& ty) {
  * \return The inferred output dtype.
  * \throw Throw exception if the dtype of two input TensorType don’t match
  */
-inline DataType InferBinaryArithOpOutDtype(const Call& call, const BlockBuilder& ctx,
-                                           const Type& lhs_ty, const Type& rhs_ty) {
+inline DLDataType InferBinaryArithOpOutDtype(const Call& call, const BlockBuilder& ctx,
+                                             const Type& lhs_ty, const Type& rhs_ty) {
   auto opt_lhs_dtype = GetElementDType(lhs_ty);
   if (!opt_lhs_dtype) {
     TVM_FFI_VISIT_THROW(TypeError, call)
@@ -318,15 +323,17 @@ inline DataType InferBinaryArithOpOutDtype(const Call& call, const BlockBuilder&
   }
   auto rhs_dtype = opt_rhs_dtype.value();
 
-  if (lhs_dtype.is_void() || rhs_dtype.is_void()) {
-    return DataType::Void();
-  } else if (lhs_dtype != rhs_dtype && !lhs_dtype.is_bool() && !rhs_dtype.is_bool()) {
+  if (lhs_dtype.IsVoid() || rhs_dtype.IsVoid()) {
+    return DLDataType{kDLOpaqueHandle, 0, 0};
+  } else if (lhs_dtype->dtype != rhs_dtype->dtype &&
+             !lhs_dtype.MatchesCode(DLDataTypeCode::kDLBool) &&
+             !rhs_dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Binary operators must have the same datatype for both operands.  "
         << "However, " << call << " uses datatype " << lhs_dtype << " on the LHS (Type of "
         << lhs_ty << "), and datatype " << rhs_dtype << " on the RHS (Type of " << rhs_ty << ").";
   }
-  return lhs_dtype;
+  return lhs_dtype->dtype;
 }
 
 /*!
@@ -469,7 +476,7 @@ bool IsIdentityPermutation(const std::vector<int>& permutation);
  */
 inline ffi::Array<IntImm> ConvertIntImmToInt64(const ffi::Array<IntImm>& int_imms) {
   return int_imms.Map(
-      [](const IntImm& i) { return cast(DataType::Int(64), i).as_or_throw<IntImm>(); });
+      [](const IntImm& i) { return cast(PrimType::Int(64), i).as_or_throw<IntImm>(); });
 }
 
 /************ Utilities for NN operators ************/
@@ -560,8 +567,9 @@ inline ffi::Array<int64_t> GetCompletePadding3D(ffi::Array<int64_t> padding) {
 inline std::pair<tirx::SLayout, tirx::SBijectiveLayout> CheckTensorLayout(
     const Call& call, const BlockBuilder& ctx, const ffi::String& tensor_layout,
     const ffi::String& tgt_layout, const ffi::String& tensor_name) {
-  tirx::SLayout _tensor_layout(tensor_layout, DataType::Int(64));
-  tirx::SBijectiveLayout tensor2tgt(_tensor_layout, tirx::SLayout(tgt_layout, DataType::Int(64)));
+  tvm::PrimType i64_ty = tvm::PrimType::Int(64);
+  tirx::SLayout _tensor_layout(tensor_layout, i64_ty);
+  tirx::SBijectiveLayout tensor2tgt(_tensor_layout, tirx::SLayout(tgt_layout, i64_ty));
   if (!tensor2tgt.defined()) {
     TVM_FFI_VISIT_THROW(ValueError, call)
         << call->op << " requires the given " << tensor_name << " layout to be convertible from "
diff --git a/src/relax/op/tensor/binary.cc b/src/relax/op/tensor/binary.cc
index 84c411238473..cbc786de0f8e 100644
--- a/src/relax/op/tensor/binary.cc
+++ b/src/relax/op/tensor/binary.cc
@@ -51,11 +51,11 @@ Type InferTypeBroadcast(const Call& call, const BlockBuilder& ctx, FType f_compu
       << "Arguments to binary operators must be either R.Tensor or R.Prim types, "
       << "but expression " << call << " has RHS " << call->args[1] << ", which has Type " << rhs_ty;
 
-  // DateType
-  DataType output_dtype = f_compute_out_dtype(call, ctx, lhs_ty, rhs_ty);
+  // Dtype
+  PrimType output_dtype(f_compute_out_dtype(call, ctx, lhs_ty, rhs_ty));
 
   if (lhs_ty.as<PrimTypeNode>() && rhs_ty.as<PrimTypeNode>()) {
-    return PrimType(output_dtype);
+    return output_dtype;
   }
 
   // VDevice
@@ -136,7 +136,7 @@ Type InferTypeBroadcastArith(const Call& call, const BlockBuilder& ctx) {
 Type InferTypeBroadcastCMP(const Call& call, const BlockBuilder& ctx) {
   return InferTypeBroadcast(call, ctx,
                             [](const Call& call, const BlockBuilder& ctx, const Type& lhs_ty,
-                               const Type& rhs_ty) { return DataType::Bool(); });
+                               const Type& rhs_ty) { return DLDataType{kDLBool, 8, 1}; });
 }
 
 InferLayoutOutput InferLayoutBinaryEwise(
diff --git a/src/relax/op/tensor/create.cc b/src/relax/op/tensor/create.cc
index e7a972896569..fbe3a0b0c534 100644
--- a/src/relax/op/tensor/create.cc
+++ b/src/relax/op/tensor/create.cc
@@ -46,7 +46,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 /* relax.full */
 Expr full(ffi::Variant<Expr, ffi::Array<PrimExpr>> shape, Expr fill_value,
-          ffi::Optional<DataType> dtype) {
+          ffi::Optional<DLDataType> dtype) {
   Expr shape_in_expr{nullptr};
   if (const auto* expr = shape.as<ExprNode>()) {
     shape_in_expr = ffi::GetRef<Expr>(expr);
@@ -59,7 +59,7 @@ Expr full(ffi::Variant<Expr, ffi::Array<PrimExpr>> shape, Expr fill_value,
   }
 
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
-  attrs->dtype = dtype.value_or(DataType::Void());
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
 
   static const Op& op = Op::Get("relax.full");
   return Call(op, {std::move(shape_in_expr), std::move(fill_value)}, Attrs(attrs), {});
@@ -88,7 +88,8 @@ Type InferTypeFull(const Call& call, const BlockBuilder& ctx) {
   }
 
   const auto* attrs = call->attrs.as<InitAttrs>();
-  DataType out_dtype = attrs->dtype.is_void() ? fill_value_ty->dtype : attrs->dtype;
+  PrimType out_dtype = attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0} ? fill_value_ty->dtype
+                                                                         : PrimType(attrs->dtype);
   return TensorType(/*shape=*/call->args[0], out_dtype, fill_value_ty->vdevice);
 }
 
@@ -104,9 +105,9 @@ TVM_REGISTER_OP("relax.full")
     .set_attr<bool>("FPurity", true);
 
 /* relax.full_like */
-Expr full_like(Expr x, Expr fill_value, ffi::Optional<DataType> dtype) {
+Expr full_like(Expr x, Expr fill_value, ffi::Optional<DLDataType> dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
-  attrs->dtype = dtype.value_or(DataType::Void());
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   static const Op& op = Op::Get("relax.full_like");
   return Call(op, {std::move(x), std::move(fill_value)}, Attrs(attrs), {});
 }
@@ -127,11 +128,11 @@ Type InferTypeFullLike(const Call& call, const BlockBuilder& ctx) {
   }
 
   const auto* attrs = call->attrs.as<InitAttrs>();
-  if (attrs->dtype.is_void()) {
+  if (attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0}) {
     return data_ty;
   } else {
     auto output_ty = ffi::make_object<TensorTypeNode>(*data_ty.get());
-    output_ty->dtype = attrs->dtype;
+    output_ty->dtype = PrimType(attrs->dtype);
     return TensorType(output_ty);
   }
 }
@@ -158,25 +159,26 @@ Type InferTypeOnesZeros(const Call& call, const BlockBuilder& ctx) {
         << call->args[0]->ty->GetTypeKey();
   }
   const auto* attrs = call->attrs.as<InitAttrs>();
-  return TensorType(/*shape=*/call->args[0], attrs->dtype);
+  return TensorType(/*shape=*/call->args[0], PrimType(attrs->dtype));
 }
 
 // Structure info inference for ones_like and zeros_like
 Type InferTypeOnesLikeZerosLike(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = GetUnaryInputTensorType(call, ctx);
   const auto* attrs = call->attrs.as<InitAttrs>();
-  if (attrs->dtype.is_void()) {
+  if (attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0}) {
     return data_ty;
   } else {
     auto output_ty = ffi::make_object<TensorTypeNode>(*data_ty.get());
-    output_ty->dtype = attrs->dtype;
+    output_ty->dtype = PrimType(attrs->dtype);
     return TensorType(output_ty);
   }
 }
 
 /* relax.ones & relax.ones_like */
-Expr ones(Expr shape, DataType dtype) {
-  TVM_FFI_ICHECK(!dtype.is_void()) << "Ones op expects the input dtype not to be void";
+Expr ones(Expr shape, DLDataType dtype) {
+  TVM_FFI_ICHECK((dtype != DLDataType{kDLOpaqueHandle, 0, 0}))
+      << "Ones op expects the input dtype not to be void";
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
   attrs->dtype = dtype;
 
@@ -184,9 +186,9 @@ Expr ones(Expr shape, DataType dtype) {
   return Call(op, {std::move(shape)}, Attrs(attrs), {});
 }
 
-Expr ones_like(Expr x, ffi::Optional<DataType> dtype) {
+Expr ones_like(Expr x, ffi::Optional<DLDataType> dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
-  attrs->dtype = dtype.value_or(DataType::Void());
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   static const Op& op = Op::Get("relax.ones_like");
   return Call(op, {std::move(x)}, Attrs(attrs), {});
 }
@@ -212,8 +214,9 @@ TVM_REGISTER_OP("relax.ones_like")
     .set_attr<bool>("FPurity", true);
 
 /* relax.zeros & relax.zeros_like */
-Expr zeros(Expr shape, DataType dtype) {
-  TVM_FFI_ICHECK(!dtype.is_void()) << "Zeros op expects the input dtype not to be void";
+Expr zeros(Expr shape, DLDataType dtype) {
+  TVM_FFI_ICHECK((dtype != DLDataType{kDLOpaqueHandle, 0, 0}))
+      << "Zeros op expects the input dtype not to be void";
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
   attrs->dtype = dtype;
 
@@ -221,9 +224,9 @@ Expr zeros(Expr shape, DataType dtype) {
   return Call(op, {std::move(shape)}, Attrs(attrs), {});
 }
 
-Expr zeros_like(Expr x, ffi::Optional<DataType> dtype) {
+Expr zeros_like(Expr x, ffi::Optional<DLDataType> dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
-  attrs->dtype = dtype.value_or(DataType::Void());
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   static const Op& op = Op::Get("relax.zeros_like");
   return Call(op, {std::move(x)}, Attrs(attrs), {});
 }
@@ -249,16 +252,16 @@ TVM_REGISTER_OP("relax.zeros_like")
     .set_attr<bool>("FPurity", true);
 
 /* relax.eye & relax.eye_like */
-Expr eye(PrimValue n, PrimValue m, PrimValue k, DataType dtype) {
+Expr eye(PrimValue n, PrimValue m, PrimValue k, DLDataType dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
   attrs->dtype = dtype;
   static const Op& op = Op::Get("relax.eye");
   return Call(op, {std::move(n), std::move(m), std::move(k)}, Attrs(attrs), {});
 }
 
-Expr eye_like(Expr x, PrimValue k, ffi::Optional<DataType> dtype) {
+Expr eye_like(Expr x, PrimValue k, ffi::Optional<DLDataType> dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
-  attrs->dtype = dtype.value_or(DataType::Void());
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   static const Op& op = Op::Get("relax.eye_like");
   return Call(op, {std::move(x), std::move(k)}, Attrs(attrs), {});
 }
@@ -285,8 +288,8 @@ Type InferTypeEye(const Call& call, const BlockBuilder& ctx) {
   PrimExpr n = get_prim_value(call->args[0], "n");
   PrimExpr m = get_prim_value(call->args[1], "m");
 
-  DataType dtype = call->attrs.as<InitAttrs>()->dtype;
-  return TensorType(ShapeExpr({n, m}), dtype);
+  DLDataType dtype = call->attrs.as<InitAttrs>()->dtype;
+  return TensorType(ShapeExpr({n, m}), PrimType(dtype));
 }
 
 Type InferTypeEyeLike(const Call& call, const BlockBuilder& ctx) {
@@ -309,7 +312,8 @@ Type InferTypeEyeLike(const Call& call, const BlockBuilder& ctx) {
   }
 
   const auto* attrs = call->attrs.as<InitAttrs>();
-  DataType out_dtype = attrs->dtype.is_void() ? x_ty->dtype : attrs->dtype;
+  PrimType out_dtype =
+      attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0} ? x_ty->dtype : PrimType(attrs->dtype);
 
   return TensorType(x_ty->shape.value(), out_dtype, x_ty->vdevice);
 }
@@ -333,7 +337,7 @@ TVM_REGISTER_OP("relax.eye_like")
     .set_attr<bool>("FPurity", true);
 
 /* relax.arange */
-Expr arange(PrimValue start, PrimValue stop, PrimValue step, DataType dtype) {
+Expr arange(PrimValue start, PrimValue stop, PrimValue step, DLDataType dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
   attrs->dtype = dtype;
   static const Op& op = Op::Get("relax.arange");
@@ -362,17 +366,18 @@ Type InferTypeArange(const Call& call, const BlockBuilder& ctx) {
   PrimExpr start = get_prim_value(call->args[0], "start");
   PrimExpr end = get_prim_value(call->args[1], "end");
   PrimExpr step = get_prim_value(call->args[2], "step");
-  DataType dtype = call->attrs.as<InitAttrs>()->dtype;
+  DLDataType dtype = call->attrs.as<InitAttrs>()->dtype;
   PrimExpr num_elem;
-  if (start.dtype().is_int() && end.dtype().is_int() && step.dtype().is_int()) {
+  if (start.ty().code() == DLDataTypeCode::kDLInt && end.ty().code() == DLDataTypeCode::kDLInt &&
+      step.ty().code() == DLDataTypeCode::kDLInt) {
     num_elem = tvm::floordiv((end - start + step - 1), step);
   } else {
-    num_elem = tvm::cast(tvm::DataType::Int(64),
-                         tvm::ceil(tvm::cast(tvm::DataType::Float(32), end - start) / step));
+    num_elem = tvm::cast(tvm::PrimType::Int(64),
+                         tvm::ceil(tvm::cast(tvm::PrimType::Float(32), end - start) / step));
   }
   arith::Analyzer analyzer;
   num_elem = analyzer->Simplify(num_elem);
-  return TensorType(ShapeExpr({num_elem}), dtype);
+  return TensorType(ShapeExpr({num_elem}), PrimType(dtype));
 }
 
 TVM_REGISTER_OP("relax.arange")
@@ -387,7 +392,7 @@ TVM_REGISTER_OP("relax.arange")
 
 /* relax.hamming_window */
 Expr hamming_window(PrimValue window_size, PrimValue periodic, PrimValue alpha, PrimValue beta,
-                    DataType dtype) {
+                    DLDataType dtype) {
   ffi::ObjectPtr<InitAttrs> attrs = ffi::make_object<InitAttrs>();
   attrs->dtype = dtype;
   static const Op& op = Op::Get("relax.hamming_window");
@@ -401,8 +406,8 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 }
 
 Type InferTypeHammingWindow(const Call& call, const BlockBuilder& ctx) {
-  DataType dtype = call->attrs.as<InitAttrs>()->dtype;
-  if (dtype.is_int() || dtype.is_uint() || dtype.is_uint()) {
+  DLDataType dtype = call->attrs.as<InitAttrs>()->dtype;
+  if (dtype.code == DLDataTypeCode::kDLInt || dtype.code == DLDataTypeCode::kDLUInt) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Hamming Window expects the datatype to be float but got " << dtype;
   }
@@ -422,7 +427,7 @@ Type InferTypeHammingWindow(const Call& call, const BlockBuilder& ctx) {
         << window_size;
   }
   window_size = analyzer->Simplify(window_size);
-  return TensorType(ShapeExpr({window_size}), dtype);
+  return TensorType(ShapeExpr({window_size}), PrimType(dtype));
 }
 
 TVM_REGISTER_OP("relax.hamming_window")
diff --git a/src/relax/op/tensor/create.h b/src/relax/op/tensor/create.h
index 284448111739..497a535a4d0f 100644
--- a/src/relax/op/tensor/create.h
+++ b/src/relax/op/tensor/create.h
@@ -42,7 +42,7 @@ namespace relax {
  * \return The result tensor.
  */
 Expr full(ffi::Variant<Expr, ffi::Array<PrimExpr>> shape, Expr fill_value,
-          ffi::Optional<DataType> dtype);
+          ffi::Optional<DLDataType> dtype);
 
 /*!
  * \brief Construct a tensor such that
@@ -55,7 +55,7 @@ Expr full(ffi::Variant<Expr, ffi::Array<PrimExpr>> shape, Expr fill_value,
  * void, the input tensor's dtype will be used.
  * \return The result tensor.
  */
-Expr full_like(Expr x, Expr fill_value, ffi::Optional<DataType> dtype);
+Expr full_like(Expr x, Expr fill_value, ffi::Optional<DLDataType> dtype);
 
 /*!
  * \brief Construct a tensor of all ones, with the input shape and dtype.
@@ -63,7 +63,7 @@ Expr full_like(Expr x, Expr fill_value, ffi::Optional<DataType> dtype);
  * \param dtype The data type of the created tensor.
  * \return The result tensor.
  */
-Expr ones(Expr shape, DataType dtype);
+Expr ones(Expr shape, DLDataType dtype);
 
 /*!
  * \brief Construct a tensor with all ones, with shape of the input tensor shape.
@@ -73,7 +73,7 @@ Expr ones(Expr shape, DataType dtype);
  * void, the input tensor's dtype will be used.
  * \return The result tensor.
  */
-Expr ones_like(Expr x, ffi::Optional<DataType> dtype);
+Expr ones_like(Expr x, ffi::Optional<DLDataType> dtype);
 
 /*!
  * \brief Construct a tensor of all zeros, with the input shape and dtype.
@@ -81,7 +81,7 @@ Expr ones_like(Expr x, ffi::Optional<DataType> dtype);
  * \param dtype The data type of the created tensor.
  * \return The result tensor.
  */
-Expr zeros(Expr shape, DataType dtype);
+Expr zeros(Expr shape, DLDataType dtype);
 
 /*!
  * \brief Construct a tensor with all zeros, with shape of the input tensor shape.
@@ -91,7 +91,7 @@ Expr zeros(Expr shape, DataType dtype);
  * void, the input tensor's dtype will be used.
  * \return The result tensor.
  */
-Expr zeros_like(Expr x, ffi::Optional<DataType> dtype);
+Expr zeros_like(Expr x, ffi::Optional<DLDataType> dtype);
 
 /*!
  * \brief Construct a 2-D tensor with ones on the diagonal and zeros elsewhere.
@@ -102,7 +102,7 @@ Expr zeros_like(Expr x, ffi::Optional<DataType> dtype);
  * \param dtype The data type of the created tensor.
  * \return The result tensor.
  */
-Expr eye(PrimValue n, PrimValue m, PrimValue k, DataType dtype);
+Expr eye(PrimValue n, PrimValue m, PrimValue k, DLDataType dtype);
 
 /*!
  * \brief Construct a tensor with ones on the diagonal and zeros elsewhere,
@@ -115,10 +115,10 @@ Expr eye(PrimValue n, PrimValue m, PrimValue k, DataType dtype);
  * void, the input tensor's dtype will be used.
  * \return The result tensor.
  */
-Expr eye_like(Expr x, PrimValue k, ffi::Optional<DataType> dtype);
+Expr eye_like(Expr x, PrimValue k, ffi::Optional<DLDataType> dtype);
 
 /*! \brief Construct a tensor with evenly spaced elements. */
-Expr arange(PrimValue start, PrimValue stop, PrimValue step, DataType dtype);
+Expr arange(PrimValue start, PrimValue stop, PrimValue step, DLDataType dtype);
 
 /*!
  * \brief Hamming window function.
@@ -131,7 +131,7 @@ Expr arange(PrimValue start, PrimValue stop, PrimValue step, DataType dtype);
  * \return The result tensor.
  */
 Expr hamming_window(PrimValue window_size, PrimValue periodic, PrimValue alpha, PrimValue beta,
-                    DataType dtype);
+                    DLDataType dtype);
 
 /*! \brief Return the lower triangular part of a matrix or a batch of matrices. */
 Expr tril(Expr x, Expr k);
diff --git a/src/relax/op/tensor/datatype.cc b/src/relax/op/tensor/datatype.cc
index 907dffb0b3f3..ec1043a025e1 100644
--- a/src/relax/op/tensor/datatype.cc
+++ b/src/relax/op/tensor/datatype.cc
@@ -38,7 +38,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 /* relax.astype */
 
-Expr astype(Expr x, DataType dtype) {
+Expr astype(Expr x, DLDataType dtype) {
   ffi::ObjectPtr<AstypeAttrs> attrs = ffi::make_object<AstypeAttrs>();
   attrs->dtype = dtype;
 
@@ -55,7 +55,7 @@ Type InferTypeAstype(const Call& call, const BlockBuilder& ctx) {
   TensorType ty = GetUnaryInputTensorType(call, ctx);
   const auto* attrs = call->attrs.as<AstypeAttrs>();
   ffi::ObjectPtr<TensorTypeNode> new_ty = ffi::make_object<TensorTypeNode>(*ty.get());
-  new_ty->dtype = attrs->dtype;
+  new_ty->dtype = PrimType(attrs->dtype);
   return TensorType(new_ty);
 }
 
@@ -70,7 +70,7 @@ TVM_REGISTER_OP("relax.astype")
 
 /* relax.wrap_param */
 
-Expr MakeWrapParam(Expr data, DataType dtype) {
+Expr MakeWrapParam(Expr data, DLDataType dtype) {
   ffi::ObjectPtr<WrapParamAttrs> attrs = ffi::make_object<WrapParamAttrs>();
   attrs->dtype = dtype;
 
@@ -87,7 +87,7 @@ Type InferTypeWrapParam(const Call& call, const BlockBuilder& ctx) {
   TensorType ty = GetUnaryInputTensorType(call, ctx);
   const auto* attrs = call->attrs.as<WrapParamAttrs>();
   ffi::ObjectPtr<TensorTypeNode> new_ty = ffi::make_object<TensorTypeNode>(*ty.get());
-  new_ty->dtype = attrs->dtype;
+  new_ty->dtype = PrimType(attrs->dtype);
   return TensorType(new_ty);
 }
 
diff --git a/src/relax/op/tensor/datatype.h b/src/relax/op/tensor/datatype.h
index b612c45fc941..db2ee396c0d6 100644
--- a/src/relax/op/tensor/datatype.h
+++ b/src/relax/op/tensor/datatype.h
@@ -37,7 +37,7 @@ namespace relax {
  * \param dtype The target data type
  * \return The casted result.
  */
-Expr astype(Expr x, DataType dtype);
+Expr astype(Expr x, DLDataType dtype);
 
 /*!
  * \brief A wrapper to wrap the input const tensor to the given data type.
@@ -45,7 +45,7 @@ Expr astype(Expr x, DataType dtype);
  * \param dtype The target data type
  * \return The wrapped result.
  */
-Expr wrap_param(Expr x, DataType dtype);
+Expr wrap_param(Expr x, DLDataType dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/op/tensor/index.cc b/src/relax/op/tensor/index.cc
index 515f37126183..e42feb0ae06c 100644
--- a/src/relax/op/tensor/index.cc
+++ b/src/relax/op/tensor/index.cc
@@ -72,7 +72,7 @@ Type InferTypeTake(const Call& call, const BlockBuilder& ctx) {
     if (auto tensor_ty = ty.as<TensorType>()) {
       return tensor_ty.value();
     } else if (auto prim_ty = ty.as<PrimTypeNode>()) {
-      return TensorType(ShapeExpr(ffi::Array<PrimExpr>{}), prim_ty->dtype);
+      return TensorType(ShapeExpr(ffi::Array<PrimExpr>{}), ffi::GetRef<PrimType>(prim_ty));
     } else {
       TVM_FFI_VISIT_THROW(TypeError, call)
           << "Operator " << call->op << " requires the indices argument to be "
@@ -84,11 +84,15 @@ Type InferTypeTake(const Call& call, const BlockBuilder& ctx) {
 
   if (indices_ty->IsUnknownDtype()) {
     LOG(WARNING) << "Data type of indices has not been specified. Assume it has an integer type.";
-  } else if (!(indices_ty->dtype.is_int() || indices_ty->dtype.is_uint())) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << "Take op requires the input indices to have integer dtype. However, the "
-           "given indices dtype is "
-        << indices_ty->dtype;
+  } else {
+    PrimType indices_dtype = indices_ty->dtype;
+    if (!indices_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+        !indices_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+      TVM_FFI_VISIT_THROW(TypeError, call)
+          << "Take op requires the input indices to have integer dtype. However, the "
+             "given indices dtype is "
+          << indices_ty->dtype;
+    }
   }
 
   const auto* attrs = call->attrs.as<TakeAttrs>();
@@ -309,7 +313,7 @@ Type InferTypeStridedSlice(const Call& call, const BlockBuilder& ctx) {
     }
   }();
 
-  TVM_FFI_ICHECK(IsBaseOf(relax::TensorType(DataType::Void(), kUnknownNDim), GetType(data)))
+  TVM_FFI_ICHECK(IsBaseOf(relax::TensorType(PrimType::Void(), kUnknownNDim), GetType(data)))
       << "Operator " << call->op << " requires the first argument to be a tensor.  "
       << "However, in expression " << call << ", the first argument " << data << " has type "
       << GetType(data);
@@ -325,9 +329,8 @@ Type InferTypeStridedSlice(const Call& call, const BlockBuilder& ctx) {
     const auto* tuple = ty.as<TupleTypeNode>();
     if (!tuple) return false;
 
-    return std::all_of(tuple->fields.begin(), tuple->fields.end(), [](const Type& field) {
-      return IsBaseOf(tvm::PrimType(DataType::Int(64)), field);
-    });
+    return std::all_of(tuple->fields.begin(), tuple->fields.end(),
+                       [](const Type& field) { return IsBaseOf(tvm::PrimType::Int(64), field); });
   };
   auto check_tuple = [&](const char* name, Expr expr) {
     auto ty = GetType(expr);
@@ -347,7 +350,7 @@ Type InferTypeStridedSlice(const Call& call, const BlockBuilder& ctx) {
 
   const auto* data_ty = data->ty.as<TensorTypeNode>();
 
-  DataType dtype = DataType::Void();
+  PrimType dtype = PrimType::Void();  // void marks the dtype as unknown
   ffi::Optional<VDevice> vdevice = std::nullopt;
   int ndim = kUnknownNDim;
   if (data_ty) {
@@ -545,7 +548,7 @@ Type InferTypeDynStridedSlice(const Call& call, const BlockBuilder& ctx) {
       LOG(WARNING) << "Dynamic strided slice assumes " << name
                    << " to be int64 when it is not specified.";
     } else {
-      TVM_FFI_ICHECK(ty->dtype == DataType::Int(64))
+      TVM_FFI_ICHECK(ty->dtype == PrimType::Int(64))
           << "Dynamic strided_slice expects the input " << name
           << "values to be all int64. However, " << name << " has dtype " << ty->dtype << ".";
     }
diff --git a/src/relax/op/tensor/inspect.cc b/src/relax/op/tensor/inspect.cc
index bf57670e7f2a..97955eb62455 100644
--- a/src/relax/op/tensor/inspect.cc
+++ b/src/relax/op/tensor/inspect.cc
@@ -88,24 +88,21 @@ std::tuple<TensorType, ffi::Optional<int64_t>> GetTensorArgInfoWithIndex(const C
   return {ffi::GetRef<TensorType>(tensor_ty), int_imm_axis};
 }
 
-DataType GetTensorDataType(const Call& call) { return GetTensorArgInfo(call)->dtype; }
+tirx::PrimFunc GetDLTensorField(tirx::builtin::TVMStructFieldKind field, PrimType field_ty) {
+  tirx::Var dlpack_handle("dlpack_handle", PrimType::Handle());
 
-tirx::PrimFunc GetDLTensorField(tirx::builtin::TVMStructFieldKind field, DataType field_dtype) {
-  tirx::Var dlpack_handle("dlpack_handle", DataType::Handle());
-
-  tirx::Var value("value", field_dtype);
+  tirx::Var value("value", field_ty);
 
   tirx::Stmt body = tirx::SeqStmt(
-      {tirx::Bind(value, tirx::Call(field_dtype, tirx::builtin::tvm_struct_get(),
+      {tirx::Bind(value, tirx::Call(field_ty, tirx::builtin::tvm_struct_get(),
                                     {dlpack_handle, IntImm::Int32(0), IntImm::Int32(field)})),
        tirx::Evaluate(tvm::ret(value))});
 
   DictAttrs attrs({{"tirx.is_scheduled", true}, {"tirx.is_host_func", true}});
 
-  tirx::PrimFunc func(ffi::Array<tirx::Var>{dlpack_handle}, body, tvm::PrimType(field_dtype), {},
-                      attrs);
+  tirx::PrimFunc func(ffi::Array<tirx::Var>{dlpack_handle}, body, field_ty, {}, attrs);
 
-  FuncType ty({TensorType(DataType::Void(), kUnknownNDim)}, PrimType(field_dtype));
+  FuncType ty({TensorType(PrimType::Void(), kUnknownNDim)}, field_ty);
   func->ty = ty;
 
   return func;
@@ -120,23 +117,14 @@ Expr tensor_dtype_code(Expr expr) {
   return Call(op, {expr});
 }
 
-Type InferTypeTensorDtypeCode(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::UInt(8);
-
-  DataType dtype = GetTensorDataType(call);
-  if (dtype.is_void()) {
-    return PrimType(dlpack_type);
-  } else {
-    return PrimType(dlpack_type);
-  }
-}
+Type InferTypeTensorDtypeCode(const Call& call, const BlockBuilder&) { return PrimType::UInt(8); }
 
 Expr LegalizeTensorDtypeCode(const BlockBuilder& bb, const Call& call) {
-  auto field_dtype = call->ty.as_or_throw<PrimType>()->dtype;
+  PrimType field_ty = call->ty.as_or_throw<tvm::PrimType>();
 
   Expr arg = call->args[0];
   tirx::PrimFunc getter =
-      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeCode, field_dtype);
+      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeCode, field_ty);
 
   GlobalVar gvar_getter = bb->AddFunction(getter, "_get_tensor_dtype_code");
   return Call(gvar_getter, {arg});
@@ -158,23 +146,14 @@ Expr tensor_dtype_bits(Expr expr) {
   return Call(op, {expr});
 }
 
-Type InferTypeTensorDtypeBits(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::UInt(8);
-
-  DataType dtype = GetTensorDataType(call);
-  if (dtype.is_void()) {
-    return PrimType(dlpack_type);
-  } else {
-    return PrimType(dlpack_type);
-  }
-}
+Type InferTypeTensorDtypeBits(const Call& call, const BlockBuilder&) { return PrimType::UInt(8); }
 
 Expr LegalizeTensorDtypeBits(const BlockBuilder& bb, const Call& call) {
-  auto field_dtype = call->ty.as_or_throw<PrimType>()->dtype;
+  PrimType field_ty = call->ty.as_or_throw<tvm::PrimType>();
 
   Expr arg = call->args[0];
   tirx::PrimFunc getter =
-      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeBits, field_dtype);
+      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeBits, field_ty);
 
   GlobalVar gvar_getter = bb->AddFunction(getter, "_get_tensor_dtype_bits");
   return Call(gvar_getter, {arg});
@@ -196,23 +175,14 @@ Expr tensor_dtype_lanes(Expr expr) {
   return Call(op, {expr});
 }
 
-Type InferTypeTensorDtypeLanes(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::UInt(16);
-
-  DataType dtype = GetTensorDataType(call);
-  if (dtype.is_void()) {
-    return PrimType(dlpack_type);
-  } else {
-    return PrimType(dlpack_type);
-  }
-}
+Type InferTypeTensorDtypeLanes(const Call& call, const BlockBuilder&) { return PrimType::UInt(16); }
 
 Expr LegalizeTensorDtypeLanes(const BlockBuilder& bb, const Call& call) {
-  auto field_dtype = call->ty.as_or_throw<PrimType>()->dtype;
+  PrimType field_ty = call->ty.as_or_throw<tvm::PrimType>();
 
   Expr arg = call->args[0];
   tirx::PrimFunc getter =
-      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeLanes, field_dtype);
+      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorTypeLanes, field_ty);
 
   GlobalVar gvar_getter = bb->AddFunction(getter, "_get_tensor_dtype_lanes");
   return Call(gvar_getter, {arg});
@@ -234,23 +204,14 @@ Expr tensor_ndim(Expr expr) {
   return Call(op, {expr});
 }
 
-Type InferTypeTensorNDim(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::Int(32);
-
-  auto ty = GetTensorArgInfo(call);
-  if (ty->IsUnknownNdim()) {
-    return PrimType(dlpack_type);
-  } else {
-    return PrimType(dlpack_type);
-  }
-}
+Type InferTypeTensorNDim(const Call& call, const BlockBuilder&) { return PrimType::Int(32); }
 
 Expr LegalizeTensorNDim(const BlockBuilder& bb, const Call& call) {
-  auto field_dtype = call->ty.as_or_throw<PrimType>()->dtype;
+  PrimType field_ty = call->ty.as_or_throw<tvm::PrimType>();
 
   Expr arg = call->args[0];
   tirx::PrimFunc getter =
-      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorNDim, field_dtype);
+      GetDLTensorField(tirx::builtin::TVMStructFieldKind::kDLTensorNDim, field_ty);
 
   GlobalVar gvar_getter = bb->AddFunction(getter, "_get_tensor_ndim");
   return Call(gvar_getter, {arg});
@@ -273,45 +234,45 @@ Expr tensor_shape_i(Expr expr) {
 }
 
 Type InferTypeTensorShape(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::Int(64);
+  auto dlpack_type = PrimType::Int(64);
 
   auto [tensor_ty, int_imm_axis] = GetTensorArgInfoWithIndex(call);
 
   auto tensor_shape = tensor_ty->GetShape();
 
   if (int_imm_axis && tensor_shape.defined()) {
-    return PrimType(tensor_shape.value()[int_imm_axis.value()].dtype());
+    return tensor_shape.value()[int_imm_axis.value()].ty();
   } else {
-    return PrimType(dlpack_type);
+    return dlpack_type;
   }
 }
 
 Expr LegalizeTensorShape(const BlockBuilder& bb, const Call& call) {
-  auto field_dtype = call->ty.as_or_throw<PrimType>()->dtype;
+  PrimType field_ty = call->ty.as_or_throw<tvm::PrimType>();
 
   tirx::PrimFunc getter = [&]() -> tirx::PrimFunc {
-    tirx::Var dlpack_handle("dlpack_handle", DataType::Handle());
-    tirx::Var axis("axis", DataType::Int(64));
+    tirx::Var dlpack_handle("dlpack_handle", PrimType::Handle());
+    tirx::Var axis("axis", PrimType::Int(64));
 
-    tirx::Var ndim("ndim", DataType::Int(32));
+    tirx::Var ndim("ndim", PrimType::Int(32));
 
-    tirx::Buffer shape_buffer = tirx::decl_buffer({ndim}, field_dtype, "shape");
+    tirx::Buffer shape_buffer = tirx::decl_buffer({ndim}, field_ty, "shape");
 
-    tirx::Var extent("extent", field_dtype);
+    tirx::Var extent("extent", field_ty);
 
     tirx::Stmt body = tirx::SeqStmt(
         {tirx::AssertStmt(0 <= axis, tirx::StringImm("RuntimeError"),
                           {tirx::StringImm("Specified axis may not be negative")}),
          tirx::Bind(ndim,
-                    tirx::Call(ndim->dtype, tirx::builtin::tvm_struct_get(),
+                    tirx::Call(ndim.ty(), tirx::builtin::tvm_struct_get(),
                                {dlpack_handle, IntImm::Int32(0),
                                 IntImm::Int32(tirx::builtin::TVMStructFieldKind::kDLTensorNDim)})),
          tirx::AssertStmt(
-             axis < tvm::cast(axis->dtype, ndim), tirx::StringImm("RuntimeError"),
+             axis < tvm::cast(axis.ty(), ndim), tirx::StringImm("RuntimeError"),
              {tirx::StringImm(
                  "Specified axis may not be larger than the tensor's dimensionality")}),
          tirx::Bind(shape_buffer->data,
-                    tirx::Call(DataType::Handle(), tirx::builtin::tvm_struct_get(),
+                    tirx::Call(tvm::PrimType::Handle(), tirx::builtin::tvm_struct_get(),
                                {dlpack_handle, IntImm::Int32(0),
                                 IntImm::Int32(tirx::builtin::TVMStructFieldKind::kDLTensorShape)})),
          tirx::DeclBuffer(shape_buffer), tirx::Bind(extent, tirx::BufferLoad(shape_buffer, {axis})),
@@ -319,10 +280,9 @@ Expr LegalizeTensorShape(const BlockBuilder& bb, const Call& call) {
 
     DictAttrs attrs({{"tirx.is_scheduled", true}, {"tirx.is_host_func", true}});
 
-    tirx::PrimFunc func({dlpack_handle, axis}, body, tvm::PrimType(field_dtype), {}, attrs);
+    tirx::PrimFunc func({dlpack_handle, axis}, body, field_ty, {}, attrs);
 
-    FuncType ty({TensorType(DataType::Void(), kUnknownNDim), PrimType(axis->dtype)},
-                PrimType(field_dtype));
+    FuncType ty({TensorType(PrimType::Void(), kUnknownNDim), axis.ty()}, field_ty);
     func->ty = ty;
     return func;
   }();
@@ -349,7 +309,7 @@ Expr tensor_stride_i(Expr expr) {
 }
 
 Type InferTypeTensorStride(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::Int(64);
+  auto dlpack_type = PrimType::Int(64);
 
   auto [tensor_ty, int_imm_axis] = GetTensorArgInfoWithIndex(call);
 
@@ -373,9 +333,9 @@ Type InferTypeTensorStride(const Call& call, const BlockBuilder&) {
     for (size_t axis = int_imm_axis.value() + 1; axis < tensor_shape.size(); axis++) {
       stride = stride * tensor_shape[axis];
     }
-    return PrimType(stride.dtype());
+    return stride.ty();
   } else {
-    return PrimType(dlpack_type);
+    return dlpack_type;
   }
 }
 
@@ -396,7 +356,7 @@ Expr tensor_byte_offset(Expr expr) {
 }
 
 Type InferTypeTensorByteOffset(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::UInt(64);
+  auto dlpack_type = PrimType::UInt(64);
 
   auto tensor_ty = GetTensorArgInfo(call);
 
@@ -405,9 +365,9 @@ Type InferTypeTensorByteOffset(const Call& call, const BlockBuilder&) {
     // Relax implicitly requires that the byte offset is zero for any
     // legalizable tensor.  See InferTypeTensorStride for full
     // explanation.
-    return PrimType(dlpack_type);
+    return dlpack_type;
   } else {
-    return PrimType(dlpack_type);
+    return dlpack_type;
   }
 }
 
@@ -427,7 +387,7 @@ Expr tensor_elem_offset(Expr expr) {
 }
 
 Type InferTypeTensorElemOffset(const Call& call, const BlockBuilder&) {
-  auto dlpack_type = DataType::UInt(64);
+  auto dlpack_type = PrimType::UInt(64);
 
   auto tensor_ty = GetTensorArgInfo(call);
 
@@ -436,9 +396,9 @@ Type InferTypeTensorElemOffset(const Call& call, const BlockBuilder&) {
     // Relax implicitly requires that the element offset is zero for
     // any legalizable tensor.  See InferTypeTensorStride for
     // full explanation.
-    return PrimType(dlpack_type);
+    return dlpack_type;
   } else {
-    return PrimType(dlpack_type);
+    return dlpack_type;
   }
 }
 
diff --git a/src/relax/op/tensor/inspect.h b/src/relax/op/tensor/inspect.h
index 3f820ab58a83..92cc4c256c79 100644
--- a/src/relax/op/tensor/inspect.h
+++ b/src/relax/op/tensor/inspect.h
@@ -36,7 +36,7 @@ namespace inspect {
  * `TensorType`.
  *
  * \returns The uint8_t value of the type_code, with
- * `PrimType(DataType::UInt(8))`
+ * `PrimType::UInt(8)`
  */
 Expr tensor_dtype_code(Expr expr);
 
@@ -46,7 +46,7 @@ Expr tensor_dtype_code(Expr expr);
  * `TensorType`.
  *
  * \returns The uint8_t value of the number of bits, with
- * `PrimType(DataType::UInt(8))`.  For vectorized types, returns
+ * `PrimType::UInt(8)`.  For vectorized types, returns
  * the bit width of the underlying scalar type (e.g. 32 for
  * "float32x4", not 128).
  */
@@ -58,7 +58,7 @@ Expr tensor_dtype_bits(Expr expr);
  * `TensorType`.
  *
  * \returns The uint16_t value of the number of lanes, with
- * `PrimType(DataType::UInt(16))`
+ * `PrimType::UInt(16)`
  */
 Expr tensor_dtype_lanes(Expr expr);
 
@@ -68,7 +68,7 @@ Expr tensor_dtype_lanes(Expr expr);
  * `TensorType`.
  *
  * \returns The int32_t value of the dimensionality, with
- * `PrimType(DataType::Int(32))`.
+ * `PrimType::Int(32)`.
  */
 Expr tensor_ndim(Expr expr);
 
@@ -81,7 +81,7 @@ Expr tensor_ndim(Expr expr);
  *     axis < tensor_ndim(expr)`, or else the results are undefined.
  *
  * \returns The int64_t extent of the specified tensor axis, with
- * `PrimType(DataType::Int(64))`.
+ * `PrimType::Int(64)`.
  */
 Expr tensor_shape_i(Expr expr, Expr axis);
 
@@ -98,7 +98,7 @@ Expr tensor_shape_i(Expr expr, Expr axis);
  *     axis < tensor_ndim(expr)`, or else the results are undefined.
  *
  * \returns The int64_t extent of the specified tensor axis, with
- * `PrimType(DataType::Int(64))`.
+ * `PrimType::Int(64)`.
  */
 Expr tensor_stride_i(Expr expr, Expr axis);
 
@@ -107,7 +107,7 @@ Expr tensor_stride_i(Expr expr, Expr axis);
  * \param expr The relax expression to be inspected.  Must have
  * `TensorType`.
  *
- * \returns The uint64_t byte offset, with `PrimType(DataType::UInt(64))`.
+ * \returns The uint64_t byte offset, with `PrimType::UInt(64)`.
  */
 Expr tensor_byte_offset(Expr expr);
 
@@ -120,7 +120,7 @@ Expr tensor_byte_offset(Expr expr);
  * \param expr The relax expression to be inspected.  Must have
  * `TensorType`.
  *
- * \returns The uint64_t element offset, with `PrimType(DataType::UInt(64))`.
+ * \returns The uint64_t element offset, with `PrimType::UInt(64)`.
  */
 Expr tensor_elem_offset(Expr expr);
 
diff --git a/src/relax/op/tensor/linear_algebra.cc b/src/relax/op/tensor/linear_algebra.cc
index a1693c6563f2..6ea68b422378 100644
--- a/src/relax/op/tensor/linear_algebra.cc
+++ b/src/relax/op/tensor/linear_algebra.cc
@@ -42,9 +42,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 /* relax.matmul */
 
-Expr matmul(Expr x1, Expr x2, ffi::Optional<DataType> out_dtype) {
+Expr matmul(Expr x1, Expr x2, ffi::Optional<DLDataType> out_dtype) {
   ffi::ObjectPtr<MatmulAttrs> attrs = ffi::make_object<MatmulAttrs>();
-  attrs->out_dtype = out_dtype.value_or(DataType::Void());
+  attrs->out_dtype = out_dtype.value_or(DLDataType{kDLOpaqueHandle, 0, 0});
 
   static const Op& op = Op::Get("relax.matmul");
   return Call(op, {std::move(x1), std::move(x2)}, Attrs(attrs), {});
@@ -74,9 +74,9 @@ Type InferTypeMatmul(const Call& call, const BlockBuilder& ctx) {
   }
 
   const auto* attrs = call->attrs.as<MatmulAttrs>();
-  DataType out_dtype = attrs->out_dtype.is_void()
-                           ? InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty)
-                           : attrs->out_dtype;
+  PrimType out_dtype = PrimType(attrs->out_dtype == DLDataType{kDLOpaqueHandle, 0, 0}
+                                    ? InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty)
+                                    : attrs->out_dtype);
 
   if (x1_ty->IsUnknownNdim() || x2_ty->IsUnknownNdim()) {
     if (vdev.defined()) {
@@ -158,7 +158,7 @@ Type InferTypeMatmul(const Call& call, const BlockBuilder& ctx) {
   return TensorType(ShapeExpr(output_shape), out_dtype);
 }
 
-Call InferMixedPrecisionMatmul(const Call& call, const DataType& out_dtype) {
+Call InferMixedPrecisionMatmul(const Call& call, DLDataType out_dtype) {
   return matmul(call->args[0], call->args[1], out_dtype).as_or_throw<Call>();
 }
 
@@ -218,17 +218,17 @@ Type InferTypeEinsum(const Call& call, const BlockBuilder& ctx) {
 
   ffi::String subscripts = attrs->subscripts;
 
-  DataType operand_dtype = operands_tensor_ty[0]->dtype;
+  PrimType operand_ty = operands_tensor_ty[0]->dtype;
   std::vector<ffi::Array<PrimExpr>> input_shapes;
   input_shapes.reserve(operands_tensor_ty.size());
 
   for (TensorType tensor_ty : operands_tensor_ty) {
     // Check the input tuple consists of tensors with same dtype
-    if (tensor_ty->dtype != operand_dtype) {
+    if (tensor_ty->dtype != operand_ty) {
       TVM_FFI_VISIT_THROW(TypeError, call)
           << "Einsum expects all input tensors to have the same dtype. However, the "
              "input contains tensors with dtype "
-          << operand_dtype << " and " << tensor_ty->dtype;
+          << operand_ty << " and " << tensor_ty->dtype;
     }
 
     // Get input shapes
@@ -237,18 +237,18 @@ Type InferTypeEinsum(const Call& call, const BlockBuilder& ctx) {
       input_shapes.push_back(shape_expr->values);
     } else {
       if (!vdevice_unknown) {
-        return TensorType(operand_dtype, tensor_ty->ndim, vdev);
+        return TensorType(operand_ty, tensor_ty->ndim, vdev);
       }
-      return TensorType(operand_dtype, tensor_ty->ndim);
+      return TensorType(operand_ty, tensor_ty->ndim);
     }
   }
   // Calculate output shape using InferEinsumShape in topi
   ffi::Array<PrimExpr> oshape = topi::InferEinsumShape(subscripts, input_shapes);
 
   if (!vdevice_unknown) {
-    return TensorType(ShapeExpr(oshape), operand_dtype, vdev);
+    return TensorType(ShapeExpr(oshape), operand_ty, vdev);
   }
-  return TensorType(ShapeExpr(oshape), operand_dtype);
+  return TensorType(ShapeExpr(oshape), operand_ty);
 }
 
 TVM_REGISTER_OP("relax.einsum")
diff --git a/src/relax/op/tensor/linear_algebra.h b/src/relax/op/tensor/linear_algebra.h
index ddfceae4dc35..481193f897b8 100644
--- a/src/relax/op/tensor/linear_algebra.h
+++ b/src/relax/op/tensor/linear_algebra.h
@@ -41,7 +41,7 @@ namespace relax {
  * When it is not specified, the output dtype will be the same as input dtype.
  * \return The computed result.
  */
-Expr matmul(Expr x1, Expr x2, ffi::Optional<DataType> out_dtype);
+Expr matmul(Expr x1, Expr x2, ffi::Optional<DLDataType> out_dtype);
 
 /*!
  * \brief Einstein summation on the operands.
diff --git a/src/relax/op/tensor/manipulate.cc b/src/relax/op/tensor/manipulate.cc
index caa730091383..8fe14c78555f 100644
--- a/src/relax/op/tensor/manipulate.cc
+++ b/src/relax/op/tensor/manipulate.cc
@@ -35,7 +35,7 @@
 #include <utility>
 #include <vector>
 
-#include "tvm/runtime/data_type.h"
+#include "tvm/ffi/dtype.h"
 
 namespace tvm {
 namespace relax {
@@ -219,7 +219,7 @@ Type InferTypeConcat(const Call& call, const BlockBuilder& ctx) {
 
   const auto* attrs = call->attrs.as<ConcatAttrs>();
   int output_ndim = attrs->axis.has_value() ? kUnknownNDim : 1;
-  DataType output_dtype = DataType::Void();
+  PrimType output_dtype = PrimType::Void();
   ffi::Optional<VDevice> vdev = std::nullopt;
   bool shape_unknown = false;
   bool is_void_dtype = false;
@@ -229,9 +229,9 @@ Type InferTypeConcat(const Call& call, const BlockBuilder& ctx) {
 
   for (TensorType ty : tensor_ty) {
     // Update the output dtype.
-    if (ty->dtype.is_void()) {
+    if (ty->IsUnknownDtype()) {
       is_void_dtype = true;
-    } else if (output_dtype.is_void()) {
+    } else if (output_dtype.IsVoid()) {
       output_dtype = ty->dtype;
     } else if (ty->dtype != output_dtype) {
       TVM_FFI_VISIT_THROW(TypeError, call)
@@ -285,7 +285,7 @@ Type InferTypeConcat(const Call& call, const BlockBuilder& ctx) {
   }
 
   if (is_void_dtype) {
-    output_dtype = DataType::Void();
+    output_dtype = PrimType::Void();
   }
   if (vdevice_unknown) {
     vdev = std::nullopt;
@@ -573,14 +573,16 @@ Type InferTypeIndexTensor(const Call& call, const BlockBuilder& ctx) {
         << "index_tensor expects a non‑empty tuple of index tensors";
   }
 
-  DataType output_dtype = data_ty->dtype;
+  PrimType output_dtype = data_ty->dtype;
   int n_indices = static_cast<int>(indices_ty.size());
   ffi::Optional<VDevice> vdev = data_ty->vdevice;
 
   // Indices must be integers
   for (int i = 0; i < n_indices; ++i) {
     const auto& s = indices_ty[i];
-    if (!s->IsUnknownDtype() && !s->dtype.is_int()) {
+    PrimType index_dtype = s->dtype;
+    // Indexing only requires integer element kind; vector lanes do not affect shape inference.
+    if (!s->IsUnknownDtype() && index_dtype.code() != DLDataTypeCode::kDLInt) {
       TVM_FFI_VISIT_THROW(TypeError, call)
           << "index_tensor requires every index tensor to have an integer dtype; "
           << "index " << i << " has dtype " << s->dtype;
@@ -725,9 +727,10 @@ Type InferTypeLayoutTransform(const Call& call, const BlockBuilder& ctx) {
   // Check pad_value has same dtype as input.
   if (optional_pad_value.defined()) {
     PrimExpr padded_value = optional_pad_value.value()->value;
-    if (padded_value->dtype != data_ty->dtype) {
+    PrimType padded_dtype = padded_value.ty();
+    if (padded_dtype != data_ty->dtype) {
       TVM_FFI_VISIT_THROW(TypeError, call)
-          << "layout_transform pad_value dtype (" << padded_value->dtype << ") and input dtype ("
+          << "layout_transform pad_value dtype (" << padded_dtype << ") and input dtype ("
           << data_ty->dtype << ") must be the same";
     }
   }
@@ -916,9 +919,10 @@ Expr ConvertNewShapeToExpr(const Expr& data,
            "Array of PrimExprs. However, the given new shape is "
         << shape;
     PrimExpr len = ffi::GetRef<PrimExpr>(_len);
-    TVM_FFI_ICHECK(len->dtype.is_int()) << "Reshape requires the new shape values to be all "
-                                           "integers. However, the give new shape is "
-                                        << shape;
+    TVM_FFI_ICHECK(len.ty().code() == DLDataTypeCode::kDLInt)
+        << "Reshape requires the new shape values to be all "
+           "integers. However, the given new shape is "
+        << shape;
     const auto* int_len = len.as<IntImmNode>();
     if (int_len != nullptr && int_len->value == 0) {
       // Note that this dimension should be copied from the original shape.
@@ -1108,7 +1112,7 @@ Type InferTypeSplit(const Call& call, const BlockBuilder& ctx) {
 
     TVM_FFI_ICHECK_NE(axis, -1);
 
-    IntImm zero(DataType::Int(64), /*value=*/0);
+    IntImm zero(tvm::PrimType::Int(64), /*value=*/0);
 
     std::vector<Type> output_ty;
     for (size_t i = 0; i < p_indices.size() + 1; i++) {
@@ -1489,7 +1493,7 @@ Type InferTypeStack(const Call& call, const BlockBuilder& ctx) {
 
   // Default axis is 0 if not specified
   int output_ndim = tensor_ty[0]->ndim + 1;  // Stack adds one dimension
-  DataType output_dtype = DataType::Void();
+  PrimType output_dtype = PrimType::Void();
   ffi::Optional<VDevice> vdev = std::nullopt;
   bool shape_unknown = false;
   bool is_void_dtype = false;
@@ -1499,9 +1503,9 @@ Type InferTypeStack(const Call& call, const BlockBuilder& ctx) {
 
   for (TensorType ty : tensor_ty) {
     // Check dtype consistency
-    if (ty->dtype.is_void()) {
+    if (ty->IsUnknownDtype()) {
       is_void_dtype = true;
-    } else if (output_dtype.is_void()) {
+    } else if (output_dtype.IsVoid()) {
       output_dtype = ty->dtype;
     } else if (ty->dtype != output_dtype) {
       TVM_FFI_VISIT_THROW(TypeError, call)
@@ -1542,7 +1546,7 @@ Type InferTypeStack(const Call& call, const BlockBuilder& ctx) {
     }
   }
 
-  if (is_void_dtype) output_dtype = DataType::Void();
+  if (is_void_dtype) output_dtype = PrimType::Void();
   if (vdevice_unknown) vdev = std::nullopt;
 
   // Normalize axis (default to 0 if not specified)
@@ -1650,7 +1654,7 @@ Type InferTypeCollapseSumLike(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = input_ty[0];
   TensorType collapse_target_ty = input_ty[1];
 
-  DataType output_dtype = data_ty->dtype;
+  PrimType output_dtype = data_ty->dtype;
 
   ffi::Optional<ffi::Array<PrimExpr>> data_shape_value;
   if (data_ty->shape.defined()) {
@@ -1711,7 +1715,7 @@ Type InferTypeCollapseSumTo(const Call& call, const BlockBuilder& ctx) {
         << call->args[1]->ty->GetTypeKey();
   }
 
-  DataType output_dtype = data_ty->dtype;
+  PrimType output_dtype = data_ty->dtype;
 
   ffi::Optional<ffi::Array<PrimExpr>> data_shape_value;
   if (data_ty->shape.defined()) {
@@ -2192,7 +2196,9 @@ Type InferTypeGatherElements(const Call& call, const BlockBuilder& ctx) {
         << call->args[1]->ty->GetTypeKey();
   }
 
-  if (!indices_ty->IsUnknownDtype() && !indices_ty->dtype.is_int()) {
+  PrimType indices_dtype = indices_ty->dtype;
+  // Gather indices only require integer element kind; vector lanes do not affect shape inference.
+  if (!indices_ty->IsUnknownDtype() && indices_dtype.code() != DLDataTypeCode::kDLInt) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "GatherElements requires the input indices to have int64 dtype. However, the "
         << "given indices dtype is " << indices_ty->dtype;
@@ -2295,7 +2301,7 @@ Type InferTypeGatherND(const Call& call, const BlockBuilder& ctx) {
   TVM_FFI_ICHECK_GE(attrs->batch_dims, 0);
   int batch_dims = static_cast<int>(attrs->batch_dims);
   int input_dims = data_ty->ndim;
-  if (!indices_ty->IsUnknownDtype() && indices_ty->dtype != DataType::Int(64)) {
+  if (!indices_ty->IsUnknownDtype() && indices_ty->dtype != PrimType::Int(64)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "GatherND requires the input indices to have int64 dtype. However, the "
         << "given indices dtype is " << indices_ty->dtype;
@@ -2430,10 +2436,14 @@ Type InferTypeIndexPut(const Call& call, const BlockBuilder& ctx) {
     if (tensor_ty->IsUnknownDtype()) {
       LOG(WARNING) << "Data type of index tensor " << i
                    << " has not been specified. Assume it has an integer type.";
-    } else if (!(tensor_ty->dtype.is_int() || tensor_ty->dtype.is_uint())) {
-      TVM_FFI_VISIT_THROW(TypeError, call)
-          << "IndexPut requires each index tensor to have integer dtype. "
-          << "However, index tensor " << i << " has dtype=" << tensor_ty->dtype;
+    } else {
+      PrimType index_dtype = tensor_ty->dtype;
+      if (!index_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+          !index_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+        TVM_FFI_VISIT_THROW(TypeError, call)
+            << "IndexPut requires each index tensor to have integer dtype. "
+            << "However, index tensor " << i << " has dtype=" << tensor_ty->dtype;
+      }
     }
   }
 
@@ -2531,7 +2541,7 @@ Type InferTypeMeshgrid(const Call& call, const BlockBuilder& ctx) {
   }
 
   std::vector<PrimExpr> lengths;
-  DataType common_dtype = DataType::Void();
+  PrimType common_dtype = PrimType::Void();
   bool shape_unknown = false;
   ffi::Optional<VDevice> vdev = std::nullopt;
   bool vdevice_unknown = false;
@@ -2545,9 +2555,9 @@ Type InferTypeMeshgrid(const Call& call, const BlockBuilder& ctx) {
           << i;
     }
 
-    if (ty->dtype.is_void()) {
+    if (ty->IsUnknownDtype()) {
       continue;
-    } else if (common_dtype.is_void()) {
+    } else if (common_dtype.IsVoid()) {
       common_dtype = ty->dtype;
     } else if (ty->dtype != common_dtype) {
       TVM_FFI_VISIT_THROW(TypeError, call)
@@ -2683,11 +2693,15 @@ Type InferTypeScatterElements(const Call& call, const BlockBuilder& ctx) {
 
   if (indices_ty->IsUnknownDtype()) {
     LOG(WARNING) << "Data type of indices has not been specified. Assume it has an integer type.";
-  } else if (!(indices_ty->dtype.is_int() || indices_ty->dtype.is_uint())) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << "ScatterElements op requires the input indices to have integer dtype. However, the "
-           "given indices dtype is "
-        << indices_ty->dtype;
+  } else {
+    PrimType indices_dtype = indices_ty->dtype;
+    if (!indices_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+        !indices_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+      TVM_FFI_VISIT_THROW(TypeError, call)
+          << "ScatterElements op requires the input indices to have integer dtype. However, the "
+             "given indices dtype is "
+          << indices_ty->dtype;
+    }
   }
 
   const auto* indices_shape = indices_ty->shape.as<ShapeExprNode>();
@@ -2803,11 +2817,15 @@ Type InferTypeScatterND(const Call& call, const BlockBuilder& ctx) {
 
   if (indices_ty->IsUnknownDtype()) {
     LOG(WARNING) << "Data type of indices has not been specified. Assume it has an integer type.";
-  } else if (!(indices_ty->dtype.is_int() || indices_ty->dtype.is_uint())) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << "ScatterND op requires the input indices to have integer dtype. However, "
-           "the given indices dtype is "
-        << indices_ty->dtype;
+  } else {
+    PrimType indices_dtype = indices_ty->dtype;
+    if (!indices_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+        !indices_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+      TVM_FFI_VISIT_THROW(TypeError, call)
+          << "ScatterND op requires the input indices to have integer dtype. However, "
+             "the given indices dtype is "
+          << indices_ty->dtype;
+    }
   }
 
   const auto* data_shape = data_ty->shape.as<ShapeExprNode>();
@@ -3003,10 +3021,11 @@ Type InferTypeSliceScatter(const Call& call, const BlockBuilder& ctx) {
           << ") to be a PrimValue, but got " << arg_expr->GetTypeKey();
     }
     const PrimExpr& prim_expr = prim_value_node->value;
-    if (!prim_expr.dtype().is_int() && !prim_expr.dtype().is_uint()) {
+    tvm::PrimType prim_ty = prim_expr.ty();
+    if (prim_ty.code() != DLDataTypeCode::kDLInt && prim_ty.code() != DLDataTypeCode::kDLUInt) {
       TVM_FFI_VISIT_THROW(TypeError, call)
           << "SliceScatter expects `" << key << "` (" << prim_expr
-          << ") to be an integer PrimValue, but got dtype " << prim_expr.dtype();
+          << ") to be an integer PrimValue, but got dtype " << prim_ty;
     }
     return prim_expr;
   };
@@ -3085,8 +3104,8 @@ Expr one_hot(Expr indices, PrimValue on_value, PrimValue off_value, int depth, i
   attrs->axis = axis;
 
   // Check if on_value and off_value have the same dtype
-  DataType on_dtype = on_value->value->dtype;
-  DataType off_dtype = off_value->value->dtype;
+  PrimType on_dtype = on_value->value.ty();
+  PrimType off_dtype = off_value->value.ty();
   TVM_FFI_ICHECK(on_dtype == off_dtype)
       << "one_hot: on_value and off_value must have the same dtype, "
       << "but got " << on_dtype << " and " << off_dtype;
@@ -3108,19 +3127,25 @@ Type InferTypeOneHot(const Call& call, const BlockBuilder& ctx) {
   PrimValue on_value = call->args[1].as_or_throw<PrimValue>();
   PrimValue off_value = call->args[2].as_or_throw<PrimValue>();
   // Check if on_value and off_value have the same dtype
-  TVM_FFI_ICHECK(on_value->value->dtype == off_value->value->dtype)
+  PrimType on_dtype = on_value->value.ty();
+  PrimType off_dtype = off_value->value.ty();
+  TVM_FFI_ICHECK(on_dtype == off_dtype)
       << "one_hot: on_value and off_value must have the same dtype, "
-      << "but got " << on_value->value->dtype << " and " << off_value->value->dtype;
-  DataType dtype = on_value->value->dtype;
+      << "but got " << on_dtype << " and " << off_dtype;
+  PrimType dtype = on_dtype;
 
   // Check if indices has an integer dtype
   if (indices_ty->IsUnknownDtype()) {
     LOG(WARNING) << "Data type of indices has not been specified. Assume it has an integer type.";
-  } else if (!(indices_ty->dtype.is_int() || indices_ty->dtype.is_uint())) {
-    TVM_FFI_VISIT_THROW(TypeError, call)
-        << "one_hot op requires the input indices to have integer dtype. However, the "
-           "given indices dtype is "
-        << indices_ty->dtype;
+  } else {
+    PrimType indices_dtype = indices_ty->dtype;
+    if (!indices_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+        !indices_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+      TVM_FFI_VISIT_THROW(TypeError, call)
+          << "one_hot op requires the input indices to have integer dtype. However, the "
+             "given indices dtype is "
+          << indices_ty->dtype;
+    }
   }
   // Check if indices has unknown dimension
   if (indices_ty->IsUnknownNdim()) {
diff --git a/src/relax/op/tensor/qdq.cc b/src/relax/op/tensor/qdq.cc
index 974d70e7300a..8940594abc51 100644
--- a/src/relax/op/tensor/qdq.cc
+++ b/src/relax/op/tensor/qdq.cc
@@ -39,7 +39,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { QuantizeAttrs::RegisterReflection(); }
 
 /* relax.quantize */
 
-Expr quantize(Expr data, Expr scale, Expr zero_point, int axis, DataType out_dtype) {
+Expr quantize(Expr data, Expr scale, Expr zero_point, int axis, DLDataType out_dtype) {
   ffi::ObjectPtr<QuantizeAttrs> attrs = ffi::make_object<QuantizeAttrs>();
   attrs->axis = axis;
   attrs->out_dtype = out_dtype;
@@ -54,9 +54,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 Type InferTypeQuantize(const Call& call, const BlockBuilder& ctx) {
   const auto* attrs = call->attrs.as<QuantizeAttrs>();
-  if (attrs->out_dtype != DataType::Int(8) && attrs->out_dtype != DataType::UInt(8) &&
-      attrs->out_dtype != DataType::Int(16) && attrs->out_dtype != DataType::UInt(16) &&
-      attrs->out_dtype != DataType::Float8E4M3FN() && attrs->out_dtype != DataType::Float8E5M2()) {
+  if (attrs->out_dtype != DLDataType{kDLInt, 8, 1} &&
+      attrs->out_dtype != DLDataType{kDLUInt, 8, 1} &&
+      attrs->out_dtype != DLDataType{kDLInt, 16, 1} &&
+      attrs->out_dtype != DLDataType{kDLUInt, 16, 1} &&
+      attrs->out_dtype != DLDataType{static_cast<uint8_t>(kDLFloat8_e4m3fn),
+                                     static_cast<uint8_t>(8), static_cast<uint16_t>(1)} &&
+      attrs->out_dtype != DLDataType{static_cast<uint8_t>(kDLFloat8_e5m2), static_cast<uint8_t>(8),
+                                     static_cast<uint16_t>(1)}) {
     TVM_FFI_VISIT_THROW(TypeError, call)
-        << "Unsupported output datatype attribute for operation: '" << attrs->out_dtype;
+        << "Unsupported output datatype attribute for operation: " << attrs->out_dtype;
   }
@@ -64,24 +69,27 @@ Type InferTypeQuantize(const Call& call, const BlockBuilder& ctx) {
   TensorType input_ty = GetInputTensorType(call, ctx)[0];
   TensorType scale_ty = GetInputTensorType(call, ctx)[1];
   TensorType zp_ty = GetInputTensorType(call, ctx)[2];
+  PrimType input_dtype = input_ty->dtype;
+  PrimType scale_dtype = scale_ty->dtype;
+  PrimType zp_dtype = zp_ty->dtype;
 
   // Check input datatype:
-  if (input_ty->dtype != DataType::Float(16) && input_ty->dtype != DataType::Float(32)) {
+  if (input_dtype != PrimType::Float(16) && input_dtype != PrimType::Float(32)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Unsupported input datatype for operation: " << input_ty->dtype;
   }
 
   // Check datatype of scale param:
-  if (scale_ty->dtype != DataType::Float(32) && scale_ty->dtype != DataType::Float(16)) {
+  if (scale_dtype != PrimType::Float(32) && scale_dtype != PrimType::Float(16)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "scale param datatype should be one of [float16, float32], but got " << scale_ty->dtype;
   }
 
   // Check datatype of zero_point param:
-  if (zp_ty->dtype != DataType::Int(8) && zp_ty->dtype != DataType::UInt(8) &&
-      zp_ty->dtype != DataType::Int(16) && zp_ty->dtype != DataType::UInt(16) &&
-      zp_ty->dtype != DataType::Int(32) && zp_ty->dtype != DataType::UInt(32) &&
-      zp_ty->dtype != DataType::Float(16)) {
+  if (zp_dtype != PrimType::Int(8) && zp_dtype != PrimType::UInt(8) &&
+      zp_dtype != PrimType::Int(16) && zp_dtype != PrimType::UInt(16) &&
+      zp_dtype != PrimType::Int(32) && zp_dtype != PrimType::UInt(32) &&
+      zp_dtype != PrimType::Float(16)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "zero_point param datatype should be one of "
         << "['int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'float16'], "
@@ -124,7 +132,7 @@ Type InferTypeQuantize(const Call& call, const BlockBuilder& ctx) {
   if (!is_scalar_or_singleton_vector(zp_ty)) check_param_size(zp_ty, input_ty, "zero_point");
 
   auto output_ty = ffi::make_object<TensorTypeNode>(*input_ty.get());
-  output_ty->dtype = attrs->out_dtype;
+  output_ty->dtype = PrimType(attrs->out_dtype);
   return TensorType(output_ty);
 }
 
@@ -139,7 +147,7 @@ TVM_REGISTER_OP("relax.quantize")
 
 /* relax.dequantize */
 
-Expr dequantize(Expr data, Expr scale, Expr zero_point, int axis, DataType out_dtype) {
+Expr dequantize(Expr data, Expr scale, Expr zero_point, int axis, DLDataType out_dtype) {
   ffi::ObjectPtr<QuantizeAttrs> attrs = ffi::make_object<QuantizeAttrs>();
   attrs->axis = axis;
   attrs->out_dtype = out_dtype;
@@ -154,7 +162,8 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 Type InferTypeDequantize(const Call& call, const BlockBuilder& ctx) {
   const auto* attrs = call->attrs.as<QuantizeAttrs>();
-  if (attrs->out_dtype != DataType::Float(16) && attrs->out_dtype != DataType::Float(32)) {
+  if (attrs->out_dtype != DLDataType{kDLFloat, 16, 1} &&
+      attrs->out_dtype != DLDataType{kDLFloat, 32, 1}) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Unsupported output datatype attribute for operation: " << attrs->out_dtype;
   }
@@ -162,28 +171,34 @@ Type InferTypeDequantize(const Call& call, const BlockBuilder& ctx) {
   TensorType input_ty = GetInputTensorType(call, ctx)[0];
   TensorType scale_ty = GetInputTensorType(call, ctx)[1];
   TensorType zp_ty = GetInputTensorType(call, ctx)[2];
+  PrimType input_dtype = input_ty->dtype;
+  PrimType scale_dtype = scale_ty->dtype;
+  PrimType zp_dtype = zp_ty->dtype;
 
   // Check input datatype:
-  if (input_ty->dtype != DataType::Int(8) && input_ty->dtype != DataType::UInt(8) &&
-      input_ty->dtype != DataType::Int(16) && input_ty->dtype != DataType::UInt(16) &&
-      input_ty->dtype != DataType::Int(32) && input_ty->dtype != DataType::Float8E4M3FN() &&
-      input_ty->dtype != DataType::Float8E5M2() && input_ty->dtype != DataType::Float(16) &&
-      input_ty->dtype != DataType::Float(32)) {
+  if (input_dtype != PrimType::Int(8) && input_dtype != PrimType::UInt(8) &&
+      input_dtype != PrimType::Int(16) && input_dtype != PrimType::UInt(16) &&
+      input_dtype != PrimType::Int(32) &&
+      input_dtype != PrimType(DLDataType{static_cast<uint8_t>(kDLFloat8_e4m3fn),
+                                         static_cast<uint8_t>(8), static_cast<uint16_t>(1)}) &&
+      input_dtype != PrimType(DLDataType{static_cast<uint8_t>(kDLFloat8_e5m2),
+                                         static_cast<uint8_t>(8), static_cast<uint16_t>(1)}) &&
+      input_dtype != PrimType::Float(16) && input_dtype != PrimType::Float(32)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
-        << "Unsupported input datatype for operation: " << attrs->out_dtype;
+        << "Unsupported input datatype for operation: " << input_ty->dtype;
   }
 
   // Check datatype of scale param:
-  if (scale_ty->dtype != DataType::Float(32) && scale_ty->dtype != DataType::Float(16)) {
+  if (scale_dtype != PrimType::Float(32) && scale_dtype != PrimType::Float(16)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "scale param datatype should be one of [float16, float32], but got " << scale_ty->dtype;
   }
 
   // Check datatype of zero_point param:
-  if (zp_ty->dtype != DataType::Int(8) && zp_ty->dtype != DataType::UInt(8) &&
-      zp_ty->dtype != DataType::Int(16) && zp_ty->dtype != DataType::UInt(16) &&
-      zp_ty->dtype != DataType::Int(32) && zp_ty->dtype != DataType::UInt(32) &&
-      zp_ty->dtype != DataType::Float(16)) {
+  if (zp_dtype != PrimType::Int(8) && zp_dtype != PrimType::UInt(8) &&
+      zp_dtype != PrimType::Int(16) && zp_dtype != PrimType::UInt(16) &&
+      zp_dtype != PrimType::Int(32) && zp_dtype != PrimType::UInt(32) &&
+      zp_dtype != PrimType::Float(16)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "zero_point param datatype should be one of "
         << "['int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'float16'], "
@@ -226,7 +241,7 @@ Type InferTypeDequantize(const Call& call, const BlockBuilder& ctx) {
   if (!is_scalar_or_singleton_vector(zp_ty)) check_param_size(zp_ty, input_ty, "zero_point");
 
   auto output_ty = ffi::make_object<TensorTypeNode>(*input_ty.get());
-  output_ty->dtype = attrs->out_dtype;
+  output_ty->dtype = PrimType(attrs->out_dtype);
   return TensorType(output_ty);
 }
 
diff --git a/src/relax/op/tensor/qdq.h b/src/relax/op/tensor/qdq.h
index 9d13dcde277f..bdb31f87e61e 100644
--- a/src/relax/op/tensor/qdq.h
+++ b/src/relax/op/tensor/qdq.h
@@ -40,7 +40,7 @@ namespace relax {
  * \param out_dtype The data type of the output tensor.
  * \return The computed result.
  */
-Expr quantize(Expr data, Expr scale, Expr zero_point, int axis, DataType out_dtype);
+Expr quantize(Expr data, Expr scale, Expr zero_point, int axis, DLDataType out_dtype);
 
 /*!
  * \brief Dequantize op.
@@ -53,7 +53,7 @@ Expr quantize(Expr data, Expr scale, Expr zero_point, int axis, DataType out_dty
  * \param out_dtype The data type of the output tensor.
  * \return The computed result.
  */
-Expr dequantize(Expr data, Expr scale, Expr zero_point, int axis, DataType out_dtype);
+Expr dequantize(Expr data, Expr scale, Expr zero_point, int axis, DLDataType out_dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/op/tensor/sampling.cc b/src/relax/op/tensor/sampling.cc
index 27f9241e2c29..196e6f887649 100644
--- a/src/relax/op/tensor/sampling.cc
+++ b/src/relax/op/tensor/sampling.cc
@@ -37,7 +37,8 @@ TVM_FFI_STATIC_INIT_BLOCK() { MultinomialFromUniformAttrs::RegisterReflection();
 
 /* relax.multinomial_from_uniform */
 
-Expr multinomial_from_uniform(Expr prob, Expr uniform_sample, Expr sample_indices, DataType dtype) {
+Expr multinomial_from_uniform(Expr prob, Expr uniform_sample, Expr sample_indices,
+                              DLDataType dtype) {
   ffi::ObjectPtr<MultinomialFromUniformAttrs> attrs =
       ffi::make_object<MultinomialFromUniformAttrs>();
   attrs->dtype = dtype;
@@ -59,19 +60,24 @@ Type InferTypeMultinomialFromUniform(const Call& call, const BlockBuilder& ctx)
   TensorType sample_indices_ty = GetInputTensorType(call, 2, ctx);
   const auto* attrs = call->attrs.as<MultinomialFromUniformAttrs>();
 
-  if (!prob_ty->dtype.is_float()) {
+  // Only the element kind matters here; shape inference does not depend on vector lanes.
+  if (prob_ty->dtype.code() != DLDataTypeCode::kDLFloat &&
+      prob_ty->dtype.code() != DLDataTypeCode::kDLBfloat) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Multinomial_from_uniform op requires the input prob to have float dtype. "
            "However, the given prob dtype is "
         << prob_ty->dtype;
   }
-  if (!uniform_sample_ty->dtype.is_float()) {
+  // Only the element kind matters here; shape inference does not depend on vector lanes.
+  if (uniform_sample_ty->dtype.code() != DLDataTypeCode::kDLFloat &&
+      uniform_sample_ty->dtype.code() != DLDataTypeCode::kDLBfloat) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Multinomial_from_uniform op requires the input uniform_sample to have float "
            "dtype. However, the given uniform_sample dtype is "
         << uniform_sample_ty->dtype;
   }
-  if (!sample_indices_ty->dtype.is_int()) {
+  // Only the element kind matters here; shape inference does not depend on vector lanes.
+  if (sample_indices_ty->dtype.code() != DLDataTypeCode::kDLInt) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Multinomial from uniform op requires the input sample_indices to have int "
            "dtype. However, the given sample_indices dtype is "
@@ -79,7 +85,7 @@ Type InferTypeMultinomialFromUniform(const Call& call, const BlockBuilder& ctx)
   }
   if (prob_ty->IsUnknownNdim() || uniform_sample_ty->IsUnknownNdim() ||
       sample_indices_ty->IsUnknownNdim()) {
-    return TensorType(attrs->dtype, kUnknownNDim, prob_ty->vdevice);
+    return TensorType(PrimType(attrs->dtype), kUnknownNDim, prob_ty->vdevice);
   }
   if (prob_ty->ndim != 2) {
     TVM_FFI_VISIT_THROW(ValueError, call)
@@ -109,7 +115,7 @@ Type InferTypeMultinomialFromUniform(const Call& call, const BlockBuilder& ctx)
   // The output shape is expected to be `(n, 1)`
 
   if (prob_shape == nullptr || uniform_sample_shape == nullptr || sample_indices_shape == nullptr) {
-    return TensorType(attrs->dtype, 2, prob_ty->vdevice);
+    return TensorType(PrimType(attrs->dtype), 2, prob_ty->vdevice);
   }
 
   PrimExpr batch = prob_shape->values[0];
@@ -132,7 +138,7 @@ Type InferTypeMultinomialFromUniform(const Call& call, const BlockBuilder& ctx)
         << uniform_sample_ty->shape << " and the given sample_indices tensor has shape "
         << sample_indices_ty->shape;
   }
-  return TensorType(ShapeExpr({n, 1}), attrs->dtype, prob_ty->vdevice);
+  return TensorType(ShapeExpr({n, 1}), PrimType(attrs->dtype), prob_ty->vdevice);
 }
 
 TVM_REGISTER_OP("relax.multinomial_from_uniform")
diff --git a/src/relax/op/tensor/sampling.h b/src/relax/op/tensor/sampling.h
index d13aa835d68d..077ef4313669 100644
--- a/src/relax/op/tensor/sampling.h
+++ b/src/relax/op/tensor/sampling.h
@@ -49,7 +49,8 @@ namespace relax {
  * \param dtype The data type of the output tensor.
  * \return The sampled result.
  */
-Expr multinomial_from_uniform(Expr prob, Expr uniform_sample, Expr sample_indices, DataType dtype);
+Expr multinomial_from_uniform(Expr prob, Expr uniform_sample, Expr sample_indices,
+                              DLDataType dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/op/tensor/search.cc b/src/relax/op/tensor/search.cc
index d80f484ebcf5..635879db2be3 100644
--- a/src/relax/op/tensor/search.cc
+++ b/src/relax/op/tensor/search.cc
@@ -64,10 +64,9 @@ Type InferTypeBucketize(const Call& call, const BlockBuilder& ctx) {
   }
 
   auto attrs = call->attrs.as<BucketizeAttrs>();
-  DataType out_dtype;
-  out_dtype = DataType::Int(64);
+  PrimType out_dtype = PrimType::Int(64);
   if (attrs->out_int32) {
-    out_dtype = DataType::Int(32);
+    out_dtype = PrimType::Int(32);
   }
 
   const auto* data_shape = input_tensor_info->shape.as<ShapeExprNode>();
@@ -119,13 +118,15 @@ Type InferTypeWhere(const Call& call, const BlockBuilder& ctx) {
     }
   }
 
-  if (!cond_ty->dtype.is_bool()) {
+  PrimType cond_dtype = cond_ty->dtype;
+  // Where condition validation only checks the boolean element kind; lanes are irrelevant here.
+  if (cond_dtype.code() != DLDataTypeCode::kDLBool) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Where requires the input condition tensor to have boolean dtype. However, "
            "the given condition dtype is "
         << cond_ty->dtype;
   }
-  DataType output_dtype = InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty);
+  PrimType output_dtype(InferBinaryArithOpOutDtype(call, ctx, x1_ty, x2_ty));
 
   int output_ndim;
   if (cond_ty->IsUnknownNdim() || x1_ty->IsUnknownNdim() || x2_ty->IsUnknownNdim()) {
@@ -209,7 +210,7 @@ Type InferTypeArgmaxArgmin(const Call& call, const BlockBuilder& ctx) {
     TVM_FFI_ICHECK_GE(out_ndim, 0);
   }
 
-  DataType out_dtype = DataType::Int(64);
+  PrimType out_dtype = PrimType::Int(64);
   // The inference rule for reduction operator output shapes:
   // - axes is None, keepdims is false -> return the zero-rank shape;
   // - axes is None, keepdims is true -> return the shape whose ndim is the same as input and every
@@ -230,7 +231,7 @@ Type InferTypeArgmaxArgmin(const Call& call, const BlockBuilder& ctx) {
   }
 
   if (data_ty->ndim > 0) {
-    out_dtype = data_shape->values[0]->dtype;
+    out_dtype = data_shape->values[0].ty();
   }
 
   ffi::Array<PrimExpr> out_shape;
diff --git a/src/relax/op/tensor/set.cc b/src/relax/op/tensor/set.cc
index 57999a3356b7..a92cbee4a001 100644
--- a/src/relax/op/tensor/set.cc
+++ b/src/relax/op/tensor/set.cc
@@ -106,9 +106,9 @@ Type InferTypeUnique(const Call& call, const BlockBuilder& ctx) {
   if (f_convert_to_int64(return_index->value)) {
     if (data_ty->ndim == 0) {
       output_ty.push_back(
-          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), DataType::Int(64), data_ty->vdevice));
+          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), PrimType::Int(64), data_ty->vdevice));
     } else {
-      output_ty.push_back(TensorType(DataType::Int(64), /*ndim=*/1, data_ty->vdevice));
+      output_ty.push_back(TensorType(PrimType::Int(64), /*ndim=*/1, data_ty->vdevice));
     }
   }
 
@@ -116,9 +116,9 @@ Type InferTypeUnique(const Call& call, const BlockBuilder& ctx) {
   if (f_convert_to_int64(return_inverse->value)) {
     if (data_ty->ndim == 0) {
       output_ty.push_back(
-          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), DataType::Int(64), data_ty->vdevice));
+          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), PrimType::Int(64), data_ty->vdevice));
     } else {
-      output_ty.push_back(TensorType(DataType::Int(64), /*ndim=*/1, data_ty->vdevice));
+      output_ty.push_back(TensorType(PrimType::Int(64), /*ndim=*/1, data_ty->vdevice));
     }
   }
 
@@ -126,9 +126,9 @@ Type InferTypeUnique(const Call& call, const BlockBuilder& ctx) {
   if (f_convert_to_int64(return_counts->value)) {
     if (data_ty->ndim == 0) {
       output_ty.push_back(
-          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), DataType::Int(64), data_ty->vdevice));
+          TensorType(ShapeExpr({IntImm::Int64(/*value=*/1)}), PrimType::Int(64), data_ty->vdevice));
     } else {
-      output_ty.push_back(TensorType(DataType::Int(64), /*ndim=*/1, data_ty->vdevice));
+      output_ty.push_back(TensorType(PrimType::Int(64), /*ndim=*/1, data_ty->vdevice));
     }
   }
 
@@ -175,7 +175,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 Type InferTypeNonzero(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = GetInputTensorType(call, 0, ctx);
-  return TensorType(DataType::Int(64), 2, data_ty->vdevice);
+  return TensorType(PrimType::Int(64), 2, data_ty->vdevice);
 }
 
 TVM_REGISTER_OP("relax.nonzero")
diff --git a/src/relax/op/tensor/sorting.cc b/src/relax/op/tensor/sorting.cc
index 2d014cded4ec..c470fa0d4f6e 100644
--- a/src/relax/op/tensor/sorting.cc
+++ b/src/relax/op/tensor/sorting.cc
@@ -66,7 +66,7 @@ TVM_REGISTER_OP("relax.sort")
 
 /* relax.argsort */
 
-Expr argsort(Expr data, int axis, bool descending, DataType dtype) {
+Expr argsort(Expr data, int axis, bool descending, DLDataType dtype) {
   auto attrs = ffi::make_object<ArgsortAttrs>();
   attrs->axis = std::move(axis);
   attrs->descending = std::move(descending);
@@ -84,7 +84,8 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Type InferTypeArgsort(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = GetUnaryInputTensorType(call, ctx);
   const auto* attrs = call->attrs.as<ArgsortAttrs>();
-  DataType out_type = attrs->dtype.is_void() ? data_ty->dtype : attrs->dtype;
+  PrimType out_type =
+      attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0} ? data_ty->dtype : PrimType(attrs->dtype);
   if (data_ty->shape.defined()) {
     return TensorType(data_ty->shape.value(), out_type, data_ty->vdevice);
   }
@@ -100,7 +101,7 @@ TVM_REGISTER_OP("relax.argsort")
 
 /* relax.topk */
 
-Expr topk(Expr data, int k, int axis, ffi::String ret_type, bool largest, DataType dtype) {
+Expr topk(Expr data, int k, int axis, ffi::String ret_type, bool largest, DLDataType dtype) {
   auto attrs = ffi::make_object<TopKAttrs>();
   attrs->k = std::move(k);
   attrs->axis = std::move(axis);
@@ -121,7 +122,8 @@ Type InferTypeTopK(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = GetUnaryInputTensorType(call, ctx);
   const auto* data_shape = data_ty->shape.as<ShapeExprNode>();
   const auto* attrs = call->attrs.as<TopKAttrs>();
-  DataType indices_type = attrs->dtype.is_void() ? data_ty->dtype : attrs->dtype;
+  PrimType indices_type =
+      attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0} ? data_ty->dtype : PrimType(attrs->dtype);
   int ndim = data_ty->ndim;
   int k = attrs->k;
   ffi::String ret_type = attrs->ret_type;
diff --git a/src/relax/op/tensor/sorting.h b/src/relax/op/tensor/sorting.h
index a4154ce416ad..8a2ec98388df 100644
--- a/src/relax/op/tensor/sorting.h
+++ b/src/relax/op/tensor/sorting.h
@@ -51,7 +51,7 @@ Expr sort(Expr data, int axis, bool descending);
  * \param dtype The data type of the output indices.
  * \return The computed result.
  */
-Expr argsort(Expr data, int axis, bool descending, DataType dtype);
+Expr argsort(Expr data, int axis, bool descending, DLDataType dtype);
 
 /*!
  * \brief Get the top k elements in an input tensor along the given axis.
@@ -63,7 +63,7 @@ Expr argsort(Expr data, int axis, bool descending, DataType dtype);
  * \param dtype The data type of the indices output.
  * \return The computed result.
  */
-Expr topk(Expr data, int k, int axis, ffi::String ret_type, bool largest, DataType dtype);
+Expr topk(Expr data, int k, int axis, ffi::String ret_type, bool largest, DLDataType dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/op/tensor/statistical.cc b/src/relax/op/tensor/statistical.cc
index 9fe68afe2901..15bbd701e67f 100644
--- a/src/relax/op/tensor/statistical.cc
+++ b/src/relax/op/tensor/statistical.cc
@@ -155,7 +155,8 @@ Type InferTypeScan(const Call& call, const BlockBuilder& ctx) {
   TensorType data_ty = GetUnaryInputTensorType(call, ctx);
   const auto* attrs = call->attrs.as<ScanopAttrs>();
 
-  DataType out_type = attrs->dtype.is_void() ? data_ty->dtype : attrs->dtype;
+  PrimType out_type =
+      attrs->dtype == DLDataType{kDLOpaqueHandle, 0, 0} ? data_ty->dtype : PrimType(attrs->dtype);
 
   if (!attrs->axis.has_value()) {
     // flattened
@@ -216,7 +217,7 @@ Type InferTypeStatisticalExtension(const Call& call, const BlockBuilder& ctx) {
       return TensorType(ShapeExpr(ffi::Array<PrimExpr>()), data_ty->dtype, data_ty->vdevice);
     }
     return TupleType({TensorType(data_ty->dtype, out_ndim, data_ty->vdevice),
-                      TensorType(DataType::Int(64), out_ndim, data_ty->vdevice)});
+                      TensorType(PrimType::Int(64), out_ndim, data_ty->vdevice)});
   }
 
   ffi::Array<PrimExpr> out_shape;
@@ -234,15 +235,15 @@ Type InferTypeStatisticalExtension(const Call& call, const BlockBuilder& ctx) {
     return TensorType(ShapeExpr(out_shape), data_ty->dtype, data_ty->vdevice);
   else
     return TupleType({TensorType(ShapeExpr(out_shape), data_ty->dtype, data_ty->vdevice),
-                      TensorType(ShapeExpr(out_shape), DataType::Int(64), data_ty->vdevice)});
+                      TensorType(ShapeExpr(out_shape), PrimType::Int(64), data_ty->vdevice)});
 }
 
 /* relax.cumprod */
-Expr cumprod(Expr data, ffi::Optional<int64_t> axis, ffi::Optional<DataType> dtype,
+Expr cumprod(Expr data, ffi::Optional<int64_t> axis, ffi::Optional<DLDataType> dtype,
              bool exclusive) {
   auto attrs = ffi::make_object<ScanopAttrs>();
   attrs->axis = std::move(axis);
-  attrs->dtype = std::move(dtype.value_or(DataType::Void()));
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   attrs->exclusive = exclusive;
 
   static const Op& op = Op::Get("relax.cumprod");
@@ -262,10 +263,11 @@ TVM_REGISTER_OP("relax.cumprod")
     .set_attr<bool>("FPurity", true);
 
 /* relax.cumsum */
-Expr cumsum(Expr data, ffi::Optional<int64_t> axis, ffi::Optional<DataType> dtype, bool exclusive) {
+Expr cumsum(Expr data, ffi::Optional<int64_t> axis, ffi::Optional<DLDataType> dtype,
+            bool exclusive) {
   auto attrs = ffi::make_object<ScanopAttrs>();
   attrs->axis = std::move(axis);
-  attrs->dtype = std::move(dtype.value_or(DataType::Void()));
+  attrs->dtype = dtype.value_or((DLDataType{kDLOpaqueHandle, 0, 0}));
   attrs->exclusive = exclusive;
 
   static const Op& op = Op::Get("relax.cumsum");
diff --git a/src/relax/op/tensor/statistical.h b/src/relax/op/tensor/statistical.h
index 2d80790926ed..3ab998110603 100644
--- a/src/relax/op/tensor/statistical.h
+++ b/src/relax/op/tensor/statistical.h
@@ -99,7 +99,7 @@ Expr sum(Expr x, ffi::Optional<ffi::Array<int64_t>> axis, bool keepdims);
  * result.
  */
 Expr cumprod(Expr data, ffi::Optional<int64_t> axis = std::nullopt,
-             ffi::Optional<DataType> dtype = std::nullopt, bool exclusive = false);
+             ffi::Optional<DLDataType> dtype = std::nullopt, bool exclusive = false);
 
 /*!
  * \brief Numpy style cumsum op. Return the cumulative inclusive sum of the elements along
@@ -114,7 +114,7 @@ Expr cumprod(Expr data, ffi::Optional<int64_t> axis = std::nullopt,
  * \return The computed result.
  */
 Expr cumsum(Expr data, ffi::Optional<int64_t> axis = std::nullopt,
-            ffi::Optional<DataType> dtype = std::nullopt, bool exclusive = false);
+            ffi::Optional<DLDataType> dtype = std::nullopt, bool exclusive = false);
 
 /*! \brief Computes the variance of tensor elements over given axes. */
 Expr variance(Expr x, ffi::Optional<ffi::Array<int64_t>> axis, bool keepdims);
diff --git a/src/relax/op/tensor/ternary.cc b/src/relax/op/tensor/ternary.cc
index 6daacfe16578..1e21e7dbdcc7 100644
--- a/src/relax/op/tensor/ternary.cc
+++ b/src/relax/op/tensor/ternary.cc
@@ -57,9 +57,9 @@ Type InferTypeEwiseFMA(const Call& call, const BlockBuilder& ctx) {
     }
   }
 
-  DataType output_dtype;
+  PrimType output_dtype = PrimType::Void();
   if (t1->IsUnknownDtype() || t2->IsUnknownDtype() || t3->IsUnknownDtype()) {
-    output_dtype = DataType::Void();
+    output_dtype = PrimType::Void();
   } else if (t1->dtype != t2->dtype || t2->dtype != t3->dtype) {
     TVM_FFI_VISIT_THROW(TypeError, call) << "Data types " << t1->dtype << ", " << t2->dtype
                                          << ", and " << t3->dtype << " must be equal for EwiseFMA";
diff --git a/src/relax/op/tensor/unary.cc b/src/relax/op/tensor/unary.cc
index 598ec78aacda..bd15223df878 100644
--- a/src/relax/op/tensor/unary.cc
+++ b/src/relax/op/tensor/unary.cc
@@ -33,7 +33,7 @@ namespace relax {
 
 Type InferTypeUnaryCheck(const Call& call, const BlockBuilder& ctx) {
   return InferTypeUnary<false>(call, ctx,
-                               [](const TensorType& input_ty) { return DataType::Bool(); });
+                               [](const TensorType& input_ty) { return PrimType::Bool(); });
 }
 
 /***************** Arithmetic operators *****************/
diff --git a/src/relax/op/vision/nms.cc b/src/relax/op/vision/nms.cc
index bde579f0ed5a..6f289d6b8755 100644
--- a/src/relax/op/vision/nms.cc
+++ b/src/relax/op/vision/nms.cc
@@ -84,8 +84,8 @@ Type InferTypeAllClassNMS(const Call& call, const BlockBuilder& ctx) {
     ShapeExpr oshape(oshape_values);
     tvm::ffi::Array<PrimExpr> counts_values = {1};
     ShapeExpr counts_shape(counts_values);
-    tvm::ffi::Array<Type> fields = {TensorType(oshape, DataType::Int(64), vdev),
-                                    TensorType(counts_shape, DataType::Int(64), vdev)};
+    tvm::ffi::Array<Type> fields = {TensorType(oshape, PrimType::Int(64), vdev),
+                                    TensorType(counts_shape, PrimType::Int(64), vdev)};
     return TupleType(fields);
   }
 
@@ -96,9 +96,9 @@ Type InferTypeAllClassNMS(const Call& call, const BlockBuilder& ctx) {
   ShapeExpr scores_shape(scores_values);
   tvm::ffi::Array<PrimExpr> counts_values = {batch};
   ShapeExpr counts_shape(counts_values);
-  tvm::ffi::Array<Type> fields = {TensorType(indices_shape, DataType::Int(64), vdev),
-                                  TensorType(scores_shape, DataType::Float(32), vdev),
-                                  TensorType(counts_shape, DataType::Int(64), vdev)};
+  tvm::ffi::Array<Type> fields = {TensorType(indices_shape, PrimType::Int(64), vdev),
+                                  TensorType(scores_shape, PrimType::Float(32), vdev),
+                                  TensorType(counts_shape, PrimType::Int(64), vdev)};
   return TupleType(fields);
 }
 
@@ -153,9 +153,9 @@ Type InferTypeGetValidCounts(const Call& call, const BlockBuilder& ctx) {
   auto vdev = data_ty->vdevice;
   const auto* data_shape = data_ty->shape.as<ShapeExprNode>();
   if (data_shape == nullptr) {
-    tvm::ffi::Array<Type> fields = {TensorType(DataType::Int(32), /*ndim=*/1, vdev),
+    tvm::ffi::Array<Type> fields = {TensorType(PrimType::Int(32), /*ndim=*/1, vdev),
                                     TensorType(data_ty->dtype, /*ndim=*/3, vdev),
-                                    TensorType(DataType::Int(32), /*ndim=*/2, vdev)};
+                                    TensorType(PrimType::Int(32), /*ndim=*/2, vdev)};
     return TupleType(fields);
   }
 
@@ -177,9 +177,9 @@ Type InferTypeGetValidCounts(const Call& call, const BlockBuilder& ctx) {
   }
 
   tvm::ffi::Array<Type> fields = {
-      TensorType(ShapeExpr({batch}), DataType::Int(32), vdev),
+      TensorType(ShapeExpr({batch}), PrimType::Int(32), vdev),
       TensorType(ShapeExpr({batch, num_anchors, elem_length}), data_ty->dtype, vdev),
-      TensorType(ShapeExpr({batch, num_anchors}), DataType::Int(32), vdev)};
+      TensorType(ShapeExpr({batch, num_anchors}), PrimType::Int(32), vdev)};
   return TupleType(fields);
 }
 
@@ -251,12 +251,12 @@ Type InferTypeNMS(const Call& call, const BlockBuilder& ctx) {
     TVM_FFI_VISIT_THROW(ValueError, call)
         << "non_max_suppression expects indices to be 2-D, got ndim " << indices_ty->ndim;
   }
-  if (!valid_count_ty->IsUnknownDtype() && valid_count_ty->dtype != DataType::Int(32)) {
+  if (!valid_count_ty->IsUnknownDtype() && valid_count_ty->dtype != PrimType::Int(32)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "non_max_suppression expects valid_count to have dtype int32, got "
         << valid_count_ty->dtype;
   }
-  if (!indices_ty->IsUnknownDtype() && indices_ty->dtype != DataType::Int(32)) {
+  if (!indices_ty->IsUnknownDtype() && indices_ty->dtype != PrimType::Int(32)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "non_max_suppression expects indices to have dtype int32, got " << indices_ty->dtype;
   }
@@ -319,30 +319,30 @@ Type InferTypeNMS(const Call& call, const BlockBuilder& ctx) {
       //                   valid_box_count[batch, 1])
       if (data_shape == nullptr) {
         tvm::ffi::Array<Type> fields = {TensorType(data_ty->dtype, /*ndim=*/3, vdev),
-                                        TensorType(DataType::Int(32), /*ndim=*/2, vdev),
-                                        TensorType(DataType::Int(32), /*ndim=*/2, vdev)};
+                                        TensorType(PrimType::Int(32), /*ndim=*/2, vdev),
+                                        TensorType(PrimType::Int(32), /*ndim=*/2, vdev)};
         return TupleType(fields);
       }
       auto batch = data_shape->values[0];
       auto num_anchors = data_shape->values[1];
       tvm::ffi::Array<Type> fields = {
           TensorType(ffi::GetRef<ShapeExpr>(data_shape), data_ty->dtype, vdev),
-          TensorType(ShapeExpr({batch, num_anchors}), DataType::Int(32), vdev),
-          TensorType(ShapeExpr({batch, IntImm::Int64(1)}), DataType::Int(32), vdev)};
+          TensorType(ShapeExpr({batch, num_anchors}), PrimType::Int(32), vdev),
+          TensorType(ShapeExpr({batch, IntImm::Int64(1)}), PrimType::Int(32), vdev)};
       return TupleType(fields);
     }
 
     // Hard NMS returns (box_indices[batch, num_anchors], valid_box_count[batch, 1])
     if (data_shape == nullptr) {
-      tvm::ffi::Array<Type> fields = {TensorType(DataType::Int(32), /*ndim=*/2, vdev),
-                                      TensorType(DataType::Int(32), /*ndim=*/2, vdev)};
+      tvm::ffi::Array<Type> fields = {TensorType(PrimType::Int(32), /*ndim=*/2, vdev),
+                                      TensorType(PrimType::Int(32), /*ndim=*/2, vdev)};
       return TupleType(fields);
     }
     auto batch = data_shape->values[0];
     auto num_anchors = data_shape->values[1];
     tvm::ffi::Array<Type> fields = {
-        TensorType(ShapeExpr({batch, num_anchors}), DataType::Int(32), vdev),
-        TensorType(ShapeExpr({batch, IntImm::Int64(1)}), DataType::Int(32), vdev)};
+        TensorType(ShapeExpr({batch, num_anchors}), PrimType::Int(32), vdev),
+        TensorType(ShapeExpr({batch, IntImm::Int64(1)}), PrimType::Int(32), vdev)};
     return TupleType(fields);
   }
 
diff --git a/src/relax/script/printer/dependent_type.cc b/src/relax/script/printer/dependent_type.cc
index a37c21406fac..e3a14c0cdafe 100644
--- a/src/relax/script/printer/dependent_type.cc
+++ b/src/relax/script/printer/dependent_type.cc
@@ -100,7 +100,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
           }
           if (!n->IsUnknownDtype()) {
             kwargs_keys.push_back("dtype");
-            kwargs_values.push_back(LiteralDoc::DataType(n->dtype, n_p->Attr("dtype")));
+            kwargs_values.push_back(LiteralDoc::DataType(n->dtype->dtype, n_p->Attr("dtype")));
           }
           if (!n->shape.defined() && !n->IsUnknownNdim()) {
             kwargs_keys.push_back("ndim");
diff --git a/src/relax/script/printer/distributed.cc b/src/relax/script/printer/distributed.cc
index f05ec8fe714a..97d800d5d139 100644
--- a/src/relax/script/printer/distributed.cc
+++ b/src/relax/script/printer/distributed.cc
@@ -61,11 +61,11 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
           }
           if (!n->tensor_ty->IsUnknownDtype()) {
             if (!require_kwargs) {
-              args.push_back(LiteralDoc::DataType(n->tensor_ty->dtype, n_p->Attr("dtype")));
+              args.push_back(LiteralDoc::DataType(n->tensor_ty->dtype->dtype, n_p->Attr("dtype")));
             } else {
               kwargs_keys.push_back("dtype");
               kwargs_values.push_back(
-                  LiteralDoc::DataType(n->tensor_ty->dtype, n_p->Attr("dtype")));
+                  LiteralDoc::DataType(n->tensor_ty->dtype->dtype, n_p->Attr("dtype")));
             }
           } else {
             require_kwargs = true;
diff --git a/src/relax/script/printer/expr.cc b/src/relax/script/printer/expr.cc
index dfce2b40b1f9..7b2f39ecf335 100644
--- a/src/relax/script/printer/expr.cc
+++ b/src/relax/script/printer/expr.cc
@@ -81,21 +81,21 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
         });
 
 ffi::Optional<ExprDoc> SpecialScalar(const runtime::Tensor& n, const AccessPath& p) {
-  DataType dtype = n.DataType();
+  DLDataType dtype = n.DataType();
   const void* data = n->data;
   if (n->ndim != 0 || n->device.device_type != kDLCPU) {
     return std::nullopt;
   }
 
-  if (dtype == DataType::Int(8)) {
+  if (dtype == DLDataType{kDLInt, 8, 1}) {
     return LiteralDoc::Int(*reinterpret_cast<const int8_t*>(data), p);
-  } else if (dtype == DataType::Int(16)) {
+  } else if (dtype == DLDataType{kDLInt, 16, 1}) {
     return LiteralDoc::Int(*reinterpret_cast<const int16_t*>(data), p);
-  } else if (dtype == DataType::Int(32)) {
+  } else if (dtype == DLDataType{kDLInt, 32, 1}) {
     return LiteralDoc::Int(*reinterpret_cast<const int32_t*>(data), p);
-  } else if (dtype == DataType::Int(64)) {
+  } else if (dtype == DLDataType{kDLInt, 64, 1}) {
     return LiteralDoc::Int(*reinterpret_cast<const int64_t*>(data), p);
-  } else if (dtype == DataType::Float(16)) {
+  } else if (dtype == DLDataType{kDLFloat, 16, 1}) {
     // From IEEE-754 float16 definition
     //
     // Ref: https://en.wikipedia.org/wiki/Half-precision_floating-point_format
@@ -122,11 +122,11 @@ ffi::Optional<ExprDoc> SpecialScalar(const runtime::Tensor& n, const AccessPath&
     }
 
     return LiteralDoc::Float(value, p);
-  } else if (dtype == DataType::Float(32)) {
+  } else if (dtype == DLDataType{kDLFloat, 32, 1}) {
     return LiteralDoc::Float(*reinterpret_cast<const float*>(data), p);
-  } else if (dtype == DataType::Float(64)) {
+  } else if (dtype == DLDataType{kDLFloat, 64, 1}) {
     return LiteralDoc::Float(*reinterpret_cast<const double*>(data), p);
-  } else if (dtype == DataType::Bool()) {
+  } else if (dtype == DLDataType{kDLBool, 8, 1}) {
     return LiteralDoc::Boolean(*reinterpret_cast<const uint8_t*>(data), p);
   } else {
     return std::nullopt;
diff --git a/src/relax/script/printer/tir.cc b/src/relax/script/printer/tir.cc
index e0742f8edd44..06bce7c1ff8c 100644
--- a/src/relax/script/printer/tir.cc
+++ b/src/relax/script/printer/tir.cc
@@ -43,9 +43,10 @@ RelaxFrameNode* GetRelaxFrame(IRDocsifier d) {
 }
 
 Doc PrintTIRVar(tirx::Var n, AccessPath n_p, IRDocsifier d) {
-  TVM_FFI_CHECK(n->dtype.is_scalar(), TypeError)
+  PrimType n_ty = n.ty();
+  TVM_FFI_CHECK(!n_ty.IsScalableVector() && !n_ty.IsFixedLengthVector(), TypeError)
       << "Relax only uses scalar TIR variables,"
-      << "but received TIR variable " << n << " with dtype " << n->dtype;
+      << " but received TIR variable " << n << " with dtype " << n_ty->dtype;
 
   if (!d->IsVarDefined(n)) {
     RelaxFrameNode* f = GetRelaxFrame(d);
@@ -77,7 +78,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tvm::IntImm>(                                             //
         "relax", [](tvm::IntImm n, AccessPath n_p, IRDocsifier d) -> Doc {  //
           // TODO(@junrushao): support non-int64 cases
-          if (n->dtype.is_bool()) {
+          if (n->ty().MatchesElementType(DLDataTypeCode::kDLBool, 8)) {
             return LiteralDoc::Boolean(n->value, n_p);
           } else {
             return LiteralDoc::Int(n->value, n_p);
diff --git a/src/relax/transform/adjust_matmul_order.cc b/src/relax/transform/adjust_matmul_order.cc
index 4cf8831514dc..2d6e6fcc5e33 100644
--- a/src/relax/transform/adjust_matmul_order.cc
+++ b/src/relax/transform/adjust_matmul_order.cc
@@ -208,22 +208,24 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
     // If two of the three are compile-time, group those two values
     // together, to allow them to be lifted out and pre-computed.
     if (is_compile_time(expr_a) && is_compile_time(expr_b)) {
-      return matmul(matmul(expr_a, expr_b, DataType::Void()), expr_c, DataType::Void());
+      return matmul(matmul(expr_a, expr_b, (DLDataType{kDLOpaqueHandle, 0, 0})), expr_c,
+                    (DLDataType{kDLOpaqueHandle, 0, 0}));
     } else if (is_compile_time(expr_b) && is_compile_time(expr_c)) {
-      return matmul(expr_a, matmul(expr_b, expr_c, DataType::Void()), DataType::Void());
+      return matmul(expr_a, matmul(expr_b, expr_c, (DLDataType{kDLOpaqueHandle, 0, 0})),
+                    (DLDataType{kDLOpaqueHandle, 0, 0}));
     }
 
     // Otherwise, select the order that reduces the total number of
     // operations required, assuming a naive matmul (see below).
 
     if (shape_a.size() == 1) {
-      shape_a = {IntImm(shape_a[0].dtype(), 1), shape_a[0]};
+      shape_a = {IntImm(shape_a[0].ty(), 1), shape_a[0]};
     }
     if (shape_b.size() == 1) {
       if (matches.count(pat_matmul_on_lhs)) {
-        shape_b = {shape_b[0], IntImm(shape_b[0].dtype(), 1)};
+        shape_b = {shape_b[0], IntImm(shape_b[0].ty(), 1)};
       } else if (matches.count(pat_matmul_on_rhs)) {
-        shape_b = {IntImm(shape_b[0].dtype(), 1), shape_b[0]};
+        shape_b = {IntImm(shape_b[0].ty(), 1), shape_b[0]};
       } else {
         TVM_FFI_THROW(InternalError)
             << "OrPattern " << pat << " matched, but neither " << pat_matmul_on_lhs << " nor "
@@ -231,7 +233,7 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
       }
     }
     if (shape_c.size() == 1) {
-      shape_c = {shape_c[0], IntImm(shape_c[0].dtype(), 1)};
+      shape_c = {shape_c[0], IntImm(shape_c[0].ty(), 1)};
     }
 
     PrimExpr size_N = shape_a[shape_a.size() - 2];  // row of A
@@ -285,9 +287,11 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
                       size_N > 0 && size_R > 0 && size_M > 0 && size_B > 0);
 
     if (analyzer->CanProve(ops_with_lhs_first < ops_with_rhs_first)) {
-      return matmul(matmul(expr_a, expr_b, DataType::Void()), expr_c, DataType::Void());
+      return matmul(matmul(expr_a, expr_b, (DLDataType{kDLOpaqueHandle, 0, 0})), expr_c,
+                    (DLDataType{kDLOpaqueHandle, 0, 0}));
     } else if (analyzer->CanProve(ops_with_rhs_first < ops_with_lhs_first)) {
-      return matmul(expr_a, matmul(expr_b, expr_c, DataType::Void()), DataType::Void());
+      return matmul(expr_a, matmul(expr_b, expr_c, (DLDataType{kDLOpaqueHandle, 0, 0})),
+                    (DLDataType{kDLOpaqueHandle, 0, 0}));
     }
 
     // If we cannot determine which order is best, keep the existing order.
diff --git a/src/relax/transform/allocate_workspace.cc b/src/relax/transform/allocate_workspace.cc
index 4dfc84b822da..a593cb7ffee7 100644
--- a/src/relax/transform/allocate_workspace.cc
+++ b/src/relax/transform/allocate_workspace.cc
@@ -61,7 +61,7 @@ class ExternFunctionRewriter : ExprMutator {
       // Append the workspace parameter to this function.
       ffi::Array<Var> new_params = func_node->params;
 
-      auto ty = TensorType(ShapeExpr({IntImm::Int32(max_workspace_size_)}), DataType::UInt(8));
+      auto ty = TensorType(ShapeExpr({IntImm::Int32(max_workspace_size_)}), PrimType::UInt(8));
       Var workspace_param(name_sup_->FreshName("workspace"), ty);
 
       if (func_node->GetAttr<ffi::String>(attr::kCodegen)) {
@@ -149,7 +149,7 @@ class WorkspaceProvider : ExprMutator {
     builder_->BeginDataflowBlock();
     if (!workspace_var_main_.defined()) {
       auto shape = ShapeExpr({IntImm::Int32(max_workspace_size_)});
-      auto ty = DataTypeImm(DataType::UInt(8));
+      auto ty = DataTypeImm((DLDataType{kDLUInt, 8, 1}));
       auto workspace = MakeAllocTensor(shape, ty, PrimValue::Int64(0));
       workspace_var_main_ = builder_->Emit(workspace, "workspace_main");
     }
diff --git a/src/relax/transform/alter_op_impl.cc b/src/relax/transform/alter_op_impl.cc
index a938b946d20c..7a3b5743f423 100644
--- a/src/relax/transform/alter_op_impl.cc
+++ b/src/relax/transform/alter_op_impl.cc
@@ -45,7 +45,7 @@ static constexpr const char* kOperatorName = "operator_name";
 
 /*! \brief Construct ranges from shape dimensions */
 static ffi::Array<Range> ConstructRangeFromShape(const ffi::Array<PrimExpr>& shape) {
-  return shape.Map([](const PrimExpr& dim) { return Range(IntImm(dim.dtype(), 0), dim); });
+  return shape.Map([](const PrimExpr& dim) { return Range(IntImm(dim.ty(), 0), dim); });
 }
 
 static ffi::Array<PrimExpr> GetShapeFromTensorType(const TensorType& tensor_ty) {
@@ -206,7 +206,7 @@ class AlterOpImplMutator : public ExprMutator {
    * \brief Adds the \p remove_pad op to the module if it has not already been added before.
    * \returns The global var associated with the remove_pad PrimFunc.
    */
-  GlobalVar GetOrCreateRemovePadOp(const ffi::Array<PrimExpr>& old_shape, const DataType& dtype) {
+  GlobalVar GetOrCreateRemovePadOp(const ffi::Array<PrimExpr>& old_shape, DLDataType dtype) {
     int t_shape = old_shape.size();
     if (remove_pad_map_.count(t_shape) != 0) {
       return remove_pad_map_[t_shape];
@@ -214,8 +214,8 @@ class AlterOpImplMutator : public ExprMutator {
     // Create dynamic shapes for input and output tensors
     ffi::Array<PrimExpr> dyn_padded_shape, dyn_old_shape;
     for (int i = 0; i < t_shape; i++) {
-      tirx::Var var1("p" + std::to_string(i), old_shape[i].dtype());
-      tirx::Var var2("i" + std::to_string(i), old_shape[i].dtype());
+      tirx::Var var1("p" + std::to_string(i), old_shape[i].ty());
+      tirx::Var var2("i" + std::to_string(i), old_shape[i].ty());
       dyn_padded_shape.push_back(var1);
       dyn_old_shape.push_back(var2);
     }
@@ -264,7 +264,7 @@ class AlterOpImplMutator : public ExprMutator {
           TransformLayout(expr, inverse_index_map, axis_separator, input_axis_separator));
       const auto& tensor_ty = padded_expr->ty.as_or_throw<TensorType>();
 
-      GlobalVar gv_remove_pad = GetOrCreateRemovePadOp(old_shape, tensor_ty->dtype);
+      GlobalVar gv_remove_pad = GetOrCreateRemovePadOp(old_shape, tensor_ty->dtype->dtype);
       return Call(call_tir_op_, {gv_remove_pad, Tuple({padded_expr})}, {}, {old_tensor_ty});
     }
   }
diff --git a/src/relax/transform/call_tir_rewrite.cc b/src/relax/transform/call_tir_rewrite.cc
index 61fee5be7f8d..5a1bbcaa0040 100644
--- a/src/relax/transform/call_tir_rewrite.cc
+++ b/src/relax/transform/call_tir_rewrite.cc
@@ -90,12 +90,12 @@ class CallTIRMutator : public ExprMutator {
         }
 
         if (!is_inplace) {
-          outs.push_back(builder_->Emit(
-              Call(alloc_tensor_op,
-                   {output_ty->shape.value().as_or_throw<ShapeExpr>(),
-                    DataTypeImm(output_ty->dtype), PrimValue::Int64(dev_index), StringImm(scope)},
-                   Attrs(), {output_ty}),
-              "alloc"));
+          outs.push_back(builder_->Emit(Call(alloc_tensor_op,
+                                             {output_ty->shape.value().as_or_throw<ShapeExpr>(),
+                                              DataTypeImm(output_ty->dtype->dtype),
+                                              PrimValue::Int64(dev_index), StringImm(scope)},
+                                             Attrs(), {output_ty}),
+                                        "alloc"));
         } else {
           // if there is only one output, it must be an in-place argument, but check anyway
           TVM_FFI_ICHECK(inplace_attrs->inplace_indices[0] != -1)
@@ -129,8 +129,8 @@ class CallTIRMutator : public ExprMutator {
             outs.push_back(
                 builder_->Emit(Call(alloc_tensor_op,
                                     {field_tensor->shape.value().as_or_throw<ShapeExpr>(),
-                                     DataTypeImm(field_tensor->dtype), PrimValue::Int64(dev_index),
-                                     StringImm(scope)},
+                                     DataTypeImm(field_tensor->dtype->dtype),
+                                     PrimValue::Int64(dev_index), StringImm(scope)},
                                     Attrs(), {field_tensor}),
                                "alloc"));
           } else {
diff --git a/src/relax/transform/combine_parallel_matmul.cc b/src/relax/transform/combine_parallel_matmul.cc
index 1319356ee169..128202063695 100644
--- a/src/relax/transform/combine_parallel_matmul.cc
+++ b/src/relax/transform/combine_parallel_matmul.cc
@@ -202,7 +202,7 @@ ffi::TypedFunction<ffi::Map<Var, Expr>(ffi::Map<DFPattern, Var>, ffi::Map<Var, E
       }
 
       auto concat_rhs = concat(Tuple(rhs), rhs_dim - 1);
-      auto out_dtype = GetTensorType(matchings[patterns.matmul[indices[0]]])->dtype;
+      DLDataType out_dtype = GetTensorType(matchings[patterns.matmul[indices[0]]])->dtype->dtype;
       auto matmul_combined = matmul(lhs, concat_rhs, out_dtype);
 
       if (branch_info.bias_dim) {
diff --git a/src/relax/transform/compute_prim_value.cc b/src/relax/transform/compute_prim_value.cc
index 4ad34d04367d..4c937fe135dc 100644
--- a/src/relax/transform/compute_prim_value.cc
+++ b/src/relax/transform/compute_prim_value.cc
@@ -43,11 +43,11 @@ class PrimValueComputeInjector : public ExprMutator {
       return node;
     }
 
-    auto ret_dtype = node->value->dtype;
+    tvm::PrimType ret_ty = node->value.ty();
     auto param_vars = tirx::UndefinedVars(node->value);
-    tirx::Stmt body = tirx::Evaluate(tirx::Call(ret_dtype, tirx::builtin::ret(), {node->value}));
+    tirx::Stmt body = tirx::Evaluate(tirx::Call(ret_ty, tirx::builtin::ret(), {node->value}));
 
-    tirx::PrimFunc func(param_vars, body, tvm::PrimType(ret_dtype), {},
+    tirx::PrimFunc func(param_vars, body, ret_ty, {},
                         DictAttrs({{tirx::attr::kIsHostFunc, true}, {tvm::attr::kSTir, true}}));
     func = s_tir::RenewDefs(func);
 
diff --git a/src/relax/transform/convert_layout.cc b/src/relax/transform/convert_layout.cc
index ed2a9b1c8a8a..bd4631bb4cf8 100644
--- a/src/relax/transform/convert_layout.cc
+++ b/src/relax/transform/convert_layout.cc
@@ -102,7 +102,7 @@ class LayoutConvertMutator : public ExprMutator {
     ffi::Array<PrimExpr> initial_indices_expr;
     initial_indices.reserve(ndim);
     for (int i = 0; i < ndim; ++i) {
-      auto var = tvm::tirx::Var("i" + std::to_string(i), DataType::Int(32));
+      auto var = tvm::tirx::Var("i" + std::to_string(i), PrimType::Int(32));
       initial_indices.push_back(var);
       initial_indices_expr.push_back(var);
     }
diff --git a/src/relax/transform/dataflow_inplace.cc b/src/relax/transform/dataflow_inplace.cc
index fcedd3119599..289c1c3c3b40 100644
--- a/src/relax/transform/dataflow_inplace.cc
+++ b/src/relax/transform/dataflow_inplace.cc
@@ -383,7 +383,7 @@ std::unordered_set<Type, ffi::ObjectPtrHash, ffi::ObjectPtrEqual> GatherCandidat
     const Type& result_ty) {
   if (auto* tensor_info = result_ty.as<TensorTypeNode>()) {
     // don't consider void dtype (don't know the size at compile time)
-    if (tensor_info->dtype.is_void()) {
+    if (tensor_info->dtype.IsVoid()) {
       return {};
     }
     // don't consider cases where we don't know the shape at compile time
diff --git a/src/relax/transform/decompose_ops.cc b/src/relax/transform/decompose_ops.cc
index 494e4a67a4a4..156d3c278c46 100644
--- a/src/relax/transform/decompose_ops.cc
+++ b/src/relax/transform/decompose_ops.cc
@@ -66,7 +66,7 @@ Tuple DecomposeBatchNorm(const Call& call) {
   Expr moving_var = ExpandToMatchInput(call->args[4], ty->ndim, {attrs->axis});
 
   // output = (x - mean) / sqrt(var + epsilon) * gamma + beta
-  Expr epsilon = MakeConstantScalar(attrs->epsilon, ty->dtype);
+  Expr epsilon = MakeConstantScalar(attrs->epsilon, ty->dtype->dtype);
   Expr sqrt_var = sqrt(add(moving_var, epsilon));
   Expr out = divide(subtract(data, moving_mean), sqrt_var);
 
@@ -103,8 +103,8 @@ Expr MutateBatchNormForTraining(Call call) {
   Expr data_mean = mean(data, reduce_axes, false);
   Expr data_var = variance(data, reduce_axes, false);
 
-  Expr momentum = MakeConstantScalar(attrs->momentum, ty->dtype);
-  Expr one_minus_mom = MakeConstantScalar(1 - attrs->momentum, ty->dtype);
+  Expr momentum = MakeConstantScalar(attrs->momentum, ty->dtype->dtype);
+  Expr one_minus_mom = MakeConstantScalar(1 - attrs->momentum, ty->dtype->dtype);
 
   Expr new_moving_mean = add(multiply(one_minus_mom, moving_mean), multiply(momentum, data_mean));
   Expr new_moving_var = add(multiply(one_minus_mom, moving_var), multiply(momentum, data_var));
@@ -128,7 +128,7 @@ Expr DecomposeLayerNorm(const Call& call) {
   Expr data_var = variance(data, attrs->axes, true);
 
   // output = (x - mean) / sqrt(var + epsilon) * gamma + beta
-  Expr epsilon = MakeConstantScalar(attrs->epsilon, ty->dtype);
+  Expr epsilon = MakeConstantScalar(attrs->epsilon, ty->dtype->dtype);
   Expr sqrt_var = sqrt(add(data_var, epsilon));
   Expr out = divide(subtract(data, data_mean), sqrt_var);
 
@@ -159,7 +159,7 @@ Expr TensorToShape(const Call& call_node, const BlockBuilder& builder) {
   // ffi::Array<PrimExpr>), we define symbolic variables and returns them as a ShapeExpr.
   ffi::Array<PrimExpr> shape_var;
   for (int i = 0; i < ty->ndim; i++) {
-    shape_var.push_back(tirx::Var("x", DataType::Int(64)));
+    shape_var.push_back(tirx::Var("x", PrimType::Int(64)));
   }
   // bind symbolic variables to the shape tuple
   relax::Var var("y", ShapeType(shape_var));
diff --git a/src/relax/transform/expand_matmul_of_sum.cc b/src/relax/transform/expand_matmul_of_sum.cc
index 1e768478fd95..9bf5fbd53b2d 100644
--- a/src/relax/transform/expand_matmul_of_sum.cc
+++ b/src/relax/transform/expand_matmul_of_sum.cc
@@ -88,7 +88,8 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
       rhs_b = permute_dims(rhs_b, axes);
     }
 
-    return add(matmul(lhs, rhs_a, DataType::Void()), matmul(lhs, rhs_b, DataType::Void()));
+    return add(matmul(lhs, rhs_a, (DLDataType{kDLOpaqueHandle, 0, 0})),
+               matmul(lhs, rhs_b, (DLDataType{kDLOpaqueHandle, 0, 0})));
   };
 
   return {pat_matmul, rewriter};
diff --git a/src/relax/transform/fold_constant.cc b/src/relax/transform/fold_constant.cc
index d615c014709b..7c92ae49c578 100644
--- a/src/relax/transform/fold_constant.cc
+++ b/src/relax/transform/fold_constant.cc
@@ -197,7 +197,7 @@ class ConstantFolder : public ExprMutator {
   // Returns std::nullopt on failure.
   ffi::Optional<Expr> ConstEvaluateCallTIR(tirx::PrimFunc tir_func,
                                            ffi::Array<runtime::Tensor> arr_args, ffi::Shape shape,
-                                           DataType ret_type) {
+                                           DLDataType ret_type) {
     // obtain function from the cache.
     ffi::Optional<ffi::Function> func = GetCachedBuild(tir_func);
     if (!func) return std::nullopt;
@@ -243,7 +243,8 @@ class ConstantFolder : public ExprMutator {
       if (!shape) return std::nullopt;
       auto tensor_ty = tuple_ty->fields[i].as_or_throw<TensorType>();
       if (tensor_ty->IsUnknownDtype()) return std::nullopt;
-      ret_tensors.push_back(runtime::Tensor::Empty(shape.value(), tensor_ty->dtype, cpu_dev));
+      ret_tensors.push_back(
+          runtime::Tensor::Empty(shape.value(), tensor_ty->dtype->dtype, cpu_dev));
     }
 
     // Pack input args + all output tensors.
@@ -288,7 +289,8 @@ class ConstantFolder : public ExprMutator {
     ffi::Optional<ffi::Shape> shape = MatchConstShape(call->ty_args[0]);
     if (shape) {
       TensorType ret_ty = call->ty.as_or_throw<TensorType>();
-      return ConstEvaluateCallTIR(func.value(), arr_args.value(), shape.value(), ret_ty->dtype)
+      return ConstEvaluateCallTIR(func.value(), arr_args.value(), shape.value(),
+                                  ret_ty->dtype->dtype)
           .value_or({});
     }
     return {};
@@ -391,7 +393,7 @@ class ConstantFolder : public ExprMutator {
         for (size_t i = 0; i < values.size(); i++) {
           PrimExpr val = values[i];
           arr.push_back(val.as<IntImmNode>()->value);
-          is_known &= (val.dtype() == DataType::Int(64));
+          is_known &= val.ty().MatchesElementType(DLDataTypeCode::kDLInt, 64);
         }
         if (is_known) {
           const auto func = tvm::ffi::Function::GetGlobalRequired("relax.run.shape_to_tensor");
diff --git a/src/relax/transform/fuse_tir.cc b/src/relax/transform/fuse_tir.cc
index d5e656d15256..00c1029a98d1 100644
--- a/src/relax/transform/fuse_tir.cc
+++ b/src/relax/transform/fuse_tir.cc
@@ -60,10 +60,10 @@ class SymbolicMatcher : ExprFunctor<void(const PrimExpr& n, const PrimExpr& othe
   void VisitExpr(const PrimExpr& node, const PrimExpr& other) {
     if (node.same_as(other)) {
       return;
-    } else if (node.dtype().code() != other.dtype().code()) {
+    } else if (node.ty().code() != other.ty().code()) {
       TVM_FFI_THROW(InternalError)
-          << "Parameter expression " << node << " with dtype " << node.dtype()
-          << " cannot match to argument " << other << " with dtype " << other.dtype();
+          << "Parameter expression " << node << " with dtype " << node.ty()->dtype
+          << " cannot match to argument " << other << " with dtype " << other.ty()->dtype;
     } else {
       ExprFunctor::VisitExpr(node, other);
     }
@@ -120,9 +120,10 @@ class SymbolicMatcher : ExprFunctor<void(const PrimExpr& n, const PrimExpr& othe
   void VisitExpr_(const CastNode* op, const PrimExpr& other) {
     const auto* rhs = other.as<CastNode>();
     if (!rhs) {
-      TVM_FFI_THROW(InternalError) << "Parameter expression " << ffi::GetRef<PrimExpr>(op)
-                                   << " expected an cast to " << op->dtype << " as the argument, "
-                                   << "but was provided with the argument " << other;
+      TVM_FFI_THROW(InternalError)
+          << "Parameter expression " << ffi::GetRef<PrimExpr>(op) << " expected a cast to "
+          << op->ty()->dtype << " as the argument, "
+          << "but was provided with the argument " << other;
     }
     VisitExpr(op->value, rhs->value);
   }
@@ -132,10 +133,11 @@ class SymbolicMatcher : ExprFunctor<void(const PrimExpr& n, const PrimExpr& othe
 
     if (lhs.same_as(rhs)) {
       // Reference identity, no further checks needed.
-    } else if (op->dtype.code() != rhs->dtype.code()) {
+    } else if (op->ty().code() != rhs.ty().code()) {
       TVM_FFI_THROW(InternalError)
-          << "Parameter expression " << ffi::GetRef<PrimExpr>(op) << " with dtype " << op->dtype
-          << " cannot match to argument " << rhs << " with dtype " << rhs.dtype();
+          << "Parameter expression " << ffi::GetRef<PrimExpr>(op) << " with dtype "
+          << op->ty()->dtype << " cannot match to argument " << rhs << " with dtype "
+          << rhs.ty()->dtype;
     } else if (auto it = var_remap_->find(lhs); it != var_remap_->end()) {
       VisitExpr((*it).second, rhs);
     } else {
@@ -592,7 +594,7 @@ class FusedTIRConstructor : public ExprVisitor {
         // printed, it's more readable when done explicitly.  Since
         // Buffer is used more than param it gets the name with better
         // readability.
-        tirx::Var param = tirx::Var("p_" + buffer->name, tvm::PrimType(DataType::Handle()));
+        tirx::Var param = tirx::Var("p_" + buffer->name, tvm::PrimType::Handle());
         func_info_.params.push_back(param);
         func_info_.buffer_map.Set(param, buffer);
       }
@@ -636,8 +638,7 @@ class FusedTIRConstructor : public ExprVisitor {
         continue;
       }
 
-      tirx::Var param =
-          tirx::Var("p_output" + std::to_string(out_idx), tvm::PrimType(DataType::Handle()));
+      tirx::Var param = tirx::Var("p_output" + std::to_string(out_idx), tvm::PrimType::Handle());
       out_idx++;
       func_info_.buffer_map.Set(param, buffers[i]);
       func_info_.params.push_back(param);
@@ -855,9 +856,10 @@ class FusedTIRConstructor : public ExprVisitor {
     for (int64_t idx : output_indices) {
       int i = static_cast<int>(idx);
       const tirx::Var& param = func->params[static_cast<size_t>(i)];
-      if (param->dtype.is_int() || param->dtype.is_uint()) {
+      tvm::PrimType param_ty = param.ty();
+      if (param_ty.code() == DLDataTypeCode::kDLInt || param_ty.code() == DLDataTypeCode::kDLUInt) {
         if (symbolic_var_index == -1) symbolic_var_index = i;
-      } else if (param->dtype.is_handle()) {
+      } else if (param_ty.IsHandle()) {
         TVM_FFI_ICHECK(symbolic_var_index == -1)
             << "The scalar input should be at the ending of the "
                "parameter list.";
@@ -865,7 +867,7 @@ class FusedTIRConstructor : public ExprVisitor {
       } else {
         TVM_FFI_THROW(InternalError)
             << "The params of PrimFunc are expected to be Buffer handle or scalar, but got: "
-            << param->dtype;
+            << param_ty->dtype;
       }
     }
 
@@ -967,7 +969,7 @@ class FusedTIRConstructor : public ExprVisitor {
       // Case 1. The relax param is a Tensor, we directly create a tirx var and buffer
       const auto* shape_expr = tensor->shape.as<ShapeExprNode>();
       TVM_FFI_ICHECK(shape_expr) << "FuseTIR expects all Tensor parameters have a known shape.";
-      DataType dtype = tensor->dtype;
+      DLDataType dtype = tensor->dtype->dtype;
       tirx::Buffer buffer;
       if (tir_buffer_param.defined()) {
         buffer = tirx::decl_buffer(shape_expr->values, dtype, name_hint,
@@ -980,7 +982,7 @@ class FusedTIRConstructor : public ExprVisitor {
 
     } else if (const auto* prim_value = ty.as<PrimTypeNode>()) {
       // Case 2. The relax param is a scalar, we directly create a tirx var
-      out->push_back(tirx::Var(name_hint, prim_value->dtype));
+      out->push_back(tirx::Var(name_hint, tvm::PrimType(prim_value->dtype)));
 
     } else if (const auto* shape_expr = ty.as<ShapeTypeNode>()) {
       // Case 3. The relax param is a tuple of scalars, each represented as a tirx var
@@ -1257,7 +1259,7 @@ class TIRFuseMutator : public ExprMutator {
         if (const auto* literal = arg.as<PrimValueNode>()) {
           tir_vars.push_back(literal->value);
         } else if (const auto* var = arg.as<VarNode>()) {
-          tir_vars.push_back(tirx::Var(var->name_hint(), prim_value->dtype));
+          tir_vars.push_back(tirx::Var(var->name_hint(), tvm::PrimType(prim_value->dtype)));
         } else {
           TVM_FFI_THROW(TypeError) << "FuseTIR expects scalar arguments to be PrimValue or Var, "
                                    << "but received " << arg;
diff --git a/src/relax/transform/gradient.cc b/src/relax/transform/gradient.cc
index df22650e036d..e23524388435 100644
--- a/src/relax/transform/gradient.cc
+++ b/src/relax/transform/gradient.cc
@@ -304,7 +304,7 @@ class BackwardBindingGenerator : private ExprVisitor {
 
     // Initialize the adjoint of target_var as ones op. We have already checked the target.
     auto* target_ty = GetTypeAs<TensorTypeNode>(target_var);
-    generator.UpdateAdjoint(target_var, ones(target_ty->shape.value(), target_ty->dtype));
+    generator.UpdateAdjoint(target_var, ones(target_ty->shape.value(), target_ty->dtype->dtype));
 
     // Do reverse-mode ad, so visit bindings backwards
     for (auto it = forward_block->bindings.rbegin(); it != forward_block->bindings.rend(); ++it) {
@@ -546,7 +546,7 @@ class BackwardBindingGenerator : private ExprVisitor {
       auto* tensor_ty = ty.as<TensorTypeNode>();
       TVM_FFI_ICHECK(tensor_ty) << "The leaf of adjoint should be a Tensor.";
       TVM_FFI_ICHECK(tensor_ty->shape.defined()) << "Missing shape when building zeros tuple.";
-      const Expr& init = zeros(tensor_ty->shape.value(), tensor_ty->dtype);
+      const Expr& init = zeros(tensor_ty->shape.value(), tensor_ty->dtype->dtype);
       return init;
     });
     return AdjointMsgToExpr(msg);
@@ -707,7 +707,8 @@ class GradientMutator : private ExprMutator {
 
   static bool IsFloatTensorType(const Type& ty) {
     auto* tensor_ty = ty.as<TensorTypeNode>();
-    return tensor_ty && tensor_ty->dtype.is_float();
+    // Eligibility depends only on the dtype code being float; bit width and lanes are ignored.
+    return tensor_ty && tensor_ty->dtype.code() == DLDataTypeCode::kDLFloat;
   }
 
   // When the return value is a Var, it is the target;
diff --git a/src/relax/transform/infer_amp_utils.cc b/src/relax/transform/infer_amp_utils.cc
index 41c6cfe5ae42..4952aeea8fa2 100644
--- a/src/relax/transform/infer_amp_utils.cc
+++ b/src/relax/transform/infer_amp_utils.cc
@@ -22,19 +22,19 @@
 namespace tvm {
 namespace relax {
 
-NType NTypeFrom(const Type& ty, DataType dtype) {
+NType NTypeFrom(const Type& ty, DLDataType dtype) {
   auto fmapleaf = [&](const Type& ty) -> NType {
     const auto* tensor = ty.as<TensorTypeNode>();
     TVM_FFI_ICHECK(tensor) << "Expected TensorType, but got " << ty;
-    if (dtype == DataType::Void())
-      return NType(DLDataTypeToString(tensor->dtype));
+    if (dtype == DLDataType{kDLOpaqueHandle, 0, 0})
+      return NType(DLDataTypeToString(tensor->dtype->dtype));
     else
       return NType(DLDataTypeToString(dtype));
   };
   return MapToNestedMsg<ffi::String>(ty, fmapleaf);
 }
 
-NType NTypeFrom(const Expr& expr, DataType dtype) { return NTypeFrom(GetType(expr), dtype); }
+NType NTypeFrom(const Expr& expr, DLDataType dtype) { return NTypeFrom(GetType(expr), dtype); }
 
 NType NTypeMerge(const NType& a, const NType& b) {
   auto fcombine = [&](const ffi::String& a_str, const ffi::String& b_str) -> ffi::String {
@@ -44,20 +44,20 @@ NType NTypeMerge(const NType& a, const NType& b) {
       return a_str;
     }
 
-    DataType a = DataType(ffi::StringToDLDataType(a_str));
-    DataType b = DataType(ffi::StringToDLDataType(b_str));
-    TVM_FFI_ICHECK_EQ(a.code(), b.code());
-    TVM_FFI_ICHECK_EQ(a.lanes(), b.lanes());
-    return a.bits() > b.bits() ? a_str : b_str;
+    DLDataType a = ffi::StringToDLDataType(a_str);
+    DLDataType b = ffi::StringToDLDataType(b_str);
+    TVM_FFI_ICHECK_EQ(static_cast<int>(a.code), static_cast<int>(b.code));
+    TVM_FFI_ICHECK_EQ(a.lanes, b.lanes);
+    return a.bits > b.bits ? a_str : b_str;
   };
   return CombineNestedMsg<ffi::String>(a, b, fcombine);
 }
 
-ffi::Array<ffi::ObjectRef> InferMixedPrecisionFollow(const Call& call, const DataType& out_dtype) {
+ffi::Array<ffi::ObjectRef> InferMixedPrecisionFollow(const Call& call, DLDataType out_dtype) {
   return {IntImm::Int32(MixedPrecisionPolicyKind::kFollow), call};
 }
 
-ffi::Array<ffi::ObjectRef> InferMixedPrecisionNever(const Call& call, const DataType& out_dtype) {
+ffi::Array<ffi::ObjectRef> InferMixedPrecisionNever(const Call& call, DLDataType out_dtype) {
   return {IntImm::Int32(MixedPrecisionPolicyKind::kNever), call};
 }
 
diff --git a/src/relax/transform/infer_amp_utils.h b/src/relax/transform/infer_amp_utils.h
index faa33edd4a18..7f9f884a29d0 100644
--- a/src/relax/transform/infer_amp_utils.h
+++ b/src/relax/transform/infer_amp_utils.h
@@ -58,10 +58,10 @@ struct NTypeEqual {
 };
 
 // Construct a NType from an Type
-NType NTypeFrom(const Type& ty, DataType dtype = DataType::Void());
+NType NTypeFrom(const Type& ty, DLDataType dtype = DLDataType{kDLOpaqueHandle, 0, 0});
 
 // Construct a NType from an Expr
-NType NTypeFrom(const Expr& expr, DataType dtype = DataType::Void());
+NType NTypeFrom(const Expr& expr, DLDataType dtype = DLDataType{kDLOpaqueHandle, 0, 0});
 
 // Merge two messages, we keep the higher precision type for each leaf tensor
 NType NTypeMerge(const NType& a, const NType& b);
@@ -70,12 +70,11 @@ NType NTypeMerge(const NType& a, const NType& b);
 using VarDTypeMap = std::unordered_map<Var, NType>;
 
 // Call is a call node, out_dtype is the expected output_dtype
-using FInferMixedPrecision =
-    ffi::TypedFunction<Call(const Call& call_node, const DataType& out_dtype)>;
+using FInferMixedPrecision = ffi::TypedFunction<Call(const Call& call_node, DLDataType out_dtype)>;
 
-ffi::Array<ffi::ObjectRef> InferMixedPrecisionFollow(const Call& call, const DataType& out_dtype);
+ffi::Array<ffi::ObjectRef> InferMixedPrecisionFollow(const Call& call, DLDataType out_dtype);
 
-ffi::Array<ffi::ObjectRef> InferMixedPrecisionNever(const Call& call, const DataType& out_dtype);
+ffi::Array<ffi::ObjectRef> InferMixedPrecisionNever(const Call& call, DLDataType out_dtype);
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/relax/transform/lazy_transform_params.cc b/src/relax/transform/lazy_transform_params.cc
index 7c42928d7d87..b800199610b8 100644
--- a/src/relax/transform/lazy_transform_params.cc
+++ b/src/relax/transform/lazy_transform_params.cc
@@ -65,8 +65,7 @@ class LazyInputMutator : public ExprMutator {
       param_lookup.insert({func->params[i], i - num_input_params});
     }
 
-    Var fget_param("fget_param",
-                   FuncType({PrimType(DataType::Int(64)), ObjectType()}, ObjectType()));
+    Var fget_param("fget_param", FuncType({PrimType::Int(64), ObjectType()}, ObjectType()));
 
     ffi::Array<Var> new_params(func->params.begin(), func->params.begin() + num_input_params);
     new_params.push_back(fget_param);
@@ -145,7 +144,7 @@ class LazyOutputMutator : public ExprMutator {
       define_lookup(0, func_body->body);
     }
 
-    Var fset_output("fset_output", FuncType({PrimType(DataType::Int(64)), ObjectType()},
+    Var fset_output("fset_output", FuncType({PrimType::Int(64), ObjectType()},
                                             TupleType(ffi::Array<Type>{}), /* purity = */ false));
     plan_ = FunctionPlan{std::move(output_lookup), fset_output};
 
diff --git a/src/relax/transform/legalize_ops.cc b/src/relax/transform/legalize_ops.cc
index 00bd8e859ac3..2c518cfbbeae 100644
--- a/src/relax/transform/legalize_ops.cc
+++ b/src/relax/transform/legalize_ops.cc
@@ -282,7 +282,7 @@ class LegalizeMutator : public ExprMutator {
         //     This fallback would only be applicable for cases where
         //     both the dtype and the dimensionality are known.  While
         //     Relax can express a tensor with unknown dtype and
-        //     dimensionality as `TensorType(DataType::Void(),
+        //     dimensionality as `TensorType(DLDataType{kDLOpaqueHandle, 0, 0},
         //     kUnknownNDim)`, TIR cannot express unknown dtype or
         //     unknown dimensionality.
         return false;
diff --git a/src/relax/transform/lower_alloc_tensor.cc b/src/relax/transform/lower_alloc_tensor.cc
index 66c2c95b89c2..67cbcc7e8791 100644
--- a/src/relax/transform/lower_alloc_tensor.cc
+++ b/src/relax/transform/lower_alloc_tensor.cc
@@ -72,7 +72,8 @@ class Mutator : public ExprMutator {
       }();
 
       PrimExpr nbytes = [&]() -> PrimExpr {
-        PrimExpr nbytes = IntImm::Int64(dtype->value.bytes());
+        PrimExpr nbytes = IntImm::Int64(
+            (dtype->value.bits * static_cast<int16_t>(dtype->value.lanes) + 7) / 8);
         for (const auto& dim : shape) {
           nbytes *= dim;
         }
@@ -112,7 +113,7 @@ class Mutator : public ExprMutator {
       auto offset = PrimValue::Int64(0);
 
       Expr storage = relax::Call(mem_alloc_storage_op, {size, runtime_device_index, storage_scope,
-                                                        DataTypeImm(DataType::UInt(8))});
+                                                        DataTypeImm((DLDataType{kDLUInt, 8, 1}))});
       storage = builder_->Emit(storage, "storage");
       Expr tensor =
           relax::Call(mem_alloc_tensor_op, {storage, offset, shape_arg, dtype, op->args[2]});
diff --git a/src/relax/transform/remove_unused_outputs.cc b/src/relax/transform/remove_unused_outputs.cc
index 995fe019be04..f8a9e8cde70b 100644
--- a/src/relax/transform/remove_unused_outputs.cc
+++ b/src/relax/transform/remove_unused_outputs.cc
@@ -289,7 +289,7 @@ Pass RemoveUnusedOutputs() {
                   // into the old tuple, but it's simpler to just let
                   // CanonicalizeBindings and DCE handle it.
                   new_results.push_back(
-                      relax::PrimValue(FloatImm(DataType::Float(64), std::nan(""))));
+                      relax::PrimValue(FloatImm(tvm::PrimType::Float(64), std::nan(""))));
                 }
               }
 
diff --git a/src/relax/transform/remove_unused_parameters.cc b/src/relax/transform/remove_unused_parameters.cc
index ebe9fa000f77..4f28f9d13132 100644
--- a/src/relax/transform/remove_unused_parameters.cc
+++ b/src/relax/transform/remove_unused_parameters.cc
@@ -100,7 +100,7 @@ std::optional<CalleeAnalysis> AnalyzeCallee(Function func) {
   }
 
   for (const auto& tir_var : free_tir_vars) {
-    Var relax_var("param_" + tir_var->name_hint, PrimType(tir_var.dtype()));
+    Var relax_var("param_" + tir_var->name_hint, tir_var.ty());
     params.push_back(relax_var);
   }
 
diff --git a/src/relax/transform/reorder_take_after_matmul.cc b/src/relax/transform/reorder_take_after_matmul.cc
index bd36c5cb89c5..7fd0fb7eecaa 100644
--- a/src/relax/transform/reorder_take_after_matmul.cc
+++ b/src/relax/transform/reorder_take_after_matmul.cc
@@ -92,7 +92,7 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
       // indices.shape = [outfeatures]
 
       // out_table.shape = [*batch, table_size]
-      auto out_table = matmul(lhs, weights, DataType::Void());
+      auto out_table = matmul(lhs, weights, (DLDataType{kDLOpaqueHandle, 0, 0}));
       // new_output.shape = [*batch, outfeatures]
       auto new_output = take(out_table, indices, matmul_ty->ndim - 1);
 
@@ -116,7 +116,7 @@ std::tuple<DFPattern, ffi::TypedFunction<Expr(Expr, ffi::Map<DFPattern, Expr>)>>
       auto fused_weight = reshape(reordered_weight,
                                   ShapeExpr({weight_shape[1], weight_shape[0] * weight_shape[2]}));
       // fused_output.shape = [batch1, batch2, table_size * outfeatures]
-      auto fused_output = matmul(lhs, fused_weight, DataType::Void());
+      auto fused_output = matmul(lhs, fused_weight, (DLDataType{kDLOpaqueHandle, 0, 0}));
       // indexed_output.shape = [batch1, batch2, table_size, outfeatures]
       auto indexed_output = reshape(
           fused_output, ShapeExpr({lhs_shape[0], lhs_shape[1], weight_shape[0], weight_shape[2]}));
diff --git a/src/relax/transform/split_call_tir_by_pattern.cc b/src/relax/transform/split_call_tir_by_pattern.cc
index 4d15c0fd88f5..19e0dfdf8f00 100644
--- a/src/relax/transform/split_call_tir_by_pattern.cc
+++ b/src/relax/transform/split_call_tir_by_pattern.cc
@@ -129,7 +129,7 @@ class ForMatcher : public TensorizeComparator {
         if (match) {
           evaluated_symbols.back().insert(symbol_map.begin(), symbol_map.end());
           evaluated_symbols.back()[ffi::GetRef<Var>(operand_a)] =
-              MakeConstScalar(rhs_ptr->b.dtype(), 1);
+              MakeConstScalar(rhs_ptr->b.ty(), 1);
           return true;
         }
       }
@@ -142,7 +142,7 @@ class ForMatcher : public TensorizeComparator {
         if (match) {
           evaluated_symbols.back().insert(symbol_map.begin(), symbol_map.end());
           evaluated_symbols.back()[ffi::GetRef<Var>(operand_b)] =
-              MakeConstScalar(rhs_ptr->a.dtype(), 1);
+              MakeConstScalar(rhs_ptr->a.ty(), 1);
           return true;
         }
       }
@@ -160,7 +160,7 @@ class ForMatcher : public TensorizeComparator {
         if (match) {
           evaluated_symbols.back().insert(symbol_map.begin(), symbol_map.end());
           evaluated_symbols.back()[ffi::GetRef<Var>(operand_a)] =
-              MakeConstScalar(rhs_ptr->b.dtype(), 0);
+              MakeConstScalar(rhs_ptr->b.ty(), 0);
           return true;
         }
       }
@@ -173,7 +173,7 @@ class ForMatcher : public TensorizeComparator {
         if (match) {
           evaluated_symbols.back().insert(symbol_map.begin(), symbol_map.end());
           evaluated_symbols.back()[ffi::GetRef<Var>(operand_b)] =
-              MakeConstScalar(rhs_ptr->a.dtype(), 0);
+              MakeConstScalar(rhs_ptr->a.ty(), 0);
           return true;
         }
       }
@@ -622,7 +622,7 @@ std::pair<PrimFunc, ffi::Optional<PrimFunc>> SplitFunctions(
     }
   }
   arg_partition->push_back(arg_partition1);
-  new_params1.push_back(Var("output", DataType::Handle()));
+  new_params1.push_back(Var("output", PrimType::Handle()));
   ffi::Map<Var, Buffer> new_buffer_map1;
   for (const auto& kv : func->buffer_map) {
     if (partitioner.input1.count(kv.second)) {
@@ -635,7 +635,7 @@ std::pair<PrimFunc, ffi::Optional<PrimFunc>> SplitFunctions(
   // Step 4. Craft the second function.
   ffi::Array<Var> new_params2;
   std::vector<int> arg_partition2;
-  new_params2.push_back(Var("input", DataType::Handle()));
+  new_params2.push_back(Var("input", PrimType::Handle()));
   for (int i = 0; i < static_cast<int>(func->params.size()); i++) {
     Var param = func->params[i];
     if (partitioner.input2.count(func->buffer_map[param])) {
@@ -752,7 +752,7 @@ class SplitMutator : public ExprMutator {
     TVM_FFI_ICHECK(lib_func->IsInstance<ExternFuncNode>());
     builder_->UpdateFunction(gv, lib_func);
     tirx::Buffer intermediate_buffer = func1->buffer_map.at(func1->params.back());
-    DataType dtype = intermediate_buffer->dtype;
+    PrimType dtype = intermediate_buffer->dtype;
     Call call1(call_dps_packed_, {lib_func, Tuple(args1)}, call->attrs,
                {TensorType(ShapeExpr(intermediate_buffer->shape), dtype)});
     Var call_var1 = builder_->Emit(call1);
diff --git a/src/relax/transform/split_layout_rewrite_preproc.cc b/src/relax/transform/split_layout_rewrite_preproc.cc
index 0560582fac59..e09e377e8a70 100644
--- a/src/relax/transform/split_layout_rewrite_preproc.cc
+++ b/src/relax/transform/split_layout_rewrite_preproc.cc
@@ -65,11 +65,11 @@ class SplitPrimFuncLayoutRewrite : public StmtMutator {
     ffi::Map<Var, Buffer> buffer_map;
 
     for (const auto& info : rewrite_infos_) {
-      params.push_back(Var(info.pre_rewrite_buffer->name, DataType::Handle()));
+      params.push_back(Var(info.pre_rewrite_buffer->name, PrimType::Handle()));
       buffer_map.Set(params.back(), info.pre_rewrite_buffer);
     }
     for (const auto& info : rewrite_infos_) {
-      params.push_back(Var(info.post_rewrite_buffer->name, DataType::Handle()));
+      params.push_back(Var(info.post_rewrite_buffer->name, PrimType::Handle()));
       buffer_map.Set(params.back(), info.post_rewrite_buffer);
     }
 
diff --git a/src/relax/transform/static_plan_block_memory.cc b/src/relax/transform/static_plan_block_memory.cc
index 651b70961090..3d4fcb256d0e 100644
--- a/src/relax/transform/static_plan_block_memory.cc
+++ b/src/relax/transform/static_plan_block_memory.cc
@@ -106,7 +106,7 @@ class StorageTokenNode : public ffi::Object {
   /*! \brief Number of bytes that this token requires. */
   PrimExpr bytes;
   /*! \brief The dtype of this token. */
-  DataType dtype;
+  DLDataType dtype;
   /*! \brief The memory scope of the token. */
   std::string storage_scope;
   /*! \brief The VDevice information. */
@@ -135,10 +135,10 @@ class StorageTokenNode : public ffi::Object {
  */
 class StorageToken : public ffi::ObjectRef {
  public:
-  explicit StorageToken(ffi::Array<PrimExpr> shape, DataType dtype, std::string storage_scope,
+  explicit StorageToken(ffi::Array<PrimExpr> shape, DLDataType dtype, std::string storage_scope,
                         ffi::Optional<VDevice> vdevice = std::nullopt) {
     // Compute the tensor size from the shape.
-    int64_t const_coeff = dtype.bytes() * dtype.lanes();
+    int64_t const_coeff = (dtype.bits * static_cast<int16_t>(dtype.lanes) + 7) / 8;
     PrimExpr size = IntImm::Int64(1);
     bool size_computed = false;
 
@@ -303,13 +303,16 @@ class TokenAllocatorMixed {
   }
 
  private:
-  /*! \brief The hash class to enable std::pair as map key class. */
-  struct PairHash {
-    template <class T1, class T2>
-    std::size_t operator()(const std::pair<T1, T2>& p) const {
-      auto h1 = std::hash<T1>{}(p.first);
-      auto h2 = std::hash<T2>{}(p.second);
-      return h1 ^ h2;
+  using PoolKey = std::pair<std::string, DLDataType>;
+
+  /*! \brief Hash functor that lets a (storage scope, raw dtype) pair act as a map key. */
+  struct PoolKeyHash {
+    std::size_t operator()(const PoolKey& p) const {
+      std::size_t h = std::hash<std::string>{}(p.first);
+      h ^= static_cast<std::size_t>(p.second.code) + 0x9e3779b9 + (h << 6) + (h >> 2);
+      h ^= static_cast<std::size_t>(p.second.bits) + 0x9e3779b9 + (h << 6) + (h >> 2);
+      h ^= static_cast<std::size_t>(p.second.lanes) + 0x9e3779b9 + (h << 6) + (h >> 2);
+      return h;
     }
   };
 
@@ -318,9 +321,7 @@ class TokenAllocatorMixed {
   /*! \brief A constant scale representing the token search range. */
   const int match_range_{16};
   /*! \brief The pool of available storage tokens for each storage scope and dtype. */
-  std::unordered_map<std::pair<std::string, DataType>, std::multimap<int64_t, StorageToken>,
-                     PairHash>
-      available_pool_;
+  std::unordered_map<PoolKey, std::multimap<int64_t, StorageToken>, PoolKeyHash> available_pool_;
   /*! \brief All the storage tokens that have been allocated with actual storage. */
   std::vector<StorageToken> full_pool_;
 };
@@ -636,7 +637,7 @@ class StorageAllocatorInit : public StorageAllocatorBaseVisitor {
     const auto* shape = ty->shape.as<ShapeExprNode>();
     TVM_FFI_ICHECK_NOTNULL(shape);
     TVM_FFI_ICHECK(!ty->IsUnknownDtype());
-    TVM_FFI_ICHECK(ty->dtype == call->args[1].as_or_throw<DataTypeImm>()->value);
+    TVM_FFI_ICHECK(ty->dtype->dtype == call->args[1].as_or_throw<DataTypeImm>()->value);
     TVM_FFI_ICHECK(!token_map_.count(call));
 
     // Use the upper bounds of TIR vars as their values. The upper bound shape can still be dynamic
@@ -653,7 +654,7 @@ class StorageAllocatorInit : public StorageAllocatorBaseVisitor {
     }
     ffi::Optional<VDevice> vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index);
 
-    StorageToken token(upper_bounded_shape, ty->dtype, storage_scope->value, vdevice);
+    StorageToken token(upper_bounded_shape, ty->dtype->dtype, storage_scope->value, vdevice);
 
     Tokens tokens(token);
     SetTokens(call, tokens);
@@ -938,7 +939,7 @@ class StorageAllocationRewriter : public ExprMutator {
       if (it_token == token2storage_var_.end()) {
         ShapeExpr size({token->bytes});
         PrimValue virtual_device_index = runtime_device_index;
-        DataType dtype = token->dtype;
+        DLDataType dtype = token->dtype;
         Call alloc_storage(mem_alloc_storage,
                            {std::move(size), virtual_device_index, StringImm(token->storage_scope),
                             DataTypeImm(dtype)},
@@ -951,7 +952,7 @@ class StorageAllocationRewriter : public ExprMutator {
 
       // And always create a `memory.alloc_tensor` for the old `builtin.alloc_tensor`.
       PrimValue offset = PrimValue::Int64(0);
-      DataType dtype = ty->dtype;
+      DLDataType dtype = ty->dtype->dtype;
       return Call(mem_alloc_tensor,
                   {storage_var, offset, ty->shape.value(), DataTypeImm(dtype), call->args[2]},
                   Attrs());
@@ -970,22 +971,23 @@ class StorageAllocationRewriter : public ExprMutator {
           GetUpperBoundShape(shape->values, ana_.get(), dom_map_);
       if (!IsStaticShape(shape->values)) {
         TVM_FFI_ICHECK(!ty->IsUnknownDtype());
-        TVM_FFI_ICHECK_EQ(ty->dtype, call->args[1].as_or_throw<DataTypeImm>()->value);
+        TVM_FFI_ICHECK_EQ(ty->dtype->dtype, call->args[1].as_or_throw<DataTypeImm>()->value);
         PrimExpr bytes = upper_bounded_shape[0];
         for (int i = 1; i < static_cast<int>(upper_bounded_shape.size()); ++i) {
           bytes *= upper_bounded_shape[i];
         }
-        bytes *= ty->dtype.bytes() * ty->dtype.lanes();
+        DLDataType dtype = ty->dtype->dtype;
+        bytes *= (dtype.bits * static_cast<int16_t>(dtype.lanes) + 7) / 8;
         Call alloc_storage(mem_alloc_storage,
                            {/*size=*/ShapeExpr({bytes}),
                             /*virtual_device_index=*/call->args[2].as_or_throw<PrimValue>(),
                             /*storage_scope=*/call->args[3].as_or_throw<StringImm>(),  //
-                            /*dtype=*/DataTypeImm(ty->dtype)});
+                            /*dtype=*/DataTypeImm(dtype)});
         Var storage = builder_->Emit(alloc_storage, "storage");
         return Call(mem_alloc_tensor, {storage,  //
                                        /*offset=*/PrimValue::Int64(0),
                                        /*shape=*/ffi::GetRef<ShapeExpr>(shape),  //
-                                       /*dtype=*/DataTypeImm(ty->dtype),
+                                       /*dtype=*/DataTypeImm(dtype),
                                        /*vdevice_index=*/call->args[2]});
       }
     }
@@ -1040,7 +1042,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   refl::GlobalDef().def("relax.transform.StaticPlanBlockMemory", StaticPlanBlockMemory);
 }
 
-PrimExpr GetTextureMemorySizeFromVDevice(ffi::Array<PrimExpr> pshape, DataType dtype,
+PrimExpr GetTextureMemorySizeFromVDevice(ffi::Array<PrimExpr> pshape, DLDataType dtype,
                                          VDevice vdevice) {
   int image_row_align = static_cast<int>(
       vdevice->target->GetAttr<int64_t>("image_base_address_alignment").value_or(64));
@@ -1056,7 +1058,9 @@ PrimExpr GetTextureMemorySizeFromVDevice(ffi::Array<PrimExpr> pshape, DataType d
   };
   auto shape = Shape{pshape};
 
-  size_t size = runtime::GetTextureMemorySize<Shape>(shape, dtype.bytes() * 8, dtype.lanes(),
+  int lanes = static_cast<int16_t>(dtype.lanes);
+  TVM_FFI_ICHECK_GE(lanes, 0) << "Can't fetch the bytes of a scalable vector at a compile time.";
+  size_t size = runtime::GetTextureMemorySize<Shape>(shape, dtype.bits, lanes,
                                                      vdevice->memory_scope, image_row_align);
   return IntImm::Int64(size);
 }
diff --git a/src/relax/transform/to_mixed_precision.cc b/src/relax/transform/to_mixed_precision.cc
index ddd23ce2ea7b..45d2af9b8579 100644
--- a/src/relax/transform/to_mixed_precision.cc
+++ b/src/relax/transform/to_mixed_precision.cc
@@ -116,9 +116,9 @@ int GetMixedPrecisionInfo(const CallNode* call_node) {
  */
 class DTypeDecisionCollector : public ExprVisitor {
  public:
-  explicit DTypeDecisionCollector(DataType output_dtype) : output_dtype_(output_dtype) {}
+  explicit DTypeDecisionCollector(DLDataType output_dtype) : output_dtype_(output_dtype) {}
 
-  static VarDTypeMap Collect(Function func, DataType output_dtype) {
+  static VarDTypeMap Collect(Function func, DLDataType output_dtype) {
     DTypeDecisionCollector collector(output_dtype);
     collector.VisitExpr(func);
     return std::move(collector.only_fp16_map_);
@@ -165,7 +165,7 @@ class DTypeDecisionCollector : public ExprVisitor {
   }
 
   // merge the message for all vars in the expr list
-  void RequireArgsToType(ffi::Array<Expr> args, DataType to) {
+  void RequireArgsToType(ffi::Array<Expr> args, DLDataType to) {
     std::vector<Expr> arg_arr;
     std::vector<NType> to_arr;
     for (const Expr& arg : args) {
@@ -262,16 +262,16 @@ class DTypeDecisionCollector : public ExprVisitor {
     }
   }
 
-  DataType unknown_ = DataType(DataType::TypeCode::kFloat, 0, 1);
-  DataType fp16_ = DataType(DataType::TypeCode::kFloat, 16, 1);
-  DataType fp32_ = DataType(DataType::TypeCode::kFloat, 32, 1);
-  DataType output_dtype_;
+  DLDataType unknown_ = DLDataType{kDLFloat, 0, 1};
+  DLDataType fp16_ = DLDataType{kDLFloat, 16, 1};
+  DLDataType fp32_ = DLDataType{kDLFloat, 32, 1};
+  DLDataType output_dtype_;
   VarDTypeMap only_fp16_map_;
 };
 
 class ToMixedPrecisionRewriter : public ExprMutator {
  public:
-  explicit ToMixedPrecisionRewriter(const VarDTypeMap* only_fp16_map, DataType output_dtype,
+  explicit ToMixedPrecisionRewriter(const VarDTypeMap* only_fp16_map, DLDataType output_dtype,
                                     const std::unordered_set<std::string>& fp16_input_names)
       : only_fp16_map_(only_fp16_map),
         output_dtype_(output_dtype),
@@ -290,7 +290,7 @@ class ToMixedPrecisionRewriter : public ExprMutator {
           if (tensor_ty->vdevice.defined()) {
             vdev = tensor_ty->vdevice.value();
           }
-          TensorType fp16_ty(tensor_ty->shape.value(), DataType::Float(16), vdev, tensor_ty->span);
+          TensorType fp16_ty(tensor_ty->shape.value(), PrimType::Float(16), vdev, tensor_ty->span);
           Var fp16_var(var->vid, fp16_ty, var->span);
           var_remap_[var->vid] = fp16_var;
           return fp16_var;
@@ -315,13 +315,14 @@ class ToMixedPrecisionRewriter : public ExprMutator {
       if (NTypeEqual()(to[0], NTypeFrom(expr))) return expr;
       // We only rewrite the expr if the dtype is fp16 or fp32, dtypes such as int32, float64 is not
       // supported to be rewritten
-      if (tensor->dtype != fp16_ && tensor->dtype != fp32_) return expr;
-      return astype(expr, DataType(ffi::StringToDLDataType(to[0].LeafValue())));
+      DLDataType tensor_dtype = tensor->dtype->dtype;
+      if (tensor_dtype != fp16_ && tensor_dtype != fp32_) return expr;
+      return astype(expr, ffi::StringToDLDataType(to[0].LeafValue()));
     };
     return TransformTupleLeaf<ffi::String>(expr, std::array<NType, 1>({to}), fvisitleaf);
   }
 
-  ffi::Array<Expr> RewriteArgs(const ffi::Array<Expr>& args, DataType to) {
+  ffi::Array<Expr> RewriteArgs(const ffi::Array<Expr>& args, DLDataType to) {
     ffi::Array<Expr> new_args;
     for (const Expr& arg : args) {
       if (IsNestedTensor(arg)) {
@@ -346,7 +347,7 @@ class ToMixedPrecisionRewriter : public ExprMutator {
   bool AllFP16Castable(const ffi::Array<Expr>& args) {
     auto is_fp16 = [](Type ty) {
       if (auto tensor_ty = ty.as<TensorTypeNode>();
-          tensor_ty && tensor_ty->dtype == DataType::Float(16)) {
+          tensor_ty && tensor_ty->dtype == PrimType::Float(16)) {
         return true;
       }
       return false;
@@ -359,7 +360,7 @@ class ToMixedPrecisionRewriter : public ExprMutator {
         return false;
       }
 
-      if (data.DataType() == DataType::Float(16)) {
+      if (data.DataType() == DLDataType{kDLFloat, 16, 1}) {
         return true;
       }
 
@@ -372,17 +373,17 @@ class ToMixedPrecisionRewriter : public ExprMutator {
       std::vector<uint8_t> bytes(size_1d * elem_bytes);
       data.CopyToBytes(bytes.data(), bytes.size());
 
-      if (data.DataType() == DataType::Float(32)) {
+      if (data.DataType() == DLDataType{kDLFloat, 32, 1}) {
         return CheckInFP16Range<float>(bytes, size_1d);
-      } else if (data.DataType() == DataType::Float(64)) {
+      } else if (data.DataType() == DLDataType{kDLFloat, 64, 1}) {
         return CheckInFP16Range<double>(bytes, size_1d);
-      } else if (data.DataType() == DataType::Int(8)) {
+      } else if (data.DataType() == DLDataType{kDLInt, 8, 1}) {
         return CheckInFP16Range<std::int8_t>(bytes, size_1d);
-      } else if (data.DataType() == DataType::Int(16)) {
+      } else if (data.DataType() == DLDataType{kDLInt, 16, 1}) {
         return CheckInFP16Range<std::int16_t>(bytes, size_1d);
-      } else if (data.DataType() == DataType::Int(32)) {
+      } else if (data.DataType() == DLDataType{kDLInt, 32, 1}) {
         return CheckInFP16Range<std::int32_t>(bytes, size_1d);
-      } else if (data.DataType() == DataType::Int(64)) {
+      } else if (data.DataType() == DLDataType{kDLInt, 64, 1}) {
         return CheckInFP16Range<std::int64_t>(bytes, size_1d);
       }
       return false;
@@ -476,7 +477,7 @@ class ToMixedPrecisionRewriter : public ExprMutator {
     new_call.CopyOnWrite()->args = RemapArgs(new_call->args);
 
     // Then we rewrite the args according to the policy
-    std::optional<DataType> opt_new_dtype = std::nullopt;
+    std::optional<DLDataType> opt_new_dtype = std::nullopt;
 
     if (policy == kAlways) {
       opt_new_dtype = fp16_;
@@ -589,16 +590,16 @@ class ToMixedPrecisionRewriter : public ExprMutator {
 
   const VarDTypeMap* only_fp16_map_;
 
-  DataType fp16_ = DataType(DataType::TypeCode::kFloat, 16, 1);
-  DataType fp32_ = DataType(DataType::TypeCode::kFloat, 32, 1);
-  DataType output_dtype_;
+  DLDataType fp16_ = DLDataType{kDLFloat, 16, 1};
+  DLDataType fp32_ = DLDataType{kDLFloat, 32, 1};
+  DLDataType output_dtype_ = DLDataType{kDLOpaqueHandle, 0, 0};  // void; POD has no default init
   ffi::Array<Var> params_;
   std::unordered_set<std::string> fp16_input_names_;
 
   const Op& wrap_param_op = Op::Get("relax.wrap_param");
 };
 
-Expr ToMixedPrecision(const Function& f, const DataType& out_dtype,
+Expr ToMixedPrecision(const Function& f, DLDataType out_dtype,
                       ffi::Optional<ffi::Array<ffi::String>> fp16_input_names) {
   VarDTypeMap only_fp16_map = DTypeDecisionCollector::Collect(f, out_dtype);
   std::unordered_set<std::string> fp16_input_names_set;
@@ -611,7 +612,7 @@ Expr ToMixedPrecision(const Function& f, const DataType& out_dtype,
 
 namespace transform {
 
-Pass ToMixedPrecision(const DataType& out_dtype,
+Pass ToMixedPrecision(DLDataType out_dtype,
                       ffi::Optional<ffi::Array<ffi::String>> fp16_input_names) {
   auto pass_func = [=](Function f, IRModule m, PassContext pc) {
     return ToMixedPrecision(f, out_dtype, fp16_input_names).as_or_throw<Function>();
diff --git a/src/relax/transform/utils.h b/src/relax/transform/utils.h
index 275c7ca94f8d..0dd6aa6e54a6 100644
--- a/src/relax/transform/utils.h
+++ b/src/relax/transform/utils.h
@@ -319,39 +319,39 @@ class FunctionCopier : public SymbolicVarRenewMutator {
  * \return A Constant.
  */
 template <typename T>
-inline Constant MakeConstantScalar(T value, DataType dtype) {
+inline Constant MakeConstantScalar(T value, DLDataType dtype) {
   runtime::Tensor arr = runtime::Tensor::Empty({}, dtype, {kDLCPU, 0});
-  if (dtype == DataType::Float(32)) {
+  if (dtype == DLDataType{kDLFloat, 32, 1}) {
     *static_cast<float*>(arr->data) = static_cast<float>(value);
-  } else if (dtype == DataType::Float(64)) {
+  } else if (dtype == DLDataType{kDLFloat, 64, 1}) {
     *static_cast<double*>(arr->data) = static_cast<double>(value);
-  } else if (dtype == DataType::Int(32)) {
+  } else if (dtype == DLDataType{kDLInt, 32, 1}) {
     *static_cast<int32_t*>(arr->data) = static_cast<int32_t>(value);
-  } else if (dtype == DataType::Int(64)) {
+  } else if (dtype == DLDataType{kDLInt, 64, 1}) {
     *static_cast<int64_t*>(arr->data) = static_cast<int64_t>(value);
-  } else if (dtype == DataType::Bool()) {
+  } else if (dtype == DLDataType{kDLBool, 1, 1}) {
     *static_cast<bool*>(arr->data) = static_cast<bool>(value);
-  } else if (dtype == DataType::UInt(8)) {
+  } else if (dtype == DLDataType{kDLUInt, 8, 1}) {
     *static_cast<uint8_t*>(arr->data) = static_cast<uint8_t>(value);
-  } else if (dtype == DataType::UInt(16)) {
+  } else if (dtype == DLDataType{kDLUInt, 16, 1}) {
     *static_cast<uint16_t*>(arr->data) = static_cast<uint16_t>(value);
-  } else if (dtype == DataType::UInt(32)) {
+  } else if (dtype == DLDataType{kDLUInt, 32, 1}) {
     *static_cast<uint32_t*>(arr->data) = static_cast<uint32_t>(value);
-  } else if (dtype == DataType::UInt(64)) {
+  } else if (dtype == DLDataType{kDLUInt, 64, 1}) {
     *static_cast<uint64_t*>(arr->data) = static_cast<uint64_t>(value);
-  } else if (dtype == DataType::Int(8)) {
+  } else if (dtype == DLDataType{kDLInt, 8, 1}) {
     *static_cast<int8_t*>(arr->data) = static_cast<int8_t>(value);
-  } else if (dtype == DataType::Int(16)) {
+  } else if (dtype == DLDataType{kDLInt, 16, 1}) {
     *static_cast<int16_t*>(arr->data) = static_cast<int16_t>(value);
-  } else if (dtype == DataType::Int(32)) {
+  } else if (dtype == DLDataType{kDLInt, 32, 1}) {
     *static_cast<int32_t*>(arr->data) = static_cast<int32_t>(value);
-  } else if (dtype == DataType::Int(64)) {
+  } else if (dtype == DLDataType{kDLInt, 64, 1}) {
     *static_cast<int64_t*>(arr->data) = static_cast<int64_t>(value);
-  } else if (dtype == DataType::Float(16)) {
+  } else if (dtype == DLDataType{kDLFloat, 16, 1}) {
     // convert to float16 storage is uint16_t
     *static_cast<uint16_t*>(arr->data) =
         __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(static_cast<float>(value));
-  } else if (dtype == DataType::BFloat(16)) {
+  } else if (dtype == DLDataType{kDLBfloat, 16, 1}) {
     // convert to bfloat16 storage is uint16_t
     *static_cast<uint16_t*>(arr->data) =
         __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 7>(static_cast<float>(value));
diff --git a/src/relax/utils.cc b/src/relax/utils.cc
index 370947e4b01f..2f5cc6d9dea8 100644
--- a/src/relax/utils.cc
+++ b/src/relax/utils.cc
@@ -179,11 +179,11 @@ tvm::ffi::Map<tirx::Var, PrimExpr> InferSymbolicVarMap(
 }
 
 bool IsBoolType(const Type& ty, bool permit_unknown_rank, bool permit_unknown_dtype) {
-  DataType dtype;
+  DLDataType dtype;
   int ndim;
 
   if (const auto* tensor = ty.as<TensorTypeNode>()) {
-    dtype = tensor->dtype;
+    dtype = tensor->dtype->dtype;
     ndim = tensor->ndim;
   } else if (const auto* prim = ty.as<PrimTypeNode>()) {
     dtype = prim->dtype;
@@ -192,7 +192,9 @@ bool IsBoolType(const Type& ty, bool permit_unknown_rank, bool permit_unknown_dt
     return false;
   }
 
-  bool correct_dtype = dtype.is_bool() || (permit_unknown_dtype && dtype.is_void());
+  // Bool-type matching preserves the old element-code-only behavior; rank is checked separately.
+  bool correct_dtype = dtype.code == DLDataTypeCode::kDLBool ||
+                       (permit_unknown_dtype && dtype == DLDataType{kDLOpaqueHandle, 0, 0});
   bool correct_rank = ndim == 0 || (permit_unknown_rank && ndim == -1);
   return correct_dtype && correct_rank;
 }
diff --git a/src/runtime/extra/contrib/cblas/cblas.cc b/src/runtime/extra/contrib/cblas/cblas.cc
index d71eaeb17672..aae0a5acce1c 100644
--- a/src/runtime/extra/contrib/cblas/cblas.cc
+++ b/src/runtime/extra/contrib/cblas/cblas.cc
@@ -21,10 +21,10 @@
  * \file Use external cblas library call.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 extern "C" {
 #include <cblas.h>
diff --git a/src/runtime/extra/contrib/cblas/dnnl_blas.cc b/src/runtime/extra/contrib/cblas/dnnl_blas.cc
index 08d72e57b7ad..c267a37aa58e 100644
--- a/src/runtime/extra/contrib/cblas/dnnl_blas.cc
+++ b/src/runtime/extra/contrib/cblas/dnnl_blas.cc
@@ -21,10 +21,10 @@
  * \file Use external cblas library call.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 extern "C" {
 #include <dnnl.h>
diff --git a/src/runtime/extra/contrib/cblas/gemm_common.h b/src/runtime/extra/contrib/cblas/gemm_common.h
index 52f306e86238..65b13aa4c728 100644
--- a/src/runtime/extra/contrib/cblas/gemm_common.h
+++ b/src/runtime/extra/contrib/cblas/gemm_common.h
@@ -26,8 +26,8 @@
 #define TVM_RUNTIME_CONTRIB_CBLAS_GEMM_COMMON_H_
 
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
-#include <tvm/runtime/data_type.h>
 
 #include <algorithm>
 #include <string>
@@ -37,7 +37,6 @@ namespace contrib {
 
 using ffi::Any;
 using ffi::PackedArgs;
-using runtime::TypeMatch;
 
 inline int ColumnStride(const DLTensor* tensor) {
   // If the tensor itself is transposed then it will have strides
@@ -96,8 +95,8 @@ inline void CallGemm(ffi::PackedArgs args, ffi::Any* ret, TGemmOp op) {
   transa = IsInPlaceTransposed(A) ? !transa : transa;
   transb = IsInPlaceTransposed(B) ? !transb : transb;
 
-  TVM_FFI_ICHECK(TypeMatch(B->dtype, kDLFloat, bit_depth));
-  TVM_FFI_ICHECK(TypeMatch(C->dtype, kDLFloat, bit_depth));
+  TVM_FFI_ICHECK((B->dtype == DLDataType{kDLFloat, static_cast<uint8_t>(bit_depth), 1}));
+  TVM_FFI_ICHECK((C->dtype == DLDataType{kDLFloat, static_cast<uint8_t>(bit_depth), 1}));
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
   op(transb, transa, ColumnCount(B, transb), RowCount(A, transa), ColumnCount(A, transa),
@@ -143,9 +142,9 @@ inline void CallU8S8S32Gemm(ffi::PackedArgs args, ffi::Any* ret, TGemmOp op) {
   transa = IsInPlaceTransposed(A) ? !transa : transa;
   transb = IsInPlaceTransposed(B) ? !transb : transb;
 
-  TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLUInt, 8));
-  TVM_FFI_ICHECK(TypeMatch(B->dtype, kDLInt, 8));
-  TVM_FFI_ICHECK(TypeMatch(C->dtype, kDLInt, 32));
+  TVM_FFI_ICHECK((A->dtype == DLDataType{kDLUInt, 8, 1}));
+  TVM_FFI_ICHECK((B->dtype == DLDataType{kDLInt, 8, 1}));
+  TVM_FFI_ICHECK((C->dtype == DLDataType{kDLInt, 32, 1}));
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
   op(transb, transa, ColumnCount(B, transb), RowCount(A, transa), ColumnCount(A, transa),
@@ -207,8 +206,8 @@ inline void CallBatchGemm(ffi::PackedArgs args, ffi::Any* ret, TBatchGemmOp op)
   transa = IsInPlaceTransposed3D(A) ? !transa : transa;
   transb = IsInPlaceTransposed3D(B) ? !transb : transb;
 
-  TVM_FFI_ICHECK(TypeMatch(B->dtype, kDLFloat, bit_depth));
-  TVM_FFI_ICHECK(TypeMatch(C->dtype, kDLFloat, bit_depth));
+  TVM_FFI_ICHECK((B->dtype == DLDataType{kDLFloat, static_cast<uint8_t>(bit_depth), 1}));
+  TVM_FFI_ICHECK((C->dtype == DLDataType{kDLFloat, static_cast<uint8_t>(bit_depth), 1}));
 
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
diff --git a/src/runtime/extra/contrib/cblas/mkl.cc b/src/runtime/extra/contrib/cblas/mkl.cc
index 20f0c539076b..f039df8e676f 100644
--- a/src/runtime/extra/contrib/cblas/mkl.cc
+++ b/src/runtime/extra/contrib/cblas/mkl.cc
@@ -21,10 +21,10 @@
  * \file Use external mkl library call.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 extern "C" {
 #include <mkl_cblas.h>
diff --git a/src/runtime/extra/contrib/coreml/coreml_runtime.mm b/src/runtime/extra/contrib/coreml/coreml_runtime.mm
index a72948b250a7..d9823407fb0a 100644
--- a/src/runtime/extra/contrib/coreml/coreml_runtime.mm
+++ b/src/runtime/extra/contrib/coreml/coreml_runtime.mm
@@ -44,15 +44,15 @@
     [shape addObject:[NSNumber numberWithInteger:data_in->shape[i]]];
   }
 
-  DataType dtype(data_in->dtype);
+  DLDataType dtype = data_in->dtype;
   MLMultiArrayDataType dataType;
-  if (dtype == DataType::Float(64)) {
+  if (dtype == DLDataType{kDLFloat, 64, 1}) {
     dataType = MLMultiArrayDataTypeDouble;
     size *= sizeof(double);
-  } else if (dtype == DataType::Float(32)) {
+  } else if (dtype == DLDataType{kDLFloat, 32, 1}) {
     dataType = MLMultiArrayDataTypeFloat32;
     size *= sizeof(float);
-  } else if (dtype == DataType::Int(32)) {
+  } else if (dtype == DLDataType{kDLInt, 32, 1}) {
     dataType = MLMultiArrayDataTypeInt32;
     size *= sizeof(int);
   } else {
@@ -87,15 +87,15 @@
     shape.push_back(n);
   }
 
-  DataType dtype;
+  DLDataType dtype = DLDataType{kDLOpaqueHandle, 0, 0};
   if (data_desc.dataType == MLMultiArrayDataTypeDouble) {
-    dtype = DataType::Float(64);
+    dtype = DLDataType{kDLFloat, 64, 1};
     size *= sizeof(double);
   } else if (data_desc.dataType == MLMultiArrayDataTypeFloat32) {
-    dtype = DataType::Float(32);
+    dtype = DLDataType{kDLFloat, 32, 1};
     size *= sizeof(float);
   } else if (data_desc.dataType == MLMultiArrayDataTypeInt32) {
-    dtype = DataType::Int(32);
+    dtype = DLDataType{kDLInt, 32, 1};
     size *= sizeof(int);
   } else {
     LOG(FATAL) << "unexpected data type " << data_desc.dataType;
diff --git a/src/runtime/extra/contrib/cublas/cublas.cc b/src/runtime/extra/contrib/cublas/cublas.cc
index 4ef1b702c16c..f114cfa6e939 100644
--- a/src/runtime/extra/contrib/cublas/cublas.cc
+++ b/src/runtime/extra/contrib/cublas/cublas.cc
@@ -21,11 +21,11 @@
  * \file Use external cblas library call.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/extra/c_env_api.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include "../../../../../3rdparty/compiler-rt/builtin_fp16.h"
 #include "../cblas/gemm_common.h"
@@ -170,9 +170,9 @@ void CallCublasLt(cublasLtHandle_t hdl, cudaStream_t stream,
     ab_type = CUDA_R_16BF;
   } else if (TypeMatch(A->dtype, kDLInt, 8)) {
     ab_type = CUDA_R_8I;
-  } else if (TypeMatch(A->dtype, DataType::TypeCode::kFloat8_e4m3fn, 8)) {
+  } else if (TypeMatch(A->dtype, kDLFloat8_e4m3fn, 8)) {
 #if CUDART_VERSION >= 11080
-    TVM_FFI_ICHECK(TypeMatch(B->dtype, DataType::TypeCode::kFloat8_e4m3fn, 8));
+    TVM_FFI_ICHECK(TypeMatch(B->dtype, kDLFloat8_e4m3fn, 8));
     ab_type = CUDA_R_8F_E4M3;
 #else
     TVM_FFI_THROW(InternalError) << "Float8 (E4M3) is only supported in CUDA 11.8 and above.";
diff --git a/src/runtime/extra/contrib/cudnn/conv_backward.cc b/src/runtime/extra/contrib/cudnn/conv_backward.cc
index df3d7c8e6ff7..97832248fe53 100644
--- a/src/runtime/extra/contrib/cudnn/conv_backward.cc
+++ b/src/runtime/extra/contrib/cudnn/conv_backward.cc
@@ -21,9 +21,9 @@
  * \file cuDNN kernel calls for backward algorithms.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/logging.h>
 
diff --git a/src/runtime/extra/contrib/cudnn/conv_forward.cc b/src/runtime/extra/contrib/cudnn/conv_forward.cc
index 3a573297f29e..b7257d35f2b5 100644
--- a/src/runtime/extra/contrib/cudnn/conv_forward.cc
+++ b/src/runtime/extra/contrib/cudnn/conv_forward.cc
@@ -21,9 +21,9 @@
  * \file cuDNN kernel calls for the forward algorithm.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/logging.h>
 
diff --git a/src/runtime/extra/contrib/cudnn/cudnn_utils.cc b/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
index 5c34d4a2b0a6..3edb20dbacbc 100644
--- a/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
+++ b/src/runtime/extra/contrib/cudnn/cudnn_utils.cc
@@ -23,10 +23,10 @@
 
 #include "cudnn_utils.h"
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/c_env_api.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include <string>
 #include <vector>
diff --git a/src/runtime/extra/contrib/cutlass/fp16_group_gemm.cuh b/src/runtime/extra/contrib/cutlass/fp16_group_gemm.cuh
index 35c4a5767236..85653222169b 100644
--- a/src/runtime/extra/contrib/cutlass/fp16_group_gemm.cuh
+++ b/src/runtime/extra/contrib/cutlass/fp16_group_gemm.cuh
@@ -49,17 +49,17 @@ void tvm_cutlass_group_gemm_impl(Tensor x, Tensor weight, Tensor indptr, Tensor
   float alpha = 1.0f;
   float beta = 0.0f;
 
-  if (DataType(x->dtype) == DataType::Float(16)) {
-    TVM_FFI_ICHECK(DataType(weight->dtype) == DataType::Float(16));
-    TVM_FFI_ICHECK(DataType(out->dtype) == DataType::Float(16));
+  if (x->dtype == DLDataType{kDLFloat, 16, 1}) {
+    TVM_FFI_ICHECK((weight->dtype == DLDataType{kDLFloat, 16, 1}));
+    TVM_FFI_ICHECK((out->dtype == DLDataType{kDLFloat, 16, 1}));
     using Dtype = cutlass::half_t;
     CutlassGroupGemm<Arch, Dtype, Dtype, Dtype>::run(
         static_cast<Dtype*>(x->data), static_cast<Dtype*>(weight->data),
         static_cast<int64_t*>(indptr->data), static_cast<uint8_t*>(workspace->data),
         workspace->shape[0], n, k, num_groups, alpha, beta, static_cast<Dtype*>(out->data), stream);
-  } else if (DataType(x->dtype) == DataType::BFloat(16)) {
-    TVM_FFI_ICHECK(DataType(weight->dtype) == DataType::BFloat(16));
-    TVM_FFI_ICHECK(DataType(out->dtype) == DataType::BFloat(16));
+  } else if (x->dtype == DLDataType{kDLBfloat, 16, 1}) {
+    TVM_FFI_ICHECK((weight->dtype == DLDataType{kDLBfloat, 16, 1}));
+    TVM_FFI_ICHECK((out->dtype == DLDataType{kDLBfloat, 16, 1}));
     using Dtype = cutlass::bfloat16_t;
     CutlassGroupGemm<Arch, Dtype, Dtype, Dtype>::run(
         static_cast<Dtype*>(x->data), static_cast<Dtype*>(weight->data),
diff --git a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh
index db88ec0faaed..1af60af4da3a 100644
--- a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh
+++ b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_gemm.cuh
@@ -66,14 +66,15 @@ void tvm_cutlass_fp8_groupwise_scaled_gemm_impl(Tensor a, Tensor b, Tensor scale
   TVM_FFI_ICHECK_EQ((n + block_size_0 - 1) / block_size_0, scales_b->shape[0]);
   TVM_FFI_ICHECK_EQ(scales_b->shape[1] * block_size_1, k);
 
-  using tvm::runtime::DataType;
-  TVM_FFI_ICHECK_EQ(DataType(a->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(b->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(scales_a->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(scales_b->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(workspace->dtype), DataType::UInt(8));
+  TVM_FFI_ICHECK_EQ(a->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));  // parens guard macro commas
+  TVM_FFI_ICHECK_EQ(b->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));
+  TVM_FFI_ICHECK_EQ(scales_a->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(scales_b->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(workspace->dtype, (DLDataType{kDLUInt, 8, 1}));
+  int64_t workspace_nbytes =
+      workspace->shape[0] * ((workspace->dtype.bits * workspace->dtype.lanes + 7) / 8);
 
-  if (DataType(out->dtype) == DataType::Float(16)) {
+  if (out->dtype == DLDataType{kDLFloat, 16, 1}) {
     CutlassFP8GroupwiseGemm<Arch, TileShape, ClusterShape, cutlass::float_e4m3_t,
                             cutlass::float_e4m3_t, cutlass::half_t,
                             float>::run(static_cast<cutlass::float_e4m3_t*>(a->data),
@@ -81,10 +82,9 @@ void tvm_cutlass_fp8_groupwise_scaled_gemm_impl(Tensor a, Tensor b, Tensor scale
                                         static_cast<float*>(scales_a->data),
                                         static_cast<float*>(scales_b->data),
                                         static_cast<cutlass::half_t*>(out->data),
-                                        static_cast<uint8_t*>(workspace->data),
-                                        workspace->shape[0] * DataType(workspace->dtype).bytes(), m,
+                                        static_cast<uint8_t*>(workspace->data), workspace_nbytes, m,
                                         n, k, 1, stream);
-  } else if (DataType(out->dtype) == DataType::BFloat(16)) {
+  } else if (out->dtype == DLDataType{kDLBfloat, 16, 1}) {
     CutlassFP8GroupwiseGemm<Arch, TileShape, ClusterShape, cutlass::float_e4m3_t,
                             cutlass::float_e4m3_t, cutlass::bfloat16_t,
                             float>::run(static_cast<cutlass::float_e4m3_t*>(a->data),
@@ -92,11 +92,10 @@ void tvm_cutlass_fp8_groupwise_scaled_gemm_impl(Tensor a, Tensor b, Tensor scale
                                         static_cast<float*>(scales_a->data),
                                         static_cast<float*>(scales_b->data),
                                         static_cast<cutlass::bfloat16_t*>(out->data),
-                                        static_cast<uint8_t*>(workspace->data),
-                                        workspace->shape[0] * DataType(workspace->dtype).bytes(), m,
+                                        static_cast<uint8_t*>(workspace->data), workspace_nbytes, m,
                                         n, k, 1, stream);
   } else {
-    LOG(FATAL) << "Unsupported output dtype: " << DataType(out->dtype);
+    LOG(FATAL) << "Unsupported output dtype: " << out->dtype;
   }
 }
 
@@ -131,14 +130,15 @@ void tvm_cutlass_fp8_groupwise_scaled_bmm_impl(Tensor a, Tensor b, Tensor scales
   TVM_FFI_ICHECK_EQ(scales_b->shape[1] * block_size_0, n);
   TVM_FFI_ICHECK_EQ(scales_b->shape[2] * block_size_1, k);
 
-  using tvm::runtime::DataType;
-  TVM_FFI_ICHECK_EQ(DataType(a->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(b->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(scales_a->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(scales_b->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(workspace->dtype), DataType::UInt(8));
+  TVM_FFI_ICHECK_EQ(a->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));  // parens guard macro commas
+  TVM_FFI_ICHECK_EQ(b->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));
+  TVM_FFI_ICHECK_EQ(scales_a->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(scales_b->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(workspace->dtype, (DLDataType{kDLUInt, 8, 1}));
+  int64_t workspace_nbytes =
+      workspace->shape[0] * ((workspace->dtype.bits * workspace->dtype.lanes + 7) / 8);
 
-  if (DataType(out->dtype) == DataType::Float(16)) {
+  if (out->dtype == DLDataType{kDLFloat, 16, 1}) {
     CutlassFP8GroupwiseGemm<Arch, TileShape, ClusterShape, cutlass::float_e4m3_t,
                             cutlass::float_e4m3_t, cutlass::half_t,
                             float>::run(static_cast<cutlass::float_e4m3_t*>(a->data),
@@ -146,10 +146,9 @@ void tvm_cutlass_fp8_groupwise_scaled_bmm_impl(Tensor a, Tensor b, Tensor scales
                                         static_cast<float*>(scales_a->data),
                                         static_cast<float*>(scales_b->data),
                                         static_cast<cutlass::half_t*>(out->data),
-                                        static_cast<uint8_t*>(workspace->data),
-                                        workspace->shape[0] * DataType(workspace->dtype).bytes(), m,
+                                        static_cast<uint8_t*>(workspace->data), workspace_nbytes, m,
                                         n, k, batch_size, stream);
-  } else if (DataType(out->dtype) == DataType::BFloat(16)) {
+  } else if (out->dtype == DLDataType{kDLBfloat, 16, 1}) {
     CutlassFP8GroupwiseGemm<Arch, TileShape, ClusterShape, cutlass::float_e4m3_t,
                             cutlass::float_e4m3_t, cutlass::bfloat16_t,
                             float>::run(static_cast<cutlass::float_e4m3_t*>(a->data),
@@ -157,11 +156,10 @@ void tvm_cutlass_fp8_groupwise_scaled_bmm_impl(Tensor a, Tensor b, Tensor scales
                                         static_cast<float*>(scales_a->data),
                                         static_cast<float*>(scales_b->data),
                                         static_cast<cutlass::bfloat16_t*>(out->data),
-                                        static_cast<uint8_t*>(workspace->data),
-                                        workspace->shape[0] * DataType(workspace->dtype).bytes(), m,
+                                        static_cast<uint8_t*>(workspace->data), workspace_nbytes, m,
                                         n, k, batch_size, stream);
   } else {
-    LOG(FATAL) << "Unsupported output dtype: " << DataType(out->dtype);
+    LOG(FATAL) << "Unsupported output dtype: " << out->dtype;
   }
 }
 
diff --git a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu
index ea70eee38650..6bd9f45ab25e 100644
--- a/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu
+++ b/src/runtime/extra/contrib/cutlass/fp8_groupwise_scaled_group_gemm_sm100.cu
@@ -57,15 +57,14 @@ void tvm_fp8_groupwise_scaled_group_gemm_sm100(Tensor a, Tensor b, Tensor scales
   TVM_FFI_ICHECK_EQ((n + block_size_0 - 1) / block_size_0, scales_b->shape[1]);
   TVM_FFI_ICHECK_EQ((k + block_size_1 - 1) / block_size_1, scales_b->shape[2]);
 
-  using tvm::runtime::DataType;
-  TVM_FFI_ICHECK_EQ(DataType(a->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(b->dtype), DataType::Float8E4M3FN());
-  TVM_FFI_ICHECK_EQ(DataType(scales_a->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(scales_b->dtype), DataType::Float(32));
-  TVM_FFI_ICHECK_EQ(DataType(indptr->dtype), DataType::Int(64));
-  TVM_FFI_ICHECK_EQ(DataType(workspace->dtype), DataType::UInt(8));
+  TVM_FFI_ICHECK_EQ(a->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));  // parens guard macro commas
+  TVM_FFI_ICHECK_EQ(b->dtype, (DLDataType{kDLFloat8_e4m3fn, 8, 1}));
+  TVM_FFI_ICHECK_EQ(scales_a->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(scales_b->dtype, (DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK_EQ(indptr->dtype, (DLDataType{kDLInt, 64, 1}));
+  TVM_FFI_ICHECK_EQ(workspace->dtype, (DLDataType{kDLUInt, 8, 1}));
 
-  if (DataType(out->dtype) == DataType::Float(16)) {
+  if (out->dtype == DLDataType{kDLFloat, 16, 1}) {
     using Dtype = cutlass::half_t;
     cutlass_fp8_groupwise_scaled_group_gemm_sm100<cutlass::float_e4m3_t, cutlass::float_e4m3_t,
                                                   Dtype, float>(
@@ -73,7 +72,7 @@ void tvm_fp8_groupwise_scaled_group_gemm_sm100(Tensor a, Tensor b, Tensor scales
         static_cast<float*>(scales_a->data), static_cast<float*>(scales_b->data),
         static_cast<int64_t*>(indptr->data), static_cast<uint8_t*>(workspace->data),
         workspace->shape[0], n, k, num_groups, static_cast<Dtype*>(out->data), stream);
-  } else if (DataType(out->dtype) == DataType::BFloat(16)) {
+  } else if (out->dtype == DLDataType{kDLBfloat, 16, 1}) {
     using Dtype = cutlass::bfloat16_t;
     cutlass_fp8_groupwise_scaled_group_gemm_sm100<cutlass::float_e4m3_t, cutlass::float_e4m3_t,
                                                   Dtype, float>(
diff --git a/src/runtime/extra/contrib/dnnl/dnnl_utils.cc b/src/runtime/extra/contrib/dnnl/dnnl_utils.cc
index 23992209f2ad..e41d378b3d30 100644
--- a/src/runtime/extra/contrib/dnnl/dnnl_utils.cc
+++ b/src/runtime/extra/contrib/dnnl/dnnl_utils.cc
@@ -32,21 +32,21 @@ namespace contrib {
 dnnl::memory::data_type dtype_dl2dnnl(DLDataType dltype) {
   using dt = dnnl::memory::data_type;
   dt dnnl_type = dt::undef;
-  if (dltype.code == DataType::TypeCode::kFloat) {
+  if (dltype.code == DLDataTypeCode::kDLFloat) {
     if (dltype.bits == 16) {
       dnnl_type = dt::f16;
     } else if (dltype.bits == 32) {
       dnnl_type = dt::f32;
     }
-  } else if (dltype.code == DataType::TypeCode::kBFloat && dltype.bits == 16) {
+  } else if (dltype.code == DLDataTypeCode::kDLBfloat && dltype.bits == 16) {
     dnnl_type = dt::bf16;
-  } else if (dltype.code == DataType::TypeCode::kInt) {
+  } else if (dltype.code == DLDataTypeCode::kDLInt) {
     if (dltype.bits == 8) {
       dnnl_type = dt::s8;
     } else if (dltype.bits == 32) {
       dnnl_type = dt::s32;
     }
-  } else if (dltype.code == DataType::TypeCode::kUInt && dltype.bits == 8) {
+  } else if (dltype.code == DLDataTypeCode::kDLUInt && dltype.bits == 8) {
     dnnl_type = dt::u8;
   }
   if (dnnl_type == dt::undef) {
diff --git a/src/runtime/extra/contrib/dnnl/dnnl_utils.h b/src/runtime/extra/contrib/dnnl/dnnl_utils.h
index a598b6704450..6f36ed4d8fbe 100644
--- a/src/runtime/extra/contrib/dnnl/dnnl_utils.h
+++ b/src/runtime/extra/contrib/dnnl/dnnl_utils.h
@@ -34,7 +34,7 @@
 //  -Wzero-as-null-pointer-constant and -Wdocumentation-unknown-command
 #include <dnnl.hpp>
 
-#include "tvm/runtime/data_type.h"
+#include "tvm/ffi/dtype.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/extra/contrib/hipblas/hipblas.cc b/src/runtime/extra/contrib/hipblas/hipblas.cc
index 5276b4f7956d..eae6f7241cc7 100644
--- a/src/runtime/extra/contrib/hipblas/hipblas.cc
+++ b/src/runtime/extra/contrib/hipblas/hipblas.cc
@@ -21,10 +21,10 @@
  * \file Use external hipblas library call.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include "../../../../../3rdparty/compiler-rt/builtin_fp16.h"
 #include "../cblas/gemm_common.h"
diff --git a/src/runtime/extra/contrib/json/json_node.h b/src/runtime/extra/contrib/json/json_node.h
index c165f6b05cf3..40c96d826914 100644
--- a/src/runtime/extra/contrib/json/json_node.h
+++ b/src/runtime/extra/contrib/json/json_node.h
@@ -29,9 +29,9 @@
 #include <tvm/ffi/any.h>
 #include <tvm/ffi/container/array.h>
 #include <tvm/ffi/container/map.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/json.h>
 #include <tvm/ffi/string.h>
-#include <tvm/runtime/data_type.h>
 
 #include <cstdint>
 #include <cstdio>
diff --git a/src/runtime/extra/contrib/nvshmem/memory_allocator.cc b/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
index cb6e3520c8c1..1483563b6200 100644
--- a/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
+++ b/src/runtime/extra/contrib/nvshmem/memory_allocator.cc
@@ -57,7 +57,7 @@ class NVSHMEMAllocator final : public PooledAllocator {
     return allocator;
   }
 
-  Tensor Empty(ffi::Shape shape, DataType dtype, Device device) {
+  Tensor Empty(ffi::Shape shape, DLDataType dtype, Device device) {
     class NVSHMEMAlloc {
      public:
       explicit NVSHMEMAlloc(Buffer buffer) : buffer_(buffer) {}
@@ -87,7 +87,7 @@ class NVSHMEMAllocator final : public PooledAllocator {
   void DeviceFreeDataSpace(Device dev, void* ptr) final { nvshmem_free(ptr); }
 };
 
-Tensor NVSHMEMEmpty(ffi::Shape shape, DataType dtype, ffi::Optional<Device> device) {
+Tensor NVSHMEMEmpty(ffi::Shape shape, DLDataType dtype, ffi::Optional<Device> device) {
   return NVSHMEMAllocator::Global()->Empty(shape, dtype, UseDefaultDeviceIfNone(device));
 }
 
diff --git a/src/runtime/extra/contrib/random/random.cc b/src/runtime/extra/contrib/random/random.cc
index a3d0cd8b85a8..81db658cb86e 100644
--- a/src/runtime/extra/contrib/random/random.cc
+++ b/src/runtime/extra/contrib/random/random.cc
@@ -21,10 +21,10 @@
  * \file External random functions for tensor.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include <algorithm>
 #include <cstdint>
diff --git a/src/runtime/extra/contrib/sort/sort.cc b/src/runtime/extra/contrib/sort/sort.cc
index 51a94111b6e6..6e3a99f93522 100644
--- a/src/runtime/extra/contrib/sort/sort.cc
+++ b/src/runtime/extra/contrib/sort/sort.cc
@@ -23,10 +23,10 @@
 
 #include <dlpack/dlpack.h>
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include <algorithm>
 #include <vector>
@@ -36,8 +36,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
-
 template <typename DType, bool stable_comparison = false>
 bool CompareAscend(const std::pair<int64_t, DType>& lhs, const std::pair<int64_t, DType>& rhs) {
   if constexpr (stable_comparison) {
diff --git a/src/runtime/extra/contrib/vllm/cache_alloc.cc b/src/runtime/extra/contrib/vllm/cache_alloc.cc
index 266138406cb9..42601d7a5e69 100644
--- a/src/runtime/extra/contrib/vllm/cache_alloc.cc
+++ b/src/runtime/extra/contrib/vllm/cache_alloc.cc
@@ -39,9 +39,9 @@ ffi::Array<Tensor> AllocateKVCache(int head_size, int num_layers, int num_heads,
   for (int i = 0; i < num_layers; ++i) {
     Tensor key_blocks =
         Tensor::Empty({num_blocks, num_heads, head_size / vec_size, block_size, vec_size},
-                      runtime::DataType::Float(16), dev);
+                      DLDataType{kDLFloat, 16, 1}, dev);
     Tensor value_blocks = Tensor::Empty({num_blocks, num_heads, head_size, block_size},
-                                        runtime::DataType::Float(16), dev);
+                                        DLDataType{kDLFloat, 16, 1}, dev);
     cache.push_back(key_blocks);
     cache.push_back(value_blocks);
   }
diff --git a/src/runtime/extra/contrib/vllm/cache_kernels.cu b/src/runtime/extra/contrib/vllm/cache_kernels.cu
index 5af93a1fd904..6a09497a8d12 100644
--- a/src/runtime/extra/contrib/vllm/cache_kernels.cu
+++ b/src/runtime/extra/contrib/vllm/cache_kernels.cu
@@ -206,16 +206,16 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         DLDevice dev = key_cache->device;
 
         Tensor key_cache_ptrs_gpu =
-            Tensor::Empty({static_cast<int>(num_layers)}, runtime::DataType::Int(64), dev);
+            Tensor::Empty({static_cast<int>(num_layers)}, DLDataType{kDLInt, 64, 1}, dev);
         Tensor value_cache_ptrs_gpu =
-            Tensor::Empty({static_cast<int>(num_layers)}, runtime::DataType::Int(64), dev);
+            Tensor::Empty({static_cast<int>(num_layers)}, DLDataType{kDLInt, 64, 1}, dev);
         key_cache_ptrs_gpu.CopyFromBytes(key_cache_ptrs.data(),
                                          sizeof(int64_t) * key_cache_ptrs.size());
         value_cache_ptrs_gpu.CopyFromBytes(value_cache_ptrs.data(),
                                            sizeof(int64_t) * value_cache_ptrs.size());
 
         Tensor block_mapping_gpu =
-            Tensor::Empty(block_mapping.Shape(), runtime::DataType::Int(64), dev);
+            Tensor::Empty(block_mapping.Shape(), DLDataType{kDLInt, 64, 1}, dev);
         block_mapping_gpu.CopyFromBytes(block_mapping->data,
                                         sizeof(int64_t) * block_mapping->shape[0]);
 
diff --git a/src/runtime/extra/disco/builtin.cc b/src/runtime/extra/disco/builtin.cc
index da9f472b3e76..d9d5fc132768 100644
--- a/src/runtime/extra/disco/builtin.cc
+++ b/src/runtime/extra/disco/builtin.cc
@@ -71,7 +71,7 @@ ffi::Module LoadVMModule(std::string path, ffi::Optional<Device> device) {
   return mod;
 }
 
-Tensor DiscoEmptyTensor(ffi::Shape shape, DataType dtype, ffi::Optional<Device> device) {
+Tensor DiscoEmptyTensor(ffi::Shape shape, DLDataType dtype, ffi::Optional<Device> device) {
   return Tensor::Empty(shape, dtype, UseDefaultDeviceIfNone(device));
 }
 
@@ -131,7 +131,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   refl::GlobalDef()
       .def("runtime.disco.load_vm_module", LoadVMModule)
       .def("runtime.disco.empty",
-           [](ffi::Shape shape, DataType dtype, ffi::Optional<Device> device, bool worker0_only,
+           [](ffi::Shape shape, DLDataType dtype, ffi::Optional<Device> device, bool worker0_only,
               bool in_group) -> ffi::Optional<Tensor> {
              int worker_id = WorkerId();
              int group_size =
diff --git a/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc b/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
index 426557b7b7ad..a8a8030f0169 100644
--- a/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
+++ b/src/runtime/extra/disco/cuda_ipc/cuda_ipc_memory.cc
@@ -97,10 +97,12 @@ class CUDAIPCMemoryAllocator final : public memory::PooledAllocator {
     auto [data_ptr, data_comm_ptrs] =
         AllocIPCMemory(dev, size, alignment, type_hint, /*reset_memory_to_zero=*/false);
     int barrier_ptr_size = sizeof(uint32_t) * (MAX_ALL_REDUCE_BLOCKS + 2) * MAX_RANKS_PER_NODE;
-    auto [barrier_in_ptr, barrier_in_comm_ptrs] = AllocIPCMemory(
-        dev, barrier_ptr_size, alignment, DataType::UInt(32), /*reset_memory_to_zero=*/true);
-    auto [barrier_out_ptr, barrier_out_comm_ptrs] = AllocIPCMemory(
-        dev, barrier_ptr_size, alignment, DataType::UInt(32), /*reset_memory_to_zero=*/true);
+    auto [barrier_in_ptr, barrier_in_comm_ptrs] =
+        AllocIPCMemory(dev, barrier_ptr_size, alignment, DLDataType{kDLUInt, 32, 1},
+                       /*reset_memory_to_zero=*/true);
+    auto [barrier_out_ptr, barrier_out_comm_ptrs] =
+        AllocIPCMemory(dev, barrier_ptr_size, alignment, DLDataType{kDLUInt, 32, 1},
+                       /*reset_memory_to_zero=*/true);
 
     // Create the CUDAIPCMemory object.
     ffi::ObjectPtr<CUDAIPCMemoryObj> ipc_memory = ffi::make_object<CUDAIPCMemoryObj>();
diff --git a/src/runtime/extra/disco/cuda_ipc/custom_allreduce.cc b/src/runtime/extra/disco/cuda_ipc/custom_allreduce.cc
index ffe00d5feef9..3eaca5ba98d4 100644
--- a/src/runtime/extra/disco/cuda_ipc/custom_allreduce.cc
+++ b/src/runtime/extra/disco/cuda_ipc/custom_allreduce.cc
@@ -81,7 +81,7 @@ void CustomAllReduce(DLTensor* send, int strategy, DLTensor* recv) {
     // Dispatch to nccl AllReduce if the customized all-reduce cannot apply.
     deviceStream_t stream = ctx->GetDefaultStream();
     NCCL_CALL(ncclAllReduce(send->data, recv->data, num_elements,
-                            /*datatype=*/nccl::AsNCCLDataType(DataType(send->dtype)),
+                            /*datatype=*/nccl::AsNCCLDataType(send->dtype),
                             /*op=*/ncclSum, ctx->global_comm, stream));
     return;
   }
diff --git a/src/runtime/extra/disco/loader.cc b/src/runtime/extra/disco/loader.cc
index 86caac6573ed..f714112aecf3 100644
--- a/src/runtime/extra/disco/loader.cc
+++ b/src/runtime/extra/disco/loader.cc
@@ -17,10 +17,10 @@
  * under the License.
  */
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/json.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/disco/builtin.h>
 #include <tvm/runtime/vm/tensor_cache_support.h>
 
@@ -45,7 +45,7 @@ using ParamRecord = TensorCacheMetadata::FileRecord::ParamRecord;
 struct ShardInfo {
   struct TensorInfo {
     ffi::Shape shape;
-    DataType dtype;
+    DLDataType dtype;
   };
   struct ShardFunc {
     std::string name;
@@ -67,8 +67,7 @@ ShardInfo::TensorInfo LoadTensorInfoFromJSON(const json::Array& json_tensor_info
     shape.push_back(shape_json[i].cast<int64_t>());
   }
   std::string dtype = json_tensor_info[1].cast<ffi::String>();
-  return ShardInfo::TensorInfo{ffi::Shape(std::move(shape)),
-                               DataType(ffi::StringToDLDataType(dtype))};
+  return ShardInfo::TensorInfo{ffi::Shape(std::move(shape)), ffi::StringToDLDataType(dtype)};
 }
 
 ShardInfo::ShardFunc LoadShardFuncFromJSON(const json::Array& json_shard_func) {
@@ -301,7 +300,7 @@ Tensor ShardLoaderObj::Load(int weight_index) const {
   bool needs_sharding = !param_info.shard_info.funcs.empty();
   if (needs_sharding) {
     ffi::Shape shape = param_info.shard_info.funcs.back().output_info.shape;
-    DataType dtype = param_info.shard_info.funcs.back().output_info.dtype;
+    DLDataType dtype = param_info.shard_info.funcs.back().output_info.dtype;
     TVM_FFI_CHECK(shape.size() >= 1 && shape[0] == num_shards, ValueError)
         << "The first dimension of the "
         << "output shape must be equal to the "
diff --git a/src/runtime/extra/disco/nccl/nccl.cc b/src/runtime/extra/disco/nccl/nccl.cc
index 887f440b1b4f..cd00a1ac3d6b 100644
--- a/src/runtime/extra/disco/nccl/nccl.cc
+++ b/src/runtime/extra/disco/nccl/nccl.cc
@@ -122,8 +122,8 @@ void AllReduce(Tensor send, ReduceKind reduce_kind, bool in_group, Tensor recv)
   ffi::Shape shape = send.Shape();
   int64_t numel = shape->Product();
   deviceStream_t stream = ctx->GetDefaultStream();
-  DataType dtype = DataType(send->dtype);
-  if (dtype == DataType::Float8E4M3FN() || dtype == DataType::Float8E5M2()) {
+  DLDataType dtype = send->dtype;
+  if (dtype == DLDataType{kDLFloat8_e4m3fn, 8, 1} || dtype == DLDataType{kDLFloat8_e5m2, 8, 1}) {
     TVM_FFI_THROW(InternalError)
         << "Float8 data type cannot be allreduced, as nccl does not support this data type.";
   }
@@ -139,7 +139,7 @@ void AllGather(Tensor send, bool in_group, Tensor recv) {
   int64_t numel = shape->Product();
   deviceStream_t stream = ctx->GetDefaultStream();
   NCCL_CALL(ncclAllGather(send->data, recv->data, numel,
-                          /*datatype=*/AsNCCLDataType(DataType(send->dtype)),
+                          /*datatype=*/AsNCCLDataType(send->dtype),
                           in_group ? ctx->group_comm : ctx->global_comm, stream));
 }
 
@@ -162,7 +162,7 @@ void BroadcastFromWorker0(ffi::Optional<Tensor> send, bool in_group, Tensor recv
 
   deviceStream_t stream = ctx->GetDefaultStream();
   NCCL_CALL(ncclBroadcast(send_data, recv->data, numel,
-                          /*datatype=*/AsNCCLDataType(DataType(recv->dtype)),
+                          /*datatype=*/AsNCCLDataType(recv->dtype),
                           /*root=*/0, in_group ? ctx->group_comm : ctx->global_comm, stream));
 }
 
@@ -185,9 +185,9 @@ void ScatterFromWorker0(ffi::Optional<Tensor> send, bool in_group, Tensor recv)
            "of elements in the buffer to be "
            "divisible by the number of workers, but got numel = "
         << numel << " and " << num_receiver << " workers.";
-    DataType dtype(buffer->dtype);
+    DLDataType dtype = buffer->dtype;
     int64_t numel_per_shard = numel / num_receiver;
-    int64_t bytes_per_shard = numel_per_shard * dtype.bytes();
+    int64_t bytes_per_shard = numel_per_shard * ((dtype.bits * dtype.lanes + 7) / 8);
     TVM_FFI_CHECK_EQ(numel_per_shard, recv.Shape().Product(), ValueError)
         << "The number of elements in buffer `recv` must be the same as each shard "
            "of "
@@ -209,7 +209,7 @@ void ScatterFromWorker0(ffi::Optional<Tensor> send, bool in_group, Tensor recv)
     NCCL_CALL(ncclGroupStart());
   }
   int64_t numel = recv.Shape().Product();
-  DataType dtype(recv->dtype);
+  DLDataType dtype = recv->dtype;
   NCCL_CALL(ncclRecv(recv->data, numel, AsNCCLDataType(dtype), 0,
                      in_group ? ctx->group_comm : ctx->global_comm, stream));
   NCCL_CALL(ncclGroupEnd());
@@ -234,9 +234,9 @@ void GatherToWorker0(Tensor send, bool in_group, ffi::Optional<Tensor> recv) {
            "of elements in the buffer to be "
            "divisible by the number of workers, but got numel = "
         << numel << " and " << num_receiver << " workers.";
-    DataType dtype(buffer->dtype);
+    DLDataType dtype = buffer->dtype;
     int64_t numel_per_shard = numel / num_receiver;
-    int64_t bytes_per_shard = numel_per_shard * dtype.bytes();
+    int64_t bytes_per_shard = numel_per_shard * ((dtype.bits * dtype.lanes + 7) / 8);
     TVM_FFI_CHECK_EQ(numel_per_shard, send.Shape().Product(), ValueError)
         << "The number of elements in buffer `send` must be the same as each shard "
            "of "
@@ -258,7 +258,7 @@ void GatherToWorker0(Tensor send, bool in_group, ffi::Optional<Tensor> recv) {
     NCCL_CALL(ncclGroupStart());
   }
   int64_t numel = send.Shape().Product();
-  DataType dtype(send->dtype);
+  DLDataType dtype = send->dtype;
   NCCL_CALL(ncclSend(send->data, numel, AsNCCLDataType(dtype), 0,
                      in_group ? ctx->group_comm : ctx->global_comm, stream));
   NCCL_CALL(ncclGroupEnd());
diff --git a/src/runtime/extra/disco/nccl/nccl_context.h b/src/runtime/extra/disco/nccl/nccl_context.h
index 7a99be0897c0..d529ab441d11 100644
--- a/src/runtime/extra/disco/nccl/nccl_context.h
+++ b/src/runtime/extra/disco/nccl/nccl_context.h
@@ -86,39 +86,39 @@ inline void StreamDestroy(deviceStream_t stream) { ROCM_CALL(hipStreamDestroy(st
 
 #endif
 
-/*! \brief Convert DataType to ncclDataType. */
-inline ncclDataType_t AsNCCLDataType(runtime::DataType dtype) {
-  if (dtype == DataType::Int(8)) {
+/*! \brief Convert DLPack dtype to ncclDataType. */
+inline ncclDataType_t AsNCCLDataType(DLDataType dtype) {
+  if (dtype == DLDataType{kDLInt, 8, 1}) {
     return ncclInt8;
   }
-  if (dtype == DataType::UInt(8) || dtype == DataType::Float8E4M3FN() ||
-      dtype == DataType::Float8E5M2()) {
+  if (dtype == DLDataType{kDLUInt, 8, 1} || dtype == DLDataType{kDLFloat8_e4m3fn, 8, 1} ||
+      dtype == DLDataType{kDLFloat8_e5m2, 8, 1}) {
     // For float8 data type, pretend to be uint8 in nccl.
     // And will throw error when allreduce, as it makes no sense in this case.
     return ncclUint8;
   }
-  if (dtype == DataType::Int(32)) {
+  if (dtype == DLDataType{kDLInt, 32, 1}) {
     return ncclInt32;
   }
-  if (dtype == DataType::UInt(32)) {
+  if (dtype == DLDataType{kDLUInt, 32, 1}) {
     return ncclUint32;
   }
-  if (dtype == DataType::Int(64)) {
+  if (dtype == DLDataType{kDLInt, 64, 1}) {
     return ncclInt64;
   }
-  if (dtype == DataType::UInt(64)) {
+  if (dtype == DLDataType{kDLUInt, 64, 1}) {
     return ncclUint64;
   }
-  if (dtype == DataType::Float(16)) {
+  if (dtype == DLDataType{kDLFloat, 16, 1}) {
     return ncclFloat16;
   }
-  if (dtype == DataType::Float(32)) {
+  if (dtype == DLDataType{kDLFloat, 32, 1}) {
     return ncclFloat32;
   }
-  if (dtype == DataType::Float(64)) {
+  if (dtype == DLDataType{kDLFloat, 64, 1}) {
     return ncclFloat64;
   }
-  if (dtype == DataType::BFloat(16)) {
+  if (dtype == DLDataType{kDLBfloat, 16, 1}) {
     return ncclBfloat16;
   }
   TVM_FFI_THROW(ValueError) << "Unsupported data type " << dtype;
diff --git a/src/runtime/tensor.cc b/src/runtime/tensor.cc
index 887d576537f2..ed12d0b4885a 100644
--- a/src/runtime/tensor.cc
+++ b/src/runtime/tensor.cc
@@ -33,7 +33,7 @@
 
 #include "../support/base64.h"
 #include "../support/bytes_io.h"
-#include "tvm/runtime/data_type.h"
+#include "tvm/ffi/dtype.h"
 
 namespace tvm {
 namespace runtime {
@@ -52,11 +52,11 @@ inline void VerifyDataType(DLDataType dtype) {
       return;
     else if (dtype.bits == 4 && dtype.code == kDLInt)
       return;
-    else if (dtype.bits == 6 && dtype.code == DataType::kFloat6_e2m3fn)
+    else if (dtype.bits == 6 && dtype.code == kDLFloat6_e2m3fn)
       return;
-    else if (dtype.bits == 6 && dtype.code == DataType::kFloat6_e3m2fn)
+    else if (dtype.bits == 6 && dtype.code == kDLFloat6_e3m2fn)
       return;
-    else if (dtype.bits == 4 && dtype.code == DataType::kFloat4_e2m1fn)
+    else if (dtype.bits == 4 && dtype.code == kDLFloat4_e2m1fn)
       return;
     else
       TVM_FFI_ICHECK_EQ(dtype.bits % 8, 0);
diff --git a/src/runtime/vm/attn_backend.h b/src/runtime/vm/attn_backend.h
index 067fa8d10dc1..6aececc755ea 100644
--- a/src/runtime/vm/attn_backend.h
+++ b/src/runtime/vm/attn_backend.h
@@ -321,7 +321,7 @@ class PagedDecodeFunc : public AttnBackendFunc {
                             Tensor page_locked_int_workspace_buffer, HostMemoryVector* page_indptr,
                             int64_t batch_size, int64_t page_size, int64_t num_qo_heads,
                             int64_t num_kv_heads, int64_t qk_head_dim, int64_t v_head_dim,
-                            RoPEMode rope_mode, DataType q_dtype, DataType kv_dtype,
+                            RoPEMode rope_mode, DLDataType q_dtype, DLDataType kv_dtype,
                             TVMStreamHandle copy_stream) {
     // Do nothing. Subclasses can override to customize behavior.
   }
@@ -377,7 +377,7 @@ class FlashInferPagedDecodeFunc : public PagedDecodeFunc {
                     Tensor page_locked_int_workspace_buffer, HostMemoryVector* page_indptr,
                     int64_t batch_size, int64_t page_size, int64_t num_qo_heads,
                     int64_t num_kv_heads, int64_t qk_head_dim, int64_t v_head_dim,
-                    RoPEMode rope_mode, DataType q_dtype, DataType kv_dtype,
+                    RoPEMode rope_mode, DLDataType q_dtype, DLDataType kv_dtype,
                     TVMStreamHandle copy_stream) final {
     // Todo(tvm-team): enable cuda graph
     ffi::Shape plan_info_vec =
diff --git a/src/runtime/vm/attn_utils.h b/src/runtime/vm/attn_utils.h
index 7a2c93414c0f..4f9cd648e9d7 100644
--- a/src/runtime/vm/attn_utils.h
+++ b/src/runtime/vm/attn_utils.h
@@ -359,7 +359,7 @@ class HostMemoryVector {
 
   explicit HostMemoryVector(int64_t reserved_size, DLDataType dtype, Device device)
       : reserved_size_(reserved_size) {
-    TVM_FFI_ICHECK(DataType(dtype) == DataType::Int(32));
+    TVM_FFI_ICHECK((dtype == DLDataType{kDLInt, 32, 1}));
     data_ = Tensor::Empty({reserved_size}, dtype, device);
   }
 
@@ -368,7 +368,7 @@ class HostMemoryVector {
     if (current_size_ == reserved_size_) {
       reserved_size_ *= 2;
       Tensor new_data = Tensor::Empty({reserved_size_}, data_->dtype, data_->device);
-      std::memcpy(new_data->data, data_->data, current_size_ * DataType(data_->dtype).bytes());
+      std::memcpy(new_data->data, data_->data, current_size_ * (((data_->dtype).bits + 7) / 8));
       data_ = new_data;
     }
     static_cast<int32_t*>(data_->data)[current_size_++] = value;
@@ -382,7 +382,7 @@ class HostMemoryVector {
         reserved_size_ *= 2;
       }
       Tensor new_data = Tensor::Empty({reserved_size_}, data_->dtype, data_->device);
-      std::memcpy(new_data->data, data_->data, current_size_ * DataType(data_->dtype).bytes());
+      std::memcpy(new_data->data, data_->data, current_size_ * (((data_->dtype).bits + 7) / 8));
       data_ = new_data;
     }
     std::memcpy(static_cast<int32_t*>(data_->data) + current_size_, values.data(),
@@ -466,7 +466,7 @@ class PagedKVCacheAuxDataManager {
         device_(device),
         preferred_host_device_(preferred_host_device),
         copy_stream_(copy_stream) {
-    TVM_FFI_ICHECK(DataType(dtype_aux) == DataType::Int(32));
+    TVM_FFI_ICHECK((dtype_aux == DLDataType{kDLInt, 32, 1}));
   }
 
   virtual ~PagedKVCacheAuxDataManager() = default;
diff --git a/src/runtime/vm/builtin.cc b/src/runtime/vm/builtin.cc
index 8fc18c5c0722..30fbf77b9c7f 100644
--- a/src/runtime/vm/builtin.cc
+++ b/src/runtime/vm/builtin.cc
@@ -22,11 +22,11 @@
 #include <tvm/ffi/any.h>
 #include <tvm/ffi/container/array.h>
 #include <tvm/ffi/container/shape.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/memory.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/memory/memory_manager.h>
 #include <tvm/runtime/tensor.h>
@@ -243,14 +243,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 void CheckTensorInfo(ffi::PackedArgs args, ffi::Any* rv) {
   ffi::AnyView arg = args[0];
   int ndim = args[1].cast<int>();
-  DataType dtype;
+  DLDataType dtype;
   ffi::Optional<ffi::String> err_ctx;
 
   if (args.size() == 3) {
-    dtype = DataType::Void();
+    dtype = DLDataType{kDLOpaqueHandle, 0, 0};
     err_ctx = args[2].cast<ffi::Optional<ffi::String>>();
   } else {
-    dtype = args[2].cast<DataType>();
+    dtype = args[2].cast<DLDataType>();
     err_ctx = args[3].cast<ffi::Optional<ffi::String>>();
   }
 
@@ -264,10 +264,10 @@ void CheckTensorInfo(ffi::PackedArgs args, ffi::Any* rv) {
         << err_ctx.value_or("") << " expect Tensor with ndim " << ndim << " but get " << ptr->ndim;
   }
 
-  if (dtype != DataType::Void()) {
-    TVM_FFI_CHECK(DataType(ptr->dtype) == dtype, ValueError)
+  if (dtype != DLDataType{kDLOpaqueHandle, 0, 0}) {
+    TVM_FFI_CHECK(ptr->dtype == dtype, ValueError)
         << err_ctx.value_or("") << " expect Tensor with dtype " << dtype << " but get "
-        << DataType(ptr->dtype);
+        << ptr->dtype;
   }
 }
 
@@ -301,23 +301,24 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 /*!
  * \brief Builtin function to check if arg is PrimValue(dtype)
  * \param arg The input argument.
- * \param dtype Expected dtype of the PrimValue.  Can be DataType::Void() for unknown dtype.
+ * \param dtype Expected dtype of the PrimValue.  Can be DLDataType{kDLOpaqueHandle, 0, 0} for
+ * unknown dtype.
  * \param err_ctx Additional context if error occurs.
  */
-void CheckPrimValueInfo(ffi::AnyView arg, DataType dtype, ffi::Optional<ffi::String> err_ctx) {
+void CheckPrimValueInfo(ffi::AnyView arg, DLDataType dtype, ffi::Optional<ffi::String> err_ctx) {
   if (auto opt_obj = arg.as<ffi::ObjectRef>()) {
     TVM_FFI_THROW(TypeError) << err_ctx.value_or("") << ", expected dtype " << dtype
                              << ", but received ObjectRef of type "
                              << opt_obj.value()->GetTypeKey();
-  } else if (dtype.is_bool()) {
+  } else if (((dtype).code == kDLBool)) {
     arg.cast<bool>();
-  } else if (dtype.is_int()) {
+  } else if (((dtype).code == kDLInt)) {
     arg.cast<int64_t>();
-  } else if (dtype.is_uint()) {
+  } else if (((dtype).code == kDLUInt)) {
     arg.cast<uint64_t>();
-  } else if (dtype.is_float()) {
+  } else if (((dtype).code == kDLFloat)) {
     arg.cast<double>();
-  } else if (dtype.is_handle()) {
+  } else if (dtype.code == kDLOpaqueHandle && !(dtype.bits == 0 && dtype.lanes == 0)) {
     arg.cast<void*>();
   } else {
     TVM_FFI_THROW(TypeError) << err_ctx.value_or("") << ", unsupported dtype " << dtype;
@@ -398,7 +399,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         Storage sobj = args[0].cast<Storage>();
         int64_t offset = args[1].cast<int64_t>();
         ffi::Shape shape = args[2].cast<ffi::Shape>();
-        DataType dtype = args[3].cast<DataType>();
+        DLDataType dtype = args[3].cast<DLDataType>();
         if (args.size() == 5) {
           ffi::String scope = args[4].cast<ffi::String>();
           *rv = sobj->AllocTensorScoped(offset, shape, dtype, scope);
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index 33ff1503f823..9e3a5f932309 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -101,8 +101,7 @@ std::string VMExecutable::Stats() const {
       oss << opt_int.value();
       oss << ", ";
     } else if (auto opt_dtype = it.as<DLDataType>()) {
-      DataType dtype(opt_dtype.value());
-      oss << dtype;
+      oss << opt_dtype.value();
       oss << ", ";
     } else {
       TVM_FFI_THROW(InternalError) << "Unsupported constant pool type " << it.GetTypeKey();
diff --git a/src/runtime/vm/lm_support.cc b/src/runtime/vm/lm_support.cc
index 51b271441a27..2516e0d8a1af 100644
--- a/src/runtime/vm/lm_support.cc
+++ b/src/runtime/vm/lm_support.cc
@@ -362,7 +362,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // NOTE this is a built-in highly related to LM so we put it here.
 int SampleTopPFromLogits(Tensor logits, double temperature, double top_p, double uniform_sample) {
   TVM_FFI_ICHECK(logits.IsContiguous());
-  TVM_FFI_ICHECK(logits.DataType() == DataType::Float(32));
+  TVM_FFI_ICHECK((logits.DataType() == DLDataType{kDLFloat, 32, 1}));
 
   if (logits->device.device_type != kDLCPU) {
     logits = logits.CopyTo(DLDevice{kDLCPU, 0});
@@ -428,7 +428,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 int SampleTopPFromProb(Tensor prob, double top_p, double uniform_sample) {
   TVM_FFI_ICHECK(prob.IsContiguous());
-  TVM_FFI_ICHECK(prob.DataType() == DataType::Float(32));
+  TVM_FFI_ICHECK((prob.DataType() == DLDataType{kDLFloat, 32, 1}));
 
   if (prob->device.device_type != kDLCPU) {
     prob = prob.CopyTo(DLDevice{kDLCPU, 0});
@@ -543,7 +543,8 @@ Tensor MultinomialFromUniform(Tensor prob, Tensor uniform_sample) {
   int64_t vocab_size = prob->shape[prob->ndim - 1];
   const float* pprob = static_cast<float*>(prob->data);
   const float* psample = static_cast<float*>(uniform_sample->data);
-  Tensor new_array = Tensor::Empty({batch_size, 1}, DataType::Int(64), uniform_sample->device);
+  Tensor new_array =
+      Tensor::Empty({batch_size, 1}, DLDataType{kDLInt, 64, 1}, uniform_sample->device);
   int64_t* parray = static_cast<int64_t*>(new_array->data);
   for (int64_t i = 0; i < batch_size; ++i) {
     float cum_sum_prob = 0.0f;
@@ -569,8 +570,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 void ApplyRepetitionPenalty(Tensor logits, Tensor token_ids, double penalty) {
   TVM_FFI_ICHECK(logits.IsContiguous());
   TVM_FFI_ICHECK(token_ids.IsContiguous());
-  TVM_FFI_ICHECK(logits.DataType() == DataType::Float(32)) << "Logits data type is not float32!";
-  TVM_FFI_ICHECK(token_ids.DataType() == DataType::Int(32)) << "token ids must be int32!";
+  TVM_FFI_ICHECK((logits.DataType() == DLDataType{kDLFloat, 32, 1}))
+      << "Logits data type is not float32!";
+  TVM_FFI_ICHECK((token_ids.DataType() == DLDataType{kDLInt, 32, 1})) << "token ids must be int32!";
   TVM_FFI_ICHECK(logits->device.device_type == kDLCPU) << "logits device must be CPU!";
   TVM_FFI_ICHECK(token_ids->device.device_type == kDLCPU) << "token_ids device must be CPU!";
   float* logits_raw_data = static_cast<float*>(logits->data);
@@ -606,9 +608,11 @@ void ApplyPresenceAndFrequencyPenalty(Tensor logits, Tensor token_ids, Tensor to
   TVM_FFI_ICHECK(logits.IsContiguous());
   TVM_FFI_ICHECK(token_ids.IsContiguous());
   TVM_FFI_ICHECK(token_freqs.IsContiguous());
-  TVM_FFI_ICHECK(logits.DataType() == DataType::Float(32)) << "Logits data type is not float32!";
-  TVM_FFI_ICHECK(token_ids.DataType() == DataType::Int(32)) << "token ids must be int32!";
-  TVM_FFI_ICHECK(token_freqs.DataType() == DataType::Int(32)) << "token freqs must be int32!";
+  TVM_FFI_ICHECK((logits.DataType() == DLDataType{kDLFloat, 32, 1}))
+      << "Logits data type is not float32!";
+  TVM_FFI_ICHECK((token_ids.DataType() == DLDataType{kDLInt, 32, 1})) << "token ids must be int32!";
+  TVM_FFI_ICHECK((token_freqs.DataType() == DLDataType{kDLInt, 32, 1}))
+      << "token freqs must be int32!";
   TVM_FFI_ICHECK(logits->device.device_type == kDLCPU) << "logits device must be CPU!";
   TVM_FFI_ICHECK(token_ids->device.device_type == kDLCPU) << "token_ids device must be CPU!";
   TVM_FFI_ICHECK(token_freqs->device.device_type == kDLCPU) << "token_ids device must be CPU!";
@@ -633,7 +637,8 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // This is an inplace operation.
 void ApplySoftmaxWithTemperature(Tensor logits, double temperature) {
   TVM_FFI_ICHECK(logits.IsContiguous());
-  TVM_FFI_ICHECK(logits.DataType() == DataType::Float(32)) << "Logits data type is not float32!";
+  TVM_FFI_ICHECK((logits.DataType() == DLDataType{kDLFloat, 32, 1}))
+      << "Logits data type is not float32!";
   TVM_FFI_ICHECK(logits->device.device_type == kDLCPU) << "logits device must be CPU!";
   int vocab_size = logits->shape[logits->ndim - 1];
   float* logits_raw_data = static_cast<float*>(logits->data);
diff --git a/src/runtime/vm/paged_kv_cache.cc b/src/runtime/vm/paged_kv_cache.cc
index e5c4576e01c1..cd7920d6eef0 100644
--- a/src/runtime/vm/paged_kv_cache.cc
+++ b/src/runtime/vm/paged_kv_cache.cc
@@ -116,9 +116,9 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
   const ffi::Optional<Tensor> rope_ext_factors_;
 
   /*! \brief The KV cache dtype. */
-  const DataType kv_dtype_;
+  const DLDataType kv_dtype_;
   /*! \brief We fix int32 to be the index dtype of auxiliary data. */
-  const DLDataType dtype_aux_ = DLDataType(DataType::Int(32, 1));
+  const DLDataType dtype_aux_ = DLDataType{kDLInt, 32, 1};
 
   /********************* Page Structures *********************/
 
@@ -326,7 +326,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
         rotary_scale_(rotary_scale),
         rotary_theta_(rotary_theta),
         rope_ext_factors_(std::move(rope_ext_factors)),
-        kv_dtype_(DataType(dtype)),
+        kv_dtype_(dtype),
         reserved_num_seqs_(reserved_num_seqs),
         f_transpose_append_mha_(std::move(f_transpose_append_mha)),
         f_transpose_append_mla_(std::move(f_transpose_append_mla)),
@@ -372,7 +372,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
         pages_.push_back(nvshmem_pages_.CreateView(
             {num_total_pages_, 2, num_kv_heads_, page_size_, qk_head_dim_}, nvshmem_pages_->dtype,
             i * num_total_pages_ * 2 * num_kv_heads_ * page_size_ * qk_head_dim_ *
-                nvshmem_pages_.DataType().bytes()));
+                ((nvshmem_pages_.DataType().bits + 7) / 8)));
       }
 
       const auto f_transfer_kv_ptr = tvm::ffi::Function::GetGlobal("nvshmem.KVTransfer");
@@ -450,9 +450,9 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
     for (int d = 0; d < kPagedKVCacheMaxBlockDepth; ++d) {
       if (NeedKernelBeginForward()) {
         temp_int_attn_workspace_.push_back(
-            Tensor::Empty({kIntAttnWorkspaceByte}, DataType::UInt(8), device));
+            Tensor::Empty({kIntAttnWorkspaceByte}, DLDataType{kDLUInt, 8, 1}, device));
         temp_int_pinned_attn_workspace_.push_back(Tensor::Empty(
-            {kIntAttnWorkspaceByte}, DataType::UInt(8), GetPreferredHostDevice(device)));
+            {kIntAttnWorkspaceByte}, DLDataType{kDLUInt, 8, 1}, GetPreferredHostDevice(device)));
       }
       qo_indptr_on_depths_view_.push_back(Tensor());
       page_indptr_on_depths_view_.push_back(Tensor());
@@ -470,11 +470,11 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
     // Additional workspace for the "prefill with ragged kv" kernel.
     if (NeedKernelBeginForward()) {
       temp_int_attn_workspace_.push_back(
-          Tensor::Empty({kIntAttnWorkspaceByte}, DataType::UInt(8), device));
+          Tensor::Empty({kIntAttnWorkspaceByte}, DLDataType{kDLUInt, 8, 1}, device));
       temp_int_pinned_attn_workspace_.push_back(Tensor::Empty(
-          {kIntAttnWorkspaceByte}, DataType::UInt(8), GetPreferredHostDevice(device)));
+          {kIntAttnWorkspaceByte}, DLDataType{kDLUInt, 8, 1}, GetPreferredHostDevice(device)));
       temp_float_attn_workspace_ =
-          Tensor::Empty({kFloatAttnWorkspaceByte}, DataType::UInt(8), device);
+          Tensor::Empty({kFloatAttnWorkspaceByte}, DLDataType{kDLUInt, 8, 1}, device);
     }
 
     if (std::find(attn_kinds_.begin(), attn_kinds_.end(), AttnKind::kMHA) != attn_kinds_.end()) {
@@ -488,9 +488,9 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
     temp_attn_output_device_ =
         Tensor::Empty({prefill_chunk_size_, num_qo_heads, v_head_dim}, dtype, device);
     temp_attn_lse_device_ =
-        Tensor::Empty({prefill_chunk_size_, num_qo_heads}, DataType::Float(32), device);
+        Tensor::Empty({prefill_chunk_size_, num_qo_heads}, DLDataType{kDLFloat, 32, 1}, device);
     merged_attn_lse_device_ =
-        Tensor::Empty({prefill_chunk_size_, num_qo_heads}, DataType::Float(32), device);
+        Tensor::Empty({prefill_chunk_size_, num_qo_heads}, DLDataType{kDLFloat, 32, 1}, device);
     for (int64_t page_id = num_total_pages - 1; page_id >= 0; --page_id) {
       free_page_ids_.push_back(page_id);
     }
diff --git a/src/runtime/vm/rnn_state.cc b/src/runtime/vm/rnn_state.cc
index 9926b3d235e8..a38acf6e1cdf 100644
--- a/src/runtime/vm/rnn_state.cc
+++ b/src/runtime/vm/rnn_state.cc
@@ -83,7 +83,7 @@ class RNNStateImpObj : public RNNStateObj {
   const ffi::Array<Tensor> init_layer_value_;
 
   /*! \brief We fix int32 to be the index dtype of auxiliary data. */
-  const DLDataType dtype_aux_ = DLDataType(DataType::Int(32, 1));
+  const DLDataType dtype_aux_ = DLDataType{kDLInt, 32, 1};
 
   /******************* Storage Structures *******************/
 
diff --git a/src/runtime/vm/tensor_cache_support.cc b/src/runtime/vm/tensor_cache_support.cc
index ee77c5ddd8f0..62fd1a34c62f 100644
--- a/src/runtime/vm/tensor_cache_support.cc
+++ b/src/runtime/vm/tensor_cache_support.cc
@@ -64,7 +64,7 @@ TensorCacheMetadata::FileRecord::ParamRecord JSONAsParamRecord(const json::Objec
   TensorCacheMetadata::FileRecord::ParamRecord result;
   std::string dtype = json["dtype"].cast<ffi::String>();
   result.name = json["name"].cast<ffi::String>();
-  result.dtype = DataType(ffi::StringToDLDataType(dtype));
+  result.dtype = ffi::StringToDLDataType(dtype);
   result.format = json["format"].cast<ffi::String>();
   result.nbytes = json["nbytes"].cast<int64_t>();
   result.byte_offset = json["byteOffset"].cast<int64_t>();
@@ -154,7 +154,7 @@ void CopyTensorFromBytes(Tensor param, const void* data, size_t nbytes,
 Tensor TensorCacheMetadata::FileRecord::ParamRecord::Load(
     Device device, const std::string* raw_data, ffi::Optional<Tensor>* staging_buffer) const {
   Tensor arr = Tensor::Empty(shape, dtype, device);
-  if (dtype == DataType::Float(32) && format == "f32-to-bf16") {
+  if (dtype == DLDataType{kDLFloat, 32, 1} && format == "f32-to-bf16") {
     // decode bf16 to f32
     std::vector<uint16_t> buffer(nbytes / 2);
     std::vector<uint32_t> decoded(nbytes / 2);
diff --git a/src/s_tir/analysis/calculate_allocated_memory.cc b/src/s_tir/analysis/calculate_allocated_memory.cc
index 51330a63e88b..41df4ee4bb8a 100644
--- a/src/s_tir/analysis/calculate_allocated_memory.cc
+++ b/src/s_tir/analysis/calculate_allocated_memory.cc
@@ -76,7 +76,7 @@ class AllocBufferCalculator : public StmtExprVisitor {
         break;
       }
     }
-    size *= op->buffer->dtype.bytes() * op->buffer->dtype.lanes();
+    size *= ((op->buffer->dtype.bits() + 7) / 8) * op->buffer->dtype.lanes();
     _current_size[storage_scope] += size;
     _max_size[storage_scope] = std::max(_current_size[storage_scope], _max_size[storage_scope]);
     StmtExprVisitor::VisitStmt_(op);
diff --git a/src/s_tir/analysis/estimate_flops.cc b/src/s_tir/analysis/estimate_flops.cc
index d77e715db1b6..bcde2d4b70bd 100644
--- a/src/s_tir/analysis/estimate_flops.cc
+++ b/src/s_tir/analysis/estimate_flops.cc
@@ -26,15 +26,13 @@ namespace tvm {
 namespace s_tir {
 using namespace tvm::tirx;
 
-int32_t DataType2Int(const tvm::DataType& dtype) {
+int32_t DataType2Int(DLDataType dtype) {
   static_assert(sizeof(DLDataType) == sizeof(int32_t), "Incorrect size of DLDataType");
   union {
     DLDataType src;
     int32_t dst;
   } converter;
-  converter.src.code = dtype.code();
-  converter.src.bits = dtype.bits();
-  converter.src.lanes = dtype.lanes();
+  converter.src = dtype;
   return converter.dst;
 }
 
@@ -57,7 +55,7 @@ ffi::String Int2DataTypeStr(int32_t dtype) {
 struct TResult {
   TResult() = default;
 
-  void Add(const tvm::DataType& dtype) { data_[DataType2Int(dtype)] += 1; }
+  void Add(DLDataType dtype) { data_[DataType2Int(dtype)] += 1; }
 
   TResult operator+=(const TResult& rhs) {
     for (const auto& kv : rhs.data_) {
@@ -98,7 +96,7 @@ class FlopEstimator : private ExprFunctor<TResult(const PrimExpr& n)>,
   TResult VisitExpr_(const Node* op) final {     \
     TResult result = VisitExpr(op->a);           \
     result += VisitExpr(op->b);                  \
-    result.Add(op->dtype);                       \
+    result.Add(op->ty()->dtype);                 \
     return result;                               \
   }
   TVM_TIR_ESTIMATE_FLOP_VISIT_BINARY(AddNode);
diff --git a/src/s_tir/analysis/sblock_access_region_detector.cc b/src/s_tir/analysis/sblock_access_region_detector.cc
index 18eef8e2fe01..9fa0a7b0b325 100644
--- a/src/s_tir/analysis/sblock_access_region_detector.cc
+++ b/src/s_tir/analysis/sblock_access_region_detector.cc
@@ -348,7 +348,7 @@ ffi::Array<BufferRegion> BlockReadWriteDetector::CollectRegions(
       const tvm::arith::IntSet& range = regions[i][j];
       if (range.CanProveSinglePoint(ana_)) {
         PrimExpr min = range.min();
-        region.push_back(Range::FromMinExtent(min, MakeConst(min.dtype(), 1)));
+        region.push_back(Range::FromMinExtent(min, MakeConst(min.ty(), 1)));
       } else {
         region.push_back(range.CoverRange(Range::FromMinExtent(0, buffers[i]->shape[j])));
       }
diff --git a/src/s_tir/analysis/verify_gpu_code.cc b/src/s_tir/analysis/verify_gpu_code.cc
index bd7b7c92ba7c..837485d32de1 100644
--- a/src/s_tir/analysis/verify_gpu_code.cc
+++ b/src/s_tir/analysis/verify_gpu_code.cc
@@ -76,20 +76,22 @@ class GPUCodeVerifier : public StmtExprVisitor {
         break;
       }
     }
+    DLDataType dtype = op->buffer->dtype->dtype;
     if (storage_scope.rank == runtime::StorageRank::kLocal) {
-      local_memory_per_block_ +=
-          static_cast<size_t>(const_size) * op->buffer->dtype.bytes() * op->buffer->dtype.lanes();
+      local_memory_per_block_ += static_cast<size_t>(const_size) * (((dtype).bits + 7) / 8) *
+                                 static_cast<int16_t>((dtype).lanes);
     } else if (storage_scope.rank == runtime::StorageRank::kShared) {
-      shared_memory_per_block_ +=
-          static_cast<size_t>(const_size) * op->buffer->dtype.bytes() * op->buffer->dtype.lanes();
+      shared_memory_per_block_ += static_cast<size_t>(const_size) * (((dtype).bits + 7) / 8) *
+                                  static_cast<int16_t>((dtype).lanes);
     }
-    if (op->buffer->dtype.is_vector()) {
-      if (static_cast<size_t>(op->buffer->dtype.lanes() * op->buffer->dtype.bytes()) >
+    if ((static_cast<int16_t>((dtype).lanes) > 1)) {
+      if (static_cast<size_t>(static_cast<int16_t>((dtype).lanes) * (((dtype).bits + 7) / 8)) >
           max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << op->buffer->dtype.lanes() << ") times number of bytes ("
-          << op->buffer->dtype.bytes() << ") for dtype " << op->buffer->dtype
-          << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
+        s << "Number of lanes (" << static_cast<int16_t>((dtype).lanes)
+          << ") times number of bytes (" << (((dtype).bits + 7) / 8) << ") for dtype "
+          << op->buffer->dtype << " is greater than the maximum number of vector bytes ("
+          << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
     }
@@ -202,12 +204,16 @@ class GPUCodeVerifier : public StmtExprVisitor {
   void CheckBufferIndicesVectorizable(const ffi::Array<PrimExpr> indices) {
     for (const auto index : indices) {
       if (const auto* ramp = index.as<RampNode>()) {
-        if (!is_one(ramp->stride) &&
-            static_cast<size_t>(ramp->dtype.lanes() * ramp->dtype.bytes()) > max_vector_bytes_) {
+        PrimType ramp_ty = ramp->ty();
+        DLDataType ramp_dtype = ramp_ty->dtype;
+        if (!is_one(ramp->stride) && ramp_ty.IsFixedLengthVector() &&
+            static_cast<size_t>(static_cast<int16_t>((ramp_dtype).lanes) *
+                                (((ramp_dtype).bits + 7) / 8)) > max_vector_bytes_) {
           std::stringstream s;
-          s << "Number of lanes (" << ramp->dtype.lanes() << ") times number of bytes ("
-            << ramp->dtype.bytes() << ") for dtype " << ramp->dtype
-            << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
+          s << "Number of lanes (" << static_cast<int16_t>((ramp_dtype).lanes)
+            << ") times number of bytes (" << (((ramp_dtype).bits + 7) / 8) << ") for dtype "
+            << ramp_dtype << " is greater than the maximum number of vector bytes ("
+            << max_vector_bytes_ << ")";
           errors_.push_back(s.str());
         }
       }
@@ -215,12 +221,16 @@ class GPUCodeVerifier : public StmtExprVisitor {
   }
 
   void VisitExpr_(const CastNode* op) {
-    if (op->dtype.is_vector()) {
-      if (static_cast<size_t>(op->dtype.lanes() * op->dtype.bytes()) > max_vector_bytes_) {
+    PrimType op_ty = op->ty();
+    DLDataType op_dtype = op_ty->dtype;
+    if (op_ty.IsFixedLengthVector()) {
+      if (static_cast<size_t>(static_cast<int16_t>((op_dtype).lanes) *
+                              (((op_dtype).bits + 7) / 8)) > max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << op->dtype.lanes() << ") times number of bytes ("
-          << op->dtype.bytes() << ") for dtype " << op->dtype
-          << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
+        s << "Number of lanes (" << static_cast<int16_t>((op_dtype).lanes)
+          << ") times number of bytes (" << (((op_dtype).bits + 7) / 8) << ") for dtype "
+          << op_dtype << " is greater than the maximum number of vector bytes ("
+          << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
     }
@@ -228,12 +238,16 @@ class GPUCodeVerifier : public StmtExprVisitor {
   }
 
   void VisitExpr_(const BufferLoadNode* op) {
-    if (op->dtype.is_vector()) {
-      if (static_cast<size_t>(op->dtype.lanes() * op->dtype.bytes()) > max_vector_bytes_) {
+    PrimType op_ty = op->ty();
+    DLDataType op_dtype = op_ty->dtype;
+    if (op_ty.IsFixedLengthVector()) {
+      if (static_cast<size_t>(static_cast<int16_t>((op_dtype).lanes) *
+                              (((op_dtype).bits + 7) / 8)) > max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << op->dtype.lanes() << ") times number of bytes ("
-          << op->dtype.bytes() << ") for dtype " << op->dtype
-          << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
+        s << "Number of lanes (" << static_cast<int16_t>((op_dtype).lanes)
+          << ") times number of bytes (" << (((op_dtype).bits + 7) / 8) << ") for dtype "
+          << op_dtype << " is greater than the maximum number of vector bytes ("
+          << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
       CheckBufferIndicesVectorizable(op->indices);
@@ -242,13 +256,16 @@ class GPUCodeVerifier : public StmtExprVisitor {
   }
 
   void VisitStmt_(const BufferStoreNode* op) {
-    if (op->value->dtype.is_vector()) {
-      if (static_cast<size_t>(op->value->dtype.lanes() * op->value->dtype.bytes()) >
-          max_vector_bytes_) {
+    PrimType value_ty = op->value.ty();
+    DLDataType value_dtype = value_ty->dtype;
+    if (value_ty.IsFixedLengthVector()) {
+      if (static_cast<size_t>(static_cast<int16_t>((value_dtype).lanes) *
+                              (((value_dtype).bits + 7) / 8)) > max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << op->value->dtype.lanes() << ") times number of bytes ("
-          << op->value->dtype.bytes() << ") for dtype " << op->value->dtype
-          << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
+        s << "Number of lanes (" << static_cast<int16_t>((value_dtype).lanes)
+          << ") times number of bytes (" << (((value_dtype).bits + 7) / 8) << ") for dtype "
+          << value_dtype << " is greater than the maximum number of vector bytes ("
+          << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
       CheckBufferIndicesVectorizable(op->indices);
diff --git a/src/s_tir/backend/adreno/inject_texture_alloc.cc b/src/s_tir/backend/adreno/inject_texture_alloc.cc
index e4e7c322ef55..5b6aeda19362 100644
--- a/src/s_tir/backend/adreno/inject_texture_alloc.cc
+++ b/src/s_tir/backend/adreno/inject_texture_alloc.cc
@@ -79,11 +79,11 @@ class TextureAllocInjector : public arith::IRMutatorWithAnalyzer {
       ffi::Array<PrimExpr> args;
       args.push_back(StringImm(storage_scope));
       args.push_back(IntImm::Int64(3));
-      args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(),
+      args.push_back(Call(PrimType::Handle(), builtin::tvm_stack_make_shape(),
                           {texture.width, texture.height, texture.depth}));
       args.push_back(IntImm::Int64(channel_size));
       stmt = Bind(op->buffer->data,
-                  Call(op->buffer->data.dtype(), builtin::nd_mem_alloc_with_scope(), args));
+                  Call(op->buffer->data.ty(), builtin::nd_mem_alloc_with_scope(), args));
     }
     return stmt;
   }
diff --git a/src/s_tir/backend/adreno/texture_flatten.cc b/src/s_tir/backend/adreno/texture_flatten.cc
index 0dd939ad817a..d4297e42e4d2 100644
--- a/src/s_tir/backend/adreno/texture_flatten.cc
+++ b/src/s_tir/backend/adreno/texture_flatten.cc
@@ -100,7 +100,7 @@ class TextureFlattener : public TextureLoweringBase {
     if (IsTextureStorage(storage_scope)) {
       ffi::Array<PrimExpr> args = GetTextureAccessArgs(op, op->buffer);
       args.push_back(op->value);
-      stmt = Evaluate(Call(args[0]->dtype, builtin::texture2d_store(), args));
+      stmt = Evaluate(Call(args[0].ty(), builtin::texture2d_store(), args));
     }
 
     return stmt;
@@ -147,7 +147,7 @@ class TextureFlattener : public TextureLoweringBase {
     PrimExpr col_offset = SimplifyOffset(col_dims, col_indices);
     PrimExpr depth_offset = SimplifyOffset(depth_dims, depth_indices);
     PrimExpr channel_size = IntImm(
-        DataType::Int(32, 1), *tirx::as_const_int(buffer->shape.back()) * buffer->dtype.bits());
+        PrimType::Int(32, 1), *tirx::as_const_int(buffer->shape.back()) * buffer->dtype.bits());
     args.push_back(row_offset);
     args.push_back(col_offset);
     args.push_back(depth_offset);
diff --git a/src/s_tir/data_layout.cc b/src/s_tir/data_layout.cc
index 787386c8ccb9..6fa2db0206e4 100644
--- a/src/s_tir/data_layout.cc
+++ b/src/s_tir/data_layout.cc
@@ -22,10 +22,10 @@
  * \brief Data SLayout expression.
  */
 #include <tvm/arith/analyzer.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/expr.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/logging.h>
 #include <tvm/s_tir/data_layout.h>
 #include <tvm/tirx/analysis.h>
@@ -113,8 +113,9 @@ SLayout::SLayout(const ffi::Array<IterVar>& axes) {
   data_ = std::move(node);
 }
 
-SLayout::SLayout(const std::string& name, DataType dtype) {  // NOLINT(*)
-  TVM_FFI_CHECK(dtype.is_int(), TypeError) << "The input dtype should be integer type";
+SLayout::SLayout(const std::string& name, PrimType index_ty) {  // NOLINT(*)
+  TVM_FFI_CHECK(index_ty.code() == DLDataTypeCode::kDLInt, TypeError)
+      << "The input dtype should be integer type";
   if (name == "__undef__") return;
 
   auto node = ffi::make_object<SLayoutNode>();
@@ -131,8 +132,8 @@ SLayout::SLayout(const std::string& name, DataType dtype) {  // NOLINT(*)
     if (c >= 'A' && c <= 'Z') {
       TVM_FFI_ICHECK_EQ(factor, 0) << "Invalid layout " << name << ": invalid factor size "
                                    << factor << " before dimension " << c;
-      IterVar axis(Range(IntImm(dtype, 0), Var(std::string(1, c), dtype)),
-                   Var(std::string(1, c), dtype), tirx::kDataPar);
+      IterVar axis(Range(IntImm(index_ty, 0), Var(std::string(1, c), index_ty)),
+                   Var(std::string(1, c), index_ty), tirx::kDataPar);
       if (!in_packing) {
         node->axes.push_back(axis);
       } else {
@@ -143,7 +144,7 @@ SLayout::SLayout(const std::string& name, DataType dtype) {  // NOLINT(*)
                                    << factor << " for dimension " << c;
       std::stringstream name;
       name << factor << c;
-      IterVar axis(Range(IntImm(dtype, 0), IntImm(dtype, factor)), Var(name.str(), dtype),
+      IterVar axis(Range(IntImm(index_ty, 0), IntImm(index_ty, factor)), Var(name.str(), index_ty),
                    tirx::kDataPar);
       if (!in_packing) {
         node->axes.push_back(axis);
@@ -174,8 +175,8 @@ SLayout::SLayout(const std::string& name, DataType dtype) {  // NOLINT(*)
         extent = extent * factor->value;
       }
       std::string grouped_name = ss.str();
-      IterVar grouped_axis(Range(IntImm(dtype, 0), IntImm(dtype, extent)), Var(grouped_name, dtype),
-                           tirx::kDataPar);
+      IterVar grouped_axis(Range(IntImm(index_ty, 0), IntImm(index_ty, extent)),
+                           Var(grouped_name, index_ty), tirx::kDataPar);
       node->axes.push_back(grouped_axis);
 
       in_packing = false;
@@ -231,21 +232,21 @@ ffi::Array<IterVar> SLayout::UnpackIterVar(IterVar packed_iter) {
   int64_t factor = 0, final_factor = 1;
 
   std::string name(packed_iter->var->name_hint.c_str());
-  DataType dtype = packed_iter->var.dtype();
+  PrimType index_ty = packed_iter->var.ty();
 
   for (auto ch : name) {
     if (ch >= '0' && ch <= '9') {
       factor = factor * 10 + (ch - '0');
     } else if (ch >= 'a' && ch <= 'z') {
       TVM_FFI_ICHECK(factor != 0) << "Invalid Factor Size";
-      result.push_back(IterVar(Range(IntImm(dtype, 0), IntImm(dtype, factor)),
-                               Var(std::string(1, ch), dtype), tirx::kDataPar));
+      result.push_back(IterVar(Range(IntImm(index_ty, 0), IntImm(index_ty, factor)),
+                               Var(std::string(1, ch), index_ty), tirx::kDataPar));
       final_factor *= factor;
       factor = 0;
     } else if (ch >= 'A' && ch <= 'Z') {
       TVM_FFI_ICHECK(factor == 0) << "Can't have non-zero factors for primal axis";
-      result.push_back(IterVar(Range(IntImm(dtype, 0), Var(std::string(1, ch), dtype)),
-                               Var(std::string(1, ch), dtype), tirx::kDataPar));
+      result.push_back(IterVar(Range(IntImm(index_ty, 0), Var(std::string(1, ch), index_ty)),
+                               Var(std::string(1, ch), index_ty), tirx::kDataPar));
     }
   }
 
@@ -256,7 +257,7 @@ IterVar SLayout::PackIterVar(ffi::Array<IterVar> iter_vars) {
   std::stringstream name;
   size_t extent = 1;
 
-  DataType dtype = iter_vars[0]->dom->extent.as<PrimExpr>().value()->dtype;
+  PrimType index_ty = iter_vars[0]->dom->extent.as<PrimExpr>().value().ty();
   for (auto itvar : iter_vars) {
     TVM_FFI_ICHECK(itvar->dom->extent.as<IntImm>())
         << "Packed Axis can contain only Subordinate Axes";
@@ -264,7 +265,7 @@ IterVar SLayout::PackIterVar(ffi::Array<IterVar> iter_vars) {
     extent = extent * itvar->dom->extent.as<IntImm>().value()->value;
   }
 
-  return IterVar(Range(IntImm(dtype, 0), IntImm(dtype, extent)), Var(name.str(), dtype),
+  return IterVar(Range(IntImm(index_ty, 0), IntImm(index_ty, extent)), Var(name.str(), index_ty),
                  tirx::kDataPar);
 }
 
@@ -357,7 +358,8 @@ inline bool GetStoreRule(ffi::Array<PrimExpr>* index_rule, ffi::Array<PrimExpr>*
             if (axis == sub_axis) {
               const auto* sub_extent = inter_unpacked_axes[l]->dom->extent.as<IntImmNode>();
               TVM_FFI_ICHECK(sub_extent) << "Expected Integer Extents for Offset Calculation";
-              factor_ij = factor_ij * IntImm(sub_extent->dtype, sub_extent->value);
+              factor_ij =
+                  factor_ij * IntImm(ffi::GetRef<PrimExpr>(sub_extent).ty(), sub_extent->value);
             }
           }
         }
@@ -498,11 +500,11 @@ inline ffi::Array<PrimExpr> TransformShape(const ffi::Array<PrimExpr>& src_shape
               << ", get " << orig_shape;
         }
       }
-      bind_map[orig_axis->var.get()] = IntImm(orig_axis->var->dtype, 0);
+      bind_map[orig_axis->var.get()] = IntImm(orig_axis->var.ty(), 0);
     } else {
-      bind_map[orig_axis->var.get()] = orig_axis->var->dtype == orig_shape->dtype
+      bind_map[orig_axis->var.get()] = orig_axis->var.ty()->dtype == orig_shape.ty()->dtype
                                            ? orig_shape
-                                           : cast(orig_axis->var->dtype, orig_shape);
+                                           : cast(orig_axis->var.ty(), orig_shape);
     }
   }
   // infer the target shape,
@@ -583,7 +585,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef()
-      .def("s_tir.SLayout", [](std::string name, DataType dtype) { return SLayout(name, dtype); })
+      .def("s_tir.SLayout", [](std::string name, PrimType dtype) { return SLayout(name, dtype); })
       .def("s_tir.SLayoutIndexOf",
            [](SLayout layout, std::string axis) -> int { return layout.IndexOf(axis); })
       .def("s_tir.SLayoutFactorOf",
diff --git a/src/s_tir/meta_schedule/arg_info.cc b/src/s_tir/meta_schedule/arg_info.cc
index dc452b370037..73fa41773883 100644
--- a/src/s_tir/meta_schedule/arg_info.cc
+++ b/src/s_tir/meta_schedule/arg_info.cc
@@ -98,7 +98,7 @@ ffi::Array<ArgInfo> ArgInfo::FromPrimFunc(const tirx::PrimFunc& func) {
   for (const tirx::Var& arg : func->params) {
     if (ffi::Optional<tirx::Buffer> _buffer = func->buffer_map.Get(arg)) {
       tirx::Buffer buffer = _buffer.value();
-      result.push_back(TensorInfo(/*dtype=*/buffer->dtype,
+      result.push_back(TensorInfo(/*dtype=*/buffer->dtype->dtype,
                                   /*shape=*/AsVector<PrimExpr, int64_t>(buffer->shape)));
     } else {
       TVM_FFI_THROW(ValueError) << "Unsupported argument type: " << arg;
@@ -117,7 +117,7 @@ ffi::Array<ArgInfo> ArgInfo::FromEntryFunc(const IRModule& mod, bool remove_prep
 
 /******** TensorInfo ********/
 
-TensorInfo::TensorInfo(runtime::DataType dtype, ffi::Shape shape) {
+TensorInfo::TensorInfo(DLDataType dtype, ffi::Shape shape) {
   ffi::ObjectPtr<TensorInfoNode> n = ffi::make_object<TensorInfoNode>();
   n->dtype = dtype;
   n->shape = shape;
@@ -150,7 +150,7 @@ TensorInfo TensorInfo::FromJSON(const ffi::ObjectRef& json_obj) {
   }
   std::vector<int64_t> s;
   std::transform(shape.begin(), shape.end(), std::back_inserter(s), [](int64_t i) { return i; });
-  return TensorInfo(DataType(dtype), ffi::Shape(s.begin(), s.end()));
+  return TensorInfo(dtype, ffi::Shape(s.begin(), s.end()));
 }
 
 /******** Repr ********/
@@ -182,10 +182,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
       .def("s_tir.meta_schedule.ArgInfoFromPrimFunc", ArgInfo::FromPrimFunc)
       .def("s_tir.meta_schedule.ArgInfoFromEntryFunc", ArgInfo::FromEntryFunc)
       .def("s_tir.meta_schedule.ArgInfoFromJSON", ArgInfo::FromJSON)
-      .def("s_tir.meta_schedule.TensorInfo",
-           [](runtime::DataType dtype, ffi::Shape shape) -> TensorInfo {
-             return TensorInfo(dtype, shape);
-           });
+      .def("s_tir.meta_schedule.TensorInfo", [](DLDataType dtype, ffi::Shape shape) -> TensorInfo {
+        return TensorInfo(dtype, shape);
+      });
 }
 
 }  // namespace meta_schedule
diff --git a/src/s_tir/meta_schedule/database/database_utils.cc b/src/s_tir/meta_schedule/database/database_utils.cc
index ea1473ae6500..826c38c8d1b0 100644
--- a/src/s_tir/meta_schedule/database/database_utils.cc
+++ b/src/s_tir/meta_schedule/database/database_utils.cc
@@ -32,7 +32,9 @@ void JSONDumps(Any json_obj, std::ostringstream& os) {
     os << "null";
   } else if (auto opt_int_imm = json_obj.try_cast<IntImm>()) {
     IntImm int_imm = *std::move(opt_int_imm);
-    if (int_imm->dtype == DataType::Bool()) {
+    PrimType int_ty = int_imm.ty();
+    if (int_ty.MatchesElementType(DLDataTypeCode::kDLBool, 8) && !int_ty.IsScalableVector() &&
+        !int_ty.IsFixedLengthVector()) {
       if (int_imm->value) {
         os << "true";
       } else {
@@ -154,7 +156,6 @@ class JSONTokenizer {
   bool NextFalse() { return NextLiteral("false", 5); }
 
   bool NextNumber(Token* token) {
-    using runtime::DataType;
     bool is_float = false;
     const char* st = cur_;
     for (; cur_ != end_; ++cur_) {
diff --git a/src/s_tir/meta_schedule/feature_extractor/per_store_feature.cc b/src/s_tir/meta_schedule/feature_extractor/per_store_feature.cc
index f0e3aa897cdd..2f87217db065 100644
--- a/src/s_tir/meta_schedule/feature_extractor/per_store_feature.cc
+++ b/src/s_tir/meta_schedule/feature_extractor/per_store_feature.cc
@@ -273,12 +273,12 @@ Pass SimplifyForFeatureExtraction() {
           HasBufferLoad(node->condition)) {
         return ffi::GetRef<Select>(node);
       }
-      return MakeConst(node->dtype, 1.0);
+      return MakeConst(node->ty(), 1.0);
     }
 
     PrimExpr VisitExpr_(const VarNode* var) final {
       if (unit_vars_.count(ffi::GetRef<Var>(var))) {
-        return MakeConst(var->dtype, 0.0);
+        return MakeConst(var->ty(), 0.0);
       }
       return ffi::GetRef<Var>(var);
     }
@@ -553,7 +553,7 @@ Feature::ArithOps::ArithOps(const BufferStoreNode* store, int64_t prod_loop_exte
   }
 #define TVM_FEATURE_BINARY(Type, FloatCounter, IntCounter) \
   void VisitExpr_(const Type* op) final {                  \
-    if (op->dtype.is_float()) {                            \
+    if (op->ty().code() == DLDataTypeCode::kDLFloat) {     \
       result_.FloatCounter += this->prod_loop_extent_;     \
     } else {                                               \
       result_.IntCounter += this->prod_loop_extent_;       \
@@ -589,13 +589,13 @@ Feature::ArithOps::ArithOps(const BufferStoreNode* store, int64_t prod_loop_exte
       bool is_pure =
           effect_kind == CallEffectKind::kPure || effect_kind == CallEffectKind::kExprAnnotation;
       if (is_pure) {
-        if (op->dtype.is_float()) {
+        if (op->ty().code() == DLDataTypeCode::kDLFloat) {
           result_.float_math_func += prod_loop_extent_;
         } else {
           result_.int_math_func += prod_loop_extent_;
         }
       } else {
-        if (op->dtype.is_float()) {
+        if (op->ty().code() == DLDataTypeCode::kDLFloat) {
           result_.float_other_func += prod_loop_extent_;
         } else {
           result_.int_other_func += prod_loop_extent_;
@@ -852,7 +852,7 @@ void Feature::SetRegion(const LoopNest& loop_nest, IntVec* for_touched_bytes,
       feature.access_shape = utils::RelaxAndUnion(feature.multi_indices, &numel, analyzer);
       numel = std::max<int64_t>(0, numel);
       feature.loop_accessed_numel[i][buffer] = numel;
-      touched_bytes += numel * buffer->dtype.bytes();
+      touched_bytes += numel * ((buffer->dtype.bits() + 7) / 8);
       (*buffer_touched_under_loop)[loop][buffer].push_back(numel);
     }
   }
@@ -880,7 +880,7 @@ void Feature::SubFeature::SetStride(const LoopNest& loop_nest, arith::AnalyzerOb
     TVM_FFI_ICHECK_EQ(access_shape.size(), buffer_shape.size());
     for (int i = ndim - 1; i >= 0; --i) {
       if (access_shape[i] == buffer_shape[i]) {
-        num_continuous_bytes = buffer_shape[i] * buffer->dtype.bytes();
+        num_continuous_bytes = buffer_shape[i] * ((buffer->dtype.bits() + 7) / 8);
         break;
       }
     }
@@ -953,7 +953,7 @@ void Feature::SubFeature::SetReuse(const LoopNest& loop_nest, int64_t top_loop_t
           const BufferNode* buffer = iter.first;
           const IntVec& numels = iter.second;
           int64_t numel = std::accumulate(numels.begin(), numels.end(), int64_t(0));
-          reuse_dis_bytes += numel * buffer->dtype.bytes();
+          reuse_dis_bytes += numel * ((buffer->dtype.bits() + 7) / 8);
         }
       }
       break;
@@ -973,7 +973,7 @@ void Feature::SubFeature::SetReuse(const LoopNest& loop_nest, int64_t top_loop_t
         const BufferNode* buffer = iter.first;
         const IntVec& numels = iter.second;
         int64_t numel = std::accumulate(numels.begin(), numels.end(), int64_t(0));
-        reuse_dis_bytes += numel * buffer->dtype.bytes();
+        reuse_dis_bytes += numel * ((buffer->dtype.bits() + 7) / 8);
       }
       reuse_dis_iter /= extent;
       reuse_dis_bytes /= extent;
@@ -983,7 +983,7 @@ void Feature::SubFeature::SetReuse(const LoopNest& loop_nest, int64_t top_loop_t
 }
 
 void Feature::SubFeature::SetFeature(const LoopNest& loop_nest, int64_t cache_line_bytes) {
-  int64_t dtype_bytes = this->buffer->dtype.bytes();
+  int64_t dtype_bytes = ((this->buffer->dtype.bits() + 7) / 8);
   this->stride = this->innermost_stride;
   this->bytes = dtype_bytes * loop_nest.prod;
   if (loop_nest.loops.empty()) {
@@ -1023,7 +1023,7 @@ Feature::Feature(const BufferStoreNode* store, const LoopNest& loop_nest, int64_
   int64_t top_loop_touch_bytes = 0.0;
   if (n_loops > 0) {
     for (const SubFeature& feature : sub_features) {
-      int64_t bytes = feature.buffer->dtype.bytes();
+      int64_t bytes = ((feature.buffer->dtype.bits() + 7) / 8);
       int64_t n_buffer = feature.loop_accessed_numel[0].size();
       top_loop_touch_bytes += bytes * n_buffer;
     }
@@ -1161,7 +1161,7 @@ struct Feature {
     for (int64_t x : shape) {
       numel *= x;
     }
-    alloc_size = numel * buffer->dtype.bytes();
+    alloc_size = numel * ((buffer->dtype.bits() + 7) / 8);
     alloc_prod = numel * loop_nest.prod;
     alloc_outer_prod = loop_nest.prod;
   }
diff --git a/src/s_tir/meta_schedule/measure_callback/add_to_database.cc b/src/s_tir/meta_schedule/measure_callback/add_to_database.cc
index 0c74e66d2af3..57008c7d953a 100644
--- a/src/s_tir/meta_schedule/measure_callback/add_to_database.cc
+++ b/src/s_tir/meta_schedule/measure_callback/add_to_database.cc
@@ -47,7 +47,7 @@ class AddToDatabaseNode : public MeasureCallbackNode {
       if (result->run_secs.defined()) {
         run_secs = result->run_secs.value();
       } else {
-        run_secs = ffi::Array<FloatImm>{FloatImm(DataType::Float(32), 1e10)};
+        run_secs = ffi::Array<FloatImm>{FloatImm(PrimType::Float(32), 1e10)};
       }
       database->CommitTuningRecord(TuningRecord(
           /*trace=*/candidate->sch->trace().value(),
diff --git a/src/s_tir/meta_schedule/mutator/mutator.cc b/src/s_tir/meta_schedule/mutator/mutator.cc
index d4060f5bf6b6..33d2b41b4aa7 100644
--- a/src/s_tir/meta_schedule/mutator/mutator.cc
+++ b/src/s_tir/meta_schedule/mutator/mutator.cc
@@ -54,27 +54,27 @@ Mutator Mutator::PyMutator(
 
 ffi::Map<Mutator, FloatImm> Mutator::DefaultLLVM() {
   return ffi::Map<Mutator, FloatImm>{
-      {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)},
-      {Mutator::MutateComputeLocation(), FloatImm(DataType::Float(64), 0.05)},
-      {Mutator::MutateUnroll(), FloatImm(DataType::Float(64), 0.03)},
-      {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(DataType::Float(64), 0.02)}};
+      {Mutator::MutateTileSize(), FloatImm(PrimType::Float(64), 0.9)},
+      {Mutator::MutateComputeLocation(), FloatImm(PrimType::Float(64), 0.05)},
+      {Mutator::MutateUnroll(), FloatImm(PrimType::Float(64), 0.03)},
+      {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(PrimType::Float(64), 0.02)}};
 }
 
 ffi::Map<Mutator, FloatImm> Mutator::DefaultCUDA() {
   return ffi::Map<Mutator, FloatImm>{
-      {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)},
-      {Mutator::MutateUnroll(), FloatImm(DataType::Float(64), 0.08)},
-      {Mutator::MutateThreadBinding(), FloatImm(DataType::Float(64), 0.02)}};
+      {Mutator::MutateTileSize(), FloatImm(PrimType::Float(64), 0.9)},
+      {Mutator::MutateUnroll(), FloatImm(PrimType::Float(64), 0.08)},
+      {Mutator::MutateThreadBinding(), FloatImm(PrimType::Float(64), 0.02)}};
 }
 
 ffi::Map<Mutator, FloatImm> Mutator::DefaultCUDATensorCore() { return Mutator::DefaultCUDA(); }
 
 ffi::Map<Mutator, FloatImm> Mutator::DefaultHexagon() {
   return ffi::Map<Mutator, FloatImm>{
-      {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)},
-      {Mutator::MutateComputeLocation(), FloatImm(DataType::Float(64), 0.05)},
-      {Mutator::MutateUnroll(), FloatImm(DataType::Float(64), 0.03)},
-      {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(DataType::Float(64), 0.02)}};
+      {Mutator::MutateTileSize(), FloatImm(PrimType::Float(64), 0.9)},
+      {Mutator::MutateComputeLocation(), FloatImm(PrimType::Float(64), 0.05)},
+      {Mutator::MutateUnroll(), FloatImm(PrimType::Float(64), 0.03)},
+      {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(PrimType::Float(64), 0.02)}};
 }
 
 // Pattern A (RM): auto-default repr from reflection.
diff --git a/src/s_tir/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/s_tir/meta_schedule/postproc/rewrite_cooperative_fetch.cc
index a2b57e3f5c5f..9221e8c3bfdd 100644
--- a/src/s_tir/meta_schedule/postproc/rewrite_cooperative_fetch.cc
+++ b/src/s_tir/meta_schedule/postproc/rewrite_cooperative_fetch.cc
@@ -93,9 +93,9 @@ size_t GetMaxUsedDtypeBytes(SBlock block) {
 
   tirx::PostOrderVisit(block->body, [&](const ffi::ObjectRef& obj) {
     if (const auto* store = obj.as<tirx::BufferStoreNode>()) {
-      max_bytes = std::max(max_bytes, static_cast<size_t>(store->value->dtype.bytes()));
+      max_bytes = std::max(max_bytes, static_cast<size_t>((store->value.ty().bits() + 7) / 8));
     } else if (const auto* load = obj.as<tirx::BufferLoadNode>()) {
-      max_bytes = std::max(max_bytes, static_cast<size_t>(load->dtype.bytes()));
+      max_bytes = std::max(max_bytes, static_cast<size_t>((load->ty().bits() + 7) / 8));
     } else if (const auto* call = obj.as<tirx::CallNode>()) {
       static const Op& q_multiply_shift_per_axis_op = Op::Get("tirx.q_multiply_shift_per_axis");
       static const Op& q_multiply_shift_op = Op::Get("tirx.q_multiply_shift");
@@ -104,7 +104,7 @@ size_t GetMaxUsedDtypeBytes(SBlock block) {
         max_bytes = std::max<size_t>(max_bytes, 8);
       }
     } else if (const auto* cast = obj.as<tirx::CastNode>()) {
-      max_bytes = std::max<size_t>(max_bytes, cast->dtype.bytes());
+      max_bytes = std::max<size_t>(max_bytes, (cast->ty().bits() + 7) / 8);
     }
   });
 
diff --git a/src/s_tir/meta_schedule/profiler.cc b/src/s_tir/meta_schedule/profiler.cc
index 91415447a48c..05580bcdee10 100644
--- a/src/s_tir/meta_schedule/profiler.cc
+++ b/src/s_tir/meta_schedule/profiler.cc
@@ -32,7 +32,7 @@ namespace meta_schedule {
 ffi::Map<ffi::String, FloatImm> ProfilerNode::Get() const {
   ffi::Map<ffi::String, FloatImm> ret;
   for (const auto& kv : stats_sec) {
-    ret.Set(kv.first, FloatImm(DataType::Float(64), kv.second));
+    ret.Set(kv.first, FloatImm(PrimType::Float(64), kv.second));
   }
   return ret;
 }
diff --git a/src/s_tir/meta_schedule/schedule/cuda/thread_bind.cc b/src/s_tir/meta_schedule/schedule/cuda/thread_bind.cc
index f9e8b14d8679..3aec6e51c364 100644
--- a/src/s_tir/meta_schedule/schedule/cuda/thread_bind.cc
+++ b/src/s_tir/meta_schedule/schedule/cuda/thread_bind.cc
@@ -60,7 +60,7 @@ std::function<ExprRV(int64_t)> MakeFactorSampler(Schedule sch, ffi::Array<int64_
     if (n == 1) {
       return IntImm::Int32(extents[0]);
     }
-    ffi::Array<FloatImm> probs(n, FloatImm(DataType::Float(32), 1.0 / n));
+    ffi::Array<FloatImm> probs(n, FloatImm(PrimType::Float(32), 1.0 / n));
     return sch->SampleCategorical(extents, probs);
   };
 }
diff --git a/src/s_tir/meta_schedule/schedule_rule/cross_thread_reduction.cc b/src/s_tir/meta_schedule/schedule_rule/cross_thread_reduction.cc
index 1c7506e83068..5f075ea1c210 100644
--- a/src/s_tir/meta_schedule/schedule_rule/cross_thread_reduction.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/cross_thread_reduction.cc
@@ -78,7 +78,7 @@ class CrossThreadReductionNode : public ScheduleRuleNode {
 
     // Step 3. Try block fusion.
     int n_candidate = static_cast<int>(thread_extents.size());
-    ffi::Array<FloatImm> probs(n_candidate, FloatImm(DataType::Float(32), 1.0 / n_candidate));
+    ffi::Array<FloatImm> probs(n_candidate, FloatImm(PrimType::Float(32), 1.0 / n_candidate));
     s_tir::ExprRV thread_extent = tmp_sch->SampleCategorical(thread_extents, probs);
     if (fusible) {
       TVM_FFI_ICHECK(target_sblock.defined());
diff --git a/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling.cc
index 2da29cc8e983..1cd504dfee68 100644
--- a/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling.cc
@@ -369,15 +369,15 @@ void MultiLevelTilingNode::AnnotateCooperativeFetching(Schedule* sch,
   // Filter out invalid vector lanes according to the data type.
   const tirx::SBlockNode* block_node = (*sch)->GetSRef(block)->StmtAs<tirx::SBlockNode>();
   TVM_FFI_ICHECK_EQ(block_node->writes.size(), 1);
-  const runtime::DataType dtype = block_node->writes[0]->buffer->dtype;
+  const DLDataType dtype = block_node->writes[0]->buffer->dtype->dtype;
   std::function<bool(int)> f_filter = nullptr;
-  if (dtype == runtime::DataType::Float(32)) {
+  if (dtype == DLDataType{kDLFloat, 32, 1}) {
     f_filter = [&](int vector_len) { return vector_len <= 4; };
-  } else if (dtype == runtime::DataType::Float(16)) {
+  } else if (dtype == DLDataType{kDLFloat, 16, 1}) {
     f_filter = [&](int vector_len) {
       return (vector_len == 1 || vector_len % 2 == 0) && vector_len <= 8;
     };
-  } else if (dtype == runtime::DataType::Int(8)) {
+  } else if (dtype == DLDataType{kDLInt, 8, 1}) {
     f_filter = [&](int vector_len) { return vector_len <= 16; };
   }
   std::vector<int> valid_vector_lens;
@@ -396,7 +396,7 @@ void MultiLevelTilingNode::AnnotateCooperativeFetching(Schedule* sch,
     valid_vector_lens_arr.reserve(valid_vector_lens.size());
     for (int v : valid_vector_lens) valid_vector_lens_arr.push_back(static_cast<int64_t>(v));
     s_tir::ExprRV vector_load_len = (*sch)->SampleCategorical(
-        valid_vector_lens_arr, ffi::Array<FloatImm>(n, FloatImm(DataType::Float(32), prob)));
+        valid_vector_lens_arr, ffi::Array<FloatImm>(n, FloatImm(PrimType::Float(32), prob)));
     (*sch)->Annotate(block, s_tir::attr::meta_schedule_cooperative_fetch, vector_load_len);
   }
 }
diff --git a/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
index 1eb00bf8e6de..6a97fe642178 100644
--- a/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc
@@ -626,10 +626,12 @@ std::vector<State> MultiLevelTilingTensorCoreNode::AddReadReuseTensorCore(
     tirx::Buffer cache_read_buffer =
         s_tir::GetNthAccessBuffer(sch->state(), ffi::GetRef<tirx::SBlock>(cache_read_block), 0,
                                   s_tir::BufferIndexType::kWrite);
-    const DataType& dtype = cache_read_buffer->dtype;
-    if (dtype.is_float16()) {
+    const DLDataType dtype = cache_read_buffer->dtype->dtype;
+    // Storage alignment is chosen from element storage width; this schedule rule uses scalar
+    // cache-read buffers, so the old element-type-only test is preserved.
+    if (dtype.code == kDLFloat && dtype.bits == 16) {
       sch->StorageAlign(cache_read, 0, -2, 32, 8);
-    } else if (dtype.is_int() && dtype.bits() == 8) {
+    } else if (dtype.code == kDLInt && dtype.bits == 8) {
       sch->StorageAlign(cache_read, 0, -2, 32, 16);
     } else {
       TVM_PY_LOG(WARNING, logger) << "StorageAlign is not applied for data type " << dtype
diff --git a/src/s_tir/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc b/src/s_tir/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc
index a0c5f0a1f344..5ce5a1a8cc0e 100644
--- a/src/s_tir/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc
@@ -75,7 +75,7 @@ class ParallelizeVectorizeUnrollNode : public ScheduleRuleNode {
     if (!unroll_max_steps.empty() && !s_tir::CheckSpatialPrimFunc(sch, root_rv)) {
       int n = unroll_max_steps.size();
       double prob = 1.0 / n;
-      ffi::Array<FloatImm> probs(n, FloatImm(DataType::Float(32), prob));
+      ffi::Array<FloatImm> probs(n, FloatImm(PrimType::Float(32), prob));
       PrimExpr max_step = sch->SampleCategorical(unroll_max_steps, probs);
       if (unroll_explicit) {
         sch->Annotate(root_rv, s_tir::attr::meta_schedule_unroll_explicit, max_step);
diff --git a/src/s_tir/meta_schedule/schedule_rule/schedule_rule.cc b/src/s_tir/meta_schedule/schedule_rule/schedule_rule.cc
index 6c421bd671be..eee9ef2685b8 100644
--- a/src/s_tir/meta_schedule/schedule_rule/schedule_rule.cc
+++ b/src/s_tir/meta_schedule/schedule_rule/schedule_rule.cc
@@ -16,8 +16,8 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/data_type.h>
 
 #include "../utils.h"
 
diff --git a/src/s_tir/meta_schedule/utils.h b/src/s_tir/meta_schedule/utils.h
index ed6e11e24384..946f7e58ebc4 100644
--- a/src/s_tir/meta_schedule/utils.h
+++ b/src/s_tir/meta_schedule/utils.h
@@ -463,7 +463,7 @@ inline ffi::Array<FloatImm> AsFloatArray(const ffi::ObjectRef& obj) {
   for (Any val : *arr) {
     auto float_value = [&]() -> FloatImm {
       if (auto opt_int_imm = val.try_cast<IntImm>()) {
-        return FloatImm(DataType::Float(32), (*opt_int_imm)->value);
+        return FloatImm(PrimType::Float(32), (*opt_int_imm)->value);
       } else if (auto opt_float_imm = val.try_cast<FloatImm>()) {
         return *std::move(opt_float_imm);
       } else {
diff --git a/src/s_tir/schedule/analysis/layout.cc b/src/s_tir/schedule/analysis/layout.cc
index 35e04cbced6c..223bd4683270 100644
--- a/src/s_tir/schedule/analysis/layout.cc
+++ b/src/s_tir/schedule/analysis/layout.cc
@@ -40,7 +40,7 @@ ffi::Array<PrimExpr> GetStrides(const Buffer& buffer) {
     return {};
   }
   ffi::Array<PrimExpr> strides(ndim, PrimExpr{nullptr});
-  PrimExpr stride = MakeConst(buffer->DefaultIndexType(), 1);
+  PrimExpr stride = MakeConst(PrimType(buffer->DefaultIndexType()), 1);
   for (int i = ndim - 1; i >= 0; --i) {
     strides.Set(i, stride);
     stride = stride * buffer->shape[i];
@@ -146,7 +146,7 @@ ffi::Optional<IndexMap> SuggestIndexMap(const Buffer& buffer, const ffi::Array<P
   // Step 2. Calculate a functor that flattens a multi-dimensional index
   auto f_flatten_index = [ndim, strides = GetStrides(buffer), dtype = buffer->DefaultIndexType()](
                              const ffi::Array<PrimExpr>& indices) -> PrimExpr {
-    PrimExpr flatten_index = IntImm(dtype, 0);
+    PrimExpr flatten_index = IntImm(PrimType(dtype), 0);
     for (int i = 0; i < ndim; ++i) {
       flatten_index = flatten_index + strides[i] * indices[i];
     }
@@ -223,7 +223,7 @@ ffi::Optional<IndexMap> SuggestIndexMap(const Buffer& buffer, const ffi::Array<P
     }
 
     // Step 6.2: Fuse all the indices. This is the inverse of Step 5.2.
-    PrimExpr flattened_index = IntImm(indices[0]->dtype, 0);
+    PrimExpr flattened_index = IntImm(indices[0].ty(), 0);
     int64_t stride = 1;
     for (int i = static_cast<int>(split_exprs.size()) - 1; i >= 0; --i) {
       flattened_index = inv_permuted_indices[i] * IntImm::Int32(stride) + flattened_index;
diff --git a/src/s_tir/schedule/analysis/reducer.cc b/src/s_tir/schedule/analysis/reducer.cc
index d6bb5c903492..f79afdacd16c 100644
--- a/src/s_tir/schedule/analysis/reducer.cc
+++ b/src/s_tir/schedule/analysis/reducer.cc
@@ -137,7 +137,7 @@ class PatternMatcher : public ExprVisitor {
     if (ptr == nullptr) {
       match_success_ = false;
     } else {
-      if (!runtime::TypeEqual(op->dtype, ptr->dtype)) {
+      if (op->ty()->dtype != ptr->ty()->dtype) {
         match_success_ = false;
       } else {
         PrimExpr tmp = expr_to_match_;
diff --git a/src/s_tir/schedule/concrete_schedule.cc b/src/s_tir/schedule/concrete_schedule.cc
index b891f6cb81be..7dd4f1da71bf 100644
--- a/src/s_tir/schedule/concrete_schedule.cc
+++ b/src/s_tir/schedule/concrete_schedule.cc
@@ -498,8 +498,8 @@ ffi::Array<LoopRV> ConcreteScheduleNode::Split(const LoopRV& loop_rv,
       if (is_const_int(factor) && !is_positive_const(factor)) {
         throw NonPositiveFactorError(state_->mod, factor.as<IntImmNode>()->value, i);
       }
-      if (factor.dtype().bits() > loop->extent.dtype().bits()) {
-        factor = cast(loop->extent.dtype(), factor);
+      if (factor.ty().bits() > loop->extent.ty().bits()) {
+        factor = cast(loop->extent.ty(), factor);
       }
       factors.push_back(factor);
       tot_length *= factor;
@@ -565,8 +565,8 @@ ffi::Array<LoopRV> ConcreteScheduleNode::LoopPartition(
       if (is_const_int(factor) && !is_positive_const(factor)) {
         throw NonPositiveFactorError(state_->mod, factor.as<IntImmNode>()->value, i);
       }
-      if (factor.dtype().bits() > loop->extent.dtype().bits()) {
-        factor = cast(loop->extent.dtype(), factor);
+      if (factor.ty().bits() > loop->extent.ty().bits()) {
+        factor = cast(loop->extent.ty(), factor);
       }
       factors.push_back(factor);
       tot_length += factor;
diff --git a/src/s_tir/schedule/concrete_schedule.h b/src/s_tir/schedule/concrete_schedule.h
index 5dd094dc388c..13bdaef6a224 100644
--- a/src/s_tir/schedule/concrete_schedule.h
+++ b/src/s_tir/schedule/concrete_schedule.h
@@ -369,7 +369,7 @@ inline T ConcreteScheduleNode::CreateRV(const StmtSRef& sref) {
 }
 
 inline ExprRV ConcreteScheduleNode::CreateRV(int64_t value) {
-  Var rv("v" + std::to_string(this->symbol_table_.size() + 1), DataType::Int(32));
+  Var rv("v" + std::to_string(this->symbol_table_.size() + 1), PrimType::Int(32));
   this->symbol_table_.Set(rv, IntImm::Int32(static_cast<int32_t>(value)));
   return rv;
 }
diff --git a/src/s_tir/schedule/ir_comparator.cc b/src/s_tir/schedule/ir_comparator.cc
index 1529923ca5fe..8b5ed55ed74d 100644
--- a/src/s_tir/schedule/ir_comparator.cc
+++ b/src/s_tir/schedule/ir_comparator.cc
@@ -94,8 +94,8 @@ bool TensorizeComparator::VisitStmt(const Stmt& n, const Stmt& other) {
 
 bool TensorizeComparator::VisitExpr(const PrimExpr& n, const PrimExpr& other) {
   bool equal = n.same_as(other) ||
-               ((n->type_index() == other->type_index()) &&
-                n.dtype().code() == other.dtype().code() && ExprComparator::VisitExpr(n, other)) ||
+               ((n->type_index() == other->type_index()) && n.ty().code() == other.ty().code() &&
+                ExprComparator::VisitExpr(n, other)) ||
                (ContainsVscaleCall(n) && analyzer_->CanProveEqual(n, other));
 
   if (!equal && assert_mode_) {
@@ -109,11 +109,11 @@ bool TensorizeComparator::VisitExpr(const PrimExpr& n, const PrimExpr& other) {
 bool TensorizeComparator::VisitExpr_(const CallNode* op, const PrimExpr& other) {
   const auto* rhs = other.as<CallNode>();
   if (!rhs->op.same_as(op->op)) return false;
-  if (op->dtype.code() != rhs->dtype.code()) {
+  if (op->ty().code() != rhs->ty().code()) {
     if (assert_mode_) {
       std::ostringstream os;
-      os << "CallNode data type codes do not match: op->dtype.code()=" << op->dtype.code()
-         << " vs rhs->dtype.code()=" << rhs->dtype.code();
+      os << "CallNode data type codes do not match: op->dtype.code()=" << op->ty().code()
+         << " vs rhs->dtype.code()=" << rhs->ty().code();
       EmitError(os.str());
     }
     return false;
@@ -330,11 +330,11 @@ bool TensorizeComparator::VisitExpr_(const VarNode* op, const PrimExpr& other) {
   const auto* rhs = other.as<VarNode>();
   auto lhs = ffi::GetRef<Var>(op);
   if (lhs.same_as(other)) return true;
-  if (op->dtype.code() != rhs->dtype.code()) {
+  if (op->ty().code() != rhs->ty().code()) {
     if (assert_mode_) {
       std::ostringstream os;
-      os << "VarNode data type codes do not match: op->dtype.code()=" << op->dtype.code()
-         << " vs rhs->dtype.code()=" << rhs->dtype.code();
+      os << "VarNode data type codes do not match: op->dtype.code()=" << op->ty().code()
+         << " vs rhs->dtype.code()=" << rhs->ty().code();
       EmitError(os.str());
     }
     return false;
@@ -363,7 +363,7 @@ bool TensorizeComparator::DefEqual(const Var& lhs, const Var& rhs) {
   equal_map_[lhs] = rhs;
   // Cast if necessary. This allows the workload and the tensor intrin to have different dtypes in
   // the indices.
-  analyzer_->Bind(lhs, cast(lhs.dtype(), rhs));
+  analyzer_->Bind(lhs, cast(lhs.ty(), rhs));
   return true;
 }
 
diff --git a/src/s_tir/schedule/primitive/block_annotate.cc b/src/s_tir/schedule/primitive/block_annotate.cc
index cbdfae481d14..5081f5e8aff4 100644
--- a/src/s_tir/schedule/primitive/block_annotate.cc
+++ b/src/s_tir/schedule/primitive/block_annotate.cc
@@ -298,7 +298,7 @@ class DTypeMutator : private ReplaceBufferMutator {
    * \param block_sref_reuse The block sref reuse map to be updated
    * \return The new block after the mutation
    */
-  static SBlock Mutate(const SBlock& allocate_site, const Buffer& old_buffer, const DataType& dtype,
+  static SBlock Mutate(const SBlock& allocate_site, const Buffer& old_buffer, DLDataType dtype,
                        ffi::Map<SBlock, SBlock>* block_sref_reuse) {
     Buffer new_buffer = WithDType(old_buffer, dtype);
     DTypeMutator mutator(old_buffer, new_buffer, dtype, block_sref_reuse);
@@ -307,16 +307,16 @@ class DTypeMutator : private ReplaceBufferMutator {
   }
 
  private:
-  DTypeMutator(const Buffer& old_buffer, Buffer new_buffer, const DataType& dtype,
+  DTypeMutator(const Buffer& old_buffer, Buffer new_buffer, DLDataType dtype,
                ffi::Map<SBlock, SBlock>* block_sref_reuse)
       : ReplaceBufferMutator(old_buffer, std::move(new_buffer), block_sref_reuse),
-        src_dtype_(old_buffer->dtype),
+        src_dtype_(old_buffer->dtype->dtype),
         tgt_dtype_(dtype) {}
 
   MatchBufferRegion VisitMatchBufferRegion(const MatchBufferRegion& match_buffer) final {
     auto it = buffer_var_map_.find(match_buffer->source->buffer->data.get());
     if (it != buffer_var_map_.end()) {
-      Buffer new_target_buffer = WithDType(match_buffer->buffer, it->second->dtype);
+      Buffer new_target_buffer = WithDType(match_buffer->buffer, it->second->dtype->dtype);
       buffer_var_map_[match_buffer->buffer->data.get()] = new_target_buffer;
       return MatchBufferRegion(new_target_buffer,
                                BufferRegion(it->second, match_buffer->source->region));
@@ -330,7 +330,7 @@ class DTypeMutator : private ReplaceBufferMutator {
     auto it = buffer_var_map_.find(node->buffer->data.get());
     if (it != buffer_var_map_.end()) {
       node.CopyOnWrite()->buffer = it->second;
-      node.CopyOnWrite()->value = Cast(tgt_dtype_, node->value);
+      node.CopyOnWrite()->value = Cast(PrimType(tgt_dtype_), node->value);
     }
     return node;
   }
@@ -339,12 +339,12 @@ class DTypeMutator : private ReplaceBufferMutator {
     BufferLoad node = StmtExprMutator::VisitExpr_(op).as_or_throw<BufferLoad>();
     auto it = buffer_var_map_.find(node->buffer->data.get());
     if (it != buffer_var_map_.end()) {
-      return Cast(src_dtype_, BufferLoad(it->second, node->indices));
+      return Cast(PrimType(src_dtype_), BufferLoad(it->second, node->indices));
     }
     return node;
   }
 
-  DataType src_dtype_, tgt_dtype_;
+  DLDataType src_dtype_, tgt_dtype_;
 };
 
 void UnsafeSetDType(ScheduleState self, const StmtSRef& block_sref, int buffer_index,
@@ -352,10 +352,10 @@ void UnsafeSetDType(ScheduleState self, const StmtSRef& block_sref, int buffer_i
   const SBlockNode* block = TVM_SREF_TO_SBLOCK(block_sref);
   Buffer buffer =
       GetNthAccessBuffer(self, ffi::GetRef<SBlock>(block), buffer_index, BufferIndexType::kWrite);
-  DataType target_dtype(ffi::StringToDLDataType(dtype));
+  DLDataType target_dtype = ffi::StringToDLDataType(dtype);
 
   // Step 1. If `dtype` equals the original data type, just return.
-  if (buffer->dtype == target_dtype) {
+  if (buffer->dtype->dtype == target_dtype) {
     return;
   }
 
diff --git a/src/s_tir/schedule/primitive/blockize_tensorize.cc b/src/s_tir/schedule/primitive/blockize_tensorize.cc
index c5fa57e835ca..fae81d233b48 100644
--- a/src/s_tir/schedule/primitive/blockize_tensorize.cc
+++ b/src/s_tir/schedule/primitive/blockize_tensorize.cc
@@ -37,7 +37,7 @@ bool UsesVar(const T& x, const Var& var) {
 }
 
 Range RangeFromExtent(const PrimExpr& extent) {
-  return Range::FromMinExtent(IntImm(extent->dtype, 0), extent);
+  return Range::FromMinExtent(IntImm(extent.ty(), 0), extent);
 }
 
 template <class T>
@@ -256,7 +256,7 @@ ffi::Map<Var, PrimExpr> DeriveBlockBinding(
       // substitution
       if (is_one(outer_mark->extent) && !preserve_unit_iters) {
         // Simplify outer if not preserve_unit_iters
-        sub = IntImm(outer_mark->extent.dtype(), 0);
+        sub = IntImm(outer_mark->extent.ty(), 0);
       } else {
         sub = outer_iter;
       }
@@ -776,14 +776,14 @@ void Tensorize(ScheduleState self, const StmtSRef& sref, const TensorIntrin& int
   auto f_update_max_dtype_bits_from_region = [&](const ffi::Array<BufferRegion>& buffer_regions) {
     for (const BufferRegion& buffer_region : buffer_regions) {
       for (const auto& range : buffer_region->region) {
-        index_dtype_bits = std::max(index_dtype_bits, range->min.dtype().bits());
+        index_dtype_bits = std::max(index_dtype_bits, range->min.ty().bits());
       }
     }
   };
   f_update_max_dtype_bits_from_region(block_realize->block->reads);
   f_update_max_dtype_bits_from_region(block_realize->block->writes);
   TVM_FFI_ICHECK(index_dtype_bits > 0);
-  intrin_impl = IndexDataTypeNormalizer(DataType::Int(index_dtype_bits)).Rewrite(intrin_impl);
+  intrin_impl = IndexDataTypeNormalizer(PrimType::Int(index_dtype_bits)).Rewrite(intrin_impl);
   // Step 2: Structural pattern matching
   TensorizeComparator comparator(self->mod, /*assert_mode=*/true);
   comparator.VisitStmt(block_realize, intrin_desc->body);
@@ -829,12 +829,12 @@ void Tensorize(ScheduleState self, const StmtSRef& sref, const TensorIntrin& int
     new_region.reserve(cur->shape.size());
     for (int i = 0; i < offset; i++) {
       PrimExpr min = indices_base[i];
-      PrimExpr extent = MakeConst(min.dtype(), 1);
+      PrimExpr extent = MakeConst(min.ty(), 1);
       new_region.push_back(Range::FromMinExtent(min, extent));
     }
     for (int i = 0; i < static_cast<int>(old_region.size()); i++) {
       PrimExpr min = indices_base[i + offset];
-      PrimExpr extent = cast(min.dtype(), old_region[i]->extent);
+      PrimExpr extent = cast(min.ty(), old_region[i]->extent);
       new_region.push_back(Range::FromMinExtent(min, extent));
     }
     match_buffer_regions.push_back(MatchBufferRegion(impl, BufferRegion(cur, new_region)));
diff --git a/src/s_tir/schedule/primitive/cache_index.cc b/src/s_tir/schedule/primitive/cache_index.cc
index 1fc7dbdc263e..1ef05eed66d1 100644
--- a/src/s_tir/schedule/primitive/cache_index.cc
+++ b/src/s_tir/schedule/primitive/cache_index.cc
@@ -58,14 +58,14 @@ struct IndexInfo {
  * \param range The range of the integer.
  * \returns A data type that covers the input range.
  */
-DataType DetermineDatatype(const arith::IntSet& range) {
+PrimType DeterminePrimType(const arith::IntSet& range) {
   arith::Analyzer ana;
   if (ana->CanProve(range.min() >= INT32_MIN && range.max() <= INT32_MAX)) {
-    return DataType::Int(32);
+    return PrimType::Int(32);
   } else {
     TVM_FFI_ICHECK(ana->CanProve(range.min() >= IntImm::Int64(INT64_MIN) &&
                                  range.max() <= IntImm::Int64(INT64_MAX)));
-    return DataType::Int(64);
+    return PrimType::Int(64);
   }
 }
 
@@ -261,23 +261,23 @@ ffi::Array<SBlock> MakeIndexCacheStage(IndexInfo* info, const ffi::String& stora
       });
     }
 
-    DataType data_type = index_expr.dtype();
+    PrimType data_ty = index_expr.ty();
     Var index_buffer_var("index_var_" + std::to_string(expr_index),
-                         PointerType(PrimType(data_type), storage_scope));
+                         PointerType(data_ty, storage_scope));
     ffi::Array<PrimExpr> buffer_shape;
     for (const Var& it : info->origin_block_vars[expr_index]) {
       buffer_shape.push_back(
           arith::EvalSet(info->var_binding.at(it), arith::AsIntSet(info->range_map)).max() + 1);
     }
-    info->cache_buffer.push_back(Buffer(index_buffer_var, data_type, buffer_shape, {1}, {0},
+    info->cache_buffer.push_back(Buffer(index_buffer_var, data_ty->dtype, buffer_shape, {1}, {0},
                                         index_buffer_var->name_hint, 0, 0, kDefault));
 
     // Create loop vars and block vars' binding_value
     std::vector<Var> loop_vars;
     ffi::Map<Var, Var> replace_table;
     for (const Var& it : iter_vars) {
-      DataType data_type = DetermineDatatype(arith::IntSet::FromRange(info->range_map.at(it)));
-      Var loop_var("ax" + std::to_string(replace_table.size()), data_type);
+      PrimType loop_var_ty = DeterminePrimType(arith::IntSet::FromRange(info->range_map.at(it)));
+      Var loop_var("ax" + std::to_string(replace_table.size()), loop_var_ty);
       loop_vars.push_back(loop_var);
       replace_table.Set(it, loop_var);
     }
@@ -296,15 +296,15 @@ ffi::Array<SBlock> MakeIndexCacheStage(IndexInfo* info, const ffi::String& stora
     // Create block vars, block's accessed region and accessing indices
     for (size_t i = 0; i < info->origin_block_vars[expr_index].size(); i++) {
       const Var& block_var = info->origin_block_vars[expr_index][i];
-      Var var("v" + std::to_string(access_indices.size()), block_var.dtype());
-      Range range = Range::FromMinExtent(IntImm(block_var.dtype(), 0),
-                                         info->range_map.at(iter_vars[i])->extent);
+      Var var("v" + std::to_string(access_indices.size()), block_var.ty());
+      Range range =
+          Range::FromMinExtent(IntImm(block_var.ty(), 0), info->range_map.at(iter_vars[i])->extent);
       block_vars.push_back(IterVar(/*dom=*/range,
                                    /*var=*/var,
                                    /*IterVarType=*/kDataPar));
 
       access_indices.push_back(var);
-      access_region.push_back(Range::FromMinExtent(var, MakeConst(var.dtype(), 1)));
+      access_region.push_back(Range::FromMinExtent(var, MakeConst(var.ty(), 1)));
       block_var_map.Set(block_var, var);
     }
 
diff --git a/src/s_tir/schedule/primitive/cache_read_write.cc b/src/s_tir/schedule/primitive/cache_read_write.cc
index 46b8842a88b1..f5426f93cbae 100644
--- a/src/s_tir/schedule/primitive/cache_read_write.cc
+++ b/src/s_tir/schedule/primitive/cache_read_write.cc
@@ -165,7 +165,7 @@ SBlock MakeReindexCacheStage(const BufferRegion& cache_region, ReindexCacheStage
   ffi::Map<Var, Var> var_map;
   for (size_t i = 0; i < info->loop_vars.size(); ++i) {
     Var original_var = info->loop_vars[i];
-    Var loop_var(original_var->name_hint, original_var.dtype());
+    Var loop_var(original_var->name_hint, original_var.ty());
     var_map.Set(original_var, loop_var);
     loop_vars.push_back(loop_var);
   }
@@ -174,7 +174,7 @@ SBlock MakeReindexCacheStage(const BufferRegion& cache_region, ReindexCacheStage
     PrimExpr original_iter_value = info->block_iter_values[i];
     IterVar block_var = IterVar(
         /*dom=*/original_block_var->dom,
-        /*var=*/Var(original_block_var->var->name_hint, original_block_var->var.dtype()),
+        /*var=*/Var(original_block_var->var->name_hint, original_block_var->var.ty()),
         /*IterVarType=*/kDataPar);
     var_map.Set(original_block_var->var, block_var->var);
     block_vars.push_back(block_var);
@@ -247,7 +247,7 @@ SBlock MakeCacheStage(const BufferRegion& cache_region, CacheStageInfo* info,
   std::vector<PrimExpr> iter_values;
   // Create loop vars and block vars' binding_value
   for (const Range& axis_range : cache_region->region) {
-    Var loop_var("ax" + std::to_string(loop_vars.size()), axis_range->extent.dtype());
+    Var loop_var("ax" + std::to_string(loop_vars.size()), axis_range->extent.ty());
     loop_vars.push_back(loop_var);
     iter_values.push_back(cache_full_region ? (axis_range->min + loop_var) : loop_var);
   }
@@ -262,35 +262,35 @@ SBlock MakeCacheStage(const BufferRegion& cache_region, CacheStageInfo* info,
   // Create block vars, block's accessed region and accessing indices
   for (int i = 0; i < static_cast<int>(cache_region->buffer->shape.size()); ++i) {
     Range axis_range = cache_region->region[i];
-    Var var("v" + std::to_string(read_access_indices.size()), axis_range->extent.dtype());
+    Var var("v" + std::to_string(read_access_indices.size()), axis_range->extent.ty());
     if (cache_full_region) {
       PrimExpr dim = cache_region->buffer->shape[i];
-      block_vars.push_back(IterVar(/*dom=*/Range::FromMinExtent(IntImm(dim->dtype, 0), dim),
+      block_vars.push_back(IterVar(/*dom=*/Range::FromMinExtent(IntImm(dim.ty(), 0), dim),
                                    /*var=*/var,
                                    /*IterVarType=*/kDataPar));
       read_access_indices.push_back(var);
       write_access_indices.push_back(var);
-      read_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.dtype(), 1)));
-      write_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.dtype(), 1)));
+      read_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.ty(), 1)));
+      write_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.ty(), 1)));
     } else {
       block_vars.push_back(IterVar(
-          /*dom=*/Range::FromMinExtent(IntImm(axis_range->extent.dtype(), 0), axis_range->extent),
+          /*dom=*/Range::FromMinExtent(IntImm(axis_range->extent.ty(), 0), axis_range->extent),
           /*var=*/var,
           /*IterVarType=*/kDataPar));
       if (cache_region->buffer.same_as(info->read_buffer)) {
         // cache_read
         read_access_indices.push_back(axis_range->min + var);
         read_access_region.push_back(
-            Range::FromMinExtent(axis_range->min + var, MakeConst(var.dtype(), 1)));
+            Range::FromMinExtent(axis_range->min + var, MakeConst(var.ty(), 1)));
         write_access_indices.push_back(var);
-        write_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.dtype(), 1)));
+        write_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.ty(), 1)));
       } else {
         // cache_write
         write_access_indices.push_back(axis_range->min + var);
         write_access_region.push_back(
-            Range::FromMinExtent(axis_range->min + var, MakeConst(var.dtype(), 1)));
+            Range::FromMinExtent(axis_range->min + var, MakeConst(var.ty(), 1)));
         read_access_indices.push_back(var);
-        read_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.dtype(), 1)));
+        read_access_region.push_back(Range::FromMinExtent(var, MakeConst(var.ty(), 1)));
       }
     }
   }
@@ -361,7 +361,7 @@ SBlock MakeReIndexStage(const SBlock& block, CacheStageInfo* info,
   std::unordered_set<int> skipped_block_iters;
   for (int i = 0, n = block->iter_vars.size(); i < n; ++i) {
     const IterVar& iter = block->iter_vars[i];
-    Var var("v" + std::to_string(new_block_iters.size()), iter->var->dtype);
+    Var var("v" + std::to_string(new_block_iters.size()), iter->var.ty());
     bool used = covered.count(iter->var);
     if (used) {
       new_block_iters.push_back(IterVar(/*dom=*/iter->dom,
@@ -415,7 +415,7 @@ SBlock MakeReIndexStage(const SBlock& block, CacheStageInfo* info,
     if (skipped_block_iters.count(i)) {
       continue;
     }
-    Var loop_var("ax" + std::to_string(loop_vars.size()), block->iter_vars[i]->var->dtype);
+    Var loop_var("ax" + std::to_string(loop_vars.size()), block->iter_vars[i]->var.ty());
     loop_vars.push_back(loop_var);
     iter_values.push_back(loop_var);
   }
@@ -1620,7 +1620,7 @@ class ReIndexRewriter : public StmtExprMutator {
       for (const IterVar& iter : block->iter_vars) {
         if (covered_.count(iter->var)) {
           indices_.push_back(iter->var);
-          region_.push_back(Range::FromMinExtent(iter->var, IntImm(iter->var->dtype, 1)));
+          region_.push_back(Range::FromMinExtent(iter->var, IntImm(iter->var.ty(), 1)));
         }
       }
       SBlock stmt = StmtExprMutator::VisitStmt_(block).as_or_throw<SBlock>();
diff --git a/src/s_tir/schedule/primitive/compute_at.cc b/src/s_tir/schedule/primitive/compute_at.cc
index 8e1050709173..9dcdb1bb04b1 100644
--- a/src/s_tir/schedule/primitive/compute_at.cc
+++ b/src/s_tir/schedule/primitive/compute_at.cc
@@ -267,12 +267,12 @@ class ScopeReconstructor : private StmtMutator {
     for (int i = 0; i < n_iters; ++i) {
       Range iter_dom = iter_doms[i].dom.CoverRange(block_->iter_vars[i]->dom);
       if (preserve_unit_loops || !is_one(iter_dom->extent)) {
-        int bits = std::max(iter_dom->min.dtype().bits(), iter_dom->extent.dtype().bits());
-        Var var("ax" + std::to_string(loop_vars.size()), DataType::Int(bits));
+        int bits = std::max(iter_dom->min.ty().bits(), iter_dom->extent.ty().bits());
+        Var var("ax" + std::to_string(loop_vars.size()), PrimType::Int(bits));
         loop_vars.push_back(var);
         loop_extents.push_back(analyzer->Simplify(iter_dom->extent));
         iter_values.push_back(iter_dom->min + var);
-        analyzer->Bind(var, Range::FromMinExtent(IntImm(var.dtype(), 0), iter_dom->extent));
+        analyzer->Bind(var, Range::FromMinExtent(IntImm(var.ty(), 0), iter_dom->extent));
       } else {
         iter_values.push_back(iter_dom->min);
       }
@@ -578,7 +578,7 @@ bool UpdateBlockVarDomainAffine(const BufferNode* buffer, const ffi::Array<IterV
   NDIntSet required_bound;
   for (size_t i = 0; i < ndim; ++i) {
     required_bound.push_back(
-        arith::IntSet::Interval(IntImm(buffer->shape[i]->dtype, 0), max(buffer->shape[i] - 1, 0)));
+        arith::IntSet::Interval(IntImm(buffer->shape[i].ty(), 0), max(buffer->shape[i] - 1, 0)));
   }
   ffi::Map<Var, arith::IntSet> var_dom =
       InverseAffineIterMap(res->indices, required_region, analyzer);
diff --git a/src/s_tir/schedule/primitive/compute_inline.cc b/src/s_tir/schedule/primitive/compute_inline.cc
index 3f177d52f81a..e295941dbdb1 100644
--- a/src/s_tir/schedule/primitive/compute_inline.cc
+++ b/src/s_tir/schedule/primitive/compute_inline.cc
@@ -513,7 +513,7 @@ class ComputeInliner : public BaseInliner {
     }
     idx_vars_.resize(buffer_ndim);
     for (size_t i = 0; i < idx_vars_.size(); ++i) {
-      idx_vars_[i] = Var("ph_" + std::to_string(i), inlined_store_->indices[i].dtype());
+      idx_vars_[i] = Var("ph_" + std::to_string(i), inlined_store_->indices[i].ty());
     }
     auto inverse_iter_map = arith::InverseAffineIterMap(
         res->indices, ffi::Array<PrimExpr>(idx_vars_.begin(), idx_vars_.end()));
@@ -726,7 +726,7 @@ class ReverseComputeInliner : public BaseInliner {
     if (producer_block->annotations.count(s_tir::attr::auto_copy) != 0) {
       auto bind = [&](const ForNode* loop) {
         analyzer_->Bind(loop->loop_var,
-                        Range::FromMinExtent(IntImm(loop->extent->dtype, 0), loop->extent));
+                        Range::FromMinExtent(IntImm(loop->extent.ty(), 0), loop->extent));
       };
       const ForNode* producer_inner_loop = producer_block->body.as<ForNode>();
       while (producer_inner_loop->body.as<ForNode>()) {
diff --git a/src/s_tir/schedule/primitive/decompose_padding.cc b/src/s_tir/schedule/primitive/decompose_padding.cc
index 0a62222a4a34..98e38d259b0c 100644
--- a/src/s_tir/schedule/primitive/decompose_padding.cc
+++ b/src/s_tir/schedule/primitive/decompose_padding.cc
@@ -173,7 +173,7 @@ class PaddingInfoAnalyzer {
     }
     for (const arith::IterSumExpr& sum : res->indices) {
       if (sum->args.empty()) {
-        region.push_back(Range::FromMinExtent(sum->base, IntImm(sum->base.dtype(), /* value */ 1)));
+        region.push_back(Range::FromMinExtent(sum->base, IntImm(sum->base.ty(), /* value */ 1)));
       } else {
         TVM_FFI_ICHECK_EQ(sum->args.size(), 1U);
         if (!analyzer_->CanProveEqual(sum->args[0]->scale, 1)) {
@@ -291,7 +291,7 @@ static std::pair<Stmt, SBlockRealize> CreateInBoundBlock(const SBlockRealizeNode
     const IterVar& origin_itervar = block->iter_vars[i];
     Var new_var = origin_itervar->var.copy_with_suffix("");
     Range new_range =
-        Range::FromMinExtent(IntImm(new_var->dtype, 0), info.in_bound_region[i]->extent);
+        Range::FromMinExtent(IntImm(new_var.ty(), 0), info.in_bound_region[i]->extent);
     new_iter_vars.push_back(IterVar(new_range, new_var, IterVarType::kDataPar));
     repl_dict.Set(origin_itervar->var, new_var + info.in_bound_region[i]->min);
 
diff --git a/src/s_tir/schedule/primitive/for_kind.cc b/src/s_tir/schedule/primitive/for_kind.cc
index cbb7437e54dd..121205b5500d 100644
--- a/src/s_tir/schedule/primitive/for_kind.cc
+++ b/src/s_tir/schedule/primitive/for_kind.cc
@@ -174,9 +174,9 @@ void ParallelizeComputation(const ScheduleState& self, const StmtSRef& loop_sref
   ffi::ObjectPtr<ForNode> new_loop = ffi::make_object<ForNode>(*loop);
   new_loop->kind = for_kind;
   if (thread_axis.has_value()) {
-    new_loop->thread_binding = IterVar(/*dom=*/Range(nullptr),                                    //
-                                       /*var=*/Var(thread_axis.value(), loop->loop_var.dtype()),  //
-                                       /*iter_type=*/kThreadIndex,                                //
+    new_loop->thread_binding = IterVar(/*dom=*/Range(nullptr),                                 //
+                                       /*var=*/Var(thread_axis.value(), loop->loop_var.ty()),  //
+                                       /*iter_type=*/kThreadIndex,                             //
                                        /*thread_tag=*/thread_axis.value());
   } else {
     new_loop->thread_binding = std::nullopt;
diff --git a/src/s_tir/schedule/primitive/layout_transformation.cc b/src/s_tir/schedule/primitive/layout_transformation.cc
index 91c2e5276f26..e9cbf4f75a2d 100644
--- a/src/s_tir/schedule/primitive/layout_transformation.cc
+++ b/src/s_tir/schedule/primitive/layout_transformation.cc
@@ -294,7 +294,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
       new_indices = inverse->initial_indices.Map([](Var var) {
         std::stringstream ss;
         ss << "v_" << var->name_hint;
-        return Var(ss.str(), var.dtype());
+        return Var(ss.str(), var.ty());
       });
 
       ffi::Map<Var, Var>
@@ -314,7 +314,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
         PrimExpr dim = new_buffer->shape[i];
         new_iter_values.push_back(var);
         new_iter_vars.push_back(
-            IterVar(Range::FromMinExtent(IntImm(dim.dtype(), 0), dim), virtual_var, kDataPar));
+            IterVar(Range::FromMinExtent(IntImm(dim.ty(), 0), dim), virtual_var, kDataPar));
         loop_var_to_virtual_var.Set(var, virtual_var);
       }
 
@@ -476,7 +476,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
     for (size_t i = 0; i < inverse->initial_indices.size(); i++) {
       const auto& loop_var = inverse->initial_indices[i];
       const auto& dim = new_buffer->shape[i];
-      Var block_var("v_" + loop_var->name_hint, loop_var->dtype);
+      Var block_var("v_" + loop_var->name_hint, loop_var.ty());
       IterVar iter_var(Range(0, dim), block_var, kDataPar);
       loop_indices_to_block_indices.Set(loop_var, block_var);
       indices.push_back(iter_var->var);
@@ -488,7 +488,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
     PrimExpr pad_value_at_index =
         pad_value.value()->MapIndices(indices, ffi::GetRef<arith::Analyzer>(analyzer))[0];
     PrimExpr expr = (!padding_predicate) || (BufferLoad(new_buffer, indices) == pad_value_at_index);
-    Stmt stmt = Evaluate(Call(DataType::Bool(), builtin::assume(), {expr}));
+    Stmt stmt = Evaluate(Call(PrimType::Bool(), builtin::assume(), {expr}));
 
     std::stringstream block_name;
     block_name << "buffer_" << new_buffer->name << "_assumptions";
@@ -571,7 +571,7 @@ class TransformLayoutPlanner : private StmtExprVisitor {
     for (size_t i = 0; i < inverse->initial_indices.size(); i++) {
       const auto& loop_var = inverse->initial_indices[i];
       const auto& dim = new_buffer->shape[i];
-      Var block_var("v_" + loop_var->name_hint, loop_var->dtype);
+      Var block_var("v_" + loop_var->name_hint, loop_var.ty());
       IterVar iter_var(Range(0, dim), block_var, kDataPar);
       indices.push_back(iter_var->var);
       iter_vars.push_back(iter_var);
@@ -991,7 +991,7 @@ class TransformationPaddingTypeError : public ScheduleError {
   TransformationPaddingTypeError(IRModule mod, Buffer buffer, IndexMap pad_value)
       : mod_(mod), buffer_(buffer), pad_value_(pad_value) {
     TVM_FFI_ICHECK_EQ(pad_value_->final_indices.size(), 1);
-    pad_value_dtype_ = pad_value_->final_indices[0].dtype();
+    pad_value_dtype_ = pad_value_->final_indices[0].ty()->dtype;
   }
 
   ffi::String FastErrorString() const final {
@@ -1015,7 +1015,7 @@ class TransformationPaddingTypeError : public ScheduleError {
   IRModule mod_;
   Buffer buffer_;
   IndexMap pad_value_;
-  DataType pad_value_dtype_;
+  DLDataType pad_value_dtype_;
 };
 
 class TransformationPaddingExpressionError : public ScheduleError {
@@ -1116,19 +1116,21 @@ IndexMap LegalizeIndexMapDType(const IndexMap& index_map, const ffi::Array<PrimE
 
   ffi::Array<Var> initial_indices;
   ffi::Map<Var, PrimExpr> var_map;
-  std::optional<DataType> index_dtype = std::nullopt;
+  std::optional<DLDataType> index_dtype = std::nullopt;
 
   for (size_t i = 0; i < args.size(); ++i) {
+    DLDataType arg_dtype = args[i].ty()->dtype;
     if (index_dtype.has_value()) {
-      TVM_FFI_ICHECK_EQ(*index_dtype, args[i]->dtype)
-          << "Buffer index " << args[i] << " has dtype " << args[i]->dtype
+      TVM_FFI_ICHECK_EQ(*index_dtype, arg_dtype)
+          << "Buffer index " << args[i] << " has dtype " << arg_dtype
           << ", but previous index for the same buffer access used index type " << *index_dtype;
     } else {
-      index_dtype = args[i]->dtype;
+      index_dtype = arg_dtype;
     }
 
-    if (args[i]->dtype != initial_indices_orig[i].dtype()) {
-      auto new_idx = Var(initial_indices_orig[i]->name_hint, args[i]->dtype);
+    DLDataType initial_dtype = initial_indices_orig[i].ty()->dtype;
+    if (arg_dtype != initial_dtype) {
+      auto new_idx = Var(initial_indices_orig[i]->name_hint, args[i].ty());
       initial_indices.push_back(new_idx);
       var_map.Set(initial_indices_orig[i], new_idx);
     } else {
@@ -1140,7 +1142,7 @@ IndexMap LegalizeIndexMapDType(const IndexMap& index_map, const ffi::Array<PrimE
     auto final_indices = index_map->final_indices.Map([&](PrimExpr index) {
       if (auto* ptr = index.as<IntImmNode>()) {
         TVM_FFI_ICHECK(index_dtype.has_value());
-        return tirx::MakeConst(*index_dtype, ptr->value);
+        return tirx::MakeConst(PrimType(*index_dtype), ptr->value);
       } else {
         return SubstituteWithDataTypeLegalization(index,
                                                   [&](const Var& var) { return var_map.Get(var); });
@@ -1176,7 +1178,7 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_
     if (pad_value.value()->final_indices.size() != 1) {
       throw TransformationPaddingIndexMapError(self->mod, pad_value.value());
     }
-    if (pad_value.value()->final_indices[0]->dtype != old_buffer->dtype) {
+    if (pad_value.value()->final_indices[0].ty() != old_buffer->dtype) {
       throw TransformationPaddingTypeError(self->mod, old_buffer, pad_value.value());
     }
 
@@ -1194,7 +1196,7 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_
     std::tie(opt_inverse, padding_predicate) = [&]() {
       ffi::Array<Range> region;
       for (const auto& dim : old_buffer->shape) {
-        region.push_back(Range::FromMinExtent(IntImm(dim.dtype(), 0), dim));
+        region.push_back(Range::FromMinExtent(IntImm(dim.ty(), 0), dim));
       }
       return index_map.NonSurjectiveInverse(region, analyzer);
     }();
@@ -1412,7 +1414,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
   ffi::Array<IterVar> new_block_iters;  // new block iters
   ffi::Array<PrimExpr> new_block_vars;  // iter_var->var of new block iters
   for (size_t i = 0; i < transformed_block_iters.size(); ++i) {
-    Var new_block_var{"v" + std::to_string(i), transformed_block_iters[i]->dtype};
+    Var new_block_var{"v" + std::to_string(i), transformed_block_iters[i].ty()};
     new_block_vars.push_back(new_block_var);
     IterVarType iter_type;
     if (is_one(new_block_iter_range[i])) {
@@ -1424,7 +1426,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
       throw OpaqueNewIterTypeError(self->mod, ffi::GetRef<SBlock>(block_ptr),
                                    transformed_block_iters[i]);
     }
-    auto dtype = new_block_var.dtype();
+    PrimType dtype = new_block_var.ty();
     new_block_iters.push_back(IterVar(
         /*dom=*/Range::FromMinExtent(IntImm(dtype, 0), cast(dtype, new_block_iter_range[i])),
         /*var=*/std::move(new_block_var), /*iter_type=*/iter_type));
@@ -1437,7 +1439,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
   {
     ffi::Array<Range> initial_ranges;
     for (const PrimExpr& extent : block_iter_range_array) {
-      initial_ranges.push_back(Range::FromMinExtent(IntImm(extent.dtype(), 0), extent));
+      initial_ranges.push_back(Range::FromMinExtent(IntImm(extent.ty(), 0), extent));
     }
     IndexMap inverse_index_map{nullptr};
     try {
@@ -1462,7 +1464,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref,
   // Make new loop vars
   ffi::Array<PrimExpr> new_loop_vars;
   for (int i = 0; i < static_cast<int>(new_block_iters.size()); ++i) {
-    new_loop_vars.push_back(Var("ax" + std::to_string(i), new_block_iters[i]->var.dtype()));
+    new_loop_vars.push_back(Var("ax" + std::to_string(i), new_block_iters[i]->var.ty()));
   }
 
   // Make new block realize
diff --git a/src/s_tir/schedule/primitive/loop_transformation.cc b/src/s_tir/schedule/primitive/loop_transformation.cc
index 18996f555d18..2864b190ea30 100644
--- a/src/s_tir/schedule/primitive/loop_transformation.cc
+++ b/src/s_tir/schedule/primitive/loop_transformation.cc
@@ -58,7 +58,7 @@ class SubstituteVarAndCollectOpaqueBlock : public StmtExprMutator {
   PrimExpr VisitExpr_(const VarNode* op) final {
     Var var = ffi::GetRef<Var>(op);
     if (ffi::Optional<PrimExpr> ret = vmap_(var)) {
-      return tvm::cast(var.dtype(), ret.value());
+      return tvm::cast(var.ty(), ret.value());
     } else {
       return var;
     }
@@ -411,13 +411,13 @@ ffi::Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref,
   CheckLoopStartsWithZero(self, loop_sref, analyzer.get());
 
   // Find the most common dtype
-  DataType dtype;
+  PrimType dtype = PrimType::Int(32);
   {
-    int bits = loop->loop_var.dtype().bits();
+    int bits = loop->loop_var.ty().bits();
     for (const PrimExpr& factor : factors) {
-      bits = std::max(bits, factor.dtype().bits());
+      bits = std::max(bits, factor.ty().bits());
     }
-    dtype = DataType::Int(bits);
+    dtype = PrimType::Int(bits);
   }
   int n = factors.size();
   PrimExpr substitute_value = IntImm(dtype, 0);
@@ -556,9 +556,9 @@ class BlockMutator : public StmtExprMutator {
     // As we are working on cloned block, we need to create new instances of iter_var
     ffi::Array<IterVar> new_iter_vars =
         MutateArray(new_block->iter_vars, [this, &iter_var_](const IterVar& iter) {
-          auto dtype = iter->var.dtype();
+          auto dtype = iter->var.ty();
           // Create new Var instance for each IterVar
-          Var new_var = Var(iter->var->name_hint, iter->var.dtype());
+          Var new_var = Var(iter->var->name_hint, iter->var.ty());
           IterVar new_iter = iter;
           new_iter.CopyOnWrite()->var = new_var;
           // Change the domain of IterVar corresponding to partitioned loop_var
@@ -623,7 +623,7 @@ class BlockMutator : public StmtExprMutator {
 
   Stmt VisitStmt_(const ForNode* op) final {
     For res = StmtMutator::VisitStmt_(op).as_or_throw<For>();
-    Var new_var = Var(op->loop_var->name_hint, op->loop_var.dtype());
+    Var new_var = Var(op->loop_var->name_hint, op->loop_var.ty());
 
     if (!op->loop_var.same_as(new_var)) {
       // If the partioned loop contains nested for loop, then create new iteration variable instance
@@ -655,13 +655,13 @@ ffi::Array<StmtSRef> LoopPartition(ScheduleState self, const StmtSRef& loop_sref
 
   arith::Analyzer analyzer;
   // Find the most common dtype
-  DataType dtype;
+  PrimType dtype = PrimType::Int(32);
   {
-    int bits = loop->loop_var.dtype().bits();
+    int bits = loop->loop_var.ty().bits();
     for (const PrimExpr& factor : factors) {
-      bits = std::max(bits, factor.dtype().bits());
+      bits = std::max(bits, factor.ty().bits());
     }
-    dtype = DataType::Int(bits);
+    dtype = PrimType::Int(bits);
   }
 
   ffi::String block_name = get_sblock_name(loop->body) + "_" + loop->loop_var->name_hint;
@@ -921,14 +921,14 @@ StmtSRef Fuse(ScheduleState self, const ffi::Array<StmtSRef>& loop_srefs,
   // Step 2. Create fused loop var and replace the original loop vars
   std::string suffix;
   int n = loops.size();
-  int bits = loops[0]->loop_var.dtype().bits();
+  int bits = loops[0]->loop_var.ty().bits();
   for (int i = 1; i < n; i++) {
     suffix += "_" + loops[i]->loop_var->name_hint;
-    bits = std::max(bits, loops[i]->loop_var.dtype().bits());
+    bits = std::max(bits, loops[i]->loop_var.ty().bits());
   }
   suffix += "_fused";
 
-  Var fused_var = loops[0]->loop_var.copy_with_suffix(suffix).copy_with_dtype(DataType::Int(bits));
+  Var fused_var = loops[0]->loop_var.copy_with_suffix(suffix).copy_with_dtype(PrimType::Int(bits));
   ffi::Array<PrimExpr> substitute_value;
   substitute_value.resize(loops.size());
   PrimExpr lower = 1;
@@ -1144,7 +1144,7 @@ void Reorder(ScheduleState self, const ffi::Array<StmtSRef>& ordered_loop_srefs)
 StmtSRef AddUnitLoop(ScheduleState self, StmtSRef sref) {
   if (sref->stmt->IsInstance<ForNode>()) {
     For new_loop =
-        For(Var("u", DataType::Int(32)), 0, 1, ForKind::kSerial, ffi::GetRef<Stmt>(sref->stmt));
+        For(Var("u", PrimType::Int(32)), 0, 1, ForKind::kSerial, ffi::GetRef<Stmt>(sref->stmt));
     self->Replace(sref, new_loop, {});
     return self->stmt2ref.at(new_loop.get());
   }
@@ -1154,7 +1154,7 @@ StmtSRef AddUnitLoop(ScheduleState self, StmtSRef sref) {
 
     Stmt VisitStmt_(const SBlockRealizeNode* realize) final {
       if (realize->block.get() == src_block_) {
-        new_loop_ = For(Var("u", DataType::Int(32)), 0, 1, ForKind::kSerial,
+        new_loop_ = For(Var("u", PrimType::Int(32)), 0, 1, ForKind::kSerial,
                         ffi::GetRef<SBlockRealize>(realize));
         return new_loop_;
       }
diff --git a/src/s_tir/schedule/primitive/pad_einsum.cc b/src/s_tir/schedule/primitive/pad_einsum.cc
index c03638b09b98..a0c65698d182 100644
--- a/src/s_tir/schedule/primitive/pad_einsum.cc
+++ b/src/s_tir/schedule/primitive/pad_einsum.cc
@@ -147,7 +147,7 @@ struct BufferPadding {
       PrimExpr pos = buffer_region->region[i]->min;
       TVM_FFI_ICHECK(pos->IsInstance<IntImmNode>() || pos->IsInstance<VarNode>());
       if (pos->IsInstance<IntImmNode>()) {
-        shape.push_back(IntImm(pos->dtype, 1));
+        shape.push_back(IntImm(pos.ty(), 1));
       } else if (ffi::Optional<PrimExpr> extent = iter_extents.Get(pos.as_or_throw<Var>())) {
         shape.push_back(extent.value());
       } else {
@@ -173,11 +173,11 @@ struct BufferPadding {
       } else {
         dim = buffer->shape[i];
       }
-      Range dom = Range::FromMinExtent(IntImm(dim->dtype, 0), dim);
-      loop_vars.push_back(Var("i" + std::to_string(i), dim->dtype));
+      Range dom = Range::FromMinExtent(IntImm(dim.ty(), 0), dim);
+      loop_vars.push_back(Var("i" + std::to_string(i), dim.ty()));
       loop_doms.push_back(dom);
-      IterVar iter_var(dom, Var("v" + std::to_string(i), dim->dtype), kDataPar);
-      instance_dom.push_back(Range::FromMinExtent(iter_var->var, IntImm(dim->dtype, 1)));
+      IterVar iter_var(dom, Var("v" + std::to_string(i), dim.ty()), kDataPar);
+      instance_dom.push_back(Range::FromMinExtent(iter_var->var, IntImm(dim.ty(), 1)));
       iter_vars.push_back(iter_var);
       indices.push_back(iter_var->var);
     }
@@ -190,8 +190,8 @@ struct BufferPadding {
         }
       }
       PrimExpr rhs = BufferLoad(buffer, indices);
-      body = BufferStore(padded_buffer, if_then_else(predicate, rhs, MakeConst(rhs->dtype, 0)),
-                         indices);
+      body =
+          BufferStore(padded_buffer, if_then_else(predicate, rhs, MakeConst(rhs.ty(), 0)), indices);
     } else {
       body = BufferStore(buffer, BufferLoad(padded_buffer, indices), indices);
     }
@@ -389,7 +389,7 @@ void PadEinsum(ScheduleState self, const StmtSRef& block_sref, const ffi::Array<
   for (int i = 0, n = padding.size(); i < n; ++i) {
     const IterVar& iter = block->iter_vars[i];
     PrimExpr dom = iter->dom->extent;
-    PrimExpr pad_imm = IntImm(dom->dtype, padding[i]);
+    PrimExpr pad_imm = IntImm(dom.ty(), padding[i]);
     PrimExpr new_dom = analyzer->Simplify(ceildiv(dom, pad_imm) * pad_imm);
     if (!analyzer->CanProveEqual(new_dom, dom)) {
       replacer.iter2padded_extents.Set(iter->var, new_dom);
diff --git a/src/s_tir/schedule/primitive/reduction.cc b/src/s_tir/schedule/primitive/reduction.cc
index 51fe3afde4e1..169508943b2d 100644
--- a/src/s_tir/schedule/primitive/reduction.cc
+++ b/src/s_tir/schedule/primitive/reduction.cc
@@ -318,7 +318,7 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{x[0] + y[0]};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, 0)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), 0)};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/1,
@@ -326,7 +326,7 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{x[0] * y[0]};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, 1)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), 1)};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/1,
@@ -334,7 +334,7 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{min(x[0], y[0])};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{max_value(values[0]->dtype)};
+                  return ffi::Array<PrimExpr>{max_value(values[0].ty())};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/1,
@@ -342,7 +342,7 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{max(x[0], y[0])};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{min_value(values[0]->dtype)};
+                  return ffi::Array<PrimExpr>{min_value(values[0].ty())};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/2,
@@ -350,8 +350,8 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{x[0] + y[0], x[1] + y[1]};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, 0),
-                                              MakeConst(values[1]->dtype, 0)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), 0),
+                                              MakeConst(values[1].ty(), 0)};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/2,
@@ -361,8 +361,8 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{idx, val};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, -1),
-                                              min_value(values[1]->dtype)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), -1),
+                                              min_value(values[1].ty())};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/2,
@@ -374,8 +374,8 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{idx, val};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, -1),
-                                              min_value(values[1]->dtype)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), -1),
+                                              min_value(values[1].ty())};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/2,
@@ -385,8 +385,8 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{idx, val};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, -1),
-                                              max_value(values[1]->dtype)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), -1),
+                                              max_value(values[1].ty())};
                 }),
             CreateReducerGetter(
                 /*n_buffers=*/2,
@@ -397,8 +397,8 @@ struct ReducerRegistry {
                   return ffi::Array<PrimExpr>{idx, val};
                 },
                 [](const ffi::Array<PrimExpr>& values) {
-                  return ffi::Array<PrimExpr>{MakeConst(values[0]->dtype, -1),
-                                              max_value(values[1]->dtype)};
+                  return ffi::Array<PrimExpr>{MakeConst(values[0].ty(), -1),
+                                              max_value(values[1].ty())};
                 })} {}
 
   static void RegisterReducer(
@@ -423,8 +423,8 @@ struct ReducerRegistry {
       ffi::Array<Var> lhs;
       ffi::Array<Var> rhs;
       for (int i = 0; i < n_buffers; ++i) {
-        lhs.push_back(Var("x" + std::to_string(i), values[i]->dtype));
-        rhs.push_back(Var("y" + std::to_string(i), values[i]->dtype));
+        lhs.push_back(Var("x" + std::to_string(i), values[i].ty()));
+        rhs.push_back(Var("y" + std::to_string(i), values[i].ty()));
       }
       return CommReducer(lhs, rhs, combiner_getter(lhs, rhs), identity_getter(values));
     };
@@ -741,7 +741,7 @@ class BaseBlockCreator {
     ffi::Array<Var> let_vars;
     let_vars.reserve(n_buffers_);
     for (int i = 0; i < n_buffers_; ++i) {
-      Var var("v_" + update_buffers_[i]->name, PrimType(stored_values[i]->dtype));
+      Var var("v_" + update_buffers_[i]->name, stored_values[i].ty());
       let_vars.push_back(var);
       buf_stores.push_back(BufferStore(update_buffers_[i], var, update_indices_[i]));
     }
@@ -932,7 +932,7 @@ class RFactorBlockCreator : public BaseBlockCreator {
       ffi::Array<Range> region = write_region->region;
       region.insert(
           region.begin() + factor_axis_,
-          Range::FromMinExtent(additional_iter_->var, MakeConst(additional_iter_->var.dtype(), 1)));
+          Range::FromMinExtent(additional_iter_->var, MakeConst(additional_iter_->var.ty(), 1)));
       ffi::Optional<Buffer> rf_buffer = buffer_map.Get(write_region->buffer);
       TVM_FFI_ICHECK(rf_buffer.defined());
       write_regions_.push_back(BufferRegion(rf_buffer.value(), Substitute(region, var_map_)));
@@ -1025,7 +1025,7 @@ class WriteBackBlockCreator : public BaseBlockCreator {
       ffi::Array<Range> region;
       region.reserve(buf_load->indices.size());
       for (const PrimExpr& index : buf_load->indices) {
-        region.push_back(Range::FromMinExtent(index, MakeConst(index.dtype(), 1)));
+        region.push_back(Range::FromMinExtent(index, MakeConst(index.ty(), 1)));
       }
       buf_regions.push_back(BufferRegion(buf_load->buffer, std::move(region)));
     }
diff --git a/src/s_tir/schedule/transform.cc b/src/s_tir/schedule/transform.cc
index 3e29d1b6691d..8df7f7df9da9 100644
--- a/src/s_tir/schedule/transform.cc
+++ b/src/s_tir/schedule/transform.cc
@@ -49,9 +49,9 @@ Buffer WithScope(const Buffer& buffer, const ffi::String& scope) {
   return Buffer(new_buffer);
 }
 
-Buffer WithDType(const Buffer& buffer, const DataType& dtype) {
+Buffer WithDType(const Buffer& buffer, DLDataType dtype) {
   ffi::ObjectPtr<BufferNode> new_buffer = ffi::make_object<BufferNode>(*buffer.get());
-  new_buffer->dtype = dtype;
+  new_buffer->dtype = PrimType(dtype);
   const auto* ptr_type = TVM_TYPE_AS(buffer->data->type_annotation, PointerTypeNode);
   new_buffer->data =
       Var(buffer->data->name_hint, PointerType(PrimType(dtype), ptr_type->storage_scope));
diff --git a/src/s_tir/schedule/transform.h b/src/s_tir/schedule/transform.h
index da6d54a96629..9a408845e8e5 100644
--- a/src/s_tir/schedule/transform.h
+++ b/src/s_tir/schedule/transform.h
@@ -61,7 +61,7 @@ Buffer WithScope(const Buffer& buffer, const ffi::String& scope);
- * \param scope The target data type.
+ * \param dtype The target data type.
  * \return The new buffer with target data type.
  */
-Buffer WithDType(const Buffer& buffer, const DataType& dtype);
+Buffer WithDType(const Buffer& buffer, DLDataType dtype);
 
 /*!
  * \brief Replaces the buffer within the specific sequence of regions
diff --git a/src/s_tir/schedule/utils.h b/src/s_tir/schedule/utils.h
index b50416c2e198..7509dad5bdbe 100644
--- a/src/s_tir/schedule/utils.h
+++ b/src/s_tir/schedule/utils.h
@@ -166,7 +166,7 @@ inline bool IsSingleStmt(const Stmt& stmt) {
  */
 inline IterVar IterVarFromLoop(const For& loop, ffi::String name, IterVarType iter_var_type) {
   return IterVar(Range::FromMinExtent(loop->min, loop->extent),
-                 Var(std::move(name), loop->loop_var.dtype()), iter_var_type);
+                 Var(std::move(name), loop->loop_var.ty()), iter_var_type);
 }
 
 /*!
@@ -241,7 +241,7 @@ inline ffi::Optional<Var> AnalyzeVarWithShift(const PrimExpr& expr,
   // match: "var - shift"
   if ((var - shift).Match(expr)) {
     IntImm result = shift.Eval();
-    *constant = IntImm(result->dtype, -result->value);
+    *constant = IntImm(result.ty(), -result->value);
     return var.Eval();
   }
   return std::nullopt;
diff --git a/src/s_tir/transform/bound_checker.cc b/src/s_tir/transform/bound_checker.cc
index ea0364c12823..364f4b3794c4 100644
--- a/src/s_tir/transform/bound_checker.cc
+++ b/src/s_tir/transform/bound_checker.cc
@@ -71,7 +71,7 @@ class BoundChecker : public StmtExprMutator {
 
   Stmt VisitStmt_(const AllocBufferNode* op) final {
     if (UpdateIsNeeded(op->buffer->data)) {
-      Update(op->buffer->data, op->buffer->shape, op->buffer->dtype);
+      Update(op->buffer->data, op->buffer->shape, op->buffer->dtype->dtype);
     }
     return StmtExprMutator::VisitStmt_(op);
   }
@@ -118,15 +118,17 @@ class BoundChecker : public StmtExprMutator {
     return (buffer_var.defined() && mem_to_shape_.count(buffer_var.get()));
   }
 
-  void Update(const Var& buffer_var, ffi::Array<PrimExpr> new_shape, const DataType& type) {
+  void Update(const Var& buffer_var, ffi::Array<PrimExpr> new_shape, DLDataType dtype) {
     // Sanity check at first.
     if (!ShapeIsValid(new_shape)) {
       return;
     }
 
+    int16_t lanes = static_cast<int16_t>(dtype.lanes);  // signed view so negatives are caught
+    TVM_FFI_ICHECK_GE(lanes, 0) << "Expected a non-negative lane count, got " << lanes;
     new_shape.MutateByApply([&](const PrimExpr& dim) {
       // Cast to uint64 to avoid potential overflow.
-      return IntImm(DataType::UInt(64), type.lanes()) * dim;
+      return IntImm(PrimType::UInt(64), lanes) * dim;
     });
     mem_to_shape_[buffer_var.get()] = new_shape;
   }
@@ -175,7 +177,9 @@ class BoundChecker : public StmtExprMutator {
   }
 
   bool IsValidScalar(const PrimExpr& expr) const {
-    return expr.defined() && expr.dtype().is_scalar();
+    if (!expr.defined()) return false;  // an undefined expr is never a valid scalar
+    PrimType expr_ty = expr.ty();
+    return !(expr_ty.IsFixedLengthVector() || expr_ty.IsScalableVector());
   }
 
   bool CanInstrument(const ffi::Array<PrimExpr>& indices, const Var& buffer_var) const {
@@ -210,8 +214,8 @@ class BoundChecker : public StmtExprMutator {
         upper_bound = analyzer_->Simplify(upper_bound);
 
         // Cast to the same type - signed, to be able to check lower bound.
-        index = Cast(DataType::Int(64), index);
-        upper_bound = Cast(DataType::Int(64), upper_bound);
+        index = Cast(PrimType::Int(64), index);
+        upper_bound = Cast(PrimType::Int(64), upper_bound);
 
         // Looks like a lower bound should always be zero after normalization.
         PrimExpr lower_bound = IntImm::Int64(0);
diff --git a/src/s_tir/transform/canonicalize_loop.cc b/src/s_tir/transform/canonicalize_loop.cc
index 9c10280eb6ef..9c18cb9c88d1 100644
--- a/src/s_tir/transform/canonicalize_loop.cc
+++ b/src/s_tir/transform/canonicalize_loop.cc
@@ -47,7 +47,7 @@ class LoopCanonicalizer : public StmtExprMutator {
       return StmtExprMutator::VisitStmt_(op);
     }
     const auto* loop_var = op->loop_var.get();
-    PrimExpr step = op->step.value_or(MakeConst(loop_var->dtype, 1));
+    PrimExpr step = op->step.value_or(MakeConst(loop_var->ty(), 1));
 
     // report warning for negative step, since it would be a forever loop
     if (!analyzer_->CanProveGreaterEqual(step, 1)) {
@@ -59,7 +59,7 @@ class LoopCanonicalizer : public StmtExprMutator {
     new_iter_info_[loop_var] = std::make_pair(step, op->min);
     auto n = CopyOnWrite(op);
     n->body = VisitStmt(op->body);
-    n->min = IntImm(loop_var->dtype, 0);
+    n->min = IntImm(loop_var->ty(), 0);
     n->extent = analyzer_->Simplify(ceildiv(op->extent, step));
     n->step = std::nullopt;
     new_iter_info_.erase(loop_var);
diff --git a/src/s_tir/transform/compact_buffer_region.cc b/src/s_tir/transform/compact_buffer_region.cc
index 4ea7b63bfe89..c7a6e0fd1fef 100644
--- a/src/s_tir/transform/compact_buffer_region.cc
+++ b/src/s_tir/transform/compact_buffer_region.cc
@@ -181,7 +181,7 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
 
   void VisitStmt_(const BindNode* op) final {
     StmtExprVisitor::VisitExpr(op->value);
-    if (arith::IsIndexType(op->value->dtype)) {
+    if (arith::IsIndexTypedExpr(op->value)) {
       dom_analyzer_->Bind(op->var, op->value);
       dom_map_.emplace(op->var.get(), arith::IntSet::SinglePoint(op->value));
     }
@@ -189,12 +189,12 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
 
   void VisitExpr_(const LetNode* op) final {
     StmtExprVisitor::VisitExpr(op->value);
-    if (arith::IsIndexType(op->value->dtype)) {
+    if (arith::IsIndexTypedExpr(op->value)) {
       dom_analyzer_->Bind(op->var, op->value);
       dom_map_.emplace(op->var.get(), arith::IntSet::SinglePoint(op->value));
     }
     StmtExprVisitor::VisitExpr(op->body);
-    if (arith::IsIndexType(op->value->dtype)) {
+    if (arith::IsIndexTypedExpr(op->value)) {
       dom_map_.erase(op->var.get());
     }
   }
@@ -322,7 +322,7 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
       ancestor_iters_.push_back(iter);
       Range dom = iter->dom;
       if (!dom.defined()) {  // dom is empty for legacy te schedule
-        dom = Range::FromMinExtent(IntImm(op->value->dtype, 0), op->value);
+        dom = Range::FromMinExtent(IntImm(op->value.ty(), 0), op->value);
       }
       dom_analyzer_->Bind(iter->var, dom);
       dom_map_.emplace(iter->var.get(), arith::IntSet::FromRange(dom));
@@ -367,8 +367,9 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
       }
       // Step 2. Relax the access region
       auto normalize_pred = [](const PrimExpr& pred) {
-        if (pred->dtype.is_bool()) return pred;
-        return pred != IntImm(pred->dtype, 0);
+        PrimType pred_ty = pred.ty();
+        if (pred_ty.MatchesCode(DLDataTypeCode::kDLBool)) return pred;
+        return pred != IntImm(pred_ty, 0);
       };
       PrimExpr predicate = dom_analyzer_->Simplify(std::accumulate(
           pending_conditions_.begin(), pending_conditions_.end(), PrimExpr(IntImm::Bool(true)),
@@ -439,7 +440,7 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
     for (size_t i = 0; i < nd_int_set.size(); ++i) {
       const arith::IntSet& int_set = nd_int_set[i];
       Range original =
-          Range(/*begin=*/IntImm(original_shape[i]->dtype, 0), /*end=*/original_shape[i]);
+          Range(/*begin=*/IntImm(original_shape[i].ty(), 0), /*end=*/original_shape[i]);
       Range range = int_set.CoverRange(original);
       PrimExpr min, extent;
       if (collect_inbound_) {
@@ -470,7 +471,7 @@ class BufferAccessRegionCollector : public StmtExprVisitor {
         // try estimate a constant upperbound on region's extent
         int64_t upperbound = dom_analyzer_->const_int_bound(extent)->max_value;
         if (upperbound != arith::ConstIntBound::kPosInf) {
-          extent = MakeConst(extent->dtype, upperbound);
+          extent = MakeConst(extent.ty(), upperbound);
         } else {
           result_region.Set(i, original);
           continue;
@@ -699,15 +700,15 @@ ffi::Array<PrimExpr> CalcStrides(const BufferAllocInfo& alloc_info,
   if (alloc_info.dim_aligns.size()) {
     TVM_FFI_ICHECK(alloc_info.dim_aligns.size() == shape.size());
     strides.resize(shape.size());
-    PrimExpr stride = MakeConst(shape[0].dtype(), 1);
+    PrimExpr stride = MakeConst(shape[0].ty(), 1);
     for (size_t i = shape.size(); i != 0; --i) {
       size_t dim = i - 1;
       DimAlignInfo info = alloc_info.dim_aligns[dim];
       int align_factor = info.align_factor;
       int align_offset = info.align_offset;
       if (align_factor != 0) {
-        PrimExpr factor = MakeConst(stride.dtype(), align_factor);
-        PrimExpr offset = MakeConst(stride.dtype(), align_offset);
+        PrimExpr factor = MakeConst(stride.ty(), align_factor);
+        PrimExpr offset = MakeConst(stride.ty(), align_offset);
         stride = stride + indexmod(factor + offset - indexmod(stride, factor), factor);
       }
       strides[dim] = stride;
diff --git a/src/s_tir/transform/default_gpu_schedule.cc b/src/s_tir/transform/default_gpu_schedule.cc
index 970d4c10d399..70f701668f51 100644
--- a/src/s_tir/transform/default_gpu_schedule.cc
+++ b/src/s_tir/transform/default_gpu_schedule.cc
@@ -131,10 +131,10 @@ tirx::PrimFunc WrapBareSBlockBody(const tirx::PrimFunc& func) {
   if (inner->IsInstance<tirx::ForNode>() || inner->IsInstance<tirx::SBlockRealizeNode>()) {
     return func;
   }
-  tvm::IntImm zero(tvm::DataType::Int(32), 0);
-  tvm::IntImm one(tvm::DataType::Int(32), 1);
-  tirx::Var loop_var("u", tvm::DataType::Int(32));
-  tirx::Var iter_var_var("vu", tvm::DataType::Int(32));
+  tvm::IntImm zero(tvm::PrimType::Int(32), 0);
+  tvm::IntImm one(tvm::PrimType::Int(32), 1);
+  tirx::Var loop_var("u", tvm::PrimType::Int(32));
+  tirx::Var iter_var_var("vu", tvm::PrimType::Int(32));
   tirx::IterVar new_iter(tvm::Range::FromMinExtent(zero, one), iter_var_var,
                          tirx::IterVarType::kDataPar);
   tirx::SBlock inner_block = realize->block;
diff --git a/src/s_tir/transform/inject_double_buffer.cc b/src/s_tir/transform/inject_double_buffer.cc
index 874cf6ca677a..63339096d65c 100644
--- a/src/s_tir/transform/inject_double_buffer.cc
+++ b/src/s_tir/transform/inject_double_buffer.cc
@@ -164,15 +164,15 @@ class DoubleBufferInjector : public StmtExprMutator {
             << "It is better to split with multiple of 2";
         TVM_FFI_ICHECK(is_zero(old_loop->min));
         PrimExpr zero = old_loop->min;
-        PrimExpr new_ext = old_loop->extent - MakeConst(old_loop->loop_var.dtype(), 1);
-        PrimExpr factor = MakeConst(new_ext.dtype(), split_loop_);
+        PrimExpr new_ext = old_loop->extent - MakeConst(old_loop->loop_var.ty(), 1);
+        PrimExpr factor = MakeConst(new_ext.ty(), split_loop_);
         PrimExpr outer_ext = new_ext / factor;
         PrimExpr tail_base = outer_ext * factor;
-        Var outer_var(old_loop->loop_var->name_hint + ".outer", old_loop->loop_var.dtype());
+        Var outer_var(old_loop->loop_var->name_hint + ".outer", old_loop->loop_var.ty());
         std::unordered_map<const VarNode*, PrimExpr> vmap;
         std::vector<Stmt> loop_seq;
         for (int32_t i = 0; i < split_loop_; ++i) {
-          vmap[old_loop->loop_var.get()] = outer_var * factor + MakeConst(factor.dtype(), i);
+          vmap[old_loop->loop_var.get()] = outer_var * factor + MakeConst(factor.ty(), i);
           loop_seq.emplace_back(Substitute(old_loop->body, vmap));
         }
         Stmt loop = For(outer_var, zero, outer_ext, old_loop->kind, SeqStmt::Flatten(loop_seq));
@@ -180,7 +180,7 @@ class DoubleBufferInjector : public StmtExprMutator {
         std::vector<Stmt> tail_seq;
         Stmt tail_body = StripDoubleBufferWrite()(old_loop->body);
         for (int32_t i = 0; i < split_loop_; ++i) {
-          PrimExpr idx = tail_base + MakeConst(tail_base.dtype(), i);
+          PrimExpr idx = tail_base + MakeConst(tail_base.ty(), i);
           vmap[old_loop->loop_var.get()] = idx;
           tail_seq.emplace_back(IfThenElse(idx < old_loop->extent, Substitute(tail_body, vmap)));
         }
@@ -274,11 +274,11 @@ class DoubleBufferInjector : public StmtExprMutator {
     }
     StorageEntry& e = it->second;
     e.loop = loop_nest_.back();
-    PrimExpr zero = IntImm(e.loop->loop_var.dtype(), 0);
-    PrimExpr one = IntImm(e.loop->loop_var.dtype(), 1);
-    PrimExpr two = IntImm(e.loop->loop_var.dtype(), 2);
+    PrimExpr zero = IntImm(e.loop->loop_var.ty(), 0);
+    PrimExpr one = IntImm(e.loop->loop_var.ty(), 1);
+    PrimExpr two = IntImm(e.loop->loop_var.ty(), 2);
     PrimExpr loop_shift = e.loop->loop_var + one;
-    e.switch_write_var = Var(e.loop->loop_var->name_hint + ".db", e.loop->loop_var.dtype());
+    e.switch_write_var = Var(e.loop->loop_var->name_hint + ".db", e.loop->loop_var.ty());
     e.switch_read_var = indexmod(e.loop->loop_var, two);
     in_double_buffer_scope_ = true;
     Stmt body = this->VisitStmt(op->body);
diff --git a/src/s_tir/transform/inject_permuted_layout.cc b/src/s_tir/transform/inject_permuted_layout.cc
index a816c43e32a0..a0e896f0dc6a 100644
--- a/src/s_tir/transform/inject_permuted_layout.cc
+++ b/src/s_tir/transform/inject_permuted_layout.cc
@@ -269,7 +269,7 @@ class PermutedLayoutInjector : private IRMutatorWithAnalyzer {
       auto new_access_ptr = HandleAccessPtrAndOffset(access_ptr, smem_offset);
       auto new_call = call.CopyOnWrite();
       new_call->args.Set(5, new_access_ptr);
-      new_call->args.Set(6, IntImm(smem_offset->dtype, 0));
+      new_call->args.Set(6, IntImm(smem_offset.ty(), 0));
       return call;
     } else if (call->op.same_as(mma_store_op)) {
       // TODO(yixin): mma_store is not fully tested yet
diff --git a/src/s_tir/transform/inject_ptx_async_copy.cc b/src/s_tir/transform/inject_ptx_async_copy.cc
index 500c2623be41..514439c9f8fb 100644
--- a/src/s_tir/transform/inject_ptx_async_copy.cc
+++ b/src/s_tir/transform/inject_ptx_async_copy.cc
@@ -56,10 +56,10 @@ class PTXAsyncCopyInjector : public StmtMutator {
                  PrimExpr predicate_value = PrimExpr()) {
     if (load->buffer.scope() == "global") {
       TVM_FFI_ICHECK(load->indices.size() == 1 && store->indices.size() == 1);
-      TVM_FFI_ICHECK(load->indices[0]->dtype.lanes() == store->indices[0]->dtype.lanes());
+      TVM_FFI_ICHECK(load->indices[0].ty().lanes() == store->indices[0].ty().lanes());
 
-      const int indices_lanes = load->indices[0]->dtype.lanes();
-      const int bytes = indices_lanes * load->buffer->dtype.bytes();
+      const int indices_lanes = load->indices[0].ty().lanes();
+      const int bytes = indices_lanes * ((load->buffer->dtype.bits() + 7) / 8);
 
       if (bytes == 4 || bytes == 8 || bytes == 16) {
         auto dst_elem_type = GetPointerType(store->buffer->data->type_annotation);
@@ -72,13 +72,13 @@ class PTXAsyncCopyInjector : public StmtMutator {
           // The only case where src and dst have different dtypes is when the dst shared memory
           // is a byte buffer generated by merging dynamic shared memory.
           TVM_FFI_ICHECK(store->buffer.scope() == "shared.dyn");
-          TVM_FFI_ICHECK(dst_elem_type.value() == DataType::UInt(8));
+          TVM_FFI_ICHECK((dst_elem_type.value() == DLDataType{kDLUInt, 8, 1}));
           // BufferStore/Load have the "pointer reinterpret" semantics according to their
           // "value" dtype. Their "indices" are supposed to be applied after such pointer cast,
           // for example: ((*float16)(byte_buffer))[buffer->indices] = fp16_value;
           // To replace BufferStore/Load with cp.async, we need to multiply the store index by
           // the byte size of the "value" dtype, to get the correct offset into the byte buffer.
-          index_factor = src_elem_type->bytes();
+          index_factor = (src_elem_type.value().bits + 7) / 8;
         }
 
         if (indices_lanes == 1) {
diff --git a/src/s_tir/transform/inject_ptx_ldg32.cc b/src/s_tir/transform/inject_ptx_ldg32.cc
index 7b63b22f6965..2d07aafc5446 100644
--- a/src/s_tir/transform/inject_ptx_ldg32.cc
+++ b/src/s_tir/transform/inject_ptx_ldg32.cc
@@ -115,8 +115,9 @@ class PTXRewriter : public StmtMutator {
     }
     has_buffer_1 = true;
     // addr[0] -> global_addr /  addr[1] -> local_addr
-    addr_buffer = decl_buffer({IntImm::Int32(2)}, DataType::Int(32), "addr", "local");
-    predicate_buffer = decl_buffer({IntImm::Int32(1)}, DataType::Bool(), "predicate", "local");
+    addr_buffer = decl_buffer({IntImm::Int32(2)}, DLDataType{kDLInt, 32, 1}, "addr", "local");
+    predicate_buffer =
+        decl_buffer({IntImm::Int32(1)}, DLDataType{kDLBool, 8, 1}, "predicate", "local");
   }
 
   bool has_buffer_1 = false, has_buffer_2 = false;
diff --git a/src/s_tir/transform/inject_software_pipeline.cc b/src/s_tir/transform/inject_software_pipeline.cc
index 4e4307ef1f18..7269c41f7a4c 100644
--- a/src/s_tir/transform/inject_software_pipeline.cc
+++ b/src/s_tir/transform/inject_software_pipeline.cc
@@ -120,7 +120,7 @@ class PipelineOpaqueAccessRewriter {
         ffi::Array<PrimExpr> new_args = call->args;
         const Buffer& new_buffer = (*it).second;
         new_args.Set(4, RewriteWmmaFragmentIndex(buffer, new_buffer, call->args[4]));
-        return Call(call->dtype, call->op, new_args, call->attrs, call->span);
+        return Call(call.ty(), call->op, new_args, call->attrs, call->span);
       }
     } else if (call->op.same_as(mma_sync)) {
       ffi::Array<PrimExpr> new_args = call->args;
@@ -134,7 +134,7 @@ class PipelineOpaqueAccessRewriter {
           new_args.Set(i * 2 + 1, new_index);
         }
       }
-      return Call(call->dtype, call->op, new_args, call->attrs, call->span);
+      return Call(call.ty(), call->op, new_args, call->attrs, call->span);
     } else if (call->op.same_as(access_ptr)) {
       return RewriteBufferAccess(call, {1});
     } else if (call->op.same_as(ptx_mma_legacy)) {
@@ -197,7 +197,7 @@ class PipelineOpaqueAccessRewriter {
         new_args.Set(i + 1, new_index);
       }
     }
-    return Call(call->dtype, call->op, new_args, call->attrs, call->span);
+    return Call(call.ty(), call->op, new_args, call->attrs, call->span);
   }
 
   const ffi::Map<Var, Buffer>& buffer_data_to_buffer_;
@@ -767,7 +767,7 @@ class PipelineRewriter : public StmtExprMutator {
           // If the async operation that this wait_queue is waiting on is predicated, and we cannot
           // prove that the predicate is always true, the precise wait count is only valid
           // at iterations where the predicate is true;
-          auto wait_count = Call(DataType::Int(32), builtin::if_then_else(),
+          auto wait_count = Call(PrimType::Int(32), builtin::if_then_else(),
                                  {state.predicate.value(), state.pending_wait.wait_count, 0});
           attach_wait_scope(state.pending_wait.insert_before, stage_id, wait_count);
         } else {
diff --git a/src/s_tir/transform/inject_virtual_thread.cc b/src/s_tir/transform/inject_virtual_thread.cc
index 035236e8af38..58133bc4999b 100644
--- a/src/s_tir/transform/inject_virtual_thread.cc
+++ b/src/s_tir/transform/inject_virtual_thread.cc
@@ -218,17 +218,18 @@ class VTInjector : public arith::IRMutatorWithAnalyzer {
   PrimExpr VisitExpr_(const CallNode* op) final {
     if (op->op.same_as(builtin::tvm_access_ptr())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 5U);
-      DataType dtype = op->args[0].dtype();
+      DLDataType dtype = op->args[0].ty()->dtype;
       const VarNode* buffer = op->args[1].as<VarNode>();
       auto it = alloc_remap_.find(buffer);
       if (it == alloc_remap_.end()) return StmtExprMutator::VisitExpr_(op);
       visit_touched_var_ = true;
       PrimExpr offset = this->VisitExpr(op->args[2]);
       PrimExpr extent = this->VisitExpr(op->args[3]);
-      PrimExpr stride = it->second / MakeConst(offset.dtype(), dtype.lanes());
+      PrimExpr stride = it->second / MakeConst(offset.ty(), static_cast<int16_t>(dtype.lanes));
       offset = RewriteIndex(offset, stride);
 
-      return Call(op->dtype, op->op, {op->args[0], op->args[1], offset, extent, op->args[4]});
+      return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op,
+                  {op->args[0], op->args[1], offset, extent, op->args[4]});
     } else if (op->op.same_as(builtin::tvm_context_id())) {
       return allow_share_ ? ffi::GetRef<PrimExpr>(op) : var_;
     } else {
@@ -465,15 +466,15 @@ class VTInjector : public arith::IRMutatorWithAnalyzer {
       // do unrolling if it is inside innermost content.
       ffi::Array<Stmt> seq;
       for (int i = 0; i < num_threads_; ++i) {
-        seq.push_back(Substitute(stmt, {{var_, MakeConst(var_.dtype(), i)}}));
+        seq.push_back(Substitute(stmt, {{var_, MakeConst(var_.ty(), i)}}));
       }
       return SeqStmt::Flatten(seq);
     } else {
       // insert a for loop
-      Var idx(var_->name_hint + ".s", var_->dtype);
+      Var idx(var_->name_hint + ".s", var_.ty());
       stmt = Substitute(stmt, {{var_, idx}});
-      return For(idx, IntImm(idx.dtype(), 0), MakeConst(idx.dtype(), num_threads_),
-                 ForKind::kSerial, stmt);
+      return For(idx, IntImm(idx.ty(), 0), MakeConst(idx.ty(), num_threads_), ForKind::kSerial,
+                 stmt);
     }
   }
 
diff --git a/src/s_tir/transform/lift_thread_binding.cc b/src/s_tir/transform/lift_thread_binding.cc
index 9aebcfe4b0ac..cd8be790c886 100644
--- a/src/s_tir/transform/lift_thread_binding.cc
+++ b/src/s_tir/transform/lift_thread_binding.cc
@@ -133,7 +133,7 @@ class ThreadBindingLifter : public StmtExprMutator {
       for (const auto& [iter_var, annotation] : it->second) {
         body = For(iter_var->var, iter_var->dom->min, iter_var->dom->extent,
                    ForKind::kThreadBinding, std::move(body),
-                   IterVar(Range(nullptr), Var(iter_var->thread_tag, iter_var->var->dtype),
+                   IterVar(Range(nullptr), Var(iter_var->thread_tag, iter_var->var.ty()),
                            kThreadIndex, iter_var->thread_tag),
                    annotation, std::nullopt);
       }
diff --git a/src/s_tir/transform/loop_partition.cc b/src/s_tir/transform/loop_partition.cc
index 87d31bae91d0..e755b8265b95 100644
--- a/src/s_tir/transform/loop_partition.cc
+++ b/src/s_tir/transform/loop_partition.cc
@@ -261,7 +261,7 @@ class PartitionFinder : public StmtExprVisitor {
       const IterVarNode* thread_axis = op->node.as<IterVarNode>();
       TVM_FFI_ICHECK(thread_axis);
       const VarNode* var = thread_axis->var.get();
-      IntSet dom = IntSet::FromRange(Range(IntImm(op->value.dtype(), 0), op->value));
+      IntSet dom = IntSet::FromRange(Range(IntImm(op->value.ty(), 0), op->value));
       hint_map_.insert({var, dom});
       relax_map_.insert({var, dom});
       StmtExprVisitor::VisitStmt_(op);
@@ -458,11 +458,11 @@ class LoopPartitioner : public StmtMutator {
     Stmt res;
     if (scope.rank == 1) {
       // threadIdx should be put into relax map, in case of divergence.
-      relax_map_.insert({var.get(), IntSet::Interval(IntImm(var.dtype(), 0), op->value - 1)});
+      relax_map_.insert({var.get(), IntSet::Interval(IntImm(var.ty(), 0), op->value - 1)});
       res = StmtMutator::VisitStmt_(op);
       relax_map_.erase(var.get());
     } else {
-      hint_map_.insert({var.get(), IntSet::Interval(IntImm(var.dtype(), 0), op->value - 1)});
+      hint_map_.insert({var.get(), IntSet::Interval(IntImm(var.ty(), 0), op->value - 1)});
       res = StmtMutator::VisitStmt_(op);
       hint_map_.erase(var.get());
     }
@@ -774,7 +774,7 @@ inline Stmt LoopPartitioner::MakeFor(const ffi::Object* node, PrimExpr extent, S
   } else {
     TVM_FFI_ICHECK(for_node->kind != ForKind::kThreadBinding);
     auto new_loop = ffi::make_object<ForNode>(*for_node);
-    new_loop->min = IntImm(for_node->min.dtype(), 0);
+    new_loop->min = IntImm(for_node->min.ty(), 0);
     new_loop->extent = extent;
     new_loop->body = body;
     return For(new_loop);
diff --git a/src/s_tir/transform/lower_async_dma.cc b/src/s_tir/transform/lower_async_dma.cc
index 89660d4fefd2..72e16a7ed039 100644
--- a/src/s_tir/transform/lower_async_dma.cc
+++ b/src/s_tir/transform/lower_async_dma.cc
@@ -76,11 +76,17 @@ class AsyncDMALowerer : public arith::IRMutatorWithAnalyzer {
 
     auto src = BufferLoad(mem_copy->source->buffer, {src_min});
     auto dst = BufferLoad(mem_copy->dest->buffer, {dst_min});
+    DLDataType src_dtype = src.ty()->dtype;
+    int src_bytes = (src_dtype.bits * static_cast<int16_t>(src_dtype.lanes) + 7) / 8;
+    PrimExpr dst_nbytes = dst_extent * src_bytes;
     return Evaluate(
-        Call(DataType::Int(32), builtin::dma_copy(),
-             {async_queue_id_.value(), Call(DataType::Handle(), builtin::address_of(), {dst}),
-              Call(DataType::Handle(), builtin::address_of(), {src}),
-              dst_extent * src->dtype.bytes(), dma_bypass_cache_}));
+        Call(PrimType::Int(32), builtin::dma_copy(),
+             ffi::Array<PrimExpr>{
+                 async_queue_id_.value(),
+                 Call(PrimType::Handle(), builtin::address_of(), ffi::Array<PrimExpr>{dst}, Span()),
+                 Call(PrimType::Handle(), builtin::address_of(), ffi::Array<PrimExpr>{src}, Span()),
+                 dst_nbytes, dma_bypass_cache_},
+             Span()));
   }
 
   Stmt VisitStmt_(const AttrStmtNode* op) final {
@@ -119,7 +125,7 @@ class AsyncDMALowerer : public arith::IRMutatorWithAnalyzer {
         return previsit;
       }
       auto call_dma_wait =
-          Evaluate(Call(DataType::Int(32), builtin::dma_wait(), {queue_id, async_wait->value}));
+          Evaluate(Call(PrimType::Int(32), builtin::dma_wait(), {queue_id, async_wait->value}));
 
       // concatenate the call with the body and return
       return SeqStmt({call_dma_wait, arith::IRMutatorWithAnalyzer::VisitStmt(async_wait->body)});
@@ -147,9 +153,9 @@ class AsyncDMALowerer : public arith::IRMutatorWithAnalyzer {
       auto result = arith::IRMutatorWithAnalyzer::VisitStmt_(op);
       if (dmas_in_group_ > 1) {
         auto call_dma_start_group = Evaluate(
-            Call(DataType::Int(32), builtin::dma_start_group(), {async_queue_id_.value()}));
+            Call(PrimType::Int(32), builtin::dma_start_group(), {async_queue_id_.value()}));
         auto call_dma_end_group =
-            Evaluate(Call(DataType::Int(32), builtin::dma_end_group(), {async_queue_id_.value()}));
+            Evaluate(Call(PrimType::Int(32), builtin::dma_end_group(), {async_queue_id_.value()}));
         result = SeqStmt({call_dma_start_group, result, call_dma_end_group});
       }
 
diff --git a/src/s_tir/transform/lower_cross_thread_reduction.cc b/src/s_tir/transform/lower_cross_thread_reduction.cc
index 626158203d2c..56c938e689e9 100644
--- a/src/s_tir/transform/lower_cross_thread_reduction.cc
+++ b/src/s_tir/transform/lower_cross_thread_reduction.cc
@@ -147,7 +147,7 @@ ffi::Array<Buffer> MakeScratchpads(const ffi::Array<Buffer>& reduction_buffers,
   for (const Buffer& buffer : reduction_buffers) {
     ffi::String name = is_cross_thread_buffer ? "cross" : "in";
     name = name + "_thread_" + buffer->name;
-    new_buffers.push_back(Buffer(/*ptr=*/Var(name, PointerType(PrimType(buffer->dtype), "local")),
+    new_buffers.push_back(Buffer(/*ptr=*/Var(name, PointerType(buffer->dtype, "local")),
                                  /*dtype=*/buffer->dtype,
                                  /*shape=*/{IntImm::Int32(1)},
                                  /*strides=*/{IntImm::Int32(1)},
@@ -377,7 +377,7 @@ Stmt TransformReductionBlock(const SBlockRealizeNode* realize,
     ffi::Array<PrimExpr> parameters;
     parameters.reserve(reduction_loops.size() + 4);
     // 1-st argument: number of buffers
-    parameters.push_back(IntImm(DataType::UInt(32), n_buffers));
+    parameters.push_back(IntImm(PrimType::UInt(32), n_buffers));
     // Next `n_buffers` arguments: sources
     if (it_buffers.defined()) {
       for (int i = 0; i < n_buffers; ++i) {
@@ -424,7 +424,7 @@ Stmt TransformReductionBlock(const SBlockRealizeNode* realize,
                         /*attr_key=*/s_tir::attr::reduce_scope,
                         /*value=*/ConstHandle(0),
                         /*body=*/
-                        Evaluate(Call(/*dtype=*/DataType::Handle(),
+                        Evaluate(Call(/*dtype=*/PrimType::Handle(),
                                       /*op=*/tirx::builtin::tvm_thread_allreduce(),
                                       /*args=*/std::move(parameters)))))));
   }
@@ -507,7 +507,7 @@ Stmt TransformReductionBlock(const SBlockRealizeNode* realize,
     if (wb_buffers[0].scope() != "local") {
       for (const ForNode* loop : reduction_loops) {
         if (loop->thread_binding.defined()) {
-          wb_predicate = wb_predicate && (loop->loop_var == IntImm(loop->loop_var->dtype, 0));
+          wb_predicate = wb_predicate && (loop->loop_var == IntImm(loop->loop_var.ty(), 0));
         }
       }
     }
@@ -862,7 +862,7 @@ class CrossThreadReductionTransformer : public StmtMutator {
     loop_vars.reserve(unbound_thread2range.size());
     for (auto [scope, range] : unbound_thread2range) {
       std::string dim_index(1, static_cast<char>(scope.dim_index + 'x'));
-      Var loop_var("t" + dim_index, range->min->dtype);
+      Var loop_var("t" + dim_index, range->min.ty());
       loop_vars.push_back(loop_var);
       predicate = (loop_var == range->min) && predicate;
     }
@@ -882,7 +882,7 @@ class CrossThreadReductionTransformer : public StmtMutator {
           /*kind=*/ForKind::kThreadBinding,                   //
           /*body=*/body,                                      //
           /*thread_binding=*/
-          IterVar(Range(), Var("", loop_vars[i]->dtype), IterVarType::kThreadIndex,
+          IterVar(Range(), Var("", loop_vars[i].ty()), IterVarType::kThreadIndex,
                   "threadIdx." + dim_index),
           /*annotations=*/{},
           /*step=*/std::nullopt);
diff --git a/src/s_tir/transform/lower_match_buffer.cc b/src/s_tir/transform/lower_match_buffer.cc
index f8e30e643494..2bedda4b4491 100644
--- a/src/s_tir/transform/lower_match_buffer.cc
+++ b/src/s_tir/transform/lower_match_buffer.cc
@@ -42,7 +42,7 @@ class MatchBufferLower : public StmtExprMutator {
   explicit MatchBufferLower(const PrimFunc& func) {
     for (const Var& param : func->params) {
       // Mark input var as const variable.
-      if (!param.dtype().is_handle()) var_map_.Set(param, param);
+      if (!param.ty().IsHandle()) var_map_.Set(param, param);
     }
   }
 
@@ -212,7 +212,7 @@ class MatchBufferLower : public StmtExprMutator {
         // Non-zero elem_offset is ill-defined for non-flat memory.
         // If needed in the future, will require `ffi::Array<PrimExpr>
         // elem_offsets`, with one offset for each flattened index.
-        Bind(buffer->elem_offset, IntImm(buffer->elem_offset.dtype(), 0));
+        Bind(buffer->elem_offset, IntImm(buffer->elem_offset.ty(), 0));
       }
     }
 
@@ -223,7 +223,7 @@ class MatchBufferLower : public StmtExprMutator {
     if (!buffer->strides.empty()) {
       TVM_FFI_ICHECK_EQ(buffer->strides.size(), buffer->shape.size());
       if (source_buffer->strides.empty()) {
-        PrimExpr stride = MakeConst(buffer->strides.back().dtype(), 1);
+        PrimExpr stride = MakeConst(buffer->strides.back().ty(), 1);
         for (size_t i = buffer->shape.size(); i > 0; --i) {
           const PrimExpr& shape = source_buffer->shape[i - 1 + offset];
           Bind(buffer->strides[i - 1], stride, buffer->name + ".strides_" + std::to_string(i - 1));
@@ -246,13 +246,16 @@ class MatchBufferLower : public StmtExprMutator {
   }
 
   void Bind(const PrimExpr& arg, PrimExpr value, const std::string& arg_name = "argument") {
-    if (arg.dtype() != value.dtype()) {
-      if (arg.dtype().is_int() && value.dtype().is_int() &&
-          arg.dtype().lanes() == value.dtype().lanes()) {
-        value = cast(arg.dtype(), value);
+    PrimType arg_ty = arg.ty();
+    PrimType value_ty = value.ty();
+    if (arg_ty->dtype != value_ty->dtype) {
+      bool same_lanes = arg_ty.lanes() == value_ty.lanes();
+      if (arg_ty.MatchesCode(DLDataTypeCode::kDLInt) &&
+          value_ty.MatchesCode(DLDataTypeCode::kDLInt) && same_lanes) {
+        value = cast(arg_ty, value);
       } else {
-        TVM_FFI_ICHECK_EQ(arg.dtype(), value.dtype())
-            << "The data type mismatched: " << arg->dtype << " vs. " << value->dtype;
+        TVM_FFI_ICHECK_EQ(arg_ty->dtype, value_ty->dtype)
+            << "The data type mismatched: " << arg_ty->dtype << " vs. " << value_ty->dtype;
       }
     }
     // Handle recursive case
diff --git a/src/s_tir/transform/lower_opaque_block.cc b/src/s_tir/transform/lower_opaque_block.cc
index 7560b3f33bb1..0f1c810b67c4 100644
--- a/src/s_tir/transform/lower_opaque_block.cc
+++ b/src/s_tir/transform/lower_opaque_block.cc
@@ -131,8 +131,8 @@ class OpaqueBlockLower : public StmtExprMutator {
 
     } else {
       PrimExpr expr = it->second;
-      if (expr.dtype() != var.dtype()) {
-        expr = tvm::cast(var.dtype(), std::move(expr));
+      if (expr.ty() != var.ty()) {
+        expr = tvm::cast(var.ty(), std::move(expr));
       }
       return expr;
     }
diff --git a/src/s_tir/transform/lower_thread_allreduce.cc b/src/s_tir/transform/lower_thread_allreduce.cc
index 0473690b7afa..ca3ff8699b48 100644
--- a/src/s_tir/transform/lower_thread_allreduce.cc
+++ b/src/s_tir/transform/lower_thread_allreduce.cc
@@ -180,14 +180,14 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
     TVM_FFI_ICHECK_EQ(size, size_of_args->value);
     ffi::Array<PrimExpr> inits = combiner->identity_element;
     std::vector<PrimExpr> values(size);
-    std::vector<DataType> types(size);
+    std::vector<DLDataType> dtypes(size);
     PrimExpr cond = call->args[size + 1];
     for (size_t idx = 0; idx < size; ++idx) {
       values[idx] = call->args[1 + idx];
       if (!is_one(cond)) {
         values[idx] = Select(cond, values[idx], inits[idx]);
       }
-      types[idx] = values[idx].dtype();
+      dtypes[idx] = values[idx].ty()->dtype;
     }
     std::vector<Buffer> buffers(size);
     for (size_t idx = 0; idx < size; ++idx) {
@@ -305,15 +305,14 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
     // In the second stage we use the first 16 lanes of the first warp to reduce
     // the remaining elements, and this reduction can also be optimized by
     // shuffle_down warp-level primitives.
-    PrimExpr zero_index = IntImm(reduce_index->dtype, 0);
-    if (IsWarpReduction(types, group_extent, reduce_extent, contiguous_reduce_extent)) {
+    PrimExpr zero_index = IntImm(reduce_index.ty(), 0);
+    if (IsWarpReduction(dtypes, group_extent, reduce_extent, contiguous_reduce_extent)) {
       std::vector<PrimExpr> reduce_results;
-      DataType mask_dtype = DataType::UInt(32);
-      PrimExpr mask = Call(mask_dtype, builtin::tvm_warp_activemask(), {});
+      PrimExpr mask = Call(PrimType::UInt(32), builtin::tvm_warp_activemask(), {});
 
       if (reduce_extent <= warp_size_) {
         std::tie(reduce_results, new_alloc_bufs) =
-            MakeWarpAllreduce(values, types, combiner, reduce_index, reduce_extent, group_index,
+            MakeWarpAllreduce(values, dtypes, combiner, reduce_index, reduce_extent, group_index,
                               mask, std::nullopt, &seq);
 
         // Broadcast the reduction result from lane 0 to all other lanes.
@@ -322,7 +321,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
         for (size_t i = 0; i < size; ++i) {
           Buffer buf = reduce_results[i].as_or_throw<BufferLoad>()->buffer;
           PrimExpr val = BufferLoad(buf, {zero_index});
-          TVM_FFI_ICHECK_EQ(val->dtype, types[i]);
+          TVM_FFI_ICHECK_EQ(val->ty()->dtype, dtypes[i]);
           PrimExpr splat = WarpShuffle(builtin::tvm_warp_shuffle(), new_alloc_bufs.back(), val,
                                        reduce_extent * group_index);
           seq.push_back(BufferStore(buf, splat, {zero_index}));
@@ -336,7 +335,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
         staging_shared_bufs.reserve(size);
         for (size_t i = 0; i < size; ++i) {
           Buffer staging_shared_buf = decl_buffer(
-              /*shape=*/{MakeConst(reduce_index->dtype, n_warps * group_extent)},
+              /*shape=*/{MakeConst(reduce_index.ty(), n_warps * group_extent)},
               /*dtype=*/buffers[i]->dtype, /*name=*/"red_buf_staging", /*storage_scope=*/"shared");
           staging_shared_bufs.push_back(staging_shared_buf);
           new_alloc_bufs.push_back(staging_shared_buf);
@@ -344,7 +343,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
 
         // 2. First round of allreduce.
         std::tie(reduce_results, local_bufs) =
-            MakeWarpAllreduce(values, types, combiner, reduce_index, warp_size_, group_index, mask,
+            MakeWarpAllreduce(values, dtypes, combiner, reduce_index, warp_size_, group_index, mask,
                               std::nullopt, &seq);
         new_alloc_bufs.insert(new_alloc_bufs.end(), local_bufs.begin(), local_bufs.end());
 
@@ -369,8 +368,8 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
                                  /*indices=*/{group_index * n_warps + reduce_index});
         }
         std::tie(reduce_results, local_bufs) = MakeWarpAllreduce(
-            values, types, combiner, reduce_index, n_warps, group_index, mask,
-            /*predicate=*/reduce_index < MakeConst(reduce_index->dtype, n_warps), &seq);
+            values, dtypes, combiner, reduce_index, n_warps, group_index, mask,
+            /*predicate=*/reduce_index < MakeConst(reduce_index.ty(), n_warps), &seq);
         new_alloc_bufs.insert(new_alloc_bufs.end(), local_bufs.begin(), local_bufs.end());
 
         // 5. Create shared memory buffer(s) of `group_extent` elements, storing
@@ -380,7 +379,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
         for (size_t i = 0; i < size; ++i) {
           new_alloc_bufs.push_back(reduce_results[i].as_or_throw<BufferLoad>()->buffer);
           Buffer broadcast_shared_buf = decl_buffer(
-              /*shape=*/{MakeConst(reduce_index->dtype, group_extent)},
+              /*shape=*/{MakeConst(reduce_index.ty(), group_extent)},
               /*dtype=*/buffers[i]->dtype, /*name=*/"red_result", /*storage_scope=*/"shared");
           write_result.push_back(
               BufferStore(broadcast_shared_buf, reduce_results[i], {group_index}));
@@ -395,7 +394,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
       for (size_t i = 0; i < size; ++i) {
         TVM_FFI_ICHECK(!load_remap_.count(buffers[i]->data.get()));
         Buffer buf = reduce_results[i].as_or_throw<BufferLoad>()->buffer;
-        TVM_FFI_ICHECK_EQ(reduce_results[i]->dtype, types[i]);
+        TVM_FFI_ICHECK_EQ(reduce_results[i].ty()->dtype, dtypes[i]);
         load_remap_[buffers[i]->data.get()] = reduce_results[i];
 
         // The AllocBuffer doesn't need to be emitted here since alloc_remap_
@@ -418,20 +417,20 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
       // previous iteration on the same buffer.
       seq.emplace_back(SyncThread("shared"));
       for (size_t idx = 0; idx < size; ++idx) {
-        shared_bufs[idx] = decl_buffer({IntImm(group_index->dtype, group_extent * reduce_extent)},
-                                       types[idx], "red_buf" + std::to_string(idx), "shared");
+        shared_bufs[idx] = decl_buffer({IntImm(group_index.ty(), group_extent * reduce_extent)},
+                                       dtypes[idx], "red_buf" + std::to_string(idx), "shared");
         seq.emplace_back(BufferStore(shared_bufs[idx], values[idx],
                                      {BufIndex(reduce_index, group_index, reduce_extent)}));
       }
       seq.emplace_back(SyncThread("shared"));
-      seq.emplace_back(MakeBufAllreduce(combiner, types, shared_bufs, reduce_index, group_index,
+      seq.emplace_back(MakeBufAllreduce(combiner, dtypes, shared_bufs, reduce_index, group_index,
                                         reduce_extent, group_extent, contiguous_reduce_extent));
       for (size_t idx = 0; idx < size; ++idx) {
         TVM_FFI_ICHECK(!load_remap_.count(buffers[idx]->data.get()));
-        PrimExpr pred = MakeConst(DataType::Bool(types[idx].lanes()), true);
+        PrimExpr pred = MakeConst(PrimType::Bool(static_cast<int16_t>(dtypes[idx].lanes)), true);
         BufferLoad load(shared_bufs[idx],
-                        {BufIndex(IntImm(reduce_index.dtype(), 0), group_index, reduce_extent)});
-        TVM_FFI_ICHECK_EQ(load->dtype, types[idx]);
+                        {BufIndex(IntImm(reduce_index.ty(), 0), group_index, reduce_extent)});
+        TVM_FFI_ICHECK_EQ(load->ty()->dtype, dtypes[idx]);
         load_remap_[buffers[idx]->data.get()] = load;
         alloc_remap_[buffers[idx]->data.get()] = shared_bufs[idx];
         var_remap_[buffers[idx]->data.get()] = shared_bufs[idx]->data;
@@ -455,7 +454,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
 
   std::pair<std::vector<PrimExpr>, std::vector<Buffer>> MakeWarpAllreduce(
       std::vector<PrimExpr> src_values,                  //
-      std::vector<DataType> dtypes,                      //
+      std::vector<DLDataType> dtypes,                    //
       const CommReducerNode* combiner,                   //
       PrimExpr reduce_index, int reduce_extent,          //
       PrimExpr group_index,                              //
@@ -496,7 +495,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
     // active channels.
     ffi::Optional<Buffer> mask_buffer;
     if (need_warp_shuffle_mask_) {
-      mask_buffer = decl_buffer(shape, mask->dtype, "mask", "local");
+      mask_buffer = decl_buffer(shape, mask.ty()->dtype, "mask", "local");
       seq->emplace_back(BufferStore(mask_buffer.value(), mask, zero_indices));
       // Push the buffer description.  Later this will have an
       // allocation built for it.
@@ -514,7 +513,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
       for (int i = 0; i < n_buffers; ++i) {
         Buffer shared_buf = shared_bufs[i];
         BufferLoad val(shared_buf, zero_indices);
-        TVM_FFI_ICHECK_EQ(val->dtype, dtypes[i]);
+        TVM_FFI_ICHECK_EQ(val->ty()->dtype, dtypes[i]);
         a.push_back(val);
 
         // __shfl_*sync calls shall not appear in if_then_else expressions
@@ -535,7 +534,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
         seq->push_back(s);
 
         BufferLoad load = BufferLoad(local_buf, zero_indices);
-        TVM_FFI_ICHECK_EQ(load->dtype, dtypes[i]);
+        TVM_FFI_ICHECK_EQ(load->ty()->dtype, dtypes[i]);
         b.push_back(load);
       }
 
@@ -574,7 +573,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
   }
 
   // make allreduce.
-  Stmt MakeBufAllreduce(const CommReducerNode* combiner, const std::vector<DataType>& types,
+  Stmt MakeBufAllreduce(const CommReducerNode* combiner, const std::vector<DLDataType>& dtypes,
                         const ffi::Array<Buffer>& shared_bufs, PrimExpr reduce_index,
                         PrimExpr group_index, int reduce_extent, int group_extent,
                         int contiguous_reduce_extent) {
@@ -594,11 +593,11 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
       for (size_t i = 0; i < size; ++i) {
         BufferLoad b_load(shared_bufs[i],
                           {BufIndex(reduce_index + offset, group_index, reduce_extent)});
-        TVM_FFI_ICHECK_EQ(b_load->dtype, types[i]);
+        TVM_FFI_ICHECK_EQ(b_load->ty()->dtype, dtypes[i]);
         b.push_back(b_load);
 
         BufferLoad a_load(shared_bufs[i], {buf_index});
-        TVM_FFI_ICHECK_EQ(a_load->dtype, types[i]);
+        TVM_FFI_ICHECK_EQ(a_load->ty()->dtype, dtypes[i]);
         a.push_back(a_load);
       }
       ffi::Array<PrimExpr> ret = (*combiner)(a, b);
@@ -658,7 +657,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
         for (auto expr : loads) {
           Var var(
               "w_" + std::to_string(reduce_align) + "_" + std::to_string(in_warp_local_vars.size()),
-              expr->dtype);
+              expr.ty());
           in_warp_local_vars.push_back(var);
         }
 
@@ -717,7 +716,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
   }
   // sync thread op.
   static Stmt SyncThread(const std::string& sync) {
-    return Evaluate(Call(DataType::Int(32), builtin::tvm_storage_sync(), {StringImm(sync)}));
+    return Evaluate(Call(PrimType::Int(32), builtin::tvm_storage_sync(), {StringImm(sync)}));
   }
 
   // Emit warp shuffle  calls.
@@ -732,14 +731,14 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
     }
     PrimExpr width = IntImm::Int32(warp_size_);
     ffi::Array<PrimExpr> args{mask, val, delta_or_lane, width, width};
-    return Call(val.dtype(), op, args);
+    return Call(val.ty(), op, args);
   }
 
   // Check if we can use warp level reduction.
   //
   // Note: The ROCm backend will only have warp reductions for now.
   // Also, the warp/wavefront size differs (64 on rocm, 32 on cuda and metal).
-  bool IsWarpReduction(const std::vector<DataType>& types, int group_extent, int reduce_extent,
+  bool IsWarpReduction(const std::vector<DLDataType>& dtypes, int group_extent, int reduce_extent,
                        int contiguous_reduce_extent) {
     if ((target_->kind->name != "cuda") && (target_->kind->name != "rocm") &&
         (target_->kind->name != "metal") && (target_->kind->name != "webgpu")) {
@@ -750,19 +749,22 @@ class ThreadAllreduceBuilder final : public StmtExprMutator {
 
     // rocm only supports 32 bit operands for shuffling at the moment
     if ((target_->kind->name == "rocm") &&
-        (std::any_of(types.begin(), types.end(), [](DataType ty) {
-          if (ty.is_fixed_length_vector()) return ty.bits() * ty.lanes() != 32;
-          return ty.bits() != 32;
+        (std::any_of(dtypes.begin(), dtypes.end(), [](DLDataType dtype) {
+          int16_t lanes = static_cast<int16_t>(dtype.lanes);
+          if (lanes > 1) return dtype.bits * lanes != 32;
+          return dtype.bits != 32;
         }))) {
       return false;
     }
 
     // Supported types:
     // {u}int, {u}long, {u}long long, float, double, half/half2
-    if (std::any_of(types.begin(), types.end(), [](DataType ty) {
-          if (ty.is_float16()) return ty.lanes() > 2;
-          if (ty.is_fixed_length_vector()) return true;
-          return ty.bytes() < 4 || ty.bytes() > 8;
+    if (std::any_of(dtypes.begin(), dtypes.end(), [](DLDataType dtype) {
+          int16_t lanes = static_cast<int16_t>(dtype.lanes);
+          if (dtype.code == kDLFloat && dtype.bits == 16) return lanes > 2;
+          if (lanes > 1) return true;
+          int bytes = (dtype.bits * lanes + 7) / 8;
+          return bytes < 4 || bytes > 8;
         })) {
       return false;
     }
diff --git a/src/s_tir/transform/lower_vtcm_alloc.cc b/src/s_tir/transform/lower_vtcm_alloc.cc
index cd33d870628f..eb9ecefe7e49 100644
--- a/src/s_tir/transform/lower_vtcm_alloc.cc
+++ b/src/s_tir/transform/lower_vtcm_alloc.cc
@@ -43,9 +43,9 @@ class VtcmAllocator : public StmtExprMutator {
       ffi::Array<PrimExpr> args;
       args.push_back(StringImm(storage_scope));
       args.push_back(IntImm::Int64(op->buffer->shape.size()));
-      args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(), op->buffer->shape));
+      args.push_back(Call(PrimType::Handle(), builtin::tvm_stack_make_shape(), op->buffer->shape));
       return Bind(op->buffer->data,
-                  Call(op->buffer->data.dtype(), builtin::nd_mem_alloc_with_scope(), args));
+                  Call(op->buffer->data.ty(), builtin::nd_mem_alloc_with_scope(), args));
     }
     return StmtExprMutator::VisitStmt_(op);
   }
diff --git a/src/s_tir/transform/memhammer_tensorcore_rewrite.cc b/src/s_tir/transform/memhammer_tensorcore_rewrite.cc
index 0e74dc1d0b17..25988c2637a5 100644
--- a/src/s_tir/transform/memhammer_tensorcore_rewrite.cc
+++ b/src/s_tir/transform/memhammer_tensorcore_rewrite.cc
@@ -105,8 +105,9 @@ ffi::Array<Range> RelaxIndices(const ffi::Array<PrimExpr>& indices,
  */
 Stmt RewriteWmmaLoad(Stmt stmt) {
   using arith::IntSet;
-  const DataType dtype = DataType::Float(16);
-  const DataType int32 = DataType::Int(32);
+  const PrimType dtype_ty = PrimType::Float(16);
+  const DLDataType dtype = dtype_ty->dtype;
+  const PrimType int32_ty = PrimType::Int(32);
 
   Stmt body = stmt;
   std::vector<const ForNode*> loops;
@@ -128,21 +129,21 @@ Stmt RewriteWmmaLoad(Stmt stmt) {
   Buffer tgt_buffer = buf_store->buffer;
   std::string layout = tgt_buffer.scope() == "wmma.matrix_a" ? "row_major" : "col_major";
   Buffer new_src_buffer(
-      /*data=*/Var("src", PointerType(PrimType(dtype), src_buffer.scope())),
+      /*data=*/Var("src", PointerType(dtype_ty, src_buffer.scope())),
       /*dtype=*/dtype,
       /*shape=*/{IntImm::Int32(16), IntImm::Int32(16)},
-      /*strides=*/{Var("s1", int32), Var("s0", int32)},
-      /*elem_offset=*/Var("src_elem_offset", int32),
+      /*strides=*/{Var("s1", int32_ty), Var("s0", int32_ty)},
+      /*elem_offset=*/Var("src_elem_offset", int32_ty),
       /*name=*/"src",
       /*data_alignment=*/64,
       /*offset_factor=*/16,
       /*buffer_type=*/kDefault);
   Buffer new_tgt_buffer(
-      /*data=*/Var("tgt", PointerType(PrimType(dtype), tgt_buffer.scope())),
+      /*data=*/Var("tgt", PointerType(dtype_ty, tgt_buffer.scope())),
       /*dtype=*/dtype,
       /*shape=*/{IntImm::Int32(16), IntImm::Int32(16)},
       /*strides=*/{},
-      /*elem_offset=*/Var("tgt_elem_offset", int32),
+      /*elem_offset=*/Var("tgt_elem_offset", int32_ty),
       /*name=*/"tgt",
       /*data_alignment=*/64,
       /*offset_factor=*/16,
@@ -160,7 +161,7 @@ Stmt RewriteWmmaLoad(Stmt stmt) {
           /*name_hint=*/"wmma_load",
           /*body=*/
           Evaluate(Call(
-              /*data=*/runtime::DataType::Handle(),
+              /*data=*/PrimType::Handle(),
               /*op=*/tvm_load_matrix_sync_op,
               {
                   /*0:*/ new_tgt_buffer->data,
@@ -171,7 +172,7 @@ Stmt RewriteWmmaLoad(Stmt stmt) {
                       floordiv(floormod(new_tgt_buffer->elem_offset, 256), 16),
                   /*5:*/
                   Call(
-                      /*dtype=*/runtime::DataType::Handle(),
+                      /*dtype=*/PrimType::Handle(),
                       /*op=*/builtin::tvm_access_ptr(),
                       /*args=*/
                       {
@@ -207,7 +208,7 @@ Stmt RewriteWmmaLoad(Stmt stmt) {
  */
 Stmt RewriteWmmaStore(Stmt stmt) {
   using arith::IntSet;
-  const DataType int32 = DataType::Int(32);
+  const PrimType int32_ty = PrimType::Int(32);
 
   Stmt body = stmt;
   std::vector<const ForNode*> loops;
@@ -236,22 +237,23 @@ Stmt RewriteWmmaStore(Stmt stmt) {
   Buffer src_buffer = buf_load->buffer;
   Buffer tgt_buffer = buf_store->buffer;
 
-  const DataType dtype = src_buffer->dtype;
+  PrimType dtype_ty = src_buffer->dtype;
+  const DLDataType dtype = dtype_ty->dtype;
 
-  Buffer new_src_buffer(/*data=*/Var("src", PointerType(PrimType(dtype), src_buffer.scope())),
+  Buffer new_src_buffer(/*data=*/Var("src", PointerType(dtype_ty, src_buffer.scope())),
                         /*dtype=*/dtype,
                         /*shape=*/{IntImm::Int32(16), IntImm::Int32(16)},
                         /*strides=*/{},
-                        /*elem_offset=*/Var("src_elem_offset", int32),
+                        /*elem_offset=*/Var("src_elem_offset", int32_ty),
                         /*name=*/"src",
                         /*data_alignment=*/64,
                         /*offset_factor=*/16,
                         /*buffer_type=*/kDefault);
-  Buffer new_tgt_buffer(/*data=*/Var("tgt", PointerType(PrimType(dtype), tgt_buffer.scope())),
+  Buffer new_tgt_buffer(/*data=*/Var("tgt", PointerType(dtype_ty, tgt_buffer.scope())),
                         /*dtype=*/dtype,
                         /*shape=*/{IntImm::Int32(16), IntImm::Int32(16)},
-                        /*strides=*/{Var("s1", int32), Var("s0", int32)},
-                        /*elem_offset=*/Var("tgt_elem_offset", int32),
+                        /*strides=*/{Var("s1", int32_ty), Var("s0", int32_ty)},
+                        /*elem_offset=*/Var("tgt_elem_offset", int32_ty),
                         /*name=*/"tgt",
                         /*data_alignment=*/64,
                         /*offset_factor=*/16,
@@ -268,7 +270,7 @@ Stmt RewriteWmmaStore(Stmt stmt) {
              /*writes=*/{BufferRegion(tgt_buffer, write_region)},
              /*name_hint=*/"wmma_store",
              Evaluate(Call(
-                 /*data=*/runtime::DataType::Handle(),
+                 /*data=*/PrimType::Handle(),
                  /*op=*/tvm_store_matrix_sync_op,
                  {/*0:*/ new_src_buffer->data,
                   /*1:*/ 16,
@@ -278,7 +280,7 @@ Stmt RewriteWmmaStore(Stmt stmt) {
                       floordiv(floormod(new_src_buffer->elem_offset, 256), 16),
                   /*5:*/
                   Call(
-                      /*data=*/runtime::DataType::Handle(),
+                      /*data=*/PrimType::Handle(),
                       /*op=*/builtin::tvm_access_ptr(),
                       {
                           /*0:*/ TypeAnnotation(new_tgt_buffer->dtype),
@@ -418,7 +420,7 @@ std::pair<Stmt, ffi::Optional<For>> TileMmaToGlobalBlock(Stmt stmt) {
  */
 Stmt RewriteMmaStore(Stmt stmt) {
   using arith::IntSet;
-  const DataType int32 = DataType::Int(32);
+  const PrimType int32_ty = PrimType::Int(32);
 
   // Step 1. Get inner loop body
   Stmt body = stmt;
@@ -458,21 +460,22 @@ Stmt RewriteMmaStore(Stmt stmt) {
   // Step 3.1. Generate new buffer
   Buffer src_buffer = buf_load->buffer;
   Buffer tgt_buffer = buf_store->buffer;
-  const DataType dtype = src_buffer->dtype;
-  Buffer new_src_buffer(/*data=*/Var("src", PointerType(PrimType(dtype), src_buffer.scope())),
+  PrimType dtype_ty = src_buffer->dtype;
+  const DLDataType dtype = dtype_ty->dtype;
+  Buffer new_src_buffer(/*data=*/Var("src", PointerType(dtype_ty, src_buffer.scope())),
                         /*dtype=*/dtype,
                         /*shape=*/{IntImm::Int32(8), IntImm::Int32(8)},
                         /*strides=*/{},
-                        /*elem_offset=*/Var("src_elem_offset", int32),
+                        /*elem_offset=*/Var("src_elem_offset", int32_ty),
                         /*name=*/"src",
                         /*data_alignment=*/64,
                         /*offset_factor=*/8,
                         /*buffer_type=*/kDefault);
-  Buffer new_tgt_buffer(/*data=*/Var("tgt", PointerType(PrimType(dtype), tgt_buffer.scope())),
+  Buffer new_tgt_buffer(/*data=*/Var("tgt", PointerType(dtype_ty, tgt_buffer.scope())),
                         /*dtype=*/dtype,
                         /*shape=*/{IntImm::Int32(8), IntImm::Int32(8)},
-                        /*strides=*/{Var("s1", int32), Var("s0", int32)},
-                        /*elem_offset=*/Var("tgt_elem_offset", int32),
+                        /*strides=*/{Var("s1", int32_ty), Var("s0", int32_ty)},
+                        /*elem_offset=*/Var("tgt_elem_offset", int32_ty),
                         /*name=*/"tgt",
                         /*data_alignment=*/64,
                         /*offset_factor=*/8,
diff --git a/src/s_tir/transform/merge_shared_memory_allocations.cc b/src/s_tir/transform/merge_shared_memory_allocations.cc
index c28f6b01c801..2ce0295d1675 100644
--- a/src/s_tir/transform/merge_shared_memory_allocations.cc
+++ b/src/s_tir/transform/merge_shared_memory_allocations.cc
@@ -338,9 +338,9 @@ class SharedMemoryRewriter : public StmtExprMutator {
    */
   Var MakeMergedBufferVar() {
     if (is_dynamic_) {
-      return Var("buf_dyn_shmem", PointerType(PrimType(DataType::UInt(8)), "shared.dyn"));
+      return Var("buf_dyn_shmem", PointerType(PrimType::UInt(8), "shared.dyn"));
     } else {
-      return Var("buf_shmem", PointerType(PrimType(DataType::UInt(8)), "shared"));
+      return Var("buf_shmem", PointerType(PrimType::UInt(8), "shared"));
     }
   }
 
@@ -390,8 +390,9 @@ class SharedMemoryRewriter : public StmtExprMutator {
       }
 
       // 7. Wrap with the merged-buffer AllocBuffer.
-      Buffer merged_buf(scope.merged_buf_var, DataType::UInt(8), {scope.merged_alloc_size}, {},
-                        PrimExpr(), scope.merged_buf_var->name_hint, 0, 0, BufferType::kDefault);
+      Buffer merged_buf(scope.merged_buf_var, DLDataType{kDLUInt, 8, 1}, {scope.merged_alloc_size},
+                        {}, PrimExpr(), scope.merged_buf_var->name_hint, 0, 0,
+                        BufferType::kDefault);
       ffi::Map<ffi::String, ffi::Any> annotations;
       if (scope.has_volatile_alloc) {
         annotations.Set(tirx::attr::kVolatile, true);
@@ -451,7 +452,7 @@ class SharedMemoryRewriter : public StmtExprMutator {
           << "and is to be run after "
           << "FlattenBuffer";
       ffi::Array<PrimExpr> indices = {
-          node->indices[0] + this->GetBufferOffset(node->buffer->data, node->buffer->dtype)};
+          node->indices[0] + this->GetBufferOffset(node->buffer->data, node->buffer->dtype->dtype)};
 
       auto writer = node.CopyOnWrite();
       writer->buffer = GetUpdatedBuffer(node->buffer);
@@ -490,7 +491,7 @@ class SharedMemoryRewriter : public StmtExprMutator {
     static const Op& ptx_cp_async_op = Op::Get("tirx.ptx.cp_async_raw");
     if (op->op.same_as(builtin::tvm_access_ptr())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 5U);
-      DataType dtype = op->args[0].dtype();
+      DLDataType dtype = op->args[0].ty()->dtype;
       Var buffer = op->args[1].as_or_throw<Var>();
       if (!IsAppropriateSharedMemory(buffer) || scope_stack_.empty() ||
           !scope_stack_.back().shmem_allocs.count(buffer.get())) {
@@ -500,7 +501,7 @@ class SharedMemoryRewriter : public StmtExprMutator {
 
       PrimExpr offset = this->VisitExpr(op->args[2]);
       PrimExpr extent = this->VisitExpr(op->args[3]);
-      return Call(op->dtype, op->op,
+      return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op,
                   {op->args[0], scope_stack_.back().merged_buf_var, extra_offset + offset, extent,
                    op->args[4]});
     } else if (op->op.same_as(ptx_cp_async_op)) {
@@ -510,7 +511,7 @@ class SharedMemoryRewriter : public StmtExprMutator {
       TVM_FFI_ICHECK(ptr_type) << "The buffer should be a pointer type.";
       const auto* prim_type = ptr_type->element_type.as<PrimTypeNode>();
       TVM_FFI_ICHECK(prim_type) << "The buffer should be a pointer to a primitive type.";
-      DataType dtype = DataType(prim_type->dtype);
+      DLDataType dtype = prim_type->dtype;
       if (!IsAppropriateSharedMemory(buffer) || scope_stack_.empty() ||
           !scope_stack_.back().shmem_allocs.count(buffer.get())) {
         return StmtExprMutator::VisitExpr_(op);
@@ -520,15 +521,15 @@ class SharedMemoryRewriter : public StmtExprMutator {
       // the dst shared memory is a byte buffer generated by merging shared memory.
       // we need to multiply the offset index by the byte size of the original value dtype, to get
       // the correct offset of merged shared buffer.
-      int index_factor = dtype.bytes();
+      int index_factor = (static_cast<int>(dtype.bits) * static_cast<int>(dtype.lanes) + 7) / 8;
       if (op->args.size() == 5)
         return Call(
-            dtype, op->op,
+            ffi::GetRef<PrimExpr>(op).ty(), op->op,
             {scope_stack_.back().merged_buf_var, mul(extra_offset + offset, PrimExpr(index_factor)),
              op->args[2], op->args[3], op->args[4]});
       else
         return Call(
-            dtype, op->op,
+            ffi::GetRef<PrimExpr>(op).ty(), op->op,
             {scope_stack_.back().merged_buf_var, mul(extra_offset + offset, PrimExpr(index_factor)),
              op->args[2], op->args[3], op->args[4], op->args[5]});
     } else {
@@ -536,12 +537,13 @@ class SharedMemoryRewriter : public StmtExprMutator {
     }
   }
 
-  PrimExpr GetBufferOffset(Var buffer_var, DataType dtype) {
+  PrimExpr GetBufferOffset(Var buffer_var, DLDataType dtype) {
     TVM_FFI_ICHECK(!scope_stack_.empty());
     KernelScope& scope = scope_stack_.back();
     auto it = scope.buffer_byte_offsets.find(buffer_var.get());
     TVM_FFI_ICHECK(it != scope.buffer_byte_offsets.end());
-    return indexdiv(it->second, dtype.bytes());
+    int elem_bytes = (static_cast<int>(dtype.bits) * static_cast<int>(dtype.lanes) + 7) / 8;
+    return indexdiv(it->second, elem_bytes);
   }
 
   // Wrapper function to determine if the shared memory allocation for a variable is appropriate.
@@ -646,7 +648,9 @@ class SharedMemoryRewriter : public StmtExprMutator {
       for (int i = 0; i < static_cast<int>(e->allocs.size()); i++) {
         for (const VarNode* buffer : e->allocs[i]) {
           const Buffer& buf = scope.shmem_allocs.at(buffer);
-          align[i] = std::max(align[i], buf->dtype.bytes());
+          int elem_bytes =
+              (static_cast<int>(buf->dtype.bits()) * static_cast<int>(buf->dtype.lanes()) + 7) / 8;
+          align[i] = std::max(align[i], elem_bytes);
         }
       }
     }
@@ -658,13 +662,15 @@ class SharedMemoryRewriter : public StmtExprMutator {
         for (const VarNode* buffer : e->allocs[i]) {
           const Buffer& buf = scope.shmem_allocs.at(buffer);
           ffi::Array<PrimExpr> alloc_shape = GetBufferAllocationShape(buf);
-          int align_bytes = std::max(align[i], buf->dtype.bytes());
+          int elem_bytes =
+              (static_cast<int>(buf->dtype.bits()) * static_cast<int>(buf->dtype.lanes()) + 7) / 8;
+          int align_bytes = std::max(align[i], elem_bytes);
           if (buf->data_alignment > 0) {
             TVM_FFI_ICHECK(buf->data_alignment % align_bytes == 0)
                 << "The alignment of the buffer is not a multiple of the data type size.";
             align_bytes = buf->data_alignment;
           }
-          PrimExpr buffer_bytes = alloc_shape[0] * buf->dtype.bytes();
+          PrimExpr buffer_bytes = alloc_shape[0] * elem_bytes;
           inner_offset +=
               indexmod(align_bytes - indexmod(scope.merged_alloc_size + inner_offset, align_bytes),
                        align_bytes);
@@ -702,7 +708,8 @@ class SharedMemoryRewriter : public StmtExprMutator {
     // compiler can do a better job with register allocation.
     const uint64_t match_range = 16;
     ffi::Array<PrimExpr> alloc_shape = GetBufferAllocationShape(buf);
-    uint64_t op_elem_bits = buf->dtype.bits() * buf->dtype.lanes();
+    DLDataType dtype = buf->dtype->dtype;
+    uint64_t op_elem_bits = static_cast<uint64_t>(dtype.bits) * dtype.lanes;
     uint64_t const_nbits =
         static_cast<uint64_t>(ConstantAllocationSize(alloc_shape) * op_elem_bits);
     // disable reuse of small arrays, they will be lowered to registers in LLVM
diff --git a/src/s_tir/transform/profile_instrumentation.cc b/src/s_tir/transform/profile_instrumentation.cc
index 28b325ca9c60..c3af852e46a3 100644
--- a/src/s_tir/transform/profile_instrumentation.cc
+++ b/src/s_tir/transform/profile_instrumentation.cc
@@ -203,8 +203,8 @@ class InstrumentIntrin : public StmtMutator {
       return stmt;
     }
     PrimExpr id = static_cast<int32_t>(loop_info.id);
-    PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {id});
-    PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {id});
+    PrimExpr start_call = Call(PrimType::Handle(), builtin::start_profile_intrinsic(), {id});
+    PrimExpr end_call = Call(PrimType::Handle(), builtin::end_profile_intrinsic(), {id});
     const Stmt start_profile = Evaluate(start_call);
     const Stmt end_profile = Evaluate(end_call);
     Stmt new_stmt = SeqStmt({start_profile, stmt, end_profile});
@@ -243,8 +243,8 @@ PrimFunc AddProfileBuiltins(PrimFunc func, int32_t max_instr_depth, int32_t min_
 
   PrimExpr e = start_id++;
   if (!disable_func_instrumentation) {
-    PrimExpr start_call = Call(DataType::Handle(), builtin::start_profile_intrinsic(), {e});
-    PrimExpr end_call = Call(DataType::Handle(), builtin::end_profile_intrinsic(), {e});
+    PrimExpr start_call = Call(PrimType::Handle(), builtin::start_profile_intrinsic(), {e});
+    PrimExpr end_call = Call(PrimType::Handle(), builtin::end_profile_intrinsic(), {e});
     const Stmt start_profile = Evaluate(start_call);
     const Stmt end_profile = Evaluate(end_call);
     func_ptr->body = SeqStmt({start_profile, std::move(func_ptr->body), end_profile});
diff --git a/src/s_tir/transform/renew_defs.cc b/src/s_tir/transform/renew_defs.cc
index f192d6a416a9..499124756542 100644
--- a/src/s_tir/transform/renew_defs.cc
+++ b/src/s_tir/transform/renew_defs.cc
@@ -54,7 +54,7 @@ class RenewDefMutator : public StmtExprMutator {
       params.push_back(generator.ReDefineVar(param));
     }
     for (const auto& param : func->params) {
-      if (param->dtype.is_handle()) {
+      if (param->ty().IsHandle()) {
         const Buffer& buffer = func->buffer_map.at(param);
         for (const PrimExpr& e : buffer->shape) {
           if (const auto* v = e.as<VarNode>()) {
@@ -69,7 +69,7 @@ class RenewDefMutator : public StmtExprMutator {
     // TODO(Siyuan Feng): checking var is used after define
     ffi::Map<tirx::Var, Buffer> buffer_map;
     for (const auto& param : func->params) {
-      if (param->dtype.is_handle()) {
+      if (param->ty().IsHandle()) {
         const Buffer& buffer = func->buffer_map.at(param);
         Var new_param = generator.VisitExpr(param).as_or_throw<Var>();
         Buffer new_buffer = generator.DefineBuffer(buffer);
diff --git a/src/s_tir/transform/renormalize_split_pattern.cc b/src/s_tir/transform/renormalize_split_pattern.cc
index 2fbadfabd4c9..83fcb62e8ccf 100644
--- a/src/s_tir/transform/renormalize_split_pattern.cc
+++ b/src/s_tir/transform/renormalize_split_pattern.cc
@@ -83,8 +83,8 @@ class SplitPatternReNormalizer : public IRMutatorWithAnalyzer {
       if (c1_val > 0 && c2_val > 0) {
         int64_t c3 = ZeroAwareGCD(c1_val, c2_val);
         if (c3 > 1) {
-          IntImm c1_div = IntImm(c1.Eval().dtype(), c1_val / c3);
-          IntImm c2_div = IntImm(c2.Eval().dtype(), c2_val / c3);
+          IntImm c1_div = IntImm(c1.Eval().ty(), c1_val / c3);
+          IntImm c2_div = IntImm(c2.Eval().ty(), c2_val / c3);
           return RecursiveRewrite(floordiv(x.Eval() * c1_div + floordiv(y.Eval(), c3), c2_div));
         }
       }
@@ -95,12 +95,12 @@ class SplitPatternReNormalizer : public IRMutatorWithAnalyzer {
       if (c1_val > 0 && c2_val > 0) {
         int64_t c3 = ZeroAwareGCD(c1_val, c2_val);
         if (c3 > 1) {
-          IntImm c1_div = IntImm(c1.Eval().dtype(), c1_val / c3);
-          IntImm c2_div = IntImm(c2.Eval().dtype(), c2_val / c3);
-          return RecursiveRewrite(floordiv(
-              x.Eval() * Broadcast(c1_div, lanes.Eval()) +
-                  floordiv(y.Eval(), Broadcast(IntImm(c1.Eval().dtype(), c3), lanes.Eval())),
-              Broadcast(c2_div, lanes.Eval())));
+          IntImm c1_div = IntImm(c1.Eval().ty(), c1_val / c3);
+          IntImm c2_div = IntImm(c2.Eval().ty(), c2_val / c3);
+          return RecursiveRewrite(
+              floordiv(x.Eval() * Broadcast(c1_div, lanes.Eval()) +
+                           floordiv(y.Eval(), Broadcast(IntImm(c1.Eval().ty(), c3), lanes.Eval())),
+                       Broadcast(c2_div, lanes.Eval())));
         }
       }
     }
@@ -112,8 +112,8 @@ class SplitPatternReNormalizer : public IRMutatorWithAnalyzer {
       if (c1_val > 0 && c2_val > 0) {
         int64_t c3 = ZeroAwareGCD(c1_val, c2_val);
         if (c3 > 1) {
-          IntImm c1_div = IntImm(c1.Eval().dtype(), c1_val / c3);
-          IntImm c2_div = IntImm(c2.Eval().dtype(), c2_val / c3);
+          IntImm c1_div = IntImm(c1.Eval().ty(), c1_val / c3);
+          IntImm c2_div = IntImm(c2.Eval().ty(), c2_val / c3);
           return RecursiveRewrite(
               floordiv(x.Eval() * c1_div + floordiv(y.Eval() + z.Eval(), c3), c2_div));
         }
@@ -125,12 +125,12 @@ class SplitPatternReNormalizer : public IRMutatorWithAnalyzer {
       if (c1_val > 0 && c2_val > 0) {
         int64_t c3 = ZeroAwareGCD(c1_val, c2_val);
         if (c3 > 1) {
-          IntImm c1_div = IntImm(c1.Eval().dtype(), c1_val / c3);
-          IntImm c2_div = IntImm(c2.Eval().dtype(), c2_val / c3);
+          IntImm c1_div = IntImm(c1.Eval().ty(), c1_val / c3);
+          IntImm c2_div = IntImm(c2.Eval().ty(), c2_val / c3);
           return RecursiveRewrite(
               floordiv(x.Eval() * Broadcast(c1_div, lanes.Eval()) +
                            floordiv(y.Eval() + z.Eval(),
-                                    Broadcast(IntImm(c1.Eval().dtype(), c3), lanes.Eval())),
+                                    Broadcast(IntImm(c1.Eval().ty(), c3), lanes.Eval())),
                        Broadcast(c2_div, lanes.Eval())));
         }
       }
diff --git a/src/s_tir/transform/rewrite_unsafe_select.cc b/src/s_tir/transform/rewrite_unsafe_select.cc
index 8a0c3f1b4bd3..38a60ae81933 100644
--- a/src/s_tir/transform/rewrite_unsafe_select.cc
+++ b/src/s_tir/transform/rewrite_unsafe_select.cc
@@ -117,10 +117,11 @@ class UnsafeSelectRewriter : public StmtExprMutator {
     PrimExpr expr = StmtExprMutator::VisitExpr_(op);
     op = expr.as<SelectNode>();
     UnsafeExprDetector unsafe;
-    bool cond_is_scalar_bool = op->condition.dtype().is_bool() && op->condition.dtype().is_scalar();
+    PrimType cond_ty = op->condition.ty();
+    bool cond_is_scalar_bool = cond_ty.MatchesCode(DLDataTypeCode::kDLBool) && cond_ty.IsScalar();
     if ((unsafe.VisitExpr(op->true_value) || unsafe.VisitExpr(op->false_value)) &&
         cond_is_scalar_bool) {
-      return Call(op->dtype, builtin::if_then_else(),
+      return Call(expr.ty(), builtin::if_then_else(),  // op aliases expr's node
                   {op->condition, op->true_value, op->false_value});
     } else {
       return expr;
diff --git a/src/s_tir/transform/storage_access.cc b/src/s_tir/transform/storage_access.cc
index 0a347abb71c7..d4dddbde6243 100644
--- a/src/s_tir/transform/storage_access.cc
+++ b/src/s_tir/transform/storage_access.cc
@@ -43,7 +43,8 @@ void StorageAccessVisitor::VisitExpr_(const BufferLoadNode* op) {
     AccessEntry e;
     e.threads = env_threads();
     e.buffer = buf;
-    e.dtype = op->dtype.element_of();
+    e.dtype = op->ty()->dtype;
+    e.dtype.lanes = 1;
     for (const auto& index : op->indices) {
       e.touched.push_back(arith::IntSet::Vector(index));
     }
@@ -66,7 +67,8 @@ void StorageAccessVisitor::VisitStmt_(const BufferStoreNode* op) {
     AccessEntry e;
     e.threads = env_threads();
     e.buffer = buf;
-    e.dtype = op->value.dtype().element_of();
+    e.dtype = op->value.ty()->dtype;
+    e.dtype.lanes = 1;
     for (const auto& index : op->indices) {
       e.touched.push_back(arith::IntSet::Vector(index));
     }
@@ -240,7 +242,7 @@ void StorageAccessVisitor::VisitExpr_(const CallNode* op) {
     StmtExprVisitor::VisitExpr_(load);
   } else if (op->op.same_as(builtin::tvm_access_ptr())) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 5U);
-    DataType dtype = op->args[0].dtype();
+    DLDataType dtype = op->args[0].ty()->dtype;
     const VarNode* buffer = op->args[1].as<VarNode>();
     if (buffer == nullptr) {
       // args[1] is not a raw Var — e.g. a nested tvm_access_ptr or some
diff --git a/src/s_tir/transform/storage_access.h b/src/s_tir/transform/storage_access.h
index d85dc5a3c3ae..b3bb8df1801b 100644
--- a/src/s_tir/transform/storage_access.h
+++ b/src/s_tir/transform/storage_access.h
@@ -61,7 +61,7 @@ class StorageAccessVisitor : public StmtExprVisitor {
     /*! \brief The buffer variable, if any */
     Var buffer = Var(ffi::ObjectPtr<VarNode>(nullptr));
     /*! \brief The access data type */
-    DataType dtype;
+    DLDataType dtype;
     /*! \brief The touched access range
      *
      * Has one IntSet for each index in the buffer being accessed.
diff --git a/src/s_tir/transform/thread_storage_sync.cc b/src/s_tir/transform/thread_storage_sync.cc
index 254a2d72e36e..3d7122fc821f 100644
--- a/src/s_tir/transform/thread_storage_sync.cc
+++ b/src/s_tir/transform/thread_storage_sync.cc
@@ -293,7 +293,7 @@ class ThreadSyncAfterWaitQueueInserter : public StmtExprMutator {
 
   Stmt VisitStmt_(const AttrStmtNode* op) final {
     if (op->attr_key == s_tir::attr::async_wait_queue_scope) {
-      auto sync = Evaluate(Call(DataType::Int(32), builtin::tvm_storage_sync(),
+      auto sync = Evaluate(Call(PrimType::Int(32), builtin::tvm_storage_sync(),
                                 {StringImm(sync_scope_.to_string())}));
       auto inner = op->body.as<AttrStmtNode>();
       TVM_FFI_ICHECK(inner && inner->attr_key == s_tir::attr::async_wait_inflight_count);
@@ -318,7 +318,7 @@ class ThreadSyncInserter : public StmtExprMutator {
   Stmt VisitStmt(const Stmt& stmt) final {
     if (syncs_.size() == 0) return stmt;
     if (syncs_.count(stmt.get())) {
-      Stmt barrier = Evaluate(Call(DataType::Int(32), builtin::tvm_storage_sync(),
+      Stmt barrier = Evaluate(Call(PrimType::Int(32), builtin::tvm_storage_sync(),
                                    {StringImm(sync_scope_.to_string())}));
       // Mutate after query, to avoid stmt change.
       auto ret = StmtExprMutator::VisitStmt(stmt);
diff --git a/src/s_tir/transform/unify_thread_binding.cc b/src/s_tir/transform/unify_thread_binding.cc
index c3c0b5a170c9..d3a32d8fd17f 100644
--- a/src/s_tir/transform/unify_thread_binding.cc
+++ b/src/s_tir/transform/unify_thread_binding.cc
@@ -55,7 +55,7 @@ class ThreadBindingUnifier : public StmtExprMutator {
     }
     IterVar old_iter_var = op->node.as_or_throw<IterVar>();
     return UnifyThreadBindingImpl(op, old_iter_var->var, old_iter_var,
-                                  Range::FromMinExtent(IntImm(op->value->dtype, 0), op->value));
+                                  Range::FromMinExtent(IntImm(op->value.ty(), 0), op->value));
   }
 
   Stmt VisitStmt_(const ForNode* op) final {
@@ -76,12 +76,12 @@ class ThreadBindingUnifier : public StmtExprMutator {
 
     } else {
       // Create a new unit loop with the annotation.
-      DataType dtype = op->loop_var->dtype;
-      return For(/*loop_var=*/Var("var", dtype),   //
-                 /*min=*/IntImm(dtype, 0),         //
-                 /*extent=*/IntImm(dtype, 1),      //
-                 /*kind=*/ForKind::kSerial, stmt,  //
-                 /*thread_binding=*/std::nullopt,  //
+      PrimType loop_ty = op->loop_var.ty();
+      return For(/*loop_var=*/Var("var", loop_ty),  //
+                 /*min=*/IntImm(loop_ty, 0),        //
+                 /*extent=*/IntImm(loop_ty, 1),     //
+                 /*kind=*/ForKind::kSerial, stmt,   //
+                 /*thread_binding=*/std::nullopt,   //
                  /*annotation=*/std::move(annotations),
                  /*step=*/std::nullopt);
     }
@@ -121,7 +121,7 @@ class ThreadBindingUnifier : public StmtExprMutator {
           << "` should have the same extent. However, there are two loops with extent "
           << new_iter_var->dom->extent << " and " << dom->extent << ", which are not equal";
     } else {
-      new_iter_var = IterVar(dom, Var(thread_tag, dom->extent.dtype()), old_iter_var->iter_type,
+      new_iter_var = IterVar(dom, Var(thread_tag, dom->extent.ty()), old_iter_var->iter_type,
                              old_iter_var->thread_tag);
       thread_tag2iter_var_map_.Set(thread_tag, new_iter_var);
       launch_threads_.push_back(new_iter_var);
@@ -130,7 +130,7 @@ class ThreadBindingUnifier : public StmtExprMutator {
     // Step 4. We will substitute the occurrences of the old variable in the old IterVar with the
     // new variable in further mutation. Thus, we store the mapping entry. Cast to old dtype if
     // needed (we assume both old and new dtype are valid for the range of the thread extent).
-    var_substitution_map_.Set(old_var, cast(old_var.dtype(), new_iter_var->var));
+    var_substitution_map_.Set(old_var, cast(old_var.ty(), new_iter_var->var));
 
     // Step 5. Mutate recursively, update the body with the new IterVar, and restore the depth
     // counter. Emit for-loops to launch threads if current statement is the outermost thread
diff --git a/src/script/printer/doc_printer/python_doc_printer.cc b/src/script/printer/doc_printer/python_doc_printer.cc
index 295b3e20e4e3..55da056f407a 100644
--- a/src/script/printer/doc_printer/python_doc_printer.cc
+++ b/src/script/printer/doc_printer/python_doc_printer.cc
@@ -323,7 +323,8 @@ void PythonDocPrinter::PrintTypedDoc(const LiteralDoc& doc) {
   if (value == nullptr) {
     output_ << "None";
   } else if (const auto* int_imm = value.as<IntImmNode>()) {
-    if (int_imm->dtype.is_bool()) {
+    PrimType int_ty = int_imm->ty();
+    if (int_ty.MatchesCode(DLDataTypeCode::kDLBool)) {
       output_ << (int_imm->value ? "True" : "False");
     } else {
       output_ << int_imm->value;
diff --git a/src/script/printer/ir/distributed.cc b/src/script/printer/ir/distributed.cc
index f748f4e9bd6b..a2840d60e4e9 100644
--- a/src/script/printer/ir/distributed.cc
+++ b/src/script/printer/ir/distributed.cc
@@ -16,6 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
+#include <tvm/ffi/container/shape.h>
 #include <tvm/ir/expr.h>
 
 #include "./utils.h"
diff --git a/src/script/printer/script_printer.cc b/src/script/printer/script_printer.cc
index c0d4b88b3107..d46b061401c2 100644
--- a/src/script/printer/script_printer.cc
+++ b/src/script/printer/script_printer.cc
@@ -70,13 +70,13 @@ PrinterConfig::PrinterConfig(ffi::Map<ffi::String, Any> config_dict) {
     n->module_alias = v.value().as_or_throw<ffi::String>();
   }
   if (auto v = config_dict.Get("buffer_dtype")) {
-    n->buffer_dtype = DataType(ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>()));
+    n->buffer_dtype = ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>());
   }
   if (auto v = config_dict.Get("int_dtype")) {
-    n->int_dtype = DataType(ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>()));
+    n->int_dtype = ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>());
   }
   if (auto v = config_dict.Get("float_dtype")) {
-    n->float_dtype = DataType(ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>()));
+    n->float_dtype = ffi::StringToDLDataType(v.value().as_or_throw<ffi::String>());
   }
   if (auto v = config_dict.Get("verbose_expr")) {
     n->verbose_expr = v.value().cast<bool>();
diff --git a/src/script/printer/utils.h b/src/script/printer/utils.h
index b30a26e15686..99e98556b6c6 100644
--- a/src/script/printer/utils.h
+++ b/src/script/printer/utils.h
@@ -114,8 +114,10 @@ inline ExprDoc Relax(const IRDocsifier& d, const ffi::String& attr) {
   return IdDoc(d->cfg->GetExtraConfig<ffi::String>("relax.prefix", "R"))->Attr(attr);
 }
 
-inline std::string DType2Str(const runtime::DataType& dtype) {
-  return dtype.is_void() ? "void" : ffi::DLDataTypeToString(dtype);
+/*! \brief Convert a DLDataType to its string form; the void dtype prints as "void". */
+inline std::string DType2Str(DLDataType dtype) {
+  const bool is_void = dtype.code == kDLOpaqueHandle && dtype.bits == 0 && dtype.lanes == 0;
+  return is_void ? "void" : ffi::DLDataTypeToString(dtype);
 }
 
 /*! \brief Add headers as comments to doc if needed */
diff --git a/src/target/build_common.h b/src/target/build_common.h
index 4ad5e9434449..e2b24bf1174d 100644
--- a/src/target/build_common.h
+++ b/src/target/build_common.h
@@ -50,7 +50,7 @@ inline ffi::Map<ffi::String, runtime::FunctionInfo> ExtractFuncInfo(const IRModu
     ffi::Array<DLDataType> arg_types;
     ffi::Array<runtime::ArgExtraTags> arg_extra_tags;
     for (size_t i = 0; i < f->params.size(); ++i) {
-      arg_types.push_back(f->params[i].dtype());
+      arg_types.push_back(f->params[i].ty()->dtype);
       auto is_tensormap = [](const tirx::Var& var) -> bool {
         const auto* type = var->type_annotation.as<PointerTypeNode>();
         if (type == nullptr) {
diff --git a/src/target/intrin_rule.cc b/src/target/intrin_rule.cc
index 300d9c00544e..1729ec1c95f9 100644
--- a/src/target/intrin_rule.cc
+++ b/src/target/intrin_rule.cc
@@ -128,18 +128,19 @@ TVM_REGISTER_OP("tirx.tvm_access_ptr")
       const CallNode* call = e.as<CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       TVM_FFI_ICHECK_EQ(call->args.size(), 5U);
-      DataType dtype = call->args[0].dtype();
+      PrimType dtype = call->args[0].ty();
       Var buffer_var = call->args[1].as_or_throw<Var>();
       PrimExpr offset = call->args[2];
-      TVM_FFI_ICHECK(call->dtype.is_handle());
+      TVM_FFI_ICHECK(call->ty().IsHandle());
       if (dtype.lanes() != 1) {
-        offset = offset * MakeConst(offset.dtype(), dtype.lanes());
-        offset = Ramp(offset, MakeConst(offset.dtype(), 1), dtype.lanes());
+        PrimType offset_ty = offset.ty();
+        offset = offset * MakeConst(offset_ty, dtype.lanes());
+        offset = Ramp(offset, MakeConst(offset_ty, 1), dtype.lanes());
       }
-      Buffer dummy_buf(buffer_var, dtype.element_of(), {offset + 1}, {}, 0, buffer_var->name_hint,
+      Buffer dummy_buf(buffer_var, dtype.WithLanes(1), {offset + 1}, {}, 0, buffer_var->name_hint,
                        0, 0, kDefault);
       BufferLoad buf_load(dummy_buf, {offset});
-      return Call(DataType::Handle(), builtin::address_of(), {buf_load});
+      return Call(PrimType::Handle(), builtin::address_of(), {buf_load});
     });
 
 PrimExpr DispatchFastErf(const PrimExpr& e) {
@@ -148,9 +149,10 @@ PrimExpr DispatchFastErf(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   TVM_FFI_ICHECK_EQ(call->args.size(), 1);
   PrimExpr arg = call->args[0];
-  int bits = arg.dtype().bits();
+  PrimType arg_ty = arg.ty();
+  int bits = arg_ty.bits();
   PrimExpr res;
-  if (arg.dtype().is_float() && (bits == 16 || bits == 32)) {
+  if (arg_ty.code() == DLDataTypeCode::kDLFloat && (bits == 16 || bits == 32)) {
     res = fast_erf_float_expr(arg, bits);
   } else {
     TVM_FFI_THROW(InternalError) << "Unsupported type in Metal fast_erf";
@@ -163,9 +165,10 @@ PrimExpr DispatchNumericalStableTanh(const PrimExpr& e) {
   const tirx::CallNode* call = e.as<tirx::CallNode>();
   TVM_FFI_ICHECK(call != nullptr);
   const PrimExpr& x = call->args[0];
-  PrimExpr one = MakeConst(x.dtype(), 1);
-  PrimExpr two = MakeConst(x.dtype(), 2);
-  PrimExpr neg_two = MakeConst(x.dtype(), -2);
+  PrimType x_ty = x.ty();
+  PrimExpr one = MakeConst(x_ty, 1);
+  PrimExpr two = MakeConst(x_ty, 2);
+  PrimExpr neg_two = MakeConst(x_ty, -2);
 
   PrimExpr exp_neg2x = exp(neg_two * x);
   PrimExpr exp_pos2x = exp(two * x);
@@ -173,7 +176,7 @@ PrimExpr DispatchNumericalStableTanh(const PrimExpr& e) {
   PrimExpr tanh_pos = (one - exp_neg2x) / (one + exp_neg2x);
   PrimExpr tanh_neg = (exp_pos2x - one) / (exp_pos2x + one);
   // MakeConst can handle both vector and scalar types.
-  return tirx::Select(x >= MakeConst(x.dtype(), 0), tanh_pos, tanh_neg);
+  return tirx::Select(x >= MakeConst(x_ty, 0), tanh_pos, tanh_neg);
 }
 
 }  // namespace intrin
@@ -186,7 +189,7 @@ TVM_REGISTER_OP("tirx.rsqrt")
     .set_attr<FLegalize>("default.FLegalize", [](const PrimExpr& e) -> PrimExpr {
       const CallNode* call = e.as<CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
-      auto one = MakeConst(call->args[0].dtype(), 1);
+      auto one = MakeConst(call->args[0].ty(), 1);
       return one / sqrt(call->args[0]);
     });
 
@@ -194,7 +197,7 @@ TVM_REGISTER_OP("tirx.sigmoid")
     .set_attr<FLegalize>("default.FLegalize", [](const PrimExpr& e) -> PrimExpr {
       const CallNode* call = e.as<CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
-      auto one = MakeConst(call->args[0].dtype(), 1);
+      auto one = MakeConst(call->args[0].ty(), 1);
       return one / (one + exp(-call->args[0]));
     });
 
@@ -226,14 +229,19 @@ TVM_REGISTER_OP("tirx.isinf")
 static PrimExpr QMultiplyShift(PrimExpr x, PrimExpr y, PrimExpr q, PrimExpr left_shift,
                                PrimExpr right_shift, PrimExpr is_left_shift_required) {
   // Only int32 types are supported (any number of lanes is allowed)
-  TVM_FFI_ICHECK(y.dtype().code() == DLDataTypeCode::kDLInt && y.dtype().bits() == 32);
-  TVM_FFI_ICHECK(left_shift.dtype().code() == DLDataTypeCode::kDLInt &&
-                 left_shift.dtype().bits() == 32);
-  TVM_FFI_ICHECK(right_shift.dtype().code() == DLDataTypeCode::kDLInt &&
-                 right_shift.dtype().bits() == 32);
-
-  DataType hp_dtype = DataType::Int(64, x.dtype().lanes());
-  DataType lp_dtype = DataType::Int(32, x.dtype().lanes());
+  TVM_FFI_ICHECK(y.ty().MatchesElementType(DLDataTypeCode::kDLInt, 32));
+  TVM_FFI_ICHECK(left_shift.ty().MatchesElementType(DLDataTypeCode::kDLInt, 32));
+  TVM_FFI_ICHECK(right_shift.ty().MatchesElementType(DLDataTypeCode::kDLInt, 32));
+
+  PrimType x_ty = x.ty();
+  auto signed_int_ty = [](int bits, const PrimType& source_ty) {
+    if (source_ty.IsScalableVector()) {
+      return PrimType::ScalableVector(DLDataTypeCode::kDLInt, bits, source_ty.VScaleFactor());
+    }
+    return PrimType::Int(bits, source_ty.lanes());
+  };
+  PrimType hp_dtype = signed_int_ty(64, x_ty);
+  PrimType lp_dtype = signed_int_ty(32, x_ty);
 
   // 1) Cast and Multiply the integer multiplier
   PrimExpr one = MakeConst(hp_dtype, 1);
@@ -290,7 +298,11 @@ TVM_REGISTER_OP("tirx.q_multiply_shift")
           return x << exp;
         } else {
           // power of 2 is less than 0, round and then apply right shift.
-          DataType lp_dtype = DataType::Int(32, x.dtype().lanes());
+          PrimType x_ty = x.ty();
+          PrimType lp_dtype =
+              x_ty.IsScalableVector()
+                  ? PrimType::ScalableVector(DLDataTypeCode::kDLInt, 32, x_ty.VScaleFactor())
+                  : PrimType::Int(32, x_ty.lanes());
           PrimExpr one = MakeConst(lp_dtype, 1);
           exp = -exp;
           PrimExpr rounding_factor = one << (exp - 1);
@@ -299,10 +311,11 @@ TVM_REGISTER_OP("tirx.q_multiply_shift")
         }
       } else {
         // Only int32 types are supported (any number of lanes is allowed)
-        TVM_FFI_ICHECK(s.dtype().code() == DLDataTypeCode::kDLInt && s.dtype().bits() == 32);
+        TVM_FFI_ICHECK(s.ty().MatchesElementType(DLDataTypeCode::kDLInt, 32));
 
         // Calculating integer shifts. MakeConst can handle both vector and scalar types.
-        PrimExpr zero = MakeConst(s.dtype(), 0);
+        PrimType s_ty = s.ty();
+        PrimExpr zero = MakeConst(s_ty, 0);
         PrimExpr left_shift = tirx::Select(s > zero, s, zero);
         PrimExpr right_shift = tirx::Select(s > zero, zero, -s);
         PrimExpr is_left_shift_required = (left_shift != zero);
diff --git a/src/target/intrin_rule.h b/src/target/intrin_rule.h
index a5f5a8931283..cf72a291ada6 100644
--- a/src/target/intrin_rule.h
+++ b/src/target/intrin_rule.h
@@ -25,6 +25,7 @@
 #define TVM_TARGET_INTRIN_RULE_H_
 
 #include <tvm/ffi/function.h>
+#include <tvm/ir/type.h>
 #include <tvm/tirx/builtin.h>
 #include <tvm/tirx/expr.h>
 
@@ -37,10 +38,10 @@ using namespace tirx;
 
 // Add float suffix to the intrinsics
 struct FloatSuffix {
-  std::string operator()(DataType t, std::string name) const {
-    if (t == DataType::Float(32)) {
+  std::string operator()(PrimType t, std::string name) const {
+    if (t->dtype == DLDataType{kDLFloat, 32, 1}) {
       return name + 'f';
-    } else if (t == DataType::Float(64)) {
+    } else if (t->dtype == DLDataType{kDLFloat, 64, 1}) {
       return name;
     } else {
       return "";
@@ -50,7 +51,7 @@ struct FloatSuffix {
 
 // Return the intrinsic name
 struct Direct {
-  std::string operator()(DataType t, std::string name) const { return name; }
+  std::string operator()(PrimType t, std::string name) const { return name; }
 };
 
 /*!
@@ -69,13 +70,10 @@ inline PrimExpr DispatchPureExtern(const PrimExpr& e) {
   TVM_FFI_ICHECK(op != nullptr);
   std::string name = op->name;
   TVM_FFI_ICHECK_EQ(name.substr(0, 5), "tirx.");
-  DataType dtype;
   if (dtype_from_arg) {
     TVM_FFI_ICHECK_EQ(call->args.size(), 1U);
-    dtype = call->args[0].dtype();
-  } else {
-    dtype = call->dtype;
   }
+  PrimType dtype = dtype_from_arg ? call->args[0].ty() : call->ty();
   name = T()(dtype, name.substr(5));
 
   if (name.length() != 0) {
@@ -83,7 +81,7 @@ inline PrimExpr DispatchPureExtern(const PrimExpr& e) {
     for (auto arg : call->args) {
       new_args.push_back(arg);
     }
-    return Call(call->dtype, builtin::call_pure_extern(), new_args);
+    return Call(e.ty(), builtin::call_pure_extern(), new_args);
   } else {
     return e;
   }
diff --git a/src/target/llvm/codegen_arm.cc b/src/target/llvm/codegen_arm.cc
index a9a0acb41213..149e3ee43f4f 100644
--- a/src/target/llvm/codegen_arm.cc
+++ b/src/target/llvm/codegen_arm.cc
@@ -67,17 +67,18 @@ llvm::Value* CodeGenARM::CreateIntrinsic(const CallNode* op) {
 PrimExpr CodeGenARM::ARMPopcount(const CallNode* call) {
   using namespace tirx;
   const PrimExpr& e = call->args[1];
+  PrimType call_ty = call->ty();  // result type, reused by every check and emitted call below
   llvm::Intrinsic::ID ctpop_id = llvm::Intrinsic::ctpop;
   llvm::Intrinsic::ID vpaddlu_id = llvm::Intrinsic::arm_neon_vpaddlu;
 
   // Fallback to default llvm lowering rule if input type not a full vector or half vector length
-  int total_size = call->dtype.bits() * call->dtype.lanes();
-  if (!call->dtype.is_fixed_length_vector() || call->dtype.bits() == 8 ||
+  int total_size = call_ty.bits() * call_ty.lanes();
+  if (!call_ty.IsFixedLengthVector() || call_ty.bits() == 8 ||
       (total_size != 128 && total_size != 64)) {
     ffi::Array<PrimExpr> vcnt_args;
-    vcnt_args.push_back(IntImm(DataType::UInt(32), ctpop_id));
+    vcnt_args.push_back(IntImm(PrimType::UInt(32), ctpop_id));
     vcnt_args.push_back(e);
-    return tirx::Call(call->dtype, builtin_call_llvm_pure_intrin_, vcnt_args);
+    return tirx::Call(call_ty, builtin_call_llvm_pure_intrin_, vcnt_args);
   }
 
   // Popcount lowering rule:
@@ -86,11 +87,12 @@ PrimExpr CodeGenARM::ARMPopcount(const CallNode* call) {
   // to return back to original input type
 
   // Dvisions are always divisible (number of bits = 64 or 128)
-  DataType uint8_type = DataType(e.dtype().code(), 8, e.dtype().bits() * e.dtype().lanes() / 8);
-  DataType uint16_type =
-      DataType(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16);
-  DataType uint32_type =
-      DataType(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32);
+  PrimType e_dtype(e.ty()->dtype);
+  PrimType uint8_type = PrimType(e_dtype.code(), 8, e_dtype.bits() * e_dtype.lanes() / 8);
+  PrimType uint16_type =
+      PrimType(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16);
+  PrimType uint32_type =
+      PrimType(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32);
 
   // Interpret input as vector of 8bit values
   PrimExpr input8 = reinterpret(uint8_type, e);
@@ -98,33 +100,33 @@ PrimExpr CodeGenARM::ARMPopcount(const CallNode* call) {
   const CallNode* c0 = input8.as<CallNode>();
   TVM_FFI_ICHECK(c0 != nullptr);
   ffi::Array<PrimExpr> vcnt8_args;
-  vcnt8_args.push_back(IntImm(DataType::UInt(32), ctpop_id));
+  vcnt8_args.push_back(IntImm(PrimType::UInt(32), ctpop_id));
   vcnt8_args.push_back(input8);
   PrimExpr vcnt8 = tirx::Call(uint8_type, builtin_call_llvm_pure_intrin_, vcnt8_args);
 
   // Accumulation 8->16bit
   ffi::Array<PrimExpr> vcnt16_args;
-  vcnt16_args.push_back(IntImm(DataType::UInt(32), vpaddlu_id));
+  vcnt16_args.push_back(IntImm(PrimType::UInt(32), vpaddlu_id));
   vcnt16_args.push_back(vcnt8);
   PrimExpr vcnt16 = tirx::Call(uint16_type, builtin_call_llvm_pure_intrin_, vcnt16_args);
-  if (call->dtype.bits() == 16) {
+  if (call_ty.bits() == 16) {
     return vcnt16;
   }
 
   // Accumulation 16->32bit
   ffi::Array<PrimExpr> vcnt32_args;
-  vcnt32_args.push_back(IntImm(DataType::UInt(32), vpaddlu_id));
+  vcnt32_args.push_back(IntImm(PrimType::UInt(32), vpaddlu_id));
   vcnt32_args.push_back(vcnt16);
   PrimExpr vcnt32 = tirx::Call(uint32_type, builtin_call_llvm_pure_intrin_, vcnt32_args);
-  if (call->dtype.bits() == 32) {
+  if (call_ty.bits() == 32) {
     return vcnt32;
   }
 
   // Accumulation 32->64bit
   ffi::Array<PrimExpr> vcnt64_args;
-  vcnt64_args.push_back(IntImm(DataType::UInt(32), vpaddlu_id));
+  vcnt64_args.push_back(IntImm(PrimType::UInt(32), vpaddlu_id));
   vcnt64_args.push_back(vcnt32);
-  return tirx::Call(call->dtype, builtin_call_llvm_pure_intrin_, vcnt64_args);
+  return tirx::Call(call_ty, builtin_call_llvm_pure_intrin_, vcnt64_args);
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
 
 TVM_FFI_STATIC_INIT_BLOCK() {
diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc
index b43afb021454..fa73cf51c6fd 100644
--- a/src/target/llvm/codegen_cpu.cc
+++ b/src/target/llvm/codegen_cpu.cc
@@ -80,7 +80,7 @@ void CodeGenCPU::Init(const std::string& module_name, LLVMTarget* llvm_target,
 
   // Runtime types.
   t_tvm_shape_index_ =
-      llvm::Type::getIntNTy(*llvm_target_->GetContext(), DataType::ShapeIndex().bits());
+      llvm::Type::getIntNTy(*llvm_target_->GetContext(), DefaultIndexPrimType().bits());
   // Defined in 3rdparty/dlpack/include/dlpack/dlpack.h:
   // typedef struct { DLDeviceType device_type; int device_id; } DLDevice;
   t_tvm_device_ = llvm::StructType::create({t_int_, t_int_});
@@ -278,7 +278,7 @@ std::unique_ptr<llvm::Module> CodeGenCPU::Finish() {
   return CodeGenLLVM::Finish();
 }
 
-CodeGenLLVM::TypedPointer CodeGenCPU::CreateStructRefPtr(DataType t, llvm::Value* buf,
+CodeGenLLVM::TypedPointer CodeGenCPU::CreateStructRefPtr(PrimType t, llvm::Value* buf,
                                                          llvm::Value* index, int kind) {
   if (kind < builtin::kDLTensorKindBound_) {
     if (buf->getType() == t_void_p_) {
@@ -366,21 +366,21 @@ CodeGenLLVM::TypedPointer CodeGenCPU::CreateStructRefPtr(DataType t, llvm::Value
       buf = builder_->CreatePointerCast(buf, llvmGetPointerTo(t_tvm_ffi_any_, 0));
       // field 2 is the union value
       buf = builder_->CreateInBoundsGEP(t_tvm_ffi_any_, buf, {index, ConstInt32(2)});
-      if (t.is_bool()) {
+      if (t.MatchesCode(DLDataTypeCode::kDLBool)) {
         // it should be safe to set the pointer to the first byte of the union value
         buf = builder_->CreatePointerCast(buf, llvmGetPointerTo(DTypeToLLVMType(t), 0));
         return TypedPointer(t_int8_, buf);
-      } else if (t.is_int() && t.bits() == 64) {
+      } else if (t.MatchesCode(DLDataTypeCode::kDLInt) && t.bits() == 64) {
         buf = builder_->CreatePointerCast(buf, llvmGetPointerTo(t_int64_, 0));
         return TypedPointer(t_int64_, buf);
-      } else if (t.is_float() && t.bits() == 64) {
+      } else if (t.MatchesCode(DLDataTypeCode::kDLFloat) && t.bits() == 64) {
         buf = builder_->CreatePointerCast(buf, llvmGetPointerTo(t_float64_, 0));
         return TypedPointer(t_float64_, buf);
-      } else if (t.is_handle()) {
+      } else if (t.IsHandle()) {
         buf = builder_->CreatePointerCast(buf, llvmGetPointerTo(t_void_p_, 0));
         return TypedPointer(t_void_p_, buf);
       } else {
-        LOG(DEBUG) << "DataType " << t << " cannot be stored into a TVMFFIAny's value field";
+        LOG(DEBUG) << "PrimType " << t << " cannot be stored into a TVMFFIAny's value field";
       }
     }
     case builtin::kInt64ArrayElem: {
@@ -559,7 +559,7 @@ void CodeGenCPU::CreateComputeScope(const AttrStmtNode* op) {
     llvm::Argument* v = &(*it);
     const Var& var = vargs[idx];
     var_map_[var.get()] = v;
-    if (var.dtype().is_handle() && !alias_var_set_.count(var.get())) {
+    if (var.ty().IsHandle() && !alias_var_set_.count(var.get())) {
       // set non alias.
       fcompute->addParamAttr(idx, llvm::Attribute::NoAlias);
       // always not inline compute function to make the code structure clean
@@ -577,8 +577,8 @@ void CodeGenCPU::CreateComputeScope(const AttrStmtNode* op) {
   }
 
   function_ = fcompute;
-  di_subprogram_ = CreateDebugFunction(MakeStringRef(value->value), vargs.Map(GetType),
-                                       PrimType(DataType::Int(32)));
+  di_subprogram_ =
+      CreateDebugFunction(MakeStringRef(value->value), vargs.Map(GetType), PrimType::Int(32));
   auto* compute_entry = llvm::BasicBlock::Create(*ctx, "entry", function_);
   builder_->SetInsertPoint(compute_entry);
   this->VisitStmt(op->body);
@@ -655,8 +655,8 @@ void CodeGenCPU::CreateParallelLaunch(const Stmt& body, int num_task, std::strin
   UnpackClosureData(cdata, vfields, &new_vmap);
   // setup parallel env
   ParallelEnv par_env;
-  par_env.task_id = Var("task_id", DataType::Int(32));
-  par_env.num_task = Var("num_task", DataType::Int(32));
+  par_env.task_id = Var("task_id", PrimType::Int(32));
+  par_env.num_task = Var("num_task", PrimType::Int(32));
   new_vmap[par_env.task_id.get()] = task_id;
   new_vmap[par_env.num_task.get()] = builder_->CreateLoad(
       t_int32_,
@@ -787,7 +787,7 @@ llvm::Value* CodeGenCPU::GetPackedFuncHandle(const std::string& fname) {
 }
 
 CodeGenCPU::PackedCall CodeGenCPU::MakeCallPackedLowered(const ffi::Array<PrimExpr>& args,
-                                                         const DataType& r_type,
+                                                         const PrimType& r_type,
                                                          const int64_t begin, const int64_t end,
                                                          bool use_env_lookup) {
   std::string func_name = [&]() {
@@ -835,9 +835,9 @@ CodeGenCPU::PackedCall CodeGenCPU::MakeCallPackedLowered(const ffi::Array<PrimEx
 
   PackedCall pc = {nullptr};
 
-  if (!r_type.is_void()) {
+  if (!r_type.IsVoid()) {
     // Load the return value and cast it to the designated type (r_type).
-    DataType r_api_type = tirx::APIType(r_type);
+    PrimType r_api_type = tirx::APIType(r_type);
     llvm::Type* llvm_r_api_type = DTypeToLLVMType(r_api_type);
     llvm::Value* result_value =
         builder_->CreateInBoundsGEP(t_tvm_ffi_any_, result, {ConstInt32(0), ConstInt32(2)});
@@ -860,14 +860,16 @@ CodeGenCPU::PackedCall CodeGenCPU::MakeCallPackedLowered(const ffi::Array<PrimEx
 llvm::Value* CodeGenCPU::CreateCallPacked(const CallNode* op) {
   TVM_FFI_ICHECK_EQ(op->args.size(), 4U);
   bool use_string_lookup = op->op.same_as(builtin::tvm_call_packed_lowered());
-  PackedCall pc = MakeCallPackedLowered(op->args, op->dtype, op->args[2].as<IntImmNode>()->value,
+  PackedCall pc = MakeCallPackedLowered(op->args, PrimType(op->ty()->dtype),
+                                        op->args[2].as<IntImmNode>()->value,
                                         op->args[3].as<IntImmNode>()->value, use_string_lookup);
   return pc.ret_value;
 }
 
 llvm::Value* CodeGenCPU::CreateCallTracePacked(const CallNode* op) {
   TVM_FFI_ICHECK_EQ(op->args.size(), 5U);
-  PackedCall pc = MakeCallPackedLowered(op->args, op->dtype, op->args[2].as<IntImmNode>()->value,
+  PackedCall pc = MakeCallPackedLowered(op->args, PrimType(op->ty()->dtype),
+                                        op->args[2].as<IntImmNode>()->value,
                                         op->args[3].as<IntImmNode>()->value, true);
   llvm::LLVMContext* ctx = llvm_target_->GetContext();
   // Get traced value.
@@ -1029,16 +1031,17 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const CallNode* op) {
   } else if (op->op.same_as(builtin::tvm_struct_get())) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 3U);
     int kind = op->args[2].as<IntImm>().value()->value;
+    PrimType op_dtype(op->ty()->dtype);
     TypedPointer ref =
-        CreateStructRefPtr(op->dtype, MakeValue(op->args[0]), MakeValue(op->args[1]), kind);
+        CreateStructRefPtr(op_dtype, MakeValue(op->args[0]), MakeValue(op->args[1]), kind);
     if (kind == builtin::kDLTensorAddr) {
       return builder_->CreatePointerCast(ref.addr, t_void_p_);
     }
 
     llvm::Value* struct_value = builder_->CreateLoad(ref.type, ref.addr);
 
-    if (op->dtype == DataType::Bool()) {
-      struct_value = CreateCast(DataType::Int(64), op->dtype, struct_value);
+    if (op_dtype == PrimType::Bool()) {
+      struct_value = CreateCast(PrimType::Int(64), op_dtype, struct_value);
     }
 
     return struct_value;
@@ -1046,7 +1049,7 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const CallNode* op) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 4U);
     int kind = op->args[2].as<IntImm>().value()->value;
     llvm::Value* value = MakeValue(op->args[3]);
-    TypedPointer ref = CreateStructRefPtr(op->args[3].dtype(), MakeValue(op->args[0]),
+    TypedPointer ref = CreateStructRefPtr(PrimType(op->args[3].ty()->dtype), MakeValue(op->args[0]),
                                           MakeValue(op->args[1]), kind);
     TVM_FFI_ICHECK(kind != builtin::kDLTensorAddr);
     if (value->getType()->isPointerTy()) {
@@ -1180,7 +1183,7 @@ void CodeGenCPU::VisitStmt_(const ForNode* op) {
       TVM_FFI_ICHECK(parallel_env_.task_id.defined());
       TVM_FFI_ICHECK(parallel_env_.num_task.defined());
       TVM_FFI_ICHECK(parallel_env_.penv != nullptr);
-      DataType t = op->extent.dtype();
+      PrimType t(op->extent.ty()->dtype);
       PrimExpr num_task = cast(t, parallel_env_.num_task);
       PrimExpr task_id = cast(t, parallel_env_.task_id);
       TVM_FFI_ICHECK(!parallel_env_.in_parallel_loop)
diff --git a/src/target/llvm/codegen_cpu.h b/src/target/llvm/codegen_cpu.h
index 6096cc140517..d090687b7e31 100644
--- a/src/target/llvm/codegen_cpu.h
+++ b/src/target/llvm/codegen_cpu.h
@@ -128,7 +128,7 @@ class CodeGenCPU : public CodeGenLLVM {
   llvm::Value* GetPackedFuncHandle(const std::string& str);
   TypedPointer PackClosureData(const ffi::Array<Var>& fields, uint64_t* num_bytes,
                                std::string struct_name = "");
-  TypedPointer CreateStructRefPtr(DataType t, llvm::Value* buffer, llvm::Value* index, int kind);
+  TypedPointer CreateStructRefPtr(PrimType t, llvm::Value* buffer, llvm::Value* index, int kind);
   void UnpackClosureData(TypedPointer cdata, const ffi::Array<Var>& fields,
                          std::unordered_map<const VarNode*, llvm::Value*>* vmap);
   // Make packed call.
@@ -137,7 +137,7 @@ class CodeGenCPU : public CodeGenLLVM {
     llvm::Value* ret_type_index;
     llvm::BasicBlock* end_block;
   };
-  PackedCall MakeCallPackedLowered(const ffi::Array<PrimExpr>& args, const DataType& r_type,
+  PackedCall MakeCallPackedLowered(const ffi::Array<PrimExpr>& args, const PrimType& r_type,
                                    const int64_t begin, const int64_t end, bool use_string_lookup);
   // create call into tvm packed function.
   llvm::Value* CreateCallPacked(const CallNode* op);
diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc
index 4eb0a503f09b..0a5acd348a6c 100644
--- a/src/target/llvm/codegen_llvm.cc
+++ b/src/target/llvm/codegen_llvm.cc
@@ -99,6 +99,18 @@
 namespace tvm {
 namespace codegen {
 
+namespace {
+// Returns the vscale factor when `dtype` is a scalable vector, otherwise its lane count.
+int GetLanesOrVScaleFactor(const PrimType& dtype) {
+  return dtype.IsScalableVector() ? dtype.VScaleFactor() : dtype.lanes();
+}
+// Builds a scalable-vector type with the code and bits of `dtype` and the given vscale factor.
+PrimType WithScalableVScaleFactor(const PrimType& dtype, int vscale_factor) {
+  return PrimType::ScalableVector(dtype.code(), dtype.bits(), vscale_factor);
+}
+
+}  // namespace
+
 // CodeGenLLVM has members of type std::unique_ptr<T>. These members will be
 // instantiated in the constructor, which will requre that the type T is
 // complete at that point. Put the constructor (and destructor) here, since
@@ -253,7 +265,7 @@ llvm::Function* CodeGenLLVM::DeclareFunctionInternal(const GlobalVar& gvar, cons
   is_restricted_ = func->HasNonzeroAttr(tirx::attr::kNoAlias);
   for (Var param : func->params) {
     param_types.push_back(GetLLVMType(param));
-    if (!is_restricted_ && param.dtype().is_handle()) {
+    if (!is_restricted_ && PrimType(param.ty()->dtype).IsHandle()) {
       alias_var_set_.insert(param.get());
     }
   }
@@ -304,7 +316,7 @@ void CodeGenLLVM::AddFunctionInternal(const GlobalVar& gvar, const PrimFunc& f)
     var_map_[var.get()] = v;
     v->setName(std::string(var->name_hint));
     if (is_restricted_) {
-      if (var.dtype().is_handle() && !alias_var_set_.count(var.get())) {
+      if (PrimType(var.ty()->dtype).IsHandle() && !alias_var_set_.count(var.get())) {
         // set non alias.
         function_->addParamAttr(i, llvm::Attribute::NoAlias);
       }
@@ -558,21 +570,21 @@ int CodeGenLLVM::NativeVectorBits(const runtime::StorageScope& storage_scope) co
 
 unsigned CodeGenLLVM::GetGlobalAddressSpace() const { return 0; }
 
-llvm::Type* CodeGenLLVM::DTypeToLLVMType(const DataType& dtype) const {
-  if (dtype.is_handle()) {
+llvm::Type* CodeGenLLVM::DTypeToLLVMType(const PrimType& dtype) const {
+  if (dtype.IsHandle()) {
     TVM_FFI_ICHECK_EQ(dtype.lanes(), 1);
     return t_void_p_;
   }
-  if (dtype.is_void()) {
+  if (dtype.IsVoid()) {
     return t_void_;
   }
   llvm::Type* etype = nullptr;
   llvm::LLVMContext* ctx = llvm_target_->GetContext();
-  if (dtype.is_int() || dtype.is_uint()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     etype = llvm::Type::getIntNTy(*ctx, dtype.bits());
-  } else if (dtype.is_bool()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
     etype = t_int1_;
-  } else if (dtype.is_float()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     switch (dtype.bits()) {
       case 16:
         etype = llvm::Type::getHalfTy(*ctx);
@@ -586,21 +598,24 @@ llvm::Type* CodeGenLLVM::DTypeToLLVMType(const DataType& dtype) const {
       default:
         TVM_FFI_THROW(InternalError) << "do not support " << dtype;
     }
-  } else if (dtype.code() == DataType::kFloat8_e3m4 || dtype.code() == DataType::kFloat8_e4m3 ||
-             dtype.code() == DataType::kFloat8_e4m3b11fnuz ||
-             dtype.code() == DataType::kFloat8_e4m3fn ||
-             dtype.code() == DataType::kFloat8_e4m3fnuz || dtype.code() == DataType::kFloat8_e5m2 ||
-             dtype.code() == DataType::kFloat8_e5m2fnuz ||
-             dtype.code() == DataType::kFloat8_e8m0fnu) {
+  } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e3m4 ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e4m3 ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e4m3b11fnuz ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fn ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e5m2 ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e5m2fnuz ||
+             dtype.code() == DLDataTypeCode::kDLFloat8_e8m0fnu) {
     etype = llvm::Type::getInt8Ty(*ctx);
-  } else if (dtype.code() == DataType::kFloat6_e2m3fn || dtype.code() == DataType::kFloat6_e3m2fn) {
+  } else if (dtype.code() == DLDataTypeCode::kDLFloat6_e2m3fn ||
+             dtype.code() == DLDataTypeCode::kDLFloat6_e3m2fn) {
     etype = llvm::Type::getIntNTy(*ctx, 6);
-  } else if (dtype.code() == DataType::kFloat4_e2m1fn) {
+  } else if (dtype.code() == DLDataTypeCode::kDLFloat4_e2m1fn) {
     etype = llvm::Type::getIntNTy(*ctx, 4);
   }
-  if (!dtype.is_scalar()) {
-    if (dtype.is_scalable_vector()) {
-      return llvm::VectorType::get(etype, dtype.vscale_factor(), true);
+  if (!dtype.IsScalar()) {
+    if (dtype.IsScalableVector()) {
+      return llvm::VectorType::get(etype, dtype.VScaleFactor(), true);
     } else {
       return llvm::FixedVectorType::get(etype, dtype.lanes());
     }
@@ -611,12 +626,12 @@ llvm::Type* CodeGenLLVM::DTypeToLLVMType(const DataType& dtype) const {
 
 llvm::Type* CodeGenLLVM::GetLLVMType(const Type& type) const {
   if (auto* ptr = type.as<PrimTypeNode>()) {
-    return DTypeToLLVMType(ptr->dtype);
+    return DTypeToLLVMType(PrimType(ptr->dtype));
   } else if (auto* ptr = type.as<PointerTypeNode>()) {
     // LLVM IR doesn't allow void*, so pointer element types that do not
     // have an LLVM scalar equivalent need explicit handling.
     if (auto* primtype = ptr->element_type.as<PrimTypeNode>()) {
-      if (primtype->dtype.is_void()) {
+      if (PrimType(primtype->dtype).IsVoid()) {
         return t_void_p_;
       }
     } else if (ptr->element_type->IsInstance<TensorMapTypeNode>()) {
@@ -645,7 +660,7 @@ llvm::Type* CodeGenLLVM::GetLLVMType(const PrimExpr& expr) const {
 // This trick comes from Halide's CodeGen_LLVM
 //
 void CodeGenLLVM::AddAliasInfo(llvm::Instruction* inst, const VarNode* buffer_var, PrimExpr index,
-                               DataType access_dtype) {
+                               PrimType access_dtype) {
   if (alias_var_set_.count(buffer_var) != 0) {
     // Mark all possibly aliased pointer as same type.
     llvm::MDNode* meta = md_tbaa_alias_set_;
@@ -666,7 +681,7 @@ void CodeGenLLVM::AddAliasInfo(llvm::Instruction* inst, const VarNode* buffer_va
     base = ptr->value;
     xwith = 1;
   }
-  if (access_dtype.is_scalable_vector()) {
+  if (access_dtype.IsScalableVector()) {
     llvm::MDNode* meta = md_tbaa_root_;
     std::ostringstream buffer_addr;
     buffer_addr << buffer_var;
@@ -707,7 +722,7 @@ void CodeGenLLVM::AddAliasInfo(llvm::Instruction* inst, const VarNode* buffer_va
   inst->setMetadata("tbaa", md_builder_->createTBAAStructTagNode(meta, meta, 0));
 }
 
-void CodeGenLLVM::GetAlignment(DataType t, const VarNode* buf_var, const PrimExpr& index,
+void CodeGenLLVM::GetAlignment(PrimType t, const VarNode* buf_var, const PrimExpr& index,
                                int* p_alignment, int* p_native_bits) {
   int max_align_bits = t.bits();
   auto it = alloc_storage_info_.find(buf_var);
@@ -736,7 +751,7 @@ void CodeGenLLVM::GetAlignment(DataType t, const VarNode* buf_var, const PrimExp
   *p_alignment = align_bits / 8;
 }
 
-llvm::GlobalVariable* CodeGenLLVM::AllocateSharedMemory(DataType dtype, size_t size,
+llvm::GlobalVariable* CodeGenLLVM::AllocateSharedMemory(PrimType dtype, size_t size,
                                                         unsigned int shared_address_space,
                                                         int alignment,
                                                         llvm::GlobalValue::LinkageTypes linkage) {
@@ -794,7 +809,7 @@ llvm::Value* CodeGenLLVM::CreateVecFlip(llvm::Value* vec) {
 }
 
 llvm::Value* CodeGenLLVM::CreateVecPad(llvm::Value* vec, int target_lanes) {
-  llvm::Value* mask = llvm::UndefValue::get(DTypeToLLVMType(DataType::Int(32, target_lanes)));
+  llvm::Value* mask = llvm::UndefValue::get(DTypeToLLVMType(PrimType::Int(32, target_lanes)));
   int num_elems = GetVectorNumElements(vec);
   if (num_elems == target_lanes) return vec;
   TVM_FFI_ICHECK_LT(num_elems, target_lanes);
@@ -869,7 +884,7 @@ void CodeGenLLVM::CreateSerialFor(llvm::Value* begin, llvm::Value* end, llvm::Va
   TVM_FFI_ICHECK(!var_map_.count(loop_var.get()));
   var_map_[loop_var.get()] = loop_value;
 
-  auto lt = CreateLT(loop_var.dtype(), loop_value, end);
+  auto lt = CreateLT(PrimType(loop_var.ty()->dtype), loop_value, end);
   builder_->CreateCondBr(lt, for_body, for_end, md_very_likely_branch_);
   builder_->SetInsertPoint(for_body);
   EmitDebugLocation(body->span);
@@ -881,47 +896,56 @@ void CodeGenLLVM::CreateSerialFor(llvm::Value* begin, llvm::Value* end, llvm::Va
 
   builder_->CreateBr(for_next);
   builder_->SetInsertPoint(for_next);
-  llvm::Value* loop_next = CreateAdd(loop_var.dtype(), loop_value, stride);
+  llvm::Value* loop_next = CreateAdd(PrimType(loop_var.ty()->dtype), loop_value, stride);
   loop_value->addIncoming(loop_next, builder_->GetInsertBlock());
   builder_->CreateBr(for_begin);
   builder_->SetInsertPoint(for_end);
 }
 
 // cast operatpr
-llvm::Value* CodeGenLLVM::CreateCast(DataType from, DataType to, llvm::Value* value) {
+llvm::Value* CodeGenLLVM::CreateCast(PrimType from, PrimType to, llvm::Value* value) {
   llvm::Type* target = DTypeToLLVMType(to);
   if (value->getType() == target) return value;
   // TODO(tvm-team): consider add native support
-  TVM_FFI_ICHECK(!from.is_bfloat16()) << "BF16 needs to be storaged lowered first";
-  TVM_FFI_ICHECK(!to.is_bfloat16()) << "BF16 needs to be storaged lowered first";
-
-  if (to.is_handle()) {
+  // Storage lowering depends on scalar element type; LLVM vector type construction
+  // preserves the lane information separately.
+  TVM_FFI_ICHECK(!from.MatchesElementType(DLDataTypeCode::kDLBfloat, 16))
+      << "BF16 needs to be storaged lowered first";
+  TVM_FFI_ICHECK(!to.MatchesElementType(DLDataTypeCode::kDLBfloat, 16))
+      << "BF16 needs to be storaged lowered first";
+  // Select the LLVM cast instruction from the (from, to) type-code combination.
+  if (to.IsHandle()) {
     return builder_->CreateBitCast(value, target);
-  } else if (to.is_bool()) {
-    if (from.is_float()) {
+  } else if (to.MatchesCode(DLDataTypeCode::kDLBool)) {
+    if (from.MatchesCode(DLDataTypeCode::kDLFloat)) {
       llvm::Constant* zero = llvm::ConstantFP::get(DTypeToLLVMType(from), 0.);
       return builder_->CreateFCmpUNE(value, zero);
     } else {
       llvm::Constant* zero = llvm::ConstantInt::get(DTypeToLLVMType(from), 0);
       return builder_->CreateICmpNE(value, zero);
     }
-  } else if (!from.is_float() && !to.is_float()) {
-    return builder_->CreateIntCast(value, target, from.is_int());
-  } else if (from.is_float() && to.is_int()) {
+  } else if (!from.MatchesCode(DLDataTypeCode::kDLFloat) &&
+             !to.MatchesCode(DLDataTypeCode::kDLFloat)) {
+    return builder_->CreateIntCast(value, target, from.MatchesCode(DLDataTypeCode::kDLInt));
+  } else if (from.MatchesCode(DLDataTypeCode::kDLFloat) && to.MatchesCode(DLDataTypeCode::kDLInt)) {
     return builder_->CreateFPToSI(value, target);
-  } else if (from.is_float() && to.is_uint()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLFloat) &&
+             to.MatchesCode(DLDataTypeCode::kDLUInt)) {
     if (to.bits() < 8) {
-      value = builder_->CreateFPToUI(value, DTypeToLLVMType(to.with_bits(8)));
+      value = builder_->CreateFPToUI(value, DTypeToLLVMType(to.WithBits(8)));  // via 8-bit type
       return builder_->CreateIntCast(value, target, false);
     } else {
       return builder_->CreateFPToUI(value, target);
     }
-  } else if (from.is_int() && to.is_float()) {
+  } else if (from.MatchesCode(DLDataTypeCode::kDLInt) && to.MatchesCode(DLDataTypeCode::kDLFloat)) {
     return builder_->CreateSIToFP(value, target);
-  } else if ((from.is_uint() || from.is_bool()) && to.is_float()) {
+  } else if ((from.MatchesCode(DLDataTypeCode::kDLUInt) ||
+              from.MatchesCode(DLDataTypeCode::kDLBool)) &&
+             to.MatchesCode(DLDataTypeCode::kDLFloat)) {
     return builder_->CreateUIToFP(value, target);
   } else {
-    TVM_FFI_ICHECK(from.is_float() && to.is_float());
+    TVM_FFI_ICHECK(from.MatchesCode(DLDataTypeCode::kDLFloat) &&
+                   to.MatchesCode(DLDataTypeCode::kDLFloat));
     return builder_->CreateFPCast(value, target);
   }
 }
@@ -951,9 +975,9 @@ llvm::Constant* CodeGenLLVM::GetConstString(const std::string& str) {
 }
 
 CodeGenLLVM::TypedPointer CodeGenLLVM::CreateBufferPtr(llvm::Value* buffer_ptr,
-                                                       DataType buffer_element_dtype,
+                                                       PrimType buffer_element_dtype,
                                                        llvm::ArrayRef<llvm::Value*> indices,
-                                                       DataType value_dtype) {
+                                                       PrimType value_dtype) {
   TVM_FFI_ICHECK_EQ(indices.size(), 1)
       << "CodeGenLLVM requires all buffers to be flat 1-d buffers.";
   llvm::Value* index = indices[0];
@@ -1360,7 +1384,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
   } else if (op->op.same_as(builtin::shift_left())) {
     return builder_->CreateShl(MakeValue(op->args[0]), MakeValue(op->args[1]));
   } else if (op->op.same_as(builtin::shift_right())) {
-    if (op->args[0].dtype().is_int()) {
+    if (PrimType(op->args[0].ty()->dtype).MatchesCode(DLDataTypeCode::kDLInt)) {
       return builder_->CreateAShr(MakeValue(op->args[0]), MakeValue(op->args[1]));
     } else {
       return builder_->CreateLShr(MakeValue(op->args[0]), MakeValue(op->args[1]));
@@ -1382,7 +1406,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     }
 
     TypedPointer buffer_ptr = CreateBufferPtr(MakeValue(load->buffer->data), load->buffer->dtype,
-                                              indices_val, load->dtype);
+                                              indices_val, PrimType(load->ty()->dtype));
     return buffer_ptr.addr;
   } else if (op->op.same_as(builtin::reinterpret()) && is_zero(op->args[0])) {
     return llvm::Constant::getNullValue(t_void_p_);
@@ -1397,9 +1421,9 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     uint64_t low = static_cast<uint64_t>(op->args[0].as_or_throw<IntImm>()->value);
     uint64_t high = static_cast<uint64_t>(op->args[1].as_or_throw<IntImm>()->value);
     uint64_t val = (high << 32U) | low;
-    return llvm::ConstantInt::get(DTypeToLLVMType(op->dtype), val);
+    return llvm::ConstantInt::get(DTypeToLLVMType(PrimType(op->ty()->dtype)), val);
   } else if (op->op.same_as(builtin::if_then_else())) {
-    TVM_FFI_ICHECK_EQ(op->args[0].dtype().lanes(), 1)
+    TVM_FFI_ICHECK_EQ(PrimType(op->args[0].ty()->dtype).lanes(), 1)
         << "if_then_else can only take scalar condition";
     llvm::LLVMContext* ctx = llvm_target_->GetContext();
     auto* then_block = llvm::BasicBlock::Create(*ctx, "if_then", function_);
@@ -1453,7 +1477,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     builder_->SetInsertPoint(post_dummy);
     return post_dummy;
   } else if (op->op.same_as(builtin::reinterpret())) {
-    llvm::Type* target = DTypeToLLVMType(op->dtype);
+    llvm::Type* target = DTypeToLLVMType(PrimType(op->ty()->dtype));
     llvm::Value* value = MakeValue(op->args[0]);
     if (value->getType()->isPointerTy() && target->isIntegerTy()) {
       return builder_->CreatePtrToInt(value, target);
@@ -1500,7 +1524,7 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
     return builder_->CreateCall(f);
   } else if (op->op.same_as(builtin::get_active_lane_mask())) {
     llvm::Intrinsic::ID id = llvm::Intrinsic::get_active_lane_mask;
-    llvm::Function* f = GetIntrinsicDecl(id, DTypeToLLVMType(op->dtype),
+    llvm::Function* f = GetIntrinsicDecl(id, DTypeToLLVMType(PrimType(op->ty()->dtype)),
                                          {builder_->getInt32Ty(), builder_->getInt32Ty()});
     return builder_->CreateCall(f, {MakeValue(op->args[0]), MakeValue(op->args[1])});
   } else {
@@ -1510,13 +1534,13 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) {
 
 void CodeGenLLVM::Scalarize(const PrimExpr& e, std::function<void(int i, llvm::Value* v)> f) {
   if (const RampNode* ramp = e.as<RampNode>()) {
-    for (int i = 0; i < ramp->dtype.lanes(); ++i) {
+    for (int i = 0; i < PrimType(ramp->ty()->dtype).lanes(); ++i) {
       PrimExpr offset = ramp->base + (ramp->stride * i);
       f(i, MakeValue(offset));
     }
   } else {
     llvm::Value* value = MakeValue(e);
-    for (int i = 0; i < e.dtype().lanes(); ++i) {
+    for (int i = 0; i < PrimType(e.ty()->dtype).lanes(); ++i) {
       f(i, builder_->CreateExtractElement(value, i));
     }
   }
@@ -1526,58 +1550,59 @@ void CodeGenLLVM::Scalarize(const PrimExpr& e, std::function<void(int i, llvm::V
 llvm::Value* CodeGenLLVM::VisitExpr_(const VarNode* op) { return GetVarValue(op); }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const CastNode* op) {
-  return CreateCast(op->value.dtype(), op->dtype, MakeValue(op->value));
+  return CreateCast(PrimType(op->value.ty()->dtype), PrimType(op->ty()->dtype),
+                    MakeValue(op->value));
 }
 llvm::Value* CodeGenLLVM::VisitExpr_(const IntImmNode* op) {
-  return llvm::ConstantInt::getSigned(DTypeToLLVMType(op->dtype), op->value);
+  return llvm::ConstantInt::getSigned(DTypeToLLVMType(PrimType(op->ty()->dtype)), op->value);
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const FloatImmNode* op) {
-  return llvm::ConstantFP::get(DTypeToLLVMType(op->dtype), op->value);
+  return llvm::ConstantFP::get(DTypeToLLVMType(PrimType(op->ty()->dtype)), op->value);
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const StringImmNode* op) { return GetConstString(op->value); }
 
-#define DEFINE_CODEGEN_BINARY_OP(Op)                                                 \
-  llvm::Value* CodeGenLLVM::Create##Op(DataType t, llvm::Value* a, llvm::Value* b) { \
-    if (t.is_int()) {                                                                \
-      if (t.bits() >= 32) {                                                          \
-        return builder_->CreateNSW##Op(a, b);                                        \
-      } else {                                                                       \
-        return builder_->Create##Op(a, b);                                           \
-      }                                                                              \
-    } else if (t.is_uint()) {                                                        \
-      if (t.bits() >= 32) {                                                          \
-        return builder_->CreateNUW##Op(a, b);                                        \
-      } else {                                                                       \
-        return builder_->Create##Op(a, b);                                           \
-      }                                                                              \
-    } else {                                                                         \
-      TVM_FFI_ICHECK(t.is_float());                                                  \
-      return builder_->CreateF##Op(a, b);                                            \
-    }                                                                                \
-  }                                                                                  \
-  llvm::Value* CodeGenLLVM::VisitExpr_(const Op##Node* op) {                         \
-    return Create##Op(op->dtype, MakeValue(op->a), MakeValue(op->b));                \
+#define DEFINE_CODEGEN_BINARY_OP(Op)                                                  \
+  llvm::Value* CodeGenLLVM::Create##Op(PrimType t, llvm::Value* a, llvm::Value* b) {  \
+    if (t.MatchesCode(DLDataTypeCode::kDLInt)) {                                      \
+      if (t.bits() >= 32) {                                                           \
+        return builder_->CreateNSW##Op(a, b);                                         \
+      } else {                                                                        \
+        return builder_->Create##Op(a, b);                                            \
+      }                                                                               \
+    } else if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {                              \
+      if (t.bits() >= 32) {                                                           \
+        return builder_->CreateNUW##Op(a, b);                                         \
+      } else {                                                                        \
+        return builder_->Create##Op(a, b);                                            \
+      }                                                                               \
+    } else {                                                                          \
+      TVM_FFI_ICHECK(t.MatchesCode(DLDataTypeCode::kDLFloat));                        \
+      return builder_->CreateF##Op(a, b);                                             \
+    }                                                                                 \
+  }                                                                                   \
+  llvm::Value* CodeGenLLVM::VisitExpr_(const Op##Node* op) {                          \
+    return Create##Op(PrimType(op->ty()->dtype), MakeValue(op->a), MakeValue(op->b)); \
   }
 
 DEFINE_CODEGEN_BINARY_OP(Add);
 DEFINE_CODEGEN_BINARY_OP(Sub);
 DEFINE_CODEGEN_BINARY_OP(Mul);
 
-#define DEFINE_CODEGEN_CMP_OP(Op)                                                    \
-  llvm::Value* CodeGenLLVM::Create##Op(DataType t, llvm::Value* a, llvm::Value* b) { \
-    if (t.is_int()) {                                                                \
-      return builder_->CreateICmpS##Op(a, b);                                        \
-    } else if (t.is_uint()) {                                                        \
-      return builder_->CreateICmpU##Op(a, b);                                        \
-    } else {                                                                         \
-      TVM_FFI_ICHECK(t.is_float());                                                  \
-      return builder_->CreateFCmpO##Op(a, b);                                        \
-    }                                                                                \
-  }                                                                                  \
-  llvm::Value* CodeGenLLVM::VisitExpr_(const Op##Node* op) {                         \
-    return Create##Op(op->a.dtype(), MakeValue(op->a), MakeValue(op->b));            \
+#define DEFINE_CODEGEN_CMP_OP(Op)                                                       \
+  llvm::Value* CodeGenLLVM::Create##Op(PrimType t, llvm::Value* a, llvm::Value* b) {    \
+    if (t.MatchesCode(DLDataTypeCode::kDLInt)) {                                        \
+      return builder_->CreateICmpS##Op(a, b);                                           \
+    } else if (t.MatchesCode(DLDataTypeCode::kDLUInt)) {                                \
+      return builder_->CreateICmpU##Op(a, b);                                           \
+    } else {                                                                            \
+      TVM_FFI_ICHECK(t.MatchesCode(DLDataTypeCode::kDLFloat));                          \
+      return builder_->CreateFCmpO##Op(a, b);                                           \
+    }                                                                                   \
+  }                                                                                     \
+  llvm::Value* CodeGenLLVM::VisitExpr_(const Op##Node* op) {                            \
+    return Create##Op(PrimType(op->a.ty()->dtype), MakeValue(op->a), MakeValue(op->b)); \
   }
 
 DEFINE_CODEGEN_CMP_OP(LT);
@@ -1588,12 +1613,13 @@ DEFINE_CODEGEN_CMP_OP(GE);
 llvm::Value* CodeGenLLVM::VisitExpr_(const DivNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  if (op->dtype.is_int()) {
+  PrimType dtype(op->ty()->dtype);  // result type code selects sdiv / udiv / fdiv below
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     return builder_->CreateSDiv(a, b);
-  } else if (op->dtype.is_uint()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
     return builder_->CreateUDiv(a, b);
   } else {
-    TVM_FFI_ICHECK(op->dtype.is_float());
+    TVM_FFI_ICHECK(dtype.MatchesCode(DLDataTypeCode::kDLFloat));  // no other codes are handled
     return builder_->CreateFDiv(a, b);
   }
 }
@@ -1601,12 +1627,13 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const DivNode* op) {
 llvm::Value* CodeGenLLVM::VisitExpr_(const ModNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  if (op->dtype.is_int()) {
+  PrimType dtype(op->ty()->dtype);  // result type code selects srem / urem / frem below
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     return builder_->CreateSRem(a, b);
-  } else if (op->dtype.is_uint()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
     return builder_->CreateURem(a, b);
   } else {
-    TVM_FFI_ICHECK(op->dtype.is_float());
+    TVM_FFI_ICHECK(dtype.MatchesCode(DLDataTypeCode::kDLFloat));  // no other codes are handled
     return builder_->CreateFRem(a, b);
   }
 }
@@ -1614,19 +1641,20 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const ModNode* op) {
 llvm::Value* CodeGenLLVM::VisitExpr_(const MinNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  return builder_->CreateSelect(CreateLT(op->a.dtype(), a, b), a, b);
+  return builder_->CreateSelect(CreateLT(PrimType(op->a.ty()->dtype), a, b), a, b);
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const MaxNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  return builder_->CreateSelect(CreateGT(op->a.dtype(), a, b), a, b);
+  return builder_->CreateSelect(CreateGT(PrimType(op->a.ty()->dtype), a, b), a, b);
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const EQNode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  if (op->a.dtype().is_int() || op->a.dtype().is_uint()) {
+  PrimType dtype(op->a.ty()->dtype);
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     return builder_->CreateICmpEQ(a, b);
   } else {
     return builder_->CreateFCmpOEQ(a, b);
@@ -1636,7 +1664,8 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const EQNode* op) {
 llvm::Value* CodeGenLLVM::VisitExpr_(const NENode* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  if (op->a.dtype().is_int() || op->a.dtype().is_uint()) {
+  PrimType dtype(op->a.ty()->dtype);
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     return builder_->CreateICmpNE(a, b);
   } else {
     return builder_->CreateFCmpONE(a, b);
@@ -1675,23 +1704,23 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const LetNode* op) {
   return MakeValue(op->body);
 }
 
-bool CodeGenLLVM::HasAlignmentPadding(DataType dtype) {
-  if (dtype.is_scalable_vector()) {
+bool CodeGenLLVM::HasAlignmentPadding(PrimType dtype) {
+  if (dtype.IsScalableVector()) {  // scalable vectors are always reported as unpadded
     return false;
   }
   const llvm::DataLayout& data_layout = module_->getDataLayout();
   int bytes = data_layout.getTypeAllocSize(DTypeToLLVMType(dtype));
-  int bytes_scalar = data_layout.getTypeAllocSize(DTypeToLLVMType(dtype.element_of()));
+  int bytes_scalar = data_layout.getTypeAllocSize(DTypeToLLVMType(dtype.WithLanes(1)));
   return bytes != bytes_scalar * dtype.lanes();
 }
 
 void CodeGenLLVM::BufferAccessHelper(
     Buffer buffer, ffi::Array<PrimExpr> indices, ffi::Optional<PrimExpr> predicate,
-    DataType value_dtype,
+    PrimType value_dtype,
     std::function<llvm::Instruction*(TypedPointer buffer_ptr, int subelement_i,
                                      llvm::Value* predicate, int alignment, bool is_volatile)>
         make_instruction) {
-  DataType buffer_element_dtype = buffer->dtype;
+  PrimType buffer_element_dtype = buffer->dtype;
 
   TVM_FFI_ICHECK_GE(indices.size(), 1)
       << "Buffer " << buffer->name << " is accessed with no indices.  "
@@ -1703,21 +1732,20 @@ void CodeGenLLVM::BufferAccessHelper(
   // requires 1-d indices.
   std::vector<llvm::Value*> earlier_index_values;
   for (size_t i = 0; i < indices.size() - 1; i++) {
-    TVM_FFI_ICHECK_EQ(indices[i].dtype().lanes(), 1)
+    TVM_FFI_ICHECK_EQ(PrimType(indices[i].ty()->dtype).lanes(), 1)
         << "Buffer " << buffer->name << " is accessed with a multi-lane index at position " << i
         << ".  Multi-lane indices are only supported as the last index.";
     earlier_index_values.push_back(MakeValue(indices[i]));
   }
 
   PrimExpr last_index = indices[indices.size() - 1];
-  int last_index_lanes = last_index.dtype().get_lanes_or_vscale_factor();
-  int buffer_element_lanes = buffer_element_dtype.get_lanes_or_vscale_factor();
-  TVM_FFI_ICHECK_EQ(value_dtype.get_lanes_or_vscale_factor(),
-                    last_index_lanes * buffer_element_lanes);
+  int last_index_lanes = GetLanesOrVScaleFactor(PrimType(last_index.ty()->dtype));
+  int buffer_element_lanes = GetLanesOrVScaleFactor(buffer_element_dtype);
+  TVM_FFI_ICHECK_EQ(GetLanesOrVScaleFactor(value_dtype), last_index_lanes * buffer_element_lanes);
 
   // Record index and elemtype in original form used for alias info
   PrimExpr last_index_origin = last_index;
-  DataType buffer_element_dtype_origin = buffer_element_dtype;
+  PrimType buffer_element_dtype_origin = buffer_element_dtype;
 
   bool is_volatile = volatile_buf_.count(buffer->data.get());
 
@@ -1726,17 +1754,18 @@ void CodeGenLLVM::BufferAccessHelper(
   if (const RampNode* ramp_index = last_index.as<RampNode>()) {
     if (is_one(ramp_index->stride)) {
       last_index = ramp_index->base;
-      last_index_lanes = last_index.dtype().get_lanes_or_vscale_factor();
+      last_index_lanes = GetLanesOrVScaleFactor(PrimType(last_index.ty()->dtype));
     }
   }
 
   // All TVM arrays are densely packed.  If the vectorized LLVM type
   // contains padding for alignment, we need to index based on the
   // size of the scalar type to avoid introducing that padding.
-  bool last_index_is_scalar = !last_index.dtype().is_scalable_vector() && last_index_lanes == 1;
+  bool last_index_is_scalar =
+      !PrimType(last_index.ty()->dtype).IsScalableVector() && last_index_lanes == 1;
   if (last_index_is_scalar && HasAlignmentPadding(buffer_element_dtype)) {
     last_index = buffer_element_lanes * last_index;
-    buffer_element_dtype = buffer_element_dtype.element_of();
+    buffer_element_dtype = buffer_element_dtype.WithLanes(1);
     buffer_element_lanes = 1;
   }
 
@@ -1754,7 +1783,7 @@ void CodeGenLLVM::BufferAccessHelper(
     alignment = value_dtype.bits() / 8;
   }
 
-  TVM_FFI_ICHECK(!last_index.dtype().is_scalable_vector())
+  TVM_FFI_ICHECK(!PrimType(last_index.ty()->dtype).IsScalableVector())
       << "Scalable vector indices are not supported in LLVM buffer access codegen";
   llvm::Value* cached_vector_index = nullptr;
   for (int i = 0; i < last_index_lanes; ++i) {
@@ -1763,7 +1792,7 @@ void CodeGenLLVM::BufferAccessHelper(
     if (const RampNode* ramp = last_index.as<RampNode>()) {
       PrimExpr offset = ramp->base + (ramp->stride * i);
       last_index_value = MakeValue(offset);
-    } else if (last_index.dtype().is_vector()) {
+    } else if (!PrimType(last_index.ty()->dtype).IsScalar()) {
       if (i == 0) {
         cached_vector_index = MakeValue(last_index);
       }
@@ -1782,12 +1811,12 @@ void CodeGenLLVM::BufferAccessHelper(
     }
 
     TypedPointer buffer_ptr =
-        value_dtype.is_scalable_vector()
+        value_dtype.IsScalableVector()
             ? CreateBufferPtr(MakeValue(buffer->data), buffer_element_dtype, all_index_values,
-                              value_dtype.with_scalable_vscale_factor(value_dtype.vscale_factor() /
-                                                                      last_index_lanes))
+                              WithScalableVScaleFactor(
+                                  value_dtype, value_dtype.VScaleFactor() / last_index_lanes))
             : CreateBufferPtr(MakeValue(buffer->data), buffer_element_dtype, all_index_values,
-                              value_dtype.with_lanes(value_dtype.lanes() / last_index_lanes));
+                              value_dtype.WithLanes(value_dtype.lanes() / last_index_lanes));
     auto instruction =
         make_instruction(buffer_ptr, subelement_i, predicate_value, alignment, is_volatile);
     AddAliasInfo(instruction, buffer->data.get(), last_index_origin, buffer_element_dtype_origin);
@@ -1795,7 +1824,7 @@ void CodeGenLLVM::BufferAccessHelper(
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const BufferLoadNode* op) {
-  DataType value_dtype = op->dtype;
+  PrimType value_dtype(op->ty()->dtype);
 
   std::vector<llvm::Value*> loads;
 
@@ -1868,13 +1897,14 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const CallNode* op) {
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const RampNode* op) {
-  llvm::Value* vec = llvm::UndefValue::get(DTypeToLLVMType(op->dtype));
+  PrimType dtype(op->ty()->dtype);
+  llvm::Value* vec = llvm::UndefValue::get(DTypeToLLVMType(dtype));
   // TODO(ekalda): P4 in https://github.com/apache/tvm/issues/16455
-  TVM_FFI_ICHECK(!op->dtype.is_scalable_vector());
-  int lanes = op->dtype.lanes();
+  TVM_FFI_ICHECK(!dtype.IsScalableVector());
+  int lanes = dtype.lanes();
   for (int i = 0; i < lanes; ++i) {
     vec = builder_->CreateInsertElement(
-        vec, MakeValue(op->base + op->stride * MakeConst(op->stride.dtype(), i)), ConstInt32(i));
+        vec, MakeValue(op->base + op->stride * MakeConst(op->stride.ty(), i)), ConstInt32(i));
   }
   return vec;
 }
@@ -1884,7 +1914,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const ShuffleNode* op) {
   int total_lanes = 0;
   for (int i = 0, e = op->vectors.size(); i < e; ++i) {
     vecs[i] = VisitExpr(op->vectors[i]);
-    total_lanes += op->vectors[i].dtype().lanes();
+    total_lanes += PrimType(op->vectors[i].ty()->dtype).lanes();
   }
   llvm::Value* v0 = CreateVecConcat(vecs);
   std::vector<uint32_t> idx(op->indices.size());
@@ -1905,21 +1935,21 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const ShuffleNode* op) {
 }
 
 llvm::Value* CodeGenLLVM::VisitExpr_(const BroadcastNode* op) {
-  DataType dtype = op->dtype;
+  PrimType dtype(op->ty()->dtype);
   llvm::Value* value = MakeValue(op->value);
   llvm::Type* type = DTypeToLLVMType(dtype);
   llvm::Constant* undef = llvm::UndefValue::get(type);
   llvm::Constant* zero = ConstInt32(0);
   value = builder_->CreateInsertElement(undef, value, zero);
   llvm::ElementCount ec =
-      llvm::ElementCount::get(dtype.get_lanes_or_vscale_factor(), dtype.is_scalable_vector());
+      llvm::ElementCount::get(GetLanesOrVScaleFactor(dtype), dtype.IsScalableVector());
   llvm::Constant* mask = llvm::ConstantVector::getSplat(ec, zero);
   return builder_->CreateShuffleVector(value, undef, mask);
 }
 
 void CodeGenLLVM::VisitStmt_(const BufferStoreNode* op) {
   EmitDebugLocation(op);
-  DataType value_dtype = op->value.dtype();
+  PrimType value_dtype = PrimType(op->value.ty()->dtype);
   Var buffer_var = op->buffer->data;
 
   llvm::Value* value = MakeValue(op->value);
@@ -1960,7 +1990,7 @@ void CodeGenLLVM::VisitStmt_(const ForNode* op) {
   } else {
     TVM_FFI_ICHECK(op->kind == ForKind::kSerial);
   }
-  PrimExpr step = op->step.value_or(MakeConst(op->extent->dtype, 1));
+  PrimExpr step = op->step.value_or(MakeConst(op->extent.ty(), 1));
   PrimExpr end = is_zero(op->min) ? op->extent : analyzer_->Simplify(op->min + op->extent);
   llvm::Value* begin_value = MakeValue(op->min);
   llvm::Value* end_value = MakeValue(end);
@@ -2087,7 +2117,7 @@ void CodeGenLLVM::VisitStmt_(const BindNode* op) {
   EmitDebugLocation(op);
   const VarNode* v = op->var.get();
   TVM_FFI_ICHECK(!var_map_.count(v));
-  if (v->dtype.is_handle()) {
+  if (v->ty().IsHandle()) {
     if (!is_restricted_) {
       alias_var_set_.insert(v);
     }
@@ -2098,10 +2128,10 @@ void CodeGenLLVM::VisitStmt_(const BindNode* op) {
   // Therefore, to have the correct LLVM type for pointers, we may
   // need to introduce a pointer-cast, even though pointer-to-pointer
   // casts are not expressible with the `tirx::CastNode`.
-  if (v->dtype.is_handle() && v->type_annotation.defined()) {
-    TVM_FFI_ICHECK(op->value->dtype.is_handle())
+  if (v->ty().IsHandle() && v->type_annotation.defined()) {
+    TVM_FFI_ICHECK(op->value.ty().IsHandle())
         << "Variable " << op->var << " is a pointer with type " << op->value
-        << ", but is being bound to expression with type " << op->value->dtype;
+        << ", but is being bound to expression with type " << op->value.ty();
     auto* llvm_type = GetLLVMType(v->type_annotation);
     if (llvm_type != value->getType()) {
       value->setName((v->name_hint + "_void_ptr").c_str());
@@ -2274,7 +2304,10 @@ llvm::DIType* CodeGenLLVM::GetDebugType(const Type& ty_tir, llvm::Type* ty_llvm)
 
   } else if (ty_llvm->isPointerTy()) {
     auto* ptr_type = ty_tir.as<PointerTypeNode>();
-    TVM_FFI_ICHECK(ptr_type != nullptr || GetRuntimeDataType(ty_tir).is_handle())
+    DLDataType runtime_dtype = GetRuntimeDataType(ty_tir);
+    TVM_FFI_ICHECK(ptr_type != nullptr ||
+                   (runtime_dtype.code == static_cast<uint8_t>(DLDataTypeCode::kDLOpaqueHandle) &&
+                    !(runtime_dtype.bits == 0 && static_cast<int16_t>(runtime_dtype.lanes) == 0)))
         << "Got LLVM pointer type from non-pointer IR type: " << ty_tir;
     auto* pointee_type = ptr_type != nullptr ? GetDebugType(ptr_type->element_type,
                                                             GetLLVMType(ptr_type->element_type))
@@ -2283,24 +2316,24 @@ llvm::DIType* CodeGenLLVM::GetDebugType(const Type& ty_tir, llvm::Type* ty_llvm)
                                                      ty_llvm->getPrimitiveSizeInBits());
 
   } else if (auto* prim_type = ty_tir.as<PrimTypeNode>()) {
-    DataType dtype = prim_type->dtype;
+    PrimType dtype(prim_type->dtype);
     llvm::dwarf::TypeKind dwarf_type;
-    if (dtype.is_bool()) {
+    if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
       dwarf_type = llvm::dwarf::DW_ATE_boolean;
-    } else if (dtype.is_float()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
       dwarf_type = llvm::dwarf::DW_ATE_float;
-    } else if (dtype.is_int()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
       dwarf_type = llvm::dwarf::DW_ATE_signed;
-    } else if (dtype.is_uint()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
       dwarf_type = llvm::dwarf::DW_ATE_unsigned;
     } else {
       return nullptr;
     }
 
-    if (dtype.is_scalable_vector()) return nullptr;
+    if (dtype.IsScalableVector()) return nullptr;
 
     return dbg_info_->di_builder_->createBasicType(
-        ffi::DLDataTypeToString(dtype).operator std::string(), dtype.bits() * dtype.lanes(),
+        ffi::DLDataTypeToString(dtype->dtype).operator std::string(), dtype.bits() * dtype.lanes(),
         dwarf_type);
 
   } else {
diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h
index a2c3b6e4ff48..777eebe8097b 100644
--- a/src/target/llvm/codegen_llvm.h
+++ b/src/target/llvm/codegen_llvm.h
@@ -351,7 +351,7 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
    */
   void BufferAccessHelper(
       Buffer buffer, ffi::Array<PrimExpr> indices, ffi::Optional<PrimExpr> predicate,
-      DataType value_dtype,
+      PrimType value_dtype,
       std::function<llvm::Instruction*(TypedPointer buffer_ptr, int subelement_i,
                                        llvm::Value* predicate, int alignment, bool is_volatile)>
           make_instruction);
@@ -400,7 +400,7 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
    *
    * \return LLVM type of dtype
    */
-  llvm::Type* DTypeToLLVMType(const DataType& dtype) const;
+  llvm::Type* DTypeToLLVMType(const PrimType& dtype) const;
   /*!
    * \brief Get the LLVM Type for a given type.
    * \param dtype The runtime dtype.
@@ -450,28 +450,28 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
   // initialize the function state.
   void InitFuncState();
   // Get alignment given index.
-  void GetAlignment(DataType t, const VarNode* buf_var, const PrimExpr& index, int* p_alignment,
+  void GetAlignment(PrimType t, const VarNode* buf_var, const PrimExpr& index, int* p_alignment,
                     int* p_native_bits);
   // Returns whether the LLVM type has padding for alignment
-  bool HasAlignmentPadding(DataType dtype);
+  bool HasAlignmentPadding(PrimType dtype);
   // do a scalarize call with f
   llvm::Value* CreateScalarizedCall(const CallNode* op, llvm::Function* f,
                                     const std::vector<llvm::Value*>& args);
   // handle module import
   void HandleImport(const std::string& code);
-  // cast operatpr
+  // cast operator
-  llvm::Value* CreateCast(DataType from, DataType to, llvm::Value* value);
+  llvm::Value* CreateCast(PrimType from, PrimType to, llvm::Value* value);
   // comparison op
   llvm::Value* GetVarValue(const VarNode* v) const;
-  llvm::Value* CreateLT(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateLE(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateGT(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateGE(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateAdd(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateSub(DataType t, llvm::Value* a, llvm::Value* b);
-  llvm::Value* CreateMul(DataType t, llvm::Value* a, llvm::Value* b);
-  virtual TypedPointer CreateBufferPtr(llvm::Value* buffer_ptr, DataType buffer_element_dtype,
-                                       llvm::ArrayRef<llvm::Value*> indices, DataType value_dtype);
+  llvm::Value* CreateLT(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateLE(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateGT(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateGE(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateAdd(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateSub(PrimType t, llvm::Value* a, llvm::Value* b);
+  llvm::Value* CreateMul(PrimType t, llvm::Value* a, llvm::Value* b);
+  virtual TypedPointer CreateBufferPtr(llvm::Value* buffer_ptr, PrimType buffer_element_dtype,
+                                       llvm::ArrayRef<llvm::Value*> indices, PrimType value_dtype);
   // Vector concatenation.
   llvm::Value* CreateVecSlice(llvm::Value* vec, int begin, int extent);
   llvm::Value* CreateVecFlip(llvm::Value* vec);
@@ -482,9 +482,9 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>,
                        const Var& loop_var, const Stmt& body);
   // add alias information.
   void AddAliasInfo(llvm::Instruction* inst, const VarNode* buffer_var, PrimExpr index,
-                    DataType access_dtype);
+                    PrimType access_dtype);
 
-  llvm::GlobalVariable* AllocateSharedMemory(DataType dtype, size_t size,
+  llvm::GlobalVariable* AllocateSharedMemory(PrimType dtype, size_t size,
                                              unsigned int shared_address_space, int alignment,
                                              llvm::GlobalValue::LinkageTypes linkage);
 
diff --git a/src/target/llvm/codegen_params.cc b/src/target/llvm/codegen_params.cc
index 0633c4fcb3b6..70a407b75984 100644
--- a/src/target/llvm/codegen_params.cc
+++ b/src/target/llvm/codegen_params.cc
@@ -78,8 +78,8 @@ llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::T
   TVM_FFI_ICHECK(arr.IsContiguous()) << "CodegenParams: only support contiguous arrays";
   TVM_FFI_ICHECK_EQ(arr->device.device_type, kDLCPU)
-      << "CodegenParams: only support contiguous arrays";
-  TVM_FFI_ICHECK_EQ(arr_type.lanes(), 1)
-      << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes();
+      << "CodegenParams: only support CPU arrays";
+  TVM_FFI_ICHECK_EQ(arr_type.lanes, 1)
+      << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes;
 
   auto shape = arr.Shape();
   int num_elements = 1;
@@ -89,15 +89,15 @@ llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::T
 
   std::vector<llvm::Constant*> elements;
 
-  switch (arr_type.code()) {
-    case runtime::DataType::kInt:
-      TVM_FFI_ICHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
-                     arr_type.bits() == 64)
+  switch (arr_type.code) {
+    case kDLInt:
+      TVM_FFI_ICHECK(arr_type.bits == 8 || arr_type.bits == 16 || arr_type.bits == 32 ||
+                     arr_type.bits == 64)
           << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
-          << arr_type.bits() << "-bit array";
-      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+          << static_cast<int>(arr_type.bits) << "-bit array";  // uint8_t would print as a char
+      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits);
 
-      switch (arr_type.bits()) {
+      switch (arr_type.bits) {
         case 8:
           BuildLLVMVector<int8_t>(element_type, arr->data, num_elements, &elements);
           break;
@@ -116,14 +116,14 @@ llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::T
       }
       break;
 
-    case runtime::DataType::TypeCode::kUInt:
-      TVM_FFI_ICHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
-                     arr_type.bits() == 64)
+    case kDLUInt:
+      TVM_FFI_ICHECK(arr_type.bits == 8 || arr_type.bits == 16 || arr_type.bits == 32 ||
+                     arr_type.bits == 64)
           << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
-          << arr_type.bits() << "-bit array";
-      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+          << static_cast<int>(arr_type.bits) << "-bit array";  // uint8_t would print as a char
+      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits);
 
-      switch (arr_type.bits()) {
+      switch (arr_type.bits) {
         case 8:
           BuildLLVMVector<uint8_t>(element_type, arr->data, num_elements, &elements);
           break;
@@ -142,11 +142,11 @@ llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::T
       }
       break;
 
-    case runtime::DataType::TypeCode::kFloat:
-      switch (arr_type.bits()) {
+    case kDLFloat:
+      switch (arr_type.bits) {
         case 16:
           // NOTE: float16 is treated as uint16_t.
-          element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+          element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits);
           BuildLLVMVector<uint16_t>(element_type, arr->data, num_elements, &elements);
           break;
         case 32:
@@ -159,15 +159,18 @@ llvm::ConstantArray* TensorToLLVMArray(llvm::LLVMContext* ctx, ::tvm::runtime::T
           break;
         default:
           TVM_FFI_ICHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw "
-                                << arr_type.bits() << "-bit array";
+                                << static_cast<int>(arr_type.bits) << "-bit array";
           break;
       }
       break;
 
-    case runtime::DataType::TypeCode::kBFloat:
-      TVM_FFI_ICHECK(arr_type.bits() == 16)
-          << "CodegenParams: only support 16-bit bfloat; saw " << arr_type.bits() << "-bit array";
-      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits());
+    case kDLBfloat:
+      // DLDataType::bits is uint8_t; cast so the message shows a number, not a raw character.
+      TVM_FFI_ICHECK(arr_type.bits == 16)
+          << "CodegenParams: only support 16-bit bfloat; saw "
+          << static_cast<int>(arr_type.bits) << "-bit array";
+      element_type = llvm::Type::getIntNTy(*ctx, arr_type.bits);
       BuildLLVMVector<uint16_t>(element_type, arr->data, num_elements, &elements);
+      break;  // do not fall through into `default:`
 
     default:
diff --git a/src/target/llvm/codegen_x86_64.cc b/src/target/llvm/codegen_x86_64.cc
index 292b20caa6ae..0ae7cd146c33 100644
--- a/src/target/llvm/codegen_x86_64.cc
+++ b/src/target/llvm/codegen_x86_64.cc
@@ -53,9 +53,10 @@ llvm::Value* CodeGenX86_64::VisitExpr_(const CastNode* op) {
   // LLVM does not automatically generate the correct instruction sequences for
   // half -> float conversion (i.e. using AVX2/AVX-512 vectorized variants of
   // vcvtph2ps), so we explicitly generate them ourselves.
-  const auto from = op->value.dtype();
-  const auto to = op->dtype;
-  if (from.is_float() && to.is_float() && from.bits() == 16 && to.bits() == 32) {
+  const auto from = PrimType(op->value.ty()->dtype);
+  const auto to = PrimType(op->ty()->dtype);
+  if (from.MatchesCode(DLDataTypeCode::kDLFloat) && to.MatchesCode(DLDataTypeCode::kDLFloat) &&
+      from.bits() == 16 && to.bits() == 32) {
     TVM_FFI_ICHECK_EQ(from.lanes(), to.lanes());
 
     const auto has_avx512 = llvm_target_->TargetHasCPUFeature("avx512f");
@@ -63,12 +64,12 @@ llvm::Value* CodeGenX86_64::VisitExpr_(const CastNode* op) {
     if (from.lanes() >= 16 && has_avx512) {
       return CallVectorIntrin(
           llvm::Intrinsic::x86_avx512_mask_vcvtph2ps_512, 16,
-          DTypeToLLVMType(DataType::Float(32, from.lanes())),
+          DTypeToLLVMType(PrimType::Float(32, from.lanes())),
           {
-              MakeValue(tirx::Call(DataType::Int(16, from.lanes()), tirx::builtin::reinterpret(),
+              MakeValue(tirx::Call(PrimType::Int(16, from.lanes()), tirx::builtin::reinterpret(),
                                    {op->value})),
-              MakeValue(tirx::Broadcast(FloatImm(DataType::Float(32), 0), from.lanes())),
-              /*mask=*/MakeValue(IntImm(DataType::Int(16), -1)),
+              MakeValue(tirx::Broadcast(FloatImm(PrimType::Float(32), 0), from.lanes())),
+              /*mask=*/MakeValue(IntImm(PrimType::Int(16), -1)),
               /*rounding-mode=*/MakeValue(IntImm::Int32(4)),
           });
     }
diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc
index 33c74d90ddca..4ade49d44fdd 100644
--- a/src/target/llvm/intrin_rule_llvm.cc
+++ b/src/target/llvm/intrin_rule_llvm.cc
@@ -126,7 +126,7 @@ TVM_REGISTER_OP("tirx.exp10")
       const tirx::CallNode* call = e.as<tirx::CallNode>();
       TVM_FFI_ICHECK(call != nullptr);
       const PrimExpr& x = call->args[0];
-      PrimExpr ln10 = MakeConst(x.dtype(), 2.302585093);
+      PrimExpr ln10 = MakeConst(x.ty(), 2.302585093);
       PrimExpr ret = exp(x * ln10);
       return ret;
     });
@@ -162,8 +162,9 @@ TVM_REGISTER_OP("tirx.atanh")
       const tirx::CallNode* call = e.as<tirx::CallNode>();
       TVM_FFI_ICHECK(call != nullptr) << "Invalid call node in atanh legalization";
       const PrimExpr& x = call->args[0];
-      PrimExpr one = MakeConst(x.dtype(), 1.0);
-      return (log(one + x) - log(one - x)) * MakeConst(x.dtype(), 0.5);
+      PrimType x_ty = x.ty();
+      PrimExpr one = MakeConst(x_ty, 1.0);
+      return (log(one + x) - log(one - x)) * MakeConst(x_ty, 0.5);
     });
 
 TVM_REGISTER_OP("tirx.clz")
@@ -172,12 +173,12 @@ TVM_REGISTER_OP("tirx.clz")
       TVM_FFI_ICHECK(call != nullptr);
       TVM_FFI_ICHECK_EQ(call->args.size(), 1);
       ffi::Array<PrimExpr> cargs;
-      cargs.push_back(IntImm(DataType::UInt(32), ::llvm::Intrinsic::ctlz));
+      cargs.push_back(IntImm(PrimType::UInt(32), ::llvm::Intrinsic::ctlz));
       cargs.push_back(call->args[0]);
-      cargs.push_back(IntImm(DataType::Int(1), 1));  // is_zero_undef
+      cargs.push_back(IntImm(PrimType::Int(1), 1));  // is_zero_undef
       // LLVM requires that the return type must match the first argument type
-      auto clz = tirx::Call(call->args[0]->dtype, tirx::builtin::call_llvm_intrin(), cargs);
-      return cast(call->dtype, clz);
+      auto clz = tirx::Call(call->args[0].ty(), tirx::builtin::call_llvm_intrin(), cargs);
+      return cast(call->ty(), clz);
     });
 
 }  // namespace legalize
diff --git a/src/target/llvm/intrin_rule_llvm.h b/src/target/llvm/intrin_rule_llvm.h
index b70d2b8001e0..5fb8801386d1 100644
--- a/src/target/llvm/intrin_rule_llvm.h
+++ b/src/target/llvm/intrin_rule_llvm.h
@@ -43,7 +43,7 @@ inline PrimExpr DispatchLLVMPureIntrin(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   ffi::Array<PrimExpr> cargs;
   // intrin id.
-  cargs.push_back(IntImm(DataType::UInt(32), id));
+  cargs.push_back(IntImm(PrimType::UInt(32), id));
   TVM_FFI_ICHECK_EQ(call->args.size(), num_signature)
       << "llvm.call_llvm_intrin" << llvmGetIntrinName(id) << "expects " << num_signature
       << " arguments, but got " << call->args.size();
@@ -51,7 +51,7 @@ inline PrimExpr DispatchLLVMPureIntrin(const PrimExpr& e) {
   for (PrimExpr arg : call->args) {
     cargs.push_back(arg);
   }
-  return tirx::Call(call->dtype, tirx::builtin::call_llvm_pure_intrin(), cargs);
+  return tirx::Call(call->ty(), tirx::builtin::call_llvm_pure_intrin(), cargs);
 }
 
 template <unsigned id, int num_signature>
@@ -60,14 +60,14 @@ inline PrimExpr DispatchLLVMIntrin(const PrimExpr& e) {
   TVM_FFI_ICHECK(call != nullptr);
   ffi::Array<PrimExpr> cargs;
   // intrin id.
-  cargs.push_back(IntImm(DataType::UInt(32), id));
+  cargs.push_back(IntImm(PrimType::UInt(32), id));
   TVM_FFI_ICHECK_EQ(call->args.size(), num_signature)
       << "llvm.call_llvm_intrin" << llvmGetIntrinName(id) << "expects " << num_signature
       << " arguments, but got " << call->args.size();
   for (PrimExpr arg : call->args) {
     cargs.push_back(arg);
   }
-  return tirx::Call(call->dtype, tirx::builtin::call_llvm_intrin(), cargs);
+  return tirx::Call(call->ty(), tirx::builtin::call_llvm_intrin(), cargs);
 }
 
 }  // namespace codegen
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
index 3ada4404b0be..c0924b799099 100644
--- a/src/target/source/codegen_c.cc
+++ b/src/target/source/codegen_c.cc
@@ -109,7 +109,7 @@ void CodeGenC::PrintFunctionSignature(const ffi::String& function_name, const Pr
     }
 
     bool no_alias = func->HasNonzeroAttr(tirx::attr::kNoAlias);
-    bool is_handle = v.dtype().is_handle();
+    bool is_handle = v.ty().IsHandle();
     auto* ptr = v->type_annotation.as<PointerTypeNode>();
     if (ptr && ptr->element_type.as<TensorMapTypeNode>()) {
       is_handle = false;
@@ -205,7 +205,7 @@ void CodeGenC::PrintExpr(const PrimExpr& n, std::ostream& os) {  // NOLINT(*)
   if (print_ssa_form_) {
     std::ostringstream temp;
     VisitExpr(n, temp);
-    os << SSAGetID(temp.str(), n.dtype());
+    os << SSAGetID(temp.str(), n.ty()->dtype);
   } else {
     VisitExpr(n, os);
   }
@@ -213,8 +213,8 @@ void CodeGenC::PrintExpr(const PrimExpr& n, std::ostream& os) {  // NOLINT(*)
 
 static bool CheckOutermostBracketMatch(const std::string& s);
 
-void CodeGenC::PrintSSAAssign(const std::string& target, const std::string& src, DataType t) {
-  PrintType(t, stream);
+void CodeGenC::PrintSSAAssign(const std::string& target, const std::string& src, PrimType t) {
+  PrintType(t->dtype, stream);
   stream << ' ' << target << " = ";
   if (CheckOutermostBracketMatch(src)) {
     stream << src.substr(1, src.length() - 2);
@@ -225,7 +225,8 @@ void CodeGenC::PrintSSAAssign(const std::string& target, const std::string& src,
 }
 
 // Print a reference expression to a buffer.
-std::string CodeGenC::GetBufferRef(DataType t, const BufferNode* buffer, PrimExpr index) {
+std::string CodeGenC::GetBufferRef(DLDataType t, const BufferNode* buffer, PrimExpr index) {
+  PrimType t_ty(t);
   const VarNode* buffer_var = buffer->data.get();
   std::ostringstream os;
   std::string vid = GetVarID(buffer_var);
@@ -235,7 +236,7 @@ std::string CodeGenC::GetBufferRef(DataType t, const BufferNode* buffer, PrimExp
   }
   bool is_vol = IsVolatile(buffer_var);
 
-  auto ptr_cast = [this, is_vol, scope](DataType pointed_to) {
+  auto ptr_cast = [this, is_vol, scope](DLDataType pointed_to) {
     std::ostringstream ptr_os;
     ptr_os << "(";
     if (is_vol) {
@@ -249,7 +250,7 @@ std::string CodeGenC::GetBufferRef(DataType t, const BufferNode* buffer, PrimExp
     return ptr_os.str();
   };
 
-  DataType buffer_element_dtype = buffer->dtype;
+  DLDataType buffer_element_dtype = buffer->dtype->dtype;
 
   std::string buffer_str = vid;
   if (!HandleTypeMatch(buffer_var, buffer_element_dtype) || is_vol) {
@@ -259,19 +260,20 @@ std::string CodeGenC::GetBufferRef(DataType t, const BufferNode* buffer, PrimExp
   }
 
   std::string index_str = PrintExpr(index);
-  if ((t.bits() == 4 && !t.is_float4()) || (t.bits() == 1 && t.is_int())) {
+  if ((t.bits == 4 && t_ty.code() != DLDataTypeCode::kDLFloat4_e2m1fn) ||
+      (t.bits == 1 && t_ty.MatchesCode(DLDataTypeCode::kDLInt))) {
     // This is a special case, because CodegenCUDA::PrintType()
     // returns "int" for bool and for 4-bit integers. In most cases,
     // we divide by the number of lanes to determine the index.
     // However, the backing type for scalar int4 and scalar bool is
     // int32.  Therefore, we need to divide by the ratio of their
     // sizes in that case.
-    int div_factor = (t.lanes() == 1) ? (32 / t.bits()) : t.lanes();
+    int div_factor = (t_ty.lanes() == 1) ? (32 / t.bits) : t_ty.lanes();
 
     os << "*("
        << "(" << ptr_cast(t) << vid << ")"
        << " + " << index_str << " / " << div_factor << ")";
-  } else if (t.is_float4_e2m1fn() && t.lanes() == 1) {
+  } else if (t_ty.code() == DLDataTypeCode::kDLFloat4_e2m1fn && t_ty.lanes() == 1) {
     // float4_e2m1fn: sizeof(__nv_fp4_e2m1) = 1 byte, but data is packed
     // 2 elements per byte.  Divide element index by 2 to get byte offset.
     // This returns an lvalue so it works for address_of() and stores.
@@ -287,8 +289,9 @@ std::string CodeGenC::GetBufferRef(DataType t, const BufferNode* buffer, PrimExp
 }
 
 // Print a reference expression to a buffer.
-std::string CodeGenC::GetStructRef(DataType t, const PrimExpr& buffer, const PrimExpr& index,
+std::string CodeGenC::GetStructRef(DLDataType t, const PrimExpr& buffer, const PrimExpr& index,
                                    int kind) {
+  PrimType t_ty(t);
   if (kind < builtin::kDLTensorKindBound_) {
     std::ostringstream os;
     os << "(((DLTensor*)";
@@ -357,11 +360,11 @@ std::string CodeGenC::GetStructRef(DataType t, const PrimExpr& buffer, const Pri
     os << "(((TVMFFIAny*)";
     this->PrintExpr(buffer, os);
     os << ")[" << index << "].";
-    if (t.is_handle()) {
+    if (t_ty.IsHandle()) {
       os << "v_ptr";
-    } else if (t.is_float()) {
+    } else if (t_ty.code() == DLDataTypeCode::kDLFloat) {
       os << "v_float64";
-    } else if (t.is_int()) {
+    } else if (t_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
       os << "v_int64";
     } else {
       TVM_FFI_THROW(InternalError) << "Do not know how to handle type" << t;
@@ -382,13 +385,13 @@ std::string CodeGenC::GetStructRef(DataType t, const PrimExpr& buffer, const Pri
   }
 }
 
-bool CodeGenC::HandleTypeMatch(const VarNode* buf_var, DataType t) const {
+bool CodeGenC::HandleTypeMatch(const VarNode* buf_var, DLDataType t) const {
   auto it = handle_data_type_.find(buf_var);
   if (it == handle_data_type_.end()) return false;
   return it->second == t;
 }
 
-void CodeGenC::RegisterHandleType(const VarNode* buf_var, DataType t) {
+void CodeGenC::RegisterHandleType(const VarNode* buf_var, DLDataType t) {
   auto it = handle_data_type_.find(buf_var);
   if (it == handle_data_type_.end()) {
     handle_data_type_[buf_var] = t;
@@ -401,39 +404,39 @@ void CodeGenC::RegisterHandleTypeFromPointer(const tirx::Var& var, const PrimExp
   if (value == nullptr) return;
   auto* call = value->as<tirx::CallNode>();
   if (call == nullptr || !call->op.same_as(builtin::ptr_byte_offset())) return;
-  std::optional<DataType> value_dtype = tirx::GetPointerType(GetType(*value));
+  std::optional<DLDataType> value_dtype = tirx::GetPointerType(GetType(*value));
   if (!value_dtype.has_value()) return;
   RegisterHandleType(var.get(), value_dtype.value());
   pointer_offset_vars_.insert(var.get());
 }
 
-void CodeGenC::PrintVecElemLoad(const std::string& vec, DataType t, int i,
+void CodeGenC::PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                                 std::ostream& os) {  // NOLINT(*)
   os << vec << ".s" << std::hex << i << std::dec;
 }
 
-void CodeGenC::PrintVecElemStore(const std::string& vec, DataType t, int i,
+void CodeGenC::PrintVecElemStore(const std::string& vec, DLDataType t, int i,
                                  const std::string& value) {
   this->PrintIndent();
   stream << vec << ".s" << std::hex << i << " = " << value << ";\n" << std::dec;
 }
 
-std::string CodeGenC::GetVecLoad(DataType t, const BufferNode* buffer, PrimExpr base) {
+std::string CodeGenC::GetVecLoad(DLDataType t, const BufferNode* buffer, PrimExpr base) {
   return GetBufferRef(t, buffer, base);
 }
 
-void CodeGenC::PrintVecStore(const BufferNode* buffer, DataType t, PrimExpr base,
+void CodeGenC::PrintVecStore(const BufferNode* buffer, DLDataType t, PrimExpr base,
                              const std::string& value) {
   std::string ref = GetBufferRef(t, buffer, base);
   this->PrintIndent();
   stream << ref << " = " << value << ";\n";
 }
 
-void CodeGenC::PrintVecConstructor(DataType t, std::ostream& os) {  // NOLINT(*)
+void CodeGenC::PrintVecConstructor(DLDataType t, std::ostream& os) {  // NOLINT(*)
   PrintType(t, os);
 }
 
-std::string CodeGenC::CastFromTo(std::string value, DataType from, DataType target) {
+std::string CodeGenC::CastFromTo(std::string value, DLDataType from, DLDataType target) {
   if (from == target) return value;
   std::ostringstream os;
   os << "((";
@@ -454,21 +457,21 @@ void CodeGenC::PrintStorageScope(const std::string& scope, std::ostream& os) {
 }
 
 inline void PrintConst(const IntImmNode* op, std::ostream& os, CodeGenC* p) {  // NOLINT(*)
-  if (op->dtype == DataType::Int(32)) {
+  if (op->ty() == PrimType::Int(32)) {
     std::ostringstream temp;
     temp << op->value;
     p->MarkConst(temp.str());
     os << temp.str();
   } else {
     os << "(";
-    p->PrintType(op->dtype, os);
+    p->PrintType(op->ty()->dtype, os);
     os << ")" << op->value;
   }
 }
 
-inline void PrintUIntConst(DataType dtype, uint64_t val, std::ostream& os,
+inline void PrintUIntConst(DLDataType dtype, uint64_t val, std::ostream& os,
                            CodeGenC* p) {  // NOLINT(*)
-  if (dtype == DataType::UInt(32)) {
+  if (dtype == DLDataType{kDLUInt, 32, 1}) {
     std::ostringstream temp;
     temp << val << "U";
     p->MarkConst(temp.str());
@@ -481,24 +484,24 @@ inline void PrintUIntConst(DataType dtype, uint64_t val, std::ostream& os,
 }
 
 inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenC* p) {  // NOLINT(*)
-  switch (op->dtype.bits()) {
+  switch (op->ty().bits()) {
     case 64:
     case 32: {
       std::ostringstream temp;
       temp << std::scientific << op->value;
-      if (op->dtype.bits() == 32) temp << 'f';
+      if (op->ty().bits() == 32) temp << 'f';
       p->MarkConst(temp.str());
       os << temp.str();
       break;
     }
     case 16: {
       os << '(';
-      p->PrintType(op->dtype, os);
+      p->PrintType(op->ty()->dtype, os);
       os << ')' << std::scientific << op->value << 'f';
       break;
     }
     default:
-      TVM_FFI_THROW(InternalError) << "Bad bit-width for float: " << op->dtype << "\n";
+      TVM_FFI_THROW(InternalError) << "Bad bit-width for float: " << op->ty()->dtype << "\n";
   }
 }
 
@@ -517,7 +520,7 @@ template <typename T>
 inline void PrintBinaryExpr(const T* op, const char* opstr,
                             std::ostream& os,  // NOLINT(*)
                             CodeGenC* p) {
-  if (op->dtype.lanes() == 1) {
+  if (op->ty().lanes() == 1) {
     if (isalpha(opstr[0])) {
       os << opstr << '(';
       p->PrintExpr(op->a, os);
@@ -532,14 +535,14 @@ inline void PrintBinaryExpr(const T* op, const char* opstr,
       os << ')';
     }
   } else {
-    p->PrintVecBinaryOp(opstr, op->dtype, op->a, op->b, os);
+    p->PrintVecBinaryOp(opstr, op->ty()->dtype, op->a, op->b, os);
   }
 }
 
 inline void PrintBinaryIntrinsic(const CallNode* op, const char* opstr,
                                  std::ostream& os,  // NOLINT(*)
                                  CodeGenC* p) {
-  if (op->dtype.lanes() == 1) {
+  if (op->ty().lanes() == 1) {
     TVM_FFI_ICHECK_EQ(op->args.size(), 2U);
     os << '(';
     p->PrintExpr(op->args[0], os);
@@ -547,13 +550,13 @@ inline void PrintBinaryIntrinsic(const CallNode* op, const char* opstr,
     p->PrintExpr(op->args[1], os);
     os << ')';
   } else {
-    p->PrintVecBinaryOp(opstr, op->dtype, op->args[0], op->args[1], os);
+    p->PrintVecBinaryOp(opstr, op->ty()->dtype, op->args[0], op->args[1], os);
   }
 }
 void CodeGenC::VisitExpr_(const CastNode* op, std::ostream& os) {  // NOLINT(*)
   std::stringstream value;
   this->PrintExpr(op->value, value);
-  os << CastFromTo(value.str(), op->value.dtype(), op->dtype);
+  os << CastFromTo(value.str(), op->value.ty()->dtype, op->ty()->dtype);
 }
 void CodeGenC::VisitExpr_(const VarNode* op, std::ostream& os) {  // NOLINT(*)
   os << GetVarID(op);
@@ -571,19 +574,20 @@ void CodeGenC::VisitExpr_(const DivNode* op, std::ostream& os) {  // NOLINT(*)
   PrintBinaryExpr(op, "/", os, this);
 }
 void CodeGenC::VisitExpr_(const ModNode* op, std::ostream& os) {  // NOLINT(*)
-  if (op->dtype.is_int() || op->dtype.is_uint()) {
+  PrimType op_ty = op->ty();  // read the node type once; every check below reuses it
+  if (op_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     PrintBinaryExpr(op, "%", os, this);
   } else {
-    TVM_FFI_ICHECK(op->dtype.is_float())
-        << "Expected floating point or integer dtype in Mod, but got " << op->dtype;
-    if (op->dtype.bits() == 32) {
+    TVM_FFI_ICHECK(op_ty.code() == DLDataTypeCode::kDLFloat)
+        << "Expected floating point or integer dtype in Mod, but got " << op_ty->dtype;
+    if (op_ty.bits() == 32) {
       PrintBinaryExpr(op, "fmodf", os, this);
-    } else if (op->dtype.bits() == 64) {
+    } else if (op_ty.bits() == 64) {
       PrintBinaryExpr(op, "fmod", os, this);
     } else {
       TVM_FFI_ICHECK(false)
           << "Non single or double precision floating point in Mod, expected 32 or 64 bits but got "
-          << op->dtype.bits() << " bits.";
+          << op_ty.bits() << " bits.";
     }
   }
 }
@@ -658,7 +662,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
         for (size_t i = 1; i < op->args.size(); i++) {
           arg_types.push_back(GetType(op->args[i]));
         }
-        Type ret_type = GetTypeFromRuntimeDataType(op->dtype);
+        Type ret_type = GetType(ffi::GetRef<PrimExpr>(op));
         this->GenerateForwardFunctionDeclarations(func->value, arg_types, ret_type);
       }
     } else if (op_attr_global_symbol_.count(call_op)) {
@@ -672,7 +676,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
       uint64_t low = static_cast<uint64_t>(op->args[0].as_or_throw<IntImm>()->value);
       uint64_t high = static_cast<uint64_t>(op->args[1].as_or_throw<IntImm>()->value);
       uint64_t val = (high << 32U) | low;
-      PrintUIntConst(op->dtype, val, os, this);
+      PrintUIntConst(op->ty()->dtype, val, os, this);
     } else if (op->op.same_as(builtin::bitwise_xor())) {
       PrintBinaryIntrinsic(op, " ^ ", os, this);
     } else if (op->op.same_as(builtin::bitwise_or())) {
@@ -691,7 +695,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
       std::string result = name_supply_->FreshName("condval");
       std::string cond = PrintExpr(op->args[0]);
       this->PrintIndent();
-      PrintType(op->dtype, this->stream);
+      PrintType(op->ty()->dtype, this->stream);
       this->stream << " " << result << ";\n";
       this->PrintIndent();
       this->stream << "if (" << cond << ") {\n";
@@ -721,13 +725,14 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
         TVM_FFI_ICHECK_EQ(load->indices.size(), 1)
             << "CodeGenC only supports flat memory allocations.";
         const VarNode* data = load->buffer->data.get();
-        if (pointer_offset_vars_.count(data) && HandleTypeMatch(data, load->buffer->dtype) &&
+        if (pointer_offset_vars_.count(data) && HandleTypeMatch(data, load->buffer->dtype->dtype) &&
             !IsVolatile(data)) {
           os << "(" << GetVarID(data) << " + ";
           this->PrintExpr(load->indices[0], os);
           os << ")";
         } else {
-          os << "(&(" << GetBufferRef(load->dtype, load->buffer.get(), load->indices[0]) << "))";
+          os << "(&(" << GetBufferRef(load->ty()->dtype, load->buffer.get(), load->indices[0])
+             << "))";
         }
       } else {
         auto* var = op->args[0].as<tirx::VarNode>();
@@ -752,7 +757,8 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
       }
     } else if (op->op.same_as(builtin::tvm_struct_get())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 3U);
-      os << GetStructRef(op->dtype, op->args[0], op->args[1], op->args[2].as<IntImmNode>()->value);
+      os << GetStructRef(op->ty()->dtype, op->args[0], op->args[1],
+                         op->args[2].as<IntImmNode>()->value);
     } else if (op->op.same_as(builtin::isnullptr())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 1U);
       os << "(";
@@ -761,7 +767,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
     } else if (op->op.same_as(builtin::ptr_byte_offset())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 3U);
       os << "((";
-      PrintType(op->args[2].dtype(), os);
+      PrintType(op->args[2].ty()->dtype, os);
       os << "*)(((char*)";
       this->PrintExpr(op->args[0], os);
       os << ") + ";
@@ -775,10 +781,10 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
       this->PrintExpr(op->args[1], os);
       os << "))";
     } else if (op->op.same_as(builtin::reinterpret())) {
-      auto target_dtype = op->dtype;
-      auto source_dtype = op->args[0]->dtype;
-      TVM_FFI_ICHECK_EQ(target_dtype.lanes() * target_dtype.bits(),
-                        source_dtype.lanes() * source_dtype.bits())
+      auto target_dtype = op->ty()->dtype;
+      auto source_dtype = op->args[0].ty()->dtype;
+      TVM_FFI_ICHECK_EQ(op->ty().lanes() * target_dtype.bits,
+                        op->args[0].ty().lanes() * source_dtype.bits)
           << "reinterpret expects source and target to have the same number of bits";
       int ssa_scope = BeginScope();
       std::string rhs = SSAGetID(PrintExpr(op->args[0]), source_dtype);
@@ -815,7 +821,7 @@ void CodeGenC::VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
   }
 }
 
-void CodeGenC::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, PrimExpr rhs,
+void CodeGenC::PrintVecBinaryOp(const std::string& op, DLDataType t, PrimExpr lhs, PrimExpr rhs,
                                 std::ostream& os) {  // NOLINT(*)
   if (isalpha(op[0])) {
     os << op << "(";
@@ -840,16 +846,18 @@ void CodeGenC::VisitExpr_(const BufferLoadNode* op, std::ostream& os) {  // NOLI
   TVM_FFI_ICHECK_EQ(op->indices.size(), 1) << "Load from non-flat memory not supported.";
   TVM_FFI_ICHECK(!op->predicate.defined()) << "Predicated buffer load is not supported.";
 
-  DataType value_dtype = op->dtype;
+  PrimType value_ty = op->ty();  // reuse the node's own type instead of rebuilding it
+  DLDataType value_dtype = value_ty->dtype;
   PrimExpr index = op->indices[0];
   Var buffer_var = op->buffer->data;
-  DataType element_dtype = op->buffer->dtype;
+  PrimType element_ty = op->buffer->dtype;  // buffer element type, kept as PrimType
+  DLDataType element_dtype = element_ty->dtype;
 
-  int lanes = op->dtype.lanes();
+  int lanes = value_ty.lanes();
   // delcare type.
-  if (value_dtype.lanes() == element_dtype.lanes()) {
-    std::string ref = GetBufferRef(op->dtype, op->buffer.get(), index);
-    if (value_dtype.is_float4_e2m1fn() && value_dtype.lanes() == 1) {
+  if (value_ty.lanes() == element_ty.lanes()) {
+    std::string ref = GetBufferRef(op->ty()->dtype, op->buffer.get(), index);
+    if (value_ty.code() == DLDataTypeCode::kDLFloat4_e2m1fn && value_ty.lanes() == 1) {
       // GetBufferRef returns an lvalue: *(ptr + index/2), which reads the
       // full byte.  Extract the correct nibble (low for even, high for odd).
       std::string index_str = PrintExpr(index);
@@ -863,34 +871,34 @@ void CodeGenC::VisitExpr_(const BufferLoadNode* op, std::ostream& os) {  // NOLI
   } else {
     bool can_vector_load = false;
     arith::PVar<PrimExpr> base;
-    if (arith::ramp(base, 1, op->dtype.lanes()).Match(index)) {
+    if (arith::ramp(base, 1, value_ty.lanes()).Match(index)) {
       const RampNode* ramp = index.as<RampNode>();
       TVM_FFI_ICHECK(ramp);
       arith::ModularSet me = arith::Analyzer()->modular_set(ramp->base);
       // The condition: {k * coeff + base} divisible by the alignment for any k
-      if (me->coeff % op->dtype.lanes() == 0 && me->base % op->dtype.lanes() == 0) {
+      if (me->coeff % value_ty.lanes() == 0 && me->base % value_ty.lanes() == 0) {
         can_vector_load = true;
       }
     }
 
-    if (value_dtype.is_float4_e2m1fn() && lanes != 1) {
+    if (value_ty.code() == DLDataTypeCode::kDLFloat4_e2m1fn && lanes != 1) {
       // A float4_e2m1fn element has 4 bits, which is an incomplete byte.
       // So we cannot vector load it.
       can_vector_load = false;
     }
     if (can_vector_load) {
-      std::string ref = GetVecLoad(op->dtype, op->buffer.get(), base.Eval());
+      std::string ref = GetVecLoad(op->ty()->dtype, op->buffer.get(), base.Eval());
       HandleVolatileLoads(ref, op, os);
     } else {
       std::ostringstream svalue_expr;
-      std::string sindex = SSAGetID(PrintExpr(index), index.dtype());
+      std::string sindex = SSAGetID(PrintExpr(index), index.ty()->dtype);
       std::string vid = GetVarID(buffer_var.get());
-      DataType elem_type = op->dtype.element_of();
+      DLDataType elem_type{value_dtype.code, value_dtype.bits, 1};
       for (int i = 0; i < lanes; ++i) {
         std::ostringstream value_temp;
         if (!HandleTypeMatch(buffer_var.get(), elem_type)) {
           value_temp << "((";
-          if (buffer_var.get()->dtype.is_handle()) {
+          if (buffer_var.get()->ty().IsHandle()) {
             auto it = alloc_storage_scope_.find(buffer_var.get());
             if (it != alloc_storage_scope_.end()) {
               PrintStorageScope(it->second, value_temp);
@@ -902,9 +910,9 @@ void CodeGenC::VisitExpr_(const BufferLoadNode* op, std::ostream& os) {  // NOLI
           value_temp << vid;
         }
         value_temp << '[';
-        PrintVecElemLoad(sindex, index.dtype(), i, value_temp);
+        PrintVecElemLoad(sindex, index.ty()->dtype, i, value_temp);
         value_temp << ']';
-        PrintVecElemLoadExpr(op->dtype, i, value_temp.str(), svalue_expr);
+        PrintVecElemLoadExpr(op->ty()->dtype, i, value_temp.str(), svalue_expr);
       }
       os << svalue_expr.str();
     }
@@ -915,12 +923,14 @@ void CodeGenC::VisitStmt_(const BufferStoreNode* op) {
   TVM_FFI_ICHECK_EQ(op->indices.size(), 1) << "Store to non-flat memory not supported.";
   TVM_FFI_ICHECK(!op->predicate.defined()) << "Predicated buffer store is not supported.";
 
-  DataType value_dtype = op->value.dtype();
-  DataType element_dtype = op->buffer->dtype;
+  PrimType value_ty = op->value.ty();  // reuse the stored value's own type object
+  DLDataType value_dtype = value_ty->dtype;
+  PrimType element_ty = op->buffer->dtype;  // buffer element type, kept as PrimType
+  DLDataType element_dtype = element_ty->dtype;
   PrimExpr index_expr = op->indices[0];
   Var buffer_var = op->buffer->data;
 
-  if (value_dtype.lanes() == element_dtype.lanes()) {
+  if (value_ty.lanes() == element_ty.lanes()) {
     std::string value = this->PrintExpr(op->value);
     std::string ref = this->GetBufferRef(value_dtype, op->buffer.get(), index_expr);
     this->PrintIndent();
@@ -928,8 +938,8 @@ void CodeGenC::VisitStmt_(const BufferStoreNode* op) {
   } else {
     arith::PVar<PrimExpr> base;
 
-    if (arith::ramp(base, 1, value_dtype.lanes()).Match(index_expr) &&
-        !value_dtype.is_float4_e2m1fn()) {
+    if (arith::ramp(base, 1, value_ty.lanes()).Match(index_expr) &&
+        value_ty.code() != DLDataTypeCode::kDLFloat4_e2m1fn) {
       std::string value = this->PrintExpr(op->value);
       this->PrintVecStore(op->buffer.get(), value_dtype, base.Eval(), value);
     } else {
@@ -938,15 +948,15 @@ void CodeGenC::VisitStmt_(const BufferStoreNode* op) {
       int vec_scope = BeginScope();
 
       // store elements separately
-      std::string index = SSAGetID(PrintExpr(index_expr), index_expr.dtype());
-      std::string value = SSAGetID(PrintExpr(op->value), op->value.dtype());
+      std::string index = SSAGetID(PrintExpr(index_expr), index_expr.ty()->dtype);
+      std::string value = SSAGetID(PrintExpr(op->value), op->value.ty()->dtype);
       std::string vid = GetVarID(buffer_var.get());
-      for (int i = 0; i < value_dtype.lanes(); ++i) {
+      for (int i = 0; i < value_ty.lanes(); ++i) {
         this->PrintIndent();
-        DataType elem_type = value_dtype.element_of();
+        DLDataType elem_type{value_dtype.code, value_dtype.bits, 1};
         if (!HandleTypeMatch(buffer_var.get(), elem_type)) {
           stream << "((";
-          if (buffer_var.get()->dtype.is_handle()) {
+          if (buffer_var.get()->ty().IsHandle()) {
             auto it = alloc_storage_scope_.find(buffer_var.get());
             if (it != alloc_storage_scope_.end()) {
               PrintStorageScope(it->second, stream);
@@ -958,9 +968,9 @@ void CodeGenC::VisitStmt_(const BufferStoreNode* op) {
           stream << vid;
         }
         stream << '[';
-        PrintVecElemLoad(index, index_expr.dtype(), i, stream);
+        PrintVecElemLoad(index, index_expr.ty()->dtype, i, stream);
         stream << "] = ";
-        PrintVecElemLoad(value, op->value.dtype(), i, stream);
+        PrintVecElemLoad(value, op->value.ty()->dtype, i, stream);
         stream << ";\n";
       }
       EndScope(vec_scope);
@@ -983,13 +993,13 @@ void CodeGenC::VisitExpr_(const LetNode* op, std::ostream& os) {  // NOLINT(*)
     var_idmap_[op->var.get()] = value;
   } else {
     PrintIndent();
-    if (op->var.dtype() == DataType::Handle() && handle_data_type_.count(op->var.get())) {
+    if (op->var.ty().IsHandle() && handle_data_type_.count(op->var.get())) {
       PrintType(handle_data_type_.at(op->var.get()), this->stream);
       this->stream << "* " << AllocVarID(op->var.get()) << " = (";
       PrintType(handle_data_type_.at(op->var.get()), this->stream);
       this->stream << "*)" << value << ";\n";
     } else {
-      PrintType(op->var.dtype(), this->stream);
+      PrintType(op->var.ty()->dtype, this->stream);
       this->stream << ' ' << AllocVarID(op->var.get()) << " = " << value << ";\n";
     }
   }
@@ -1004,8 +1014,8 @@ void CodeGenC::VisitExpr_(const LetNode* op, std::ostream& os) {  // NOLINT(*)
 void CodeGenC::VisitExpr_(const RampNode* op, std::ostream& os) {  // NOLINT(*)
   // NOTE: C have comma expression so cannot use (int2)(v0, v1)
   // instead should use int2(v0, v1)
-  PrintType(op->dtype, os);
-  int lanes = op->dtype.lanes();
+  PrintType(op->ty()->dtype, os);
+  int lanes = op->ty().lanes();
   os << "(";
   for (int i = 0; i < lanes; i++) {
     os << "(" << PrintExpr(op->base) << ")"
@@ -1031,11 +1041,11 @@ void CodeGenC::VisitExpr_(const ShuffleNode* op, std::ostream& os) {  // NOLINT(
   if (op->vectors.size() > 1) {
     for (const PrimExpr& vec : op->vectors) {
       std::string vec_value = this->PrintExpr(vec);
-      if (vec.dtype().lanes() == 1) {
+      if (vec.ty().lanes() == 1) {
         concat_vec.push_back(vec_value);
       } else {
         // print out each element
-        for (int i = 0; i < vec.dtype().lanes(); ++i) {
+        for (int i = 0; i < vec.ty().lanes(); ++i) {
           // access i-th element of each vector
           std::ostringstream vec_elem_strm;
           vec_elem_strm << vec_value << "[" << i << "]";
@@ -1046,14 +1056,14 @@ void CodeGenC::VisitExpr_(const ShuffleNode* op, std::ostream& os) {  // NOLINT(
   } else {
     // Extract elements from a single vector-type value.
     std::string vec_value = "(" + this->PrintExpr(op->vectors[0]) + ")";
-    if (op->vectors[0].dtype().lanes() == 1) {
+    if (op->vectors[0].ty().lanes() == 1) {
       concat_vec.push_back(vec_value);
     } else {
       // print out each element
-      for (int i = 0; i < op->vectors[0].dtype().lanes(); ++i) {
+      for (int i = 0; i < op->vectors[0].ty().lanes(); ++i) {
         // access i-th element of each vector
         std::ostringstream vec_elem_strm;
-        PrintVecElemLoad(vec_value, op->vectors[0].dtype(), i, vec_elem_strm);
+        PrintVecElemLoad(vec_value, op->vectors[0].ty()->dtype, i, vec_elem_strm);
         concat_vec.push_back(vec_elem_strm.str());
       }
     }
@@ -1071,7 +1081,7 @@ void CodeGenC::VisitExpr_(const ShuffleNode* op, std::ostream& os) {  // NOLINT(
   } else {
     // Print the shuffle as vector constructor
     // vec(e0, e1, e2, .. en)
-    PrintVecConstructor(op->dtype, os);
+    PrintVecConstructor(op->ty()->dtype, os);
     os << '(';
     for (size_t i = 0; i < op->indices.size(); ++i) {
       if (i != 0) os << ", ";
@@ -1108,13 +1118,13 @@ void CodeGenC::VisitStmt_(const BindNode* op) {
     var_idmap_[op->var.get()] = value;
   } else {
     PrintIndent();
-    if (op->var.dtype() == DataType::Handle() && handle_data_type_.count(op->var.get())) {
+    if (op->var.ty().IsHandle() && handle_data_type_.count(op->var.get())) {
       PrintType(handle_data_type_.at(op->var.get()), stream);
       stream << "* " << AllocVarID(op->var.get()) << " = (";
       PrintType(handle_data_type_.at(op->var.get()), stream);
       stream << "*)" << value << ";\n";
     } else {
-      PrintType(op->var.dtype(), this->stream);
+      PrintType(op->var.ty()->dtype, this->stream);
       this->stream << ' ' << AllocVarID(op->var.get()) << " = " << value << ";\n";
     }
   }
@@ -1138,10 +1148,10 @@ void CodeGenC::VisitStmt_(const AllocBufferNode* op) {
   alloc_storage_scope_[op->buffer->data.get()] = scope;
   PrintStorageScope(scope, stream);
 
-  PrintType(op->buffer->dtype, stream);
+  PrintType(op->buffer->dtype->dtype, stream);
   stream << ' ' << vid << '[' << constant_size << "];\n";
 
-  RegisterHandleType(op->buffer->data.get(), op->buffer->dtype);
+  RegisterHandleType(op->buffer->data.get(), op->buffer->dtype->dtype);
   if (op->annotations.count(tirx::attr::kVolatile)) {
     MarkVolatile(op->buffer->data.get());
   }
@@ -1247,7 +1257,7 @@ void CodeGenC::VisitStmt_(const ForNode* op) {
   PrintIndent();
   std::string vid = AllocVarID(op->loop_var.get());
   stream << "for (";
-  PrintType(op->loop_var.dtype(), stream);
+  PrintType(op->loop_var.ty()->dtype, stream);
   stream << ' ' << vid << " = " << begin_str << "; " << vid << " < " << end_str << "; ";
   if (step_str.empty()) {
     stream << "++" << vid;
@@ -1326,23 +1336,23 @@ void CodeGenC::VisitStmt_(const EvaluateNode* op) {
     } else if (call->op.same_as(builtin::tvm_struct_set())) {
       TVM_FFI_ICHECK_EQ(call->args.size(), 4);
       int kind = call->args[2].as<IntImmNode>()->value;
-      DataType store_dtype = call->args[3].dtype();
+      DLDataType store_dtype = call->args[3].ty()->dtype;
+      PrimType store_ty(store_dtype);
       std::string ref = GetStructRef(store_dtype, call->args[0], call->args[1], kind);
       std::string value = PrintExpr(call->args[3]);
       std::string cast;
 
-      if (kind == builtin::kTVMFFIAnyUnionValue &&
-          (store_dtype.bits() < 64 || store_dtype.is_handle())) {
+      if (kind == builtin::kTVMFFIAnyUnionValue && (store_dtype.bits < 64 || store_ty.IsHandle())) {
         this->PrintIndent();
         // when we set any union value, we need to be careful to
         // clear off the union value to zero if the set size is less than 64 bits
-        this->stream << GetStructRef(DataType::Int(64), call->args[0], call->args[1], kind)
+        this->stream << GetStructRef(DLDataType{kDLInt, 64, 1}, call->args[0], call->args[1], kind)
                      << " = 0;\n";
       }
 
       if (kind == builtin::kDLTensorStrides) {
         // cast void* to int64_t*
-        cast = call->args[3]->dtype.is_handle() ? "(int64_t*)" : "";
+        cast = call->args[3].ty().IsHandle() ? "(int64_t*)" : "";
       } else if (kind == builtin::kDLTensorDeviceType) {
         // cast int to enum
         cast = "(DLDeviceType)";
@@ -1359,9 +1369,12 @@ void CodeGenC::VisitStmt_(const EvaluateNode* op) {
   }
 }
 
-void CodeGenC::PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os) {
-  TVM_FFI_ICHECK_GT(t.lanes(), 1);
-  if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
+void CodeGenC::PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value,
+                                    std::ostream& os) {
+  PrimType t_ty(t);
+  int lanes = t_ty.lanes();
+  TVM_FFI_ICHECK_GT(lanes, 1);
+  if (t.bits == 8 && (t_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))) {
     if (i != 0) {
       os << "|";
     }
@@ -1377,7 +1390,7 @@ void CodeGenC::PrintVecElemLoadExpr(DataType t, int i, const std::string& value,
     os << "(";
   }
   os << value;
-  if (i != t.lanes() - 1) {
+  if (i != lanes - 1) {
     os << ",";
   } else {
     os << "))";
diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h
index a023277ed19c..61d640b66947 100644
--- a/src/target/source/codegen_c.h
+++ b/src/target/source/codegen_c.h
@@ -209,25 +209,27 @@ class CodeGenC : public ExprFunctor<void(const PrimExpr&, std::ostream&)>,
   virtual void PrintStorageScope(const std::string& scope, std::ostream& os);  // NOLINT(*)
   virtual void PrintStorageSync(const CallNode* op);                           // NOLINT(*)
   // Binary vector op.
-  virtual void PrintVecBinaryOp(const std::string& op, DataType op_type, PrimExpr lhs, PrimExpr rhs,
+  virtual void PrintVecBinaryOp(const std::string& op, DLDataType op_type, PrimExpr lhs,
+                                PrimExpr rhs,
                                 std::ostream& os);  // NOLINT(*)
   // print vector load
-  virtual std::string GetVecLoad(DataType t, const BufferNode* buffer, PrimExpr base);
+  virtual std::string GetVecLoad(DLDataType t, const BufferNode* buffer, PrimExpr base);
   // print vector store
-  virtual void PrintVecStore(const BufferNode* buffer, DataType t, PrimExpr base,
+  virtual void PrintVecStore(const BufferNode* buffer, DLDataType t, PrimExpr base,
                              const std::string& value);  // NOLINT(*)
   // print load of single element
-  virtual void PrintVecElemLoad(const std::string& vec, DataType t, int i,
+  virtual void PrintVecElemLoad(const std::string& vec, DLDataType t, int i,
                                 std::ostream& os);  // NOLINT(*)
   // print store of single element.
-  virtual void PrintVecElemStore(const std::string& vec, DataType t, int i,
+  virtual void PrintVecElemStore(const std::string& vec, DLDataType t, int i,
                                  const std::string& value);
   // print vector constructor
-  virtual void PrintVecConstructor(DataType t, std::ostream& os);
+  virtual void PrintVecConstructor(DLDataType t, std::ostream& os);
   // Get a cast type from to
-  virtual std::string CastFromTo(std::string value, DataType from, DataType target);
+  virtual std::string CastFromTo(std::string value, DLDataType from, DLDataType target);
   // Get load of single element with expression
-  virtual void PrintVecElemLoadExpr(DataType t, int i, const std::string& value, std::ostream& os);
+  virtual void PrintVecElemLoadExpr(DLDataType t, int i, const std::string& value,
+                                    std::ostream& os);
   // Print restrict keyword for a given Var if applicable
   virtual void PrintRestrict(const Var& v, std::ostream& os);
 
@@ -239,9 +241,9 @@ class CodeGenC : public ExprFunctor<void(const PrimExpr&, std::ostream&)>,
   /*! \brief Print a C string literal with proper escaping of special chars. */
   void PrintEscapedCString(const std::string& str, std::ostream& os);
   // Print reference to struct location
-  std::string GetStructRef(DataType t, const PrimExpr& buffer, const PrimExpr& index, int kind);
+  std::string GetStructRef(DLDataType t, const PrimExpr& buffer, const PrimExpr& index, int kind);
   // Print reference to a buffer as type t in index.
-  virtual std::string GetBufferRef(DataType t, const BufferNode* buffer, PrimExpr index);
+  virtual std::string GetBufferRef(DLDataType t, const BufferNode* buffer, PrimExpr index);
 
   /*!
    * \brief Handle volatile loads.
@@ -294,13 +296,13 @@ class CodeGenC : public ExprFunctor<void(const PrimExpr&, std::ostream&)>,
    * \param buf_var The buffer variable.
    * \param t The type to be checked.
    */
-  bool HandleTypeMatch(const VarNode* buf_var, DataType t) const;
+  bool HandleTypeMatch(const VarNode* buf_var, DLDataType t) const;
   /*!
    * \brief Register the data type of buf_var
    * \param buf_var The buffer variable.
    * \param t The type to be checked.
    */
-  void RegisterHandleType(const VarNode* buf_var, DataType t);
+  void RegisterHandleType(const VarNode* buf_var, DLDataType t);
   /*!
    * \brief Register a typed pointer produced by explicit pointer-offset intrinsics.
    *
@@ -310,7 +312,7 @@ class CodeGenC : public ExprFunctor<void(const PrimExpr&, std::ostream&)>,
    */
   void RegisterHandleTypeFromPointer(const tirx::Var& var, const PrimExpr* value);
   // override
-  void PrintSSAAssign(const std::string& target, const std::string& src, DataType t) override;
+  void PrintSSAAssign(const std::string& target, const std::string& src, PrimType t) override;
   /*! \brief reserves common C keywords */
   void ReserveKeywordsAsUnique();
 
@@ -324,7 +326,7 @@ class CodeGenC : public ExprFunctor<void(const PrimExpr&, std::ostream&)>,
   /*! \brief the storage scope of allocation */
   std::unordered_map<const VarNode*, std::string> alloc_storage_scope_;
   /*! \brief the data type of allocated buffers */
-  std::unordered_map<const VarNode*, DataType> handle_data_type_;
+  std::unordered_map<const VarNode*, DLDataType> handle_data_type_;
   /*! \brief Handle vars whose address_of(buffer[index]) should print as ptr + index. */
   std::unordered_set<const VarNode*> pointer_offset_vars_;
   /*! \brief Record of ops that have pre-defined global symbol. */
diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc
index 1319ae4a4b57..0071dc15c7e3 100644
--- a/src/target/source/codegen_c_host.cc
+++ b/src/target/source/codegen_c_host.cc
@@ -120,24 +120,24 @@ void CodeGenCHost::PrintFuncPrefix(std::ostream& os) {  // NOLINT(*)
      << "TVM_DLL ";
 }
 
-void CodeGenCHost::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
-  int lanes = t.lanes();
-  if (t.is_handle()) {
+void CodeGenCHost::PrintType(DLDataType t, std::ostream& os) {  // NOLINT(*)
+  int lanes = static_cast<int16_t>(t.lanes);
+  if (t.code == kDLOpaqueHandle && !(t.bits == 0 && lanes == 0)) {
     TVM_FFI_ICHECK_EQ(lanes, 1) << "does not support vector types";
     os << "void*";
     return;
   }
-  if (t.is_void()) {
+  if (t.code == kDLOpaqueHandle && t.bits == 0 && lanes == 0) {
     os << "void";
     return;
   }
-  if (t == DataType::Bool()) {
+  if (t.code == kDLBool && lanes == 1) {  // scalar bool, independent of storage bit width
     os << "bool";
     return;
   }
   bool fail = false;
-  if (t.is_float()) {
-    switch (t.bits()) {
+  if (t.code == kDLFloat) {
+    switch (t.bits) {
       case 16:
         os << "half";
         break;
@@ -156,11 +156,11 @@ void CodeGenCHost::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
       os << lanes;
       return;
     }
-  } else if (t.is_uint() || t.is_int()) {
-    if (t.is_uint()) {
+  } else if (t.code == kDLUInt || t.code == kDLInt) {
+    if (t.code == kDLUInt) {
       os << 'u';
     }
-    switch (t.bits()) {
+    switch (t.bits) {
       case 8:
         os << "int8_t";
         break;
@@ -191,9 +191,9 @@ void CodeGenCHost::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
 
 void CodeGenCHost::VisitExpr_(const BroadcastNode* op, std::ostream& os) {  // NOLINT(*)
   std::string v = PrintExpr(op->value);
-  int lanes = op->dtype.lanes();
+  int lanes = static_cast<int16_t>(op->ty()->dtype.lanes);
   os << "((";
-  PrintType(op->dtype, os);
+  PrintType(op->ty()->dtype, os);
   os << ")(";
   for (int i = 0; i < lanes; ++i) {
     if (i != 0) os << ", ";
@@ -356,10 +356,10 @@ inline void CodeGenCHost::PrintTernaryCondExpr(const T* op, const char* compare,
                                                std::ostream& os) {  // NOLINT(*)
   std::ostringstream temp_a;
   VisitExpr(op->a, temp_a);
-  std::string a_id = SSAGetID(temp_a.str(), op->a.dtype());
+  std::string a_id = SSAGetID(temp_a.str(), op->a.ty()->dtype);
   std::ostringstream temp_b;
   VisitExpr(op->b, temp_b);
-  std::string b_id = SSAGetID(temp_b.str(), op->b.dtype());
+  std::string b_id = SSAGetID(temp_b.str(), op->b.ty()->dtype);
 
   os << "((" << a_id << ") " << compare << " (" << b_id << ") "
      << "? (" << a_id << ") : (" << b_id << "))";
diff --git a/src/target/source/codegen_c_host.h b/src/target/source/codegen_c_host.h
index edeebe7da1cc..a384dc957a1e 100644
--- a/src/target/source/codegen_c_host.h
+++ b/src/target/source/codegen_c_host.h
@@ -57,8 +57,8 @@ class CodeGenCHost : public CodeGenC {
   void DefineModuleName();
 
   using CodeGenC::PrintType;
-  void PrintType(DataType t, std::ostream& os) final;  // NOLINT(*)
-  void PrintFuncPrefix(std::ostream& os) final;        // NOLINT(*)
+  void PrintType(DLDataType t, std::ostream& os) final;  // NOLINT(*)
+  void PrintFuncPrefix(std::ostream& os) final;          // NOLINT(*)
 
   // overload visitor functions
   void VisitExpr_(const BroadcastNode* op, std::ostream& os) final;  // NOLINT(*)
diff --git a/src/target/source/codegen_params.cc b/src/target/source/codegen_params.cc
index ae915f278f57..6f4cd1a12094 100644
--- a/src/target/source/codegen_params.cc
+++ b/src/target/source/codegen_params.cc
@@ -163,8 +163,8 @@ void PrintFloatingPointArray(void* data, size_t num_elements, int indent_chars,
 void TensorDataToC(::tvm::runtime::Tensor arr, int indent_chars, std::ostream& os,
                    const std::string& eol) {
   auto arr_type = arr.DataType();
-  TVM_FFI_ICHECK_EQ(arr_type.lanes(), 1)
-      << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes();
+  TVM_FFI_ICHECK_EQ(arr_type.lanes, 1)
+      << "CodegenParams: only support generating 1-lane parameters; saw " << arr_type.lanes;
 
   auto shape = arr.Shape();
   int num_elements = 1;
@@ -176,72 +176,74 @@ void TensorDataToC(::tvm::runtime::Tensor arr, int indent_chars, std::ostream& o
   os.setf(std::ios::internal | std::ios::hex,
           std::ios::adjustfield | std::ios::basefield | std::ios::showbase);
   os.fill('0');
-  switch (arr_type.code()) {
-    case runtime::DataType::kInt:
-      TVM_FFI_ICHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
-                     arr_type.bits() == 64)
+  // DLDataType::bits is a uint8_t: cast to int when streaming so it prints as a number.
+  switch (static_cast<DLDataTypeCode>(arr_type.code)) {
+    case DLDataTypeCode::kDLInt:
+      TVM_FFI_ICHECK(arr_type.bits == 8 || arr_type.bits == 16 || arr_type.bits == 32 ||
+                     arr_type.bits == 64)
           << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
-          << arr_type.bits() << "-bit array";
-      if (arr_type.bits() == 8) {
+          << static_cast<int>(arr_type.bits) << "-bit array";
+      if (arr_type.bits == 8) {
         PrintIntegralArray<int8_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 16) {
+      } else if (arr_type.bits == 16) {
         PrintIntegralArray<int16_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 32) {
+      } else if (arr_type.bits == 32) {
         PrintIntegralArray<int32_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 64) {
+      } else if (arr_type.bits == 64) {
         PrintIntegralArray<int64_t>(arr->data, num_elements, indent_chars, os, eol);
       } else {
         TVM_FFI_ICHECK(false) << "should not get here";
       }
       break;
 
-    case runtime::DataType::TypeCode::kUInt:
-      TVM_FFI_ICHECK(arr_type.bits() == 8 || arr_type.bits() == 16 || arr_type.bits() == 32 ||
-                     arr_type.bits() == 64)
+    case DLDataTypeCode::kDLUInt:
+      TVM_FFI_ICHECK(arr_type.bits == 8 || arr_type.bits == 16 || arr_type.bits == 32 ||
+                     arr_type.bits == 64)
           << "CodegenParams: only support generating 8-, 16-, 32-, or 64-bit integer params; saw "
-          << arr_type.bits() << "-bit array";
+          << static_cast<int>(arr_type.bits) << "-bit array";
 
-      if (arr_type.bits() == 8) {
+      if (arr_type.bits == 8) {
         PrintIntegralArray<uint8_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 16) {
+      } else if (arr_type.bits == 16) {
         PrintIntegralArray<uint16_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 32) {
+      } else if (arr_type.bits == 32) {
         PrintIntegralArray<uint32_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 64) {
+      } else if (arr_type.bits == 64) {
         PrintIntegralArray<uint64_t>(arr->data, num_elements, indent_chars, os, eol);
       } else {
         TVM_FFI_ICHECK(false) << "should not get here";
       }
       break;
 
-    case runtime::DataType::TypeCode::kFloat: {
+    case DLDataTypeCode::kDLFloat: {
       os.fill(' ');
       os.setf(std::ios::left, std::ios::adjustfield);
-      if (arr_type.bits() == 16) {
+      if (arr_type.bits == 16) {
         // NOTE: print types not widely supported by C as uint16_t.
         PrintIntegralArray<uint16_t>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 32) {
+      } else if (arr_type.bits == 32) {
         PrintFloatingPointArray<float>(arr->data, num_elements, indent_chars, os, eol);
-      } else if (arr_type.bits() == 64) {
+      } else if (arr_type.bits == 64) {
         PrintFloatingPointArray<double>(arr->data, num_elements, indent_chars, os, eol);
       } else {
         TVM_FFI_ICHECK(false) << "CodegenParams: only support 32- or 64-bit floating point; saw "
-                              << arr_type.bits() << "-bit array";
+                              << static_cast<int>(arr_type.bits) << "-bit array";
       }
       break;
     }
 
-    case runtime::DataType::TypeCode::kBFloat: {
+    case DLDataTypeCode::kDLBfloat: {
       // NOTE: print types not widely supported by C as uint16_t.
-      TVM_FFI_ICHECK(arr_type.bits() == 16)
-          << "CodegenParams: only support generating 16-bit bfloat params; saw " << arr_type.bits()
+      TVM_FFI_ICHECK(arr_type.bits == 16)
+          << "CodegenParams: only support generating 16-bit bfloat params; saw "
+          << static_cast<int>(arr_type.bits) << "-bit array";
-          << "-bit array";
       PrintIntegralArray<uint16_t>(arr->data, num_elements, indent_chars, os, eol);
       break;
     }
 
     default:
-      TVM_FFI_ICHECK(false) << "Data type '" << arr_type << "' not supported";
+      TVM_FFI_ICHECK(false) << "Data type '" << ffi::DLDataTypeToString(arr_type)
+                            << "' not supported";
   }
 
   os.flags(old_fmtflags);
diff --git a/src/target/source/codegen_source_base.cc b/src/target/source/codegen_source_base.cc
index 2646a6597ef4..6318fbe514e2 100644
--- a/src/target/source/codegen_source_base.cc
+++ b/src/target/source/codegen_source_base.cc
@@ -34,7 +34,7 @@ void CodeGenSourceBase::ClearFuncState() {
   scope_mark_.clear();
 }
 
-std::string CodeGenSourceBase::SSAGetID(std::string src, DataType t) {
+std::string CodeGenSourceBase::SSAGetID(std::string src, PrimType t) {
   if (name_supply_->ContainsName(src)) return src;
   auto it = ssa_assign_map_.find(src);
   if (it != ssa_assign_map_.end()) {
@@ -99,50 +99,51 @@ void CodeGenSourceBase::EndScope(int scope_id) {
   indent_ -= 2;
 }
 
-void CodeGenSourceBase::PrintType(DataType type, std::ostream& os) {  // NOLINT(*)
-  TVM_FFI_ICHECK_EQ(type.lanes(), 1) << "do not yet support vector types";
-  if (type.is_handle()) {
+void CodeGenSourceBase::PrintType(DLDataType type, std::ostream& os) {  // NOLINT(*)
+  int lanes = static_cast<int16_t>(type.lanes);
+  TVM_FFI_ICHECK_EQ(lanes, 1) << "do not yet support vector types";
+  if (type.code == kDLOpaqueHandle && !(type.bits == 0 && lanes == 0)) {
     os << "void*";
     return;
   }
-  if (type.is_void()) {
+  if (type.code == kDLOpaqueHandle && type.bits == 0 && lanes == 0) {
     os << "void";
     return;
   }
   // default c may be have bool type, can be handled in subclass
-  if (type.is_bool()) {
+  if (type.code == kDLBool) {
     os << "int";
     return;
   }
-  if (type.is_float()) {
-    if (type.bits() == 32) {
+  if (type.code == kDLFloat) {
+    if (type.bits == 32) {
       os << "float";
       return;
     }
-    if (type.bits() == 64) {
+    if (type.bits == 64) {
       os << "double";
       return;
     }
-  } else if (type.is_uint()) {
-    switch (type.bits()) {
+  } else if (type.code == kDLUInt) {
+    switch (type.bits) {
       case 8:
       case 16:
       case 32:
       case 64: {
-        os << "uint" << type.bits() << "_t";
+        os << "uint" << static_cast<int>(type.bits) << "_t";
         return;
       }
       case 1:
         os << "int";
         return;
     }
-  } else if (type.is_int()) {
-    switch (type.bits()) {
+  } else if (type.code == kDLInt) {
+    switch (type.bits) {
       case 8:
       case 16:
       case 32:
       case 64: {
-        os << "int" << type.bits() << "_t";
+        os << "int" << static_cast<int>(type.bits) << "_t";
         return;
       }
     }
diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h
index f6e58cc9efba..d869a811fe5e 100644
--- a/src/target/source/codegen_source_base.h
+++ b/src/target/source/codegen_source_base.h
@@ -58,7 +58,7 @@ class CodeGenSourceBase {
    * \param t The type representation.
    * \param os The stream to print the ctype into
    */
-  virtual void PrintType(DataType type, std::ostream& os);  // NOLINT(*)
+  virtual void PrintType(DLDataType type, std::ostream& os);  // NOLINT(*)
   /*!
    * Print Type representation of type type.
    * \param type The type representation.
@@ -96,7 +96,10 @@ class CodeGenSourceBase {
    * \param src The source expression
    * \param t The type of the expression.
    */
-  std::string SSAGetID(std::string src, DataType t);
+  std::string SSAGetID(std::string src, PrimType t);
+  std::string SSAGetID(std::string src, DLDataType t) {
+    return SSAGetID(std::move(src), PrimType(t));
+  }
   /*!
    * \brief mark the beginning of a new scope
    * \return The scope id.
@@ -113,7 +116,7 @@ class CodeGenSourceBase {
    * \param src The source expression.
    * \param t The type of target.
    */
-  virtual void PrintSSAAssign(const std::string& target, const std::string& src, DataType t) = 0;
+  virtual void PrintSSAAssign(const std::string& target, const std::string& src, PrimType t) = 0;
 
   /*! \brief the declaration stream */
   std::ostringstream decl_stream;
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index 57b82491c03d..972c85c3806e 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -198,7 +198,7 @@ class ConcreteCodegenSourceBase : public CodeGenSourceBase {
   /*!
    * \brief Do nothing as this class exist to get access to methods of CodeGenSourceBase
    */
-  void PrintSSAAssign(const std::string& target, const std::string& src, DataType t) final {
+  void PrintSSAAssign(const std::string& target, const std::string& src, PrimType t) final {
     return;
   }
 };
diff --git a/src/te/operation/compute_op.cc b/src/te/operation/compute_op.cc
index 0fa57d0f4617..a407fafa18fa 100644
--- a/src/te/operation/compute_op.cc
+++ b/src/te/operation/compute_op.cc
@@ -75,9 +75,9 @@ static inline void AssertReduceEqual(const tirx::ReduceNode* a, const tirx::Redu
 
 int ComputeOpNode::num_outputs() const { return body.size(); }
 
-DataType ComputeOpNode::output_dtype(size_t idx) const {
+PrimType ComputeOpNode::output_dtype(size_t idx) const {
   TVM_FFI_ICHECK_LT(idx, num_outputs());
-  return body[idx].dtype();
+  return body[idx].ty();
 }
 
 ffi::Array<PrimExpr> BaseComputeOpNode::output_shape(size_t idx) const {
@@ -100,8 +100,8 @@ Tensor compute(ffi::Array<PrimExpr> shape, FCompute fcompute, std::string name,
   for (size_t i = 0; i < ndim; ++i) {
     std::ostringstream os;
     os << "ax" << i;
-    axis.emplace_back(IterVar(Range(IntImm(shape[i]->dtype, 0), shape[i]),
-                              Var(os.str(), shape[i].dtype()), kDataPar));
+    axis.emplace_back(
+        IterVar(Range(IntImm(shape[i].ty(), 0), shape[i]), Var(os.str(), shape[i].ty()), kDataPar));
     args.push_back(axis.back()->var);
   }
 
@@ -117,8 +117,8 @@ ffi::Array<Tensor> compute(ffi::Array<PrimExpr> shape, FBatchCompute fcompute, s
   for (size_t i = 0; i < ndim; ++i) {
     std::ostringstream os;
     os << "ax" << i;
-    axis.emplace_back(IterVar(Range(IntImm(shape[i]->dtype, 0), shape[i]),
-                              Var(os.str(), shape[i].dtype()), kDataPar));
+    axis.emplace_back(
+        IterVar(Range(IntImm(shape[i].ty(), 0), shape[i]), Var(os.str(), shape[i].ty()), kDataPar));
     args.push_back(axis.back()->var);
   }
 
diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc
index 79ba7d8a3918..6127336296e7 100644
--- a/src/te/operation/create_primfunc.cc
+++ b/src/te/operation/create_primfunc.cc
@@ -387,7 +387,7 @@ Stmt GenerateBodyStmt(const ffi::Array<PrimExpr>& indices, const ffi::Array<Buff
       const PrimExpr& right = analyzer->Simplify(f_transform_and_remap(reduce->source[i]));
       lhs.push_back(left);
       rhs.push_back(right);
-      TVM_FFI_ICHECK_EQ(left->dtype, right->dtype);
+      TVM_FFI_ICHECK_EQ(left.ty()->dtype, right.ty()->dtype);
     }
 
     ffi::Array<Var> temp_vars;
@@ -404,7 +404,7 @@ Stmt GenerateBodyStmt(const ffi::Array<PrimExpr>& indices, const ffi::Array<Buff
       const Buffer& buffer = buffers[i];
       PrimExpr value{nullptr};
       if (n_buffers > 1) {
-        temp_vars.push_back(Var("v_" + buffer->name, PrimType(lhs[i].dtype())));
+        temp_vars.push_back(Var("v_" + buffer->name, lhs[i].ty()));
         value = temp_vars.back();
       } else {
         PrimExpr combined = reduce->combiner.get()->operator()(lhs, rhs)[i];
@@ -493,8 +493,8 @@ Stmt GenerateStmtFromCompute(const te::ComputeOp& compute_op, CreateFuncInfo* in
     NestedScopeInfo cur_scope;
     for (size_t j = 0; j < axes.size(); ++j) {
       const IterVar& axis = axes[j];
-      DataType index_type =
-          DataType::Int(std::max(axis->dom->min.dtype().bits(), axis->dom->extent.dtype().bits()));
+      PrimType index_type =
+          PrimType::Int(std::max(axis->dom->min.ty().bits(), axis->dom->extent.ty().bits()));
       bool first_times_define =
           std::find(axes_levels[i].begin(), axes_levels[i].end(), axis) != axes_levels[i].end();
       if (first_times_define) {
@@ -524,7 +524,7 @@ Stmt GenerateStmtFromCompute(const te::ComputeOp& compute_op, CreateFuncInfo* in
     }
     if (i == axes_levels.size() - 1 && cur_scope.block_iters.empty()) {
       // for the leaf scope, we ensure at least one block var exists
-      IterVar dummy(Range::FromMinExtent(0, 1), Var("vi", DataType::Int(32)),
+      IterVar dummy(Range::FromMinExtent(0, 1), Var("vi", PrimType::Int(32)),
                     IterVarType::kDataPar);
       cur_scope.AddBlockIter(std::nullopt, dummy, 0);
     }
@@ -740,8 +740,9 @@ PrimFunc GenerateAndCompletePrimFunc(const ffi::Array<te::Tensor>& arg_list,
                                      const ffi::Array<Stmt>& root_stmts, CreateFuncInfo* info) {
   ffi::Array<Var> parameters;
   ffi::Map<Var, Buffer> buffer_map;
+  PrimType handle_ty = PrimType::Handle();
   for (const te::Tensor& tensor : arg_list) {
-    Var arg("var_" + tensor->GetNameHint(), PrimType(DataType::Handle()));
+    Var arg("var_" + tensor->GetNameHint(), handle_ty);
     parameters.push_back(arg);
     auto it = info->tensor2buffers.find(tensor);
     TVM_FFI_ICHECK(it != info->tensor2buffers.end());
@@ -760,7 +761,7 @@ PrimFunc GenerateAndCompletePrimFunc(const ffi::Array<te::Tensor>& arg_list,
 }
 
 PrimFunc CreatePrimFunc(const ffi::Array<te::Tensor>& arg_list,
-                        std::optional<DataType> index_dtype_override) {
+                        std::optional<PrimType> index_dtype_override) {
   // Information used in CreatePrimFunc and its sub-functions.
   CreateFuncInfo info(arg_list);
   // Root body stmts.
@@ -792,10 +793,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def_packed("te.CreatePrimFunc", [](ffi::PackedArgs args, ffi::Any* ret) {
     ffi::Array<ffi::ObjectRef> arg_list = args[0].cast<ffi::Array<ffi::ObjectRef>>();
-    std::optional<DataType> index_dtype_override{std::nullopt};
+    std::optional<PrimType> index_dtype_override{std::nullopt};
     // Add conversion to make std::optional compatible with FFI.
     if (args[1] != nullptr) {
-      index_dtype_override = args[1].cast<DataType>();
+      index_dtype_override = args[1].cast<PrimType>();
     }
     *ret = CreatePrimFunc(arg_list, index_dtype_override);
   });
@@ -806,10 +807,11 @@ PrimFunc GenerateAndCompletePrimFunc(const ffi::Array<ffi::ObjectRef>& arg_tir_v
                                      const ffi::Array<Stmt>& root_stmts, CreateFuncInfo* info) {
   ffi::Array<Var> parameters;
   ffi::Map<Var, Buffer> buffer_map;
+  PrimType handle_ty = PrimType::Handle();
   for (const ffi::ObjectRef& arg : arg_tir_var_list) {
     if (auto opt_tensor = arg.as<te::Tensor>()) {
       te::Tensor tensor = opt_tensor.value();
-      Var arg("var_" + tensor->GetNameHint(), PrimType(DataType::Handle()));
+      Var arg("var_" + tensor->GetNameHint(), handle_ty);
       parameters.push_back(arg);
       auto it = info->tensor2buffers.find(tensor);
       TVM_FFI_ICHECK(it != info->tensor2buffers.end());
@@ -831,7 +833,7 @@ PrimFunc GenerateAndCompletePrimFunc(const ffi::Array<ffi::ObjectRef>& arg_tir_v
 }
 
 PrimFunc CreatePrimFunc(const ffi::Array<ffi::ObjectRef>& arg_list,
-                        std::optional<DataType> index_dtype_override) {
+                        std::optional<PrimType> index_dtype_override) {
   ffi::Array<te::Tensor> tensor_arg_list;
   for (const ffi::ObjectRef& x : arg_list) {
     if (auto tensor_node = x.as<te::TensorNode>()) {
diff --git a/src/te/operation/create_primfunc.h b/src/te/operation/create_primfunc.h
index 107a22d33fe5..9b17dd135bda 100644
--- a/src/te/operation/create_primfunc.h
+++ b/src/te/operation/create_primfunc.h
@@ -31,11 +31,11 @@ namespace tirx {
 
 /*! \brief Use Tensor Expression to create a schedulable TensorIR func. */
 PrimFunc CreatePrimFunc(const ffi::Array<te::Tensor>& arg_list,
-                        std::optional<DataType> index_dtype_override = std::nullopt);
+                        std::optional<PrimType> index_dtype_override = std::nullopt);
 
 /*! \brief Use Tensor Expression to create a schedulable TensorIR func. */
 PrimFunc CreatePrimFunc(const ffi::Array<ffi::ObjectRef>& arg_list,
-                        std::optional<DataType> index_dtype_override);
+                        std::optional<PrimType> index_dtype_override);
 
 }  // namespace tirx
 }  // namespace tvm
diff --git a/src/te/operation/extern_op.cc b/src/te/operation/extern_op.cc
index b6b7c17691b9..6fbaf4482b5c 100644
--- a/src/te/operation/extern_op.cc
+++ b/src/te/operation/extern_op.cc
@@ -37,7 +37,9 @@ TVM_FFI_STATIC_INIT_BLOCK() { ExternOpNode::RegisterReflection(); }
 
 int ExternOpNode::num_outputs() const { return static_cast<int>(output_placeholders.size()); }
 
-DataType ExternOpNode::output_dtype(size_t i) const { return output_placeholders[i]->dtype; }
+PrimType ExternOpNode::output_dtype(size_t i) const {
+  return output_placeholders[i]->ElementType();
+}
 
 ffi::Array<PrimExpr> ExternOpNode::output_shape(size_t i) const {
   return output_placeholders[i]->shape;
diff --git a/src/te/operation/placeholder_op.cc b/src/te/operation/placeholder_op.cc
index 17f4791d7615..36e4629ef6fe 100644
--- a/src/te/operation/placeholder_op.cc
+++ b/src/te/operation/placeholder_op.cc
@@ -35,7 +35,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { PlaceholderOpNode::RegisterReflection(); }
 
 int PlaceholderOpNode::num_outputs() const { return 1; }
 
-DataType PlaceholderOpNode::output_dtype(size_t i) const {
+PrimType PlaceholderOpNode::output_dtype(size_t i) const {
   TVM_FFI_ICHECK_EQ(i, 0U);
   return dtype;
 }
@@ -45,7 +45,7 @@ ffi::Array<PrimExpr> PlaceholderOpNode::output_shape(size_t i) const {
   return shape;
 }
 
-PlaceholderOp::PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, DataType dtype) {
+PlaceholderOp::PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, PrimType dtype) {
   auto n = ffi::make_object<PlaceholderOpNode>();
   n->name = name;
   n->shape = shape;
@@ -53,14 +53,14 @@ PlaceholderOp::PlaceholderOp(std::string name, ffi::Array<PrimExpr> shape, DataT
   data_ = std::move(n);
 }
 
-Tensor placeholder(ffi::Array<PrimExpr> shape, DataType dtype, std::string name) {
+Tensor placeholder(ffi::Array<PrimExpr> shape, PrimType dtype, std::string name) {
   return PlaceholderOp(name, shape, dtype).output(0);
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def("te.Placeholder", [](ffi::Variant<PrimExpr, ffi::Array<PrimExpr>> shape_arg,
-                                             DataType dtype, std::string name) {
+                                             DLDataType dtype, std::string name) {
     auto shape = [&]() -> ffi::Array<PrimExpr> {
       if (auto arg_expr = shape_arg.as<PrimExpr>()) {
         return {arg_expr.value()};
@@ -70,7 +70,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         TVM_FFI_THROW(InternalError) << "Variant did not contain either allowed type";
       }
     }();
-    return placeholder(shape, dtype, name);
+    return placeholder(shape, PrimType(dtype), name);
   });
 }
 
diff --git a/src/te/operation/scan_op.cc b/src/te/operation/scan_op.cc
index 5e8d4361ec85..96232b293b18 100644
--- a/src/te/operation/scan_op.cc
+++ b/src/te/operation/scan_op.cc
@@ -36,7 +36,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { ScanOpNode::RegisterReflection(); }
 
 int ScanOpNode::num_outputs() const { return static_cast<int>(update.size()); }
 
-DataType ScanOpNode::output_dtype(size_t i) const { return update[i]->dtype; }
+PrimType ScanOpNode::output_dtype(size_t i) const { return update[i]->GetDataType(); }
 
 ffi::Array<PrimExpr> ScanOpNode::output_shape(size_t i) const {
   TVM_FFI_ICHECK_LT(i, state_placeholder.size());
diff --git a/src/te/tensor.cc b/src/te/tensor.cc
index e05f91cad049..d50349f6b508 100644
--- a/src/te/tensor.cc
+++ b/src/te/tensor.cc
@@ -41,15 +41,15 @@ void TensorNode::RegisterReflection() {
 TVM_FFI_STATIC_INIT_BLOCK() { TensorNode::RegisterReflection(); }
 
 IterVar thread_axis(Range dom, std::string tag) {
-  return IterVar(dom, Var(tag, dom.defined() ? dom->extent.dtype() : DataType::Int(32)),
-                 kThreadIndex, tag);
+  return IterVar(dom, Var(tag, dom.defined() ? dom->extent.ty() : PrimType::Int(32)), kThreadIndex,
+                 tag);
 }
 
 IterVar reduce_axis(Range dom, std::string name) {
-  return IterVar(dom, Var(name, dom->extent.dtype()), kCommReduce);
+  return IterVar(dom, Var(name, dom->extent.ty()), kCommReduce);
 }
 
-Var var(std::string name_hint, DataType t) { return Var(name_hint, t); }
+Var var(std::string name_hint, PrimType t) { return Var(name_hint, t); }
 
 // Tensor
 inline PrimExpr Tensor::IndexTensor(ffi::Array<PrimExpr> indices,
@@ -65,7 +65,7 @@ inline PrimExpr Tensor::IndexTensor(ffi::Array<PrimExpr> indices,
   if (support_negative_indices) {
     for (size_t i = 0; i < shape.size(); i++) {
       PrimExpr new_index =
-          Select(indices[i] < IntImm(indices[i]->dtype, 0), indices[i] + shape[i], indices[i]);
+          Select(indices[i] < IntImm(indices[i].ty(), 0), indices[i] + shape[i], indices[i]);
       indices.Set(i, new_index);
     }
   }
@@ -105,7 +105,7 @@ Tensor Operation::output(size_t i) const {
   return Tensor(node);
 }
 
-Tensor::Tensor(ffi::Array<PrimExpr> shape, DataType dtype, Operation op, int value_index) {
+Tensor::Tensor(ffi::Array<PrimExpr> shape, PrimType dtype, Operation op, int value_index) {
   auto n = ffi::make_object<TensorNode>();
   n->shape = std::move(shape);
   n->dtype = dtype;
@@ -117,8 +117,8 @@ Tensor::Tensor(ffi::Array<PrimExpr> shape, DataType dtype, Operation op, int val
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def(
-      "te.Tensor", [](ffi::Array<PrimExpr> shape, DataType dtype, Operation op, int value_index) {
-        return Tensor(shape, dtype, op, value_index);
+      "te.Tensor", [](ffi::Array<PrimExpr> shape, DLDataType dtype, Operation op, int value_index) {
+        return Tensor(shape, PrimType(dtype), op, value_index);
       });
 }
 
@@ -129,6 +129,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef()
       .def_method("te.TensorEqual", &Tensor::operator==)
+      .def("te.TensorDType", [](Tensor tensor) -> DLDataType { return tensor->dtype->dtype; })
       .def("te.TensorHash",
            [](Tensor tensor) -> int64_t {
              return static_cast<int64_t>(std::hash<Tensor>()(tensor));
diff --git a/src/tirx/analysis/deep_equal.cc b/src/tirx/analysis/deep_equal.cc
index 53700a85a94a..dbf2e53c561d 100644
--- a/src/tirx/analysis/deep_equal.cc
+++ b/src/tirx/analysis/deep_equal.cc
@@ -30,17 +30,25 @@
 namespace tvm {
 namespace tirx {
 
-#define DEFINE_DEEP_EQUAL_BIN_EXPR(OpNode)                              \
-  bool VisitExpr_(const OpNode* plhs, const PrimExpr& rhs) final {      \
-    const auto* prhs = rhs.as<OpNode>();                                \
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->a, prhs->a) && \
-           VisitExpr(plhs->b, prhs->b);                                 \
+namespace {
+
+template <typename LHS, typename RHS>
+TVM_FFI_INLINE bool SameType(const LHS* lhs, const RHS* rhs) {
+  return lhs->ty() == rhs->ty();
+}
+
+}  // namespace
+
+#define DEFINE_DEEP_EQUAL_BIN_EXPR(OpNode)                                                     \
+  bool VisitExpr_(const OpNode* plhs, const PrimExpr& rhs) final {                             \
+    const auto* prhs = rhs.as<OpNode>();                                                       \
+    return SameType(plhs, prhs) && VisitExpr(plhs->a, prhs->a) && VisitExpr(plhs->b, prhs->b); \
   }
 
-#define DEFINE_DEEP_EQUAL_IMM_EXPR(OpNode)                           \
-  bool VisitExpr_(const OpNode* plhs, const PrimExpr& rhs) final {   \
-    const auto* prhs = rhs.as<OpNode>();                             \
-    return plhs->dtype == prhs->dtype && plhs->value == prhs->value; \
+#define DEFINE_DEEP_EQUAL_IMM_EXPR(OpNode)                         \
+  bool VisitExpr_(const OpNode* plhs, const PrimExpr& rhs) final { \
+    const auto* prhs = rhs.as<OpNode>();                           \
+    return SameType(plhs, prhs) && plhs->value == prhs->value;     \
   }
 
 class ExprDeepEqualChecker : private ExprFunctor<bool(const PrimExpr&, const PrimExpr&)> {
@@ -53,7 +61,7 @@ class ExprDeepEqualChecker : private ExprFunctor<bool(const PrimExpr&, const Pri
     if (lhs->type_index() != rhs->type_index()) return false;
     if (auto* plhs = lhs.as<IntImmNode>()) {
       auto* prhs = rhs.as<IntImmNode>();
-      return plhs->dtype == prhs->dtype && plhs->value == prhs->value;
+      return SameType(plhs, prhs) && plhs->value == prhs->value;
     }
     return ExprDeepEqualChecker().VisitExpr(lhs, rhs);
   }
@@ -104,7 +112,7 @@ class ExprDeepEqualChecker : private ExprFunctor<bool(const PrimExpr&, const Pri
   bool VisitExpr_(const BufferLoadNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<BufferLoadNode>();
     // we run pointer comparison of the buffer
-    return plhs->dtype == prhs->dtype && plhs->buffer.same_as(prhs->buffer) &&
+    return SameType(plhs, prhs) && plhs->buffer.same_as(prhs->buffer) &&
            ArrayDeepEqual(plhs->indices, prhs->indices) &&
            OptionalDeepEqual(plhs->predicate, prhs->predicate);
   }
@@ -112,26 +120,26 @@ class ExprDeepEqualChecker : private ExprFunctor<bool(const PrimExpr&, const Pri
   bool VisitExpr_(const ProducerLoadNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<ProducerLoadNode>();
     // run shallow pointer comparison of the producer
-    return plhs->dtype == prhs->dtype && plhs->producer.same_as(prhs->producer) &&
+    return SameType(plhs, prhs) && plhs->producer.same_as(prhs->producer) &&
            ArrayDeepEqual(plhs->indices, prhs->indices);
   }
 
   bool VisitExpr_(const LetNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<LetNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->var, prhs->var) &&
+    return SameType(plhs, prhs) && VisitExpr(plhs->var, prhs->var) &&
            VisitExpr(plhs->value, prhs->value) && VisitExpr(plhs->body, prhs->body);
   }
 
   bool VisitExpr_(const CallNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<CallNode>();
-    return plhs->dtype == prhs->dtype && plhs->op.same_as(prhs->op) &&
+    return SameType(plhs, prhs) && plhs->op.same_as(prhs->op) &&
            ArrayDeepEqual(plhs->args, prhs->args) &&
            ffi::StructuralEqual()(plhs->attrs, prhs->attrs);
   }
 
   bool VisitExpr_(const ReduceNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<ReduceNode>();
-    return plhs->dtype == prhs->dtype && plhs->combiner.same_as(prhs->combiner) &&
+    return SameType(plhs, prhs) && plhs->combiner.same_as(prhs->combiner) &&
            ArrayDeepEqual(plhs->source, prhs->source) && ArrayDeepEqual(plhs->init, prhs->init) &&
            ArrayDeepEqual(plhs->axis, prhs->axis) && VisitExpr(plhs->condition, prhs->condition) &&
            plhs->value_index == prhs->value_index;
@@ -139,36 +147,36 @@ class ExprDeepEqualChecker : private ExprFunctor<bool(const PrimExpr&, const Pri
 
   bool VisitExpr_(const CastNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<CastNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->value, prhs->value);
+    return SameType(plhs, prhs) && VisitExpr(plhs->value, prhs->value);
   }
 
   bool VisitExpr_(const NotNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<NotNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->a, prhs->a);
+    return SameType(plhs, prhs) && VisitExpr(plhs->a, prhs->a);
   }
 
   bool VisitExpr_(const SelectNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<SelectNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->condition, prhs->condition) &&
+    return SameType(plhs, prhs) && VisitExpr(plhs->condition, prhs->condition) &&
            VisitExpr(plhs->true_value, prhs->true_value) &&
            VisitExpr(plhs->false_value, prhs->false_value);
   }
 
   bool VisitExpr_(const RampNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<RampNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->base, prhs->base) &&
+    return SameType(plhs, prhs) && VisitExpr(plhs->base, prhs->base) &&
            VisitExpr(plhs->stride, prhs->stride) && VisitExpr(plhs->lanes, prhs->lanes);
   }
 
   bool VisitExpr_(const ShuffleNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<ShuffleNode>();
-    return plhs->dtype == prhs->dtype && ArrayDeepEqual(plhs->vectors, prhs->vectors) &&
+    return SameType(plhs, prhs) && ArrayDeepEqual(plhs->vectors, prhs->vectors) &&
            ArrayDeepEqual(plhs->indices, prhs->indices);
   }
 
   bool VisitExpr_(const BroadcastNode* plhs, const PrimExpr& rhs) final {
     const auto* prhs = rhs.as<BroadcastNode>();
-    return plhs->dtype == prhs->dtype && VisitExpr(plhs->value, prhs->value) &&
+    return SameType(plhs, prhs) && VisitExpr(plhs->value, prhs->value) &&
            VisitExpr(plhs->lanes, prhs->lanes);
   }
 
diff --git a/src/tirx/ir/buffer.cc b/src/tirx/ir/buffer.cc
index af3a75e5fc28..66c66149d500 100644
--- a/src/tirx/ir/buffer.cc
+++ b/src/tirx/ir/buffer.cc
@@ -51,10 +51,11 @@ ffi::Array<PrimExpr> SimplifyArray(arith::AnalyzerObj* ana, ffi::Array<PrimExpr>
   return array;
 }
 
-Buffer decl_buffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String name,
+Buffer decl_buffer(ffi::Array<PrimExpr> shape, DLDataType dtype, ffi::String name,
                    ffi::String storage_scope, ffi::Optional<ffi::Array<IntImm>> axis_separators,
                    Span span) {
-  DataType storage_dtype = (dtype == DataType::Bool() ? DataType::Int(8) : dtype);
+  DLDataType storage_dtype =
+      (dtype == DLDataType{kDLBool, 8, 1} ? DLDataType{kDLInt, 8, 1} : dtype);
   return Buffer(Var(name, PointerType(PrimType(storage_dtype), storage_scope), span), dtype, shape,
                 ffi::Array<PrimExpr>(), PrimExpr(), name, 0, 0, kDefault,
                 axis_separators.value_or(ffi::Array<IntImm>()), span, std::nullopt,
@@ -322,20 +323,20 @@ ffi::Array<PrimExpr> BufferNode::ElemOffset(ffi::Array<PrimExpr> input_indices,
 }
 
 inline ffi::Array<PrimExpr> BufferOffset(const BufferNode* n, ffi::Array<PrimExpr> index,
-                                         DataType dtype) {
+                                         PrimType dtype) {
   ffi::Array<PrimExpr> offsets = n->ElemOffset(index);
   // If the Buffer has element type with more than one lane, scale to
   // get the offset in number of scalars.
-  if (n->dtype.lanes() != 1) {
+  if (PrimType(n->dtype).lanes() != 1) {
     PrimExpr last_offset = offsets[offsets.size() - 1];
-    offsets.Set(offsets.size() - 1, last_offset * MakeConst(last_offset.dtype(), dtype.lanes()));
+    offsets.Set(offsets.size() - 1, last_offset * MakeConst(last_offset.ty(), dtype.lanes()));
   }
 
   // If the requested type has more than one lane, make a RampNode at
   // that offset.
   if (dtype.lanes() != 1) {
     PrimExpr last_offset = offsets[offsets.size() - 1];
-    PrimExpr stride = MakeConst(last_offset.dtype(), 1);
+    PrimExpr stride = MakeConst(last_offset.ty(), 1);
     offsets.Set(offsets.size() - 1, tirx::Ramp(last_offset, stride, dtype.lanes()));
   }
 
@@ -404,8 +405,7 @@ Buffer Buffer::GetFlattenedBuffer() const {
   // The axis_separators for the output buffer.
   ffi::Array<IntImm> output_axis_separators;
   for (size_t i = 0; i < self->axis_separators.size(); i++) {
-    auto dtype = self->axis_separators[i]->dtype;
-    output_axis_separators.push_back(IntImm(dtype, i + 1));
+    output_axis_separators.push_back(IntImm(self->axis_separators[i].ty(), i + 1));
   }
 
   if (output_shape.size() == self->shape.size() && self->strides.empty()) {
@@ -427,20 +427,26 @@ Buffer Buffer::GetFlattenedBuffer() const {
   }
 }
 
-PrimExpr Buffer::vload(ffi::Array<PrimExpr> begin, DataType value_dtype,
+PrimExpr Buffer::vload(ffi::Array<PrimExpr> begin, PrimType value_dtype,
                        ffi::Optional<PrimExpr> predicate) const {
-  // specially handle bool, stored as DataType::Int(8)
+  // Specially handle bool, stored as int8 in buffers.
   const BufferNode* n = operator->();
   TVM_FFI_ICHECK(n != nullptr);
-  TVM_FFI_ICHECK(value_dtype.element_of() == n->dtype.element_of() &&
-                 value_dtype.get_lanes_or_vscale_factor() % n->dtype.lanes() == 0)
+  PrimType buffer_dtype(n->dtype);
+  int value_lanes =
+      value_dtype.IsScalableVector() ? value_dtype.VScaleFactor() : value_dtype.lanes();
+  int buffer_lanes =
+      buffer_dtype.IsScalableVector() ? buffer_dtype.VScaleFactor() : buffer_dtype.lanes();
+  TVM_FFI_ICHECK(value_dtype.WithLanes(1)->dtype == buffer_dtype.WithLanes(1)->dtype &&
+                 value_lanes % buffer_lanes == 0)
       << "Cannot load " << value_dtype << " from buffer of " << n->dtype;
 
   ffi::Array<PrimExpr> indices = begin;
   PrimExpr base = indices[indices.size() - 1];
-  if (value_dtype.is_fixed_length_vector()) {
-    int factor = value_dtype.lanes() / n->dtype.lanes();
-    if (factor > 1 && base.dtype().is_scalar()) {
+  if (value_dtype.IsFixedLengthVector()) {
+    int factor = value_dtype.lanes() / buffer_dtype.lanes();
+    PrimType base_ty = base.ty();
+    if (factor > 1 && !base_ty.IsFixedLengthVector() && !base_ty.IsScalableVector()) {
       indices.Set(indices.size() - 1, Ramp(base, 1, factor));
     }
   }
@@ -449,19 +455,25 @@ PrimExpr Buffer::vload(ffi::Array<PrimExpr> begin, DataType value_dtype,
 
 Stmt Buffer::vstore(ffi::Array<PrimExpr> begin, PrimExpr value,
                     ffi::Optional<PrimExpr> predicate) const {
-  // specially handle bool, stored as DataType::Int(8)
+  // Specially handle bool, stored as int8 in buffers.
   const BufferNode* n = operator->();
   TVM_FFI_ICHECK(n != nullptr);
-  DataType value_dtype = value.dtype();
-  TVM_FFI_ICHECK(value_dtype.element_of() == n->dtype.element_of() &&
-                 value_dtype.get_lanes_or_vscale_factor() % n->dtype.lanes() == 0)
+  PrimType value_dtype = value.ty();
+  PrimType buffer_dtype(n->dtype);
+  int value_lanes =
+      value_dtype.IsScalableVector() ? value_dtype.VScaleFactor() : value_dtype.lanes();
+  int buffer_lanes =
+      buffer_dtype.IsScalableVector() ? buffer_dtype.VScaleFactor() : buffer_dtype.lanes();
+  TVM_FFI_ICHECK(value_dtype.WithLanes(1)->dtype == buffer_dtype.WithLanes(1)->dtype &&
+                 value_lanes % buffer_lanes == 0)
       << "Cannot store " << value_dtype << " to buffer of " << n->dtype;
 
   ffi::Array<PrimExpr> indices = begin;
   PrimExpr base = indices[indices.size() - 1];
-  if (value_dtype.is_fixed_length_vector()) {
-    int factor = value_dtype.lanes() / n->dtype.lanes();
-    if (factor > 1 && base.dtype().is_scalar()) {
+  if (value_dtype.IsFixedLengthVector()) {
+    int factor = value_dtype.lanes() / buffer_dtype.lanes();
+    PrimType base_ty = base.ty();
+    if (factor > 1 && !base_ty.IsFixedLengthVector() && !base_ty.IsScalableVector()) {
       indices.Set(indices.size() - 1, Ramp(base, 1, factor));
     }
   }
@@ -484,7 +496,7 @@ Buffer Buffer::MakeStrideView() const {
   const BufferNode* self = operator->();
   TVM_FFI_ICHECK(self != nullptr);
   auto n = ffi::make_object<BufferNode>(*self);
-  PrimExpr acc = MakeConst(n->DefaultIndexType(), 1);
+  PrimExpr acc = MakeConst(PrimType(n->DefaultIndexType()), 1);
   for (size_t i = n->shape.size(); i != 0; --i) {
     temp.push_back(acc);
     acc = acc * n->shape[i - 1];
@@ -537,14 +549,14 @@ Buffer Buffer::MakeSlice(ffi::Array<PrimExpr> begins, ffi::Array<PrimExpr> exten
   return slice;
 }
 
-PrimExpr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lanes, PrimExpr offset,
+PrimExpr Buffer::access_ptr(int access_mask, PrimType ptr_type, int content_lanes, PrimExpr offset,
                             ffi::Optional<PrimExpr> input_extent) const {
   const BufferNode* self = operator->();
   TVM_FFI_ICHECK(self != nullptr);
   PrimExpr e_dtype;
   PrimExpr extent;
   if (self->shape.size() == 0) {
-    extent = MakeConst(self->DefaultIndexType(), 1);
+    extent = MakeConst(PrimType(self->DefaultIndexType()), 1);
   } else if (self->strides.size() == self->shape.size()) {
     int highest_dim = 0;
     extent = self->strides[highest_dim] * self->shape[highest_dim] - offset;
@@ -555,9 +567,9 @@ PrimExpr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lane
   }
   PrimExpr elem_offset = self->elem_offset + offset;
   if (content_lanes > 1) {
-    e_dtype = tirx::TypeAnnotation(self->dtype.with_lanes(content_lanes));
-    extent = extent / MakeConst(self->elem_offset.dtype(), content_lanes);
-    elem_offset = self->elem_offset / MakeConst(self->elem_offset.dtype(), content_lanes);
+    e_dtype = tirx::TypeAnnotation(PrimType(self->dtype).WithLanes(content_lanes));
+    extent = extent / MakeConst(self->elem_offset.ty(), content_lanes);
+    elem_offset = self->elem_offset / MakeConst(self->elem_offset.ty(), content_lanes);
   } else {
     e_dtype = tirx::TypeAnnotation(self->dtype);
   }
@@ -570,14 +582,14 @@ PrimExpr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lane
   return tirx::Call(ptr_type, tirx::builtin::tvm_access_ptr(), acc_args);
 }
 
-Buffer::Buffer(Var data, DataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<PrimExpr> strides,
+Buffer::Buffer(Var data, PrimType dtype, ffi::Array<PrimExpr> shape, ffi::Array<PrimExpr> strides,
                PrimExpr elem_offset, ffi::String name, int data_alignment, int offset_factor,
                BufferType buffer_type, ffi::Array<IntImm> axis_separators, Span span,
                ffi::Optional<Layout> layout, ffi::Array<PrimExpr> allocated_addr) {
-  DataType storage_dtype = dtype;
+  DLDataType storage_dtype = dtype->dtype;
   // specially handle bool
-  if (storage_dtype == DataType::Bool()) {
-    storage_dtype = DataType::Int(8);
+  if (storage_dtype == DLDataType{kDLBool, 8, 1}) {
+    storage_dtype = DLDataType{kDLInt, 8, 1};
   }
   // The buffer dtype may differ from the dtype of the underlying
   // allocation, such as a single allocation that backs multiple
@@ -606,7 +618,7 @@ Buffer::Buffer(Var data, DataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<
   n->axis_separators = std::move(axis_separators);
   n->name = std::move(name);
   if (!elem_offset.defined()) {
-    elem_offset = IntImm(n->DefaultIndexType(), 0);
+    elem_offset = IntImm(PrimType(n->DefaultIndexType()), 0);
   }
   if (data_alignment <= 0) {
     data_alignment = runtime::kAllocAlignment;
@@ -620,7 +632,7 @@ Buffer::Buffer(Var data, DataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<
   n->buffer_type = buffer_type;
   if (n->buffer_type == kAutoBroadcast && n->shape.size() > 0 && n->strides.empty()) {
     for (size_t i = 0; i < n->shape.size(); ++i) {
-      n->strides.push_back(Var("stride", n->shape[i].dtype()));
+      n->strides.push_back(Var("stride", n->shape[i].ty()));
     }
   }
   n->span = std::move(span);
@@ -633,10 +645,11 @@ Buffer::Buffer(Var data, DataType dtype, ffi::Array<PrimExpr> shape, ffi::Array<
   data_ = std::move(n);
 }
 
-tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, DataType dtype, std::string name,
-                                       int data_alignment, int offset_factor, bool compact,
-                                       std::string memory_scope) {
-  DataType storage_dtype = (dtype == DataType::Bool() ? DataType::Int(8) : dtype);
+tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, DLDataType dtype,
+                                       std::string name, int data_alignment, int offset_factor,
+                                       bool compact, std::string memory_scope) {
+  DLDataType storage_dtype =
+      (dtype == DLDataType{kDLBool, 8, 1} ? DLDataType{kDLInt, 8, 1} : dtype);
   auto data = tirx::Var(name, PointerType(PrimType(storage_dtype), memory_scope));
   bool has_any = false;
   if (!compact) {
@@ -651,7 +664,7 @@ tirx::Buffer BufferWithOffsetAlignment(ffi::Array<PrimExpr> shape, DataType dtyp
 
   PrimExpr elem_offset;
   if (offset_factor != 0) {
-    elem_offset = tirx::Var(name + "_elem_offset", shape[0].dtype());
+    elem_offset = tirx::Var(name + "_elem_offset", shape[0].ty());
   } else {
     elem_offset = PrimExpr();
   }
@@ -667,7 +680,7 @@ Buffer Buffer::with_allocated_addr(ffi::Array<PrimExpr> allocated_addr) const {
   return output;
 }
 
-Buffer Buffer::with_dtype(DataType dtype) const {
+Buffer Buffer::with_dtype(PrimType dtype) const {
   Buffer output = *this;
   auto writer = output.CopyOnWrite();
   writer->dtype = dtype;
@@ -682,7 +695,7 @@ Buffer Buffer::with_data(Var data) const {
 }
 
 PrimExpr Buffer::OffsetOf_p(const Array<PrimExpr>& indices) const {
-  return tirx::Call(DataType::Int(32), tirx::builtin::buffer_offset(),
+  return tirx::Call(PrimType::Int(32), tirx::builtin::buffer_offset(),
                     {BufferLoad(*this, indices)});
 }
 
@@ -705,7 +718,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                     auto buffer_type = args[8].cast<ffi::String>();
                     BufferType type = (buffer_type == "auto_broadcast") ? kAutoBroadcast : kDefault;
                     auto data = args[0].cast<Var>();
-                    auto dtype = args[1].cast<DataType>();
+                    auto dtype = args[1].cast<PrimType>();
                     auto shape = args[2].cast<ffi::Array<PrimExpr>>();
                     auto strides = args[3].cast<ffi::Array<PrimExpr>>();
                     auto elem_offset = args[4].cast<PrimExpr>();
@@ -718,15 +731,21 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                     *ret = Buffer(data, dtype, shape, strides, elem_offset, name, data_alignment,
                                   offset_factor, type, axis_separators, span, layout);
                   })
-      .def_method("tirx.BufferAccessPtr", &Buffer::access_ptr)
+      .def_method(
+          "tirx.BufferAccessPtr",
+          static_cast<PrimExpr (Buffer::*)(int, PrimType, int, PrimExpr, ffi::Optional<PrimExpr>)
+                          const>(&Buffer::access_ptr))
       .def_method("tirx.BufferGetFlattenedBuffer", &Buffer::GetFlattenedBuffer)
       .def_method("tirx.BufferOffsetOf", &Buffer::OffsetOf)
       .def_method("tirx.BufferOffsetOfp", &Buffer::OffsetOf_p)
-      .def_method("tirx.BufferVLoad", &Buffer::vload)
+      .def_method("tirx.BufferVLoad",
+                  static_cast<PrimExpr (Buffer::*)(ffi::Array<PrimExpr>, PrimType,
+                                                   ffi::Optional<PrimExpr>) const>(&Buffer::vload))
       .def_method("tirx.BufferVStore", &Buffer::vstore)
       .def_method("tirx.BufferStorageScope", &Buffer::scope)
       .def_method("tirx.BufferWithAllocatedAddr", &Buffer::with_allocated_addr)
-      .def_method("tirx.BufferWithDtype", &Buffer::with_dtype)
+      .def_method("tirx.BufferWithDtype",
+                  static_cast<Buffer (Buffer::*)(DLDataType) const>(&Buffer::with_dtype))
       .def_method("tirx.BufferWithData", &Buffer::with_data)
       .def_method("tirx.BufferIsScalar", &Buffer::IsScalar);
 }
diff --git a/src/tirx/ir/buffer_common.h b/src/tirx/ir/buffer_common.h
index b6aebba2d327..41c4b15fbd81 100644
--- a/src/tirx/ir/buffer_common.h
+++ b/src/tirx/ir/buffer_common.h
@@ -23,8 +23,8 @@
 #ifndef TVM_TIR_IR_BUFFER_COMMON_H_
 #define TVM_TIR_IR_BUFFER_COMMON_H_
 
+#include <tvm/ffi/dtype.h>
 #include <tvm/ir/type.h>
-#include <tvm/runtime/data_type.h>
 
 #include <optional>
 
@@ -36,11 +36,11 @@ namespace tirx {
  *
  * \param type The type to be checked.
  *
- * \return An std::optional<DataType> object. If the type is a pointer
+ * \return An std::optional<DLDataType> object. If the type is a pointer
  * to a primitive type, the object has a value which is the pointed-to
  * type. Otherwise the object is nullopt.
  */
-inline std::optional<runtime::DataType> GetPointerType(const Type& type) {
+inline std::optional<DLDataType> GetPointerType(const Type& type) {
   if (type.defined()) {
     if (auto* ptr_type = type.as<PointerTypeNode>()) {
       if (auto* prim_type = ptr_type->element_type.as<PrimTypeNode>()) {
diff --git a/src/tirx/ir/data_type_rewriter.cc b/src/tirx/ir/data_type_rewriter.cc
index 769f635a6957..29eb5d0e0197 100644
--- a/src/tirx/ir/data_type_rewriter.cc
+++ b/src/tirx/ir/data_type_rewriter.cc
@@ -49,10 +49,10 @@ Stmt DataTypeLegalizer::VisitStmt_(const ForNode* op) {
   PrimExpr e = VisitExpr(op->loop_var);
   Var var = e.as_or_throw<Var>();
   auto n = CopyOnWrite(op);
-  n->min = cast(var.dtype(), op->min);
-  n->extent = cast(var.dtype(), op->extent);
+  n->min = cast(var.ty(), op->min);
+  n->extent = cast(var.ty(), op->extent);
   if (op->step.has_value()) {
-    n->step = cast(var.dtype(), *op->step);
+    n->step = cast(var.ty(), *op->step);
   }
   return For(n);
 }
@@ -62,8 +62,8 @@ Stmt DataTypeLegalizer::VisitStmt_(const SBlockRealizeNode* op) {
   ffi::Array<PrimExpr> new_iter_values;
   bool changed = false;
   for (int i = 0; i < static_cast<int>(op->iter_values.size()); ++i) {
-    auto dtype = realize->block->iter_vars[i]->var->dtype;
-    if (op->iter_values[i]->dtype != dtype) {
+    PrimType dtype = realize->block->iter_vars[i]->var.ty();
+    if (op->iter_values[i].ty() != dtype) {
       new_iter_values.push_back(cast(dtype, realize->iter_values[i]));
       changed = true;
     } else {
@@ -80,8 +80,8 @@ Stmt DataTypeLegalizer::VisitStmt_(const SBlockNode* op) {
   SBlock new_block = StmtExprMutator::VisitStmt_(op).as_or_throw<SBlock>();
   ffi::Array<IterVar> new_iter_vars =
       MutateArray(new_block->iter_vars, [/*this*/](const IterVar& iter) {
-        auto dtype = iter->var.dtype();
-        if (iter->dom->min->dtype != dtype || iter->dom->extent->dtype != dtype) {
+        PrimType dtype = iter->var.ty();
+        if (iter->dom->min.ty() != dtype || iter->dom->extent.ty() != dtype) {
           IterVar new_iter = iter;
           new_iter.CopyOnWrite()->dom =
               Range(cast(dtype, iter->dom->min), cast(dtype, iter->dom->extent));
@@ -111,15 +111,17 @@ Stmt DataTypeLegalizer::VisitStmt_(const AttrStmtNode* op) {
       Range dom = iv->dom;
       if (dom.defined()) {
         PrimExpr extend = dom->extent;
-        TVM_FFI_ICHECK(extend.dtype().is_int() && var.dtype().is_int());
-        if (var.dtype().bits() != extend.dtype().bits()) {
-          DataType dtype = var.dtype();
-          dom = Range(cast(dtype, dom->min), cast(dtype, extend), dom->span);
+        PrimType extend_ty = extend.ty();
+        PrimType var_ty = var.ty();
+        TVM_FFI_ICHECK(extend_ty.MatchesCode(DLDataTypeCode::kDLInt) &&
+                       var_ty.MatchesCode(DLDataTypeCode::kDLInt));
+        if (var_ty.bits() != extend_ty.bits()) {
+          dom = Range(cast(var_ty, dom->min), cast(var_ty, extend), dom->span);
         }
       }
       ivmap_[iv] = IterVar(dom, var, iv->iter_type, iv->thread_tag);
     }
-    return AttrStmt(ivmap_[iv], op->attr_key, cast(var.dtype(), op->value), op->body);
+    return AttrStmt(ivmap_[iv], op->attr_key, cast(var.ty(), op->value), op->body);
   }
   return StmtExprMutator::VisitStmt_(op);
 }
@@ -128,8 +130,8 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const LetNode* op) {
   PrimExpr value = this->VisitExpr(op->value);
   Var var = op->var;
 
-  if (value.dtype() != op->var->dtype) {
-    var = op->var.copy_with_dtype(value.dtype());
+  if (value.ty() != op->var.ty()) {
+    var = op->var.copy_with_dtype(value.ty());
     var_remap_[op->var.get()] = var;
   }
 
@@ -146,8 +148,8 @@ Stmt DataTypeLegalizer::VisitStmt_(const BindNode* op) {
   PrimExpr value = this->VisitExpr(op->value);
   Var var = op->var;
 
-  if (value.dtype() != op->var->dtype) {
-    var = op->var.copy_with_dtype(value.dtype());
+  if (value.ty() != op->var.ty()) {
+    var = op->var.copy_with_dtype(value.ty());
     var_remap_[op->var.get()] = var;
   }
 
@@ -170,13 +172,15 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const SelectNode* op) {
   PrimExpr true_value = this->VisitExpr(op->true_value);
   PrimExpr false_value = this->VisitExpr(op->false_value);
   if (condition.same_as(op->condition) && true_value.same_as(op->true_value) &&
-      false_value.same_as(op->false_value) && true_value.dtype() == false_value.dtype()) {
+      false_value.same_as(op->false_value) && true_value.ty() == false_value.ty()) {
     return ffi::GetRef<PrimExpr>(op);
   } else {
-    int bits = std::max(true_value.dtype().bits(), false_value.dtype().bits());
-    DataType dtype = true_value.dtype().with_bits(bits);
-    if (true_value.dtype() != dtype) true_value = cast(dtype, true_value);
-    if (false_value.dtype() != dtype) false_value = cast(dtype, false_value);
+    PrimType true_dtype = true_value.ty();
+    PrimType false_dtype = false_value.ty();
+    int bits = std::max(true_dtype.bits(), false_dtype.bits());
+    PrimType dtype = true_dtype.WithBits(bits);
+    if (true_dtype != dtype) true_value = cast(dtype, true_value);
+    if (false_dtype != dtype) false_value = cast(dtype, false_value);
     return Select(condition, true_value, false_value);
   }
 }
@@ -184,14 +188,17 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const SelectNode* op) {
 PrimExpr DataTypeLegalizer::VisitExpr_(const RampNode* op) {
   PrimExpr base = VisitExpr(op->base);
   PrimExpr stride = VisitExpr(op->stride);
-  if (base.same_as(op->base) && stride.same_as(op->stride) && base.dtype() == stride.dtype()) {
+  if (base.same_as(op->base) && stride.same_as(op->stride) && base.ty() == stride.ty()) {
     return ffi::GetRef<PrimExpr>(op);
   } else {
-    TVM_FFI_ICHECK(base.dtype().is_int() && stride.dtype().is_int());
-    int bits = std::max(base.dtype().bits(), stride.dtype().bits());
-    DataType dtype = base.dtype().with_bits(bits);
-    if (base.dtype() != dtype) base = cast(dtype, base);
-    if (stride.dtype() != dtype) stride = cast(dtype, stride);
+    PrimType base_dtype = base.ty();
+    PrimType stride_dtype = stride.ty();
+    TVM_FFI_ICHECK(base_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+                   stride_dtype.MatchesCode(DLDataTypeCode::kDLInt));
+    int bits = std::max(base_dtype.bits(), stride_dtype.bits());
+    PrimType dtype = base_dtype.WithBits(bits);
+    if (base_dtype->dtype != dtype->dtype) base = cast(dtype, base);
+    if (stride_dtype->dtype != dtype->dtype) stride = cast(dtype, stride);
     return Ramp(base, stride, op->lanes);
   }
 }
@@ -200,15 +207,15 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const CastNode* op) {
   return StmtExprMutator::VisitExpr_(op);
 }
 
-#define TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)             \
-  PrimExpr DataTypeLegalizer::VisitExpr_(const OP* op) {                  \
-    PrimExpr a = this->VisitExpr(op->a);                                  \
-    PrimExpr b = this->VisitExpr(op->b);                                  \
-    if (op->a.same_as(a) && op->b.same_as(b) && a.dtype() == b.dtype()) { \
-      return ffi::GetRef<PrimExpr>(op);                                   \
-    } else {                                                              \
-      return FUNC(a, b);                                                  \
-    }                                                                     \
+#define TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)       \
+  PrimExpr DataTypeLegalizer::VisitExpr_(const OP* op) {            \
+    PrimExpr a = this->VisitExpr(op->a);                            \
+    PrimExpr b = this->VisitExpr(op->b);                            \
+    if (op->a.same_as(a) && op->b.same_as(b) && a.ty() == b.ty()) { \
+      return ffi::GetRef<PrimExpr>(op);                             \
+    } else {                                                        \
+      return FUNC(a, b);                                            \
+    }                                                               \
   }
 
 TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(AddNode, operator+);
@@ -251,15 +258,18 @@ PrimExpr DataTypeLegalizer::VisitExpr_(const CallNode* op) {
   if (op->op.same_as(pow_op)) {
     return pow(op->args[0], op->args[1]);
   } else if (op->op.same_as(builtin::if_then_else())) {
-    return Call(op->dtype, op->op, {op->args[0], op->args[1], op->args[2]}, op->attrs, op->span);
+    return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op, {op->args[0], op->args[1], op->args[2]},
+                op->attrs, op->span);
   } else if (op->op.same_as(clz_op)) {
-    DataType before_dtype = before->args[0]->dtype;
-    DataType after_dtype = op->args[0]->dtype;
-    TVM_FFI_ICHECK((before_dtype.is_int() || before_dtype.is_uint()) &&
+    PrimType before_dtype = before->args[0].ty();
+    PrimType after_dtype = op->args[0].ty();
+    TVM_FFI_ICHECK((before_dtype.code() == DLDataTypeCode::kDLInt ||
+                    before_dtype.code() == DLDataTypeCode::kDLUInt) &&
                    (before_dtype.bits() == 32 || before_dtype.bits() == 64))
         << "clz only supports 32 or 64 bit integer types, but get type before legalizing: "
         << before_dtype;
-    TVM_FFI_ICHECK((after_dtype.is_int() || after_dtype.is_uint()) &&
+    TVM_FFI_ICHECK((after_dtype.code() == DLDataTypeCode::kDLInt ||
+                    after_dtype.code() == DLDataTypeCode::kDLUInt) &&
                    (after_dtype.bits() == 32 || after_dtype.bits() == 64))
         << "clz only supports 32 or 64 bit integer types, but get type after legalizing: "
         << after_dtype;
@@ -434,7 +444,8 @@ Stmt IndexDataTypeRewriter::VisitStmt_(const BufferStoreNode* op) {
 
   Buffer new_buffer = VisitBufferUse(op->buffer);
   auto value = this->VisitExpr(op->value);
-  if (new_buffer->dtype != value->dtype && value->dtype.is_scalar()) {
+  PrimType value_dtype = value.ty();
+  if (new_buffer->dtype != value_dtype && value_dtype.IsScalar()) {
     value = cast(new_buffer->dtype, value);
   }
   auto indices = VisitIndices(op->indices);
@@ -514,12 +525,12 @@ Stmt IndexDataTypeRewriter::VisitStmt_(const ForNode* op) {
     For new_for = ffi::GetRef<For>(op);
     auto* n = new_for.CopyOnWrite();
     n->loop_var = new_loop_var;
-    n->min = cast(new_loop_var.dtype(), min);
-    n->extent = cast(new_loop_var.dtype(), extent);
+    n->min = cast(new_loop_var.ty(), min);
+    n->extent = cast(new_loop_var.ty(), extent);
     if (op->thread_binding.defined()) {
       auto old_thread_binding = op->thread_binding.value();
       auto* ptr = old_thread_binding.CopyOnWrite();
-      ptr->var = old_thread_binding->var.copy_with_dtype(new_loop_var.dtype());
+      ptr->var = old_thread_binding->var.copy_with_dtype(new_loop_var.ty());
       n->thread_binding = ffi::Optional<IterVar>(std::move(old_thread_binding));
     }
     n->body = new_body;
@@ -540,17 +551,18 @@ Stmt IndexDataTypeRewriter::VisitStmt_(const BindNode* op) {
   PrimExpr value = VisitExpr(op->value);
   Var var = var_remap_[bind_stmt->var.get()];
   is_enabled_ = is_enabled;
-  TVM_FFI_ICHECK(value.dtype() == var.dtype());
+  TVM_FFI_ICHECK(value.ty() == var.ty());
   return Bind(var, value, bind_stmt->span);
 }
 
-#define TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)                     \
-  PrimExpr IndexDataTypeRewriter::VisitExpr_(const OP* op) {                       \
-    bool is_enabled = is_enabled_;                                                 \
-    is_enabled_ = is_condition_ && op->a->dtype.is_int() && op->b->dtype.is_int(); \
-    auto result = Parent::VisitExpr_(op);                                          \
-    is_enabled_ = is_enabled;                                                      \
-    return result;                                                                 \
+#define TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)                       \
+  PrimExpr IndexDataTypeRewriter::VisitExpr_(const OP* op) {                         \
+    bool is_enabled = is_enabled_;                                                   \
+    is_enabled_ = is_condition_ && op->a.ty().MatchesCode(DLDataTypeCode::kDLInt) && \
+                  op->b.ty().MatchesCode(DLDataTypeCode::kDLInt);                    \
+    auto result = Parent::VisitExpr_(op);                                            \
+    is_enabled_ = is_enabled;                                                        \
+    return result;                                                                   \
   }
 
 TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH(EQNode, operator==);
@@ -567,8 +579,8 @@ PrimExpr IndexDataTypeRewriter::VisitExpr_(const CallNode* op) {
     is_condition_ = true;
     PrimExpr cond = VisitExpr(op->args[0]);
     is_condition_ = is_condition;
-    return Call(op->dtype, op->op, {cond, VisitExpr(op->args[1]), VisitExpr(op->args[2])},
-                op->attrs, op->span);
+    return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op,
+                {cond, VisitExpr(op->args[1]), VisitExpr(op->args[2])}, op->attrs, op->span);
   }
   return Parent::VisitExpr_(op);
 }
@@ -582,20 +594,22 @@ PrimExpr IndexDataTypeRewriter::VisitExpr_(const SelectNode* op) {
   PrimExpr false_value = this->VisitExpr(op->false_value);
 
   if (condition.same_as(op->condition) && true_value.same_as(op->true_value) &&
-      false_value.same_as(op->false_value) && true_value.dtype() == false_value.dtype()) {
+      false_value.same_as(op->false_value) && true_value.ty() == false_value.ty()) {
     return ffi::GetRef<PrimExpr>(op);
   } else {
-    int bits = std::max(true_value.dtype().bits(), false_value.dtype().bits());
-    DataType dtype = true_value.dtype().with_bits(bits);
-    if (true_value.dtype() != dtype) true_value = cast(dtype, true_value);
-    if (false_value.dtype() != dtype) false_value = cast(dtype, false_value);
+    PrimType true_dtype = true_value.ty();
+    PrimType false_dtype = false_value.ty();
+    int bits = std::max(true_dtype.bits(), false_dtype.bits());
+    PrimType dtype = true_dtype.WithBits(bits);
+    if (true_dtype->dtype != dtype->dtype) true_value = cast(dtype, true_value);
+    if (false_dtype->dtype != dtype->dtype) false_value = cast(dtype, false_value);
     return Select(condition, true_value, false_value);
   }
 }
 
 #undef TVM_DEFINE_CMPOP_EXPR_MUTATE_WITH_TYPE_MATCH
 
-IndexDataTypeNormalizer::IndexDataTypeNormalizer(DataType target_data_type)
+IndexDataTypeNormalizer::IndexDataTypeNormalizer(PrimType target_data_type)
     : target_data_type_(std::move(target_data_type)) {}
 
 PrimFunc IndexDataTypeNormalizer::Rewrite(PrimFunc func) {
@@ -612,7 +626,7 @@ PrimFunc IndexDataTypeNormalizer::Rewrite(PrimFunc func) {
   bool is_enabled = true;
   std::swap(is_enabled_, is_enabled);
   ffi::Array<Var> params = func->params.Map([this](Var param) {
-    if (param.dtype().is_int()) {
+    if (param.ty().MatchesCode(DLDataTypeCode::kDLInt)) {
       return this->VisitExpr(param).as_or_throw<Var>();
     } else {
       return param;
@@ -627,12 +641,12 @@ PrimFunc IndexDataTypeNormalizer::Rewrite(PrimFunc func) {
   return func;
 }
 
-bool IndexDataTypeNormalizer::CanRewriteDType(DataType dtype) const {
-  return dtype.is_int() && dtype.bits() >= 32;
+bool IndexDataTypeNormalizer::CanRewriteDType(PrimType dtype) const {
+  return dtype.code() == DLDataTypeCode::kDLInt && dtype.bits() >= 32;
 }
 
 PrimExpr IndexDataTypeNormalizer::VisitExpr_(const IntImmNode* op) {
-  if (is_enabled_ && CanRewriteDType(op->dtype)) {
+  if (is_enabled_ && CanRewriteDType(op->ty())) {
     TVM_FFI_ICHECK_LE(op->value, max_value(target_data_type_).as_or_throw<IntImm>()->value);
     return cast(target_data_type_, ffi::GetRef<IntImm>(op));
   }
@@ -640,7 +654,8 @@ PrimExpr IndexDataTypeNormalizer::VisitExpr_(const IntImmNode* op) {
 }
 
 PrimExpr IndexDataTypeNormalizer::VisitExpr_(const VarNode* op) {
-  if (is_enabled_ && CanRewriteDType(op->dtype) && op->dtype != target_data_type_ &&
+  PrimType dtype = op->ty();
+  if (is_enabled_ && CanRewriteDType(dtype) && dtype->dtype != target_data_type_->dtype &&
       !var_remap_.count(op)) {
     var_remap_[op] = ffi::GetRef<Var>(op).copy_with_dtype(target_data_type_);
   }
@@ -651,9 +666,10 @@ PrimExpr IndexDataTypeNormalizer::VisitExpr_(const CastNode* op) {
   // Unwrap the cast only when the dtype of this cast is integer dtype.
   // When the dtype of this cast is not integer dtype, it means that this cast
   // has some other purpose, and we should not unwrap the cast.
-  if (is_enabled_ && CanRewriteDType(op->dtype)) {
+  PrimType dtype = op->ty();
+  if (is_enabled_ && CanRewriteDType(dtype)) {
     PrimExpr value = IndexDataTypeNormalizer::VisitExpr(op->value);
-    return value->dtype == target_data_type_ ? value : Cast(target_data_type_, value);
+    return value.ty()->dtype == target_data_type_->dtype ? value : Cast(target_data_type_, value);
   }
   return IndexDataTypeRewriter::VisitExpr_(op);
 }
diff --git a/src/tirx/ir/data_type_rewriter.h b/src/tirx/ir/data_type_rewriter.h
index 1bea362f6283..193aa2e09b55 100644
--- a/src/tirx/ir/data_type_rewriter.h
+++ b/src/tirx/ir/data_type_rewriter.h
@@ -141,7 +141,7 @@ class IndexDataTypeRewriter : public DataTypeLegalizer {
  */
 class IndexDataTypeNormalizer : public IndexDataTypeRewriter {
  public:
-  explicit IndexDataTypeNormalizer(DataType target_data_type);
+  explicit IndexDataTypeNormalizer(PrimType target_data_type);
   PrimFunc Rewrite(PrimFunc func);
 
  protected:
@@ -153,9 +153,9 @@ class IndexDataTypeNormalizer : public IndexDataTypeRewriter {
   PrimExpr VisitExpr_(const CastNode* op) override;
 
   /*! \brief Specifies which data type we can rewrite */
-  virtual bool CanRewriteDType(DataType dtype) const;
+  virtual bool CanRewriteDType(PrimType dtype) const;
 
-  DataType target_data_type_ = DataType::Int(64);
+  PrimType target_data_type_ = PrimType::Int(64);
 };
 
 }  // namespace tirx
diff --git a/src/tirx/ir/exec_scope.cc b/src/tirx/ir/exec_scope.cc
index 582ac578ceac..072666610ed3 100644
--- a/src/tirx/ir/exec_scope.cc
+++ b/src/tirx/ir/exec_scope.cc
@@ -389,7 +389,7 @@ ffi::Array<PrimExpr> ResolveCuda(ScopeBinding binding,
       ffi::Array<PrimExpr> ret;
       for (int i = 0; i < out_dim; ++i) {
         ret.push_back(
-            tirx::Call(DataType::Int(32), ptx_fetch_register_op,
+            tirx::Call(PrimType::Int(32), ptx_fetch_register_op,
                        {IntImm::Int32(32), StringImm("clusterid." + std::string(1, 'x' + i))}));
       }
       return ret;
@@ -438,8 +438,8 @@ ffi::Array<PrimExpr> ScopeIdResolve::Resolve(ScopeBinding binding,
 
 PrimExpr ScopeIdResolve::ComputeWarpIdInCta(const LaunchParams& params) {
   PrimExpr warp_id = FloorDiv(GetLinearThreadIndex(params), 32);
-  PrimExpr mask = IntImm(DataType::UInt(32), 0xffffffff);
-  return Call(warp_id.dtype(), builtin::tvm_warp_shuffle(),
+  PrimExpr mask = IntImm(PrimType::UInt(32), 0xffffffff);
+  return Call(warp_id.ty(), builtin::tvm_warp_shuffle(),
               {mask, warp_id, IntImm::Int32(0), IntImm::Int32(32), IntImm::Int32(32)});
 }
 
diff --git a/src/tirx/ir/expr.cc b/src/tirx/ir/expr.cc
index c2e89c6ec0de..0e250924c296 100644
--- a/src/tirx/ir/expr.cc
+++ b/src/tirx/ir/expr.cc
@@ -55,6 +55,20 @@ std::optional<int> ExtractVscaleFactor(const PrimExpr& lanes) {
   }
   return std::nullopt;
 }
+
+int GetLanesOrVScaleFactor(const PrimType& ty) {
+  return ty.IsScalableVector() ? ty.VScaleFactor() : ty.lanes();
+}
+
+TVM_FFI_INLINE const PrimTypeNode* GetPrimTypeNode(const PrimExpr& expr) {
+  // Avoid PrimExpr::ty() ObjectRef materialization in expression constructor hot paths.
+  const auto* node = expr.get();
+  TVM_FFI_DCHECK(node != nullptr);
+  TVM_FFI_DCHECK(node->BaseExprNode::ty.defined());
+  const auto* prim_ty = node->BaseExprNode::ty.as<PrimTypeNode>();
+  TVM_FFI_DCHECK(prim_ty != nullptr);
+  return prim_ty;
+}
 }  // namespace
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -109,44 +123,46 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   // src/script/printer/tirx/expr.cc (-> ReprPrintTIR which delegates to TVMScriptPrinter).
 }
 
-#define TVM_DEFINE_BINOP_CONSTRUCTOR(Name)                                    \
-  Name::Name(PrimExpr a, PrimExpr b, Span span) {                             \
-    using T = Name::ContainerType;                                            \
-    TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined\n";             \
-    TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined\n";             \
-    TVM_FFI_CHECK(a.dtype() == b.dtype(), TypeError)                          \
-        << "mismatched types. " << a.dtype() << " vs. " << b.dtype() << "\n"; \
-    ffi::ObjectPtr<T> node = ffi::make_object<T>();                           \
-    node->dtype = a.dtype();                                                  \
-    node->a = std::move(a);                                                   \
-    node->b = std::move(b);                                                   \
-    node->span = std::move(span);                                             \
-    data_ = std::move(node);                                                  \
+#define TVM_DEFINE_BINOP_CONSTRUCTOR(Name)                                        \
+  Name::Name(PrimExpr a, PrimExpr b, Span span) {                                 \
+    using T = Name::ContainerType;                                                \
+    TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined\n";                 \
+    TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined\n";                 \
+    const PrimTypeNode* a_ty = GetPrimTypeNode(a);                                \
+    const PrimTypeNode* b_ty = GetPrimTypeNode(b);                                \
+    TVM_FFI_CHECK(a_ty->dtype == b_ty->dtype, TypeError)                          \
+        << "mismatched types. " << a_ty->dtype << " vs. " << b_ty->dtype << "\n"; \
+    ffi::ObjectPtr<T> node = ffi::make_object<T>();                               \
+    node->BaseExprNode::ty = a.get()->BaseExprNode::ty;                           \
+    node->a = std::move(a);                                                       \
+    node->b = std::move(b);                                                       \
+    node->span = std::move(span);                                                 \
+    data_ = std::move(node);                                                      \
   }
 
-#define TVM_DEFINE_CMPOP_CONSTRUCTOR(Name)                                                  \
-  Name::Name(PrimExpr a, PrimExpr b, Span span) {                                           \
-    using T = Name::ContainerType;                                                          \
-    TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined\n";                           \
-    TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined\n";                           \
-    TVM_FFI_CHECK(a.dtype() == b.dtype(), TypeError)                                        \
-        << "mismatched types. " << a.dtype() << " vs. " << b.dtype() << "\n";               \
-    ffi::ObjectPtr<T> node = ffi::make_object<T>();                                         \
-    DataType a_dtype = a.dtype();                                                           \
-    node->dtype =                                                                           \
-        DataType::Bool(a_dtype.get_lanes_or_vscale_factor(), a_dtype.is_scalable_vector()); \
-    node->a = std::move(a);                                                                 \
-    node->b = std::move(b);                                                                 \
-    node->span = std::move(span);                                                           \
-    data_ = std::move(node);                                                                \
+#define TVM_DEFINE_CMPOP_CONSTRUCTOR(Name)                                        \
+  Name::Name(PrimExpr a, PrimExpr b, Span span) {                                 \
+    using T = Name::ContainerType;                                                \
+    TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined\n";                 \
+    TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined\n";                 \
+    const PrimTypeNode* a_ty = GetPrimTypeNode(a);                                \
+    const PrimTypeNode* b_ty = GetPrimTypeNode(b);                                \
+    TVM_FFI_CHECK(a_ty->dtype == b_ty->dtype, TypeError)                          \
+        << "mismatched types. " << a_ty->dtype << " vs. " << b_ty->dtype << "\n"; \
+    ffi::ObjectPtr<T> node = ffi::make_object<T>();                               \
+    node->BaseExprNode::ty = PrimType(DLDataType{kDLBool, 8, a_ty->dtype.lanes}); \
+    node->a = std::move(a);                                                       \
+    node->b = std::move(b);                                                       \
+    node->span = std::move(span);                                                 \
+    data_ = std::move(node);                                                      \
   }
 
 // Var
-Var::Var(ffi::String name_hint, DataType dtype, Span span) {
+Var::Var(ffi::String name_hint, PrimType dtype, Span span) {
   auto n = ffi::make_object<VarNode>();
   n->name_hint = std::move(name_hint);
-  n->type_annotation = GetTypeFromRuntimeDataType(dtype);
-  n->dtype = std::move(dtype);
+  n->type_annotation = dtype;
+  n->BaseExprNode::ty = dtype;
   n->span = std::move(span);
   data_ = std::move(n);
 }
@@ -154,8 +170,12 @@ Var::Var(ffi::String name_hint, DataType dtype, Span span) {
 Var::Var(ffi::String name_hint, Type type_annotation, Span span) {
   auto n = ffi::make_object<VarNode>();
   n->name_hint = std::move(name_hint);
-  n->dtype = GetRuntimeDataType(type_annotation);
   n->type_annotation = std::move(type_annotation);
+  if (n->type_annotation.as<PrimTypeNode>()) {
+    n->BaseExprNode::ty = n->type_annotation;
+  } else {
+    n->BaseExprNode::ty = PrimType(GetRuntimeDLDataType(n->type_annotation));
+  }
   n->span = std::move(span);
   data_ = std::move(n);
 }
@@ -176,7 +196,7 @@ Var Var::copy_with_suffix(const ffi::String& suffix) const {
   return this->copy_with_name(get()->name_hint + suffix);
 }
 
-Var Var::copy_with_dtype(DataType dtype) const {
+Var Var::copy_with_dtype(PrimType dtype) const {
   const VarNode* node = get();
   ffi::ObjectPtr<VarNode> new_ptr;
   if (auto* ptr = this->as<SizeVarNode>()) {
@@ -184,8 +204,8 @@ Var Var::copy_with_dtype(DataType dtype) const {
   } else {
     new_ptr = ffi::make_object<VarNode>(*node);
   }
-  new_ptr->type_annotation = GetTypeFromRuntimeDataType(dtype);
-  new_ptr->dtype = std::move(dtype);
+  new_ptr->type_annotation = dtype;
+  new_ptr->BaseExprNode::ty = dtype;
   return Var(new_ptr);
 }
 
@@ -195,17 +215,17 @@ TVM_FFI_STATIC_INIT_BLOCK() {
     if (type.as<Type>()) {
       return Var(name_hint, type.cast<Type>(), span);
     } else {
-      return Var(name_hint, type.cast<DataType>(), span);
+      return Var(name_hint, type.cast<PrimType>(), span);
     }
   });
 }
 
 // SizeVar
-SizeVar::SizeVar(ffi::String name_hint, DataType dtype, Span span) {
+SizeVar::SizeVar(ffi::String name_hint, PrimType dtype, Span span) {
   auto n = ffi::make_object<SizeVarNode>();
   n->name_hint = std::move(name_hint);
-  n->type_annotation = GetTypeFromRuntimeDataType(dtype);
-  n->dtype = std::move(dtype);
+  n->type_annotation = dtype;
+  n->BaseExprNode::ty = n->type_annotation;
   n->span = std::move(span);
   data_ = std::move(n);
 }
@@ -213,8 +233,8 @@ SizeVar::SizeVar(ffi::String name_hint, DataType dtype, Span span) {
 SizeVar::SizeVar(ffi::String name_hint, Type type_annotation, Span span) {
   auto n = ffi::make_object<SizeVarNode>();
   n->name_hint = std::move(name_hint);
-  n->dtype = GetRuntimeDataType(type_annotation);
   n->type_annotation = std::move(type_annotation);
+  n->BaseExprNode::ty = PrimType(GetRuntimeDLDataType(n->type_annotation));
   n->span = std::move(span);
   data_ = std::move(n);
 }
@@ -222,20 +242,22 @@ SizeVar::SizeVar(ffi::String name_hint, Type type_annotation, Span span) {
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def("tirx.SizeVar",
-                        [](ffi::String s, DataType t, Span span) { return SizeVar(s, t, span); });
+                        [](ffi::String s, PrimType t, Span span) { return SizeVar(s, t, span); });
 }
 
 // IterVar
 IterVar::IterVar(Range dom, Var var, IterVarType t, ffi::String thread_tag, Span span) {
   ffi::ObjectPtr<IterVarNode> n = ffi::make_object<IterVarNode>();
   if (dom.defined() && dom->extent.defined()) {
-    TVM_FFI_ICHECK(dom->extent.dtype().is_int())
+    PrimType extent_ty = dom->extent.ty();
+    PrimType var_ty = var.ty();
+    TVM_FFI_ICHECK(extent_ty.code() == DLDataTypeCode::kDLInt)
         << "The dtype of the domain of an IterVar must be an integer type. However, the domain's "
            "dtype is "
-        << dom->extent.dtype();
-    TVM_FFI_ICHECK_EQ(dom->extent.dtype(), var.dtype())
-        << "The dtype of the extent of an IterVar (" << dom->extent.dtype()
-        << ") must match its associated Var's dtype (" << var.dtype() << ")";
+        << extent_ty->dtype;
+    TVM_FFI_ICHECK(extent_ty == var_ty)
+        << "The dtype of the extent of an IterVar (" << extent_ty->dtype
+        << ") must match its associated Var's dtype (" << var_ty->dtype << ")";
   }
   n->dom = dom;
   n->var = var;
@@ -256,7 +278,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // StringImm
 StringImm::StringImm(ffi::String value, Span span) {
   ffi::ObjectPtr<StringImmNode> node = ffi::make_object<StringImmNode>();
-  node->dtype = DataType::Handle();
+  node->BaseExprNode::ty = PrimType::Handle();
   node->value = std::move(value);
   node->span = std::move(span);
   data_ = std::move(node);
@@ -269,12 +291,12 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 }
 
 // Cast
-Cast::Cast(DataType t, PrimExpr value, Span span) {
+Cast::Cast(PrimType value_ty, PrimExpr value, Span span) {
   TVM_FFI_ICHECK(value.defined());
-  TVM_FFI_ICHECK_EQ(t.get_lanes_or_vscale_factor(), value.dtype().get_lanes_or_vscale_factor());
-  TVM_FFI_ICHECK(t.is_scalable_vector() == value.dtype().is_scalable_vector());
+  PrimType value_expr_ty = value.ty();
+  TVM_FFI_ICHECK_EQ(value_ty->dtype.lanes, value_expr_ty->dtype.lanes);
   ffi::ObjectPtr<CastNode> node = ffi::make_object<CastNode>();
-  node->dtype = t;
+  node->BaseExprNode::ty = std::move(value_ty);
   node->value = std::move(value);
   node->span = std::move(span);
   data_ = std::move(node);
@@ -282,7 +304,7 @@ Cast::Cast(DataType t, PrimExpr value, Span span) {
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("tirx.Cast", [](DataType dtype, PrimExpr value, Span span) {
+  refl::GlobalDef().def("tirx.Cast", [](PrimType dtype, PrimExpr value, Span span) {
     return Cast(dtype, value, span);
   });
 }
@@ -426,13 +448,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 And::And(PrimExpr a, PrimExpr b, Span span) {
   TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined";
   TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined";
-  TVM_FFI_ICHECK(a.dtype().is_bool());
-  TVM_FFI_ICHECK(b.dtype().is_bool());
-  TVM_FFI_CHECK(a.dtype() == b.dtype(), TypeError) << "mismatched types";
+  PrimType a_ty = a.ty();
+  PrimType b_ty = b.ty();
+  TVM_FFI_ICHECK(a_ty.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_ICHECK(b_ty.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_CHECK(a_ty == b_ty, TypeError) << "mismatched types";
 
   ffi::ObjectPtr<AndNode> node = ffi::make_object<AndNode>();
-  node->dtype =
-      DataType::Bool(a.dtype().get_lanes_or_vscale_factor(), a.dtype().is_scalable_vector());
+  node->BaseExprNode::ty = PrimType(DLDataType{kDLBool, 8, a_ty->dtype.lanes});
   node->a = std::move(a);
   node->b = std::move(b);
   node->span = std::move(span);
@@ -449,13 +472,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Or::Or(PrimExpr a, PrimExpr b, Span span) {
   TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined";
   TVM_FFI_CHECK(b.defined(), ValueError) << "b is undefined";
-  TVM_FFI_ICHECK(a.dtype().is_bool());
-  TVM_FFI_ICHECK(b.dtype().is_bool());
-  TVM_FFI_CHECK(a.dtype() == b.dtype(), TypeError) << "mismatched types";
+  PrimType a_ty = a.ty();
+  PrimType b_ty = b.ty();
+  TVM_FFI_ICHECK(a_ty.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_ICHECK(b_ty.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_CHECK(a_ty == b_ty, TypeError) << "mismatched types";
 
   ffi::ObjectPtr<OrNode> node = ffi::make_object<OrNode>();
-  node->dtype =
-      DataType::Bool(a.dtype().get_lanes_or_vscale_factor(), a.dtype().is_scalable_vector());
+  node->BaseExprNode::ty = PrimType(DLDataType{kDLBool, 8, a_ty->dtype.lanes});
   node->a = std::move(a);
   node->b = std::move(b);
   node->span = std::move(span);
@@ -471,11 +495,11 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // Not
 Not::Not(PrimExpr a, Span span) {
   TVM_FFI_CHECK(a.defined(), ValueError) << "a is undefined";
-  TVM_FFI_ICHECK(a.dtype().is_bool());
+  PrimType a_ty = a.ty();
+  TVM_FFI_ICHECK(a_ty.MatchesCode(DLDataTypeCode::kDLBool));
 
   ffi::ObjectPtr<NotNode> node = ffi::make_object<NotNode>();
-  DataType a_dtype = a.dtype();
-  node->dtype = DataType::Bool(a_dtype.get_lanes_or_vscale_factor(), a_dtype.is_scalable_vector());
+  node->BaseExprNode::ty = PrimType(DLDataType{kDLBool, 8, a_ty->dtype.lanes});
   node->a = std::move(a);
   node->span = std::move(span);
   data_ = std::move(node);
@@ -491,16 +515,18 @@ Select::Select(PrimExpr condition, PrimExpr true_value, PrimExpr false_value, Sp
   TVM_FFI_CHECK(condition.defined(), ValueError) << "condition is undefined";
   TVM_FFI_CHECK(true_value.defined(), ValueError) << "true_value is undefined";
   TVM_FFI_CHECK(false_value.defined(), ValueError) << "true_value is undefined";
-  TVM_FFI_ICHECK(condition.dtype().is_bool());
-  TVM_FFI_ICHECK(condition.dtype().get_lanes_or_vscale_factor() ==
-                     true_value.dtype().get_lanes_or_vscale_factor() ||
-                 condition.dtype().is_scalar());
-  TVM_FFI_CHECK(false_value.dtype() == true_value.dtype(), TypeError)
+  PrimType condition_ty = condition.ty();
+  PrimType true_ty = true_value.ty();
+  PrimType false_ty = false_value.ty();
+  TVM_FFI_ICHECK(condition_ty.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_ICHECK(GetLanesOrVScaleFactor(condition_ty) == GetLanesOrVScaleFactor(true_ty) ||
+                 condition_ty.IsScalar());
+  TVM_FFI_CHECK(false_ty == true_ty, TypeError)
       << "mismatched types. "
-      << "False type: " << false_value.dtype() << "; True type: " << true_value.dtype();
+      << "False type: " << false_ty->dtype << "; True type: " << true_ty->dtype;
 
   ffi::ObjectPtr<SelectNode> node = ffi::make_object<SelectNode>();
-  node->dtype = true_value.dtype();
+  node->BaseExprNode::ty = true_ty;
   node->condition = std::move(condition);
   node->true_value = std::move(true_value);
   node->false_value = std::move(false_value);
@@ -520,10 +546,12 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Ramp::Ramp(PrimExpr base, PrimExpr stride, PrimExpr lanes, Span span) {
   TVM_FFI_ICHECK(base.defined());
   TVM_FFI_ICHECK(stride.defined());
-  TVM_FFI_ICHECK(base.dtype().is_scalar());
-  TVM_FFI_ICHECK(stride.dtype().is_scalar());
-  if (stride.dtype() != base.dtype()) {
-    stride = cast(base.dtype(), stride);
+  PrimType base_ty = base.ty();
+  PrimType stride_ty = stride.ty();
+  TVM_FFI_ICHECK(base_ty.IsScalar());
+  TVM_FFI_ICHECK(stride_ty.IsScalar());
+  if (stride_ty != base_ty) {
+    stride = cast(base_ty, stride);
   }
 
   ffi::ObjectPtr<RampNode> node = ffi::make_object<RampNode>();
@@ -531,15 +559,16 @@ Ramp::Ramp(PrimExpr base, PrimExpr stride, PrimExpr lanes, Span span) {
   if (lanes_as_int) {
     int lanes = static_cast<int>(lanes_as_int->value);
     TVM_FFI_ICHECK_GT(lanes, 1);
-    node->dtype = base.dtype().with_lanes(lanes);
+    node->BaseExprNode::ty = base_ty.WithLanes(lanes);
     // Stick to int32 lanes for fixed length vectors
     node->lanes = lanes;
   } else { /* scalable vector */
     std::optional<int> vscale_factor = ExtractVscaleFactor(lanes);
     TVM_FFI_ICHECK(vscale_factor) << "Invalid expression for scalable lanes " << lanes;
 
-    node->dtype = base.dtype().with_scalable_vscale_factor(vscale_factor.value());
-    lanes = Mul(Call(DataType::Int(32), tirx::builtin::vscale(), {}), vscale_factor.value());
+    node->BaseExprNode::ty =
+        PrimType::ScalableVector(base_ty.code(), base_ty.bits(), vscale_factor.value());
+    lanes = Mul(Call(PrimType::Int(32), tirx::builtin::vscale(), {}), vscale_factor.value());
     node->lanes = lanes;
   }
   node->base = base;
@@ -558,22 +587,24 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // Broadcast
 Broadcast::Broadcast(PrimExpr value, PrimExpr lanes, Span span) {
   TVM_FFI_ICHECK(value.defined());
-  TVM_FFI_ICHECK(value.dtype().is_scalar());
+  PrimType value_ty = value.ty();
+  TVM_FFI_ICHECK(value_ty.IsScalar());
 
   ffi::ObjectPtr<BroadcastNode> node = ffi::make_object<BroadcastNode>();
   auto* lanes_int = lanes.as<IntImmNode>();
   if (lanes_int) {
     int lanes = static_cast<int>(lanes_int->value);
     TVM_FFI_ICHECK_GT(lanes, 1);
-    node->dtype = value.dtype().with_lanes(lanes);
+    node->BaseExprNode::ty = value_ty.WithLanes(lanes);
     // Stick to int32 lanes for fixed length vectors
     node->lanes = lanes;
   } else { /* scalable vector */
     std::optional<int> vscale_factor = ExtractVscaleFactor(lanes);
     TVM_FFI_ICHECK(vscale_factor) << "Invalid expression for scalable lanes " << lanes;
 
-    node->dtype = value.dtype().with_scalable_vscale_factor(vscale_factor.value());
-    lanes = Mul(Call(DataType::Int(32), tirx::builtin::vscale(), {}), vscale_factor.value());
+    node->BaseExprNode::ty =
+        PrimType::ScalableVector(value_ty.code(), value_ty.bits(), vscale_factor.value());
+    lanes = Mul(Call(PrimType::Int(32), tirx::builtin::vscale(), {}), vscale_factor.value());
     node->lanes = lanes;
   }
   node->value = std::move(value);
@@ -592,10 +623,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 Let::Let(Var var, PrimExpr value, PrimExpr body, Span span) {
   TVM_FFI_ICHECK(value.defined());
   TVM_FFI_ICHECK(body.defined());
-  TVM_FFI_ICHECK_EQ(value.dtype(), var.dtype());
+  TVM_FFI_ICHECK(value.ty() == var.ty());
 
   ffi::ObjectPtr<LetNode> node = ffi::make_object<LetNode>();
-  node->dtype = body.dtype();
+  node->BaseExprNode::ty = body.ty();
   node->var = std::move(var);
   node->value = std::move(value);
   node->body = std::move(body);
@@ -628,7 +659,7 @@ static ffi::Array<PrimExpr> ConvertCallArgs(ffi::Array<CallArg> args) {
         if (is_one(r->extent)) {
           indices.push_back(r->min);
         } else if (r->extent.as<IntImmNode>()) {
-          indices.push_back(tirx::Ramp(r->min, MakeConst(r->min->dtype, 1), r->extent));
+          indices.push_back(tirx::Ramp(r->min, MakeConst(r->min.ty(), 1), r->extent));
         } else {
           TVM_FFI_THROW(ValueError)
               << "Cannot convert to BufferLoad: " << ffi::GetRef<BufferRegion>(br);
@@ -642,13 +673,13 @@ static ffi::Array<PrimExpr> ConvertCallArgs(ffi::Array<CallArg> args) {
   return prim_expr_args;
 }
 
-Call::Call(DataType dtype, RelaxExpr op, ffi::Array<PrimExpr> args, Attrs attrs, Span span) {
+Call::Call(PrimType ret_ty, RelaxExpr op, ffi::Array<PrimExpr> args, Attrs attrs, Span span) {
   for (size_t i = 0; i < args.size(); ++i) {
     TVM_FFI_ICHECK(args[i].defined()) << "arg " << i << " is not defined()";
   }
 
   ffi::ObjectPtr<CallNode> node = ffi::make_object<CallNode>();
-  node->dtype = dtype;
+  node->BaseExprNode::ty = std::move(ret_ty);
   node->op = std::move(op);
   node->args = std::move(args);
   node->attrs = std::move(attrs);
@@ -656,21 +687,21 @@ Call::Call(DataType dtype, RelaxExpr op, ffi::Array<PrimExpr> args, Attrs attrs,
   data_ = std::move(node);
 }
 
-Call::Call(DataType dtype, RelaxExpr op, ffi::Array<PrimExpr> args, Span span)
-    : Call(dtype, std::move(op), std::move(args), Attrs(), std::move(span)) {}
+Call::Call(PrimType ret_ty, RelaxExpr op, ffi::Array<PrimExpr> args, Span span)
+    : Call(std::move(ret_ty), std::move(op), std::move(args), Attrs(), std::move(span)) {}
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef()
       .def("tirx.Call",
-           [](ffi::Optional<DataType> dtype, RelaxExpr op, ffi::Array<CallArg> args, Span span) {
-             return Call(dtype.value_or(DataType::Void()), op, ConvertCallArgs(args), Attrs(),
+           [](ffi::Optional<PrimType> dtype, RelaxExpr op, ffi::Array<CallArg> args, Span span) {
+             return Call(dtype.value_or(PrimType::Void()), op, ConvertCallArgs(args), Attrs(),
                          span);
            })
       .def("tirx.CallWithAttrs",
-           [](ffi::Optional<DataType> dtype, RelaxExpr op, ffi::Array<CallArg> args,
+           [](ffi::Optional<PrimType> dtype, RelaxExpr op, ffi::Array<CallArg> args,
               ffi::Optional<Attrs> attrs, Span span) {
-             return Call(dtype.value_or(DataType::Void()), op, ConvertCallArgs(args),
+             return Call(dtype.value_or(PrimType::Void()), op, ConvertCallArgs(args),
                          attrs.value_or(Attrs()), span);
            });
 }
@@ -680,17 +711,18 @@ Shuffle::Shuffle(ffi::Array<PrimExpr> vectors, ffi::Array<PrimExpr> indices, Spa
   TVM_FFI_ICHECK_NE(vectors.size(), 0U);
   TVM_FFI_ICHECK_NE(indices.size(), 0U);
 
-  DataType base_type = vectors[0].dtype().element_of();
+  PrimType base_type = vectors[0].ty().WithLanes(1);
   int total_lanes = 0;
 
   for (PrimExpr val : vectors) {
-    TVM_FFI_ICHECK(val.dtype().element_of() == base_type);
-    total_lanes += val.dtype().lanes();
+    PrimType val_ty = val.ty();
+    TVM_FFI_ICHECK(val_ty.WithLanes(1)->dtype == base_type->dtype);
+    total_lanes += val_ty.lanes();
   }
   TVM_FFI_ICHECK_LE(indices.size(), static_cast<size_t>(total_lanes));
 
   ffi::ObjectPtr<ShuffleNode> node = ffi::make_object<ShuffleNode>();
-  node->dtype = base_type.with_lanes(static_cast<int>(indices.size()));
+  node->BaseExprNode::ty = base_type.WithLanes(static_cast<int>(indices.size()));
   node->vectors = std::move(vectors);
   node->indices = std::move(indices);
   node->span = std::move(span);
@@ -705,7 +737,7 @@ PrimExpr Shuffle::Concat(ffi::Array<PrimExpr> vectors, Span span) {
   ffi::Array<PrimExpr> indices;
   int index = 0;
   for (const PrimExpr& e : vectors) {
-    for (int i = 0; i < e.dtype().lanes(); ++i) {
+    for (int i = 0; i < e.ty().lanes(); ++i) {
       indices.push_back(IntImm::Int32(index++));
     }
   }
@@ -743,7 +775,7 @@ CommReducer::CommReducer(ffi::Array<Var> lhs, ffi::Array<Var> rhs, ffi::Array<Pr
   std::unordered_map<const VarNode*, PrimExpr> var_map;
   var_map.reserve(n_group * 2);
   for (int i = 0; i < static_cast<int>(n_group); ++i) {
-    DataType dtype = identity_element[i].dtype();
+    PrimType dtype = identity_element[i].ty();
     Var l = lhs[i].copy_with_dtype(dtype);
     Var r = rhs[i].copy_with_dtype(dtype);
     var_map[lhs[i].get()] = l;
@@ -815,7 +847,7 @@ Reduce::Reduce(CommReducer combiner, ffi::Array<PrimExpr> source, ffi::Array<Ite
           << "but received " << init[i] << " of type " << init[i]->GetTypeKey();
     }
   }
-  n->dtype = source[value_index].dtype();
+  n->BaseExprNode::ty = source[value_index].ty();
   n->combiner = std::move(combiner);
   n->source = std::move(source);
   n->init = std::move(init);
@@ -838,28 +870,30 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // BufferLoad
 void BufferLoadNode::LegalizeDType() {
   for (int i = 0; i < static_cast<int>(indices.size()) - 1; i++) {
-    TVM_FFI_ICHECK(indices[i].dtype().is_scalar())
+    TVM_FFI_ICHECK(indices[i].ty().IsScalar())
         << "Only the last index of a buffer access may be a vector type.";
   }
 
   if (indices.empty()) {
-    this->dtype = buffer->dtype;
+    this->BaseExprNode::ty = buffer->dtype;
   } else {
-    auto index_dtype = indices.back().dtype();
-    bool is_buffer_dtype_scalable = buffer->dtype.is_scalable_vector();
-    bool is_index_scalable = index_dtype.is_scalable_vector();
+    PrimType index_ty = indices.back().ty();
+    int16_t buffer_encoded_lanes = static_cast<int16_t>(buffer->dtype->dtype.lanes);
+    bool is_buffer_dtype_scalable = buffer_encoded_lanes < -1;
+    bool is_index_scalable = index_ty.IsScalableVector();
 
     TVM_FFI_ICHECK(!(is_index_scalable && is_buffer_dtype_scalable))
         << "Index dtype and buffer dtype can't both be scalable.";
 
     if (is_index_scalable) {
-      this->dtype = buffer->dtype.with_scalable_vscale_factor(index_dtype.vscale_factor() *
-                                                              buffer->dtype.lanes());
+      this->BaseExprNode::ty =
+          PrimType::ScalableVector(buffer->dtype.code(), buffer->dtype.bits(),
+                                   index_ty.VScaleFactor() * buffer->dtype.lanes());
     } else if (is_buffer_dtype_scalable) {
-      this->dtype = buffer->dtype.with_scalable_vscale_factor(buffer->dtype.vscale_factor() *
-                                                              index_dtype.lanes());
+      this->BaseExprNode::ty = PrimType::ScalableVector(buffer->dtype.code(), buffer->dtype.bits(),
+                                                        -buffer_encoded_lanes * index_ty.lanes());
     } else {
-      this->dtype = buffer->dtype.with_lanes(index_dtype.lanes() * buffer->dtype.lanes());
+      this->BaseExprNode::ty = buffer->dtype.WithLanes(index_ty.lanes() * buffer->dtype.lanes());
     }
   }
 }
@@ -872,25 +906,24 @@ BufferLoad::BufferLoad(Buffer buffer, ffi::Array<PrimExpr> indices,
       << "-dimensional indices provided.";
 
   if (predicate.defined()) {
-    DataType predicate_dtype = predicate.value().dtype();
-
-    bool is_index_scalable = indices.empty() ? false : indices.back().dtype().is_scalable_vector();
-    bool is_predicate_scalable = predicate_dtype.is_scalable_vector();
+    PrimType predicate_ty = predicate.value().ty();
+    bool is_index_scalable = indices.empty() ? false : indices.back().ty().IsScalableVector();
+    bool is_predicate_scalable = predicate_ty.IsScalableVector();
     TVM_FFI_ICHECK_EQ(is_index_scalable, is_predicate_scalable)
         << "Predicate mask dtype and load indices must both be scalable.";
 
-    int buffer_lanes = buffer->dtype.get_lanes_or_vscale_factor();
-    int index_lanes = indices.empty() ? 1 : indices.back().dtype().get_lanes_or_vscale_factor();
-    int predicate_lanes = predicate_dtype.get_lanes_or_vscale_factor();
+    int16_t buffer_encoded_lanes = static_cast<int16_t>(buffer->dtype->dtype.lanes);
+    int buffer_lanes = buffer_encoded_lanes < -1 ? -buffer_encoded_lanes : buffer_encoded_lanes;
+    int index_lanes = indices.empty() ? 1 : GetLanesOrVScaleFactor(indices.back().ty());
+    int predicate_lanes = GetLanesOrVScaleFactor(predicate_ty);
     TVM_FFI_ICHECK_EQ(index_lanes * buffer_lanes, predicate_lanes)
         << "Got a predicate mask with " << predicate_lanes
         << " lanes, but trying to load a vector with " << index_lanes
         << " lanes. The number of lanes must match.";
 
-    DataType predicate_element_dtype = predicate_dtype.element_of();
-    TVM_FFI_ICHECK(predicate_element_dtype.is_predicate_dtype())
-        << "Predicate mask elements must be boolean values, but got " << predicate_element_dtype
-        << ".";
+    TVM_FFI_ICHECK(predicate_ty.MatchesCode(DLDataTypeCode::kDLBool) ||
+                   predicate_ty.MatchesElementType(DLDataTypeCode::kDLUInt, 1))
+        << "Predicate mask elements must be boolean values, but got " << predicate_ty->dtype << ".";
   }
 
   ffi::ObjectPtr<BufferLoadNode> node = ffi::make_object<BufferLoadNode>();
@@ -913,7 +946,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // ProducerLoad
 ProducerLoad::ProducerLoad(DataProducer producer, ffi::Array<PrimExpr> indices, Span span) {
   ffi::ObjectPtr<ProducerLoadNode> node = ffi::make_object<ProducerLoadNode>();
-  node->dtype = producer->GetDataType();
+  node->BaseExprNode::ty = producer->GetDataType();
   node->producer = std::move(producer);
   node->indices = std::move(indices);
   node->span = std::move(span);
diff --git a/src/tirx/ir/expr_functor.cc b/src/tirx/ir/expr_functor.cc
index aba96aae8c3a..056ed9419bc8 100644
--- a/src/tirx/ir/expr_functor.cc
+++ b/src/tirx/ir/expr_functor.cc
@@ -155,7 +155,7 @@ PrimExpr ExprMutator::VisitExpr_(const CallNode* op) {
   if (args.same_as(op->args)) {
     return ffi::GetRef<PrimExpr>(op);
   } else {
-    return Call(op->dtype, op->op, args, op->attrs, op->span);
+    return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op, args, op->attrs, op->span);
   }
 }
 
@@ -227,7 +227,7 @@ PrimExpr ExprMutator::VisitExpr_(const CastNode* op) {
   if (value.same_as(op->value)) {
     return ffi::GetRef<PrimExpr>(op);
   } else {
-    return Cast(op->dtype, value);
+    return Cast(ffi::GetRef<PrimExpr>(op).ty(), value);
   }
 }
 
diff --git a/src/tirx/ir/function.cc b/src/tirx/ir/function.cc
index d6b171481ea7..7fc1439fb1ea 100644
--- a/src/tirx/ir/function.cc
+++ b/src/tirx/ir/function.cc
@@ -45,23 +45,25 @@ tvm::Type InferType(const PrimFunc& prim_func) {
       if (auto opt_buf = prim_func->buffer_map.Get(param)) {
         auto buf = opt_buf.value();
         relax::ShapeExpr shape(
-            buf->shape.Map([](PrimExpr dim) { return cast(DataType::Int(64), dim); }));
+            buf->shape.Map([](PrimExpr dim) { return cast(PrimType::Int(64), dim); }));
         return relax::TensorType(shape, buf->dtype);
       }
 
-      if (auto prim_type = param->type_annotation.as<PrimTypeNode>();
-          prim_type && prim_type->dtype.is_handle()) {
-        return relax::ObjectType();
+      if (auto prim_type = param->type_annotation.as<PrimTypeNode>()) {
+        const DLDataType& dtype = prim_type->dtype;
+        if (dtype.code == kDLOpaqueHandle && (dtype.bits != 0 || dtype.lanes != 0)) {
+          return relax::ObjectType();
+        }
       }
 
-      return PrimType(param->dtype);
+      return param.ty();
     }();
     params.push_back(param_ty);
   }
 
   tvm::Type ret = [&]() -> tvm::Type {
     if (const auto* prim = prim_func->ret_type.as<PrimTypeNode>()) {
-      return PrimType(prim->dtype);
+      return tvm::PrimType(prim->dtype);
     } else if (IsVoidType(prim_func->ret_type)) {
       return relax::TupleType(ffi::Array<tvm::Type>{});
     } else {
@@ -119,10 +121,10 @@ TensorIntrin::TensorIntrin(PrimFunc desc, PrimFunc impl) {
       << "The number of parameters of the description and the implementation of the "
          "tensor intrinsic doesn't match.";
   for (size_t i = 0; i < desc->params.size(); i++) {
-    TVM_FFI_CHECK(desc->params[i]->dtype.is_handle(), ValueError)
+    TVM_FFI_CHECK(desc->params[i].ty().IsHandle(), ValueError)
         << "Parameters of the description of the "
            "tensor intrinsic should be handle only.";
-    TVM_FFI_CHECK(impl->params[i]->dtype.is_handle(), ValueError)
+    TVM_FFI_CHECK(impl->params[i].ty().IsHandle(), ValueError)
         << "Parameters of the implementation of "
            "the tensor intrinsic should be handle only.";
   }
diff --git a/src/tirx/ir/index_map.cc b/src/tirx/ir/index_map.cc
index 4e9e7ecea8b6..382c75348941 100644
--- a/src/tirx/ir/index_map.cc
+++ b/src/tirx/ir/index_map.cc
@@ -53,7 +53,7 @@ IndexMap IndexMap::FromFunc(int ndim,
   ffi::Array<Var> initial_indices;
   initial_indices.reserve(ndim);
   for (int i = 0; i < ndim; ++i) {
-    initial_indices.push_back(Var("i" + std::to_string(i), DataType::Int(32)));
+    initial_indices.push_back(Var("i" + std::to_string(i), PrimType::Int(32)));
   }
   return IndexMap(initial_indices, func(initial_indices), std::move(inverse_index_map));
 }
@@ -83,7 +83,7 @@ std::pair<IndexMap, PrimExpr> IndexMapInverseImpl(const IndexMap& self,
     // should be named (X.outer,X.inner).
     std::stringstream ss;
     ss << "axis" << i;
-    Var var_index(ss.str(), index.dtype());
+    Var var_index(ss.str(), index.ty());
     output_vars.push_back(var_index);
   }
 
@@ -249,12 +249,13 @@ ffi::Array<Range> IndexMapNode::MapRanges(const ffi::Array<Range>& ranges,
   auto output_dtype = [&]() {
     int max_bits = ranges.empty() ? 32 : 0;
     for (const auto& range : ranges) {
-      max_bits = std::max(max_bits, range->extent.dtype().bits());
+      max_bits = std::max(max_bits, range->extent.ty().bits());
     }
-    return DataType::Int(max_bits);
+    return PrimType::Int(max_bits);
   }();
   output.MutateByApply([&](const Range& range) {
-    if (range->min.dtype() != output_dtype || range->extent.dtype() != output_dtype) {
+    if (range->min.ty()->dtype != output_dtype->dtype ||
+        range->extent.ty()->dtype != output_dtype->dtype) {
       return Range::FromMinExtent(cast(output_dtype, range->min),
                                   cast(output_dtype, range->extent));
     } else {
@@ -275,7 +276,7 @@ ffi::Array<PrimExpr> IndexMapNode::MapShape(const ffi::Array<PrimExpr>& shape,
 
   ffi::Array<Range> ranges;
   for (auto& dim : shape) {
-    ranges.push_back(Range(IntImm(dim.dtype(), 0), dim));
+    ranges.push_back(Range(IntImm(dim.ty(), 0), dim));
   }
   ffi::Array<Range> mapped = MapRanges(std::move(ranges), analyzer);
 
@@ -366,7 +367,7 @@ IndexMap IndexMap::RenameVariables(
           ffi::String name = opt_name.value();
           TVM_FFI_ICHECK(!name_supply->ContainsName(name, /*add_prefix=*/false));
           name_supply->ReserveName(name, /*add_prefix=*/false);
-          var_remap.Set(var, Var(name, var->dtype));
+          var_remap.Set(var, Var(name, var.ty()));
         }
       });
     });
diff --git a/src/tirx/ir/layout/axis_registry.cc b/src/tirx/ir/layout/axis_registry.cc
index 2afd290037c8..633296cee629 100644
--- a/src/tirx/ir/layout/axis_registry.cc
+++ b/src/tirx/ir/layout/axis_registry.cc
@@ -169,7 +169,7 @@ ffi::Array<Iter> SplitterGen(const Iter& iter, const Axis& axis_outer, const Axi
              analyzer->CanProveEqual(floormod(iter->extent * iter->stride, e_inner), 0)) {
     const auto& d = analyzer->Simplify(floordiv(e_inner, iter->stride));
     const auto& c = analyzer->Simplify(floordiv(iter->extent, d));
-    return {Iter(c, IntImm(e_inner.dtype(), 1), axis_outer), Iter(d, iter->stride, axis_inner)};
+    return {Iter(c, IntImm(e_inner.ty(), 1), axis_outer), Iter(d, iter->stride, axis_inner)};
   } else if (analyzer->CanProveEqual(floormod(iter->stride, e_inner), 0)) {
     const auto& d = analyzer->Simplify(floordiv(iter->stride, e_inner));
     return {Iter(iter->extent, d, axis_outer)};
diff --git a/src/tirx/ir/layout/tile_slice.cc b/src/tirx/ir/layout/tile_slice.cc
index 8b4181d2bfa8..b172f7fec0ff 100644
--- a/src/tirx/ir/layout/tile_slice.cc
+++ b/src/tirx/ir/layout/tile_slice.cc
@@ -118,7 +118,7 @@ ffi::Optional<TileLayout> SlicePerGroup(TileLayout layout, PrimExpr begin, PrimE
     return TileLayout(new_shard, layout->replica, new_offset);
   }
 
-  PrimExpr two = MakeConst(rem.dtype(), 2);
+  PrimExpr two = MakeConst(rem.ty(), 2);
   PrimExpr c = analyzer->Simplify(floordiv(rem, two));
   bool even = analyzer->CanProveEqual(floormod(rem, two), 0);
   bool mid = analyzer->CanProveEqual(analyzer->Simplify(d0[pivot] + c), Ek);
@@ -131,7 +131,7 @@ ffi::Optional<TileLayout> SlicePerGroup(TileLayout layout, PrimExpr begin, PrimE
       PrimExpr delta =
           analyzer->Simplify((pivot > 0 ? shard[pivot - 1]->stride : PrimExpr(0)) - (Ek - c) * Sk);
       std::vector<Iter> new_shard;
-      new_shard.push_back(Iter(MakeConst(c.dtype(), 2), delta, ak));
+      new_shard.push_back(Iter(MakeConst(c.ty(), 2), delta, ak));
       new_shard.push_back(Iter(c, Sk, ak));
       new_shard.insert(new_shard.end(), peeled_rev.rbegin(), peeled_rev.rend());
       return TileLayout(new_shard, layout->replica, new_offset);
diff --git a/src/tirx/ir/layout/utils.cc b/src/tirx/ir/layout/utils.cc
index 477f512a4e42..05828a66001c 100644
--- a/src/tirx/ir/layout/utils.cc
+++ b/src/tirx/ir/layout/utils.cc
@@ -73,7 +73,7 @@ std::vector<PrimExpr> GetDefaultStrides(const ffi::Array<PrimExpr>& data, PrimEx
   // get int32 strides and structurally differ from parser output.
   PrimExpr current_stride = initial_stride;
   if (const auto* imm = current_stride.as<IntImmNode>()) {
-    current_stride = MakeConst(data[0].dtype(), imm->value);
+    current_stride = MakeConst(data[0].ty(), imm->value);
   }
   for (int i = static_cast<int>(n) - 1; i >= 0; --i) {
     strides[i] = current_stride;
diff --git a/src/tirx/ir/script/script_complete.cc b/src/tirx/ir/script/script_complete.cc
index c432731ebad5..bb915e96acf8 100644
--- a/src/tirx/ir/script/script_complete.cc
+++ b/src/tirx/ir/script/script_complete.cc
@@ -45,8 +45,9 @@ class ScriptCompleter : public StmtMutator {
   ffi::Map<Var, Buffer>* buffer_var_map_;
   Stmt VisitStmt_(const SBlockRealizeNode* op) final {
     for (const PrimExpr& value : op->iter_values) {
-      TVM_FFI_ICHECK(value.dtype().is_int())
-          << "BlockRealize iter_value expected a IntImm, but got " << value.dtype();
+      PrimType value_ty = value.ty();  // read once; used by the check and by its message
+      TVM_FFI_ICHECK(value_ty.code() == DLDataTypeCode::kDLInt)  // dtype-code test only
+          << "BlockRealize iter_value expected a IntImm, but got " << value_ty->dtype;
     }
     return StmtMutator::VisitStmt_(op);
   }
diff --git a/src/tirx/ir/stmt.cc b/src/tirx/ir/stmt.cc
index 66f48355c5ce..d4908df436c3 100644
--- a/src/tirx/ir/stmt.cc
+++ b/src/tirx/ir/stmt.cc
@@ -33,6 +33,14 @@
 namespace tvm {
 namespace tirx {
 
+namespace {
+// Returns ty.VScaleFactor() when ty is a scalable vector, otherwise ty.lanes().
+int GetLanesOrVScaleFactor(const PrimType& ty) {
+  return ty.IsScalableVector() ? ty.VScaleFactor() : ty.lanes();
+}
+
+}  // namespace
+
 TVM_FFI_STATIC_INIT_BLOCK() {
   StmtNode::RegisterReflection();
   BindNode::RegisterReflection();
@@ -59,12 +67,12 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // Bind
 Bind::Bind(Var var, PrimExpr value, Span span) {
   TVM_FFI_ICHECK(value.defined());
-  auto vdtype = value.dtype();
+  PrimType value_ty = value.ty();
   // It is still valid to bind a pointer type var to a value that is of type handle.
   if (var->type_annotation.as<PointerTypeNode>()) {
-    TVM_FFI_ICHECK(vdtype.is_handle());
+    TVM_FFI_ICHECK(value_ty.IsHandle());
   } else {
-    TVM_FFI_ICHECK_EQ(value.dtype(), var.dtype());
+    TVM_FFI_ICHECK(value.ty() == var.ty());
   }
 
   ffi::ObjectPtr<BindNode> node = ffi::make_object<BindNode>();
@@ -108,9 +116,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 AssertStmt::AssertStmt(PrimExpr condition, StringImm error_kind,
                        ffi::Array<StringImm> message_parts, Span span) {
   TVM_FFI_ICHECK(condition.defined());
-  TVM_FFI_ICHECK(condition.dtype().is_predicate_dtype())
+  PrimType condition_ty = condition.ty();
+  TVM_FFI_ICHECK(condition_ty.MatchesCode(DLDataTypeCode::kDLBool))
       << "AssertStmt should have boolean condition, "
-      << "but received " << condition << " with dtype " << condition.dtype();
+      << "but received " << condition << " with dtype " << condition_ty;
   TVM_FFI_ICHECK(error_kind.defined());
 
   ffi::ObjectPtr<AssertStmtNode> node = ffi::make_object<AssertStmtNode>();
@@ -139,8 +148,9 @@ For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForKind kind, Stmt body,
   TVM_FFI_ICHECK(body.defined());
 
   auto require_scalar_int_dtype = [&](PrimExpr expr, const char* field_name) {
-    auto dtype = expr.dtype();
-    TVM_FFI_ICHECK(dtype.is_scalar() && (dtype.is_int() || dtype.is_uint()))
+    PrimType dtype = expr.ty();
+    TVM_FFI_ICHECK(dtype.IsScalar() &&
+                   (dtype.MatchesCode(DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)))
         << "TIR For nodes require a scalar integer as the " << field_name << ", but received "
         << expr << " with dtype " << dtype;
   };
@@ -151,12 +161,14 @@ For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForKind kind, Stmt body,
   // When extent, min or step is an IntImm but has narrower dtype than loop_var
   // we directly promote them without raising errors.
   auto try_promote_imm_dtype = [&](const PrimExpr& e) {
-    TVM_FFI_ICHECK(e.dtype().bits() <= loop_var.dtype().bits())
-        << " Loop variable's dtype (" << loop_var.dtype()
-        << ") is narrower than that of `min` or `extent` (" << e.dtype() << ")";
+    PrimType e_ty = e.ty();
+    PrimType loop_var_ty = loop_var.ty();
+    TVM_FFI_ICHECK(e_ty.bits() <= loop_var_ty.bits())
+        << " Loop variable's dtype (" << loop_var_ty
+        << ") is narrower than that of `min` or `extent` (" << e_ty << ")";
     const IntImmNode* a = e.as<IntImmNode>();
-    if (a && e.dtype().bits() < loop_var.dtype().bits()) {
-      return MakeConst(loop_var.dtype(), a->value);
+    if (a && e_ty.bits() < loop_var_ty.bits()) {
+      return MakeConst(loop_var_ty, a->value);
     } else {
       return e;
     }
@@ -165,15 +177,14 @@ For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForKind kind, Stmt body,
   min = try_promote_imm_dtype(min);
   extent = try_promote_imm_dtype(extent);
 
-  TVM_FFI_ICHECK(loop_var.dtype() == min.dtype()) << loop_var.dtype() << " vs " << min.dtype();
-  TVM_FFI_ICHECK(loop_var.dtype() == extent.dtype())
-      << loop_var.dtype() << " vs " << extent.dtype();
+  TVM_FFI_ICHECK(loop_var.ty() == min.ty()) << loop_var.ty() << " vs " << min.ty();
+  TVM_FFI_ICHECK(loop_var.ty() == extent.ty()) << loop_var.ty() << " vs " << extent.ty();
 
   if (step.has_value()) {
     require_scalar_int_dtype(*step, "step");
     step = try_promote_imm_dtype(*step);
-    TVM_FFI_ICHECK(loop_var.dtype() == (*step).dtype())
-        << loop_var.dtype() << " vs " << (*step).dtype();
+    TVM_FFI_ICHECK(loop_var.ty() == step.value().ty())
+        << loop_var.ty() << " vs " << step.value().ty();
   }
 
   ffi::ObjectPtr<ForNode> node = ffi::make_object<ForNode>();
@@ -226,7 +237,7 @@ std::ostream& operator<<(std::ostream& out, ForKind type) {  // NOLINT(*)
 // While
 While::While(PrimExpr condition, Stmt body, Span span) {
   TVM_FFI_ICHECK(condition.defined());
-  TVM_FFI_ICHECK(condition.dtype().is_scalar());
+  TVM_FFI_ICHECK(condition.ty().IsScalar());
   TVM_FFI_ICHECK(body.defined());
 
   ffi::ObjectPtr<WhileNode> node = ffi::make_object<WhileNode>();
@@ -393,19 +404,21 @@ BufferStore::BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> ind
       << "-dimensional indices provided.";
 
   for (int i = 0; i < static_cast<int>(indices.size()) - 1; i++) {
-    TVM_FFI_ICHECK(indices[i].dtype().is_scalar())
+    TVM_FFI_ICHECK(indices[i].ty().IsScalar())
         << "Only the last index of a buffer access may be a vector type.";
   }
 
-  bool is_index_scalable = indices.empty() ? false : indices.back().dtype().is_scalable_vector();
-  bool is_buffer_dtype_scalable = buffer->dtype.is_scalable_vector();
-  bool is_value_dtype_scalable = value.dtype().is_scalable_vector();
+  bool is_index_scalable = indices.empty() ? false : indices.back().ty().IsScalableVector();
+  int16_t buffer_encoded_lanes = static_cast<int16_t>(buffer->dtype->dtype.lanes);
+  bool is_buffer_dtype_scalable = buffer_encoded_lanes < -1;
+  PrimType value_ty = value.ty();
+  bool is_value_dtype_scalable = value_ty.IsScalableVector();
 
   TVM_FFI_ICHECK(!(is_index_scalable && is_buffer_dtype_scalable))
       << "Index dtype and buffer dtype can't both be scalable.";
 
   if (predicate.defined()) {
-    bool is_predicate_dtype_scalable = predicate.value().dtype().is_scalable_vector();
+    bool is_predicate_dtype_scalable = predicate.value().ty().IsScalableVector();
     TVM_FFI_ICHECK_EQ(is_value_dtype_scalable, is_predicate_dtype_scalable)
         << "Predicate mask dtype and value dtype must both be scalable.";
   }
@@ -414,9 +427,9 @@ BufferStore::BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> ind
     TVM_FFI_ICHECK(is_value_dtype_scalable) << "Can't store non-scalable data into scalable buffer";
   }
 
-  int index_lanes = indices.empty() ? 1 : indices.back().dtype().get_lanes_or_vscale_factor();
-  int buffer_lanes = buffer->dtype.get_lanes_or_vscale_factor();
-  int value_dtype_lanes = value.dtype().get_lanes_or_vscale_factor();
+  int index_lanes = indices.empty() ? 1 : GetLanesOrVScaleFactor(indices.back().ty());
+  int buffer_lanes = is_buffer_dtype_scalable ? -buffer_encoded_lanes : buffer_encoded_lanes;
+  int value_dtype_lanes = GetLanesOrVScaleFactor(value_ty);
 
   TVM_FFI_ICHECK_EQ(index_lanes * buffer_lanes, value_dtype_lanes)
       << "Cannot store value with " << value_dtype_lanes << ", expected value with "
@@ -424,31 +437,33 @@ BufferStore::BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> ind
       << " buffer element lanes)";
 
   if (predicate.defined()) {
-    DataType predicate_dtype = predicate.value().dtype();
-    int predicate_dtype_lanes = predicate_dtype.get_lanes_or_vscale_factor();
+    PrimType predicate_ty = predicate.value().ty();
+    int predicate_dtype_lanes = GetLanesOrVScaleFactor(predicate_ty);
     TVM_FFI_ICHECK_EQ(value_dtype_lanes, predicate_dtype_lanes)
         << "Got a predicate mask with " << predicate_dtype_lanes
         << " lanes, but trying to store a value with " << value_dtype_lanes
         << " lanes. The number of lanes must match.";
 
-    DataType predicate_element_dtype = predicate_dtype.element_of();
-    TVM_FFI_ICHECK(predicate_element_dtype.is_predicate_dtype())
-        << "Predicate mask elements must be boolean values, but got " << predicate_element_dtype
+    PrimType predicate_element_ty = predicate_ty.WithLanes(1);
+    TVM_FFI_ICHECK(predicate_element_ty.MatchesCode(DLDataTypeCode::kDLBool) ||
+                   predicate_element_ty.MatchesElementType(DLDataTypeCode::kDLUInt, 1))
+        << "Predicate mask elements must be boolean values, but got " << predicate_element_ty
         << ".";
   }
 
-  runtime::DataType buffer_dtype;
+  PrimType buffer_dtype = PrimType::Void();
   if (is_index_scalable || is_buffer_dtype_scalable) {
-    buffer_dtype = buffer->dtype.with_scalable_vscale_factor(buffer_lanes * index_lanes);
+    buffer_dtype = PrimType::ScalableVector(buffer->dtype.code(), buffer->dtype.bits(),
+                                            buffer_lanes * index_lanes);
   } else {
-    buffer_dtype = buffer->dtype.with_lanes(buffer_lanes * index_lanes);
+    buffer_dtype = buffer->dtype.WithLanes(buffer_lanes * index_lanes);
   }
-  if (buffer_dtype != value.dtype()) {
+  if (buffer_dtype != value_ty) {
     TVM_FFI_THROW(TypeError) << "dtype mismatch on BufferStore: "                 //
                              << "buffer's dtype is `" << buffer->dtype            //
                              << "`, the lanes of indexing are: `" << index_lanes  //
-                             << "`, the scalability is: `" << buffer_dtype.is_scalable_vector()
-                             << "`, but RHS's dtype is `" << value.dtype() << "`";
+                             << "`, the scalability is: `" << buffer_dtype.IsScalableVector()
+                             << "`, but RHS's dtype is `" << value_ty << "`";
   }
 
   ffi::ObjectPtr<BufferStoreNode> node = ffi::make_object<BufferStoreNode>();
@@ -478,7 +493,7 @@ PrimExpr BufferRegionNode::ToPrimExpr() const {
     if (tvm::tirx::is_one(r->extent)) {
       indices.push_back(r->min);
     } else if (r->extent.as<IntImmNode>()) {
-      indices.push_back(tirx::Ramp(r->min, tvm::tirx::MakeConst(r->min->dtype, 1), r->extent));
+      indices.push_back(tirx::Ramp(r->min, tvm::tirx::MakeConst(r->min.ty(), 1), r->extent));
     } else {
       TVM_FFI_THROW(ValueError) << "Cannot convert to BufferLoad: "
                                 << ffi::GetRef<BufferRegion>(this);
@@ -512,7 +527,7 @@ BufferRegion BufferRegion::FromPoint(Buffer buffer, ffi::Array<PrimExpr> indices
       region.push_back(
           Range::FromMinExtent(ramp_index->base, ramp_index->stride * ramp_index->lanes));
     } else {
-      region.push_back(Range::FromMinExtent(index, MakeConst(index.dtype(), 1)));
+      region.push_back(Range::FromMinExtent(index, MakeConst(index.ty(), 1)));
     }
   }
   return BufferRegion(buffer, region);
@@ -652,7 +667,8 @@ SBlockRealize::SBlockRealize(ffi::Array<PrimExpr> values, PrimExpr predicate, SB
                              Span span) {
   TVM_FFI_CHECK_EQ(block->iter_vars.size(), values.size(), ValueError)
       << "BlockRealize needs to have the same number of iter_vars and binding values";
-  TVM_FFI_CHECK(predicate.dtype().is_bool() || predicate.dtype() == DataType::UInt(1), TypeError)
+  PrimType predicate_ty = predicate.ty();
+  TVM_FFI_CHECK(predicate_ty.MatchesCode(DLDataTypeCode::kDLBool), TypeError)
       << "Expect Block.predicate to be a bool expression";
   ffi::ObjectPtr<SBlockRealizeNode> node = ffi::make_object<SBlockRealizeNode>();
   node->iter_values = std::move(values);
@@ -670,7 +686,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   });
 }
 
-PrimExpr TypeAnnotation(DataType dtype, Span span) {
+PrimExpr TypeAnnotation(PrimType dtype, Span span) {  // empty-arg call typed as dtype
   static const Op& type_annotation_op = Op::Get("tirx.type_annotation");
   return tirx::Call(dtype, type_annotation_op, {}, {}, span);
 }
diff --git a/src/tirx/ir/stmt_functor.cc b/src/tirx/ir/stmt_functor.cc
index 83864c88af8a..1fd06190e321 100644
--- a/src/tirx/ir/stmt_functor.cc
+++ b/src/tirx/ir/stmt_functor.cc
@@ -775,11 +775,11 @@ class IRSubstitute : public StmtExprMutator {
     if (ret.defined()) {
       // Allow substitution of void variables with any expression. The TVM script parser
       // uses void variables for lambda parameters (since exact types are not known yet).
-      if (!var.dtype().is_void()) {
+      if (!var.ty().IsVoid()) {
         PrimExpr ret_ex = ret.value().as_or_throw<PrimExpr>();
-        TVM_FFI_ICHECK(ret_ex.dtype() == var.dtype())
-            << "substituting " << var << ":" << var.dtype() << " -> " << ret_ex << ":"
-            << ret_ex.dtype();
+        TVM_FFI_ICHECK(ret_ex.ty()->dtype == var.ty()->dtype)
+            << "substituting " << var << ":" << var.ty()->dtype << " -> " << ret_ex << ":"
+            << ret_ex.ty()->dtype;
       }
       return ret.value();
     }
diff --git a/src/tirx/op/op.cc b/src/tirx/op/op.cc
index 21f9f601f809..e67dec179a82 100644
--- a/src/tirx/op/op.cc
+++ b/src/tirx/op/op.cc
@@ -49,6 +49,38 @@ bool IsVScaleCall(const PrimExpr& expr) {
   }
   return false;
 }
+// Raw PrimTypeNode pointer read from expr's ty field; preconditions are DCHECKed below.
+TVM_FFI_INLINE const PrimTypeNode* GetPrimTypeNode(const PrimExpr& expr) {
+  // Avoid PrimExpr::ty() ObjectRef materialization on binary operator hot paths.
+  const auto* node = expr.get();
+  TVM_FFI_DCHECK(node != nullptr);
+  TVM_FFI_DCHECK(node->BaseExprNode::ty.defined());
+  const auto* prim_ty = node->BaseExprNode::ty.as<PrimTypeNode>();
+  TVM_FFI_DCHECK(prim_ty != nullptr);
+  return prim_ty;
+}
+// True when ty's type code is exactly kDLFloat; the other float codes have their own helpers.
+bool IsFloatType(const PrimType& ty) { return ty.code() == DLDataTypeCode::kDLFloat; }
+// True when ty's type code is kDLBfloat and its bit width is 16.
+bool IsBFloat16Type(const PrimType& ty) {
+  return ty.code() == DLDataTypeCode::kDLBfloat && ty.bits() == 16;
+}
+// True when ty's type code is any of the eight kDLFloat8_* codes enumerated below.
+bool IsFloat8Type(const PrimType& ty) {
+  DLDataTypeCode code = ty.code();
+  return code == DLDataTypeCode::kDLFloat8_e3m4 || code == DLDataTypeCode::kDLFloat8_e4m3 ||
+         code == DLDataTypeCode::kDLFloat8_e4m3b11fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e4m3fn || code == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e5m2 || code == DLDataTypeCode::kDLFloat8_e5m2fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e8m0fnu;
+}
+// True when ty's type code is kDLFloat6_e2m3fn or kDLFloat6_e3m2fn.
+bool IsFloat6Type(const PrimType& ty) {
+  DLDataTypeCode code = ty.code();
+  return code == DLDataTypeCode::kDLFloat6_e2m3fn || code == DLDataTypeCode::kDLFloat6_e3m2fn;
+}
+// True when ty's type code is kDLFloat4_e2m1fn, the only float4 code tested here.
+bool IsFloat4Type(const PrimType& ty) { return ty.code() == DLDataTypeCode::kDLFloat4_e2m1fn; }
 }  // namespace
 
 // macro to register an unary op
@@ -61,16 +93,16 @@ bool IsVScaleCall(const PrimExpr& expr) {
   TVM_TIR_REGISTER_OP(OpName).set_num_inputs(2).set_attr<TCallEffectKind>( \
       "TCallEffectKind", static_cast<int64_t>(CallEffectKind::kPure))
 
-runtime::DataType GetRuntimeDataType(const Type& type) {
+DLDataType GetRuntimeDLDataType(const Type& type) {  // maps an IR Type to a raw DLPack dtype
   if (auto* n = type.as<PrimTypeNode>()) {
     return n->dtype;
   } else if (type.as<PointerTypeNode>()) {
-    return DataType::Handle();
+    return DLDataType{kDLOpaqueHandle, 64, 1};  // {code, bits, lanes}: 64-bit, 1-lane handle
   } else if (IsVoidType(type)) {
-    return DataType::Void();
+    return DLDataType{kDLOpaqueHandle, 0, 0};  // zero bits and lanes: the void encoding
   } else {
     TVM_FFI_THROW(InternalError) << "Type " << type
-                                 << " does not have a corresponding runtime::DataType";
+                                 << " does not have a corresponding runtime DLPack dtype";
   }
 }
 
@@ -94,7 +126,7 @@ Type GetType(const PrimExpr& expr) {
       TVM_FFI_ICHECK(type_annotation->op.same_as(type_annotation_op))
           << "Expected the first argument of builtin tvm_access_ptr() "
           << "to be a type annotation, but found " << type_annotation->op;
-      return PointerType(PrimType(type_annotation->dtype));
+      return PointerType(type_annotation.ty());
     }
     if (access->op.same_as(builtin::ptr_byte_offset())) {
       TVM_FFI_ICHECK_EQ(access->args.size(), 3U);
@@ -102,7 +134,7 @@ Type GetType(const PrimExpr& expr) {
       TVM_FFI_ICHECK(type_annotation->op.same_as(type_annotation_op))
           << "Expected the third argument of builtin ptr_byte_offset() "
           << "to be a type annotation, but found " << type_annotation->op;
-      return PointerType(PrimType(type_annotation->dtype));
+      return PointerType(type_annotation.ty());
     }
   }
 
@@ -113,16 +145,16 @@ Type GetType(const PrimExpr& expr) {
           << address_of->args;
       auto* address = address_of->args[0].as<BufferLoadNode>();
       if (address) {
-        return PointerType(PrimType(address->dtype));
+        return PointerType(ffi::GetRef<PrimExpr>(address).ty());
       }
 
       if (auto* var = address_of->args[0].as<VarNode>()) {
         if (auto* ptr = var->type_annotation.as<PointerTypeNode>()) {
           if (ptr->element_type.as<TensorMapTypeNode>()) {
-            return PrimType(DataType::UInt(64));
+            return PrimType::UInt(64);
           }
         }
-        return PointerType(PrimType(var->dtype));
+        return PointerType(ffi::GetRef<PrimExpr>(var).ty());
       }
 
       TVM_FFI_ICHECK(false)
@@ -130,163 +162,149 @@ Type GetType(const PrimExpr& expr) {
           << "received argument " << address_of->args[0];
     }
   }
-  // Default: return the type indicated by the dtype.
-  runtime::DataType dtype = expr.dtype();
-  return GetTypeFromRuntimeDataType(dtype);
+  return expr.ty();
 }
 
-Type GetTypeFromRuntimeDataType(const DataType& dtype) {
-  if (dtype.is_void()) {
-    return VoidType();
-  }
-  return PrimType(dtype);
-}
+Type GetTypeFromRuntimeDataType(DLDataType dtype) { return PrimType(dtype); }
 
 // LargeUIntImm
-PrimExpr LargeUIntImm(DataType t, int64_t low, int64_t high, Span span) {
-  return tirx::Call(t, tirx::builtin::large_uint_imm(),
-                    {IntImm(DataType::UInt(32), low, span), IntImm(DataType::UInt(32), high, span)},
+PrimExpr LargeUIntImm(PrimType value_ty, int64_t low, int64_t high, Span span) {
+  return tirx::Call(value_ty, tirx::builtin::large_uint_imm(),
+                    {IntImm(PrimType::UInt(32), low, span), IntImm(PrimType::UInt(32), high, span)},
                     {}, span);
 }
 
 // Q-multiplication
 PrimExpr q_multiply_shift(PrimExpr x, PrimExpr y, PrimExpr q, PrimExpr s, Span span) {
-  return tirx::Call(DataType::Int(32, x.dtype().lanes()), tirx::builtin::q_multiply_shift(),
+  return tirx::Call(PrimType::Int(32, x.ty().lanes()), tirx::builtin::q_multiply_shift(),
                     {x, y, q, s}, {}, span);
 }
 
 void BroadcastToMatchLanes(PrimExpr& op_a, PrimExpr& op_b) {  // NOLINT(*)
-  DataType dtype_a = op_a.dtype();
-  DataType dtype_b = op_b.dtype();
-
-  if (!dtype_a.is_scalable_or_fixed_length_vector() &&
-      dtype_b.is_scalable_or_fixed_length_vector()) {
-    if (dtype_b.is_scalable_vector()) {
-      op_a = tirx::Broadcast(
-          op_a, tirx::Mul(dtype_b.vscale_factor(), Call(DataType::Int(32), builtin::vscale(), {})));
+  PrimType ty_a = op_a.ty();
+  PrimType ty_b = op_b.ty();
+
+  if (!ty_a.IsScalableVector() && !ty_a.IsFixedLengthVector() &&
+      (ty_b.IsScalableVector() || ty_b.IsFixedLengthVector())) {
+    if (ty_b.IsScalableVector()) {
+      PrimType i32_ty = PrimType::Int(32);
+      op_a = tirx::Broadcast(op_a,
+                             tirx::Mul(ty_b.VScaleFactor(), Call(i32_ty, builtin::vscale(), {})));
+    } else {
+      op_a = tirx::Broadcast(op_a, ty_b.lanes());
+    }
+  }
+}
+
+PrimType PromoteBinaryOpType(PrimType lhs_ty, PrimType rhs_ty) {
+  if (lhs_ty->dtype == rhs_ty->dtype) {
+    return lhs_ty;
+  }
+
+  // Keep conversion behavior consistent with the previous DataType-based path.
+  if (IsFloatType(lhs_ty) && IsFloatType(rhs_ty)) {
+    return lhs_ty.bits() < rhs_ty.bits() ? rhs_ty : lhs_ty;
+  } else if (!IsFloatType(lhs_ty) && IsFloatType(rhs_ty)) {
+    return rhs_ty;
+  } else if (IsFloatType(lhs_ty) && !IsFloatType(rhs_ty)) {
+    return lhs_ty;
+  } else if (!IsBFloat16Type(lhs_ty) && IsBFloat16Type(rhs_ty)) {
+    return rhs_ty;
+  } else if (IsBFloat16Type(lhs_ty) && !IsBFloat16Type(rhs_ty)) {
+    return lhs_ty;
+  } else if (!IsFloat8Type(lhs_ty) && IsFloat8Type(rhs_ty)) {
+    return rhs_ty;
+  } else if (IsFloat8Type(lhs_ty) && !IsFloat8Type(rhs_ty)) {
+    return lhs_ty;
+  } else if (!IsFloat6Type(lhs_ty) && IsFloat6Type(rhs_ty)) {
+    return rhs_ty;
+  } else if (IsFloat6Type(lhs_ty) && !IsFloat6Type(rhs_ty)) {
+    return lhs_ty;
+  } else if (!IsFloat4Type(lhs_ty) && IsFloat4Type(rhs_ty)) {
+    return rhs_ty;
+  } else if (IsFloat4Type(lhs_ty) && !IsFloat4Type(rhs_ty)) {
+    return lhs_ty;
+  } else if (lhs_ty.MatchesCode(DLDataTypeCode::kDLBool) &&
+             rhs_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
+    return rhs_ty;
+  } else if (lhs_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt) &&
+             rhs_ty.MatchesCode(DLDataTypeCode::kDLBool)) {
+    return lhs_ty;
+  } else if ((lhs_ty.MatchesCode(DLDataTypeCode::kDLInt) &&
+              rhs_ty.MatchesCode(DLDataTypeCode::kDLInt)) ||
+             (lhs_ty.MatchesCode(DLDataTypeCode::kDLUInt) &&
+              rhs_ty.MatchesCode(DLDataTypeCode::kDLUInt))) {
+    return lhs_ty.bits() < rhs_ty.bits() ? rhs_ty : lhs_ty;
+  } else if ((lhs_ty.MatchesCode(DLDataTypeCode::kDLInt) &&
+              rhs_ty.MatchesCode(DLDataTypeCode::kDLUInt)) ||
+             (lhs_ty.MatchesCode(DLDataTypeCode::kDLUInt) &&
+              rhs_ty.MatchesCode(DLDataTypeCode::kDLInt))) {
+    if (lhs_ty.bits() < rhs_ty.bits()) {
+      return rhs_ty;
+    } else if (lhs_ty.bits() > rhs_ty.bits()) {
+      return lhs_ty;
     } else {
-      op_a = tirx::Broadcast(op_a, dtype_b.lanes());
+      return lhs_ty.MatchesCode(DLDataTypeCode::kDLUInt) ? lhs_ty
+                                                         : lhs_ty.WithCode(DLDataTypeCode::kDLUInt);
     }
+  } else {
+    TVM_FFI_THROW(InternalError) << "Cannot match type " << lhs_ty->dtype << " vs "
+                                 << rhs_ty->dtype;
   }
+  return lhs_ty;
 }
 
 // The public function with a quick checking path.
 void BinaryOpMatchTypes(PrimExpr& lhs, PrimExpr& rhs, Span span) {  // NOLINT(*)
   TVM_FFI_CHECK(lhs.defined(), ValueError) << "`lhs` is null in the binary operator";
   TVM_FFI_CHECK(rhs.defined(), ValueError) << "`rhs` is null in the binary operator";
-  if (lhs.dtype() == rhs.dtype()) return;
+  const PrimTypeNode* lhs_ty_node = GetPrimTypeNode(lhs);
+  const PrimTypeNode* rhs_ty_node = GetPrimTypeNode(rhs);
+  if (lhs_ty_node == rhs_ty_node || lhs_ty_node->dtype == rhs_ty_node->dtype) return;
 
   BroadcastToMatchLanes(lhs, rhs);
   BroadcastToMatchLanes(rhs, lhs);
 
-  DataType ltype = lhs.dtype();
-  DataType rtype = rhs.dtype();
+  PrimType lhs_ty = lhs.ty();
+  PrimType rhs_ty = rhs.ty();
 
-  TVM_FFI_ICHECK(ltype.is_scalable_vector() == rtype.is_scalable_vector())
+  TVM_FFI_ICHECK(lhs_ty.IsScalableVector() == rhs_ty.IsScalableVector())
       << "Can't match scalable and fixed length vectors";
 
   bool lanes_match = false;
 
-  if (ltype.is_scalable_vector()) {
-    lanes_match = ltype.vscale_factor() == rtype.vscale_factor();
+  if (lhs_ty.IsScalableVector()) {
+    lanes_match = lhs_ty.VScaleFactor() == rhs_ty.VScaleFactor();
   } else {
-    lanes_match = ltype.lanes() == rtype.lanes();
+    lanes_match = lhs_ty.lanes() == rhs_ty.lanes();
   }
 
-  TVM_FFI_ICHECK(lanes_match) << "Cannot match type " << ltype << " vs " << rtype;
-  if (lhs.dtype() == rhs.dtype()) return;
-
-  ltype = lhs.dtype();
-  rtype = rhs.dtype();
-  // We keep dtypes conversion to be relatively consistent to reduce the amount code generated by
-  // operators. This can be helpful for users to find potential type conversion problems. The
-  // following are exceptions:
-  if (ltype.is_float() && rtype.is_float()) {
-    // Given two dissimilar floats, cast the lower bit version to the higher bit version.
-    // E.g. fp16 + fp32 --> fp32 + fp32
-    if (ltype.bits() < rtype.bits()) {
-      lhs = cast(rtype, lhs);
-    } else {
-      rhs = cast(ltype, rhs);
-    }
-  } else if (!ltype.is_float() && rtype.is_float()) {
-    // Cast int->float when the other operand is a float
-    lhs = cast(rtype, lhs);
-  } else if (ltype.is_float() && !rtype.is_float()) {
-    // Cast int->float when the other operand is a float
-    rhs = cast(ltype, rhs);
-  } else if (!ltype.is_bfloat16() && rtype.is_bfloat16()) {
-    // Cast int->bfloat16 when the other operand is a bfloat16
-    lhs = cast(rtype, lhs);
-  } else if (ltype.is_bfloat16() && !rtype.is_bfloat16()) {
-    // Cast int->bfloat16 when the other operand is a bfloat16
-    rhs = cast(ltype, rhs);
-  } else if (!ltype.is_float8() && rtype.is_float8()) {
-    // Cast int->float8 for lhs when rhs is a float8
-    lhs = cast(rtype, lhs);
-  } else if (ltype.is_float8() && !rtype.is_float8()) {
-    // Cast int->float8 for rhs when lhs is a float8
-    rhs = cast(ltype, rhs);
-  } else if (!ltype.is_float6() && rtype.is_float6()) {
-    // Cast int->float6 for lhs when rhs is a float6
-    lhs = cast(rtype, lhs);
-  } else if (ltype.is_float6() && !rtype.is_float6()) {
-    // Cast int->float6 for rhs when lhs is a float6
-    rhs = cast(ltype, rhs);
-  } else if (!ltype.is_float4() && rtype.is_float4()) {
-    // Cast int->float4 for lhs when rhs is a float4
-    lhs = cast(rtype, lhs);
-  } else if (ltype.is_float4() && !rtype.is_float4()) {
-    // Cast int->float4 for rhs when lhs is a float4
-    rhs = cast(ltype, rhs);
-  } else if (ltype.is_bool() && (rtype.is_int() || rtype.is_uint())) {
-    // Cast bool to int for lhs when rhs is a int or uint
-    lhs = cast(rtype, lhs);
-  } else if ((ltype.is_int() || ltype.is_uint()) && rtype.is_bool()) {
-    // Cast bool to int for rhs when lhs is a int or uint
-    rhs = cast(ltype, rhs);
-  } else if ((ltype.is_int() && rtype.is_int()) || (ltype.is_uint() && rtype.is_uint())) {
-    // Promote int to higher bits e.g. int8 + int16 --> int16 + int16
-    if (ltype.bits() < rtype.bits()) {
-      lhs = cast(rtype, lhs);
-    } else {
-      rhs = cast(ltype, rhs);
-    }
-  } else if ((ltype.is_int() && rtype.is_uint()) || (ltype.is_uint() && rtype.is_int())) {
-    // Handle mixing signed and unsigned integers
-    if (ltype.bits() < rtype.bits()) {
-      lhs = cast(rtype, lhs);
-    } else if (ltype.bits() > rtype.bits()) {
-      rhs = cast(ltype, rhs);
-    } else {
-      // The width of signed and unsigned integers is same.
-      if (ltype.is_uint()) {
-        rhs = cast(ltype, rhs);
-      } else {
-        lhs = cast(rtype, lhs);
-      }
-    }
-  } else {
-    LOG(INFO) << lhs << " " << rhs;
-    TVM_FFI_THROW(InternalError) << "Cannot match type " << ltype << " vs " << rtype;
+  TVM_FFI_ICHECK(lanes_match) << "Cannot match type " << lhs_ty->dtype << " vs " << rhs_ty->dtype;
+
+  PrimType promoted_ty = PromoteBinaryOpType(lhs_ty, rhs_ty);
+  if (lhs_ty->dtype != promoted_ty->dtype) {
+    lhs = cast(promoted_ty, lhs, span);
+  }
+  if (rhs_ty->dtype != promoted_ty->dtype) {
+    rhs = cast(promoted_ty, rhs, span);
   }
 }
 
 PrimExpr ret(PrimExpr value, Span span) {
   TVM_FFI_ICHECK(value.defined());
-  return tirx::Call(value.dtype(), tirx::builtin::ret(), {value}, {}, span);
+  return tirx::Call(value.ty(), tirx::builtin::ret(), {value}, {}, span);
 }
 
 PrimExpr thread_return(Span span) {
-  return tirx::Call(DataType::Void(), tirx::builtin::thread_return(), {}, {}, span);
+  return tirx::Call(PrimType::Void(), tirx::builtin::thread_return(), {}, {}, span);
 }
 
 PrimExpr continue_loop(Span span) {
-  return tirx::Call(DataType::Void(), tirx::builtin::continue_loop(), {}, {}, span);
+  return tirx::Call(PrimType::Void(), tirx::builtin::continue_loop(), {}, {}, span);
 }
 
 PrimExpr break_loop(Span span) {
-  return tirx::Call(DataType::Void(), tirx::builtin::break_loop(), {}, {}, span);
+  return tirx::Call(PrimType::Void(), tirx::builtin::break_loop(), {}, {}, span);
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -299,128 +317,131 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 };
 
 // maximum and min limits
-PrimExpr max_value(const DataType& dtype, Span span) {
+PrimExpr max_value(PrimType value_ty, Span span) {
   using namespace tirx;
+  PrimType dtype = value_ty;
   TVM_FFI_ICHECK_EQ(dtype.lanes(), 1);
-  if (dtype.is_int()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     if (dtype.bits() == 64) {
-      return IntImm(dtype, std::numeric_limits<int64_t>::max(), span);
+      return IntImm(value_ty, std::numeric_limits<int64_t>::max(), span);
     } else if (dtype.bits() < 64) {
       int64_t val = 1;
       val = (val << (dtype.bits() - 1)) - 1;
-      return IntImm(dtype, val, span);
+      return IntImm(value_ty, val, span);
     }
-  } else if (dtype.is_uint()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
     if (dtype.bits() == 64) {
       return MakeConst(dtype, std::numeric_limits<uint64_t>::max(), span);
     } else if (dtype.bits() < 64) {
       uint64_t val = 1;
       val = (val << static_cast<uint64_t>(dtype.bits())) - 1;
-      return IntImm(dtype, static_cast<int64_t>(val), span);
+      return IntImm(value_ty, static_cast<int64_t>(val), span);
     }
-  } else if (dtype.is_float()) {
+  } else if (IsFloatType(dtype)) {
     if (dtype.bits() == 64) {
-      return FloatImm(dtype, std::numeric_limits<double>::max(), span);
+      return FloatImm(value_ty, std::numeric_limits<double>::max(), span);
     } else if (dtype.bits() == 32) {
-      return FloatImm(dtype, std::numeric_limits<float>::max(), span);
+      return FloatImm(value_ty, std::numeric_limits<float>::max(), span);
     } else if (dtype.bits() == 16) {
-      return FloatImm(dtype, 65504.0, span);
+      return FloatImm(value_ty, 65504.0, span);
     }
-  } else if (dtype.is_bfloat16()) {
-    return FloatImm(dtype, std::numeric_limits<float>::max(), span);
-  } else if (dtype.is_float8()) {
+  } else if (IsBFloat16Type(dtype)) {
+    return FloatImm(value_ty, std::numeric_limits<float>::max(), span);
+  } else if (IsFloat8Type(dtype)) {
     // according to https://arxiv.org/pdf/2209.05433.pdf
-    if (dtype.code() == DataType::TypeCode::kFloat8_e5m2) {
-      return FloatImm(dtype, 57344.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e5m2fnuz) {
-      return FloatImm(dtype, 57344.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3fn) {
-      return FloatImm(dtype, 448.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3fnuz ||
-               dtype.code() == DataType::TypeCode::kFloat8_e4m3) {
-      return FloatImm(dtype, 448.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3b11fnuz) {
-      return FloatImm(dtype, 30.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e3m4) {
-      return FloatImm(dtype, 31.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e8m0fnu) {
-      return FloatImm(dtype, 3.4028236692093846e+38, span);
+    if (dtype.code() == DLDataTypeCode::kDLFloat8_e5m2) {
+      return FloatImm(value_ty, 57344.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e5m2fnuz) {
+      return FloatImm(value_ty, 57344.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fn) {
+      return FloatImm(value_ty, 448.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+               dtype.code() == DLDataTypeCode::kDLFloat8_e4m3) {
+      return FloatImm(value_ty, 448.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3b11fnuz) {
+      return FloatImm(value_ty, 30.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e3m4) {
+      return FloatImm(value_ty, 31.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e8m0fnu) {
+      return FloatImm(value_ty, 3.4028236692093846e+38, span);
     }
-  } else if (dtype.is_float6()) {
-    if (dtype.code() == DataType::TypeCode::kFloat6_e2m3fn) {
-      return FloatImm(dtype, 7.5, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat6_e3m2fn) {
-      return FloatImm(dtype, 28.0, span);
+  } else if (IsFloat6Type(dtype)) {
+    if (dtype.code() == DLDataTypeCode::kDLFloat6_e2m3fn) {
+      return FloatImm(value_ty, 7.5, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat6_e3m2fn) {
+      return FloatImm(value_ty, 28.0, span);
     }
-  } else if (dtype.is_float4()) {
-    return FloatImm(dtype, 6.0, span);
+  } else if (IsFloat4Type(dtype)) {
+    return FloatImm(value_ty, 6.0, span);
   }
   TVM_FFI_THROW(InternalError) << "Cannot decide max_value for type" << dtype;
 }
 
-PrimExpr min_value(const DataType& dtype, Span span) {
+PrimExpr min_value(PrimType value_ty, Span span) {
   using namespace tirx;
+  PrimType dtype = value_ty;
   TVM_FFI_ICHECK_EQ(dtype.lanes(), 1);
-  if (dtype.is_int()) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     if (dtype.bits() == 64) {
-      return IntImm(dtype, std::numeric_limits<int64_t>::lowest(), span);
+      return IntImm(value_ty, std::numeric_limits<int64_t>::lowest(), span);
     } else if (dtype.bits() < 64) {
       int64_t val = 1;
       val = -(val << (dtype.bits() - 1));
-      return IntImm(dtype, val, span);
+      return IntImm(value_ty, val, span);
     }
-  } else if (dtype.is_uint()) {
-    return IntImm(dtype, 0, span);
-  } else if (dtype.is_float()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+    return IntImm(value_ty, 0, span);
+  } else if (IsFloatType(dtype)) {
     if (dtype.bits() == 64) {
-      return FloatImm(dtype, std::numeric_limits<double>::lowest(), span);
+      return FloatImm(value_ty, std::numeric_limits<double>::lowest(), span);
     } else if (dtype.bits() == 32) {
-      return FloatImm(dtype, std::numeric_limits<float>::lowest(), span);
+      return FloatImm(value_ty, std::numeric_limits<float>::lowest(), span);
     } else if (dtype.bits() == 16) {
-      return FloatImm(dtype, -65504.0, span);
+      return FloatImm(value_ty, -65504.0, span);
     }
-  } else if (dtype.is_bfloat16()) {
-    return FloatImm(dtype, std::numeric_limits<float>::lowest(), span);
-  } else if (dtype.is_float8()) {
+  } else if (IsBFloat16Type(dtype)) {
+    return FloatImm(value_ty, std::numeric_limits<float>::lowest(), span);
+  } else if (IsFloat8Type(dtype)) {
     // according to https://arxiv.org/pdf/2209.05433.pdf
-    if (dtype.code() == DataType::TypeCode::kFloat8_e5m2) {
-      return FloatImm(dtype, -57344.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e5m2fnuz) {
-      return FloatImm(dtype, 0.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3fn) {
-      return FloatImm(dtype, -448.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3fnuz) {
-      return FloatImm(dtype, 0.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3) {
-      return FloatImm(dtype, -448.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e4m3b11fnuz) {
-      return FloatImm(dtype, 0.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e3m4) {
-      return FloatImm(dtype, -31.0, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat8_e8m0fnu) {
-      return FloatImm(dtype, 0.0, span);
+    if (dtype.code() == DLDataTypeCode::kDLFloat8_e5m2) {
+      return FloatImm(value_ty, -57344.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e5m2fnuz) {
+      return FloatImm(value_ty, 0.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fn) {
+      return FloatImm(value_ty, -448.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3fnuz) {
+      return FloatImm(value_ty, 0.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3) {
+      return FloatImm(value_ty, -448.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e4m3b11fnuz) {
+      return FloatImm(value_ty, 0.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e3m4) {
+      return FloatImm(value_ty, -31.0, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat8_e8m0fnu) {
+      return FloatImm(value_ty, 0.0, span);
     }
-  } else if (dtype.is_float6()) {
-    if (dtype.code() == DataType::TypeCode::kFloat6_e2m3fn) {
-      return FloatImm(dtype, -7.5, span);
-    } else if (dtype.code() == DataType::TypeCode::kFloat6_e3m2fn) {
-      return FloatImm(dtype, -28.0, span);
+  } else if (IsFloat6Type(dtype)) {
+    if (dtype.code() == DLDataTypeCode::kDLFloat6_e2m3fn) {
+      return FloatImm(value_ty, -7.5, span);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat6_e3m2fn) {
+      return FloatImm(value_ty, -28.0, span);
     }
-  } else if (dtype.is_float4()) {
-    return FloatImm(dtype, -6.0, span);
+  } else if (IsFloat4Type(dtype)) {
+    return FloatImm(value_ty, -6.0, span);
   }
   TVM_FFI_THROW(InternalError) << "Cannot decide min_value for type" << dtype;
 }
 
 // infinity
-PrimExpr infinity(const DataType& dtype, Span span) {
+PrimExpr infinity(PrimType value_ty, Span span) {
   using namespace tirx;
+  PrimType dtype = value_ty;
   TVM_FFI_ICHECK_EQ(dtype.lanes(), 1);
-  if (dtype.is_float()) {
+  if (IsFloatType(dtype)) {
     if (dtype.bits() == 64) {
-      return FloatImm(dtype, std::numeric_limits<double>::infinity(), span);
+      return FloatImm(value_ty, std::numeric_limits<double>::infinity(), span);
     } else if (dtype.bits() == 32 || dtype.bits() == 16) {
-      return FloatImm(dtype, std::numeric_limits<float>::infinity(), span);
+      return FloatImm(value_ty, std::numeric_limits<float>::infinity(), span);
     }
   }
   TVM_FFI_THROW(InternalError) << "Cannot decide infinity for type " << dtype;
@@ -450,72 +471,88 @@ bool is_const_power_of_two_integer(const PrimExpr& x, int* shift) {
 }
 }  // namespace tirx
 
-PrimExpr cast(const DataType& t, PrimExpr value, Span span) {
+PrimExpr cast(PrimType t, PrimExpr value, Span span) {
   using tirx::FloatImmNode;
-  if (value.dtype() == t) return value;
+  PrimType dtype = t;
+  if (value.ty()->dtype == dtype->dtype) return value;
   // const fold IntImm as they are used in index computations
-  if (t.is_scalar()) {
+  if (dtype.IsScalar()) {
     if (const IntImmNode* op = value.as<IntImmNode>()) {
-      return MakeConst(t, op->value, op->span);
+      return MakeConst(dtype, op->value, op->span);
     } else if (const FloatImmNode* op = value.as<FloatImmNode>()) {
-      return MakeConst(t, op->value, op->span);
+      return MakeConst(dtype, op->value, op->span);
     }
-    TVM_FFI_ICHECK(!value.dtype().is_handle()) << "Can't cast a handle to other types.";
-    return tirx::Cast(t, value, span);
+    TVM_FFI_ICHECK(!value.ty().IsHandle()) << "Can't cast a handle to other types.";
+    return tirx::Cast(std::move(t), value, span);
   } else {
-    DataType vtype = t.element_of();
-    if (!value.dtype().is_scalable_or_fixed_length_vector()) {
+    PrimType elem_ty = dtype.WithLanes(1);
+    if (!value.ty().IsScalableVector() && !value.ty().IsFixedLengthVector()) {
       // manually unroll cast
-      if (value.dtype() != vtype) {
+      if (value.ty()->dtype != elem_ty->dtype) {
         if (const IntImmNode* op = value.as<IntImmNode>()) {
-          value = MakeConst(vtype, op->value, op->span);
+          value = MakeConst(elem_ty, op->value, op->span);
         } else if (const FloatImmNode* op = value.as<FloatImmNode>()) {
-          value = MakeConst(vtype, op->value, op->span);
+          value = MakeConst(elem_ty, op->value, op->span);
         } else {
-          value = tirx::Cast(vtype, value, span);
+          value = tirx::Cast(elem_ty, value, span);
         }
       }
-      if (t.is_scalable_vector()) {
+      if (dtype.IsScalableVector()) {
         return tirx::Broadcast(
-            value, tirx::Mul(t.vscale_factor(), Call(DataType::Int(32), builtin::vscale(), {})),
+            value, tirx::Mul(dtype.VScaleFactor(), Call(PrimType::Int(32), builtin::vscale(), {})),
             span);
       } else {
-        return tirx::Broadcast(value, t.lanes(), span);
+        return tirx::Broadcast(value, dtype.lanes(), span);
       }
     } else { /* value is a vector */
-      TVM_FFI_ICHECK(value.dtype().is_scalable_vector() == t.is_scalable_vector());
+      TVM_FFI_ICHECK(value.ty().IsScalableVector() == dtype.IsScalableVector());
 
       bool lanes_match = false;
-      if (value.dtype().is_scalable_vector()) {
-        lanes_match = value.dtype().vscale_factor() == t.vscale_factor();
+      if (value.ty().IsScalableVector()) {
+        lanes_match = value.ty().VScaleFactor() == dtype.VScaleFactor();
       } else {
-        lanes_match = value.dtype().lanes() == t.lanes();
+        lanes_match = value.ty().lanes() == dtype.lanes();
       }
       TVM_FFI_ICHECK(lanes_match);
       if (const auto* broadcast = value.as<tirx::BroadcastNode>()) {
-        return tirx::Broadcast(cast(vtype, broadcast->value, span), broadcast->lanes, span);
+        return tirx::Broadcast(cast(elem_ty, broadcast->value, span), broadcast->lanes, span);
       } else if (const auto* ramp = value.as<tirx::RampNode>()) {
-        if (t.is_int() || t.is_uint()) {
+        if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
           // only cast to index data type can be folded to ramp
-          return tirx::Ramp(cast(vtype, ramp->base, span), cast(vtype, ramp->stride, span),
+          return tirx::Ramp(cast(elem_ty, ramp->base, span), cast(elem_ty, ramp->stride, span),
                             ramp->lanes, span);
         }
       }
-      return tirx::Cast(t, value, span);
+      return tirx::Cast(std::move(t), value, span);
     }
   }
 }
 
+PrimExpr cast(DLDataType t, PrimExpr value, Span span) {
+  return cast(PrimType(t), std::move(value), std::move(span));
+}
+
 // reinterpret
-PrimExpr reinterpret(const DataType& t, PrimExpr value, Span span) {
-  if (value.dtype() == t) return value;
-  if (!t.is_scalable_vector() && !value.dtype().is_scalable_vector()) {
-    TVM_FFI_ICHECK(value.dtype().bits() * value.dtype().lanes() == t.bits() * t.lanes() ||
-                   ((value.dtype().is_float4_e2m1fn() || t.is_float4_e2m1fn()) &&
-                    value.dtype().bytes() * value.dtype().lanes() == t.bytes() * t.lanes()))
-        << "Reinterpret requires size match " << t << " vs " << value.dtype();
+PrimExpr reinterpret(PrimType t, PrimExpr value, Span span) {
+  PrimType target_dtype = t;  // kept separate from `t`, which is moved into the Call below
+  PrimType value_dtype = value.ty();
+  if (value_dtype->dtype == target_dtype->dtype) return value;  // already the requested dtype
+  if (!target_dtype.IsScalableVector() && !value_dtype.IsScalableVector()) {
+    int value_bits = value_dtype.bits() * value_dtype.lanes();
+    int target_bits = target_dtype.bits() * target_dtype.lanes();
+    auto storage_bytes = [](PrimType dtype) { return (dtype.bits() + 7) / 8; };
+    TVM_FFI_ICHECK(value_bits == target_bits ||
+                   ((value_dtype.code() == DLDataTypeCode::kDLFloat4_e2m1fn ||
+                     target_dtype.code() == DLDataTypeCode::kDLFloat4_e2m1fn) &&
+                    storage_bytes(value_dtype) * value_dtype.lanes() ==
+                        storage_bytes(target_dtype) * target_dtype.lanes()))
+        << "Reinterpret requires size match " << target_dtype << " vs " << value_dtype;
   }
-  return tirx::Call(t, tirx::builtin::reinterpret(), {value}, {}, span);
+  return tirx::Call(std::move(t), tirx::builtin::reinterpret(), {value}, {}, span);
+}
+
+PrimExpr reinterpret(DLDataType t, PrimExpr value, Span span) {
+  return reinterpret(PrimType(t), std::move(value), std::move(span));
 }
 
 // operator+
@@ -535,9 +572,9 @@ PrimExpr neg(PrimExpr a, Span span) {
   using tirx::IntImmNode;
   const IntImmNode* pa = a.as<IntImmNode>();
   const FloatImmNode* fa = a.as<FloatImmNode>();
-  if (pa) return IntImm(a.dtype(), -pa->value, span);
-  if (fa) return FloatImm(a.dtype(), -fa->value, span);
-  return MakeConst(a.dtype(), 0, span) - a;
+  if (pa) return IntImm(a.ty(), -pa->value, span);
+  if (fa) return FloatImm(a.ty(), -fa->value, span);
+  return MakeConst(a.ty(), 0, span) - a;
 }
 
 PrimExpr operator-(PrimExpr a, PrimExpr b) { return sub(a, b); }
@@ -562,8 +599,8 @@ PrimExpr div(PrimExpr a, PrimExpr b, Span span) {
 }
 
 PrimExpr truncdiv(PrimExpr a, PrimExpr b, Span span) {
-  TVM_FFI_ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
-  TVM_FFI_ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
+  TVM_FFI_ICHECK(a.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << a;
+  TVM_FFI_ICHECK(b.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << b;
   return div(a, b, span);
 }
 
@@ -585,16 +622,16 @@ PrimExpr shapediv(PrimExpr a, PrimExpr b, Span span) { return ceildiv(a, b, span
 PrimExpr indexmod(PrimExpr a, PrimExpr b, Span span) { return floormod(a, b, span); }
 
 PrimExpr floordiv(PrimExpr a, PrimExpr b, Span span) {
-  TVM_FFI_ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
-  TVM_FFI_ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
+  TVM_FFI_ICHECK(a.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << a;
+  TVM_FFI_ICHECK(b.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << b;
   BinaryOpMatchTypes(a, b, span);
   if (auto ret = arith::TryConstFold<tirx::FloorDiv>(a, b)) return ret.value();
   return tirx::FloorDiv(a, b, span);
 }
 
 PrimExpr logaddexp(PrimExpr a, PrimExpr b, Span span) {
-  TVM_FFI_ICHECK(a.dtype().is_float()) << a;
-  TVM_FFI_ICHECK(b.dtype().is_float()) << b;
+  TVM_FFI_ICHECK(IsFloatType(a.ty())) << a;
+  TVM_FFI_ICHECK(IsFloatType(b.ty())) << b;
   BinaryOpMatchTypes(a, b, span);
   PrimExpr exp_sum = add(exp(a), exp(b));
   PrimExpr log_exp_sum = log(exp_sum);
@@ -602,16 +639,16 @@ PrimExpr logaddexp(PrimExpr a, PrimExpr b, Span span) {
 }
 
 PrimExpr ceildiv(PrimExpr a, PrimExpr b, Span span) {
-  TVM_FFI_ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
-  TVM_FFI_ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
+  TVM_FFI_ICHECK(a.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << a;
+  TVM_FFI_ICHECK(b.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << b;
   BinaryOpMatchTypes(a, b, span);
   if (auto ret = arith::TryConstFold<tirx::FloorDiv>(a + b - 1, b)) return ret.value();
   return tirx::FloorDiv(a + b - 1, b, span);
 }
 
 PrimExpr floormod(PrimExpr a, PrimExpr b, Span span) {
-  TVM_FFI_ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a;
-  TVM_FFI_ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b;
+  TVM_FFI_ICHECK(a.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << a;
+  TVM_FFI_ICHECK(b.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) << b;
   BinaryOpMatchTypes(a, b, span);
   if (auto ret = arith::TryConstFold<tirx::FloorMod>(a, b)) return ret.value();
   return tirx::FloorMod(a, b, span);
@@ -645,7 +682,7 @@ PrimExpr max(PrimExpr a, PrimExpr b, Span span) {
 
 // if_then_else
 PrimExpr if_then_else(PrimExpr cond, PrimExpr true_value, PrimExpr false_value, Span span) {
-  TVM_FFI_ICHECK(cond.dtype() == DataType::Bool())
+  TVM_FFI_ICHECK(cond.ty().IsScalar() && cond.ty().MatchesCode(DLDataTypeCode::kDLBool))
       << "if_then_else only accept the condition to be boolean type.";
   BinaryOpMatchTypes(true_value, false_value, span);
   if (const IntImmNode* op = cond.as<IntImmNode>()) {
@@ -656,14 +693,14 @@ PrimExpr if_then_else(PrimExpr cond, PrimExpr true_value, PrimExpr false_value,
     }
   }
 
-  return tirx::Call(true_value.dtype(), tirx::builtin::if_then_else(),
-                    {cond, true_value, false_value}, {}, span);
+  return tirx::Call(true_value.ty(), tirx::builtin::if_then_else(), {cond, true_value, false_value},
+                    {}, span);
 }
 
 // likely
 PrimExpr likely(PrimExpr cond, Span span) {
   if (is_const_int(cond)) return cond;
-  return tirx::Call(cond.dtype(), tirx::builtin::likely(), {cond}, {}, span);
+  return tirx::Call(cond.ty(), tirx::builtin::likely(), {cond}, {}, span);
 }
 
 // operator>
@@ -712,38 +749,44 @@ PrimExpr not_equal(PrimExpr a, PrimExpr b, Span span) {
 
 namespace {
 void type_check_boolean_args(const PrimExpr& arg, const char* op) {
-  TVM_FFI_ICHECK(arg.dtype().is_bool()) << "Expected boolean argument for " << op
-                                        << ", but received " << arg << " of type " << arg.dtype();
+  TVM_FFI_ICHECK(arg.ty().MatchesCode(DLDataTypeCode::kDLBool))
+      << "Expected boolean argument for " << op << ", but received " << arg << " of type "
+      << arg.ty();
 }
 void type_check_boolean_args(const PrimExpr& lhs, const PrimExpr& rhs, const char* op) {
-  TVM_FFI_ICHECK(lhs.dtype().is_bool()) << "Expected boolean argument as LHS of " << op
-                                        << ", but received " << lhs << " of type " << lhs.dtype();
-  TVM_FFI_ICHECK(rhs.dtype().is_bool()) << "Expected boolean argument as RHS of " << op
-                                        << ", but received " << rhs << " of type " << rhs.dtype();
+  TVM_FFI_ICHECK(lhs.ty().MatchesCode(DLDataTypeCode::kDLBool))
+      << "Expected boolean argument as LHS of " << op << ", but received " << lhs << " of type "
+      << lhs.ty();
+  TVM_FFI_ICHECK(rhs.ty().MatchesCode(DLDataTypeCode::kDLBool))
+      << "Expected boolean argument as RHS of " << op << ", but received " << rhs << " of type "
+      << rhs.ty();
 }
 
 void type_check_int_or_bool_args(const PrimExpr& arg, const char* op) {
-  TVM_FFI_ICHECK(arg.dtype().is_int() || arg.dtype().is_uint() || arg.dtype().is_bool())
+  TVM_FFI_ICHECK(arg.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                                      DLDataTypeCode::kDLBool))
       << "Expected integer or boolean argument for " << op << ", but received " << arg
-      << " of type " << arg.dtype();
+      << " of type " << arg.ty();
 }
 
 void type_check_integer_args(const PrimExpr& lhs, const PrimExpr& rhs, const char* op) {
-  TVM_FFI_ICHECK(lhs.dtype().is_int() || lhs.dtype().is_uint())
+  TVM_FFI_ICHECK(lhs.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))
       << "Expected integer argument as LHS of " << op << ", but received " << lhs << " of type "
-      << lhs.dtype();
-  TVM_FFI_ICHECK(rhs.dtype().is_int() || rhs.dtype().is_uint())
+      << lhs.ty();
+  TVM_FFI_ICHECK(rhs.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))
       << "Expected integer argument as RHS of " << op << ", but received " << rhs << " of type "
-      << rhs.dtype();
+      << rhs.ty();
 }
 
 void type_check_int_or_bool_args(const PrimExpr& lhs, const PrimExpr& rhs, const char* op) {
-  TVM_FFI_ICHECK(lhs.dtype().is_int() || lhs.dtype().is_uint() || lhs.dtype().is_bool())
+  TVM_FFI_ICHECK(lhs.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                                      DLDataTypeCode::kDLBool))
       << "Expected integer argument as LHS of " << op << ", but received " << lhs << " of type "
-      << lhs.dtype();
-  TVM_FFI_ICHECK(rhs.dtype().is_int() || rhs.dtype().is_uint() || rhs.dtype().is_bool())
+      << lhs.ty();
+  TVM_FFI_ICHECK(rhs.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                                      DLDataTypeCode::kDLBool))
       << "Expected integer argument as RHS of " << op << ", but received " << rhs << " of type "
-      << rhs.dtype();
+      << rhs.ty();
 }
 }  // namespace
 
@@ -776,20 +819,20 @@ PrimExpr right_shift(PrimExpr a, PrimExpr b, Span span) {
 
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pb)
-      TVM_FFI_ICHECK(pb->value >= 0 && pb->value < rtype.bits())
-          << "Shift amount must be non-negative and less than " << rtype.bits() << " for type "
-          << rtype;
+      TVM_FFI_ICHECK(pb->value >= 0 && pb->value < result_ty.bits())
+          << "Shift amount must be non-negative and less than " << result_ty.bits() << " for type "
+          << result_ty;
     if (pa && pb) {
-      return IntImm(rtype, (pa->value >> pb->value), span);
+      return IntImm(result_ty, (pa->value >> pb->value), span);
     }
     if (pb) {
       if (pb->value == 0) return a;
     }
   });
 
-  return tirx::Call(a.dtype(), tirx::builtin::shift_right(), {a, b}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::shift_right(), {a, b}, {}, span);
 }
 
 // shift left
@@ -798,17 +841,17 @@ PrimExpr left_shift(PrimExpr a, PrimExpr b, Span span) {
   type_check_integer_args(a, b, "<< operator (left shift)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
+    PrimType result_ty = a.ty();
     if (pb)
-      TVM_FFI_ICHECK(pb->value >= 0 && pb->value < rtype.bits())
-          << "Shift amount must be non-negative and less than " << rtype.bits() << " for type "
-          << rtype;
-    if (pa && pb) return IntImm(rtype, (pa->value << pb->value), span);
+      TVM_FFI_ICHECK(pb->value >= 0 && pb->value < result_ty.bits())
+          << "Shift amount must be non-negative and less than " << result_ty.bits() << " for type "
+          << result_ty;
+    if (pa && pb) return IntImm(result_ty, (pa->value << pb->value), span);
     if (pb) {
       if (pb->value == 0) return a;
     }
   });
-  return tirx::Call(a.dtype(), tirx::builtin::shift_left(), {a, b}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::shift_left(), {a, b}, {}, span);
 }
 
 // bitwise and
@@ -817,10 +860,10 @@ PrimExpr bitwise_and(PrimExpr a, PrimExpr b, Span span) {
   type_check_int_or_bool_args(a, b, "& operator (bitwise AND)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, (pa->value & pb->value), span);
+    PrimType result_ty = a.ty();
+    if (pa && pb) return IntImm(result_ty, (pa->value & pb->value), span);
   });
-  return tirx::Call(a.dtype(), tirx::builtin::bitwise_and(), {a, b}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::bitwise_and(), {a, b}, {}, span);
 }
 
 // bitwise_or
@@ -829,10 +872,10 @@ PrimExpr bitwise_or(PrimExpr a, PrimExpr b, Span span) {
   type_check_int_or_bool_args(a, b, "| operator (bitwise OR)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, (pa->value | pb->value), span);
+    PrimType result_ty = a.ty();
+    if (pa && pb) return IntImm(result_ty, (pa->value | pb->value), span);
   });
-  return tirx::Call(a.dtype(), tirx::builtin::bitwise_or(), {a, b}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::bitwise_or(), {a, b}, {}, span);
 }
 
 // bitwise_xor
@@ -841,10 +884,10 @@ PrimExpr bitwise_xor(PrimExpr a, PrimExpr b, Span span) {
   type_check_int_or_bool_args(a, b, "^ operator (bitwise XOR)");
   BinaryOpMatchTypes(a, b, span);
   TVM_INDEX_CONST_PROPAGATION({
-    const DataType& rtype = a.dtype();
-    if (pa && pb) return IntImm(rtype, (pa->value ^ pb->value), span);
+    PrimType result_ty = a.ty();
+    if (pa && pb) return IntImm(result_ty, (pa->value ^ pb->value), span);
   });
-  return tirx::Call(a.dtype(), tirx::builtin::bitwise_xor(), {a, b}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::bitwise_xor(), {a, b}, {}, span);
 }
 
 // bitwise_not
@@ -852,7 +895,7 @@ PrimExpr operator~(PrimExpr a) { return bitwise_neg(a); }
 
 PrimExpr bitwise_neg(PrimExpr a, Span span) {
   type_check_int_or_bool_args(a, "~ operator (bitwise NOT)");
-  return tirx::Call(a.dtype(), tirx::builtin::bitwise_not(), {a}, {}, span);
+  return tirx::Call(a.ty(), tirx::builtin::bitwise_not(), {a}, {}, span);
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -864,10 +907,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 // pow
 PrimExpr pow(PrimExpr x, PrimExpr y, Span span) {
   BinaryOpMatchTypes(x, y, span);
-  TVM_FFI_ICHECK(x.dtype().is_float()) << "power only applies to float";
+  TVM_FFI_ICHECK(IsFloatType(x.ty())) << "power only applies to float";
 
   // If we detect pow(x, 3), suggest using x * x * x
-  if (y.dtype().is_int()) {
+  if (y.ty().MatchesCode(DLDataTypeCode::kDLInt)) {
     using tirx::IntImmNode;
     const IntImmNode* px = y.as<IntImmNode>();
     if (px) {
@@ -878,7 +921,7 @@ PrimExpr pow(PrimExpr x, PrimExpr y, Span span) {
                "`pow(x, 2) * pow(x, 2) ...`.";
       }
     }
-  } else if (y.dtype().is_float()) {
+  } else if (IsFloatType(y.ty())) {
     using tirx::FloatImmNode;
     const FloatImmNode* fx = y.as<FloatImmNode>();
     if (fx) {
@@ -892,33 +935,33 @@ PrimExpr pow(PrimExpr x, PrimExpr y, Span span) {
   }
 
   static const Op& pow_op = Op::Get("tirx.pow");
-  return tirx::Call(x.dtype(), pow_op, {x, y}, {}, span);
+  return tirx::Call(x.ty(), pow_op, {x, y}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_BINARY_OP("pow").set_attr<TVectorizable>("TVectorizable", true);
 
 // abs
 PrimExpr abs(PrimExpr x, Span span) {
-  if (x.dtype().is_int()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt)) {
     using tirx::IntImmNode;
     const IntImmNode* px = x.as<IntImmNode>();
     if (px) {
-      return IntImm(x.dtype(), std::abs(px->value), px->span);
+      return IntImm(x.ty(), std::abs(px->value), px->span);
     }
     // MakeConst can handle both vector and scalar types.
-    return tirx::Select(x >= MakeConst(x.dtype(), 0), x, -x, span);
-  } else if (x.dtype().is_float() || x.dtype().is_bfloat()) {
+    return tirx::Select(x >= MakeConst(x.ty(), 0), x, -x, span);
+  } else if (IsFloatType(x.ty()) || IsBFloat16Type(x.ty())) {
     using tirx::FloatImmNode;
     const FloatImmNode* fx = x.as<FloatImmNode>();
     if (fx) {
-      return FloatImm(x.dtype(), std::fabs(fx->value), fx->span);
+      return FloatImm(x.ty(), std::fabs(fx->value), fx->span);
     }
     static const Op& fabs_op = Op::Get("tirx.fabs");
-    return tirx::Call(x.dtype(), fabs_op, {x}, {}, span);
-  } else if (x.dtype().is_uint()) {
+    return tirx::Call(x.ty(), fabs_op, {x}, {}, span);
+  } else if (x.ty().MatchesCode(DLDataTypeCode::kDLUInt)) {
     return x;
   } else {
-    TVM_FFI_THROW(InternalError) << "Data type " << x.dtype()
+    TVM_FFI_THROW(InternalError) << "Data type " << x.ty()
                                  << " not supported for absolute op. Skipping absolute op...";
     return x;
   }
@@ -928,39 +971,40 @@ TVM_TIR_REGISTER_PURE_UNARY_OP("fabs").set_attr<TVectorizable>("TVectorizable",
 
 // isnan
 PrimExpr isnan(PrimExpr x, Span span) {
-  DataType t = DataType::Bool(x.dtype().lanes());
-  if (x.dtype().is_int() || x.dtype().is_uint()) {
+  PrimType t = PrimType::Bool(x.ty().lanes());
+  PrimType bool_ty(t);
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     return MakeConst(t, false);
-  } else if (x.dtype().is_float()) {
+  } else if (IsFloatType(x.ty())) {
     using tirx::FloatImmNode;
     const FloatImmNode* fx = x.as<FloatImmNode>();
     if (fx) {
       return MakeConst(t, std::isnan(fx->value), fx->span);
     }
-    if (x.dtype().bits() == 16) {
+    if (x.ty().bits() == 16) {
       static const Op& isnan_op = Op::Get("tirx.isnan");
-      return tirx::Call(t, isnan_op, {cast(DataType::Float(32, t.lanes()), std::move(x), span)}, {},
-                        span);
+      PrimType f32_ty = PrimType::Float(32, t.lanes());
+      return tirx::Call(bool_ty, isnan_op, {cast(f32_ty, std::move(x), span)}, {}, span);
     } else {
       static const Op& isnan_op = Op::Get("tirx.isnan");
-      return tirx::Call(t, isnan_op, {x}, {}, span);
+      return tirx::Call(bool_ty, isnan_op, {x}, {}, span);
     }
   } else {
-    TVM_FFI_THROW(InternalError) << "Data type " << x.dtype()
+    TVM_FFI_THROW(InternalError) << "Data type " << x.ty()
                                  << " not supported for isnan op. Skipping isnan op...";
   }
 }
 
 // isinf
 PrimExpr isinf(PrimExpr x, Span span) {
-  DataType t = DataType::Bool(x.dtype().lanes());
-  if (x.dtype().is_int() || x.dtype().is_uint()) {
+  PrimType t = PrimType::Bool(x.ty().lanes());
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     return MakeConst(t, false, span);
-  } else if (x.dtype().is_float()) {
-    PrimExpr infX = infinity(x.dtype(), span);
+  } else if (IsFloatType(x.ty())) {
+    PrimExpr infX = infinity(x.ty(), span);
     return abs(x, span) == infX && !isnan(x, span);
   } else {
-    TVM_FFI_THROW(InternalError) << "Data type " << x.dtype()
+    TVM_FFI_THROW(InternalError) << "Data type " << x.ty()
                                  << " not supported for finiteness ops. Skipping it...";
   }
 }
@@ -969,57 +1013,57 @@ PrimExpr isinf(PrimExpr x, Span span) {
 PrimExpr isfinite(PrimExpr x, Span span) { return !isinf(x, span) && !isnan(x, span); }
 
 PrimExpr sum(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
-  Var x("x", source.dtype(), span), y("y", source.dtype(), span);
+  Var x("x", source.ty(), span), y("y", source.ty(), span);
   PrimExpr result = tirx::Add(x, y, span);
-  PrimExpr identity_element = MakeConst(source.dtype(), 0, span);
+  PrimExpr identity_element = MakeConst(source.ty(), 0, span);
   tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
   return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
 }
 
 PrimExpr all(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
   type_check_boolean_args(source, "tvm::all");
-  Var x("x", source.dtype(), span), y("y", source.dtype());
+  Var x("x", source.ty(), span), y("y", source.ty());
   PrimExpr result = tirx::And(x, y, span);
-  PrimExpr identity_element = MakeConst(source.dtype(), true, span);
+  PrimExpr identity_element = MakeConst(source.ty(), true, span);
   tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
   return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
 }
 
 PrimExpr any(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
   type_check_boolean_args(source, "tvm::any");
-  Var x("x", source.dtype(), span), y("y", source.dtype(), span);
+  Var x("x", source.ty(), span), y("y", source.ty(), span);
   PrimExpr result = tirx::Or(x, y, span);
-  PrimExpr identity_element = MakeConst(source.dtype(), false, span);
+  PrimExpr identity_element = MakeConst(source.ty(), false, span);
   tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
   return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
 }
 
 PrimExpr max(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
-  Var x("x", source.dtype(), span), y("y", source.dtype(), span);
+  Var x("x", source.ty(), span), y("y", source.ty(), span);
   PrimExpr result = tirx::Max(x, y, span);
-  PrimExpr identity_element = min_value(source.dtype(), span);
+  PrimExpr identity_element = min_value(source.ty(), span);
   tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
   return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
 }
 
 PrimExpr min(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
-  Var x("x", source.dtype(), span), y("y", source.dtype(), span);
+  Var x("x", source.ty(), span), y("y", source.ty(), span);
   PrimExpr result = tirx::Min(x, y, span);
-  PrimExpr identity_element = max_value(source.dtype(), span);
+  PrimExpr identity_element = max_value(source.ty(), span);
   tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
   return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
 }
 
 PrimExpr prod(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> init, Span span) {
-  if (source.dtype().is_bool()) {
+  if (source.ty().MatchesCode(DLDataTypeCode::kDLBool)) {
     // Bool product (prod) has the same truth table as logical AND.  Reuse all() to
     // avoid lowering bool prod through Mul, which LLVM codegen does not support.
     return all(source, rdom, init, span);
   } else {
     // For non-bool types, we lower prod through Mul.
-    Var x("x", source.dtype(), span), y("y", source.dtype(), span);
+    Var x("x", source.ty(), span), y("y", source.ty(), span);
     PrimExpr result = tirx::Mul(x, y, span);
-    PrimExpr identity_element = MakeConst(source.dtype(), 1, span);
+    PrimExpr identity_element = MakeConst(source.ty(), 1, span);
     tirx::CommReducer combiner = tirx::CommReducer({x}, {y}, {result}, {identity_element}, span);
     return tirx::Reduce(combiner, {source}, rdom, IntImm::Bool(true), 0, init, span);
   }
@@ -1028,82 +1072,87 @@ PrimExpr prod(PrimExpr source, ffi::Array<IterVar> rdom, ffi::Array<PrimExpr> in
 // fmod
 PrimExpr fmod(PrimExpr x, PrimExpr y, Span span) {
   BinaryOpMatchTypes(x, y, span);
-  TVM_FFI_ICHECK(x.dtype().is_float()) << "fmod only applies to float";
+  TVM_FFI_ICHECK(IsFloatType(x.ty())) << "fmod only applies to float";
   static const Op& fmod_op = Op::Get("tirx.fmod");
-  return tirx::Call(x.dtype(), fmod_op, {x, y}, {}, span);
+  return tirx::Call(x.ty(), fmod_op, {x, y}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("fmod");
 
 // floor
 PrimExpr floor(PrimExpr x, Span span) {
-  if (x.dtype().is_int() || x.dtype().is_uint() || x.dtype().is_bool()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                         DLDataTypeCode::kDLBool)) {
     return x;
   }
   using tirx::FloatImmNode;
   const FloatImmNode* fx = x.as<FloatImmNode>();
-  if (fx) return FloatImm(x.dtype(), std::floor(fx->value), fx->span);
+  if (fx) return FloatImm(x.ty(), std::floor(fx->value), fx->span);
   static const Op& floor_op = Op::Get("tirx.floor");
-  return tirx::Call(x.dtype(), floor_op, {x}, {}, span);
+  return tirx::Call(x.ty(), floor_op, {x}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("floor").set_attr<TVectorizable>("TVectorizable", true);
 
 // ceil
 PrimExpr ceil(PrimExpr x, Span span) {
-  if (x.dtype().is_int() || x.dtype().is_uint() || x.dtype().is_bool()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                         DLDataTypeCode::kDLBool)) {
     return x;
   }
   using tirx::FloatImmNode;
   const FloatImmNode* fx = x.as<FloatImmNode>();
-  if (fx) return FloatImm(x.dtype(), std::ceil(fx->value), fx->span);
+  if (fx) return FloatImm(x.ty(), std::ceil(fx->value), fx->span);
   static const Op& ceil_op = Op::Get("tirx.ceil");
-  return tirx::Call(x.dtype(), ceil_op, {x}, {}, span);
+  return tirx::Call(x.ty(), ceil_op, {x}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("ceil").set_attr<TVectorizable>("TVectorizable", true);
 
 // round
 PrimExpr round(PrimExpr x, Span span) {
-  if (x.dtype().is_int() || x.dtype().is_uint() || x.dtype().is_bool()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                         DLDataTypeCode::kDLBool)) {
     return x;
   }
   using tirx::FloatImmNode;
   const FloatImmNode* fx = x.as<FloatImmNode>();
-  if (fx) return FloatImm(x.dtype(), std::nearbyint(fx->value), fx->span);
+  if (fx) return FloatImm(x.ty(), std::nearbyint(fx->value), fx->span);
   static const Op& round_op = Op::Get("tirx.round");
-  return tirx::Call(x.dtype(), round_op, {x}, {}, span);
+  return tirx::Call(x.ty(), round_op, {x}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("round").set_attr<TVectorizable>("TVectorizable", true);
 
 // nearbyint
 PrimExpr nearbyint(PrimExpr x, Span span) {
-  if (x.dtype().is_int() || x.dtype().is_uint() || x.dtype().is_bool()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                         DLDataTypeCode::kDLBool)) {
     return x;
   }
   using tirx::FloatImmNode;
   const FloatImmNode* fx = x.as<FloatImmNode>();
-  if (fx) return FloatImm(x.dtype(), std::nearbyint(fx->value), fx->span);
+  if (fx) return FloatImm(x.ty(), std::nearbyint(fx->value), fx->span);
   static const Op& nearbyint_op = Op::Get("tirx.nearbyint");
-  return tirx::Call(x.dtype(), nearbyint_op, {x}, {}, span);
+  return tirx::Call(x.ty(), nearbyint_op, {x}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("nearbyint");
 
 // trunc
 PrimExpr trunc(PrimExpr x, Span span) {
-  if (x.dtype().is_int() || x.dtype().is_uint() || x.dtype().is_bool()) {
+  if (x.ty().MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt,
+                         DLDataTypeCode::kDLBool)) {
     return x;
   }
   using tirx::FloatImmNode;
   const FloatImmNode* fx = x.as<FloatImmNode>();
   if (fx) {
-    return FloatImm(x.dtype(), (fx->value < 0 ? std::ceil(fx->value) : std::floor(fx->value)),
+    return FloatImm(x.ty(), (fx->value < 0 ? std::ceil(fx->value) : std::floor(fx->value)),
                     fx->span);
   }
   static const Op& trunc_op = Op::Get("tirx.trunc");
-  return tirx::Call(x.dtype(), trunc_op, {x}, {}, span);
+  return tirx::Call(x.ty(), trunc_op, {x}, {}, span);
 }
 
 TVM_TIR_REGISTER_PURE_UNARY_OP("trunc").set_attr<TVectorizable>("TVectorizable", true);
@@ -1185,9 +1234,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
       .def_packed("node._const",
                   [](ffi::PackedArgs args, ffi::Any* ret) {
                     if (auto opt = args[0].try_cast<int64_t>()) {
-                      *ret = tirx::MakeConst(args[1].cast<DataType>(), *opt, args[2].cast<Span>());
+                      *ret = tirx::MakeConst(args[1].cast<PrimType>(), *opt, args[2].cast<Span>());
                     } else if (auto opt = args[0].try_cast<double>()) {
-                      *ret = tirx::MakeConst(args[1].cast<DataType>(), *opt, args[2].cast<Span>());
+                      *ret = tirx::MakeConst(args[1].cast<PrimType>(), *opt, args[2].cast<Span>());
                     } else {
                       TVM_FFI_THROW(InternalError)
                           << "First argument to tvm.tirx.const must be int, float, or bool, "
@@ -1196,9 +1245,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                     }
                   })
       .def("node.LargeUIntImm", LargeUIntImm)
-      .def("tirx.min_value", min_value)
-      .def("tirx.max_value", max_value)
-      .def("tirx.infinity", infinity)
+      .def("tirx.min_value", static_cast<PrimExpr (*)(PrimType, Span)>(&min_value))
+      .def("tirx.max_value", static_cast<PrimExpr (*)(PrimType, Span)>(&max_value))
+      .def("tirx.infinity", static_cast<PrimExpr (*)(PrimType, Span)>(&infinity))
       .def("tirx.abs", tvm::abs)
       .def("tirx.likely", tvm::likely)
       .def("tirx.isnan", tvm::isnan)
@@ -1209,8 +1258,11 @@ TVM_FFI_STATIC_INIT_BLOCK() {
       .def("tirx.round", tvm::round)
       .def("tirx.nearbyint", tvm::nearbyint)
       .def("tirx.trunc", tvm::trunc)
-      .def("tirx._cast", tvm::cast)
-      .def("tirx.reinterpret", tvm::reinterpret);
+      .def("tirx._cast",
+           [](PrimType dtype, PrimExpr value, Span span) { return tvm::cast(dtype, value, span); })
+      .def("tirx.reinterpret", [](PrimType dtype, PrimExpr value, Span span) {
+        return tvm::reinterpret(dtype, value, span);
+      });
 }
 
 // operator overloading, smarter than make
@@ -1269,24 +1321,25 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 }
 
 PrimExpr fast_erf_float_expr(PrimExpr arg, int bits) {
-  auto plus_4 = FloatImm(DataType::Float(bits), 4.f);
-  auto minus_4 = FloatImm(DataType::Float(bits), -4.f);
+  PrimType fp_ty = PrimType::Float(bits);
+  auto plus_4 = FloatImm(fp_ty, 4.f);
+  auto minus_4 = FloatImm(fp_ty, -4.f);
 
   // The monomial coefficients of the numerator polynomial (odd).
-  auto alpha_1 = FloatImm(DataType::Float(bits), -1.60960333262415e-02f);
-  auto alpha_3 = FloatImm(DataType::Float(bits), -2.95459980854025e-03f);
-  auto alpha_5 = FloatImm(DataType::Float(bits), -7.34990630326855e-04f);
-  auto alpha_7 = FloatImm(DataType::Float(bits), -5.69250639462346e-05f);
-  auto alpha_9 = FloatImm(DataType::Float(bits), -2.10102402082508e-06f);
-  auto alpha_11 = FloatImm(DataType::Float(bits), 2.77068142495902e-08f);
-  auto alpha_13 = FloatImm(DataType::Float(bits), -2.72614225801306e-10f);
+  auto alpha_1 = FloatImm(fp_ty, -1.60960333262415e-02f);
+  auto alpha_3 = FloatImm(fp_ty, -2.95459980854025e-03f);
+  auto alpha_5 = FloatImm(fp_ty, -7.34990630326855e-04f);
+  auto alpha_7 = FloatImm(fp_ty, -5.69250639462346e-05f);
+  auto alpha_9 = FloatImm(fp_ty, -2.10102402082508e-06f);
+  auto alpha_11 = FloatImm(fp_ty, 2.77068142495902e-08f);
+  auto alpha_13 = FloatImm(fp_ty, -2.72614225801306e-10f);
 
   // The monomial coefficients of the denominator polynomial (even).
-  auto beta_0 = FloatImm(DataType::Float(bits), -1.42647390514189e-02f);
-  auto beta_2 = FloatImm(DataType::Float(bits), -7.37332916720468e-03f);
-  auto beta_4 = FloatImm(DataType::Float(bits), -1.68282697438203e-03f);
-  auto beta_6 = FloatImm(DataType::Float(bits), -2.13374055278905e-04f);
-  auto beta_8 = FloatImm(DataType::Float(bits), -1.45660718464996e-05f);
+  auto beta_0 = FloatImm(fp_ty, -1.42647390514189e-02f);
+  auto beta_2 = FloatImm(fp_ty, -7.37332916720468e-03f);
+  auto beta_4 = FloatImm(fp_ty, -1.68282697438203e-03f);
+  auto beta_6 = FloatImm(fp_ty, -2.13374055278905e-04f);
+  auto beta_8 = FloatImm(fp_ty, -1.45660718464996e-05f);
 
   // clamp x
   auto x = tvm::max(tvm::min(arg, plus_4), minus_4);
@@ -1340,18 +1393,20 @@ int ExtractInt(const ffi::PackedArgs& args, int index) {
   }
 }
 
-PrimExpr PrintOpPacked(Var data, DataType dtype, bool is_string, bool is_scalar, int dim_num,
+PrimExpr PrintOpPacked(Var data, DLDataType dtype, bool is_string, bool is_scalar, int dim_num,
                        ffi::Array<PrimExpr> shape) {
+  PrimType value_ty(dtype);
+  PrimType u32_ty = PrimType::UInt(32);
   ffi::Array<PrimExpr> args;
   args.push_back(data);
   args.push_back(tirx::StringImm(ffi::DLDataTypeToString(dtype)));
   args.push_back(IntImm::Bool(is_string));
   args.push_back(IntImm::Bool(is_scalar));
-  args.push_back(IntImm(DataType::UInt(32), dim_num));
+  args.push_back(IntImm(u32_ty, dim_num));
   for (const auto& dim : shape) {
     args.push_back(dim);
   }
-  return tirx::Call(dtype, tirx::builtin::print_buffer(), args);
+  return tirx::Call(value_ty, tirx::builtin::print_buffer(), args);
 }
 
 TVM_FFI_STATIC_INIT_BLOCK() {
@@ -1359,7 +1414,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   refl::GlobalDef().def_packed("tirx.print_buffer", [](ffi::PackedArgs args, ffi::Any* ret) {
     // Expected arguments:
     // args[0]: buffer_var (Var)
-    // args[1]: dtype (DataType)
+    // args[1]: dtype (DLDataType)
     // args[2]: is_string (bool or IntImm)
     // args[3]: is_scalar (bool or IntImm)
     // args[4]: dim_num (int or IntImm)
@@ -1368,7 +1423,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
     TVM_FFI_ICHECK_GE(args.size(), 5) << "print_buffer expects at least 5 arguments";
 
     Var buffer_var = args[0].cast<Var>();
-    DataType dtype = args[1].cast<DataType>();
+    DLDataType dtype = args[1].cast<DLDataType>();
     bool is_string = ExtractBool(args, 2);
     bool is_scalar = ExtractBool(args, 3);
     int dim_num = ExtractInt(args, 4);
diff --git a/src/tirx/script/builder/ir.cc b/src/tirx/script/builder/ir.cc
index a75025a0ddd1..a732a14958b7 100644
--- a/src/tirx/script/builder/ir.cc
+++ b/src/tirx/script/builder/ir.cc
@@ -41,7 +41,7 @@ namespace tirx {
 using tvm::tirx::IterVar;
 using tvm::tirx::Layout;
 
-Buffer BufferDecl(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String buffer_name,
+Buffer BufferDecl(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String buffer_name,
                   ffi::Optional<Var> data, ffi::Optional<ffi::Array<PrimExpr>> strides,
                   ffi::Optional<PrimExpr> elem_offset, ffi::String storage_scope, int align,
                   int offset_factor, ffi::String buffer_type,
@@ -57,16 +57,16 @@ Buffer BufferDecl(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String buffer
   }
   Var buffer_data;
   if (!data.defined()) {
-    DataType storage_dtype = dtype;
-    if (storage_dtype == DataType::Bool()) {
-      storage_dtype = DataType::Int(8);
+    DLDataType storage_dtype = dtype->dtype;
+    if (storage_dtype == DLDataType{kDLBool, 8, 1}) {
+      storage_dtype = DLDataType{kDLInt, 8, 1};
     }
     buffer_data = tvm::tirx::Var(buffer_name, PointerType(PrimType(storage_dtype), storage_scope));
   } else {
     buffer_data = data.value();
   }
   if (!elem_offset.defined() && offset_factor) {
-    DataType shape_dtype = shape.empty() ? DataType::Int(32) : shape[0]->dtype;
+    PrimType shape_dtype = shape.empty() ? PrimType::Int(32) : shape[0].ty();
     elem_offset = tvm::tirx::Var("elem_offset", shape_dtype);
   }
   return Buffer(buffer_data, dtype, shape, strides.value_or(ffi::Array<PrimExpr>()),
@@ -100,7 +100,7 @@ Var Arg(ffi::String name, Var var) {
 Buffer Arg(ffi::String name, Buffer buffer) {
   PrimFuncFrame frame = FindPrimFuncFrame("T.Arg");
   details::Namer::Name(buffer, name);
-  Var handle(buffer->name + "_handle", DataType::Handle());
+  Var handle(buffer->name + "_handle", PrimType::Handle());
   frame->args.push_back(handle);
   frame->buffer_map.Set(handle, buffer);
   return buffer;
@@ -148,7 +148,7 @@ tvm::Type FuncRet(tvm::Type ret_type) {
   return ret_type;
 }
 
-Buffer MatchBuffer(ffi::ObjectRef param, ffi::Array<PrimExpr> shape, DataType dtype,
+Buffer MatchBuffer(ffi::ObjectRef param, ffi::Array<PrimExpr> shape, PrimType dtype,
                    ffi::Optional<Var> data, ffi::Array<PrimExpr> strides, PrimExpr elem_offset,
                    ffi::String storage_scope, int align, int offset_factor,
                    ffi::String buffer_type_str, ffi::Optional<ffi::Array<IntImm>> axis_separators,
@@ -367,7 +367,7 @@ void BlockAttrs(ffi::Map<ffi::String, Any> attrs) {
 }
 
 ffi::Variant<Buffer, AllocBufferFrame> SBlockAllocBuffer(
-    ffi::Array<PrimExpr> shape, DataType dtype, ffi::Optional<Var> data,
+    ffi::Array<PrimExpr> shape, PrimType dtype, ffi::Optional<Var> data,
     ffi::Array<PrimExpr> strides, PrimExpr elem_offset, ffi::String storage_scope, int align,
     int offset_factor, ffi::String buffer_type_str,
     ffi::Optional<ffi::Array<IntImm>> axis_separators, ffi::Optional<Layout> layout,
@@ -418,14 +418,17 @@ IterVar PushBlockVar(IterVar iter_var, PrimExpr binding) {
   return iter_var;
 }
 
-#define TVM_TIRX_IR_BUILDER_AXIS(Method, Kind, Name)                                          \
-  Var Method(Range dom, PrimExpr binding, DataType dtype) {                                   \
-    TVM_FFI_ICHECK(dom.defined()) << Name << " axis must have a domain";                      \
-    int bits = std::max({dom->min.dtype().bits(), dom->extent.dtype().bits(), dtype.bits()}); \
-    return PushBlockVar(IterVar(/*dom=*/dom, /*var=*/Var("", dtype.with_bits(bits)),          \
-                                /*iter_type=*/Kind, /*thread_tag=*/""),                       \
-                        binding)                                                              \
-        ->var;                                                                                \
+#define TVM_TIRX_IR_BUILDER_AXIS(Method, Kind, Name)                                      \
+  Var Method(Range dom, PrimExpr binding, PrimType dtype) {                               \
+    TVM_FFI_ICHECK(dom.defined()) << Name << " axis must have a domain";                  \
+    PrimType min_ty = dom->min.ty();                                                      \
+    PrimType extent_ty = dom->extent.ty();                                                \
+    int bits = std::max({min_ty.bits(), extent_ty.bits(), dtype.bits()});                 \
+    PrimType var_ty = dtype.WithBits(bits);                                               \
+    return PushBlockVar(IterVar(/*dom=*/dom, /*var=*/Var("", var_ty), /*iter_type=*/Kind, \
+                                /*thread_tag=*/""),                                       \
+                        binding)                                                          \
+        ->var;                                                                            \
   }
 TVM_TIRX_IR_BUILDER_AXIS(Spatial, tvm::tirx::IterVarType::kDataPar, "Spatial");
 TVM_TIRX_IR_BUILDER_AXIS(Reduce, tvm::tirx::IterVarType::kCommReduce, "Reduction");
@@ -433,7 +436,7 @@ TVM_TIRX_IR_BUILDER_AXIS(Scan, tvm::tirx::IterVarType::kOrdered, "Scan");
 TVM_TIRX_IR_BUILDER_AXIS(Opaque, tvm::tirx::IterVarType::kOpaque, "Opaque");
 #undef TVM_TIRX_IR_BUILDER_AXIS
 
-ffi::Array<Var> Remap(ffi::String kinds, ffi::Array<PrimExpr> bindings, DataType dtype) {
+ffi::Array<Var> Remap(ffi::String kinds, ffi::Array<PrimExpr> bindings, PrimType dtype) {
   using namespace tvm::tirx;
   ffi::Array<Var> results;
   TVM_FFI_ICHECK_EQ(kinds.size(), bindings.size());
@@ -462,7 +465,7 @@ ffi::Array<Var> Remap(ffi::String kinds, ffi::Array<PrimExpr> bindings, DataType
     }
     TVM_FFI_ICHECK(dom.defined()) << "TypeError: Variable is not in the loop: "
                                   << ffi::GetRef<Var>(v);
-    DataType dtype = v->dtype;
+    PrimType dtype = v->ty();
     if (c == 'S') {
       results.push_back(PushBlockVar(IterVar(/*dom=*/dom,
                                              /*var=*/Var("", dtype),
@@ -493,8 +496,10 @@ ffi::Array<Var> Remap(ffi::String kinds, ffi::Array<PrimExpr> bindings, DataType
     PrimExpr min = start;                                                                     \
     PrimExpr extent = arith::Analyzer()->Simplify(stop - start);                              \
     ffi::ObjectPtr<ForFrameNode> n = ffi::make_object<ForFrameNode>();                        \
-    int bits = std::max(min.dtype().bits(), extent.dtype().bits());                           \
-    n->vars = {Var("v", DataType(min.dtype().code(), bits, 1))};                              \
+    PrimType min_ty = min.ty();                                                               \
+    PrimType extent_ty = extent.ty();                                                         \
+    int bits = std::max(min_ty.bits(), extent_ty.bits());                                     \
+    n->vars = {Var("v", min_ty.WithBits(bits).WithLanes(1))};                                 \
     n->doms = {Range::FromMinExtent(min, extent)};                                            \
     n->steps = {step};                                                                        \
     n->f_make_for_loop = [annotations](ffi::Array<Var> vars, ffi::Array<Range> doms,          \
@@ -522,8 +527,10 @@ ForFrame ThreadBinding(PrimExpr start, PrimExpr stop, ffi::String thread,
   PrimExpr min = start;
   PrimExpr extent = arith::Analyzer()->Simplify(stop - start);
   ffi::ObjectPtr<ForFrameNode> n = ffi::make_object<ForFrameNode>();
-  int bits = std::max(min.dtype().bits(), extent.dtype().bits());
-  DataType dtype = DataType(min.dtype().code(), bits, 1);
+  PrimType min_ty = min.ty();
+  PrimType extent_ty = extent.ty();
+  int bits = std::max(min_ty.bits(), extent_ty.bits());
+  PrimType dtype = min_ty.WithBits(bits).WithLanes(1);
   n->vars = {Var("v", dtype)};
   n->doms = {Range::FromMinExtent(min, extent)};
   n->steps = {std::nullopt};
@@ -549,12 +556,12 @@ ForFrame Grid(ffi::Array<ffi::Variant<PrimExpr, ffi::Tuple<PrimExpr, PrimExpr>>>
   for (const auto& extent : extents) {
     if (auto prim_expr = extent.as<PrimExpr>()) {
       // extent is a single PrimExpr
-      DataType dtype = prim_expr.value().dtype();
+      PrimType dtype = prim_expr.value().ty();
       n->vars.push_back(Var("v", dtype));
       n->doms.push_back(Range(tvm::IntImm(dtype, 0), prim_expr.value()));
     } else if (auto tuple = extent.as<ffi::Tuple<PrimExpr, PrimExpr>>()) {
       // extent is a tuple of two PrimExpr (start, extent)
-      DataType dtype = tuple.value().get<0>().dtype();
+      PrimType dtype = tuple.value().get<0>().ty();
       n->vars.push_back(Var("v", dtype));
       n->doms.push_back(Range::FromMinExtent(tuple.value().get<0>(), tuple.value().get<1>()));
     } else {
@@ -598,7 +605,7 @@ Var Bind(PrimExpr value, ffi::Optional<Type> type_annotation, ffi::Optional<Var>
     } else if (type_annotation.defined()) {
       return Var("v", type_annotation.value());
     } else {
-      return Var("v", value.dtype());
+      return Var("v", value.ty());
     }
   }();
   AddToParent(tvm::tirx::Bind(bind_var, value));
@@ -621,7 +628,7 @@ LaunchThreadFrame LaunchThread(Var var, PrimExpr extent) {
   ffi::ObjectPtr<LaunchThreadFrameNode> n = ffi::make_object<LaunchThreadFrameNode>();
   if (!iter_var->dom.defined()) {
     const_cast<tvm::tirx::IterVarNode*>(iter_var.get())->dom =
-        Range(tvm::IntImm(extent.dtype(), 0), extent);
+        Range(tvm::IntImm(extent.ty(), 0), extent);
   } else if (!arith::Analyzer()->CanProveEqual(iter_var->dom->extent, extent)) {
     TVM_FFI_THROW(InternalError) << "ValueError: Inconsistent extents of environment thread. "
                                  << iter_var->dom->extent << " vs " << extent;
@@ -633,7 +640,7 @@ LaunchThreadFrame LaunchThread(Var var, PrimExpr extent) {
 }
 
 LaunchThreadFrame LaunchThread(ffi::String thread_tag, PrimExpr extent) {
-  return LaunchThread(EnvThread(thread_tag, extent.dtype()), extent);
+  return LaunchThread(EnvThread(thread_tag, extent.ty()), extent);  // var typed like extent
 }
 
 AttrFrame Attr(ffi::Any node, ffi::String attr_key, PrimExpr value) {
@@ -721,7 +728,7 @@ ComposeOpFrame ComposeOp(ffi::Map<ffi::String, Buffer> workspace,
   return ComposeOpFrame(n);
 }
 
-Var EnvThread(ffi::String thread_tag, DataType dtype) {
+Var EnvThread(ffi::String thread_tag, PrimType dtype) {
   IterVar iter_var(Range{nullptr}, Var("", dtype), tvm::tirx::IterVarType::kThreadIndex,
                    thread_tag);
   Var var = iter_var->var;
@@ -735,9 +742,10 @@ Var EnvThread(ffi::String thread_tag, DataType dtype) {
 
 void BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> indices,
                  ffi::Optional<PrimExpr> predicate = std::nullopt) {
-  runtime::DataType buffer_dtype = buffer->dtype;
-  bool is_index_scalable = indices.empty() ? false : indices.back().dtype().is_scalable_vector();
-  bool is_buffer_dtype_scalable = buffer_dtype.is_scalable_vector();
+  PrimType buffer_dtype = buffer->dtype;
+  PrimType index_ty = indices.empty() ? PrimType::Int(32) : indices.back().ty();
+  bool is_index_scalable = !indices.empty() && index_ty.IsScalableVector();
+  bool is_buffer_dtype_scalable = buffer_dtype.IsScalableVector();
 
   TVM_FFI_ICHECK(!(is_index_scalable && is_buffer_dtype_scalable))
       << "Index dtype and buffer dtype can't both be scalable.";
@@ -746,29 +754,30 @@ void BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> indices,
   if (indices.empty()) {
     index_lanes = 1;
   } else if (is_index_scalable) {
-    index_lanes = indices.back().dtype().vscale_factor();
+    index_lanes = index_ty.VScaleFactor();
   } else {
-    index_lanes = indices.back().dtype().lanes();
+    index_lanes = index_ty.lanes();
   }
 
-  int buffer_lanes = is_buffer_dtype_scalable ? buffer_dtype.vscale_factor() : buffer_dtype.lanes();
+  int buffer_lanes = is_buffer_dtype_scalable ? buffer_dtype.VScaleFactor() : buffer_dtype.lanes();
 
-  runtime::DataType lhs_dtype;
+  PrimType lhs_dtype = buffer_dtype;
   if (is_buffer_dtype_scalable || is_index_scalable) {
-    lhs_dtype = buffer_dtype.with_scalable_vscale_factor(buffer_lanes * index_lanes);
+    lhs_dtype = PrimType::ScalableVector(buffer_dtype.code(), buffer_dtype.bits(),
+                                         buffer_lanes * index_lanes);
   } else {
-    lhs_dtype = buffer_dtype.with_lanes(buffer_dtype.lanes() * index_lanes);
+    lhs_dtype = buffer_dtype.WithLanes(buffer_dtype.lanes() * index_lanes);
   }
 
-  runtime::DataType rhs_dtype = value->dtype;
+  PrimType rhs_dtype = value.ty();
 
   if (lhs_dtype != rhs_dtype) {
-    TVM_FFI_ICHECK(lhs_dtype.is_scalable_vector() == rhs_dtype.is_scalable_vector())
+    TVM_FFI_ICHECK(lhs_dtype.IsScalableVector() == rhs_dtype.IsScalableVector())
         << "Can't mix scalable and fixed length vectors in a statement";
 
     bool lanes_match = false;
-    if (lhs_dtype.is_scalable_vector()) {
-      lanes_match = lhs_dtype.vscale_factor() == rhs_dtype.vscale_factor();
+    if (lhs_dtype.IsScalableVector()) {
+      lanes_match = lhs_dtype.VScaleFactor() == rhs_dtype.VScaleFactor();
     } else {
       lanes_match = lhs_dtype.lanes() == rhs_dtype.lanes();
     }
@@ -781,14 +790,13 @@ void BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> indices,
     if (lhs_dtype.code() != rhs_dtype.code()) {
       if (
           // Case 1. lhs is handle, and rhs needs to be casted to handle.
-          (lhs_dtype.code() == runtime::DataType::kHandle) ||
+          (lhs_dtype.code() == DLDataTypeCode::kDLOpaqueHandle) ||
           // Case 2. rhs is handle, and it needs to be casted to non-handle.
-          (rhs_dtype.code() == runtime::DataType::kHandle) ||
+          (rhs_dtype.code() == DLDataTypeCode::kDLOpaqueHandle) ||
           // Case 3. rhs is float or bfloat, and casting to non-float can lose precision.
-          ((lhs_dtype.code() == runtime::DataType::kInt ||
-            lhs_dtype.code() == runtime::DataType::kUInt) &&
-           (rhs_dtype.code() == runtime::DataType::kFloat ||
-            rhs_dtype.code() == runtime::DataType::kBFloat))) {
+          ((lhs_dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) &&
+           (rhs_dtype.code() == DLDataTypeCode::kDLFloat ||
+            rhs_dtype.code() == DLDataTypeCode::kDLBfloat))) {
         LOG(WARNING) << "Casting in BufferStore may lose precision"
                      << ": LHS is `" << lhs_dtype << "`, RHS is `" << rhs_dtype
                      << "`, indexing lanes: " << index_lanes;
@@ -799,7 +807,7 @@ void BufferStore(Buffer buffer, PrimExpr value, ffi::Array<PrimExpr> indices,
   AddToParent(tvm::tirx::BufferStore(buffer, value, indices, predicate));
 }
 
-DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String buffer_name,
+DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String buffer_name,
                            ffi::Optional<Var> data, ffi::Optional<ffi::Array<PrimExpr>> strides,
                            ffi::Optional<PrimExpr> elem_offset, ffi::String storage_scope,
                            int align, int offset_factor, ffi::String buffer_type,
@@ -841,7 +849,7 @@ DeclBufferFrame DeclBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::Stri
   return DeclBufferFrame(n);
 }
 
-Buffer AllocBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String storage_scope,
+Buffer AllocBuffer(ffi::Array<PrimExpr> shape, PrimType dtype, ffi::String storage_scope,
                    ffi::Optional<ffi::Map<ffi::String, ffi::Any>> annotations) {
   Buffer buffer = BufferDecl(shape, dtype, "", std::nullopt, std::nullopt, std::nullopt,
                              storage_scope, 0, 0, "", std::nullopt);
@@ -852,8 +860,7 @@ Buffer AllocBuffer(ffi::Array<PrimExpr> shape, DataType dtype, ffi::String stora
 
 void Evaluate(PrimExpr value) { AddToParent(tvm::tirx::Evaluate(value)); }
 
-PrimExpr Ptr(runtime::DataType dtype, ffi::String storage_scope = "global",
-             bool is_size_var = false) {
+PrimExpr Ptr(DLDataType dtype, ffi::String storage_scope = "global", bool is_size_var = false) {
   PointerType type_annotation(PrimType(dtype), storage_scope);
   return is_size_var ? tvm::tirx::SizeVar("", type_annotation)
                      : tvm::tirx::Var("", type_annotation);
@@ -922,7 +929,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef()
       .def("script.ir_builder.tirx.Buffer",
-           static_cast<Buffer (*)(ffi::Array<PrimExpr>, DataType, ffi::String, ffi::Optional<Var>,
+           static_cast<Buffer (*)(ffi::Array<PrimExpr>, PrimType, ffi::String, ffi::Optional<Var>,
                                   ffi::Optional<ffi::Array<PrimExpr>>, ffi::Optional<PrimExpr>,
                                   ffi::String, int, int, ffi::String,
                                   ffi::Optional<ffi::Array<IntImm>>, ffi::Optional<Layout>,
diff --git a/src/tirx/script/builder/utils.h b/src/tirx/script/builder/utils.h
index 4d7821a84d5a..a8cd6e5b496b 100644
--- a/src/tirx/script/builder/utils.h
+++ b/src/tirx/script/builder/utils.h
@@ -129,7 +129,7 @@ inline IfFrame FindIfFrame(const ffi::String& method) {
 inline tvm::tirx::BufferRegion BufferRegionFromLoad(tvm::tirx::BufferLoad buffer_load) {
   ffi::Array<Range> ranges;
   for (const PrimExpr& index : buffer_load->indices) {
-    ranges.push_back(Range::FromMinExtent(index, IntImm(index->dtype, 1)));
+    ranges.push_back(Range::FromMinExtent(index, IntImm(index.ty(), 1)));  // one-element range
   }
   return tvm::tirx::BufferRegion(buffer_load->buffer, ranges);
 }
diff --git a/src/tirx/script/printer/block.cc b/src/tirx/script/printer/block.cc
index 6d7902a4a89f..71fc0b2e7ecb 100644
--- a/src/tirx/script/printer/block.cc
+++ b/src/tirx/script/printer/block.cc
@@ -149,7 +149,9 @@ Doc PrintBlock(IRDocsifier d, tirx::SBlock block, AccessPath block_p,  //
 
   // Step 2. Handle block predicate
   if (realize) {
-    TVM_FFI_ICHECK(realize->predicate.defined() && realize->predicate->dtype.is_bool());
+    // Check defined() first; ty() must not be queried on an undefined predicate.
+    TVM_FFI_ICHECK(realize->predicate.defined() &&
+                   realize->predicate.ty().MatchesCode(DLDataTypeCode::kDLBool));
     if (!tirx::is_one(realize->predicate)) {
       (*frame)->stmts.push_back(ExprStmtDoc(
           TIR(d, "where")
diff --git a/src/tirx/script/printer/buffer.cc b/src/tirx/script/printer/buffer.cc
index 6dd24e6b9a3c..015c3685817e 100644
--- a/src/tirx/script/printer/buffer.cc
+++ b/src/tirx/script/printer/buffer.cc
@@ -93,9 +93,9 @@ ffi::Map<ffi::String, ExprDoc> BufferAttrs(tirx::Buffer buffer, const AccessPath
   }
   // Step 2. Handle `buffer.dtype`
   {
-    DataType default_buf_dtype = d->cfg->buffer_dtype;
-    if (buffer->dtype != default_buf_dtype) {
-      kwargs.Set("dtype", LiteralDoc::DataType(buffer->dtype, buffer_p->Attr("dtype")));
+    DLDataType default_buf_dtype = d->cfg->buffer_dtype;
+    if (buffer->dtype->dtype != default_buf_dtype) {
+      kwargs.Set("dtype", LiteralDoc::DataType(buffer->dtype->dtype, buffer_p->Attr("dtype")));
     }
   }
   // Step 3. Handle `buffer.data`
@@ -145,7 +145,7 @@ ffi::Map<ffi::String, ExprDoc> BufferAttrs(tirx::Buffer buffer, const AccessPath
   // Step 5. Handle `buffer.elem_offset`
   bool needs_print_factor = false;
   if (const auto* int_imm = buffer->elem_offset.as<IntImmNode>()) {
-    if (int_imm->value != 0 || int_imm->dtype != buffer->DefaultIndexType()) {
+    if (int_imm->value != 0 || int_imm->ty()->dtype != buffer->DefaultIndexType()) {
       kwargs.Set("elem_offset",
                  d->AsDoc<ExprDoc>(buffer->elem_offset,  //
                                    buffer_p->Attr("elem_offset")));
@@ -329,7 +329,7 @@ ExprDoc BufferAttn(const tirx::Buffer& buffer, const AccessPath& p, const Frame&
       BufferAttrs(buffer, p, frame, d, BufferVarDefinition::DataPointer);
   ExprDoc shape = attrs.Get("shape").value();
   ExprDoc dtype =
-      attrs.Get("dtype").value_or(LiteralDoc::DataType(buffer->dtype, p->Attr("dtype")));
+      attrs.Get("dtype").value_or(LiteralDoc::DataType(buffer->dtype->dtype, p->Attr("dtype")));
   return TIR(d, "Buffer")->Call({shape, dtype}, {}, {});
 }
 
diff --git a/src/tirx/script/printer/expr.cc b/src/tirx/script/printer/expr.cc
index 32a6251d54d3..1d2168b13a03 100644
--- a/src/tirx/script/printer/expr.cc
+++ b/src/tirx/script/printer/expr.cc
@@ -54,7 +54,7 @@ ExprDoc PrintVarCreation(const tirx::Var& var, const AccessPath& var_p, const IR
       rhs = TIR(d, "TensorMap")->Call({}, {}, {});
     }
   } else {
-    rhs = TIR(d, DType2Str(var->dtype));
+    rhs = TIR(d, DType2Str(var.ty()->dtype));
     rhs->source_paths.push_back(var_p->Attr("dtype"));
     rhs = rhs->Call({}, kwargs_keys, kwargs_values);
   }
@@ -121,7 +121,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tirx::Cast>("", [](tirx::Cast cast, AccessPath p, IRDocsifier d) -> Doc {
-      ExprDoc dtype = LiteralDoc::DataType(cast->dtype, p->Attr("dtype"));
+      ExprDoc dtype = LiteralDoc::DataType(cast.ty()->dtype, p->Attr("dtype"));
       ExprDoc value = d->AsDoc<ExprDoc>(cast->value, p->Attr("value"));
       return TIR(d, "Cast")->Call({dtype, value});
     });
@@ -258,6 +258,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<tirx::Call>("", [](tirx::Call call, AccessPath call_p, IRDocsifier d) -> Doc {
+      DLDataType call_dtype = call.ty()->dtype;
       if (call->attrs.defined()) {
         ffi::Array<ExprDoc> call_args;
         int n_args = call->args.size();
@@ -269,7 +270,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
                              ? LiteralDoc::Str(call->op.as<Op>().value()->name, call_p->Attr("op"))
                              : d->AsDoc<ExprDoc>(call->op, call_p->Attr("op"));
         return TIR(d, "Call")->Call(
-            {LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")), op_doc, ListDoc(call_args)},
+            {LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")), op_doc, ListDoc(call_args)},
             {"attrs"}, {d->AsDoc<ExprDoc>(call->attrs, call_p->Attr("attrs"))});
       }
       static const OpAttrMap<tirx::TScriptPrinterName>& op_names =
@@ -297,7 +298,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
           ffi::Array<ExprDoc> args;
           args.reserve(n_args + 1);
           if (dtype_print_location == tirx::ScriptDtypePrintLocation::kFirst) {
-            args.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")));
+            args.push_back(LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")));
           }
 
           for (int i = 0; i < n_args; ++i) {
@@ -309,7 +310,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
             }
           }
           if (dtype_print_location == tirx::ScriptDtypePrintLocation::kLast) {
-            args.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")));
+            args.push_back(LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")));
           }
           return prefix.value()->Call(args);
         }
@@ -334,9 +335,9 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
           kw_keys.push_back("source_code");
           kw_vals.push_back(src);
           // If non-void return type, print return_type keyword.
-          if (call->dtype != DataType::Void()) {
+          if (!call.ty().IsVoid()) {
             kw_keys.push_back("return_type");
-            kw_vals.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")));
+            kw_vals.push_back(LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")));
           }
           return prefix.value()->Call(args, kw_keys, kw_vals);
         }
@@ -349,14 +350,14 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
       int n_args = call->args.size();
       args.reserve(n_args + 1);
       if (dtype_print_location == tirx::ScriptDtypePrintLocation::kFirst) {
-        args.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")));
+        args.push_back(LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")));
       }
 
       for (int i = 0; i < n_args; ++i) {
         args.push_back(d->AsDoc<ExprDoc>(call->args[i], call_p->Attr("args")->ArrayItem(i)));
       }
       if (dtype_print_location == tirx::ScriptDtypePrintLocation::kLast) {
-        args.push_back(LiteralDoc::DataType(call->dtype, call_p->Attr("dtype")));
+        args.push_back(LiteralDoc::DataType(call_dtype, call_p->Attr("dtype")));
       }
       return prefix.value()->Call(args);
     });
@@ -391,8 +392,10 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
       if (!ret->IsInstance<tirx::DivNode>()) {
         return TIR(d, "Div")->Call({a, b});
       }
-      if ((node->a->dtype.is_int() || node->a->dtype.is_uint()) &&
-          (node->b->dtype.is_int() || node->b->dtype.is_uint())) {
+      // Integer (signed or unsigned) operands on both sides keep the explicit Div call form.
+      using Code = DLDataTypeCode;
+      if (node->a.ty().MatchesCode(Code::kDLInt, Code::kDLUInt) &&
+          node->b.ty().MatchesCode(Code::kDLInt, Code::kDLUInt)) {
         return TIR(d, "Div")->Call({a, b});
       }
       return OperationDoc(OperationDocNode::Kind::kDiv, {a, b});
diff --git a/src/tirx/script/printer/for_loop.cc b/src/tirx/script/printer/for_loop.cc
index 249e151b9774..a1edcb8fe5e7 100644
--- a/src/tirx/script/printer/for_loop.cc
+++ b/src/tirx/script/printer/for_loop.cc
@@ -34,8 +34,8 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
       };
       if (d->cfg->syntax_sugar) {
         for (const tirx::ForNode* l = loop.get(); l != nullptr; l = l->body.as<tirx::ForNode>()) {
-          TVM_FFI_ICHECK(l->loop_var->dtype == l->min->dtype);
-          TVM_FFI_ICHECK(l->loop_var->dtype == l->extent->dtype);
+          TVM_FFI_ICHECK(l->loop_var.ty()->dtype == l->min.ty()->dtype);
+          TVM_FFI_ICHECK(l->loop_var.ty()->dtype == l->extent.ty()->dtype);
           if (l->kind != tirx::ForKind::kSerial ||  //
               !tirx::is_zero(l->min) ||             //
               !l->annotations.empty() ||            //
diff --git a/src/tirx/script/printer/ir.cc b/src/tirx/script/printer/ir.cc
index d7817da8269d..d5d399a33d01 100644
--- a/src/tirx/script/printer/ir.cc
+++ b/src/tirx/script/printer/ir.cc
@@ -28,10 +28,10 @@ TVM_FFI_STATIC_INIT_BLOCK() { TIRFrameNode::RegisterReflection(); }
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<IntImm>("", [](IntImm imm, AccessPath imm_p, IRDocsifier d) -> Doc {
-      DataType dtype = imm->dtype;
+      DLDataType dtype = imm->ty()->dtype;
       if (dtype == d->cfg->int_dtype) {
         return LiteralDoc::Int(imm->value, imm_p->Attr("value"));
-      } else if (dtype == DataType::Bool()) {
+      } else if (dtype == DLDataType{kDLBool, 8, 1}) {
         return TIR(d, DType2Str(dtype))
             ->Call({LiteralDoc::Boolean(imm->value, imm_p->Attr("value"))});
       } else {
@@ -41,7 +41,7 @@ TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
 
 TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable)
     .set_dispatch<FloatImm>("", [](FloatImm imm, AccessPath imm_p, IRDocsifier d) -> Doc {
-      DataType dtype = imm->dtype;
+      DLDataType dtype = imm->ty()->dtype;
       if (dtype == d->cfg->float_dtype) {
         return LiteralDoc::Float(imm->value, imm_p->Attr("value"));
       } else {
diff --git a/src/tirx/script/printer/stmt.cc b/src/tirx/script/printer/stmt.cc
index 0aa697175355..392b1c7c86da 100644
--- a/src/tirx/script/printer/stmt.cc
+++ b/src/tirx/script/printer/stmt.cc
@@ -502,7 +502,7 @@ ffi::Optional<ExprDoc> TryDeclBufferSugarWithParent(const tirx::Buffer& child, c
     }
     if (shapes_compatible) {
       ExprDoc dtype_doc =
-          LiteralDoc::Str(DType2Str(child->dtype), p->Attr("buffer")->Attr("dtype"));
+          LiteralDoc::Str(DType2Str(child->dtype->dtype), p->Attr("buffer")->Attr("dtype"));
       return pdoc->Attr("view")->Call({dtype_doc});
     }
   }
@@ -723,7 +723,7 @@ Doc AllocBufferDoc(tirx::AllocBuffer stmt, AccessPath p, IRDocsifier d) {
       d->Define(stmt->buffer->data, d->frames.back(),
                 [d, buf, p]() { return d->AsDoc<ExprDoc>(buf, p->Attr("buffer"))->Attr("data"); });
     }
-    ExprDoc type_ann = TIR(d, DType2Str(stmt->buffer->dtype));
+    ExprDoc type_ann = TIR(d, DType2Str(stmt->buffer->dtype->dtype));
     return AssignDoc(lhs, std::nullopt, type_ann);
   }
   ExprDoc rhs = BufferDecl(stmt->buffer, "alloc_buffer", {}, p->Attr("buffer"), d->frames.back(), d,
@@ -814,7 +814,7 @@ ExprDoc DocsifyLaunchThread(const tirx::AttrStmt& attr_stmt, const AccessPath& a
 /*! \brief Check whether an AttrStmt has node=IntImm(int32, 0) (the dict-attr pattern). */
 static bool IsDictAttrPattern(const tirx::AttrStmt& stmt) {
   if (auto int_imm = stmt->node.as<IntImmNode>()) {
-    return int_imm->dtype == DataType::Int(32) && int_imm->value == 0;
+    return int_imm->ty()->dtype == DLDataType{kDLInt, 32, 1} && int_imm->value == 0;
   }
   return false;
 }
diff --git a/src/tirx/transform/common_subexpr_elim.cc b/src/tirx/transform/common_subexpr_elim.cc
index 2221df935226..8bca3931cb10 100644
--- a/src/tirx/transform/common_subexpr_elim.cc
+++ b/src/tirx/transform/common_subexpr_elim.cc
@@ -296,7 +296,8 @@ class CSEPlanner : public StmtExprVisitor {
     // the predicate directly. BoolImm is already filtered above as an IntImm
     // leaf, so this rule only affects compound bool expressions
     // (LT/LE/GT/GE/EQ/NE/And/Or/Not/Cast-to-bool/Select-of-bool).
-    if (expr.dtype().is_bool()) return false;
+    PrimType expr_ty = expr.ty();
+    if (expr_ty.MatchesCode(DLDataTypeCode::kDLBool)) return false;
     if (CheckContains::ExprContains(expr, IsForbiddenNode)) return false;
     return true;
   }
@@ -662,7 +663,7 @@ class CSEPlanner : public StmtExprVisitor {
       // entry->repr may already contain CSE vars from shallower entries.
       ++counter;
       std::string name = "cse_v" + std::to_string(counter);
-      Var cse_var(name, entry->repr.dtype());
+      Var cse_var(name, entry->repr.ty());
       Stmt bind = Bind(cse_var, entry->repr);
 
       // Step 3c: Record in output tables.
diff --git a/src/tirx/transform/dtype_conversion.cc b/src/tirx/transform/dtype_conversion.cc
index 7cf1593d822b..08d70dab2b33 100644
--- a/src/tirx/transform/dtype_conversion.cc
+++ b/src/tirx/transform/dtype_conversion.cc
@@ -27,30 +27,41 @@ namespace tvm {
 namespace tirx {
 
 PrimExpr ReinterpretAsUInt(PrimExpr value) {
-  return reinterpret(GetStorageUIntDType(value.dtype()), value);
+  return reinterpret(GetStorageUIntDType(value.ty()), value);  // uint type of equal bit-width
 }
 
-DataType GetStorageUIntDType(DataType dtype) { return DataType::UInt(dtype.bits(), dtype.lanes()); }
+PrimType GetStorageUIntDType(PrimType dtype) {
+  // Same bit-width as `dtype`; scalable vectors keep their vscale factor, others their lanes.
+  return dtype.IsScalableVector()
+             ? PrimType::ScalableVector(DLDataTypeCode::kDLUInt, dtype.bits(), dtype.VScaleFactor())
+             : PrimType::UInt(dtype.bits(), dtype.lanes());
+}
 
-PrimExpr DTypeConversion(PrimExpr src_value, DataType tgt_dtype, RoundingMode round_mode) {
-  DataType src_dtype = src_value.dtype();
+PrimExpr DTypeConversion(PrimExpr src_value, PrimType tgt_dtype, RoundingMode round_mode) {
+  PrimType src_dtype = src_value.ty();
   // Step 1: check dtype
   // The lanes of src dtype and target dtype must match.
-  TVM_FFI_ICHECK_EQ(src_dtype.lanes(), tgt_dtype.lanes())
+  TVM_FFI_ICHECK_EQ(src_dtype->dtype.lanes, tgt_dtype->dtype.lanes)
       << "The lanes for data type for source value must matches the target datatype.";
-  auto is_floating_point = [](DataType dtype) {
-    return dtype.is_float() || dtype.is_bfloat16() || dtype.is_float8() || dtype.is_float6() ||
-           dtype.is_float4();
+  auto is_floating_point = [](PrimType dtype) {
+    DLDataTypeCode code = dtype.code();
+    return code == DLDataTypeCode::kDLFloat ||
+           (code == DLDataTypeCode::kDLBfloat && dtype.bits() == 16) ||
+           code == DLDataTypeCode::kDLFloat8_e3m4 || code == DLDataTypeCode::kDLFloat8_e4m3 ||
+           code == DLDataTypeCode::kDLFloat8_e4m3b11fnuz ||
+           code == DLDataTypeCode::kDLFloat8_e4m3fn || code == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+           code == DLDataTypeCode::kDLFloat8_e5m2 || code == DLDataTypeCode::kDLFloat8_e5m2fnuz ||
+           code == DLDataTypeCode::kDLFloat8_e8m0fnu || code == DLDataTypeCode::kDLFloat6_e2m3fn ||
+           code == DLDataTypeCode::kDLFloat6_e3m2fn || code == DLDataTypeCode::kDLFloat4_e2m1fn;
   };
   // Both source dtype and target dtype should be floating point.
   TVM_FFI_ICHECK(is_floating_point(src_dtype) && is_floating_point(tgt_dtype));
-  FloatConfig src_fp = FloatConfig::FromDataType(src_value.dtype()),
+  FloatConfig src_fp = FloatConfig::FromDataType(src_dtype),
               tgt_fp = FloatConfig::FromDataType(tgt_dtype);
   int exponent_delta = tgt_fp.exponent - src_fp.exponent;
   int bias_delta = tgt_fp.bias - src_fp.bias;
   int mantissa_delta = tgt_fp.mantissa - src_fp.mantissa;
-  DataType src_uint = GetStorageUIntDType(src_value.dtype()),
-           tgt_uint = GetStorageUIntDType(tgt_dtype);
+  PrimType src_uint = GetStorageUIntDType(src_dtype), tgt_uint = GetStorageUIntDType(tgt_dtype);
   PrimExpr src_uint_value = ReinterpretAsUInt(src_value);
   if (mantissa_delta < 0) {
     // use rounding
diff --git a/src/tirx/transform/dtype_conversion.h b/src/tirx/transform/dtype_conversion.h
index 21bd5bf355bd..d6026cf75fe6 100644
--- a/src/tirx/transform/dtype_conversion.h
+++ b/src/tirx/transform/dtype_conversion.h
@@ -98,12 +98,20 @@ class FloatConfig {
    * \param dtype The data type, must be a floating point.
    * \return The FloatConfig class containing internal floating point representation.
    */
-  static FloatConfig FromDataType(DataType dtype) {
-    TVM_FFI_ICHECK(dtype.is_float() || dtype.is_bfloat16() || dtype.is_float8() ||
-                   dtype.is_float6() || dtype.is_float4())
+  static FloatConfig FromDataType(PrimType dtype) {
+    DLDataTypeCode code = dtype.code();
+    TVM_FFI_ICHECK(
+        code == DLDataTypeCode::kDLFloat ||
+        (code == DLDataTypeCode::kDLBfloat && dtype.bits() == 16) ||
+        code == DLDataTypeCode::kDLFloat8_e3m4 || code == DLDataTypeCode::kDLFloat8_e4m3 ||
+        code == DLDataTypeCode::kDLFloat8_e4m3b11fnuz || code == DLDataTypeCode::kDLFloat8_e4m3fn ||
+        code == DLDataTypeCode::kDLFloat8_e4m3fnuz || code == DLDataTypeCode::kDLFloat8_e5m2 ||
+        code == DLDataTypeCode::kDLFloat8_e5m2fnuz || code == DLDataTypeCode::kDLFloat8_e8m0fnu ||
+        code == DLDataTypeCode::kDLFloat6_e2m3fn || code == DLDataTypeCode::kDLFloat6_e3m2fn ||
+        code == DLDataTypeCode::kDLFloat4_e2m1fn)
         << "FloatConfig is only applicable to floating point data types, got " << dtype
         << " instead.";
-    if (dtype.is_float()) {
+    if (code == DLDataTypeCode::kDLFloat) {
       // IEEE 754 binary formats
       // Reference: https://en.wikipedia.org/wiki/Floating-point_arithmetic
       switch (dtype.bits()) {
@@ -115,46 +123,53 @@ class FloatConfig {
           // float64
           return FloatConfig(11, 52, 1023, InftyStyle::kIEEE, NaNStyle::kIEEE);
       }
-    } else if (dtype.is_bfloat16()) {
+    } else if (code == DLDataTypeCode::kDLBfloat && dtype.bits() == 16) {
       // bfloat16,
       return FloatConfig(8, 7, 127, InftyStyle::kIEEE, NaNStyle::kIEEE);
-    } else if (dtype.is_float8()) {  // float8
+    } else if (code == DLDataTypeCode::kDLFloat8_e3m4 || code == DLDataTypeCode::kDLFloat8_e4m3 ||
+               code == DLDataTypeCode::kDLFloat8_e4m3b11fnuz ||
+               code == DLDataTypeCode::kDLFloat8_e4m3fn ||
+               code == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+               code == DLDataTypeCode::kDLFloat8_e5m2 ||
+               code == DLDataTypeCode::kDLFloat8_e5m2fnuz ||
+               code == DLDataTypeCode::kDLFloat8_e8m0fnu) {  // float8
       // NVIDIA/Arm/Intel's FP8 formats for Deep Learning
       // Reference: https://arxiv.org/abs/2209.05433
-      switch (dtype.code()) {
-        case DataType::kFloat8_e3m4:
+      switch (code) {
+        case DLDataTypeCode::kDLFloat8_e3m4:
           // E3M4 format, not consistent with IEEE-754
           return FloatConfig(3, 4, 3, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e4m3:
+        case DLDataTypeCode::kDLFloat8_e4m3:
           // E4M3 format, not consistent with IEEE-754
           return FloatConfig(4, 3, 7, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e4m3b11fnuz:
+        case DLDataTypeCode::kDLFloat8_e4m3b11fnuz:
           // E4M3 variant with b11 encoding, not consistent with IEEE-754
           return FloatConfig(4, 3, 7, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e4m3fn:
+        case DLDataTypeCode::kDLFloat8_e4m3fn:
           // E4M3 format, not consistent with IEEE-754
           return FloatConfig(4, 3, 7, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e4m3fnuz:
+        case DLDataTypeCode::kDLFloat8_e4m3fnuz:
           // UE4M3 format, not consistent with IEEE-754
           return FloatConfig(4, 3, 7, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e5m2:
+        case DLDataTypeCode::kDLFloat8_e5m2:
           // UE5M2 format, consistent with IEEE-754
           return FloatConfig(5, 2, 15, InftyStyle::kIEEE, NaNStyle::kIEEE);
-        case DataType::kFloat8_e5m2fnuz:
+        case DLDataTypeCode::kDLFloat8_e5m2fnuz:
           // UE5M2 format, not consistent with IEEE-754
           return FloatConfig(5, 2, 15, InftyStyle::kNone, NaNStyle::kAllOnes);
-        case DataType::kFloat8_e8m0fnu:
+        case DLDataTypeCode::kDLFloat8_e8m0fnu:
           // UE8M0 format, not consistent with IEEE-754
           return FloatConfig(8, 0, 127, InftyStyle::kNone, NaNStyle::kAllOnes);
         default:
           TVM_FFI_THROW(InternalError) << "Unknown float8 variant: " << dtype;
       }
-    } else if (dtype.is_float6()) {  // float6
-      switch (dtype.code()) {
-        case DataType::kFloat6_e2m3fn:
+    } else if (code == DLDataTypeCode::kDLFloat6_e2m3fn ||
+               code == DLDataTypeCode::kDLFloat6_e3m2fn) {  // float6
+      switch (code) {
+        case DLDataTypeCode::kDLFloat6_e2m3fn:
           // E2M3 format, not consistent with IEEE-754
           return FloatConfig(2, 3, 1, InftyStyle::kNone, NaNStyle::kNone);
-        case DataType::kFloat6_e3m2fn:
+        case DLDataTypeCode::kDLFloat6_e3m2fn:
           // E3M2 format, not consistent with IEEE-754
           return FloatConfig(3, 2, 3, InftyStyle::kNone, NaNStyle::kNone);
         default:
@@ -182,7 +197,7 @@ PrimExpr ReinterpretAsUInt(PrimExpr value);
  * \return The uint data type, the number of bits is
  *   the same as input dtype.
  */
-DataType GetStorageUIntDType(DataType dtype);
+PrimType GetStorageUIntDType(PrimType dtype);
 
 /*!
  * \brief Conversion routine from value stored in one floating point data type to another floating
@@ -193,7 +208,7 @@ DataType GetStorageUIntDType(DataType dtype);
  * \return The converted value in target floating point data type.
  * \note Used when there is no native data type conversion implementation.
  */
-PrimExpr DTypeConversion(PrimExpr src_value, DataType tgt_dtype,
+PrimExpr DTypeConversion(PrimExpr src_value, PrimType tgt_dtype,
                          RoundingMode round_mode = RoundingMode::kHalfToEven);
 
 }  // namespace tirx
diff --git a/src/tirx/transform/flatten_buffer.cc b/src/tirx/transform/flatten_buffer.cc
index c18a3bccb964..48e7edc4171f 100644
--- a/src/tirx/transform/flatten_buffer.cc
+++ b/src/tirx/transform/flatten_buffer.cc
@@ -24,6 +24,7 @@
 #include <tvm/arith/iter_affine_map.h>
 #include <tvm/ffi/cast.h>
 #include <tvm/ffi/reflection/registry.h>
+#include <tvm/ir/type.h>
 #include <tvm/tirx/analysis.h>
 #include <tvm/tirx/layout.h>
 #include <tvm/tirx/stmt_functor.h>
@@ -113,9 +114,9 @@ class BufferFlattener : public arith::IRMutatorWithAnalyzer {
 
     auto new_buf = GetFlattenedBuffer(node->buffer);
     // TODO(Lunderberg): Move the handling of boolean into a dedicated pass.
-    if (new_buf->dtype == DataType::Bool()) {
+    if (new_buf->dtype->dtype == DLDataType{kDLBool, 8, 1}) {
       auto writer = new_buf.CopyOnWrite();
-      writer->dtype = DataType::Int(8);
+      writer->dtype = PrimType::Int(8);
     }
     if (!node->buffer.same_as(new_buf)) {
       node.CopyOnWrite()->buffer = new_buf;
@@ -145,8 +146,8 @@ class BufferFlattener : public arith::IRMutatorWithAnalyzer {
 
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
-    if (flattened->dtype == DataType::Bool()) {
-      writer->dtype = DataType::Int(8);
+    if (flattened->dtype->dtype == DLDataType{kDLBool, 8, 1}) {
+      writer->dtype = PrimType::Int(8);
     }
     // canonicalize shape
     for (size_t i = 0; i < flattened->shape.size(); ++i) {
@@ -160,7 +161,8 @@ class BufferFlattener : public arith::IRMutatorWithAnalyzer {
 
   Stmt VisitStmt_(const BufferStoreNode* op) final {
     BufferStore store = StmtExprMutator::VisitStmt_(op).as_or_throw<BufferStore>();
-    bool store_returns_bool = (op->value.dtype() == DataType::Bool());
+    PrimType store_value_ty = op->value.ty();
+    bool store_returns_bool = store_value_ty.MatchesCode(DLDataTypeCode::kDLBool);
     store = VisitBufferAccess(store);
 
     // Handle casts from the value's dtype to the dtype of the
@@ -168,27 +170,28 @@ class BufferFlattener : public arith::IRMutatorWithAnalyzer {
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
     if (store_returns_bool) {
-      TVM_FFI_ICHECK_EQ(store->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(store->buffer->dtype->dtype, (DLDataType{kDLInt, 8, 1}))
           << "Expected int8 backing array for boolean tensor";
       auto writer = store.CopyOnWrite();
-      writer->value = tvm::cast(DataType::Int(8), store->value);
+      writer->value = tvm::cast(PrimType::Int(8), store->value);
       return store;
     }
     return store;
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
-    bool load_returns_bool = (op->dtype == DataType::Bool());
+    PrimType load_ty = op->ty();
+    bool load_returns_bool = load_ty.MatchesCode(DLDataTypeCode::kDLBool);
     BufferLoad load = StmtExprMutator::VisitExpr_(op).as_or_throw<BufferLoad>();
     load = VisitBufferAccess(load);
     // Handle casts from dtype of the backing array to value's dtype.
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
     if (load_returns_bool) {
-      TVM_FFI_ICHECK_EQ(load->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(load->buffer->dtype->dtype, (DLDataType{kDLInt, 8, 1}))
           << "Expected int8 backing array for boolean tensor";
-      load.CopyOnWrite()->dtype = DataType::Int(8);
-      return tvm::cast(DataType::Bool(), load);
+      load.CopyOnWrite()->BaseExprNode::ty = PrimType::Int(8);
+      return tvm::cast(PrimType::Bool(), load);
     } else {
       return load;
     }
diff --git a/src/tirx/transform/force_narrow_index_to_i32.cc b/src/tirx/transform/force_narrow_index_to_i32.cc
index 68ae7e73f636..65988f0e9647 100644
--- a/src/tirx/transform/force_narrow_index_to_i32.cc
+++ b/src/tirx/transform/force_narrow_index_to_i32.cc
@@ -38,7 +38,7 @@ class Int32DTypeNarrower : public IndexDataTypeNormalizer {
   static PrimFunc RewriteDataType(PrimFunc func) {
     // Check if the integer parameter buffers have dtype other than int32.
     for (auto it : func->buffer_map) {
-      if (it.second->dtype.is_int() && it.second->dtype.bits() > 32) {
+      if (it.second->dtype.code() == DLDataTypeCode::kDLInt && it.second->dtype.bits() > 32) {
         TVM_FFI_THROW(InternalError)
             << "The buffer " << it.second << " in the function buffer map has dtype "
             << it.second->dtype << ". The function is " << func;
@@ -51,11 +51,11 @@ class Int32DTypeNarrower : public IndexDataTypeNormalizer {
 
  private:
   explicit Int32DTypeNarrower(PrimFunc func)
-      : IndexDataTypeNormalizer(DataType::Int(32)), func_(std::move(func)) {}
+      : IndexDataTypeNormalizer(PrimType::Int(32)), func_(std::move(func)) {}
 
   PrimExpr VisitExpr_(const IntImmNode* op) final {
     // ignore the enabled condition and always rewrite i64
-    if (op->dtype == DataType::Int(64)) {
+    if (op->ty() == PrimType::Int(64)) {
       TVM_FFI_ICHECK_LE(op->value, max_value(target_data_type_).as_or_throw<IntImm>()->value);
       return IntImm::Int32(op->value);
     }
@@ -66,7 +66,7 @@ class Int32DTypeNarrower : public IndexDataTypeNormalizer {
     SBlock block_ = IndexDataTypeNormalizer::VisitStmt_(block).as_or_throw<SBlock>();
     // Check if the allocated integer buffers have dtype other than int32.
     for (const Buffer& buf : block_->alloc_buffers) {
-      if (buf->dtype.is_int() && buf->dtype.bits() > 32) {
+      if (buf->dtype.code() == DLDataTypeCode::kDLInt && buf->dtype.bits() > 32) {
         TVM_FFI_THROW(InternalError)
             << "The buffer " << buf << " allocated in the function has dtype " << buf->dtype
             << ". The function is " << func_;
diff --git a/src/tirx/transform/ir_utils.cc b/src/tirx/transform/ir_utils.cc
index c81b7c686775..342fb5df025e 100644
--- a/src/tirx/transform/ir_utils.cc
+++ b/src/tirx/transform/ir_utils.cc
@@ -459,7 +459,7 @@ class IRConvertSSA final : public StmtExprMutator {
           if (var->type_annotation.defined()) {
             return Var(var->name_hint, var->type_annotation);
           } else {
-            return Var(var->name_hint, var->dtype);
+            return Var(var->name_hint, var.ty());
           }
         }();
 
@@ -542,9 +542,9 @@ class IRConvertSSA final : public StmtExprMutator {
       }
     } else {
       if (is_size_var) {
-        return SizeVar(old_var->name_hint, old_var->dtype);
+        return SizeVar(old_var->name_hint, old_var.ty());
       } else {
-        return Var(old_var->name_hint, old_var->dtype);
+        return Var(old_var->name_hint, old_var.ty());
       }
     }
   }
@@ -750,7 +750,8 @@ ffi::Optional<arith::IntConstraints> ConditionalBoundsContext::TrySolveCondition
         if (obj.same_as(e)) {
           return;
         } else if (const VarNode* var = obj.as<VarNode>()) {
-          if (var->dtype.is_int() || var->dtype.is_uint()) {
+          PrimType var_ty = var->ty();
+          if (var_ty.code() == DLDataTypeCode::kDLInt || var_ty.code() == DLDataTypeCode::kDLUInt) {
             cand_vars.push_back(ffi::GetRef<Var>(var));
           }
         } else {
diff --git a/src/tirx/transform/ir_utils.h b/src/tirx/transform/ir_utils.h
index 1bb9aac7f25d..556f77e0085f 100644
--- a/src/tirx/transform/ir_utils.h
+++ b/src/tirx/transform/ir_utils.h
@@ -95,7 +95,7 @@ inline ffi::Array<T> UpdateArray(ffi::Array<T> arr, F fupdate) {
  * \param kind The data kind.
  * \return the get expression.
  */
-inline PrimExpr TVMStructGet(DataType dtype, Var handle, int index,
+inline PrimExpr TVMStructGet(PrimType dtype, Var handle, int index,
                              builtin::TVMStructFieldKind kind) {
   ffi::Array<PrimExpr> args = {handle, IntImm::Int32(index), IntImm::Int32(static_cast<int>(kind))};
   return Call(dtype, builtin::tvm_struct_get(), args);
@@ -107,14 +107,14 @@ inline PrimExpr TVMStructGet(DataType dtype, Var handle, int index,
  * \param dtype The data type.
  * \param offset the offset index.
  */
-inline PrimExpr AddressOffset(Var handle, DataType dtype, int offset) {
+inline PrimExpr AddressOffset(Var handle, PrimType dtype, int offset) {
   PrimExpr offset_expr = IntImm::Int32(offset * dtype.lanes());
   ffi::Array<PrimExpr> shape = {offset_expr + 1};
-  Buffer dummy_buf(handle, dtype, shape, {}, 0, handle->name_hint, 0, 0, kDefault, {}, Span(),
-                   std::nullopt);
+  Buffer dummy_buf(handle, dtype->dtype, shape, {}, 0, handle->name_hint, 0, 0, kDefault, {},
+                   Span(), std::nullopt);  // buffer is built from the raw dtype->dtype
   BufferLoad buf_load(dummy_buf, {offset_expr});
 
-  return Call(DataType::Handle(), builtin::address_of(), {buf_load});
+  return Call(PrimType::Handle(), builtin::address_of(), {buf_load});  // handle-typed result
 }
 
 /*!
@@ -123,18 +123,19 @@ inline PrimExpr AddressOffset(Var handle, DataType dtype, int offset) {
  * \param dtype The data type.
  * \param offset the offset index.
  */
-inline PrimExpr AddressOffset(Var handle, DataType dtype, PrimExpr offset) {
+inline PrimExpr AddressOffset(Var handle, PrimType dtype, PrimExpr offset) {
   if (dtype.lanes() != 1) {
-    offset = offset * MakeConst(offset.dtype(), dtype.lanes());
-    offset = Ramp(offset, MakeConst(offset.dtype(), 1), dtype.lanes());
+    PrimType offset_ty = offset.ty();  // read once; reused for both constants below
+    offset = offset * MakeConst(offset_ty, dtype.lanes());  // scale the index by the lane count
+    offset = Ramp(offset, MakeConst(offset_ty, 1), dtype.lanes());  // stride-1 ramp over the lanes
   }
 
   ffi::Array<PrimExpr> shape = {offset + 1};
-  Buffer dummy_buf(handle, dtype.element_of(), shape, {}, 0, handle->name_hint, 0, 0, kDefault, {},
-                   Span(), std::nullopt);
+  Buffer dummy_buf(handle, dtype.WithLanes(1)->dtype, shape, {}, 0, handle->name_hint, 0, 0,
+                   kDefault, {}, Span(), std::nullopt);  // dummy buffer uses the 1-lane dtype
   BufferLoad buf_load(dummy_buf, {offset});
 
-  return Call(DataType::Handle(), builtin::address_of(), {buf_load});
+  return Call(PrimType::Handle(), builtin::address_of(), {buf_load});  // handle-typed result
 }
 
 /*!
@@ -148,7 +149,7 @@ inline PrimExpr AddressOffset(Var handle, DataType dtype, PrimExpr offset) {
 inline Stmt TVMStructSet(Var handle, int index, builtin::TVMStructFieldKind kind, PrimExpr value) {
   ffi::Array<PrimExpr> args = {handle, IntImm::Int32(index), IntImm::Int32(static_cast<int>(kind)),
                                value};
-  return Evaluate(Call(DataType::Int(32), builtin::tvm_struct_set(), args));
+  return Evaluate(Call(PrimType::Int(32), builtin::tvm_struct_set(), args));  // int32-typed call
 }
 
 /*!
@@ -156,13 +157,15 @@ inline Stmt TVMStructSet(Var handle, int index, builtin::TVMStructFieldKind kind
  * \param t The original type.
  * \return The corresponding API type.
  */
-inline DataType APIType(DataType t) {
-  TVM_FFI_ICHECK(!t.is_void()) << "Cannot pass void type through packed API.";
-  if (t.is_handle()) return t;
+inline PrimType APIType(const PrimType& t) {
+  TVM_FFI_ICHECK(!t.IsVoid()) << "Cannot pass void type through packed API.";
+  if (t.IsHandle()) return t;  // handle types are returned unchanged
   TVM_FFI_ICHECK_EQ(t.lanes(), 1) << "Cannot pass vector type through packed API.";
-  if (t.is_bool() || t.is_uint() || t.is_int()) return DataType::Int(64);
-  TVM_FFI_ICHECK(t.is_float());
-  return DataType::Float(64);
+  if (t.MatchesCode(DLDataTypeCode::kDLBool, DLDataTypeCode::kDLUInt, DLDataTypeCode::kDLInt)) {
+    return PrimType::Int(64);  // bool/uint/int arguments all map to int64
+  }
+  TVM_FFI_ICHECK_EQ(t.code(), DLDataTypeCode::kDLFloat);  // anything left must have the float code
+  return PrimType::Float(64);  // float arguments map to float64
 }
 
 /*!
@@ -171,7 +174,7 @@ inline DataType APIType(DataType t) {
  * \param const_size The constant size of the array.
  * \return the alignment
  */
-inline int GetTempAllocaAlignment(DataType type, int32_t const_size) {
+inline int GetTempAllocaAlignment(const PrimType& type, int32_t const_size) {
   int align = runtime::kTempAllocaAlignment;
   if (const_size > 0) {
     int64_t const_s = static_cast<int64_t>(const_size) * type.bits() * type.lanes() / 8;
@@ -200,7 +203,7 @@ inline PrimExpr ConstInt32(size_t index) {
  */
 inline PrimExpr StackAlloca(std::string type, size_t num) {
   ffi::Array<PrimExpr> args = {StringImm(type), ConstInt32(num)};
-  return Call(DataType::Handle(), builtin::tvm_stack_alloca(), args);
+  return Call(PrimType::Handle(), builtin::tvm_stack_alloca(), args);  // handle-typed result
 }
 
 /*!
diff --git a/src/tirx/transform/lower_intrin.cc b/src/tirx/transform/lower_intrin.cc
index 804a582d900b..cd2527579a3d 100644
--- a/src/tirx/transform/lower_intrin.cc
+++ b/src/tirx/transform/lower_intrin.cc
@@ -117,8 +117,8 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
     op = ret.as<FloorDivNode>();
     if (op == nullptr) return ret;
     int shift;
-    const DataType& dtype = op->dtype;
-    TVM_FFI_ICHECK(dtype.is_int() || dtype.is_uint());
+    PrimType dtype = op->ty();
+    TVM_FFI_ICHECK(dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt));
 
     if (support_bitwise_op_ && is_const_power_of_two_integer(op->b, &shift)) {
       // lower to right shift if possible.
@@ -145,7 +145,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
       // condition on b >= 0.
       // truncmod(a, b) < 0 will implies ceildiv,
       // So we need to correct these cases.
-      if ((dtype == DataType::Int(32) || dtype == DataType::Int(64)) && support_bitwise_op_) {
+      if ((dtype == PrimType::Int(32) || dtype == PrimType::Int(64)) && support_bitwise_op_) {
         // equivalent to rdiv + (rmod >= 0 ? 0: -1);
         return rdiv + (rmod >> MakeConst(dtype, dtype.bits() - 1));
       } else {
@@ -153,7 +153,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
       }
 
     } else {
-      if (dtype.is_float()) {
+      if (dtype.code() == DLDataTypeCode::kDLFloat) {
         // floor(a / b)
         return VisitExpr_(tvm::floor(op->a / op->b).as<CallNode>());
       } else {
@@ -178,8 +178,8 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
     if (op == nullptr) return ret;
     // Lower floordiv to native truncdiv.
     int shift;
-    const DataType& dtype = op->dtype;
-    TVM_FFI_ICHECK(dtype.is_int() || dtype.is_uint());
+    PrimType dtype = op->ty();
+    TVM_FFI_ICHECK(dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt));
 
     if (support_bitwise_op_ && is_const_power_of_two_integer(op->b, &shift)) {
       // lower to masking if possible.
@@ -205,7 +205,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
       // mod(a, b) < 0 will imply we are doing ceildiv,
       // So we need to correct these cases.
       PrimExpr rmod = truncmod(op->a, op->b);
-      if ((dtype == DataType::Int(32) || dtype == DataType::Int(64)) && support_bitwise_op_) {
+      if ((dtype == PrimType::Int(32) || dtype == PrimType::Int(64)) && support_bitwise_op_) {
         // (rmod >> shift) & b
         // -> (rmod >= 0 ? 0: -1) & b
         // -> rmod >= 0 ? 0 : b
@@ -215,7 +215,7 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
       }
 
     } else {
-      if (dtype.is_float()) {
+      if (dtype.code() == DLDataTypeCode::kDLFloat) {
         // a - floor(a / b) * b
         return op->a - (VisitExpr_(tvm::floor(op->a / op->b).as<CallNode>()) * op->b);
       } else {
@@ -274,24 +274,28 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
     if (const BroadcastNode* bcast = e.as<BroadcastNode>()) {
       if (const CastNode* cast = bcast->value.as<CastNode>()) {
         auto should_swap = [&]() {
+          PrimType cast_ty = cast->ty();
+          PrimType value_ty = cast->value.ty();
           // Maintain behaviour (int8 -> int16, fp16 -> fp32).
-          if (cast->dtype.bits() == cast->value.dtype().bits() * 2) {
+          if (cast_ty.bits() == value_ty.bits() * 2) {
             return true;
           }
           // Check both operands are integer-like.
-          if (!cast->dtype.is_uint() && !cast->dtype.is_int()) {
+          if (cast_ty.code() != DLDataTypeCode::kDLUInt &&
+              cast_ty.code() != DLDataTypeCode::kDLInt) {
             return false;
           }
-          if (!cast->value.dtype().is_uint() && !cast->value.dtype().is_int()) {
+          if (value_ty.code() != DLDataTypeCode::kDLUInt &&
+              value_ty.code() != DLDataTypeCode::kDLInt) {
             return false;
           }
           // If both are integer-like, swap if we have a widening cast.
-          return cast->dtype.bits() > cast->value.dtype().bits();
+          return cast_ty.bits() > value_ty.bits();
         };
 
         if (should_swap()) {
           PrimExpr new_bcast = Broadcast(cast->value, bcast->lanes);
-          return Cast(bcast->dtype, new_bcast);
+          return Cast(ffi::GetRef<PrimExpr>(bcast).ty(), new_bcast);
         }
       }
     }
@@ -303,8 +307,8 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
     PrimExpr lhs = SwapBroadcastCast(a);
     PrimExpr rhs = SwapBroadcastCast(b);
 
-    if (fma_ != nullptr && op->dtype.is_float()) {
-      PrimExpr r = fma_(Call(op->dtype, builtin::fma(), {lhs, rhs, c}));
+    if (fma_ != nullptr && op->ty().code() == DLDataTypeCode::kDLFloat) {
+      PrimExpr r = fma_(Call(ffi::GetRef<PrimExpr>(op).ty(), builtin::fma(), {lhs, rhs, c}));
       if (r.defined()) return this->VisitExpr(r);
     } else {
       if (!lhs.same_as(a) || !rhs.same_as(b)) {
@@ -334,8 +338,10 @@ class IntrinInjecter : public tvm::arith::IRMutatorWithAnalyzer {
     if (const_int_bound_a->min_value >= 0) {
       return std::nullopt;
     }
+    PrimType a_ty = a.ty();
+    // This overflow check is scalar element based. Lane count is intentionally ignored.
     const int64_t max_value_of_dtype =
-        tvm::max_value(a->dtype.element_of()).as_or_throw<IntImm>()->value;
+        tvm::max_value(PrimType(a_ty.code(), a_ty.bits())).as_or_throw<IntImm>()->value;
 
     // NOTE: ensures that (b-1) - a_min does not overflow
     // also note: max_value_of_dtype + const_int_bound_a->min_value won't overflow
diff --git a/src/tirx/transform/lower_tirx_cleanup.cc b/src/tirx/transform/lower_tirx_cleanup.cc
index 62f9bd31246e..b1586b9aed23 100644
--- a/src/tirx/transform/lower_tirx_cleanup.cc
+++ b/src/tirx/transform/lower_tirx_cleanup.cc
@@ -171,8 +171,8 @@ class LayoutApplier : public arith::IRMutatorWithAnalyzer {
     }
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
-    if (flattened->dtype == DataType::Bool()) {
-      writer->dtype = DataType::Int(8);
+    if (flattened->dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
+      writer->dtype = PrimType::Int(8);
     }
     // canonicalize shape
     for (size_t i = 0; i < flattened->shape.size(); ++i) {
@@ -187,7 +187,8 @@ class LayoutApplier : public arith::IRMutatorWithAnalyzer {
 
   Stmt VisitStmt_(const BufferStoreNode* op) final {
     BufferStore store = StmtExprMutator::VisitStmt_(op).as_or_throw<BufferStore>();
-    bool store_returns_bool = (op->value.dtype() == DataType::Bool());
+    PrimType store_value_ty = op->value.ty();
+    bool store_returns_bool = store_value_ty.MatchesCode(DLDataTypeCode::kDLBool);
     store = VisitBufferAccess(store);
 
     // Handle casts from the value's dtype to the dtype of the
@@ -195,27 +196,28 @@ class LayoutApplier : public arith::IRMutatorWithAnalyzer {
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
     if (store_returns_bool) {
-      TVM_FFI_ICHECK_EQ(store->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(store->buffer->dtype, PrimType::Int(8))
           << "Expected int8 backing array for boolean tensor";
       auto writer = store.CopyOnWrite();
-      writer->value = tvm::cast(DataType::Int(8), store->value);
+      writer->value = tvm::cast(PrimType::Int(8), store->value);
       return std::move(store);
     }
     return std::move(store);
   }
 
   PrimExpr VisitExpr_(const BufferLoadNode* op) final {
-    bool load_returns_bool = (op->dtype == DataType::Bool());
+    PrimType load_ty = op->ty();
+    bool load_returns_bool = load_ty.MatchesCode(DLDataTypeCode::kDLBool);
     BufferLoad load = StmtExprMutator::VisitExpr_(op).as_or_throw<BufferLoad>();
     load = VisitBufferAccess(load);
     // Handle casts from dtype of the backing array to value's dtype.
     // TODO(Lunderberg): Move the handling of boolean into a
     // dedicated pass.
     if (load_returns_bool) {
-      TVM_FFI_ICHECK_EQ(load->buffer->dtype, DataType::Int(8))
+      TVM_FFI_ICHECK_EQ(load->buffer->dtype, PrimType::Int(8))
           << "Expected int8 backing array for boolean tensor";
-      load.CopyOnWrite()->dtype = DataType::Int(8);
-      return tvm::cast(DataType::Bool(), load);
+      load.CopyOnWrite()->BaseExprNode::ty = PrimType::Int(8);
+      return tvm::cast(PrimType::Bool(), load);
     } else {
       return std::move(load);
     }
diff --git a/src/tirx/transform/lower_tirx_opaque.cc b/src/tirx/transform/lower_tirx_opaque.cc
index 9c822e1f9558..03e40d4ec824 100644
--- a/src/tirx/transform/lower_tirx_opaque.cc
+++ b/src/tirx/transform/lower_tirx_opaque.cc
@@ -146,8 +146,8 @@ class TIRxOpaqueLower : public StmtExprMutator {
       return var;
     } else {
       PrimExpr expr = it->second;
-      if (expr.dtype() != var.dtype()) {
-        expr = tvm::cast(var.dtype(), std::move(expr));
+      if (expr.ty() != var.ty()) {
+        expr = tvm::cast(var.ty(), std::move(expr));
       }
       return expr;
     }
diff --git a/src/tirx/transform/lower_tvm_builtin.cc b/src/tirx/transform/lower_tvm_builtin.cc
index 14772cead49f..23ca5951ce86 100644
--- a/src/tirx/transform/lower_tvm_builtin.cc
+++ b/src/tirx/transform/lower_tvm_builtin.cc
@@ -39,6 +39,14 @@
 namespace tvm {
 namespace tirx {
 
+namespace {
+
+TVM_FFI_INLINE int GetVectorBytes(const PrimType& dtype) {
+  return (dtype.bits() * dtype.lanes() + 7) / 8;  // total bit width, rounded up to whole bytes
+}
+
+}  // namespace
+
 // Calculate the statistics of packed function.
 // These information are needed during codegen.
 class BuiltinLower : public StmtExprMutator {
@@ -99,8 +107,8 @@ class BuiltinLower : public StmtExprMutator {
   // Record stack frame for existing scope.
   struct AllocaScope {
     Buffer stack_shape;
-    Var stack_array = Var("stack_array", DataType::Handle());
-    Var stack_ffi_any = Var("stack_ffi_any", DataType::Handle());
+    Var stack_array = Var("stack_array", PrimType::Handle());
+    Var stack_ffi_any = Var("stack_ffi_any", PrimType::Handle());
 
     StackSizes max_sizes;
     StackSizes run_sizes;
@@ -130,7 +138,7 @@ class BuiltinLower : public StmtExprMutator {
     {
       // NOTE: this scope reference is invalid after any mutation is applied to alloca_scope_.
       auto& scope = precheck.alloca_scope_.back();
-      scope.stack_shape = decl_buffer({IntImm::Int64(0)}, DataType::Int(64), "stack_shape");
+      scope.stack_shape = decl_buffer({IntImm::Int64(0)}, PrimType::Int(64), "stack_shape");
     }
 
     precheck.VisitStmt(stmt);
@@ -171,7 +179,7 @@ class BuiltinLower : public StmtExprMutator {
 
       if (scope.max_sizes.shape_stack != -1) {
         scope.stack_shape = decl_buffer({IntImm::Int64(scope.max_sizes.shape_stack)},
-                                        DataType::Int(64), "stack_shape");
+                                        PrimType::Int(64), "stack_shape");
         alloca_stmts.push_back(
             Bind(scope.stack_shape->data, StackAlloca("shape", scope.max_sizes.shape_stack)));
         stmt = SeqStmt::Flatten(DeclBuffer(scope.stack_shape), stmt);
@@ -245,7 +253,7 @@ class BuiltinLower : public StmtExprMutator {
         return stmt;
       }
     }
-    if (op->buffer->dtype.is_scalable_vector()) {
+    if (op->buffer->dtype.IsScalableVector()) {
       return stmt;
     }
     int64_t nbytes = GetVectorBytes(op->buffer->dtype);
@@ -261,22 +269,22 @@ class BuiltinLower : public StmtExprMutator {
         }
       }
     }
-    PrimExpr total_bytes = IntImm(DataType::UInt(64), nbytes);
+    PrimExpr total_bytes = IntImm(PrimType::UInt(64), nbytes);
     for (size_t i = 0; i < op->buffer->shape.size(); ++i) {
       total_bytes = total_bytes * op->buffer->shape[i];
     }
     TVM_FFI_ICHECK(device_type_) << "Unknown device type in current IR";
     TVM_FFI_ICHECK(device_id_) << "Unknown device id in current IR";
-    Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {}));
+    Stmt throw_last_error = Evaluate(Call(PrimType::Int(32), builtin::tvm_throw_last_error(), {}));
 
     Stmt alloc_nullptr_check = IfThenElse(
-        Call(DataType::Bool(), builtin::isnullptr(), {op->buffer->data}), throw_last_error);
+        Call(PrimType::Bool(), builtin::isnullptr(), {op->buffer->data}), throw_last_error);
 
     static const Op& free_workspace_op = Op::Get("tirx.TVMBackendFreeWorkspace");
     static const Op& alloc_workspace_op = Op::Get("tirx.TVMBackendAllocWorkspace");
-    PrimExpr free_op = Call(DataType::Int(32), free_workspace_op,
-                            {cast(DataType::Int(32), device_type_.value()),
-                             cast(DataType::Int(32), device_id_.value()), op->buffer->data});
+    PrimExpr free_op = Call(PrimType::Int(32), free_workspace_op,
+                            {cast(PrimType::Int(32), device_type_.value()),
+                             cast(PrimType::Int(32), device_id_.value()), op->buffer->data});
     Stmt free_stmt = IfThenElse(free_op != IntImm::Int32(0), throw_last_error);
 
     // Push free to enclosing scope's pending_frees (LIFO ordering preserved).
@@ -284,9 +292,9 @@ class BuiltinLower : public StmtExprMutator {
 
     Stmt alloc_bind = Bind(
         op->buffer->data,
-        Call(op->buffer->data.dtype(), alloc_workspace_op,
-             {cast(DataType::Int(32), device_type_.value()),
-              cast(DataType::Int(32), device_id_.value()), total_bytes,
+        Call(op->buffer->data.ty(), alloc_workspace_op,
+             {cast(PrimType::Int(32), device_type_.value()),
+              cast(PrimType::Int(32), device_id_.value()), total_bytes,
               IntImm::Int32(op->buffer->dtype.code()), IntImm::Int32(op->buffer->dtype.bits())}));
 
     return SeqStmt({alloc_bind, alloc_nullptr_check});
@@ -390,7 +398,7 @@ class BuiltinLower : public StmtExprMutator {
     } else if (op->op.same_as(builtin::tvm_stack_make_array())) {
       return MakeArray(op);
     } else if (op->op.same_as(builtin::tvm_context_id())) {
-      return IntImm(op->dtype, 0);
+      return IntImm(ffi::GetRef<PrimExpr>(op).ty(), 0);
     } else if (op->op.same_as(builtin::dma_copy())) {
       return MakeDMACopy(op);
     } else if (op->op.same_as(builtin::dma_wait())) {
@@ -426,7 +434,7 @@ class BuiltinLower : public StmtExprMutator {
     PrimExpr bypass_cache = op->args[4];
 
     auto method_name = GetDeviceMethodName("dma_copy");
-    Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(),
+    Call call_packed = Call(PrimType::Int(32), builtin::tvm_call_packed(),
                             {method_name, queue_id, dst, src, size, bypass_cache});
     return VisitExpr(call_packed);
   }
@@ -437,7 +445,7 @@ class BuiltinLower : public StmtExprMutator {
 
     auto method_name = GetDeviceMethodName("dma_wait");
     Call call_packed =
-        Call(DataType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id, inflight});
+        Call(PrimType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id, inflight});
     return VisitExpr(call_packed);
   }
 
@@ -445,7 +453,7 @@ class BuiltinLower : public StmtExprMutator {
     PrimExpr queue_id = op->args[0];
 
     auto method_name = GetDeviceMethodName("dma_start_group");
-    Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id});
+    Call call_packed = Call(PrimType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id});
     return VisitExpr(call_packed);
   }
 
@@ -453,7 +461,7 @@ class BuiltinLower : public StmtExprMutator {
     PrimExpr queue_id = op->args[0];
 
     auto method_name = GetDeviceMethodName("dma_end_group");
-    Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id});
+    Call call_packed = Call(PrimType::Int(32), builtin::tvm_call_packed(), {method_name, queue_id});
     return VisitExpr(call_packed);
   }
 
@@ -472,10 +480,10 @@ class BuiltinLower : public StmtExprMutator {
     op = expr.as<CallNode>();
     // no need to perform any store for a scalar shape
     for (size_t i = 0; i < op->args.size(); ++i) {
-      prep_seq.emplace_back(BufferStore(scope.stack_shape, cast(DataType::Int(64), op->args[i]),
+      prep_seq.emplace_back(BufferStore(scope.stack_shape, cast(PrimType::Int(64), op->args[i]),
                                         {ConstInt32(stack_begin + i)}));
     }
-    return AddressOffset(scope.stack_shape->data, DataType::Int(64), stack_begin);
+    return AddressOffset(scope.stack_shape->data, PrimType::Int(64), stack_begin);
   }
   // make array
   PrimExpr MakeArray(const CallNode* op) {
@@ -499,31 +507,31 @@ class BuiltinLower : public StmtExprMutator {
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorStrides, strides));
     prep_seq.emplace_back(
         TVMStructSet(scope.stack_array, idx, builtin::kDLTensorNDim, op->args[3]));
-    DataType dtype = op->args[4].dtype();
+    PrimType dtype = op->args[4].ty();
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorTypeCode,
-                                       IntImm(DataType::UInt(8), static_cast<int>(dtype.code()))));
+                                       IntImm(PrimType::UInt(8), static_cast<int>(dtype.code()))));
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorTypeBits,
-                                       IntImm(DataType::UInt(8), dtype.bits())));
+                                       IntImm(PrimType::UInt(8), dtype.bits())));
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorTypeLanes,
-                                       IntImm(DataType::UInt(16), dtype.lanes())));
+                                       IntImm(PrimType::UInt(16), dtype.lanes())));
     // set byte offset
     int data_bytes = GetVectorBytes(dtype);
     PrimExpr elem_offset = op->args[5];
     PrimExpr byte_offset;
     if (!is_zero(elem_offset)) {
-      byte_offset = elem_offset * MakeConst(elem_offset.dtype(), data_bytes);
+      byte_offset = elem_offset * MakeConst(elem_offset.ty(), data_bytes);
     } else {
       byte_offset = elem_offset;
     }
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorByteOffset,
-                                       cast(DataType::UInt(64), byte_offset)));
+                                       cast(PrimType::UInt(64), byte_offset)));
     TVM_FFI_ICHECK(device_type_) << "Unknown device type in current IR";
     TVM_FFI_ICHECK(device_id_) << "Unknown device id in current IR";
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorDeviceId,
-                                       cast(DataType::Int(32), device_id_.value())));
+                                       cast(PrimType::Int(32), device_id_.value())));
     prep_seq.emplace_back(TVMStructSet(scope.stack_array, idx, builtin::kDLTensorDeviceType,
-                                       cast(DataType::Int(32), device_type_.value())));
-    return TVMStructGet(DataType::Handle(), scope.stack_array, idx, builtin::kDLTensorAddr);
+                                       cast(PrimType::Int(32), device_type_.value())));
+    return TVMStructGet(PrimType::Handle(), scope.stack_array, idx, builtin::kDLTensorAddr);
   }
 
   void SetPackedArg(PrimExpr arg, const Var& args_stack, size_t stack_offset,
@@ -533,26 +541,28 @@ class BuiltinLower : public StmtExprMutator {
       // call runtime function to set anylist
       static const Op& anylist_set_packed_arg_op = Op::Get("tirx.TVMBackendAnyListSetPackedArg");
       prep_seq->emplace_back(Evaluate(Call(
-          DataType::Int(32), anylist_set_packed_arg_op,
+          PrimType::Int(32), anylist_set_packed_arg_op,
           {call_pattern->args[0], call_pattern->args[1], args_stack, ConstInt32(stack_offset)})));
     } else {
-      DataType api_dtype = APIType(arg.dtype());
-      if (arg.dtype() != api_dtype) {
-        arg = Cast(api_dtype, arg);
+      PrimType arg_ty = arg.ty();
+      PrimType api_ty = APIType(arg_ty);
+      if (arg_ty != api_ty) {
+        arg = Cast(api_ty, arg);
       }
 
       int arg_type_index = [&]() {
-        if (api_dtype.is_bool()) return ffi::TypeIndex::kTVMFFIBool;
-        if (api_dtype.is_int() || api_dtype.is_uint()) return ffi::TypeIndex::kTVMFFIInt;
-        if (api_dtype.is_float()) return ffi::TypeIndex::kTVMFFIFloat;
-        if (api_dtype.is_handle() && arg.as<StringImmNode>()) {
+        if (api_ty.MatchesCode(DLDataTypeCode::kDLBool)) return ffi::TypeIndex::kTVMFFIBool;
+        if (api_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))
+          return ffi::TypeIndex::kTVMFFIInt;
+        if (api_ty.code() == DLDataTypeCode::kDLFloat) return ffi::TypeIndex::kTVMFFIFloat;
+        if (api_ty.IsHandle() && arg.as<StringImmNode>()) {
           return ffi::TypeIndex::kTVMFFIRawStr;
         } else if (IsArrayHandle(arg)) {
           return ffi::TypeIndex::kTVMFFIDLTensorPtr;
-        } else if (api_dtype.is_handle()) {
+        } else if (api_ty.IsHandle()) {
           return ffi::TypeIndex::kTVMFFIOpaquePtr;
         } else {
-          TVM_FFI_THROW(InternalError) << "Unsupported type: " << api_dtype;
+          TVM_FFI_THROW(InternalError) << "Unsupported type: " << api_ty;
           TVM_FFI_UNREACHABLE();
         }
       }();
@@ -560,7 +570,7 @@ class BuiltinLower : public StmtExprMutator {
       // opaque handle need to set the kind properly
       if (arg_type_index == ffi::TypeIndex::kTVMFFIOpaquePtr) {
         prep_seq->emplace_back(
-            IfThenElse(Call(DataType::Bool(), builtin::isnullptr(), {arg}),
+            IfThenElse(Call(PrimType::Bool(), builtin::isnullptr(), {arg}),
                        TVMStructSet(args_stack, stack_offset, builtin::kTVMFFIAnyTypeIndex,
                                     ConstInt32(ffi::TypeIndex::kTVMFFINone)),
                        TVMStructSet(args_stack, stack_offset, builtin::kTVMFFIAnyTypeIndex,
@@ -592,7 +602,7 @@ class BuiltinLower : public StmtExprMutator {
     prep_seq.emplace_back(Evaluate(call));
     static const Op& anylist_move_from_packed_return_op =
         Op::Get("tirx.TVMBackendAnyListMoveFromPackedReturn");
-    return Call(DataType::Int(32), anylist_move_from_packed_return_op,
+    return Call(PrimType::Int(32), anylist_move_from_packed_return_op,
                 {list_handle, list_index, args_stack, ret_offset});
   }
   /*!
@@ -652,16 +662,18 @@ class BuiltinLower : public StmtExprMutator {
       // used by call_packed_traced
       packed_args.push_back(op->args[op->args.size() - 1]);
     }
-    return Call(op->dtype, lowered_packed_op, packed_args);
+    return Call(ffi::GetRef<PrimExpr>(op).ty(), lowered_packed_op, packed_args);
   }
 
   Stmt MakeNdMemAllocWithScope(const BindNode* let, const CallNode* call) {
     TVM_FFI_ICHECK(device_type_) << "Unknown device type in current IR";
     TVM_FFI_ICHECK(device_id_) << "Unknown device id in current IR";
-    Stmt throw_last_error = Evaluate(Call(DataType::Int(32), builtin::tvm_throw_last_error(), {}));
+    Stmt throw_last_error = Evaluate(Call(PrimType::Int(32), builtin::tvm_throw_last_error(), {}));
 
-    DataType dtype =
-        let->var->type_annotation.as<PointerTypeNode>()->element_type.as<PrimTypeNode>()->dtype;
+    const auto* dtype_node =
+        let->var->type_annotation.as<PointerTypeNode>()->element_type.as<PrimTypeNode>();
+    TVM_FFI_ICHECK(dtype_node);
+    PrimType dtype = ffi::GetRef<PrimType>(dtype_node);
 
     ffi::Array<PrimExpr> args = {
         GetDeviceMethodName("alloc_nd"), device_type_.value(),        device_id_.value(),
@@ -672,14 +684,14 @@ class BuiltinLower : public StmtExprMutator {
       args.push_back(call->args[i]);
     }
 
-    Call call_packed = Call(let->var.dtype(), builtin::tvm_call_packed(), args);
+    Call call_packed = Call(let->var.ty(), builtin::tvm_call_packed(), args);
     Stmt null_check =
-        IfThenElse(Call(DataType::Bool(), builtin::isnullptr(), {let->var}), throw_last_error);
+        IfThenElse(Call(PrimType::Bool(), builtin::isnullptr(), {let->var}), throw_last_error);
 
     // Construct free_nd call and register in current scope.
     // The free will be emitted on scope exit, matching the old LetStmt body semantics.
     PrimExpr storage_scope = call->args[0];
-    Call free_op = Call(DataType::Int(32), builtin::tvm_call_packed(),
+    Call free_op = Call(PrimType::Int(32), builtin::tvm_call_packed(),
                         {GetDeviceMethodName("free_nd"), device_type_.value(), device_id_.value(),
                          storage_scope, let->var});
     Stmt free_stmt = IfThenElse(free_op != IntImm::Int32(0), throw_last_error);
diff --git a/src/tirx/transform/lower_warp_memory.cc b/src/tirx/transform/lower_warp_memory.cc
index be21efaa3694..57b9dde61fed 100644
--- a/src/tirx/transform/lower_warp_memory.cc
+++ b/src/tirx/transform/lower_warp_memory.cc
@@ -155,9 +155,10 @@ class WarpStoreCoeffFinder : private StmtExprVisitor {
                                              << "Has FlattenBuffer been run?";
 
     PrimExpr index = op->indices[0];
-    if (op->value.dtype().lanes() != 1) {
+    PrimType value_ty = op->value.ty();
+    if (value_ty.lanes() != 1) {
       arith::PVar<PrimExpr> base;
-      TVM_FFI_ICHECK(arith::ramp(base, 1, op->value.dtype().lanes()).Match(index))
+      TVM_FFI_ICHECK(arith::ramp(base, 1, value_ty.lanes()).Match(index))
           << "LowerWarpMemory failed due to store index=" << index
           << ", can only handle continuous store";
       UpdatePattern(base.Eval());
@@ -294,7 +295,7 @@ class WarpAccessRewriter : protected StmtExprMutator {
         new_args.Set(i + 1, local_index);
       }
     }
-    return Call(op->dtype, op->op, new_args, op->attrs, op->span);
+    return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op, new_args, op->attrs, op->span);
   }
 
   PrimExpr VisitExpr_(const CallNode* op) override {
@@ -390,8 +391,8 @@ class WarpAccessRewriter : protected StmtExprMutator {
       return load;
     }
 
-    PrimExpr mask = Call(DataType::UInt(32), builtin::tvm_warp_activemask(), {});
-    return Call(load.dtype(), builtin::tvm_warp_shuffle(), {mask, load, group, width_, warp_size_});
+    PrimExpr mask = Call(PrimType::UInt(32), builtin::tvm_warp_activemask(), {});
+    return Call(load.ty(), builtin::tvm_warp_shuffle(), {mask, load, group, width_, warp_size_});
   }
 
   // Split the index to the two component
@@ -400,15 +401,16 @@ class WarpAccessRewriter : protected StmtExprMutator {
   // source index is the corresponding source index
   // in this access pattern.
   std::pair<PrimExpr, PrimExpr> SplitIndexByGroup(const PrimExpr& index) {
-    if (index.dtype().lanes() != 1) {
+    PrimType index_ty = index.ty();
+    if (index_ty.lanes() != 1) {
       arith::PVar<PrimExpr> base;
-      TVM_FFI_ICHECK(arith::ramp(base, 1, index.dtype().lanes()).Match(index));
+      TVM_FFI_ICHECK(arith::ramp(base, 1, index_ty.lanes()).Match(index));
 
       auto [local_index, group] = SplitIndexByGroup(base.Eval());
-      local_index = Ramp(local_index, MakeConst(local_index.dtype(), 1), index.dtype().lanes());
+      local_index = Ramp(local_index, MakeConst(local_index.ty(), 1), index_ty.lanes());
       return std::make_pair(local_index, group);
     }
-    PrimExpr m = MakeConst(index.dtype(), warp_coeff_);
+    PrimExpr m = MakeConst(index_ty, warp_coeff_);
 
     // simple case, warp index is on the highest.
     if (warp_group_ == 1) {
@@ -417,9 +419,9 @@ class WarpAccessRewriter : protected StmtExprMutator {
       return std::make_pair(x, z);
     } else {
       PrimExpr x = analyzer_->canonical_simplify(indexmod(index, m));
-      PrimExpr y = index / MakeConst(index.dtype(), warp_coeff_ * width_);
+      PrimExpr y = index / MakeConst(index_ty, warp_coeff_ * width_);
       y = y * m + x;
-      PrimExpr z = indexdiv(indexmod(index, MakeConst(index.dtype(), warp_coeff_ * width_)), m);
+      PrimExpr z = indexdiv(indexmod(index, MakeConst(index_ty, warp_coeff_ * width_)), m);
       return std::make_pair(analyzer_->canonical_simplify(y), analyzer_->canonical_simplify(z));
     }
   }
diff --git a/src/tirx/transform/make_packed_api.cc b/src/tirx/transform/make_packed_api.cc
index d2d4113cdcfc..56a30ba6eb91 100644
--- a/src/tirx/transform/make_packed_api.cc
+++ b/src/tirx/transform/make_packed_api.cc
@@ -80,22 +80,22 @@ class ReturnRewriter : public StmtMutator {
     ConvertedInfo info;
 
     // convert val's data type to FFI data type, return type code
-    DataType dtype = val.dtype();
-    if (dtype.is_bool()) {
+    PrimType dtype = val.ty();
+    if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
       info.type_index = ffi::TypeIndex::kTVMFFIBool;
-      info.expr = Cast(DataType::Int(64), val);
+      info.expr = Cast(PrimType::Int(64), val);
 
-    } else if (dtype.is_int() || dtype.is_uint()) {
+    } else if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
       info.type_index = ffi::TypeIndex::kTVMFFIInt;
-      info.expr = Cast(DataType::Int(64), val);
-    } else if (dtype.is_float()) {
+      info.expr = Cast(PrimType::Int(64), val);
+    } else if (dtype.code() == DLDataTypeCode::kDLFloat) {
       info.type_index = ffi::TypeIndex::kTVMFFIFloat;
-      info.expr = Cast(DataType::Float(64), val);
-    } else if (dtype.is_void()) {
+      info.expr = Cast(PrimType::Float(64), val);
+    } else if (dtype.IsVoid()) {
       info.type_index = ffi::TypeIndex::kTVMFFINone;
       info.expr = val;
     } else {
-      TVM_FFI_THROW(InternalError) << "data type " << dtype << " not supported yet";
+      TVM_FFI_THROW(InternalError) << "data type " << dtype->dtype << " not supported yet";
     }
     return info;
   }
@@ -103,15 +103,15 @@ class ReturnRewriter : public StmtMutator {
   Stmt WriteToOut(PrimExpr val) {
     auto info = ConvertForFFI(val);
     Stmt store_tindex = tirx::Evaluate(
-        tirx::Call(DataType::Int(32), tirx::builtin::tvm_struct_set(),
+        tirx::Call(PrimType::Int(32), tirx::builtin::tvm_struct_set(),
                    {ret_var_, IntImm::Int32(0), IntImm::Int32(tirx::builtin::kTVMFFIAnyTypeIndex),
                     IntImm::Int32(info.type_index)}));
     Stmt store_zero_padding = tirx::Evaluate(
-        tirx::Call(DataType::Int(32), tirx::builtin::tvm_struct_set(),
+        tirx::Call(PrimType::Int(32), tirx::builtin::tvm_struct_set(),
                    {ret_var_, IntImm::Int32(0), IntImm::Int32(tirx::builtin::kTVMFFIAnyZeroPadding),
                     IntImm::Int32(0)}));
     Stmt store_val =
-        tirx::Evaluate(tirx::Call(DataType::Int(32), tirx::builtin::tvm_struct_set(),
+        tirx::Evaluate(tirx::Call(PrimType::Int(32), tirx::builtin::tvm_struct_set(),
                                   {ret_var_, IntImm::Int32(0),
                                    IntImm::Int32(tirx::builtin::kTVMFFIAnyUnionValue), info.expr}));
     Stmt ret_zero = Evaluate(tvm::ret(0));
@@ -154,7 +154,7 @@ class SubroutineCallRewriter : public StmtExprMutator {
         // push an empty handle to be compatible with current cpacked convention
         cpacked_args.push_back(tirx::ConstHandle(0));
         made_change_ = true;
-        return tirx::Call(node->dtype, tirx::builtin::tvm_call_cpacked(), cpacked_args);
+        return tirx::Call(node.ty(), tirx::builtin::tvm_call_cpacked(), cpacked_args);
       }
     }
 
@@ -219,14 +219,14 @@ PrimFunc MakePackedAPI(PrimFunc func) {
   const Stmt nop = Evaluate(0);
 
   // Data field definitions
-  Var v_self_handle("self_handle", DataType::Handle());
-  Var v_packed_args("args", DataType::Handle());
-  Var v_num_packed_args("num_args", DataType::Int(32));
-  Var v_result("result", PointerType(PrimType(DataType::Void())));
+  Var v_self_handle("self_handle", PrimType::Handle());
+  Var v_packed_args("args", PrimType::Handle());
+  Var v_num_packed_args("num_args", PrimType::Int(32));
+  Var v_result("result", PointerType(PrimType::Void()));
 
   // The device context
   Var device_id("dev_id");
-  IntImm device_type(DataType::Int(32), target_device_type);
+  IntImm device_type(PrimType::Int(32), target_device_type);
 
   // Create TVMFFIABIBuilder and decode all packed args
   TVMFFIABIBuilder binder(name_hint, func_ptr->params, func_ptr->buffer_map, v_packed_args,
@@ -257,7 +257,7 @@ PrimFunc MakePackedAPI(PrimFunc func) {
 
     if (runtime::DeviceAPI::NeedSetDevice(target_device_type)) {
       Stmt set_device =
-          Evaluate(Call(DataType::Int(32), builtin::tvm_call_packed(),
+          Evaluate(Call(PrimType::Int(32), builtin::tvm_call_packed(),
                         {StringImm(runtime::symbol::tvm_set_device), device_type, device_id}));
       body = SeqStmt({set_device, body});
     }
@@ -278,7 +278,7 @@ PrimFunc MakePackedAPI(PrimFunc func) {
       << " are used, but are not passed in as API arguments";
 
   func_ptr->buffer_map = ffi::Map<Var, Buffer>();
-  func_ptr->ret_type = PrimType(DataType::Int(32));
+  func_ptr->ret_type = PrimType::Int(32);
 
   // return the function.
   return func;
diff --git a/src/tirx/transform/narrow_datatype.cc b/src/tirx/transform/narrow_datatype.cc
index 9dfdf88c0c06..fa9e431f9253 100644
--- a/src/tirx/transform/narrow_datatype.cc
+++ b/src/tirx/transform/narrow_datatype.cc
@@ -79,15 +79,16 @@ class DataTypeVisitor final : public StmtExprVisitor {
   explicit DataTypeVisitor(int target_bits) : bits_(target_bits), target_bits_(target_bits) {}
 
   void VisitExpr(const PrimExpr& e) {
-    if (e.dtype().is_int()) {
+    PrimType e_ty = e.ty();
+    if (e_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
       int bits = max_bits_;
       if (bound_.find(e) == bound_.end()) {
         analyzer_->const_int_bound(e, &bound_);
       }
       ConstIntBound bound = bound_[e];
-      int64_t ubound = max_value(DataType::Int(target_bits_)).as_or_throw<IntImm>()->value;
-      int64_t lbound = min_value(DataType::Int(target_bits_)).as_or_throw<IntImm>()->value;
-      if (e.dtype().bits() <= target_bits_ ||
+      int64_t ubound = max_value(PrimType::Int(target_bits_)).as_or_throw<IntImm>()->value;
+      int64_t lbound = min_value(PrimType::Int(target_bits_)).as_or_throw<IntImm>()->value;
+      if (e_ty.bits() <= target_bits_ ||
           (bound->max_value <= ubound && bound->min_value >= lbound)) {
         bits = target_bits_;
       }
@@ -109,14 +110,14 @@ class DataTypeVisitor final : public StmtExprVisitor {
 
   void VisitStmt_(const ForNode* op) {
     analyzer_->Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent));
-    vextent_[op->loop_var.as<VarNode>()] = op->extent.dtype();
+    vextent_.insert_or_assign(op->loop_var.as<VarNode>(), op->extent.ty());
     return StmtExprVisitor::VisitStmt_(op);
   }
 
   void VisitStmt_(const SBlockNode* op) {
     for (const IterVar& iter : op->iter_vars) {
       analyzer_->Bind(iter->var, Range::FromMinExtent(iter->dom->min, iter->dom->extent));
-      vextent_[iter->var.as<VarNode>()] = iter->dom->extent.dtype();
+      vextent_.insert_or_assign(iter->var.as<VarNode>(), iter->dom->extent.ty());
     }
     StmtExprVisitor::VisitStmt_(op);
   }
@@ -126,7 +127,7 @@ class DataTypeVisitor final : public StmtExprVisitor {
       IterVar iv = op->node.as_or_throw<IterVar>();
       TVM_FFI_ICHECK_NE(iv->thread_tag.length(), 0U);
       analyzer_->Bind(iv->var, Range::FromMinExtent(0, op->value));
-      vextent_[iv->var.as<VarNode>()] = op->value.dtype();
+      vextent_.insert_or_assign(iv->var.as<VarNode>(), op->value.ty());
       StmtExprVisitor::VisitStmt_(op);
     } else {
       StmtExprVisitor::VisitStmt_(op);
@@ -137,57 +138,59 @@ class DataTypeVisitor final : public StmtExprVisitor {
     // Setup the domain information before simplification.
     for (const IterVar& iv : op->axis) {
       analyzer_->Bind(iv->var, iv->dom);
-      vextent_[iv->var.as<VarNode>()] = iv->dom->extent.dtype();
+      vextent_.insert_or_assign(iv->var.as<VarNode>(), iv->dom->extent.ty());
     }
     // Recursively call simplification when necessary.
     StmtExprVisitor::VisitExpr_(op);
   }
 
   void VisitExpr_(const VarNode* op) {
-    if (vextent_.find(op) != vextent_.end()) {
+    if (auto vextent_it = vextent_.find(op); vextent_it != vextent_.end()) {
       // We only narrow and never promote, so the result dtype
       // is upperbounded by its original dtype before rewrite.
-      int bits = std::min(vextent_[op].bits(), bits_);
-      if (vmap.find(op) == vmap.end()) {
-        vmap[op] = op->dtype.with_bits(bits);
+      int bits = std::min(vextent_it->second.bits(), bits_);
+      if (auto it = vmap.find(op); it == vmap.end()) {
+        vmap.emplace(op, op->ty().WithBits(bits));
       } else {
         // We take maximum bits for all the possible Expr where a var occurs
-        vmap[op] = op->dtype.with_bits(std::max(vmap[op].bits(), bits));
+        it->second = op->ty().WithBits(std::max(it->second.bits(), bits));
       }
     }
     StmtExprVisitor::VisitExpr_(op);
   }
 
   void VisitExpr_(const IntImmNode* op) {
-    if (op->dtype.is_int()) {
+    PrimType op_ty = op->ty();
+    if (op_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
       // We only narrow and never promote, so the result dtype
       // is upperbounded by its original dtype before rewrite.
-      int bits = std::min(op->dtype.bits(), bits_);
-      if (vmap.find(op) == vmap.end()) {
-        vmap[op] = op->dtype.with_bits(bits);
+      int bits = std::min(op_ty.bits(), bits_);
+      if (auto it = vmap.find(op); it == vmap.end()) {
+        vmap.emplace(op, op_ty.WithBits(bits));
       } else {
-        vmap[op] = op->dtype.with_bits(std::max(vmap[op].bits(), bits));
+        it->second = op_ty.WithBits(std::max(it->second.bits(), bits));
       }
     }
     StmtExprVisitor::VisitExpr_(op);
   }
 
   void VisitExpr_(const CastNode* op) {
-    if (op->dtype.is_int()) {
+    PrimType op_ty = op->ty();
+    if (op_ty.MatchesCode(DLDataTypeCode::kDLInt)) {
       // We only narrow and never promote, so the result dtype
       // is upperbounded by its original dtype before rewrite.
-      int bits = std::min(op->dtype.bits(), bits_);
-      if (vmap.find(op) == vmap.end()) {
-        vmap[op] = op->dtype.with_bits(bits);
+      int bits = std::min(op_ty.bits(), bits_);
+      if (auto it = vmap.find(op); it == vmap.end()) {
+        vmap.emplace(op, op_ty.WithBits(bits));
       } else {
-        vmap[op] = op->dtype.with_bits(std::max(vmap[op].bits(), bits));
+        it->second = op_ty.WithBits(std::max(it->second.bits(), bits));
       }
     }
     StmtExprVisitor::VisitExpr_(op);
   }
 
   // the narrowed datatype of Var and IntImm
-  std::unordered_map<const PrimExprNode*, DataType> vmap;
+  std::unordered_map<const PrimExprNode*, PrimType> vmap;
 
  protected:
   // internal analyzer
@@ -201,7 +204,7 @@ class DataTypeVisitor final : public StmtExprVisitor {
   // the target bits
   int target_bits_;
   // the extent of vars to be rewritten
-  std::unordered_map<const VarNode*, DataType> vextent_;
+  std::unordered_map<const VarNode*, PrimType> vextent_;
   // the memorized bound generated by ConstIntBoundAnalyzer
   arith::ConstIntBoundAnalyzer::BoundMapType bound_;
 };
@@ -215,7 +218,7 @@ class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
     visitor_(s);
     for (auto i = visitor_.vmap.begin(), last = visitor_.vmap.end(); i != last;) {
       PrimExpr e = ffi::GetRef<PrimExpr>(i->first);
-      if (e.dtype() == i->second) {
+      if (e.ty() == i->second) {
         i = visitor_.vmap.erase(i);
       } else {
         ++i;
@@ -243,7 +246,7 @@ class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
   PrimExpr VisitExpr_(const IntImmNode* op) final {
     if (is_enabled_) {
       if (visitor_.vmap.find(op) != visitor_.vmap.end()) {
-        return IntImm(visitor_.vmap[op], op->value);
+        return IntImm(visitor_.vmap.at(op), op->value);
       }
     }
     return Parent::VisitExpr_(op);
@@ -256,8 +259,8 @@ class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
       TVM_FFI_ICHECK(new_op != nullptr) << "Expected type to be CastNode"
                                         << ", but get " << e->GetTypeKey();
       PrimExpr new_value = new_op->value;
-      DataType cast_type = visitor_.vmap[op];
-      if (new_value.dtype() != cast_type) {
+      PrimType cast_type = visitor_.vmap.at(op);
+      if (new_value.ty() != cast_type) {
         new_value = Cast(cast_type, new_value);
       }
       return new_value;
@@ -265,24 +268,24 @@ class NarrowDataTypeRewriter : public IndexDataTypeRewriter {
     return Parent::VisitExpr_(op);
   }
 
-#define TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)             \
-  PrimExpr VisitExpr_(const OP* op) {                                     \
-    PrimExpr a = this->VisitExpr(op->a);                                  \
-    PrimExpr b = this->VisitExpr(op->b);                                  \
-    if (op->a.same_as(a) && op->b.same_as(b) && a.dtype() == b.dtype()) { \
-      return ffi::GetRef<PrimExpr>(op);                                   \
-    } else {                                                              \
-      if (a.dtype() != b.dtype()) {                                       \
-        bool is_enabled = is_enabled_;                                    \
-        is_enabled_ = true;                                               \
-        PrimExpr lhs = this->VisitExpr(op->a);                            \
-        PrimExpr rhs = this->VisitExpr(op->b);                            \
-        is_enabled_ = is_enabled;                                         \
-        return FUNC(lhs, rhs);                                            \
-      } else {                                                            \
-        return FUNC(a, b);                                                \
-      }                                                                   \
-    }                                                                     \
+#define TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(OP, FUNC)       \
+  PrimExpr VisitExpr_(const OP* op) {                               \
+    PrimExpr a = this->VisitExpr(op->a);                            \
+    PrimExpr b = this->VisitExpr(op->b);                            \
+    if (op->a.same_as(a) && op->b.same_as(b) && a.ty() == b.ty()) { \
+      return ffi::GetRef<PrimExpr>(op);                             \
+    } else {                                                        \
+      if (a.ty() != b.ty()) {                                       \
+        bool is_enabled = is_enabled_;                              \
+        is_enabled_ = true;                                         \
+        PrimExpr lhs = this->VisitExpr(op->a);                      \
+        PrimExpr rhs = this->VisitExpr(op->b);                      \
+        is_enabled_ = is_enabled;                                   \
+        return FUNC(lhs, rhs);                                      \
+      } else {                                                      \
+        return FUNC(a, b);                                          \
+      }                                                             \
+    }                                                               \
   }
 
   TVM_DEFINE_BIOP_EXPR_MUTATE_WITH_TYPE_MATCH(AddNode, operator+);
diff --git a/src/tirx/transform/split_host_device.cc b/src/tirx/transform/split_host_device.cc
index d66d1d23f226..42d1fb424801 100644
--- a/src/tirx/transform/split_host_device.cc
+++ b/src/tirx/transform/split_host_device.cc
@@ -141,7 +141,7 @@ class HostDeviceSplitter : public StmtMutator {
         std::sort(params.begin(), params.end(), [](const Var& a, const Var& b) {
           auto sort_key = [](const Var& var) {
             return std::tuple{
-                !var->dtype.is_handle(),
+                !var->ty().IsHandle(),
                 var->name_hint,
             };
           };
@@ -167,10 +167,10 @@ class HostDeviceSplitter : public StmtMutator {
       auto kind = device_target->GetTargetDeviceType();
       return kind == kDLCPU || kind == kDLExtDev || kind == kDLHexagon;
     }();
-    IntImm success(DataType::Int(32), 0);
+    IntImm success(PrimType::Int(32), 0);
     Type kernel_ret_type;
     if (can_propagate_errors) {
-      kernel_ret_type = PrimType(DataType::Int(32));
+      kernel_ret_type = PrimType::Int(32);
       body = SeqStmt::Flatten(body, Evaluate(ret(success)));
     } else {
       kernel_ret_type = VoidType();
@@ -202,14 +202,14 @@ class HostDeviceSplitter : public StmtMutator {
     ffi::Array<PrimExpr> args = params.Map([](const Var& var) -> PrimExpr { return var; });
 
     if (can_propagate_errors) {
-      Var kernel_error_code("kernel_error_code", success->dtype);
-      Call kernel_call(success->dtype, kernel_symbol_global, args);
+      Var kernel_error_code("kernel_error_code", success.ty());
+      Call kernel_call(success.ty(), kernel_symbol_global, args);
       AssertStmt assert_success(kernel_error_code == success, StringImm("RuntimeError"),
                                 {StringImm("Error executing compute kernel")});
       return SeqStmt({Bind(kernel_error_code, kernel_call), assert_success});
 
     } else {
-      return Evaluate(Call(DataType::Void(), kernel_symbol_global, args));
+      return Evaluate(Call(PrimType::Void(), kernel_symbol_global, args));
     }
   }
 
@@ -353,7 +353,7 @@ class DeviceInfoCollector : public StmtVisitor {
       for (const auto& extent : op->buffer->shape) {
         dyn_size *= extent;
       }
-      dyn_size *= op->buffer->dtype.bytes();
+      dyn_size *= (op->buffer->dtype.bits() * op->buffer->dtype.lanes() + 7) / 8;
 
       // Inline any locally-bound variables (e.g. from CSE).
       if (bind_map_.size()) {
@@ -570,7 +570,7 @@ class DeviceKernelMutator : public StmtExprMutator {
         for (const auto& arg : node->args) {
           args.push_back(arg);
         }
-        return Call(node->dtype, builtin::call_extern(), args);
+        return Call(node.ty(), builtin::call_extern(), args);
       }
     }
 
@@ -607,9 +607,9 @@ class DeviceKernelMutator : public StmtExprMutator {
       call_args.push_back(Substitute(launch_arg, param_map));
     }
 
-    auto dtype = node->dtype.is_void() ? DataType::Int(32) : node->dtype;
+    PrimType ret_ty = node.ty().IsVoid() ? PrimType::Int(32) : node.ty();  // void -> Int(32)
 
-    return Call(dtype, builtin::tvm_call_packed(), call_args);
+    return Call(ret_ty, builtin::tvm_call_packed(), call_args);
   }
 
   ffi::Optional<Target> current_target_;
diff --git a/src/tirx/transform/storage_rewrite.cc b/src/tirx/transform/storage_rewrite.cc
index 81f71f9b74ce..02ca714cb474 100644
--- a/src/tirx/transform/storage_rewrite.cc
+++ b/src/tirx/transform/storage_rewrite.cc
@@ -52,6 +52,22 @@ namespace tirx {
 using runtime::StorageRank;
 using runtime::StorageScope;
 
+namespace {  // file-local functors for PrimType
+
+struct PrimTypeHash {  // Hash functor over a PrimType's DLDataType (code, bits, lanes).
+  size_t operator()(const PrimType& ty) const {
+    DLDataType dtype = ty->dtype;  // hash the raw DLPack dtype fields
+    return (static_cast<size_t>(dtype.code) << 24) ^ (static_cast<size_t>(dtype.bits) << 16) ^
+           static_cast<size_t>(dtype.lanes);
+  }
+};
+
+struct PrimTypeEqual {  // Equality functor; forwards to PrimType's operator==.
+  bool operator()(const PrimType& lhs, const PrimType& rhs) const { return lhs == rhs; }
+};
+
+}  // namespace
+
 // Find a linear pattern of storage access
 // Used for liveness analysis.
 // Composite scopes(loop/thread_launch/IfThen) is represented by two points:
@@ -356,7 +372,7 @@ class InplaceOpVerifier : public StmtExprVisitor {
       return;
     }
     if (src_ == buf) {
-      if (store_ == nullptr || store_->value.dtype() != op->dtype) {
+      if (store_ == nullptr || store_->value.ty() != op->ty()) {
         result_ = false;
         return;
       }
@@ -482,7 +498,7 @@ class StoragePlanRewriter : public StmtExprMutator {
   PrimExpr VisitExpr_(const CallNode* op) final {
     if (op->op.same_as(builtin::tvm_access_ptr())) {
       TVM_FFI_ICHECK_EQ(op->args.size(), 5U);
-      DataType dtype = op->args[0].dtype();
+      PrimType dtype = op->args[0].ty();
       const VarNode* buffer = op->args[1].as<VarNode>();
       auto it = alloc_map_.find(buffer);
       if (it == alloc_map_.end()) {
@@ -494,10 +510,10 @@ class StoragePlanRewriter : public StmtExprMutator {
       uint64_t elem_bits = dtype.bits() * dtype.lanes();
       TVM_FFI_ICHECK_EQ(se->bits_offset % elem_bits, 0U);
       if (se->bits_offset != 0) {
-        offset = MakeConst(offset.dtype(), se->bits_offset / elem_bits) + offset;
+        offset = MakeConst(offset.ty(), se->bits_offset / elem_bits) + offset;
       }
-      return Call(op->dtype, op->op, {op->args[0], se->alloc_var, offset, extent, op->args[4]},
-                  op->attrs, op->span);
+      return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op,
+                  {op->args[0], se->alloc_var, offset, extent, op->args[4]}, op->attrs, op->span);
     } else {
       return StmtExprMutator::VisitExpr_(op);
     }
@@ -589,7 +605,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     // The var expr of new allocation.
     Var alloc_var;
     // The allocation element type.
-    DataType elem_type;
+    PrimType elem_type = PrimType::Void();
     // Whether any constituent allocation was marked volatile.
     bool is_volatile{false};
     // This is non-zero if this alloc_buffer is folded into another one
@@ -629,11 +645,11 @@ class StoragePlanRewriter : public StmtExprMutator {
     return body;
   }
   // Remap the index
-  PrimExpr RemapIndex(DataType dtype, PrimExpr index, StorageEntry* e) {
+  PrimExpr RemapIndex(PrimType dtype, PrimExpr index, StorageEntry* e) {
     if (e->bits_offset == 0) return index;
     uint64_t elem_bits = dtype.bits();
     TVM_FFI_ICHECK_EQ(e->bits_offset % elem_bits, 0U);
-    return MakeConst(index.dtype(), e->bits_offset / elem_bits) + index;
+    return MakeConst(index.ty(), e->bits_offset / elem_bits) + index;
   }
   // Prepare the new allocations
   void PrepareNewAlloc() {
@@ -667,7 +683,7 @@ class StoragePlanRewriter : public StmtExprMutator {
           NewAllocTagMerged(e);
           continue;
         }
-        if (e->allocs.size() == 1 && e->allocs[0]->buffer->dtype.is_scalable_vector()) {
+        if (e->allocs.size() == 1 && e->allocs[0]->buffer->dtype.IsScalableVector()) {
           // Scalable vector lanes are runtime-dependent.  Keep these allocations exact rather
           // than trying to compare or merge their compile-time bit size.
           e->alloc_var = e->allocs[0]->buffer->data;
@@ -681,7 +697,7 @@ class StoragePlanRewriter : public StmtExprMutator {
         }
         // Get the allocation size;
         e->alloc_var = e->allocs[0]->buffer->data;
-        DataType alloc_type = e->allocs[0]->buffer->dtype;
+        PrimType alloc_type = e->allocs[0]->buffer->dtype;
         for (const AllocBufferNode* op : e->allocs) {
           if (op->buffer->dtype.lanes() > alloc_type.lanes()) {
             alloc_type = op->buffer->dtype;
@@ -691,7 +707,7 @@ class StoragePlanRewriter : public StmtExprMutator {
         bool all_allocs_identical = std::all_of(
             e->allocs.begin() + 1, e->allocs.end(), [&](const AllocBufferNode* op) -> bool {
               const AllocBufferNode* first = *e->allocs.begin();
-              if (op->buffer->dtype != first->buffer->dtype) {
+              if (op->buffer->dtype->dtype != first->buffer->dtype->dtype) {
                 return false;
               }
               if (op->buffer->shape.size() != first->buffer->shape.size()) {
@@ -789,7 +805,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     }
     uint64_t type_bits = e->elem_type.bits() * e->elem_type.lanes();
     PrimExpr alloc_size =
-        MakeConst(e->allocs[0]->buffer->shape[0].dtype(), (total_bits + type_bits - 1) / type_bits);
+        MakeConst(e->allocs[0]->buffer->shape[0].ty(), (total_bits + type_bits - 1) / type_bits);
     Buffer buf(e->alloc_var, e->elem_type, {alloc_size}, {}, PrimExpr(), e->alloc_var->name_hint, 0,
                0, BufferType::kDefault);
     bool any_volatile = e->is_volatile;
@@ -888,8 +904,8 @@ class StoragePlanRewriter : public StmtExprMutator {
                 StorageEntry* src_entry = alloc_map_.at(src);
                 if (src_entry->scope == storage_scope &&
                     src_entry->attach_scope_ == thread_scope_ &&
-                    !alloc->buffer->dtype.is_scalable_vector() &&
-                    src_entry->elem_type == alloc->buffer->dtype.element_of() &&
+                    !alloc->buffer->dtype.IsScalableVector() &&
+                    src_entry->elem_type == alloc->buffer->dtype.WithLanes(1) &&
                     visitor.Check(s.stmt, var, src)) {
                   int64_t const_size = AllocBuffer(ffi::GetRef<AllocBuffer>(alloc))
                                            .ConstantAllocationSize()
@@ -957,7 +973,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     auto entry = std::make_unique<StorageEntry>();
     entry->attach_scope_ = attach_scope;
     entry->scope = scope;
-    entry->elem_type = op->buffer->dtype.element_of();
+    entry->elem_type = op->buffer->dtype.WithLanes(1);
     entry->const_nbits = const_nbits;
     StorageEntry* e = entry.get();
     alloc_vec_.emplace_back(std::move(entry));
@@ -971,7 +987,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     // skip plan for local variable,
     // compiler can do a better job with register allocation.
     const uint64_t match_range = 16;
-    bool is_scalable_vector = op->buffer->dtype.is_scalable_vector();
+    bool is_scalable_vector = op->buffer->dtype.IsScalableVector();
     uint64_t op_elem_bits =
         is_scalable_vector ? 0 : op->buffer->dtype.bits() * op->buffer->dtype.lanes();
     int64_t const_size =
@@ -991,7 +1007,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     // disable reuse of small arrays, they will be lowered to registers in LLVM
     // This rules only apply if we are using non special memory
     bool is_small_array = (scope.tag.length() == 0) &&
-                          (scope.rank >= StorageRank::kWarp || op->buffer->dtype.is_handle() ||
+                          (scope.rank >= StorageRank::kWarp || op->buffer->dtype.IsHandle() ||
                            (is_known_size && const_nbits <= 32));
 
     if (is_scalable_vector || !enable_reuse || is_small_array || !is_flat_memory_space) {
@@ -1023,7 +1039,7 @@ class StoragePlanRewriter : public StmtExprMutator {
         StorageEntry* e = it->second;
         if (e->attach_scope_ != attach_scope) continue;
         if (e->scope != scope) continue;
-        if (e->elem_type != op->buffer->dtype.element_of()) continue;
+        if (e->elem_type != op->buffer->dtype.WithLanes(1)) continue;
         if (reuse_require_exact_matched_dtype && e->elem_type != op->buffer->dtype) {
           continue;
         }
@@ -1037,7 +1053,7 @@ class StoragePlanRewriter : public StmtExprMutator {
         StorageEntry* e = *it;
         if (e->attach_scope_ != attach_scope) continue;
         if (e->scope != scope) continue;
-        if (e->elem_type != op->buffer->dtype.element_of()) continue;
+        if (e->elem_type != op->buffer->dtype.WithLanes(1)) continue;
         sym_free_list_.erase(it);
         return e;
       }
@@ -1055,8 +1071,8 @@ class StoragePlanRewriter : public StmtExprMutator {
     // This rules only apply if we are using non special memory
     if (e->scope.tag.length() == 0) {
       // Disable sharing of local memory.
-      if (e->scope.rank >= StorageRank::kWarp || e->allocs[0]->buffer->dtype.is_handle() ||
-          e->allocs[0]->buffer->dtype.is_scalable_vector()) {
+      if (e->scope.rank >= StorageRank::kWarp || e->allocs[0]->buffer->dtype.IsHandle() ||
+          e->allocs[0]->buffer->dtype.IsScalableVector()) {
         return;
       }
       // disable reuse of small arrays
@@ -1113,7 +1129,7 @@ struct BufferVarInfo {
   Var var;
 
   // The data type of an element of the buffer.
-  DataType element_dtype;
+  PrimType element_dtype;
 
   /* The extent of the buffer.
    *
@@ -1130,18 +1146,18 @@ struct BufferVarInfo {
   // differ both in base type (e.g. int32* cast to float32* after
   // packing in StorageRewrite) or in number of lanes (e.g. float16*
   // cast to float16x4*).
-  std::unordered_set<DataType> access_dtype;
+  std::unordered_set<PrimType, PrimTypeHash, PrimTypeEqual> access_dtype;
   // Data types used for scalar reads. This is used to record vectorized read dtypes that can be
   // shuffled for scalar reads when rewrite_scalar_read_to_vector_shuffle is enabled.
-  std::unordered_set<DataType> scalar_read_dtype;
+  std::unordered_set<PrimType, PrimTypeHash, PrimTypeEqual> scalar_read_dtype;
 
-  DataType get_preferred_dtype() const {
-    std::unordered_set<DataType> base_access_dtype;
+  PrimType get_preferred_dtype() const {
+    std::unordered_set<PrimType, PrimTypeHash, PrimTypeEqual> base_access_dtype;
     for (auto dtype : access_dtype) {
-      base_access_dtype.insert(dtype.element_of());
+      base_access_dtype.insert(dtype.WithLanes(1));
     }
     for (auto dtype : scalar_read_dtype) {
-      base_access_dtype.insert(dtype.element_of());
+      base_access_dtype.insert(dtype.WithLanes(1));
     }
     // If the array is accessed as multiple base types within a
     // function, no point in changing the declared type.  CodeGenC can
@@ -1152,7 +1168,7 @@ struct BufferVarInfo {
       return element_dtype;
     }
 
-    DataType preferred_base_type = *base_access_dtype.begin();
+    PrimType preferred_base_type = *base_access_dtype.begin();
 
     // If there is only one vectorizable size used to access the
     // buffer, and if that access size is compatible with the array
@@ -1177,7 +1193,7 @@ struct BufferVarInfo {
       }
     }
 
-    return preferred_base_type.with_lanes(preferred_lanes);
+    return preferred_base_type.WithLanes(preferred_lanes);
   }
 };
 
@@ -1208,7 +1224,7 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
     for (auto it : buffer_map) {
       Buffer& buffer = it.second;
       Var buffer_var = buffer->data;
-      DataType dtype = buffer->dtype;
+      PrimType dtype = buffer->dtype;
       PrimExpr extent = buffer->shape.size() ? buffer->shape[buffer->shape.size() - 1] : 0;
       OnArrayDeclaration(buffer_var, dtype, extent, BufferVarInfo::kPrimFuncParam);
     }
@@ -1218,7 +1234,7 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
     for (Var buffer_var : params) {
       auto pointer_type = GetPointerType(buffer_var->type_annotation);
       if (pointer_type.has_value() && (buffer_map.count(buffer_var) == 0)) {
-        DataType dtype = pointer_type.value();
+        PrimType dtype(pointer_type.value());
         PrimExpr extent = 0;
         OnArrayDeclaration(buffer_var, dtype, extent, BufferVarInfo::kPrimFuncBufferMap);
       }
@@ -1226,18 +1242,18 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
   }
 
   void VisitExpr_(const BufferLoadNode* op) final {
-    OnArrayAccess(op->dtype, op->buffer->data.get(), op->indices, /*is_buffer_load=*/true);
+    OnArrayAccess(op->ty(), op->buffer->data.get(), op->indices, /*is_buffer_load=*/true);
     StmtExprVisitor::VisitExpr_(op);
   }
 
   void VisitStmt_(const BufferStoreNode* op) final {
-    OnArrayAccess(op->value.dtype(), op->buffer->data.get(), op->indices, /*is_buffer_load=*/false);
+    OnArrayAccess(op->value.ty(), op->buffer->data.get(), op->indices, /*is_buffer_load=*/false);
     StmtExprVisitor::VisitStmt_(op);
   }
 
   void VisitExpr_(const CallNode* op) final {
     if (op->op.same_as(builtin::tvm_access_ptr())) {
-      DataType dtype = op->args[0].dtype();
+      PrimType dtype = op->args[0].ty();
       const VarNode* buffer = op->args[1].as<VarNode>();
       PrimExpr index = op->args[2];
       // args[1] may be a nested Call (e.g. another tvm_access_ptr) rather
@@ -1248,7 +1264,7 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
       }
     } else if (op->op.same_as(builtin::address_of())) {
       BufferLoad load = op->args[0].as_or_throw<BufferLoad>();
-      OnArrayAccess(load->dtype, load->buffer->data.get(), load->indices, /*is_buffer_load=*/false);
+      OnArrayAccess(load->ty(), load->buffer->data.get(), load->indices, /*is_buffer_load=*/false);
     }
     StmtExprVisitor::VisitExpr_(op);
   }
@@ -1273,12 +1289,12 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
   }
 
   void HandleLetNode(Var let_var) {
-    if (let_var->dtype.is_handle()) {
+    if (let_var.ty().IsHandle()) {
       auto pointer_type = GetPointerType(let_var->type_annotation);
       if (pointer_type.has_value()) {
-        OnArrayDeclaration(let_var, pointer_type.value(), 0, BufferVarInfo::kLetNode);
+        OnArrayDeclaration(let_var, PrimType(pointer_type.value()), 0, BufferVarInfo::kLetNode);
       } else if (allow_untyped_pointers_) {
-        OnArrayDeclaration(let_var, let_var->dtype, 0, BufferVarInfo::kLetNode);
+        OnArrayDeclaration(let_var, let_var.ty(), 0, BufferVarInfo::kLetNode);
       } else {
         TVM_FFI_THROW(InternalError) << "Let statement of variable " << let_var->name_hint
                                      << " is missing a type annotation, "
@@ -1300,15 +1316,16 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
    * @param declaration_location How the buffer was allocated, so that
    * some locations can be rewritten without others.
    */
-  void OnArrayDeclaration(Var buffer, DataType element_dtype, PrimExpr extent,
+  void OnArrayDeclaration(Var buffer, PrimType element_dtype, PrimExpr extent,
                           BufferVarInfo::DeclarationLocation declaration_location) {
     TVM_FFI_ICHECK(info_map_.find(buffer.get()) == info_map_.end())
         << "Array declaration of " << buffer->name_hint << " occurred multiple times.";
 
-    if (element_dtype == DataType::Bool()) {
-      element_dtype = DataType::Int(8).with_lanes(element_dtype.lanes());
+    if (element_dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
+      element_dtype = PrimType::Int(8, element_dtype.lanes());  // bool -> int8, lanes kept
     }
-    info_map_[buffer.get()] = BufferVarInfo{buffer, element_dtype, extent, declaration_location};
+    info_map_.insert_or_assign(buffer.get(),
+                               BufferVarInfo{buffer, element_dtype, extent, declaration_location});
   }
 
   /* Update the type map for a buffer based on its usage
@@ -1322,13 +1339,13 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
    *
    * @param is_buffer_load Whether the access is BufferLoad
    */
-  void OnArrayAccess(DataType value_dtype, const VarNode* buffer,
+  void OnArrayAccess(PrimType value_dtype, const VarNode* buffer,
                      const ffi::Array<PrimExpr>& indices, bool is_buffer_load) {
     auto it = info_map_.find(buffer);
     TVM_FFI_ICHECK(it != info_map_.end()) << "Load/Store of buffer " << buffer->name_hint << " ("
                                           << buffer << ") occurred before its declaration.";
 
-    if (value_dtype.is_scalable_vector()) {
+    if (value_dtype.IsScalableVector()) {
       // Scalable types are not currently supported in storage_rewrite. Scalable buffer
       // accesses are not currently checked and therefore are not rewritten.
       return;
@@ -1336,24 +1353,24 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
 
     BufferVarInfo& var_info = it->second;
 
-    if (value_dtype.element_of() == DataType::Bool()) {
-      value_dtype = DataType::Int(8).with_lanes(value_dtype.lanes());
+    if (value_dtype.WithLanes(1).MatchesCode(DLDataTypeCode::kDLBool)) {
+      value_dtype = PrimType::Int(8, value_dtype.lanes());
     }
 
-    if (var_info.element_dtype.is_handle()) {
+    if (var_info.element_dtype.IsHandle()) {
       TVM_FFI_ICHECK(allow_untyped_pointers_)
           << "Variable " << buffer->name_hint
           << " was missing a type annotation in its declaration";
-      var_info.element_dtype = value_dtype.element_of();
+      var_info.element_dtype = value_dtype.WithLanes(1);
     }
 
     for (int i = 0; i < static_cast<int>(indices.size()) - 1; i++) {
-      TVM_FFI_ICHECK(indices[i].dtype().is_scalar())
+      TVM_FFI_ICHECK(indices[i].ty().IsScalar())
           << "Only the last index of a buffer access may be a vector type.";
     }
-    int index_lanes = indices.size() ? indices.back().dtype().lanes() : 1;
+    int index_lanes = indices.size() ? indices.back().ty().lanes() : 1;
 
-    DataType access_dtype = value_dtype;
+    PrimType access_dtype = value_dtype;
 
     int lanes_used = var_info.element_dtype.lanes();
 
@@ -1366,7 +1383,7 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
     if (index_lanes * var_info.element_dtype.lanes() != value_dtype.lanes()) {
       TVM_FFI_ICHECK_EQ(index_lanes, value_dtype.lanes());
       lanes_used = 1;
-      var_info.element_dtype = var_info.element_dtype.with_lanes(1);
+      var_info.element_dtype = var_info.element_dtype.WithLanes(1);
     }
 
     // TODO(Lunderberg): Uncomment this check once it can be applied.
@@ -1399,13 +1416,13 @@ class VectorTypeAccessChecker : public StmtExprVisitor {
 
     if (detect_scalar_read_patterns_ && is_buffer_load && indices.size()) {
       const PrimExpr last_dim_index = indices[indices.size() - 1];
-      if (last_dim_index.dtype().lanes() == 1) {
+      if (last_dim_index.ty().lanes() == 1) {
         arith::ModularSet me = analyzer_->modular_set(last_dim_index);
-        var_info.scalar_read_dtype.emplace(access_dtype.with_lanes(me->coeff));
+        var_info.scalar_read_dtype.emplace(access_dtype.WithLanes(me->coeff));
         return;
       }
     }
-    var_info.access_dtype.insert(access_dtype.with_lanes(lanes_used));
+    var_info.access_dtype.insert(access_dtype.WithLanes(lanes_used));
   }
 
   // Map of buffer variable information determined
@@ -1488,15 +1505,16 @@ class VectorTypeRewriter : public StmtExprMutator {
     // Rewrite any buffer variables whose preferred type isn't their current type.
     for (const auto& pair : info_map) {
       const auto& var_info = pair.second;
-      DataType preferred = var_info.get_preferred_dtype();
+      PrimType preferred = var_info.get_preferred_dtype();
       if (preferred != var_info.element_dtype && (rewrite_mask & var_info.declaration_location)) {
         Var old_buffer_var = var_info.var;
         Var new_buffer_var(old_buffer_var->name_hint,
-                           PointerType(PrimType(preferred), GetPtrStorageScope(old_buffer_var)),
+                           PointerType(preferred, GetPtrStorageScope(old_buffer_var)),
                            old_buffer_var->span);
 
-        rewrite_map_[var_info.var.get()] = {var_info.var, new_buffer_var, var_info.element_dtype,
-                                            preferred};
+        rewrite_map_.insert_or_assign(
+            var_info.var.get(),
+            RewriteInfo{var_info.var, new_buffer_var, var_info.element_dtype, preferred});
       }
     }
   }
@@ -1523,7 +1541,7 @@ class VectorTypeRewriter : public StmtExprMutator {
     const PrimExpr& last_dim_index = indices[indices.size() - 1];
     const RampNode* ramp_index = indices[indices.size() - 1].as<RampNode>();
 
-    if (node->buffer->dtype.is_scalable_vector() || last_dim_index.dtype().is_scalable_vector()) {
+    if (node->buffer->dtype.IsScalableVector() || last_dim_index.ty().IsScalableVector()) {
       // Scalable types are not currently supported in storage_rewrite. Scalable buffer
       // accesses are not currently checked and therefore are not rewritten.
       return {node, shuffle_index};
@@ -1531,17 +1549,17 @@ class VectorTypeRewriter : public StmtExprMutator {
 
     if (ramp_index && is_one(ramp_index->stride) && ramp_index->lanes->IsInstance<IntImmNode>()) {
       int lanes = static_cast<int>(ramp_index->lanes.as_or_throw<IntImm>()->value);
-      PrimExpr new_index = ramp_index->base / MakeConst(ramp_index->base.dtype(), lanes);
+      PrimExpr new_index = ramp_index->base / MakeConst(ramp_index->base.ty(), lanes);
       if (lanes != info.factor()) {
         TVM_FFI_ICHECK(info.factor() && lanes % info.factor() == 0);
         int new_lanes = lanes / info.factor();
         new_index = Ramp(new_index * new_lanes, ramp_index->stride, new_lanes, ramp_index->span);
       }
       indices.Set(indices.size() - 1, new_index);
-    } else if (last_dim_index.dtype().lanes() == 1 && info.factor() > 1) {
+    } else if (last_dim_index.ty().lanes() == 1 && info.factor() > 1) {
       arith::ModularSet me = analyzer_->modular_set(last_dim_index);
       TVM_FFI_ICHECK(me->coeff == 0 || info.factor() % me->coeff == 0);
-      PrimExpr new_index = last_dim_index / MakeConst(last_dim_index.dtype(), info.factor());
+      PrimExpr new_index = last_dim_index / MakeConst(last_dim_index.ty(), info.factor());
       shuffle_index = me->base % info.factor();
       indices.Set(indices.size() - 1, new_index);
     }
@@ -1612,7 +1630,7 @@ class VectorTypeRewriter : public StmtExprMutator {
 
       ffi::Array<PrimExpr> shape = buf->shape;
       PrimExpr last_dim = shape[shape.size() - 1];
-      shape.Set(shape.size() - 1, last_dim / MakeConst(last_dim.dtype(), info.factor()));
+      shape.Set(shape.size() - 1, last_dim / MakeConst(last_dim.ty(), info.factor()));
 
       auto writer = buf.CopyOnWrite();
       writer->data = info.new_buffer_var;
@@ -1647,13 +1665,13 @@ class VectorTypeRewriter : public StmtExprMutator {
 
       PrimExpr e_dtype = tirx::TypeAnnotation(info.new_element_dtype);
       int factor = info.factor();
-      extent = extent / MakeConst(extent.dtype(), factor);
-      index = index / MakeConst(index.dtype(), factor);
+      extent = extent / MakeConst(extent.ty(), factor);
+      index = index / MakeConst(index.ty(), factor);
       ffi::Array<PrimExpr> acc_args{e_dtype, info.new_buffer_var, index, extent, flag};
       // tvm_access_ptr produces a pointer; its Call.dtype must be handle
       // (the lowering rule in src/target/intrin_rule.cc ICHECKs this).
       // The element dtype is conveyed via the first arg (e_dtype marker).
-      return Call(DataType::Handle(), builtin::tvm_access_ptr(), acc_args);
+      return Call(PrimType::Handle(), builtin::tvm_access_ptr(), acc_args);
 
     } else {
       return StmtExprMutator::VisitExpr_(op);
@@ -1710,8 +1728,8 @@ class VectorTypeRewriter : public StmtExprMutator {
   struct RewriteInfo {
     Var old_buffer_var;
     Var new_buffer_var;
-    DataType old_element_dtype;
-    DataType new_element_dtype;
+    PrimType old_element_dtype;
+    PrimType new_element_dtype;
 
     int factor() const {
       int old_lanes = old_element_dtype.lanes();
diff --git a/src/tirx/transform/tile_primitive_dispatch.cc b/src/tirx/transform/tile_primitive_dispatch.cc
index 6052adcdc7ac..213264b1a2ae 100644
--- a/src/tirx/transform/tile_primitive_dispatch.cc
+++ b/src/tirx/transform/tile_primitive_dispatch.cc
@@ -605,7 +605,7 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
     // Synthesize the warp_id_in_cta helper (CUDA only) when threadIdx is set.
     if (launch_params_.count("threadIdx.x") > 0) {
       PrimExpr shuffled = ScopeIdResolve::ComputeWarpIdInCta(launch_params_);
-      Var warp_id_in_cta_var("warp_id_in_cta", shuffled.dtype());
+      Var warp_id_in_cta_var("warp_id_in_cta", shuffled.ty());
       scope_binds->push_back({warp_id_in_cta_var, shuffled});
       IterVar warp_iv(Range::FromMinExtent(0, 1), warp_id_in_cta_var, kThreadIndex,
                       "warp_id_in_cta");
@@ -664,8 +664,8 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
         // to map Vars back to their ScopeBinding.
         Var bind_var = def->def_ids[i];
         PrimExpr value = resolved[i];
-        if (bind_var->dtype != value.dtype()) {
-          value = Cast(bind_var->dtype, value);
+        if (bind_var.ty() != value.ty()) {
+          value = Cast(bind_var.ty(), value);
         }
         scope_binds->push_back({bind_var, value});
         if (is_implicit(bind_var)) {
@@ -1157,8 +1157,8 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
         << "TIRxError: tirx.filter expects (var, cond); got " << call->args.size() << " args";
     auto target = ResolveScopeIdTarget(call->args[0]);
     if (target && ElectSyncFinder::Contains(call->args[1])) {
-      PrimExpr selector = tirx::Call(call->args[0].dtype(), tirx::builtin::selector(),
-                                     {call->args[0], call->args[1]});
+      PrimExpr selector =
+          tirx::Call(call->args[0].ty(), tirx::builtin::selector(), {call->args[0], call->args[1]});
       int pushed = TryPushSelectorForTarget(*target, selector) ? 1 : 0;
       return pushed + PushPredicateCtx(call->args[1]);
     }
@@ -1269,7 +1269,7 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
         auto lane = FindLaneScopeVar();
         if (!lane) return -1;
         ScopeIdTarget target{ScopeBinding::kWarpThread, 0, 1};
-        PrimExpr selector = tirx::Call(lane->dtype(), tirx::builtin::selector(), {*lane, cond});
+        PrimExpr selector = tirx::Call(lane->ty(), tirx::builtin::selector(), {*lane, cond});
         return TryPushSelectorForTarget(target, selector) ? 1 : 0;
       }
       return -1;
@@ -1337,7 +1337,7 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
     if (!lane) return false;
     ScopeIdTarget target{ScopeBinding::kWarpThread, 0, 1};
     PrimExpr selector =
-        tirx::Call(lane->dtype(), tirx::builtin::selector(), {*lane, atom.elect_sync_call});
+        tirx::Call(lane->ty(), tirx::builtin::selector(), {*lane, atom.elect_sync_call});
     return TryPushSelectorForTarget(target, selector);
   }
 
@@ -1399,17 +1399,18 @@ class TilePrimitiveDispatcher : public StmtExprMutator {
         args.push_back(new_arg);
       }
       if (changed) {
-        return tirx::Call(call->dtype, call->op, args, call->attrs, call->span);
+        return tirx::Call(call->ty(), call->op, args, call->attrs, call->span);
       }
     }
     return pred;
   }
 
   PrimExpr AsBool(PrimExpr pred) const {
-    if (pred.dtype().is_bool()) {
+    PrimType pred_ty = pred.ty();  // queried once; used for both the check and the zero literal
+    if (pred_ty.MatchesCode(DLDataTypeCode::kDLBool)) {
       return pred;
     }
-    return pred != IntImm(pred.dtype(), 0);
+    return pred != IntImm(pred_ty, 0);  // non-bool predicate: compare against a same-typed zero
   }
 
   ffi::Map<Var, Range> var_range_map_;
diff --git a/src/tirx/transform/tvm_ffi_binder.cc b/src/tirx/transform/tvm_ffi_binder.cc
index ba0e671dd94b..2535e0db15e0 100644
--- a/src/tirx/transform/tvm_ffi_binder.cc
+++ b/src/tirx/transform/tvm_ffi_binder.cc
@@ -69,10 +69,10 @@ TVMFFIABIBuilder::TVMFFIABIBuilder(const ffi::String& func_name, const ffi::Arra
         shape_os << buf->shape[j];
         os << shape_os.str();
       }
-      os << "], " << buf->dtype << ")";
+      os << "], " << buf->dtype->dtype << ")";
       param_names_[static_cast<int>(i)] = buf_name;
     } else {
-      os << param->name_hint << ": " << param.dtype();
+      os << param->name_hint << ": " << param.ty()->dtype;
       param_names_[static_cast<int>(i)] = param->name_hint;
     }
   }
@@ -87,7 +87,7 @@ TVMFFIABIBuilder::TVMFFIABIBuilder(const ffi::String& func_name, const ffi::Arra
 
   // Emit null-pointer check for packed args (early check)
   if (num_args > 0) {
-    EmitAssert(!Call(DataType::Bool(), builtin::isnullptr(), {v_packed_args}),
+    EmitAssert(!Call(PrimType::Bool(), builtin::isnullptr(), {v_packed_args}),
                "TypeError",  //
                "args pointer is NULL", when_calling_imm_, sig_imm_, "`");
   }
@@ -163,7 +163,7 @@ int TVMFFIABIBuilder::GetParamIndex(const ffi::reflection::AccessPath& path) con
 
 bool TVMFFIABIBuilder::BindScalar(const PrimExpr& arg, const PrimExpr& value,
                                   const ffi::reflection::AccessPath& path, bool with_lets) {
-  TVM_FFI_ICHECK_EQ(arg.dtype(), value.dtype());
+  TVM_FFI_ICHECK(arg.ty()->dtype == value.ty()->dtype);
   if (arg.as<VarNode>()) {
     Var v_arg = arg.as_or_throw<Var>();
     auto it = var_defs_.find(v_arg.get());
@@ -368,8 +368,8 @@ void TVMFFIABIBuilder::BindBuffer(const Buffer& arg, const Buffer& value,
     if (BindScalar(arg->elem_offset, value->elem_offset, offset_path, false)) {
       if (arg->offset_factor > 1) {
         PrimExpr offset = value->elem_offset;
-        PrimExpr factor = IntImm(offset.dtype(), arg->offset_factor);
-        PrimExpr zero = IntImm(offset.dtype(), 0);
+        PrimExpr factor = IntImm(offset.ty(), arg->offset_factor);
+        PrimExpr zero = IntImm(offset.ty(), 0);
         PrimExpr acond = analyzer_->Simplify(truncmod(offset, factor) == zero);
         if (is_zero(acond)) {
           TVM_FFI_THROW(InternalError)
@@ -377,7 +377,9 @@ void TVMFFIABIBuilder::BindBuffer(const Buffer& arg, const Buffer& value,
         }
         if (!is_one(acond)) {
           int param_index = GetParamIndex(base_path);
-          int data_bytes = GetVectorBytes(arg->dtype);
+          int data_bytes =
+              ((((arg->dtype->dtype).bits * static_cast<int16_t>((arg->dtype->dtype).lanes)) + 7) /
+               8);
           EmitAssert(acond, "ValueError",  //
                      "Misaligned buffer data on argument #", std::to_string(param_index),
                      when_calling_imm_, sig_imm_, "`,\n  expected data alignment=",
@@ -422,12 +424,12 @@ void TVMFFIABIBuilder::BindBuffer(const Buffer& arg, const Buffer& value,
 
 /*! \brief Load the i-th packed argument as the given type. */
 PrimExpr TVMFFIABIBuilder::LoadTVMFFIAnyUnionValue(const Var& v_packed_args, int param_index,
-                                                   DataType arg_type) {
+                                                   PrimType arg_type) {
   ffi::Array<PrimExpr> call_args{v_packed_args, IntImm::Int32(param_index),
                                  IntImm::Int32(builtin::kTVMFFIAnyUnionValue)};
-  DataType api_type = APIType(arg_type);
+  PrimType api_type = APIType(arg_type);
   PrimExpr res = Call(api_type, builtin::tvm_struct_get(), call_args);
-  if (api_type != arg_type) {
+  if (api_type->dtype != arg_type->dtype) {
     res = Cast(arg_type, res);
   }
   return res;
@@ -447,8 +449,8 @@ PrimExpr TVMFFIABIBuilder::DecodeParamOpaqueHandle(int param_index, const Var& t
   const int64_t object_cell_offset = sizeof(TVMFFIObject);
   static_assert(sizeof(TVMFFIObject) == 24);
   PrimExpr arg_value =
-      LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, params_[param_index].dtype());
-  PrimExpr handle_from_tensor = Call(DataType::Handle(), tirx::builtin::handle_add_byte_offset(),
+      LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, params_[param_index].ty());
+  PrimExpr handle_from_tensor = Call(PrimType::Handle(), tirx::builtin::handle_add_byte_offset(),
                                      {arg_value, IntImm::Int32(object_cell_offset)});
   return Select(type_index == ffi::TypeIndex::kTVMFFITensor, handle_from_tensor, arg_value);
 }
@@ -459,11 +461,11 @@ PrimExpr TVMFFIABIBuilder::DecodeParamBool(int param_index, const Var& type_inde
       param_index,
       type_index == ffi::TypeIndex::kTVMFFIBool || type_index == ffi::TypeIndex::kTVMFFIInt,
       "boolean");
-  return Cast(DataType::Bool(),
-              LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, DataType::Int(64)));
+  return Cast(PrimType::Bool(),
+              LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, PrimType::Int(64)));
 }
 
-PrimExpr TVMFFIABIBuilder::DecodeParamInt(int param_index, const Var& type_index, DataType dtype) {
+PrimExpr TVMFFIABIBuilder::DecodeParamInt(int param_index, const Var& type_index, PrimType dtype) {
   // ── Type check: accept int or bool ─────────────────────────
   EmitTypeIndexCheck(
       param_index,
@@ -472,7 +474,7 @@ PrimExpr TVMFFIABIBuilder::DecodeParamInt(int param_index, const Var& type_index
 }
 
 PrimExpr TVMFFIABIBuilder::DecodeParamFloat(int param_index, const Var& type_index,
-                                            DataType dtype) {
+                                            PrimType dtype) {
   // ── Type check: accept float, int, or bool ─────────────────
   EmitTypeIndexCheck(param_index,
                      type_index == ffi::TypeIndex::kTVMFFIFloat ||
@@ -483,7 +485,7 @@ PrimExpr TVMFFIABIBuilder::DecodeParamFloat(int param_index, const Var& type_ind
       type_index == ffi::TypeIndex::kTVMFFIFloat,
       /* true_value = */ LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, dtype),
       /* false_value = */
-      Cast(dtype, LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, DataType::Int(64))));
+      Cast(dtype, LoadTVMFFIAnyUnionValue(v_packed_args_, param_index, PrimType::Int(64))));
 }
 
 // ============================================================
@@ -492,24 +494,24 @@ PrimExpr TVMFFIABIBuilder::DecodeParamFloat(int param_index, const Var& type_ind
 
 void TVMFFIABIBuilder::DecodeParam(int param_index) {
   Var param = params_[param_index];
-  DataType dtype = param.dtype();
+  PrimType dtype = param.ty();
 
   // Extract type_index from packed_args
-  Var type_index(param->name_hint + ".type_index", DataType::Int(32));
-  init_nest_.push_back(Bind(type_index, tirx::Call(DataType::Int(32), builtin::tvm_struct_get(),
+  Var type_index(param->name_hint + ".type_index", PrimType::Int(32));
+  init_nest_.push_back(Bind(type_index, tirx::Call(PrimType::Int(32), builtin::tvm_struct_get(),
                                                    {v_packed_args_, IntImm::Int32(param_index),
                                                     IntImm::Int32(builtin::kTVMFFIAnyTypeIndex)})));
 
   // Type-check and load value via per-dtype dispatch
   PrimExpr arg_value;
-  if (dtype.is_handle()) {
+  if (dtype.IsHandle()) {
     arg_value = DecodeParamOpaqueHandle(param_index, type_index);
-  } else if (dtype.is_bool()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
     arg_value = DecodeParamBool(param_index, type_index);
-  } else if (dtype.is_int() || dtype.is_uint()) {
+  } else if (dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
     arg_value = DecodeParamInt(param_index, type_index, dtype);
   } else {
-    TVM_FFI_ICHECK(dtype.is_float());
+    TVM_FFI_ICHECK_EQ(dtype.code(), DLDataTypeCode::kDLFloat);
     arg_value = DecodeParamFloat(param_index, type_index, dtype);
   }
 
@@ -553,9 +555,9 @@ void TVMFFIABIBuilder::DecodeAllParams() {
 
 Var TVMFFIABIBuilder::DLTensorGetFieldPtr(const Var& handle, int field_kind,
                                           const std::string& var_name) {
-  Var ptr(var_name, DataType::Handle());
+  Var ptr(var_name, PrimType::Handle());  // bound below to the requested struct field of `handle`
   init_nest_.emplace_back(
-      Bind(ptr, TVMStructGet(DataType::Handle(), handle, 0,
+      Bind(ptr, TVMStructGet(PrimType::Handle(), handle, 0,
                              static_cast<builtin::TVMStructFieldKind>(field_kind))));
   return ptr;
 }
@@ -565,7 +567,7 @@ Var TVMFFIABIBuilder::DLTensorGetFieldPtr(const Var& handle, int field_kind,
 // ============================================================
 
 PrimExpr TVMFFIABIBuilder::LoadInt64ArrayElem(const Var& ptr, int index) {
-  return TVMStructGet(DataType::ShapeIndex(), ptr, index, builtin::kInt64ArrayElem);
+  return TVMStructGet(DefaultIndexPrimType(), ptr, index, builtin::kInt64ArrayElem);
 }
 
 // ============================================================
@@ -575,7 +577,7 @@ PrimExpr TVMFFIABIBuilder::LoadInt64ArrayElem(const Var& ptr, int index) {
 void TVMFFIABIBuilder::BindCompactStrides(const Buffer& buffer, const Var& strides_ptr,
                                           const PrimExpr& v_strides_is_null,
                                           const ffi::reflection::AccessPath& param_path) {
-  DataType stype = buffer->DefaultIndexType();
+  PrimType stype(buffer->DefaultIndexType());
   PrimExpr expect_stride = MakeConst(stype, 1);
   ffi::Array<PrimExpr> conds;
   for (size_t i = buffer->shape.size(); i != 0; --i) {
@@ -602,11 +604,11 @@ void TVMFFIABIBuilder::BindCompactStrides(const Buffer& buffer, const Var& strid
 void TVMFFIABIBuilder::BindAutoBroadcastStrides(const Buffer& buffer, const Var& strides_ptr,
                                                 const PrimExpr& v_strides_is_null,
                                                 const ffi::reflection::AccessPath& param_path) {
-  DataType stype = buffer->DefaultIndexType();
+  PrimType stype(buffer->DefaultIndexType());
   PrimExpr stride = MakeConst(stype, 1);
   for (size_t i = buffer->shape.size(); i != 0; --i) {
     size_t k = i - 1;
-    PrimExpr value = cast(buffer->shape[k].dtype(), LoadInt64ArrayElem(strides_ptr, k));
+    PrimExpr value = cast(buffer->shape[k].ty(), LoadInt64ArrayElem(strides_ptr, k));
     value = tvm::if_then_else(v_strides_is_null, stride, value);
     value = tvm::if_then_else(buffer->shape[k] == 1, 0, value);
     ffi::reflection::AccessPath strides_k_path =
@@ -621,13 +623,13 @@ void TVMFFIABIBuilder::BindRegularStrides(const Buffer& buffer, const Var& strid
                                           const ffi::reflection::AccessPath& param_path) {
   PrimExpr stride_from_shape = 1;
   for (int k = buffer->strides.size() - 1; k >= 0; k--) {
-    PrimExpr explicit_stride = cast(buffer->shape[k].dtype(), LoadInt64ArrayElem(strides_ptr, k));
+    PrimExpr explicit_stride = cast(buffer->shape[k].ty(), LoadInt64ArrayElem(strides_ptr, k));
     ffi::reflection::AccessPath strides_k_path =
         param_path->Attr(ffi::String("strides"))->ArrayItem(k);
     BindScalar(buffer->strides[k],
                tvm::if_then_else(v_strides_is_null, stride_from_shape, explicit_stride),
                strides_k_path, true);
-    stride_from_shape *= cast(buffer->shape[k].dtype(), LoadInt64ArrayElem(shape_ptr, k));
+    stride_from_shape *= cast(buffer->shape[k].ty(), LoadInt64ArrayElem(shape_ptr, k));
   }
 }
 
@@ -639,14 +641,14 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
                                            const PrimExpr& device_id, const Var& handle,
                                            const std::string& arg_name,
                                            ffi::reflection::AccessPath base_path) {
-  const DataType tvm_ndim_type = DataType::Int(32);
+  const PrimType tvm_ndim_type = PrimType::Int(32);
 
   std::string buf_name = buffer->name;
   ffi::reflection::AccessPath param_path = base_path;
   int param_index = GetParamIndex(base_path);
 
   // ── Section: Null pointer check ──────────────────────────────
-  EmitTypeIndexCheck(param_index, !Call(DataType::Bool(), builtin::isnullptr(), {handle}),
+  EmitTypeIndexCheck(param_index, !Call(PrimType::Bool(), builtin::isnullptr(), {handle}),
                      "Tensor");
 
   // ── Section: ndim ────────────────────────────────────────────
@@ -658,16 +660,21 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
 
   // ── Section: dtype ───────────────────────────────────────────
   {
-    PrimExpr cond = (TVMStructGet(DataType::UInt(8), handle, 0, builtin::kDLTensorTypeCode) ==
-                         IntImm(DataType::UInt(8), buffer->dtype.code()) &&
-                     TVMStructGet(DataType::UInt(8), handle, 0, builtin::kDLTensorTypeBits) ==
-                         IntImm(DataType::UInt(8), buffer->dtype.bits()) &&
-                     TVMStructGet(DataType::UInt(16), handle, 0, builtin::kDLTensorTypeLanes) ==
-                         IntImm(DataType::UInt(16), buffer->dtype.lanes()));
-    if (!(buffer->dtype == DataType::Int(1) || buffer->dtype == DataType::Int(4) ||
-          buffer->dtype == DataType::UInt(4))) {
+    PrimExpr code_matches =
+        TVMStructGet(PrimType::UInt(8), handle, 0, builtin::kDLTensorTypeCode) ==
+        IntImm(PrimType::UInt(8), buffer->dtype.code());
+    PrimExpr bits_matches =
+        TVMStructGet(PrimType::UInt(8), handle, 0, builtin::kDLTensorTypeBits) ==
+        IntImm(PrimType::UInt(8), buffer->dtype.bits());
+    PrimExpr lanes_matches =
+        TVMStructGet(PrimType::UInt(16), handle, 0, builtin::kDLTensorTypeLanes) ==
+        IntImm(PrimType::UInt(16), buffer->dtype.lanes());
+    PrimExpr cond = cast(PrimType::Bool(), code_matches) && cast(PrimType::Bool(), bits_matches) &&
+                    cast(PrimType::Bool(), lanes_matches);
+    if (!(buffer->dtype == PrimType::Int(1) || buffer->dtype == PrimType::Int(4) ||
+          buffer->dtype == PrimType::UInt(4))) {
       std::ostringstream dtype_os;
-      dtype_os << buffer->dtype;
+      dtype_os << buffer->dtype->dtype;
       EmitAssert(cond, "TypeError",  //
                  "Mismatched ", buf_name, ".dtype on argument #", std::to_string(param_index),
                  when_calling_imm_, sig_imm_, "`,\n  expected ", dtype_os.str());
@@ -677,18 +684,18 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
   // ── Section: shape ───────────────────────────────────────────
   Var shape_ptr = DLTensorGetFieldPtr(handle, builtin::kDLTensorShape, arg_name + "_shape");
   for (size_t k = 0; k < buffer->shape.size(); ++k) {
-    if (buffer->dtype == DataType::Int(4) || buffer->dtype == DataType::UInt(4) ||
-        buffer->dtype == DataType::Int(1)) {
+    if (buffer->dtype == PrimType::Int(4) || buffer->dtype == PrimType::UInt(4) ||
+        buffer->dtype == PrimType::Int(1)) {
       break;
     }
     ffi::reflection::AccessPath shape_k_path = param_path->Attr(ffi::String("shape"))->ArrayItem(k);
-    BindScalar(buffer->shape[k], cast(buffer->shape[k].dtype(), LoadInt64ArrayElem(shape_ptr, k)),
+    BindScalar(buffer->shape[k], cast(buffer->shape[k].ty(), LoadInt64ArrayElem(shape_ptr, k)),
                shape_k_path, true);
   }
 
   // ── Section: strides ─────────────────────────────────────────
   Var strides_ptr = DLTensorGetFieldPtr(handle, builtin::kDLTensorStrides, arg_name + "_strides");
-  PrimExpr v_strides_is_null = Call(DataType::Bool(), builtin::isnullptr(), {strides_ptr});
+  PrimExpr v_strides_is_null = Call(PrimType::Bool(), builtin::isnullptr(), {strides_ptr});
   if (buffer->strides.size() == 0) {
     BindCompactStrides(buffer, strides_ptr, v_strides_is_null, param_path);
   } else if (buffer->buffer_type == kAutoBroadcast) {
@@ -698,22 +705,22 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
   }
 
   // ── Section: byte_offset ─────────────────────────────────────
-  int data_bytes = GetVectorBytes(buffer->dtype);
+  int data_bytes = ((buffer->dtype.bits() * buffer->dtype.lanes()) + 7) / 8;
   ffi::reflection::AccessPath byte_offset_path = param_path->Attr(ffi::String("byte_offset"));
   if (const auto* const_offset = buffer->elem_offset.as<IntImmNode>()) {
-    BindScalar(IntImm(DataType::UInt(64), const_offset->value * data_bytes),
-               TVMStructGet(DataType::UInt(64), handle, 0, builtin::kDLTensorByteOffset),
+    BindScalar(IntImm(PrimType::UInt(64), const_offset->value * data_bytes),
+               TVMStructGet(PrimType::UInt(64), handle, 0, builtin::kDLTensorByteOffset),
                byte_offset_path, true);
   } else {
     if (BindScalar(buffer->elem_offset,
-                   cast(buffer->elem_offset.dtype(),
-                        (TVMStructGet(DataType::UInt(64), handle, 0, builtin::kDLTensorByteOffset) /
-                         MakeConst(DataType::UInt(64), data_bytes))),
+                   cast(buffer->elem_offset.ty(),
+                        (TVMStructGet(PrimType::UInt(64), handle, 0, builtin::kDLTensorByteOffset) /
+                         MakeConst(PrimType::UInt(64), data_bytes))),
                    byte_offset_path, true)) {
       if (buffer->offset_factor > 1) {
         PrimExpr offset = buffer->elem_offset;
-        PrimExpr factor = IntImm(offset.dtype(), buffer->offset_factor);
-        PrimExpr zero = IntImm(offset.dtype(), 0);
+        PrimExpr factor = IntImm(offset.ty(), buffer->offset_factor);
+        PrimExpr zero = IntImm(offset.ty(), 0);
         PrimExpr acond = analyzer_->Simplify(truncmod(offset, factor) == zero);
         if (is_zero(acond)) {
           TVM_FFI_THROW(InternalError)
@@ -732,7 +739,7 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
   // ── Section: device ──────────────────────────────────────────
   {
     PrimExpr actual_device_type =
-        TVMStructGet(DataType::Int(32), handle, 0, builtin::kDLTensorDeviceType);
+        TVMStructGet(PrimType::Int(32), handle, 0, builtin::kDLTensorDeviceType);
     // Use custom assertion for device_type to show human-readable device name
     if (const auto* const_dt = device_type_.as<IntImmNode>()) {
       PrimExpr cond = analyzer_->Simplify(IntImm::Int32(const_dt->value) == actual_device_type);
@@ -748,7 +755,7 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
       BindScalar(device_type_, actual_device_type, device_type_path, true);
     }
     ffi::reflection::AccessPath device_id_path = param_path->Attr(ffi::String("device_id"));
-    BindScalar(device_id_, TVMStructGet(DataType::Int(32), handle, 0, builtin::kDLTensorDeviceId),
+    BindScalar(device_id_, TVMStructGet(PrimType::Int(32), handle, 0, builtin::kDLTensorDeviceId),
                device_id_path, true);
   }
 
@@ -756,12 +763,12 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
   {
     ffi::reflection::AccessPath data_path = param_path->Attr(ffi::String("data"));
     if (BindScalar(buffer->data,
-                   TVMStructGet(DataType::Handle(), handle, 0, builtin::kDLTensorData), data_path,
+                   TVMStructGet(PrimType::Handle(), handle, 0, builtin::kDLTensorData), data_path,
                    true)) {
       Var vptr(buffer->data);
 
       auto alloc_size = [&]() -> PrimExpr {
-        PrimExpr product = IntImm(buffer->DefaultIndexType(), 1);
+        PrimExpr product = IntImm(PrimType(buffer->DefaultIndexType()), 1);
         for (const auto& dim : buffer->shape) {
           product *= dim;
         }
@@ -769,9 +776,10 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
       }();
       // Data pointer null and alignment checks go to asserts_ because alloc_size
       // references buffer->shape which may contain forward-referenced symbolic vars.
+      PrimExpr empty_alloc = cast(PrimType::Bool(), alloc_size == 0);
+      PrimExpr data_non_null = !Call(PrimType::Bool(), builtin::isnullptr(), {vptr});
       asserts_.emplace_back(AssertStmt(
-          alloc_size == 0 || !Call(DataType::Bool(), builtin::isnullptr(), {vptr}),
-          StringImm("ValueError"),
+          empty_alloc || data_non_null, StringImm("ValueError"),
           ffi::Array<StringImm>({StringImm(buf_name),
                                  StringImm(" data pointer is NULL on argument #"),
                                  StringImm(std::to_string(param_index)), when_calling_imm_,
@@ -781,10 +789,10 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
         // Check data pointer alignment
         if (buffer->data_alignment > 1) {
           PrimExpr ptr_as_int =
-              Call(DataType::UInt(64), builtin::reinterpret(), {cast(DataType::Handle(), vptr)});
+              Call(PrimType::UInt(64), builtin::reinterpret(), {cast(PrimType::Handle(), vptr)});
           PrimExpr align_cond =
-              truncmod(ptr_as_int, IntImm(DataType::UInt(64), buffer->data_alignment)) ==
-              IntImm(DataType::UInt(64), 0);
+              truncmod(ptr_as_int, IntImm(PrimType::UInt(64), buffer->data_alignment)) ==
+              IntImm(PrimType::UInt(64), 0);
           asserts_.emplace_back(AssertStmt(
               alloc_size == 0 || align_cond, StringImm("ValueError"),
               ffi::Array<StringImm>({StringImm("Misaligned Tensor data on argument #"),
diff --git a/src/tirx/transform/tvm_ffi_binder.h b/src/tirx/transform/tvm_ffi_binder.h
index 92af52df6bcb..a465025ad517 100644
--- a/src/tirx/transform/tvm_ffi_binder.h
+++ b/src/tirx/transform/tvm_ffi_binder.h
@@ -180,7 +180,7 @@ class TVMFFIABIBuilder {
 
   /*! \brief Load the i-th packed argument as the given type from the union value. */
   static PrimExpr LoadTVMFFIAnyUnionValue(const Var& v_packed_args, int param_index,
-                                          DataType arg_type);
+                                          PrimType arg_type);
 
   // ── Per-dtype type-check + value-load methods ──────────────────
   //
@@ -211,7 +211,7 @@ class TVMFFIABIBuilder {
    * \param dtype The expected data type for this parameter.
    * \return The loaded argument value.
    */
-  PrimExpr DecodeParamInt(int param_index, const Var& type_index, DataType dtype);
+  PrimExpr DecodeParamInt(int param_index, const Var& type_index, PrimType dtype);
 
   /*!
    * \brief Type-check and load a float argument.
@@ -220,7 +220,7 @@ class TVMFFIABIBuilder {
    * \param dtype The expected data type for this parameter.
    * \return The loaded argument value.
    */
-  PrimExpr DecodeParamFloat(int param_index, const Var& type_index, DataType dtype);
+  PrimExpr DecodeParamFloat(int param_index, const Var& type_index, PrimType dtype);
 
   // ── Private binding submethods (all take ffi::reflection::AccessPath) ───────────
 
diff --git a/src/tirx/transform/unroll_loop.cc b/src/tirx/transform/unroll_loop.cc
index 740176f50498..c41e717ca8f1 100644
--- a/src/tirx/transform/unroll_loop.cc
+++ b/src/tirx/transform/unroll_loop.cc
@@ -225,7 +225,7 @@ class LoopUnroller : public StmtExprMutator {
     ffi::Map<Var, PrimExpr> vmap;
     ffi::Array<Stmt> unrolled;
     for (int i = 0; i < value; ++i) {
-      vmap.Set(op->loop_var, op->min + MakeConst(op->loop_var.dtype(), i));
+      vmap.Set(op->loop_var, op->min + MakeConst(op->loop_var.ty(), i));
       Stmt step = Substitute(body, vmap);
       unrolled.push_back(step);
     }
diff --git a/src/tirx/transform/unsupported_dtype_legalize.cc b/src/tirx/transform/unsupported_dtype_legalize.cc
index 01ae31938117..cc9725dc7a23 100644
--- a/src/tirx/transform/unsupported_dtype_legalize.cc
+++ b/src/tirx/transform/unsupported_dtype_legalize.cc
@@ -37,6 +37,31 @@
 namespace tvm {
 namespace tirx {
 
+namespace {
+
+bool IsBFloat16Type(const PrimType& type) {
+  return type.MatchesElementType(DLDataTypeCode::kDLBfloat, 16);
+}
+
+bool IsFloat8Type(const PrimType& type) {
+  DLDataTypeCode code = type.code();
+  return code == DLDataTypeCode::kDLFloat8_e3m4 || code == DLDataTypeCode::kDLFloat8_e4m3 ||
+         code == DLDataTypeCode::kDLFloat8_e4m3b11fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e4m3fn || code == DLDataTypeCode::kDLFloat8_e4m3fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e5m2 || code == DLDataTypeCode::kDLFloat8_e5m2fnuz ||
+         code == DLDataTypeCode::kDLFloat8_e8m0fnu;
+}
+
+template <typename F>
+bool MatchPrimType(const Type& type, F f) {
+  if (const auto* prim_type = type.as<PrimTypeNode>()) {
+    return f(ffi::GetRef<PrimType>(prim_type));
+  }
+  return false;
+}
+
+}  // namespace
+
 // NOTE: do not touch buffer on function boundary
 // remap internal fp8/bf16 buffer to f32 if they meet the following condition
 // - constant allocation size
@@ -47,7 +72,7 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
  public:
   ComputeLegalizePlanner(
       std::unordered_map<Buffer, Buffer, ffi::ObjectPtrHash, ffi::ObjectPtrEqual>* buffer_remap,
-      std::unordered_map<Var, Var>* var_remap, DataType promote_dtype)
+      std::unordered_map<Var, Var>* var_remap, PrimType promote_dtype)
       : buffer_remap_(buffer_remap), var_remap_(var_remap), promote_dtype_(promote_dtype) {}
 
   // run planning to populate buffer remap and var remap.
@@ -74,7 +99,7 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
     }
   }
 
-  virtual bool MatchDType(DataType dtype) const = 0;
+  virtual bool MatchType(const Type& type) const = 0;
 
   void VisitStmt_(const BufferStoreNode* op) final {
     StmtExprVisitor::VisitStmt_(op);
@@ -88,14 +113,13 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
 
   void VisitStmt_(const AllocBufferNode* op) final {
     // remap all intermediate constant buffer to promote data types (fp16/fp32)
-    if (MatchDType(op->buffer->dtype)) {
-      DataType dtype = promote_dtype_.with_lanes(op->buffer->dtype.lanes());
+    if (MatchType(op->buffer->dtype)) {
+      PrimType dtype = promote_dtype_.WithLanes(op->buffer->dtype.lanes());
       ffi::String storage_scope = "global";
       if (auto* ptr_type = op->buffer->data->type_annotation.as<PointerTypeNode>()) {
         storage_scope = ptr_type->storage_scope;
       }
-      Var buffer_var =
-          Var(op->buffer->data->name_hint, PointerType(PrimType(dtype), storage_scope));
+      Var buffer_var = Var(op->buffer->data->name_hint, PointerType(dtype, storage_scope));
       (*var_remap_)[op->buffer->data] = buffer_var;
     }
     return StmtExprVisitor::VisitStmt_(op);
@@ -109,7 +133,7 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
   void VisitExpr_(const VarNode* op) final {
     StmtExprVisitor::VisitExpr_(op);
     Var buffer_var = ffi::GetRef<Var>(op);
-    if (buffer_var.dtype().is_handle()) {
+    if (buffer_var.ty().IsHandle()) {
       opaque_var_access_.insert(buffer_var);
     }
   }
@@ -119,7 +143,7 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
     auto var_it = var_remap_->find(buf->data);
     if (var_it == var_remap_->end()) return;
 
-    Buffer new_buffer(var_it->second, promote_dtype_.with_lanes(buf->dtype.lanes()), buf->shape,
+    Buffer new_buffer(var_it->second, promote_dtype_.WithLanes(buf->dtype.lanes()), buf->shape,
                       buf->strides, buf->elem_offset, buf->name, buf->data_alignment,
                       buf->offset_factor, buf->buffer_type, buf->axis_separators, buf->span,
                       buf->layout, buf->allocated_addr);
@@ -129,25 +153,29 @@ class ComputeLegalizePlanner : public StmtExprVisitor {
   std::unordered_map<Buffer, Buffer, ffi::ObjectPtrHash, ffi::ObjectPtrEqual>* buffer_remap_;
   std::unordered_map<Var, Var>* var_remap_;
   std::unordered_set<Var> opaque_var_access_;
-  DataType promote_dtype_;
+  PrimType promote_dtype_;
 };
 
 class BF16ComputeLegalizePlanner : public ComputeLegalizePlanner {
  public:
   explicit BF16ComputeLegalizePlanner(
       std::unordered_map<Buffer, Buffer, ffi::ObjectPtrHash, ffi::ObjectPtrEqual>* buffer_remap,
-      std::unordered_map<Var, Var>* var_remap, DataType promote_dtype)
+      std::unordered_map<Var, Var>* var_remap, PrimType promote_dtype)
       : ComputeLegalizePlanner(buffer_remap, var_remap, promote_dtype) {}
-  bool MatchDType(DataType dtype) const { return dtype.is_bfloat16(); }
+  bool MatchType(const Type& type) const {
+    return MatchPrimType(type, [](const PrimType& prim_type) { return IsBFloat16Type(prim_type); });
+  }
 };
 
 class FP8ComputeLegalizePlanner : public ComputeLegalizePlanner {
  public:
   explicit FP8ComputeLegalizePlanner(
       std::unordered_map<Buffer, Buffer, ffi::ObjectPtrHash, ffi::ObjectPtrEqual>* buffer_remap,
-      std::unordered_map<Var, Var>* var_remap, DataType promote_dtype)
+      std::unordered_map<Var, Var>* var_remap, PrimType promote_dtype)
       : ComputeLegalizePlanner(buffer_remap, var_remap, promote_dtype) {}
-  bool MatchDType(DataType dtype) const { return dtype.is_float8(); }
+  bool MatchType(const Type& type) const {
+    return MatchPrimType(type, [](const PrimType& prim_type) { return IsFloat8Type(prim_type); });
+  }
 };
 
 #define DEFINE_BIOP_EXPR_LEGALIZE(OP, FUNC)                      \
@@ -169,7 +197,7 @@ class FP8ComputeLegalizePlanner : public ComputeLegalizePlanner {
 // point in the TIR lowering phases.
 class ComputeLegalizer : public StmtExprMutator {
  public:
-  explicit ComputeLegalizer(DataType promote_dtype) : promote_dtype_(promote_dtype) {}
+  explicit ComputeLegalizer(PrimType promote_dtype) : promote_dtype_(promote_dtype) {}
 
   PrimFunc LegalizeWithPlanner(PrimFunc func, ComputeLegalizePlanner* planner) {
     planner->Plan(func);
@@ -180,21 +208,22 @@ class ComputeLegalizer : public StmtExprMutator {
 
   virtual PrimFunc Legalize(PrimFunc func) = 0;
 
-  virtual bool MatchDType(DataType dtype) const = 0;
+  virtual bool MatchType(const Type& type) const = 0;
 
  protected:
   PrimExpr VisitExpr_(const CastNode* op) final {
     auto op_val = PromoteToTarget(this->VisitExpr(op->value));
 
     // all casts to matched data type (fp8/bf16) becomes f32
-    if (MatchDType(op->dtype)) {
-      return cast(promote_dtype_.with_lanes(op->dtype.lanes()), op_val);
+    PrimType op_ty = op->ty();
+    if (MatchType(op_ty)) {
+      return cast(promote_dtype_.WithLanes(op_ty.lanes()), op_val);
     }
 
     if (op_val.same_as(op->value)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      return cast(op->dtype, op_val);
+      return cast(op_ty, op_val);
     }
   }
 
@@ -237,18 +266,19 @@ class ComputeLegalizer : public StmtExprMutator {
     // update normal computations to return f32 instead.
     auto fmutate = [this](const PrimExpr& e) { return PromoteToTarget(this->VisitExpr(e)); };
     ffi::Array<PrimExpr> args = op->args.Map(fmutate);
-    if (MatchDType(op->dtype)) {
-      return Call(promote_dtype_.with_lanes(op->dtype.lanes()), op->op, args, op->attrs, op->span);
+    PrimType op_ty = op->ty();
+    if (MatchType(op_ty)) {
+      return Call(promote_dtype_.WithLanes(op_ty.lanes()), op->op, args, op->attrs, op->span);
     }
     if (args.same_as(op->args)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      return Call(op->dtype, op->op, args, op->attrs, op->span);
+      return Call(op_ty, op->op, args, op->attrs, op->span);
     }
   }
 
   PrimExpr VisitExpr_(const FloatImmNode* op) final {
-    if (MatchDType(op->dtype)) {
+    if (MatchType(op->ty())) {
       return FloatImm(promote_dtype_, op->value);
     }
     return ffi::GetRef<PrimExpr>(op);
@@ -268,8 +298,8 @@ class ComputeLegalizer : public StmtExprMutator {
   PrimExpr VisitExpr_(const LetNode* op) final {
     PrimExpr value = PromoteToTarget(op->value);
     Var var = op->var;
-    if (value.dtype() != op->value.dtype()) {
-      var = op->var.copy_with_dtype(op->value.dtype());
+    if (value.ty() != op->value.ty()) {
+      var = op->var.copy_with_dtype(op->value.ty());
       var_remap_[op->var] = var;
     }
 
@@ -298,8 +328,8 @@ class ComputeLegalizer : public StmtExprMutator {
   Stmt VisitStmt_(const BindNode* op) final {
     PrimExpr value = PromoteToTarget(op->value);
     Var var = op->var;
-    if (value.dtype() != op->value.dtype()) {
-      var = op->var.copy_with_dtype(op->value.dtype());
+    if (value.ty() != op->value.ty()) {
+      var = op->var.copy_with_dtype(op->value.ty());
       var_remap_[op->var] = var;
     }
 
@@ -321,17 +351,17 @@ class ComputeLegalizer : public StmtExprMutator {
     if (value.same_as(op->value) && indices.same_as(op->indices) && new_buf.same_as(op->buffer)) {
       return ffi::GetRef<Stmt>(op);
     } else {
-      if (MatchDType(new_buf->dtype)) {
-        int index_lanes = indices.size() ? indices.back().dtype().lanes() : 1;
+      if (MatchType(new_buf->dtype)) {
+        int index_lanes = indices.size() ? indices.back().ty().lanes() : 1;
         int buffer_lanes = new_buf->dtype.lanes();
-        DataType legalized_dtype = new_buf->dtype.with_lanes(index_lanes * buffer_lanes);
+        PrimType legalized_dtype = new_buf->dtype.WithLanes(index_lanes * buffer_lanes);
         value = CastTargetToDType(value, legalized_dtype);
       }
-      if (value.dtype() != new_buf->dtype) {
+      if (value.ty() != new_buf->dtype) {
         // this happens when buffer get rewritten to f32
         // but values remain as fp8/bf16
-        TVM_FFI_ICHECK(MatchDType(value->dtype));
-        value = DTypeConversion(value, new_buf->dtype.with_lanes(value.dtype().lanes()));
+        TVM_FFI_ICHECK(MatchType(value.ty()));
+        value = DTypeConversion(value, new_buf->dtype.WithLanes(value.ty().lanes()));
       }
       TVM_FFI_ICHECK(!op->predicate.defined())
           << "Predicated buffer store is not currently supported in "
@@ -360,12 +390,12 @@ class ComputeLegalizer : public StmtExprMutator {
       // Remap input variables
       for (size_t i = 0; i < legalized_identity_elements.size(); i++) {
         Var lhs_var = reducer->lhs[i];
-        if (lhs_var.dtype() != legalized_identity_elements[i].dtype()) {
-          var_remap_[lhs_var] = lhs_var.copy_with_dtype(legalized_identity_elements[i].dtype());
+        if (lhs_var.ty() != legalized_identity_elements[i].ty()) {
+          var_remap_[lhs_var] = lhs_var.copy_with_dtype(legalized_identity_elements[i].ty());
         }
         Var rhs_var = reducer->rhs[i];
-        if (rhs_var.dtype() != legalized_identity_elements[i].dtype()) {
-          var_remap_[rhs_var] = rhs_var.copy_with_dtype(legalized_identity_elements[i].dtype());
+        if (rhs_var.ty() != legalized_identity_elements[i].ty()) {
+          var_remap_[rhs_var] = rhs_var.copy_with_dtype(legalized_identity_elements[i].ty());
         }
       }
 
@@ -442,12 +472,12 @@ class ComputeLegalizer : public StmtExprMutator {
    * \return The converted value.
    */
   PrimExpr PromoteToTarget(PrimExpr value) {
-    if (!MatchDType(value.dtype())) return value;
+    PrimType value_ty = value.ty();
+    if (!MatchType(value_ty)) return value;
     if (const CastNode* cast = value.as<CastNode>()) {
-      if (cast->value.dtype() == promote_dtype_.with_lanes(value.dtype().lanes()))
-        return cast->value;
+      if (cast->value.ty() == promote_dtype_.WithLanes(value_ty.lanes())) return cast->value;
     }
-    return DTypeConversion(value, promote_dtype_.with_lanes(value.dtype().lanes()));
+    return DTypeConversion(value, promote_dtype_.WithLanes(value_ty.lanes()));
   }
 
   /*!
@@ -456,9 +486,10 @@ class ComputeLegalizer : public StmtExprMutator {
    * \param value The input value
    * \return The converted value.
    */
-  PrimExpr CastTargetToDType(PrimExpr value, DataType dtype) {
-    if (!value.dtype().is_float()) return value;
-    TVM_FFI_ICHECK_EQ(value.dtype(), this->promote_dtype_.with_lanes(value.dtype().lanes()));
+  PrimExpr CastTargetToDType(PrimExpr value, PrimType dtype) {
+    PrimType value_ty = value.ty();
+    if (value_ty.code() != DLDataTypeCode::kDLFloat) return value;
+    TVM_FFI_ICHECK_EQ(value_ty, this->promote_dtype_.WithLanes(value_ty.lanes()));
     return DTypeConversion(value, dtype);
   }
 
@@ -471,29 +502,33 @@ class ComputeLegalizer : public StmtExprMutator {
   }
 
  protected:
-  DataType promote_dtype_;
+  PrimType promote_dtype_;
   std::unordered_map<Buffer, Buffer, ffi::ObjectPtrHash, ffi::ObjectPtrEqual> buffer_remap_;
   std::unordered_map<Var, Var> var_remap_;
 };
 
 class BF16ComputeLegalizer : public ComputeLegalizer {
  public:
-  BF16ComputeLegalizer() : ComputeLegalizer(DataType::Float(32)) {}
+  BF16ComputeLegalizer() : ComputeLegalizer(PrimType::Float(32)) {}
   PrimFunc Legalize(PrimFunc func) {
     BF16ComputeLegalizePlanner planner(&buffer_remap_, &var_remap_, promote_dtype_);
     return LegalizeWithPlanner(func, &planner);
   }
-  bool MatchDType(DataType dtype) const { return dtype.is_bfloat16(); }
+  bool MatchType(const Type& type) const {
+    return MatchPrimType(type, [](const PrimType& prim_type) { return IsBFloat16Type(prim_type); });
+  }
 };
 
 class FP8ComputeLegalizer : public ComputeLegalizer {
  public:
-  explicit FP8ComputeLegalizer(DataType promote_dtype) : ComputeLegalizer(promote_dtype) {}
+  explicit FP8ComputeLegalizer(PrimType promote_dtype) : ComputeLegalizer(promote_dtype) {}
   PrimFunc Legalize(PrimFunc func) {
     FP8ComputeLegalizePlanner planner(&buffer_remap_, &var_remap_, promote_dtype_);
     return LegalizeWithPlanner(func, &planner);
   }
-  bool MatchDType(DataType dtype) const { return dtype.is_float8(); }
+  bool MatchType(const Type& type) const {
+    return MatchPrimType(type, [](const PrimType& prim_type) { return IsFloat8Type(prim_type); });
+  }
 };
 
 /*!
@@ -529,13 +564,13 @@ class StorageLegalizer : public StmtExprMutator {
     // in a rare case the buffer didn't get remapped
     // because the original var is not bfloat*
     // force remap here
-    if (MatchDType(buf->dtype)) {
-      DataType new_dtype = GetStorageUIntDType(buf->dtype);
+    if (MatchType(buf->dtype)) {
+      PrimType new_dtype = GetStorageUIntDType(buf->dtype);
       ffi::String storage_scope = "global";
       if (auto* ptr_type = buf->data->type_annotation.as<PointerTypeNode>()) {
         storage_scope = ptr_type->storage_scope;
       }
-      Var new_data = Var(buf->data->name_hint, PointerType(PrimType(new_dtype), storage_scope));
+      Var new_data = Var(buf->data->name_hint, PointerType(new_dtype, storage_scope));
       var_remap_[buf->data] = new_data;
       buf = Buffer(new_data, new_dtype, buf->shape, buf->strides, buf->elem_offset, buf->name,
                    buf->data_alignment, buf->offset_factor, buf->buffer_type, buf->axis_separators,
@@ -556,7 +591,7 @@ class StorageLegalizer : public StmtExprMutator {
     // in a rare case the buffer didn't get remapped
     // because the original var is not bfloat*
     // force remap here
-    if (MatchDType(buf->dtype)) {
+    if (MatchType(buf->dtype)) {
       buf = Buffer(buf->data, GetStorageUIntDType(buf->dtype), buf->shape, buf->strides,
                    buf->elem_offset, buf->name, buf->data_alignment, buf->offset_factor,
                    buf->buffer_type, buf->axis_separators, buf->span, buf->layout,
@@ -600,8 +635,8 @@ class StorageLegalizer : public StmtExprMutator {
     if (new_buf.same_as(op->buffer) && indices.same_as(op->indices) && value.same_as(op->value)) {
       return ffi::GetRef<Stmt>(op);
     } else {
-      if (MatchDType(op->value.dtype())) {
-        TVM_FFI_ICHECK(new_buf->dtype.is_uint());
+      if (MatchType(op->value.ty())) {
+        TVM_FFI_ICHECK(new_buf->dtype.MatchesCode(DLDataTypeCode::kDLUInt));
       }
       TVM_FFI_ICHECK(!op->predicate.defined())
           << "Predicated buffer store is not currently supported in "
@@ -647,20 +682,21 @@ class StorageLegalizer : public StmtExprMutator {
     if (op->op.same_as(builtin::reinterpret())) {
       PrimExpr value = VisitExpr(op->args[0]);
       // sometimes the input dtype can change and we can skip.
-      if (value.dtype() == op->dtype) return value;
-      if (MatchDType(op->dtype)) {
-        return reinterpret(GetStorageUIntDType(op->dtype), value);
+      PrimType op_dtype = op->ty();
+      if (value.ty() == op_dtype) return value;
+      if (MatchType(op_dtype)) {
+        return reinterpret(GetStorageUIntDType(op_dtype), value);
       }
       if (op->args[0].same_as(value)) {
         return ffi::GetRef<PrimExpr>(op);
       } else {
-        return reinterpret(op->dtype, value);
+        return reinterpret(op_dtype, value);
       }
     }
     return StmtExprMutator::VisitExpr_(op);
   }
 
-  virtual bool MatchDType(DataType dtype) const = 0;
+  virtual bool MatchType(const Type& type) const = 0;
 
  private:
   /*!
@@ -669,10 +705,11 @@ class StorageLegalizer : public StmtExprMutator {
    * \return The converted value.
    */
   PrimExpr ChangeToUInt(PrimExpr value) {
-    if (!MatchDType(value->dtype)) return value;
+    PrimType value_dtype = value.ty();
+    if (!MatchType(value_dtype)) return value;
     auto* call = value.as<CallNode>();
     if (call && call->op.same_as(builtin::reinterpret())) {
-      return reinterpret(GetStorageUIntDType(value->dtype), call->args[0]);
+      return reinterpret(GetStorageUIntDType(value_dtype), call->args[0]);
     } else {
       return value;
     }
@@ -680,13 +717,13 @@ class StorageLegalizer : public StmtExprMutator {
 
   Var RemapVarDef(Var var) {
     // remap the var
-    if (var.dtype().is_handle()) {
+    if (var.ty().IsHandle()) {
       if (auto* ptr_type = var->type_annotation.as<PointerTypeNode>()) {
         if (auto* elem_type = ptr_type->element_type.as<PrimTypeNode>()) {
-          if (MatchDType(elem_type->dtype)) {
-            Var new_var =
-                Var(var->name_hint, PointerType(PrimType(GetStorageUIntDType(elem_type->dtype)),
-                                                ptr_type->storage_scope));
+          PrimType elem_prim_type = ffi::GetRef<PrimType>(elem_type);
+          if (MatchType(elem_prim_type)) {
+            Var new_var = Var(var->name_hint, PointerType(GetStorageUIntDType(elem_prim_type),
+                                                          ptr_type->storage_scope));
             var_remap_[var] = new_var;
             return new_var;
           }
@@ -704,12 +741,12 @@ class StorageLegalizer : public StmtExprMutator {
     Buffer new_buf = buf;
     auto var_it = var_remap_.find(buf->data);
     if (var_it != var_remap_.end()) {
-      DataType dtype = MatchDType(buf->dtype) ? GetStorageUIntDType(buf->dtype) : buf->dtype;
+      PrimType dtype = MatchType(buf->dtype) ? GetStorageUIntDType(buf->dtype) : buf->dtype;
       new_buf = Buffer(var_it->second, dtype, buf->shape, buf->strides, buf->elem_offset, buf->name,
                        buf->data_alignment, buf->offset_factor, buf->buffer_type,
                        buf->axis_separators, buf->span, buf->layout, buf->allocated_addr);
     } else {
-      TVM_FFI_ICHECK(!MatchDType(buf->dtype)) << "Cannot find var remap for " << buf;
+      TVM_FFI_ICHECK(!MatchType(buf->dtype)) << "Cannot find var remap for " << buf;
     }
 
     buffer_remap_[buf] = new_buf;
@@ -723,12 +760,16 @@ class StorageLegalizer : public StmtExprMutator {
 
 class BF16StorageLegalizer : public StorageLegalizer {
  public:
-  bool MatchDType(DataType dtype) const { return dtype.is_bfloat16(); }
+  bool MatchType(const Type& type) const override {
+    return MatchPrimType(type, [](const PrimType& prim_type) { return IsBFloat16Type(prim_type); });
+  }
 };
 
 class FP8StorageLegalizer : public StorageLegalizer {
  public:
-  bool MatchDType(DataType dtype) const { return dtype.is_float8(); }
+  bool MatchType(const Type& type) const override {
+    return MatchPrimType(type, [](const PrimType& prim_type) { return IsFloat8Type(prim_type); });
+  }
 };
 
 namespace transform {
@@ -787,7 +828,7 @@ Pass FP8ComputeLegalize(ffi::String promote_dtype) {
         CheckDataTypeSupport(opt_target.value(), "tvm.support.nvcc.supports_fp8")) {
       return f;
     }
-    return FP8ComputeLegalizer(DataType(ffi::StringToDLDataType(promote_dtype))).Legalize(f);
+    return FP8ComputeLegalizer(PrimType(ffi::StringToDLDataType(promote_dtype))).Legalize(f);
   };
   return CreatePrimFuncPass(pass_func, 0, "tirx.FP8ComputeLegalize", {});
 }
diff --git a/src/tirx/transform/vectorize_loop.cc b/src/tirx/transform/vectorize_loop.cc
index 03ee2d3eefde..271a9f20efa9 100644
--- a/src/tirx/transform/vectorize_loop.cc
+++ b/src/tirx/transform/vectorize_loop.cc
@@ -39,13 +39,20 @@
 #include <vector>
 
 #include "../../tirx/analysis/check_contains.h"
-#include "tvm/runtime/data_type.h"
+#include "tvm/ffi/dtype.h"
 #include "tvm/tirx/buffer.h"
 
 namespace tvm {
 namespace tirx {
 
 namespace {
+// File-local helper: returns ty.VScaleFactor() when `ty` is a scalable vector
+// and ty.lanes() for every other type.
+int GetLanesOrVScaleFactor(const PrimType& ty) {
+  const bool scalable = ty.IsScalableVector();
+  return scalable ? ty.VScaleFactor() : ty.lanes();
+}
+
 // File-local helper: true if `expr` is a call to tirx::builtin::vscale().
 bool IsVScaleCall(const PrimExpr& expr) {
   if (const auto* call = expr.as<CallNode>()) {
@@ -56,9 +63,8 @@ bool IsVScaleCall(const PrimExpr& expr) {
 
 bool TargetHasRVV(Target target) {
   if (!target.defined()) return false;
-  static auto target_has_feature_fn =
-      tvm::ffi::Function::GetGlobalRequired("target.target_has_feature");
-  return target_has_feature_fn("v", target).cast<bool>();
+  static auto target_has_feature_fn = tvm::ffi::Function::GetGlobal("target.target_has_feature");
+  return target_has_feature_fn.has_value() && (*target_has_feature_fn)("v", target).cast<bool>();
 }
 
 // File-local helper: true if the target supports Variable-Length Array extensions
@@ -66,6 +72,16 @@ bool TargetHasRVV(Target target) {
 bool TargetHasVLA(Target target) {
   if (!target.defined()) return false;
   bool has_vla = target->GetAttr<bool>("feature.has_sve").value_or(false);
+  if (!has_vla) {
+    if (auto mattr = target->GetAttr<ffi::Array<ffi::String>>("mattr")) {
+      for (const ffi::String& attr : mattr.value()) {
+        if (attr == "+sve") {
+          has_vla = true;
+          break;
+        }
+      }
+    }
+  }
   has_vla |= TargetHasRVV(target);
   return has_vla;
 }
@@ -78,7 +94,7 @@ bool ContainsCallNode(const Stmt& stmt) {
 
 inline PrimExpr CreateNewLanes(bool is_scalable, int lanes_or_vscale_factor) {
   if (is_scalable) {
-    return Mul(Call(DataType::Int(32), builtin::vscale(), {}), lanes_or_vscale_factor);
+    return Mul(Call(PrimType::Int(32), builtin::vscale(), {}), lanes_or_vscale_factor);
   } else {
     return lanes_or_vscale_factor;
   }
@@ -86,23 +102,21 @@ inline PrimExpr CreateNewLanes(bool is_scalable, int lanes_or_vscale_factor) {
 
 inline PrimExpr BroadcastTo(PrimExpr e, int lanes, bool is_scalable) {
   // Check if e is already in the expected form
-  if (e.dtype().get_lanes_or_vscale_factor() == lanes &&
-      e.dtype().is_scalable_vector() == is_scalable)
-    return e;
+  if (GetLanesOrVScaleFactor(e.ty()) == lanes && e.ty().IsScalableVector() == is_scalable) return e;
 
   if (const BroadcastNode* op = e.as<BroadcastNode>()) {
-    TVM_FFI_ICHECK(op->dtype.is_scalable_vector() == is_scalable)
+    TVM_FFI_ICHECK(op->ty().IsScalableVector() == is_scalable)
         << "Can't broadcast between scalable and fixed length vectors.";
-    int e_lanes = op->dtype.get_lanes_or_vscale_factor();
+    int e_lanes = GetLanesOrVScaleFactor(op->ty());
 
     if (lanes % e_lanes == 0) {
       return Broadcast(op->value, CreateNewLanes(is_scalable, lanes));
     }
   }
 
-  TVM_FFI_ICHECK(e.dtype().is_scalar())
-      << "Cannot broadcast lanes=" << e.dtype().get_lanes_or_vscale_factor()
-      << " is_scalable=" << e.dtype().is_scalable_vector() << " to " << lanes;
+  TVM_FFI_ICHECK(e.ty().IsScalar())
+      << "Cannot broadcast lanes=" << GetLanesOrVScaleFactor(e.ty())
+      << " is_scalable=" << e.ty().IsScalableVector() << " to " << lanes;
 
   return Broadcast(e, CreateNewLanes(is_scalable, lanes));
 }
@@ -219,9 +233,10 @@ class TryPredicateBufferAccesses : public StmtExprMutator {
       }
     }
 
-    DataType buf_predicate_dtype =
-        DataType(DataType::kUInt, 1, ramp->dtype.get_lanes_or_vscale_factor(),
-                 ramp->dtype.is_scalable_vector());
+    PrimType buf_predicate_dtype =
+        ramp->ty().IsScalableVector() ? PrimType::ScalableVector(DLDataTypeCode::kDLUInt, 1,
+                                                                 GetLanesOrVScaleFactor(ramp->ty()))
+                                      : PrimType::UInt(1, GetLanesOrVScaleFactor(ramp->ty()));
     Call lane_mask = Call(buf_predicate_dtype, builtin::get_active_lane_mask(), {base_, limit_});
 
     num_accesses_rewritten_ += 1;
@@ -354,7 +369,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
 
   Vectorizer(Var var, PrimExpr var_lanes, Target target)
       : var_(var), var_lanes_(var_lanes), target_(target) {
-    ramp_ = Ramp(IntImm(var->dtype, 0), IntImm(var->dtype, 1), var_lanes);
+    ramp_ = Ramp(IntImm(var.ty(), 0), IntImm(var.ty(), 1), var_lanes);
   }
 
   Stmt VisitStmt(const Stmt& stmt) final {
@@ -384,28 +399,28 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (a.same_as(op->a) && b.same_as(op->b)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      bool is_vec_a = a.dtype().is_scalable_or_fixed_length_vector();
-      bool is_vec_b = b.dtype().is_scalable_or_fixed_length_vector();
+      bool is_vec_a = a.ty().IsScalableVector() || a.ty().IsFixedLengthVector();
+      bool is_vec_b = b.ty().IsScalableVector() || b.ty().IsFixedLengthVector();
       if (is_vec_a && is_vec_b) {
         // Let's not multiply scalable and fixed length vectors
-        TVM_FFI_ICHECK(a.dtype().is_scalable_vector() == b.dtype().is_scalable_vector())
+        TVM_FFI_ICHECK(a.ty().IsScalableVector() == b.ty().IsScalableVector())
             << "Fixed length and scalable vectors can't be mixed in multiplication.";
       }
       if (is_vec_a || is_vec_b) {
         const RampNode* b_ramp = b.as<RampNode>();
         const RampNode* a_ramp = a.as<RampNode>();
-        if (a_ramp && b.dtype().is_scalar() && analyzer_->CanProve(b > 0)) {
+        if (a_ramp && b.ty().IsScalar() && analyzer_->CanProve(b > 0)) {
           PrimExpr lanes = a_ramp->lanes;
           return Ramp(a_ramp->base * b, a_ramp->stride * b, lanes);
         }
-        if (b_ramp && a.dtype().is_scalar() && analyzer_->CanProve(a > 0)) {
+        if (b_ramp && a.ty().IsScalar() && analyzer_->CanProve(a > 0)) {
           PrimExpr lanes = b_ramp->lanes;
           return Ramp(b_ramp->base * a, b_ramp->stride * a, lanes);
         }
-        int a_lanes = a.dtype().get_lanes_or_vscale_factor();
-        int b_lanes = b.dtype().get_lanes_or_vscale_factor();
+        int a_lanes = GetLanesOrVScaleFactor(a.ty());
+        int b_lanes = GetLanesOrVScaleFactor(b.ty());
         int max_lanes = std::max(a_lanes, b_lanes);
-        bool is_scalable = a.dtype().is_scalable_vector() || b.dtype().is_scalable_vector();
+        bool is_scalable = a.ty().IsScalableVector() || b.ty().IsScalableVector();
         return Mul(BroadcastTo(a, max_lanes, is_scalable), BroadcastTo(b, max_lanes, is_scalable));
       }
     }
@@ -438,22 +453,22 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   PrimExpr VisitExpr_(const RampNode* op) final {
     PrimExpr base = this->VisitExpr(op->base);
     PrimExpr stride = this->VisitExpr(op->stride);
-    TVM_FFI_ICHECK(!base.dtype().is_scalable_vector())
+    TVM_FFI_ICHECK(!base.ty().IsScalableVector())
         << "Creating scalable vectors from existing vectors is not supported.";
-    TVM_FFI_ICHECK(!stride.dtype().is_scalable_vector())
+    TVM_FFI_ICHECK(!stride.ty().IsScalableVector())
         << "Ramp stride with scalable dtype is not supported";
-    if (base.dtype().is_fixed_length_vector() && stride.dtype().is_scalar()) {
+    if (base.ty().IsFixedLengthVector() && stride.ty().IsScalar()) {
       TVM_FFI_ICHECK(op->lanes->IsInstance<IntImmNode>())
           << "Vectorizing over existing scalable vectors is not supported.";
       const RampNode* base_ramp = base.as<RampNode>();
       int op_lanes = static_cast<int>(op->lanes.as_or_throw<IntImm>()->value);
       int base_ramp_lanes = static_cast<int>(base_ramp->lanes.as_or_throw<IntImm>()->value);
       if (analyzer_->CanProve(base_ramp->stride ==
-                              stride * MakeConst(stride.dtype(), base_ramp_lanes))) {
+                              stride * MakeConst(stride.ty(), base_ramp_lanes))) {
         return Ramp(base_ramp->base, stride, op_lanes * base_ramp_lanes);
       }
     }
-    int lanes = std::max(base.dtype().lanes(), stride.dtype().lanes());
+    int lanes = std::max(base.ty().lanes(), stride.ty().lanes());
     base = BroadcastTo(base, lanes, false);
     stride = BroadcastTo(stride, lanes, false);
     ffi::Array<PrimExpr> elems;
@@ -466,7 +481,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
 
   PrimExpr VisitExpr_(const BroadcastNode* op) final {
     PrimExpr value = this->VisitExpr(op->value);
-    if (value.dtype().is_scalable_or_fixed_length_vector()) {
+    if (value.ty().IsScalableVector() || value.ty().IsFixedLengthVector()) {
       need_scalarize_ = true;
       return ffi::GetRef<PrimExpr>(op);
     }
@@ -484,12 +499,12 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (cond.same_as(op->condition) && t.same_as(op->true_value) && f.same_as(op->false_value)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      int cond_lanes = cond.dtype().get_lanes_or_vscale_factor();
-      int t_lanes = t.dtype().get_lanes_or_vscale_factor();
-      int f_lanes = f.dtype().get_lanes_or_vscale_factor();
+      int cond_lanes = GetLanesOrVScaleFactor(cond.ty());
+      int t_lanes = GetLanesOrVScaleFactor(t.ty());
+      int f_lanes = GetLanesOrVScaleFactor(f.ty());
       int lanes = std::max(std::max(cond_lanes, t_lanes), f_lanes);
-      bool is_scalable = cond.dtype().is_scalable_vector() || t.dtype().is_scalable_vector() ||
-                         f.dtype().is_scalable_vector();
+      bool is_scalable =
+          cond.ty().IsScalableVector() || t.ty().IsScalableVector() || f.ty().IsScalableVector();
       return Select(BroadcastTo(cond, lanes, is_scalable), BroadcastTo(t, lanes, is_scalable),
                     BroadcastTo(f, lanes, is_scalable));
     }
@@ -500,10 +515,12 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (value.same_as(op->value)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      if (value.dtype().is_scalable_vector()) {
-        return Cast(op->dtype.with_scalable_vscale_factor(value.dtype().vscale_factor()), value);
+      if (value.ty().IsScalableVector()) {
+        return Cast(
+            PrimType::ScalableVector(op->ty().code(), op->ty().bits(), value.ty().VScaleFactor()),
+            value);
       } else {
-        return Cast(op->dtype.with_lanes(value.dtype().lanes()), value);
+        return Cast(op->ty().WithLanes(value.ty().lanes()), value);
       }
     }
   }
@@ -531,7 +548,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   // IfThenElse expr
   PrimExpr MutateIfThenElseExpr_(const CallNode* op) {
     PrimExpr cond = this->VisitExpr(op->args[0]);
-    if (cond.dtype().is_scalable_or_fixed_length_vector()) {
+    if (cond.ty().IsScalableVector() || cond.ty().IsFixedLengthVector()) {
       need_scalarize_ = true;
       return ffi::GetRef<PrimExpr>(op);
     }
@@ -540,17 +557,17 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (cond.same_as(op->args[0]) && t.same_as(op->args[1]) && f.same_as(op->args[2])) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      int t_lanes = t.dtype().get_lanes_or_vscale_factor();
-      int f_lanes = f.dtype().get_lanes_or_vscale_factor();
+      int t_lanes = GetLanesOrVScaleFactor(t.ty());
+      int f_lanes = GetLanesOrVScaleFactor(f.ty());
       int lanes = std::max(t_lanes, f_lanes);
-      bool is_scalable = t.dtype().is_scalable_vector() || f.dtype().is_scalable_vector();
+      bool is_scalable = t.ty().IsScalableVector() || f.ty().IsScalableVector();
       t = BroadcastTo(t, lanes, is_scalable);
       f = BroadcastTo(f, lanes, is_scalable);
       if (is_scalable) {
-        return Call(op->dtype.with_scalable_vscale_factor(lanes), op->op, {cond, t, f}, op->attrs,
-                    op->span);
+        return Call(PrimType::ScalableVector(op->ty().code(), op->ty().bits(), lanes), op->op,
+                    {cond, t, f}, op->attrs, op->span);
       } else {
-        return Call(op->dtype.with_lanes(lanes), op->op, {cond, t, f}, op->attrs, op->span);
+        return Call(op->ty().WithLanes(lanes), op->op, {cond, t, f}, op->attrs, op->span);
       }
     }
   }
@@ -561,16 +578,16 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (value.same_as(op->args[0])) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      int lanes = value.dtype().get_lanes_or_vscale_factor();
-      if (value.dtype().is_scalable_vector()) {
-        return Call(op->dtype.with_scalable_vscale_factor(lanes), op->op, {value}, op->attrs,
-                    op->span);
+      int lanes = GetLanesOrVScaleFactor(value.ty());
+      if (value.ty().IsScalableVector()) {
+        return Call(PrimType::ScalableVector(op->ty().code(), op->ty().bits(), lanes), op->op,
+                    {value}, op->attrs, op->span);
       } else {
-        int new_lanes = (op->dtype != DataType::Float4E2M1FN() &&
-                         op->args[0].dtype() != DataType::Float4E2M1FN())
-                            ? (value.dtype().bits() * value.dtype().lanes()) / op->dtype.bits()
-                            : value.dtype().lanes();
-        return Call(op->dtype.with_lanes(new_lanes), op->op, {value}, op->attrs, op->span);
+        int new_lanes = (op->ty().code() != DLDataTypeCode::kDLFloat4_e2m1fn &&
+                         op->args[0].ty().code() != DLDataTypeCode::kDLFloat4_e2m1fn)
+                            ? (value.ty().bits() * value.ty().lanes()) / op->ty().bits()
+                            : value.ty().lanes();
+        return Call(op->ty().WithLanes(new_lanes), op->op, {value}, op->attrs, op->span);
       }
     }
   }
@@ -581,46 +598,46 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     } else if (op->op.same_as(builtin::texture2d_load())) {
       int lane = 0;
       ffi::Array<PrimExpr> fcd = MutateArray({op->args.back()}, &lane);
-      auto dtype = op->args[0]
-                       .as<VarNode>()
-                       ->type_annotation.as<PointerTypeNode>()
-                       ->element_type.as<PrimTypeNode>()
-                       ->dtype;
-      TVM_FFI_ICHECK(lane * dtype.bits() <= op->args[4].as<IntImmNode>()->value)
+      DLDataType dtype = op->args[0]
+                             .as<VarNode>()
+                             ->type_annotation.as<PointerTypeNode>()
+                             ->element_type.as<PrimTypeNode>()
+                             ->dtype;
+      TVM_FFI_ICHECK(lane * dtype.bits <= op->args[4].as<IntImmNode>()->value)
           << "Expected Data to be Read is lesser than or equal to Texture Load length";
 
       auto new_args = op->args;
       new_args.pop_back();
       new_args.push_back(fcd[0]);
-      return Call(op->dtype.with_lanes(lane), op->op, new_args, op->attrs, op->span);
+      return Call(op->ty().WithLanes(lane), op->op, new_args, op->attrs, op->span);
     } else if (op->op.same_as(builtin::texture2d_store())) {
       int lane = 0;
       // Vectorize the value to store
       ffi::Array<PrimExpr> value{op->args.back()};
       ffi::Array<PrimExpr> mutated_value = MutateArray(value, &lane);
-      auto dtype = op->args[0]
-                       .as<VarNode>()
-                       ->type_annotation.as<PointerTypeNode>()
-                       ->element_type.as<PrimTypeNode>()
-                       ->dtype;
-      TVM_FFI_ICHECK(lane * dtype.bits() == op->args[4].as<IntImmNode>()->value)
+      DLDataType dtype = op->args[0]
+                             .as<VarNode>()
+                             ->type_annotation.as<PointerTypeNode>()
+                             ->element_type.as<PrimTypeNode>()
+                             ->dtype;
+      TVM_FFI_ICHECK(lane * dtype.bits == op->args[4].as<IntImmNode>()->value)
           << "Expected Data to be Written equal to Texture Store length";
       ffi::Array<PrimExpr> new_args{op->args[0], op->args[1], op->args[2],
                                     op->args[3], op->args[4], mutated_value[0]};
-      return Call(op->dtype.with_lanes(lane), op->op, new_args, op->attrs, op->span);
+      return Call(op->ty().WithLanes(lane), op->op, new_args, op->attrs, op->span);
     } else if (op->op.same_as(builtin::reinterpret())) {
       return MutateReinterpretExpr_(op);
     }
     auto optional_op = op->op.as<Op>();
     bool vectorizable = optional_op && op_vectorizable_.get(optional_op.value(), false) &&
-                        !op->dtype.is_scalable_vector();
+                        !op->ty().IsScalableVector();
 
     if (!vectorizable) {
       // Cannot vectorize this op
       ffi::Array<PrimExpr> new_args;
       for (auto arg : op->args) {
         auto new_arg = this->VisitExpr(arg);
-        if (new_arg.dtype().is_scalable_or_fixed_length_vector()) {
+        if (new_arg.ty().IsScalableVector() || new_arg.ty().IsFixedLengthVector()) {
           need_scalarize_ = true;
           return ffi::GetRef<PrimExpr>(op);
         }
@@ -629,7 +646,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       if (op->args.same_as(new_args)) {
         return ffi::GetRef<PrimExpr>(op);
       } else {
-        return Call(op->dtype, op->op, new_args, op->attrs, op->span);
+        return Call(ffi::GetRef<PrimExpr>(op).ty(), op->op, new_args, op->attrs, op->span);
       }
     } else {
       int lane = 0;
@@ -655,7 +672,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       if (op->args.same_as(new_args)) {
         return ffi::GetRef<PrimExpr>(op);
       } else {
-        return Call(op->dtype.with_lanes(lane), op->op, new_args, op->attrs, op->span);
+        return Call(op->ty().WithLanes(lane), op->op, new_args, op->attrs, op->span);
       }
     }
   }
@@ -688,9 +705,8 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       TVM_FFI_ICHECK(deep_equal_(it->second, value))
           << "Let cannot bind the same var to two different values";
     }
-    if (value.dtype().get_lanes_or_vscale_factor() !=
-        op->value.dtype().get_lanes_or_vscale_factor()) {
-      Var new_var(op->var->name_hint, value.dtype());
+    if (GetLanesOrVScaleFactor(value.ty()) != GetLanesOrVScaleFactor(op->value.ty())) {
+      Var new_var(op->var->name_hint, value.ty());
       let_binding_[op->var] = new_var;
       return Let(new_var, value, this->VisitExpr(op->body));
     } else {
@@ -715,7 +731,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       return ffi::GetRef<PrimExpr>(op);
     }
 
-    int new_vec_length = var_lanes_.as_or_throw<IntImm>()->value / op->vectors[0].dtype().lanes();
+    int new_vec_length = var_lanes_.as_or_throw<IntImm>()->value / op->vectors[0].ty().lanes();
     PrimExpr updated_index = indices[0];
     // Check that the indices satisfy the specific patterns.
     auto f_check_index = [this, op](const PrimExpr& index) {
@@ -741,7 +757,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
                 ramp->lanes.as_or_throw<IntImm>()->value ==
                     var_lanes_.as_or_throw<IntImm>()->value &&
                 broadcast->value->IsInstance<IntImmNode>() &&
-                broadcast->value.as_or_throw<IntImm>()->value == op->vectors[0]->dtype.lanes() &&
+                broadcast->value.as_or_throw<IntImm>()->value == op->vectors[0].ty().lanes() &&
                 broadcast->lanes->IsInstance<IntImmNode>() &&
                 broadcast->lanes.as_or_throw<IntImm>()->value ==
                     var_lanes_.as_or_throw<IntImm>()->value) {
@@ -756,12 +772,12 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     TVM_FFI_ICHECK(f_check_index(updated_index));
 
     if (new_vec_length == 1) {
-      return tirx::Substitute(op->vectors[0], {{var_, tvm::IntImm(var_->dtype, 0)}});
+      return tirx::Substitute(op->vectors[0], {{var_, tvm::IntImm(var_.ty(), 0)}});
     } else {
       PrimExpr prev_ramp = ramp_;
       PrimExpr prev_var_lanes = var_lanes_;
-      ramp_ = Ramp(IntImm(var_->dtype, 0), IntImm(var_->dtype, 2), new_vec_length);
-      var_lanes_ = tvm::IntImm(var_lanes_.dtype(), new_vec_length);
+      ramp_ = Ramp(IntImm(var_.ty(), 0), IntImm(var_.ty(), 2), new_vec_length);
+      var_lanes_ = tvm::IntImm(var_lanes_.ty(), new_vec_length);
       lane_vectors = 0;
       vectors = MutateArray(op->vectors, &lane_vectors);
       ramp_ = prev_ramp;
@@ -779,28 +795,28 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     PrimExpr value = this->VisitExpr(op->value);
 
     if (!indices.same_as(op->indices) || !value.same_as(op->value)) {
-      TVM_FFI_ICHECK(!op->buffer->dtype.is_scalable_vector())
+      TVM_FFI_ICHECK(!op->buffer->dtype.IsScalableVector())
           << "Vectorizing over scalable buffer elements is not supported in vectorizer.";
       // How many lanes of indexing are present in the index and
       // buffer element type, excluding the last index.
       int other_index_lanes = op->buffer->dtype.lanes();
       for (size_t i = 0; i < indices.size() - 1; i++) {
-        other_index_lanes *= indices[i].dtype().lanes();
+        other_index_lanes *= indices[i].ty().lanes();
         // Only allow the last index to be scalable
-        TVM_FFI_ICHECK(!indices[i].dtype().is_scalable_vector())
+        TVM_FFI_ICHECK(!indices[i].ty().IsScalableVector())
             << "Only the last index can be scalable.";
       }
 
       // The total number of lanes of indexing, including the last index.
-      auto last_index_dtype = indices[indices.size() - 1].dtype();
-      int lanes_in_last_index = last_index_dtype.get_lanes_or_vscale_factor();
+      PrimType last_index_dtype = indices[indices.size() - 1].ty();
+      int lanes_in_last_index = GetLanesOrVScaleFactor(last_index_dtype);
       int index_lanes = other_index_lanes * lanes_in_last_index;
 
       // The total number of lanes in this store operation.  Either
       // the index or the value will be broadcast out to this number
       // of lanes, depending on which has more lanes.
-      int value_dtype_lanes = value.dtype().get_lanes_or_vscale_factor();
-      bool is_last_index_scalable = last_index_dtype.is_scalable_vector();
+      int value_dtype_lanes = GetLanesOrVScaleFactor(value.ty());
+      bool is_last_index_scalable = last_index_dtype.IsScalableVector();
       int total_lanes = std::max(index_lanes, value_dtype_lanes);
 
       TVM_FFI_ICHECK_EQ(total_lanes % other_index_lanes, 0)
@@ -826,9 +842,9 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       LOG(WARNING) << "Detect vectorize inside vectorized loop, ignoring...";
     }
     TVM_FFI_ICHECK(is_zero(op->min));
-    TVM_FFI_ICHECK(!op->extent.dtype().is_scalable_or_fixed_length_vector());
+    TVM_FFI_ICHECK(!op->extent.ty().IsScalableVector() && !op->extent.ty().IsFixedLengthVector());
     PrimExpr extent = this->VisitExpr(op->extent);
-    if (extent.dtype().is_scalable_or_fixed_length_vector()) {
+    if (extent.ty().IsScalableVector() || extent.ty().IsFixedLengthVector()) {
       return Scalarize(ffi::GetRef<Stmt>(op));
     }
     Stmt body = this->VisitStmt(op->body);
@@ -843,7 +859,8 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
   }
   // IfThenElse
   Stmt VisitStmt_(const IfThenElseNode* op) final {
-    TVM_FFI_ICHECK(!op->condition.dtype().is_scalable_or_fixed_length_vector());
+    TVM_FFI_ICHECK(!op->condition.ty().IsScalableVector() &&
+                   !op->condition.ty().IsFixedLengthVector());
     PrimExpr condition = this->VisitExpr(op->condition);
     // need scalarize can be marked as true during visit of condition
     bool cond_need_scalarize = false;
@@ -857,7 +874,8 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     }
     // Check if we can rewrite the condition with predicated buffers
     if (EnableBufferLevelPredication(target_) &&
-        condition.dtype().is_scalable_or_fixed_length_vector() && !else_case.defined()) {
+        (condition.ty().IsScalableVector() || condition.ty().IsFixedLengthVector()) &&
+        !else_case.defined()) {
       std::pair<bool, Stmt> success_stmt_pair =
           TryPredicateBufferAccesses(TargetHasRVV(target_)).Run(then_case, condition);
       bool can_remove_if_then_else = success_stmt_pair.first;
@@ -866,7 +884,8 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       }
     }
 
-    if (cond_need_scalarize || condition.dtype().is_scalable_or_fixed_length_vector()) {
+    if (cond_need_scalarize || condition.ty().IsScalableVector() ||
+        condition.ty().IsFixedLengthVector()) {
       return Scalarize(ffi::GetRef<Stmt>(op));
     }
     if (condition.same_as(op->condition) && then_case.same_as(op->then_case) &&
@@ -893,9 +912,8 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     TVM_FFI_ICHECK(!let_binding_.count(op->var)) << "SSA violation, a single var is binded twice";
     let_binding_[op->var] = value;
 
-    if (value.dtype().get_lanes_or_vscale_factor() !=
-        op->value.dtype().get_lanes_or_vscale_factor()) {
-      Var new_var(op->var->name_hint, value.dtype());
+    if (GetLanesOrVScaleFactor(value.ty()) != GetLanesOrVScaleFactor(op->value.ty())) {
+      Var new_var(op->var->name_hint, value.ty());
       let_binding_[op->var] = new_var;
       return Bind(new_var, value);
     } else {
@@ -912,9 +930,9 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
 
   // scalarize the statment
   Stmt Scalarize(Stmt stmt) {
-    Var idx(var_->name_hint + ".s", var_->dtype);
+    Var idx(var_->name_hint + ".s", var_.ty());
     stmt = Substitute(stmt, {{var_, idx}});
-    return For(idx, IntImm(var_->dtype, 0), var_lanes_, ForKind::kSerial, stmt);
+    return For(idx, IntImm(var_.ty(), 0), var_lanes_, ForKind::kSerial, stmt);
   }
 
  private:
@@ -949,11 +967,11 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
       PrimExpr new_elem = this->VisitExpr(old_elem);
       if (!new_elem.same_as(old_elem)) changed = true;
       new_arr[i] = new_elem;
-      lanes = std::max(lanes, new_elem.dtype().lanes());
+      lanes = std::max(lanes, new_elem.ty().lanes());
     }
 
     for (size_t i = 0; i < arr.size(); ++i) {
-      if (new_arr[i].dtype().lanes() != lanes) {
+      if (new_arr[i].ty().lanes() != lanes) {
         new_arr[i] = BroadcastTo(new_arr[i], lanes, false);
         changed = true;
       }
@@ -969,10 +987,10 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (a.same_as(op->a) && b.same_as(op->b)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      int a_lanes = a.dtype().get_lanes_or_vscale_factor();
-      int b_lanes = b.dtype().get_lanes_or_vscale_factor();
+      int a_lanes = GetLanesOrVScaleFactor(a.ty());
+      int b_lanes = GetLanesOrVScaleFactor(b.ty());
       int lanes = std::max(a_lanes, b_lanes);
-      bool is_scalable = a.dtype().is_scalable_vector() || b.dtype().is_scalable_vector();
+      bool is_scalable = a.ty().IsScalableVector() || b.ty().IsScalableVector();
       return TOp(BroadcastTo(a, lanes, is_scalable), BroadcastTo(b, lanes, is_scalable));
     }
   }
@@ -983,21 +1001,21 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExp
     if (a.same_as(op->a) && b.same_as(op->b)) {
       return ffi::GetRef<PrimExpr>(op);
     } else {
-      int a_lanes = a.dtype().get_lanes_or_vscale_factor();
-      int b_lanes = b.dtype().get_lanes_or_vscale_factor();
+      int a_lanes = GetLanesOrVScaleFactor(a.ty());
+      int b_lanes = GetLanesOrVScaleFactor(b.ty());
       int lanes = std::max(a_lanes, b_lanes);
       if (lanes != 1) {
         const RampNode* b_ramp = b.as<RampNode>();
         const RampNode* a_ramp = a.as<RampNode>();
-        if (a.dtype().is_scalar() && b_ramp) {
+        if (a.ty().IsScalar() && b_ramp) {
           return Ramp(fcompute(a, b_ramp->base),
-                      fcompute(IntImm(b_ramp->stride.dtype(), 0), b_ramp->stride), b_ramp->lanes);
+                      fcompute(IntImm(b_ramp->stride.ty(), 0), b_ramp->stride), b_ramp->lanes);
         }
-        if (b.dtype().is_scalar() && a_ramp) {
+        if (b.ty().IsScalar() && a_ramp) {
           return Ramp(fcompute(a_ramp->base, b), a_ramp->stride, a_ramp->lanes);
         }
       }
-      bool is_scalable = a.dtype().is_scalable_vector() || b.dtype().is_scalable_vector();
+      bool is_scalable = a.ty().IsScalableVector() || b.ty().IsScalableVector();
       return fcompute(BroadcastTo(a, lanes, is_scalable), BroadcastTo(b, lanes, is_scalable));
     }
   }
@@ -1050,13 +1068,13 @@ class LoopVectorizer : public StmtMutator {
     // Match the existing TIRx scalable-vector convention.  LLVM/RVV still
     // selects the runtime vector length with vsetvli.
     static constexpr int kDefaultVScaleFactor = 4;
-    DataType index_dtype = op->loop_var->dtype;
+    PrimType index_dtype = op->loop_var.ty();
     PrimExpr zero = IntImm(index_dtype, 0);
     PrimExpr fixed_extent = IntImm(index_dtype, extent);
     PrimExpr scalable_lanes = CreateNewLanes(/*is_scalable=*/true, kDefaultVScaleFactor);
-    DataType lane_dtype = scalable_lanes.dtype();
+    PrimType lane_dtype = scalable_lanes.ty();
     PrimExpr scalable_lanes_index = scalable_lanes;
-    if (scalable_lanes_index.dtype() != index_dtype) {
+    if (scalable_lanes_index.ty() != index_dtype) {
       scalable_lanes_index = Cast(index_dtype, scalable_lanes_index);
     }
     PrimExpr num_chunks = ceildiv(fixed_extent, scalable_lanes_index);
@@ -1064,7 +1082,7 @@ class LoopVectorizer : public StmtMutator {
     Var outer(op->loop_var->name_hint + ".vla.o", index_dtype);
     Var inner(op->loop_var->name_hint + ".vla.i", lane_dtype);
     PrimExpr inner_index = inner;
-    if (inner_index.dtype() != index_dtype) {
+    if (inner_index.ty() != index_dtype) {
       inner_index = Cast(index_dtype, inner_index);
     }
     PrimExpr index = outer * scalable_lanes_index + inner_index;
diff --git a/src/topi/einsum.cc b/src/topi/einsum.cc
index 5d3a7936967b..b9610d5fcedd 100644
--- a/src/topi/einsum.cc
+++ b/src/topi/einsum.cc
@@ -127,7 +127,7 @@ PrimExpr GetIndexForBroadcastedDim(const Var& index, const PrimExpr& extent,
   // Check if current dimension is being broadcasted to `broadcasted_extent` (symbolic shape is
   // handled)
   if (is_one(extent) && !is_one(broadcasted_extent)) {
-    return IntImm(index.dtype(), 0);
+    return IntImm(index.ty(), 0);
   }
   return index;
 }
@@ -219,7 +219,7 @@ class EinsumBuilder {
     PrepareOutputIndicesMapping(indices, &label_to_index, &ellipsis_indices);
     PrepareReductionIndicesMapping(indices, &label_to_index, &ellipsis_indices, &reduce_axes);
 
-    auto zero = MakeConst(inputs[0]->dtype, 0);
+    auto zero = MakeConst(PrimType(inputs[0]->dtype), 0);
 
     PrimExpr result = zero;
     for (int i = 0, n = static_cast<int>(inputs.size()); i < n; ++i) {
@@ -288,9 +288,9 @@ class EinsumBuilder {
         }
       } else {
         // Normal label
-        reduction_axes->push_back(IterVar(
-            Range(0, label_to_extent_[label]),
-            Var(std::string(1, label), label_to_extent_[label].dtype()), IterVarType::kCommReduce));
+        reduction_axes->push_back(IterVar(Range(0, label_to_extent_[label]),
+                                          Var(std::string(1, label), label_to_extent_[label].ty()),
+                                          IterVarType::kCommReduce));
         label_to_index->emplace(label, reduction_axes->back()->var);
       }
     }
diff --git a/src/topi/elemwise.cc b/src/topi/elemwise.cc
index 922c40619908..4b9d26f276e1 100644
--- a/src/topi/elemwise.cc
+++ b/src/topi/elemwise.cc
@@ -92,11 +92,11 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                   })
       .def_packed("topi.cast",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
-                    *rv = cast(args[0].cast<te::Tensor>(), args[1].cast<DataType>());
+                    *rv = cast(args[0].cast<te::Tensor>(), args[1].cast<PrimType>());
                   })
       .def_packed("topi.reinterpret",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
-                    *rv = reinterpret(args[0].cast<te::Tensor>(), args[1].cast<DataType>());
+                    *rv = reinterpret(args[0].cast<te::Tensor>(), args[1].cast<PrimType>());
                   })
       .def_packed("topi.elemwise_sum",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
@@ -106,7 +106,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                                   ffi::Any* rv) { *rv = sign(args[0].cast<te::Tensor>()); })
       .def_packed("topi.full",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
-                    *rv = full(args[0].cast<ffi::Array<PrimExpr>>(), args[1].cast<DataType>(),
+                    *rv = full(args[0].cast<ffi::Array<PrimExpr>>(), args[1].cast<PrimType>(),
                                args[2].cast<PrimExpr>());
                   })
       .def_packed("topi.full_like",
diff --git a/src/topi/nn.cc b/src/topi/nn.cc
index e7b0d9c69e44..cd4968a46145 100644
--- a/src/topi/nn.cc
+++ b/src/topi/nn.cc
@@ -91,7 +91,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def_packed("topi.nn.dense", [](ffi::PackedArgs args, ffi::Any* rv) {
     *rv = nn::dense(args[0].cast<te::Tensor>(), args[1].cast<te::Tensor>(),
-                    args[2].cast<te::Tensor>(), args[3].cast<DataType>());
+                    args[2].cast<te::Tensor>(), args[3].cast<PrimType>());
   });
 }
 
diff --git a/src/topi/transform.cc b/src/topi/transform.cc
index f0d9225fb567..a9d994c2a883 100644
--- a/src/topi/transform.cc
+++ b/src/topi/transform.cc
@@ -86,11 +86,11 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                   })
       .def_packed("topi.shape",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
-                    *rv = shape(args[0].cast<te::Tensor>(), args[1].cast<DataType>());
+                    *rv = shape(args[0].cast<te::Tensor>(), args[1].cast<PrimType>());
                   })
       .def_packed("topi.tensor_size",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
-                    *rv = tensor_size(args[0].cast<te::Tensor>(), args[1].cast<DataType>());
+                    *rv = tensor_size(args[0].cast<te::Tensor>(), args[1].cast<PrimType>());
                   })
       .def_packed("topi.split",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
@@ -141,7 +141,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
       .def_packed("topi.arange",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
                     *rv = arange(args[0].cast<PrimExpr>(), args[1].cast<PrimExpr>(),
-                                 args[2].cast<PrimExpr>(), args[3].cast<DataType>());
+                                 args[2].cast<PrimExpr>(), args[3].cast<PrimType>());
                   })
       .def_packed("topi.meshgrid",
                   [](ffi::PackedArgs args, ffi::Any* rv) {
@@ -261,7 +261,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                   [](ffi::PackedArgs args, ffi::Any* rv) {
                     int depth = args[3].cast<int>();
                     int axis = args[4].cast<int>();
-                    DataType dtype = args[5].cast<DataType>();
+                    PrimType dtype = args[5].cast<PrimType>();
                     *rv = one_hot(args[0].cast<te::Tensor>(), args[1].cast<PrimExpr>(),
                                   args[2].cast<PrimExpr>(), depth, axis, dtype);
                   })
diff --git a/tests/cpp/arith_simplify_test.cc b/tests/cpp/arith_simplify_test.cc
index d5050446d6a5..a08968c9f954 100644
--- a/tests/cpp/arith_simplify_test.cc
+++ b/tests/cpp/arith_simplify_test.cc
@@ -99,7 +99,7 @@ TEST(AnalyzerObjectRef, CloneIsIndependent) {
 TEST(ConstantFold, Broadcast) {
   tvm::ffi::StructuralEqual checker;
   auto i32x4 = tvm::tirx::Broadcast(tvm::IntImm::Int32(10), 4);
-  auto i64x4 = tvm::cast(i32x4->dtype.with_bits(64), i32x4);
+  auto i64x4 = tvm::cast(i32x4.ty().WithBits(64), i32x4);  // expected to fold to a Broadcast
   auto i64x4_expected = tvm::tirx::Broadcast(tvm::IntImm::Int64(10), 4);
   ASSERT_TRUE(checker(i64x4, i64x4_expected));
 }
@@ -107,11 +107,11 @@ TEST(ConstantFold, Broadcast) {
 TEST(ConstantFold, Ramp) {
   tvm::ffi::StructuralEqual checker;
   auto i32x4 = tvm::tirx::Ramp(tvm::IntImm::Int32(10), tvm::IntImm::Int32(1), 4);
-  auto i64x4 = tvm::cast(i32x4->dtype.with_bits(64), i32x4);
+  auto i64x4 = tvm::cast(i32x4.ty().WithBits(64), i32x4);  // expected to fold to an int64 Ramp
   auto i64x4_expected = tvm::tirx::Ramp(tvm::IntImm::Int64(10), tvm::IntImm::Int64(1), 4);
   ASSERT_TRUE(checker(i64x4, i64x4_expected));
 
-  auto f32x4 = tvm::cast(tvm::DataType::Float(32, 4), i32x4);
-  auto f32x4_expected = tvm::tirx::Cast(tvm::DataType::Float(32, 4), i32x4);
+  auto f32x4 = tvm::cast(tvm::PrimType::Float(32, 4), i32x4);  // expected to stay a Cast node
+  auto f32x4_expected = tvm::tirx::Cast(tvm::PrimType::Float(32, 4), i32x4);
   ASSERT_TRUE(checker(f32x4, f32x4_expected));
 }
diff --git a/tests/cpp/expr_test.cc b/tests/cpp/expr_test.cc
index 2470dc25d6fd..f9083525732d 100644
--- a/tests/cpp/expr_test.cc
+++ b/tests/cpp/expr_test.cc
@@ -39,13 +39,22 @@ TEST(Expr, Basic) {
 TEST(Expr, VarTypeAnnotation) {
   using namespace tvm;
   using namespace tvm::tirx;
-  Var x("x", DataType::Float(32));
-  Var y("y", PrimType(DataType::Float(32)));
+  Var x("x", PrimType::Float(32));
+  Var y("y", PrimType::Float(32));  // built the same way as x
   tvm::ffi::StructuralEqual checker;
-  TVM_FFI_ICHECK(checker(x->dtype, y->dtype));
+  TVM_FFI_ICHECK(checker(x.ty(), y.ty()));  // expression types must compare equal
   TVM_FFI_ICHECK(checker(x->type_annotation, y->type_annotation));
 }
 
+TEST(Expr, PrimTypeBoolLanes) {  // checks what PrimType::Bool(4) reports about itself
+  using namespace tvm;
+  PrimType boolx4 = PrimType::Bool(4);  // lanes() is asserted to be 4 below
+  TVM_FFI_ICHECK(boolx4.IsFixedLengthVector());
+  TVM_FFI_ICHECK(boolx4.MatchesCode(DLDataTypeCode::kDLBool));
+  TVM_FFI_ICHECK_EQ(boolx4.lanes(), 4);
+  TVM_FFI_ICHECK(boolx4.MatchesElementType(DLDataTypeCode::kDLBool, 8));  // kDLBool, 8 bits
+}
+
 TEST(ExprNodeRef, Basic) {
   using namespace tvm;
   using namespace tvm::tirx;
diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc
index 2befce821d79..62f05fd90dbf 100644
--- a/tests/cpp/ir_functor_test.cc
+++ b/tests/cpp/ir_functor_test.cc
@@ -152,8 +152,8 @@ TEST(IRF, StmtVisitor) {
   auto fmaketest = [&]() {
     auto z = x + 1;
     Stmt eval_body = Evaluate(z);
-    DataType dtype = DataType::Float(32);
-    Var data_var("b", PointerType(PrimType(dtype)));
+    PrimType dtype = PrimType::Float(32);
+    Var data_var("b", PointerType(dtype));
     Buffer buf(data_var, dtype, {z, z}, {}, PrimExpr(), "b", 0, 0, BufferType::kDefault);
     // AllocBuffer is flat (no body). Return as SeqStmt with eval.
     return SeqStmt({AllocBuffer(buf), eval_body});
@@ -166,8 +166,8 @@ TEST(IRF, StmtVisitor) {
   {
     // tests for block and block_realize
     Stmt body = fmaketest();
-    DataType dtype = DataType::Float(32);
-    Var buf_var("b", PointerType(PrimType(dtype)));
+    PrimType dtype = PrimType::Float(32);
+    Var buf_var("b", PointerType(dtype));
     Buffer buffer = decl_buffer({16});
     body = SeqStmt({DeclBuffer(buffer), std::move(body)});
     BufferRegion buffer_region(buffer, {Range::FromMinExtent(x + 1, 1)});
@@ -206,8 +206,8 @@ TEST(IRF, StmtMutator) {
   };
   auto fmakealloc = [&]() {
     auto z = x + 1;
-    DataType dtype = DataType::Float(32);
-    Var data_var("b", PointerType(PrimType(dtype)));
+    PrimType dtype = PrimType::Float(32);
+    Var data_var("b", PointerType(dtype));
     Buffer buf(data_var, dtype, {1, z}, {}, PrimExpr(), "b", 0, 0, BufferType::kDefault);
     return AllocBuffer(buf);
   };
@@ -258,7 +258,7 @@ TEST(IRF, StmtMutator) {
 
   {
     auto body =
-        Evaluate(Call(DataType::Int(32), builtin::call_extern(), {StringImm("xyz"), x + 1}));
+        Evaluate(Call(PrimType::Int(32), builtin::call_extern(), {StringImm("xyz"), x + 1}));
     auto res = v(std::move(body));
     TVM_FFI_ICHECK(res.as<EvaluateNode>()->value.as<CallNode>()->args[1].same_as(x));
   }
@@ -330,13 +330,13 @@ TEST(IRF, StmtMutator) {
 TEST(IRF, Substitute) {
   using namespace tvm;
   using namespace tvm::tirx;
-  DataType dtype = DataType::Float(32);
-  Var x("x", PointerType(PrimType(dtype), ""));
-  Var n("n", DataType::Int(32));
+  PrimType dtype = PrimType::Float(32);
+  Var x("x", PointerType(dtype, ""));
+  Var n("n", PrimType::Int(32));
 
   auto fmakebuffer = [&]() {
     return Buffer{/*data=*/x,
-                  /*dtype=*/DataType::Float(32),
+                  /*dtype=*/PrimType::Float(32),
                   /*shape=*/{n},
                   /*strides=*/{},
                   /*elem_offset=*/PrimExpr(),
@@ -349,7 +349,7 @@ TEST(IRF, Substitute) {
   {
     // test substitute buffer data var and shape var via DeclBuffer
     Var y = x.copy_with_suffix("subst");
-    Var m("m", DataType::Int(32));
+    Var m("m", PrimType::Int(32));
     Buffer buffer = fmakebuffer();
     Stmt store = BufferStore(buffer, FloatImm(dtype, 0), {IntImm::Int32(0)});
     Stmt decl = SeqStmt({DeclBuffer(buffer), store});
diff --git a/tests/cpp/ndarray_test.cc b/tests/cpp/ndarray_test.cc
index c02efecc5148..79695a299d49 100644
--- a/tests/cpp/ndarray_test.cc
+++ b/tests/cpp/ndarray_test.cc
@@ -24,7 +24,7 @@
 using namespace tvm;
 
 TEST(TensorTest, IsContiguous_ContiguousStride) {
-  auto array = runtime::Tensor::Empty({5, 10}, DataType::Float(32), {kDLCPU});
+  auto array = runtime::Tensor::Empty({5, 10}, DLDataType{kDLFloat, 32, 1}, {kDLCPU});
   DLManagedTensor* managed_tensor = array.ToDLPack();
 
   int64_t strides[] = {10, 1};
@@ -36,7 +36,7 @@ TEST(TensorTest, IsContiguous_ContiguousStride) {
 }
 
 TEST(TensorTest, IsContiguous_NullStride) {
-  auto array = runtime::Tensor::Empty({5, 10}, DataType::Float(32), {kDLCPU});
+  auto array = runtime::Tensor::Empty({5, 10}, DLDataType{kDLFloat, 32, 1}, {kDLCPU});
   DLManagedTensor* managed_tensor = array.ToDLPack();
 
   managed_tensor->dl_tensor.strides = nullptr;
@@ -47,7 +47,7 @@ TEST(TensorTest, IsContiguous_NullStride) {
 }
 
 TEST(TensorTest, IsContiguous_AnyStrideForSingular) {
-  auto array = runtime::Tensor::Empty({5, 1, 10}, DataType::Float(32), {kDLCPU});
+  auto array = runtime::Tensor::Empty({5, 1, 10}, DLDataType{kDLFloat, 32, 1}, {kDLCPU});
   DLManagedTensor* managed_tensor = array.ToDLPack();
 
   int64_t strides[] = {10, 1, 1};  // strides[1] is normalized to 1 because shape[1] == 1.
@@ -60,7 +60,7 @@ TEST(TensorTest, IsContiguous_AnyStrideForSingular) {
 }
 
 TEST(TensorTest, IsContiguous_UncontiguousStride) {
-  auto array = runtime::Tensor::Empty({5, 1, 10}, DataType::Float(32), {kDLCPU});
+  auto array = runtime::Tensor::Empty({5, 1, 10}, DLDataType{kDLFloat, 32, 1}, {kDLCPU});
   DLManagedTensor* managed_tensor = array.ToDLPack();
 
   int64_t strides[] = {1, 1, 1};
diff --git a/tests/cpp/nested_msg_test.cc b/tests/cpp/nested_msg_test.cc
index 26bfee06f47d..96f645924382 100644
--- a/tests/cpp/nested_msg_test.cc
+++ b/tests/cpp/nested_msg_test.cc
@@ -18,11 +18,11 @@
  */
 
 #include <gtest/gtest.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/extra/structural_equal.h>
 #include <tvm/relax/block_builder.h>
 #include <tvm/relax/nested_msg.h>
 #include <tvm/relax/type.h>
-#include <tvm/runtime/data_type.h>
 #include <tvm/runtime/logging.h>
 #include <tvm/tirx/expr.h>
 
@@ -40,6 +40,17 @@
 using namespace tvm;
 using namespace tvm::relax;
 
+namespace {
+
+TensorType ScalarTensorType(PrimType dtype) {  // test helper: TensorType with ndim == 0
+  auto n = tvm::ffi::make_object<TensorTypeNode>();
+  n->dtype = std::move(dtype);
+  n->ndim = 0;  // only dtype and ndim are assigned; other fields are left as constructed
+  return TensorType(n);
+}
+
+}  // namespace
+
 TEST(NestedMsg, Basic) {
   // start with no annotation
   relax::Var x("x", std::nullopt), y("y", std::nullopt);
@@ -145,9 +156,9 @@ TEST(NestedMsg, Equal) {
 }
 
 TEST(NestedMsg, MapAndDecompose) {
-  relax::Var x("x", PrimType(runtime::DataType::Int(16)));
-  relax::Var y("y", PrimType(runtime::DataType::Int(32)));
-  relax::Var z("z", PrimType(runtime::DataType::Int(64)));
+  relax::Var x("x", PrimType::Int(16));
+  relax::Var y("y", PrimType::Int(32));
+  relax::Var z("z", PrimType::Int(64));
 
   BlockBuilder bb = BlockBuilder::Create(std::nullopt);
   relax::Expr t0 = bb->Normalize(Tuple({x, y}));
@@ -171,7 +182,7 @@ TEST(NestedMsg, MapAndDecompose) {
   auto output2 = MapToNestedMsg<IntImm>(GetType(t1), [&](Type ty) -> NestedMsg<IntImm> {
     const auto* prim_ty = ty.as<PrimTypeNode>();
     if (prim_ty == nullptr) return std::nullopt;
-    int bits = prim_ty->dtype.bits();
+    int bits = prim_ty->dtype.bits;
     if (bits == 16) return c0;
     if (bits == 32) return c1;
     if (bits == 64) return c2;
@@ -201,7 +212,7 @@ TEST(NestedMsg, MapAndDecompose) {
 }
 
 TEST(NestedMsg, MapToNestedMsgByType) {
-  auto sf0 = TensorType(DataType::Float(32), /*ndim=*/0);
+  auto sf0 = ScalarTensorType(PrimType::Float(32));
   auto sf1 = TupleType({sf0, sf0});
   auto sf2 = TupleType({sf0, sf0});
   auto x = relax::Var("x", TupleType({sf1, sf2, sf0}));
@@ -223,7 +234,7 @@ TEST(NestedMsg, MapToNestedMsgByType) {
 }
 
 TEST(NestedMsg, NestedMsgToExpr) {
-  auto sf0 = TensorType(DataType::Float(32), /*ndim=*/0);
+  auto sf0 = ScalarTensorType(PrimType::Float(32));
   auto sf1 = TupleType({sf0, sf0});
 
   auto c0 = IntImm::Int32(0);
@@ -306,7 +317,7 @@ TEST(NestedMsg, TransformTupleLeaf) {
   NInt msg1 = {c0, {c0, c1}, c2, {c0, {c1, c2}}};
   NInt msg2 = {c1, {c2, c0}, c2, {c1, {c2, c0}}};
 
-  PrimType s = PrimType(runtime::DataType::Int(32));
+  PrimType s = PrimType::Int(32);
   relax::Var x("x", s), y("y", s), z("z", s);
   BlockBuilder bb = BlockBuilder::Create(std::nullopt);
   Expr expr = bb->Normalize(Tuple({x, Tuple({x, x}), x, Tuple({x, Tuple({x, x})})}));
diff --git a/tests/cpp/pattern_match_test.cc b/tests/cpp/pattern_match_test.cc
index ab668e9a4204..e9075c6faf9f 100644
--- a/tests/cpp/pattern_match_test.cc
+++ b/tests/cpp/pattern_match_test.cc
@@ -27,9 +27,9 @@ TEST(Pattern, Basic) {
   using namespace tvm::tirx;
   using namespace tvm::arith;
   tvm::tirx::Var x("x"), y("y"), z("z");
-  PrimExpr scalable_lanes = Mul(Call(DataType::Int(32), builtin::vscale(), {}), 4);
+  PrimExpr scalable_lanes = Mul(Call(PrimType::Int(32), builtin::vscale(), {}), 4);
   arith::PVar<PrimExpr> px, py, pz;
-  arith::PVar<DataType> pt;
+  arith::PVar<PrimType> pt;
   arith::PVar<PrimExpr> planes;
   arith::PCallExpr<PVscaleOp> vscale;
 
@@ -101,14 +101,14 @@ TEST(Pattern, Basic) {
   // cast pattern
   {
     TVM_FFI_ICHECK(
-        !cast(PConst<DataType>(DataType::Int(32)), px).Match(tirx::Cast(DataType::Float(64), x)));
-    TVM_FFI_ICHECK(cast(pt, px).Match(tirx::Cast(DataType::Float(64), x)));
-    TVM_FFI_ICHECK(pt.Eval() == DataType::Float(64));
+        !cast(PConst<PrimType>(PrimType::Int(32)), px).Match(tirx::Cast(PrimType::Float(64), x)));
+    TVM_FFI_ICHECK(cast(pt, px).Match(tirx::Cast(PrimType::Float(64), x)));
+    TVM_FFI_ICHECK(pt.Eval() == PrimType::Float(64));
     auto zz = cast(pt, px).Eval();
     TVM_FFI_ICHECK(
         (cast(pt, px) - cast(pt, py))
-            .Match(tirx::Cast(DataType::Float(64), x) - tirx::Cast(DataType::Int(64), x)));
-    auto expr = tirx::Cast(DataType::Int(32), tirx::Cast(DataType::Float(64), x));
+            .Match(tirx::Cast(PrimType::Float(64), x) - tirx::Cast(PrimType::Int(64), x)));
+    auto expr = tirx::Cast(PrimType::Int(32), tirx::Cast(PrimType::Float(64), x));
     TVM_FFI_ICHECK(!(cast(pt, cast(pt, px))).Match(expr));
   }
   // ramp pattern
@@ -150,21 +150,21 @@ TEST(Pattern, IntImm) {
 TEST(Pattern, MatchWithType) {
   using namespace tvm;
   // match expr with specified dtype
-  arith::PVarWithDataType<PrimExpr, arith::PConst<DataType>> pat(DataType::Float(32));
-  tirx::Var x("x", DataType::Float(32));
-  tirx::Var y("y", DataType::Float(32));
-  tirx::Var x_int("x", DataType::Int(32));
-  tirx::Var y_int("y", DataType::Int(32));
+  arith::PVarWithDataType<PrimExpr, arith::PConst<PrimType>> pat(PrimType::Float(32));
+  tirx::Var x("x", PrimType::Float(32));  // float32 operands: expected to match pat
+  tirx::Var y("y", PrimType::Float(32));
+  tirx::Var x_int("x", PrimType::Int(32));  // int32 operands: expected not to match pat
+  tirx::Var y_int("y", PrimType::Int(32));
   TVM_FFI_ICHECK(pat.Match(x + y * 2.0f));
   TVM_FFI_ICHECK(!pat.Match(x_int + y_int * 2));
 
   // match vectorized expr with specified element dtype
-  arith::PVecDataType vec_ty(DataType::Float(32));
+  arith::PVecDataType vec_ty(PrimType::Float(32));
   arith::PVarWithDataType<PrimExpr, arith::PVecDataType> vpat(vec_ty);
-  tirx::Var vx = tirx::Var("x", DataType::Float(32, 8));
-  tirx::Var vy("y", DataType::Float(32, 8));
-  tirx::Var vx_int("x", DataType::Int(32, 8));
-  tirx::Var vy_int("y", DataType::Int(32, 8));
+  tirx::Var vx = tirx::Var("x", PrimType::Float(32, 8));  // float32 x 8: expected to match vpat
+  tirx::Var vy("y", PrimType::Float(32, 8));
+  tirx::Var vx_int("x", PrimType::Int(32, 8));  // int32 x 8: expected not to match vpat
+  tirx::Var vy_int("y", PrimType::Int(32, 8));
   TVM_FFI_ICHECK(vpat.Match(vx + vy * tirx::Broadcast(2.0f, 8)));
   TVM_FFI_ICHECK(!vpat.Match(vx_int + vy_int * tirx::Broadcast(2, 8)));
 }
diff --git a/tests/cpp/te_compute_test.cc b/tests/cpp/te_compute_test.cc
index 6f1e6aa9b8cc..30397fb765bb 100644
--- a/tests/cpp/te_compute_test.cc
+++ b/tests/cpp/te_compute_test.cc
@@ -27,8 +27,8 @@ TEST(Tensor, Basic) {
 
   Var m("m"), n("n"), l("l");
 
-  Tensor A = placeholder({m, l}, DataType::Float(32), "A");
-  Tensor B = placeholder({n, l}, DataType::Float(32), "B");
+  Tensor A = placeholder({m, l}, PrimType::Float(32), "A");
+  Tensor B = placeholder({n, l}, PrimType::Float(32), "B");
 
   auto C = compute({m, n}, [&](Var i, Var j) { return A[i][j]; }, "C");
 
@@ -40,8 +40,8 @@ TEST(Tensor, Reduce) {
   using namespace tvm::te;
 
   Var m("m"), n("n"), l("l");
-  te::Tensor A = te::placeholder({m, l}, DataType::Float(32), "A");
-  te::Tensor B = te::placeholder({n, l}, DataType::Float(32), "B");
+  te::Tensor A = te::placeholder({m, l}, PrimType::Float(32), "A");
+  te::Tensor B = te::placeholder({n, l}, PrimType::Float(32), "B");
   IterVar rv = reduce_axis(Range{0, l}, "k");
 
   auto C = te::compute(
@@ -53,5 +53,5 @@ TEST(Tensor, Indexing) {
   using namespace tvm::te;
 
   Var x("x"), y("y");
-  te::Tensor A = te::placeholder({x, y}, DataType::Float(32), "A");
+  te::Tensor A = te::placeholder({x, y}, PrimType::Float(32), "A");
 }
diff --git a/tests/cpp/tir_analysis_side_effect.cc b/tests/cpp/tir_analysis_side_effect.cc
index bcc7128647b4..1183f37abee6 100644
--- a/tests/cpp/tir_analysis_side_effect.cc
+++ b/tests/cpp/tir_analysis_side_effect.cc
@@ -25,11 +25,11 @@
 
 TEST(SimplePasses, SideEffect) {
   using namespace tvm;
-  auto buf = tirx::decl_buffer({16}, DataType::Float(32));
-  auto i = tirx::Var("i", DataType::Int(32));
+  auto buf = tirx::decl_buffer({16}, PrimType::Float(32));  // float32 buffer, shape {16}
+  auto i = tirx::Var("i", PrimType::Int(32));  // int32 index used by every case below
   TVM_FFI_ICHECK(tirx::SideEffect(tirx::BufferLoad(buf, {i})) == tirx::CallEffectKind::kReadState);
-  TVM_FFI_ICHECK(tirx::SideEffect(exp(tirx::Cast(DataType::Float(32), i + 1))) ==
+  TVM_FFI_ICHECK(tirx::SideEffect(exp(tirx::Cast(PrimType::Float(32), i + 1))) ==
                  tirx::CallEffectKind::kPure);
-  TVM_FFI_ICHECK(tirx::SideEffect(tirx::Call(DataType::Handle(), tirx::builtin::tvm_storage_sync(),
+  TVM_FFI_ICHECK(tirx::SideEffect(tirx::Call(PrimType::Handle(), tirx::builtin::tvm_storage_sync(),
                                              {})) == tirx::CallEffectKind::kUpdateState);
 }
diff --git a/tests/cpp/tir_scalable_datatype.cc b/tests/cpp/tir_scalable_datatype.cc
index a81915c74b97..015bae4564a1 100644
--- a/tests/cpp/tir_scalable_datatype.cc
+++ b/tests/cpp/tir_scalable_datatype.cc
@@ -19,7 +19,7 @@
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include <tvm/runtime/data_type.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/script/printer/printer.h>
 #include <tvm/tirx/builtin.h>
 #include <tvm/tirx/expr.h>
@@ -33,67 +33,68 @@
 using ::testing::HasSubstr;
 
 // ---------
-// Data Type
+// Prim Type
 // ---------
-TEST(ScalableDataType, TestCreateScalableType) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 4, true);
+TEST(ScalablePrimType, TestCreateScalableType) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 4);
   ASSERT_EQ(scalable_type.code(), kDLInt);
   ASSERT_EQ(scalable_type.bits(), 32);
-  ASSERT_EQ(scalable_type.vscale_factor(), 4);
-  ASSERT_TRUE(scalable_type.is_scalable_vector());
-  ASSERT_TRUE(scalable_type.is_scalable_or_fixed_length_vector());
+  ASSERT_EQ(scalable_type.VScaleFactor(), 4);  // third ScalableVector() argument
+  ASSERT_TRUE(scalable_type.IsScalableVector());  // already implies the combined check below
+  ASSERT_TRUE(scalable_type.IsScalableVector() || scalable_type.IsFixedLengthVector());
 }
 
-TEST(ScalableDataType, TestScalableWithBits) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 1, 8, true);
-  scalable_type = scalable_type.with_bits(32);
+TEST(ScalablePrimType, TestScalableWithBits) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 1, 8);
+  scalable_type = scalable_type.WithBits(32);  // built with 1 bit, then changed to 32
   ASSERT_EQ(scalable_type.bits(), 32);
-  ASSERT_TRUE(scalable_type.is_scalable_vector());
-  ASSERT_TRUE(scalable_type.is_scalable_or_fixed_length_vector());
+  ASSERT_TRUE(scalable_type.IsScalableVector());  // must still be scalable after WithBits
+  ASSERT_TRUE(scalable_type.IsScalableVector() || scalable_type.IsFixedLengthVector());
 }
 
-TEST(ScalableDataType, TestScalableWithVscaleFactor) {
-  tvm::DataType type = tvm::DataType(kDLInt, 32, 1);
-  tvm::DataType scalable_type = type.with_scalable_vscale_factor(4);
-  ASSERT_EQ(scalable_type.vscale_factor(), 4);
-  ASSERT_TRUE(scalable_type.is_scalable_vector());
-  ASSERT_TRUE(scalable_type.is_scalable_or_fixed_length_vector());
+TEST(ScalablePrimType, TestScalableWithVscaleFactor) {
+  tvm::PrimType type = tvm::PrimType::Int(32);  // supplies code() and bits() for the next line
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(type.code(), type.bits(), 4);
+  ASSERT_EQ(scalable_type.VScaleFactor(), 4);
+  ASSERT_TRUE(scalable_type.IsScalableVector());
+  ASSERT_TRUE(scalable_type.IsScalableVector() || scalable_type.IsFixedLengthVector());
 }
 
-TEST(ScalableDataType, TestAssignScalableDataType) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 2, true);
-  tvm::DataType scalable_type_copy = scalable_type;
-  ASSERT_TRUE(scalable_type_copy.is_scalable_vector());
-  ASSERT_TRUE(scalable_type_copy.is_scalable_or_fixed_length_vector());
+TEST(ScalablePrimType, TestAssignScalablePrimType) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 2);
+  tvm::PrimType scalable_type_copy = scalable_type;  // only the copy is checked below
+  ASSERT_TRUE(scalable_type_copy.IsScalableVector());
+  ASSERT_TRUE(scalable_type_copy.IsScalableVector() || scalable_type_copy.IsFixedLengthVector());
 }
 
-TEST(ScalableDataType, TestScalableDataTypeEquality) {
-  ASSERT_TRUE(tvm::DataType(kDLInt, 32, 4, true) == tvm::DataType(kDLInt, 32, 4, true));
+TEST(ScalablePrimType, TestScalablePrimTypeEquality) {
+  ASSERT_TRUE(tvm::PrimType::ScalableVector(kDLInt, 32, 4) ==  // same arguments on both sides
+              tvm::PrimType::ScalableVector(kDLInt, 32, 4));
 }
 
-TEST(ScalableDataType, TestScalableDataTypeAndNonScalableDataTypeInequality) {
-  ASSERT_FALSE(tvm::DataType(kDLInt, 32, 4, true) == tvm::DataType(kDLInt, 32, 4));
+TEST(ScalablePrimType, TestScalablePrimTypeAndNonScalablePrimTypeInequality) {  // vscale vs fixed
+  ASSERT_FALSE(tvm::PrimType::ScalableVector(kDLInt, 32, 4) == tvm::PrimType::Int(32, 4));
 }
 
-TEST(ScalableDataType, TestIsScalar) {
-  ASSERT_FALSE(tvm::DataType(kDLInt, 32, 4, true).is_scalar());
-  ASSERT_TRUE(tvm::DataType(kDLInt, 32, 1, false).is_scalar());
-  ASSERT_FALSE(tvm::DataType(kDLInt, 32, 4, false).is_scalar());
-  ASSERT_FALSE(tvm::DataType(kDLOpaqueHandle, 1, 0, false).is_scalar());
+TEST(ScalablePrimType, TestIsScalar) {
+  ASSERT_FALSE(tvm::PrimType::ScalableVector(kDLInt, 32, 4).IsScalar());  // scalable vector
+  ASSERT_TRUE(tvm::PrimType::Int(32).IsScalar());  // the only case expected to be scalar
+  ASSERT_FALSE(tvm::PrimType::Int(32, 4).IsScalar());  // second argument 4
+  ASSERT_FALSE(tvm::PrimType::Void().IsScalar());  // Void() is expected not to be scalar
 }
 
-TEST(ScalableDataType, TestScalableDataTypeToString) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 4, true);
-  EXPECT_EQ(tvm::ffi::DLDataTypeToString(scalable_type), "int32xvscalex4");
+TEST(ScalablePrimType, TestScalablePrimTypeToString) {  // stringifies scalable_type->dtype
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 4);
+  EXPECT_EQ(tvm::ffi::DLDataTypeToString(scalable_type->dtype), "int32xvscalex4");
 }
 
-TEST(ScalableDataType, TestStringToScalableDataType) {
+TEST(ScalablePrimType, TestStringToScalablePrimType) {  // parse, wrap in PrimType, compare
   std::string scalable_type_str = "int32xvscalex4";
-  EXPECT_EQ(tvm::DataType(tvm::ffi::StringToDLDataType(scalable_type_str)),
-            tvm::DataType(kDLInt, 32, 4, true));
+  EXPECT_EQ(tvm::PrimType(tvm::ffi::StringToDLDataType(scalable_type_str)),
+            tvm::PrimType::ScalableVector(kDLInt, 32, 4));
 }
 
-TEST(ScalableDataType, TestInvalidStringToScalableDataType) {
+TEST(ScalablePrimType, TestInvalidStringToScalablePrimType) {
   std::string scalable_type_str = "int32x4xvscale";
   EXPECT_THROW(
       {
@@ -107,12 +108,13 @@ TEST(ScalableDataType, TestInvalidStringToScalableDataType) {
       tvm::ffi::Error);
 }
 
-TEST(ScalableDataType, TestGetScalableVectorBytes) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 4, true);
+TEST(ScalablePrimType, TestGetScalableVectorBytes) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 4);
   EXPECT_THROW(
       {
         try {
-          tvm::runtime::GetVectorBytes(scalable_type);
+          int bytes = (scalable_type.bits() * scalable_type.lanes() + 7) / 8;
+          static_cast<void>(bytes);
         } catch (const tvm::ffi::Error& e) {
           EXPECT_THAT(e.what(),
                       HasSubstr("Can't fetch the lanes of a scalable vector at a compile time"));
@@ -122,11 +124,11 @@ TEST(ScalableDataType, TestGetScalableVectorBytes) {
       tvm::ffi::Error);
 }
 
-TEST(ScalableDataType, TestScalableDataTypeInvalidLanesError) {
+TEST(ScalablePrimType, TestScalablePrimTypeInvalidLanesError) {
   EXPECT_THROW(
       {
         try {
-          tvm::DataType(kDLFloat, 62, 1, true);
+          tvm::PrimType::ScalableVector(kDLFloat, 62, 1);
         } catch (const tvm::ffi::Error& e) {
           EXPECT_THAT(e.what(), HasSubstr("Invalid value for vscale factor"));
           throw;
@@ -135,14 +137,14 @@ TEST(ScalableDataType, TestScalableDataTypeInvalidLanesError) {
       tvm::ffi::Error);
 }
 
-TEST(ScalableDataType, TestScalableDataTypeInvalidVscaleFactorAccess) {
-  tvm::DataType fixed_length_type = tvm::DataType(kDLFloat, 32, 4);
-  ASSERT_TRUE(fixed_length_type.is_fixed_length_vector());
-  ASSERT_TRUE(fixed_length_type.is_scalable_or_fixed_length_vector());
+TEST(ScalablePrimType, TestScalablePrimTypeInvalidVscaleFactorAccess) {
+  tvm::PrimType fixed_length_type = tvm::PrimType::Float(32, 4);
+  ASSERT_TRUE(fixed_length_type.IsFixedLengthVector());
+  ASSERT_TRUE(fixed_length_type.IsScalableVector() || fixed_length_type.IsFixedLengthVector());
   EXPECT_THROW(
       {
         try {
-          fixed_length_type.vscale_factor();
+          fixed_length_type.VScaleFactor();
         } catch (const tvm::ffi::Error& e) {
           EXPECT_THAT(e.what(), HasSubstr("A fixed length vector doesn't have a vscale factor"));
           throw;
@@ -151,8 +153,8 @@ TEST(ScalableDataType, TestScalableDataTypeInvalidVscaleFactorAccess) {
       tvm::ffi::Error);
 }
 
-TEST(ScalableDataType, TestScalableDataTypeInvalidLanesAccess) {
-  tvm::DataType scalable_type = tvm::DataType(kDLFloat, 32, 4, true);
+TEST(ScalablePrimType, TestScalablePrimTypeInvalidLanesAccess) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLFloat, 32, 4);
   EXPECT_THROW(
       {
         try {
@@ -166,28 +168,28 @@ TEST(ScalableDataType, TestScalableDataTypeInvalidLanesAccess) {
       tvm::ffi::Error);
 }
 
-TEST(ScalableDataType, TestScalableBool) {
-  tvm::DataType scalable_type = tvm::DataType::Bool(4, true);
+TEST(ScalablePrimType, TestScalableBool) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLBool, 8, 4);
   ASSERT_EQ(scalable_type.code(), kDLBool);
   ASSERT_EQ(scalable_type.bits(), 8);
-  ASSERT_EQ(scalable_type.vscale_factor(), 4);
-  ASSERT_TRUE(scalable_type.is_scalable_vector());
+  ASSERT_EQ(scalable_type.VScaleFactor(), 4);
+  ASSERT_TRUE(scalable_type.IsScalableVector());
 }
 
-TEST(ScalableDataType, TestScalableUInt) {
-  tvm::DataType scalable_type = tvm::DataType::UInt(1, 4, true);
+TEST(ScalablePrimType, TestScalableUInt) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLUInt, 1, 4);
   ASSERT_EQ(scalable_type.code(), kDLUInt);
   ASSERT_EQ(scalable_type.bits(), 1);
-  ASSERT_EQ(scalable_type.vscale_factor(), 4);
-  ASSERT_TRUE(scalable_type.is_scalable_vector());
+  ASSERT_EQ(scalable_type.VScaleFactor(), 4);
+  ASSERT_TRUE(scalable_type.IsScalableVector());
 }
 
 // -----------
 // Integration
 // -----------
 #ifdef TVM_LLVM_VERSION
-TEST(ScalableDataType, TestScalableIntrinCall) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 4, true);
+TEST(ScalablePrimType, TestScalableIntrinCall) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 4);
   tvm::tirx::Call call = tvm::tirx::Call(scalable_type, tvm::tirx::builtin::call_llvm_intrin(),
 #if TVM_LLVM_VERSION >= 200
                                          {tvm::IntImm::Int32(::llvm::Intrinsic::stepvector)});
@@ -195,7 +197,7 @@ TEST(ScalableDataType, TestScalableIntrinCall) {
                                          {tvm::IntImm::Int32(
                                              ::llvm::Intrinsic::experimental_stepvector)});
 #endif
-  ASSERT_EQ(call->dtype, scalable_type);
+  ASSERT_EQ(tvm::PrimType(call.ty()->dtype), scalable_type);
   ASSERT_EQ(tvm::Script(call),
 #if TVM_LLVM_VERSION >= 200
             "T.call_llvm_intrin(\"int32xvscalex4\", \"llvm.stepvector\")");
@@ -205,7 +207,7 @@ TEST(ScalableDataType, TestScalableIntrinCall) {
 }
 #endif
 
-TEST(ScalableDataType, TestTIRScriptScalableDtype2Str) {
-  tvm::DataType scalable_type = tvm::DataType(kDLInt, 32, 4, true);
-  ASSERT_EQ(tvm::script::printer::DType2Str(scalable_type), "int32xvscalex4");
+TEST(ScalablePrimType, TestTIRScriptScalableDtype2Str) {
+  tvm::PrimType scalable_type = tvm::PrimType::ScalableVector(kDLInt, 32, 4);
+  ASSERT_EQ(tvm::script::printer::DType2Str(scalable_type->dtype), "int32xvscalex4");
 }
diff --git a/tests/cpp/topi_ewise_test.cc b/tests/cpp/topi_ewise_test.cc
index 9f4457de5192..41b48ed3c5be 100644
--- a/tests/cpp/topi_ewise_test.cc
+++ b/tests/cpp/topi_ewise_test.cc
@@ -26,7 +26,7 @@ namespace topi {
 TEST(Tensor, Basic) {
   using namespace tvm;
   Var m("m"), l("l");
-  Tensor A = placeholder({m, l}, DataType::Float(32), "A");
+  Tensor A = placeholder({m, l}, PrimType::Float(32), "A");
   auto C = topi::exp(A);
 }
 }  // namespace topi
diff --git a/tests/python/codegen/test_target_codegen_llvm.py b/tests/python/codegen/test_target_codegen_llvm.py
index 624d587b825f..186b2c6de318 100644
--- a/tests/python/codegen/test_target_codegen_llvm.py
+++ b/tests/python/codegen/test_target_codegen_llvm.py
@@ -1145,7 +1145,7 @@ def test_call_packed_returning_void():
 
     The LLVM codegen uses the CallNode's dtype to cast the return type
     of the PackedFunc into the appropriate LLVM output type.  However,
-    there is no API type for `DataType::Void()`.  When the return type
+    there is no runtime dtype value for a void return.  When the return type
     of a PackedFunc is void, the generated code should not attempt to
     read the return value.
 
diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py
index 33d37ccdd372..abc814e52346 100644
--- a/tests/python/contrib/test_sort.py
+++ b/tests/python/contrib/test_sort.py
@@ -56,11 +56,13 @@ def test_sort():
     dev = tvm.cpu(0)
     target = "llvm"
     f = tvm.compile(te.create_prim_func([data, sort_num, out]), target=target)
-    a = tvm.runtime.tensor(np.array(input_data).astype(data.dtype), dev)
-    b = tvm.runtime.tensor(np.array(sort_num_input).astype(sort_num.dtype), dev)
-    c = tvm.runtime.tensor(np.zeros(a.shape, dtype=out.dtype), dev)
+    a = tvm.runtime.tensor(np.array(input_data).astype(data.dtype.dtype), dev)
+    b = tvm.runtime.tensor(np.array(sort_num_input).astype(sort_num.dtype.dtype), dev)
+    c = tvm.runtime.tensor(np.zeros(a.shape, dtype=out.dtype.dtype), dev)
     f(a, b, c)
-    tvm.testing.assert_allclose(c.numpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
+    tvm.testing.assert_allclose(
+        c.numpy(), np.array(sorted_index).astype(out.dtype.dtype), rtol=1e-5
+    )
 
 
 def test_sort_np():
@@ -88,9 +90,9 @@ def test_sort_np():
     np_data = np.random.uniform(size=dshape)
     np_out = np.argsort(np_data, axis=axis)
     sort_num_input = np.full(reduced_shape, dshape[axis])
-    a = tvm.runtime.tensor(np.array(np_data).astype(data.dtype), dev)
-    b = tvm.runtime.tensor(np.array(sort_num_input).astype(sort_num.dtype), dev)
-    c = tvm.runtime.tensor(np.zeros(a.shape, dtype=out.dtype), dev)
+    a = tvm.runtime.tensor(np.array(np_data).astype(data.dtype.dtype), dev)
+    b = tvm.runtime.tensor(np.array(sort_num_input).astype(sort_num.dtype.dtype), dev)
+    c = tvm.runtime.tensor(np.zeros(a.shape, dtype=out.dtype.dtype), dev)
     f(a, b, c)
     tvm.testing.assert_allclose(c.numpy(), np_out, rtol=1e-5)
 
diff --git a/tests/python/relax/frontend_nn_extern_module.cc b/tests/python/relax/frontend_nn_extern_module.cc
index 1ca2b300d3a8..33755cce581b 100644
--- a/tests/python/relax/frontend_nn_extern_module.cc
+++ b/tests/python/relax/frontend_nn_extern_module.cc
@@ -22,19 +22,18 @@
  */
 #include <dlpack/dlpack.h>
 #include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/dtype.h>
 #include <tvm/ffi/function.h>
-#include <tvm/runtime/data_type.h>
 
 namespace {
 
 int _scalar_add(DLTensor* a, DLTensor* b, DLTensor* c) {
-  using namespace tvm::runtime;
   TVM_FFI_ICHECK(a->ndim == 0);
   TVM_FFI_ICHECK(b->ndim == 0);
   TVM_FFI_ICHECK(c->ndim == 0);
-  TVM_FFI_ICHECK(DataType(a->dtype) == DataType::Float(32));
-  TVM_FFI_ICHECK(DataType(b->dtype) == DataType::Float(32));
-  TVM_FFI_ICHECK(DataType(c->dtype) == DataType::Float(32));
+  TVM_FFI_ICHECK((a->dtype == DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK((b->dtype == DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK((c->dtype == DLDataType{kDLFloat, 32, 1}));
   float* a_data = static_cast<float*>(a->data);
   float* b_data = static_cast<float*>(b->data);
   float* c_data = static_cast<float*>(c->data);
@@ -43,13 +42,12 @@ int _scalar_add(DLTensor* a, DLTensor* b, DLTensor* c) {
 }
 
 int _test_sym(DLTensor* a, DLTensor* b, DLTensor* c) {
-  using namespace tvm::runtime;
   TVM_FFI_ICHECK(a->ndim == 3);  // [x, y, 1]
   TVM_FFI_ICHECK(b->ndim == 3);  // [y, z, 5]
   TVM_FFI_ICHECK(c->ndim == 4);  // [x, y, z, 9]
-  TVM_FFI_ICHECK(DataType(a->dtype) == DataType::Float(32));
-  TVM_FFI_ICHECK(DataType(b->dtype) == DataType::Float(32));
-  TVM_FFI_ICHECK(DataType(c->dtype) == DataType::Float(32));
+  TVM_FFI_ICHECK((a->dtype == DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK((b->dtype == DLDataType{kDLFloat, 32, 1}));
+  TVM_FFI_ICHECK((c->dtype == DLDataType{kDLFloat, 32, 1}));
   int x = a->shape[0];
   int y = a->shape[1];
   int z = b->shape[1];
diff --git a/tests/python/relax/test_analysis_well_formed.py b/tests/python/relax/test_analysis_well_formed.py
index a123dfe75e29..b13e87ea3fe4 100644
--- a/tests/python/relax/test_analysis_well_formed.py
+++ b/tests/python/relax/test_analysis_well_formed.py
@@ -662,7 +662,7 @@ def test_pass_dltensor_arg_to_tir():
 
     In TIR, a `DLTensor*` argument with unknown shape and dtype is
     represented as a `tirx.Var` with
-    `tvm::PrimType(DataType::Handle())`, and with no entry in the
+    `tvm::PrimType::Handle()`, and with no entry in the
     `PrimFuncNode::buffer_map`.  In Relax, this is represented as
     `R.Tensor`.  Calls from Relax to TIR that pass a tensor of unknown
     rank/shape are well-formed.
diff --git a/tests/python/tirx-base/test_tir_buffer.py b/tests/python/tirx-base/test_tir_buffer.py
index bcdd0830a7f3..1fb83a8bb9cb 100644
--- a/tests/python/tirx-base/test_tir_buffer.py
+++ b/tests/python/tirx-base/test_tir_buffer.py
@@ -33,7 +33,7 @@ def test_buffer():
     Bb = tvm.tirx.decl_buffer((n, l), "float32")
 
     assert isinstance(Ab, tvm.tirx.Buffer)
-    assert Ab.dtype == "float32"
+    assert Ab.dtype == tvm.ir.PrimType("float32")
     assert tuple(Ab.shape) == (m, n)
 
 
@@ -43,7 +43,7 @@ def test_buffer_access_ptr():
     Ab = tvm.tirx.decl_buffer((m, n), "float32", strides=[n + 1, 1])
     aptr = Ab.access_ptr("rw")
     tvm.ir.assert_structural_equal(aptr.args[3], Ab.strides[0] * m)
-    assert aptr.args[0].dtype == Ab.dtype
+    assert aptr.args[0].dtype == Ab.dtype.dtype
     assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
     aptr = Ab.access_ptr("w")
     assert aptr.args[4].value == Buffer.WRITE
diff --git a/tests/python/tirx-base/test_tir_intrin.py b/tests/python/tirx-base/test_tir_intrin.py
index db4a42f2584e..43cf7fa2ebb6 100644
--- a/tests/python/tirx-base/test_tir_intrin.py
+++ b/tests/python/tirx-base/test_tir_intrin.py
@@ -48,8 +48,8 @@ def test_nearbyint():
 
     dev = tvm.cpu(0)
     n = 10
-    a = tvm.runtime.tensor(np.random.uniform(high=100, size=n).astype(A.dtype), dev)
-    a_rounded = tvm.runtime.tensor(np.random.uniform(size=n).astype(A_rounded.dtype), dev)
+    a = tvm.runtime.tensor(np.random.uniform(high=100, size=n).astype(A.dtype.dtype), dev)
+    a_rounded = tvm.runtime.tensor(np.random.uniform(size=n).astype(A_rounded.dtype.dtype), dev)
     func(a, a_rounded)
     # Note that numpys rint rounds to nearest integer with
     # ties to halfway is broken by rounding to even.
@@ -125,8 +125,8 @@ def run_test(tvm_intrin, np_func, atol=1e-5, rtol=1e-5):
 
         dev = tvm.cpu(0)
         n = 10
-        a = tvm.runtime.tensor(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype), dev)
-        b = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev)
+        a = tvm.runtime.tensor(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype.dtype), dev)
+        b = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype.dtype), dev)
         func(a, b)
         tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=atol, rtol=rtol)
 
@@ -140,7 +140,7 @@ def run_test(tvm_intrin, np_func, atol=1e-5, rtol=1e-5):
                     np.random.uniform(1.1, 2.0, size=n // 2),
                     np.random.uniform(-2.0, -1.1, size=n // 2),
                 ]
-            ).astype(A.dtype)
+            ).astype(A.dtype.dtype)
             a2 = tvm.runtime.tensor(out_np, dev)
             b2 = tvm.runtime.tensor(np.empty_like(out_np), dev)
             func(a2, b2)
@@ -148,7 +148,7 @@ def run_test(tvm_intrin, np_func, atol=1e-5, rtol=1e-5):
             assert np.all(np.isnan(b2.numpy()))
         if name == "exp":
             n = 8
-            out_np = np.random.randint(-20, 20, size=n).astype(A.dtype)
+            out_np = np.random.randint(-20, 20, size=n).astype(A.dtype.dtype)
             a2 = tvm.runtime.tensor(out_np, dev)
             b2 = tvm.runtime.tensor(np.empty_like(out_np), dev)
             func(a2, b2)
@@ -239,9 +239,9 @@ def run_test(tvm_intrin, np_func):
 
         dev = tvm.cpu(0)
         n = 10
-        a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
-        b = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(B.dtype), dev)
-        c = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev)
+        a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype.dtype), dev)
+        b = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(B.dtype.dtype), dev)
+        c = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype.dtype), dev)
         func(a, b, c)
         tvm.testing.assert_allclose(c.numpy(), np_func(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5)
 
@@ -266,9 +266,9 @@ def test_ldexp():
 
     dev = tvm.cpu(0)
     n = 10
-    a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
-    b = tvm.runtime.tensor(np.random.randint(0, 5, size=n).astype(B.dtype), dev)
-    c = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype), dev)
+    a = tvm.runtime.tensor(np.random.uniform(0, 1, size=n).astype(A.dtype.dtype), dev)
+    b = tvm.runtime.tensor(np.random.randint(0, 5, size=n).astype(B.dtype.dtype), dev)
+    c = tvm.runtime.tensor(np.random.uniform(size=n).astype(A.dtype.dtype), dev)
     func(a, b, c)
     tvm.testing.assert_allclose(c.numpy(), np.ldexp(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5)
 
diff --git a/tests/python/tirx-base/test_tir_specialize.py b/tests/python/tirx-base/test_tir_specialize.py
index 0529bd90a4db..f4c530584761 100644
--- a/tests/python/tirx-base/test_tir_specialize.py
+++ b/tests/python/tirx-base/test_tir_specialize.py
@@ -302,8 +302,8 @@ def test_specialize_buffer_var_to_expr():
     """Handle specialization of buffer var
 
     The `tirx::Buffer::data` field must be an explicit `tirx::Var`, and
-    cannot be replaced with a `tirx::PrimExpr` of type
-    `DataType::Handle()`.  However, these substitutions are useful
+    cannot be replaced with a handle-typed `tirx::PrimExpr`.
+    However, these substitutions are useful
     when lowering.  If these occur, a binding of the `tirx::Var` is
     included in the specialized function.
     """
diff --git a/tests/python/tvmscript/test_tvmscript_parser_tir.py b/tests/python/tvmscript/test_tvmscript_parser_tir.py
index 9c1e26459dc8..f8cc6da3b95b 100644
--- a/tests/python/tvmscript/test_tvmscript_parser_tir.py
+++ b/tests/python/tvmscript/test_tvmscript_parser_tir.py
@@ -29,14 +29,14 @@ def test_tir_buffer_proxy():
     assert (
         isinstance(buffer_0, tirx.Buffer)
         and list(buffer_0.shape) == [128, 128]
-        and buffer_0.dtype == "float32"
+        and buffer_0.dtype == ir.PrimType("float32")
     )
 
     buffer_1 = T.Buffer((64, 64, 64), "int32")
     assert (
         isinstance(buffer_1, tirx.Buffer)
         and list(buffer_1.shape) == [64, 64, 64]
-        and buffer_1.dtype == "int32"
+        and buffer_1.dtype == ir.PrimType("int32")
     )
 
 
diff --git a/tests/python/tvmscript/test_tvmscript_roundtrip.py b/tests/python/tvmscript/test_tvmscript_roundtrip.py
index bdcaf668718e..03950d6b3569 100644
--- a/tests/python/tvmscript/test_tvmscript_roundtrip.py
+++ b/tests/python/tvmscript/test_tvmscript_roundtrip.py
@@ -2482,12 +2482,12 @@ def test_void_ptr_vs_handle():
     one of the two C++ representations.
     """
 
-    # Generates PointerType(PrimType(DataType::Void()))
+    # Generates PointerType(PrimType::Void())
     @T.prim_func(s_tir=True)
     def void_ptr(out_ret_value: T.handle("void")):
         T.evaluate(out_ret_value)
 
-    # Generates PrimType(DataType::Handle())
+    # Generates PrimType::Handle()
     @T.prim_func(s_tir=True)
     def handle(out_ret_value: T.handle):
         T.evaluate(out_ret_value)

From 44abb6c52298191e4358f0e2dadfc136daeb4a24 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 23 Jun 2026 16:40:06 +0000
Subject: [PATCH 2/8] [REFACTOR][IR] Fix PrimType follow-up review issues

---
 src/arith/pattern_match.h                     |  5 +-
 src/relax/op/memory/view.cc                   |  5 +-
 src/relax/transform/lower_alloc_tensor.cc     |  6 +-
 .../transform/static_plan_block_memory.cc     | 10 ++-
 src/relax/transform/utils.h                   |  2 +-
 src/s_tir/analysis/verify_gpu_code.cc         | 70 ++++++++-----------
 6 files changed, 48 insertions(+), 50 deletions(-)

diff --git a/src/arith/pattern_match.h b/src/arith/pattern_match.h
index f6a052089842..dda8e704cfed 100644
--- a/src/arith/pattern_match.h
+++ b/src/arith/pattern_match.h
@@ -71,7 +71,6 @@
 #include <tvm/tirx/expr.h>
 
 #include <cmath>
-#include <optional>
 #include <tuple>
 
 #include "const_fold.h"
@@ -201,7 +200,7 @@ class PVar : public Pattern<PVar<T>> {
   using Nested = const PVar<T>&;
 
   void InitMatch_() const {
-    value_.reset();
+    value_ = nullptr;
     filled_ = false;
   }
 
@@ -234,7 +233,7 @@ class PVar : public Pattern<PVar<T>> {
 
  protected:
   /*! \brief The matched value */
-  mutable std::optional<T> value_;
+  mutable ffi::Optional<T> value_;
   /*! \brief whether the variable has been filled */
   mutable bool filled_{false};
 };
diff --git a/src/relax/op/memory/view.cc b/src/relax/op/memory/view.cc
index f2c5b7da8614..d1674ef92d2d 100644
--- a/src/relax/op/memory/view.cc
+++ b/src/relax/op/memory/view.cc
@@ -172,10 +172,11 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
 
   // Helper function returns the number of bytes per vectorized element.
   auto get_size_bytes = [](DLDataType dtype) -> ffi::Optional<IntImm> {
-    if ((((dtype).code == kDLOpaqueHandle) && ((dtype).bits == 0) && ((dtype).lanes == 0))) {
+    PrimType ty(dtype);
+    if (ty.IsVoid() || ty.IsScalableVector()) {
       return std::nullopt;
     } else {
-      auto size_bits = ((dtype).bits) * static_cast<int16_t>((dtype).lanes);
+      auto size_bits = ty.bits() * ty.lanes();
       return IntImm::Int64((size_bits + 7) / 8);
     }
   };
diff --git a/src/relax/transform/lower_alloc_tensor.cc b/src/relax/transform/lower_alloc_tensor.cc
index 67cbcc7e8791..f76649164bed 100644
--- a/src/relax/transform/lower_alloc_tensor.cc
+++ b/src/relax/transform/lower_alloc_tensor.cc
@@ -72,8 +72,10 @@ class Mutator : public ExprMutator {
       }();
 
       PrimExpr nbytes = [&]() -> PrimExpr {
-        PrimExpr nbytes = IntImm::Int64(
-            ((((dtype->value).bits * static_cast<int16_t>((dtype->value).lanes)) + 7) / 8));
+        PrimType dtype_ty(dtype->value);
+        TVM_FFI_ICHECK(!dtype_ty.IsScalableVector())
+            << "Cannot statically compute allocation size for scalable vector dtype " << dtype_ty;
+        PrimExpr nbytes = IntImm::Int64(((dtype_ty.bits() * dtype_ty.lanes()) + 7) / 8);
         for (const auto& dim : shape) {
           nbytes *= dim;
         }
diff --git a/src/relax/transform/static_plan_block_memory.cc b/src/relax/transform/static_plan_block_memory.cc
index 3d4fcb256d0e..cb672986e6e5 100644
--- a/src/relax/transform/static_plan_block_memory.cc
+++ b/src/relax/transform/static_plan_block_memory.cc
@@ -138,7 +138,10 @@ class StorageToken : public ffi::ObjectRef {
   explicit StorageToken(ffi::Array<PrimExpr> shape, DLDataType dtype, std::string storage_scope,
                         ffi::Optional<VDevice> vdevice = std::nullopt) {
     // Compute the tensor size from the shape.
-    int64_t const_coeff = ((((dtype).bits * static_cast<int16_t>((dtype).lanes)) + 7) / 8);
+    PrimType dtype_ty(dtype);
+    TVM_FFI_ICHECK(!dtype_ty.IsScalableVector())
+        << "Cannot statically plan storage size for scalable vector dtype " << dtype_ty;
+    int64_t const_coeff = (((dtype_ty.bits() * dtype_ty.lanes()) + 7) / 8);
     PrimExpr size = IntImm::Int64(1);
     bool size_computed = false;
 
@@ -977,7 +980,10 @@ class StorageAllocationRewriter : public ExprMutator {
           bytes *= upper_bounded_shape[i];
         }
         DLDataType dtype = ty->dtype->dtype;
-        bytes *= ((((dtype).bits * static_cast<int16_t>((dtype).lanes)) + 7) / 8);
+        PrimType dtype_ty(dtype);
+        TVM_FFI_ICHECK(!dtype_ty.IsScalableVector())
+            << "Cannot statically plan storage size for scalable vector dtype " << dtype_ty;
+        bytes *= (((dtype_ty.bits() * dtype_ty.lanes()) + 7) / 8);
         Call alloc_storage(mem_alloc_storage,
                            {/*size=*/ShapeExpr({bytes}),
                             /*virtual_device_index=*/call->args[2].as_or_throw<PrimValue>(),
diff --git a/src/relax/transform/utils.h b/src/relax/transform/utils.h
index 0dd6aa6e54a6..d4607459c74f 100644
--- a/src/relax/transform/utils.h
+++ b/src/relax/transform/utils.h
@@ -329,7 +329,7 @@ inline Constant MakeConstantScalar(T value, DLDataType dtype) {
     *static_cast<int32_t*>(arr->data) = static_cast<int32_t>(value);
   } else if (dtype == DLDataType{kDLInt, 64, 1}) {
     *static_cast<int64_t*>(arr->data) = static_cast<int64_t>(value);
-  } else if (dtype == DLDataType{kDLBool, 1, 1}) {
+  } else if (dtype == DLDataType{kDLBool, 8, 1}) {
     *static_cast<bool*>(arr->data) = static_cast<bool>(value);
   } else if (dtype == DLDataType{kDLUInt, 8, 1}) {
     *static_cast<uint8_t*>(arr->data) = static_cast<uint8_t>(value);
diff --git a/src/s_tir/analysis/verify_gpu_code.cc b/src/s_tir/analysis/verify_gpu_code.cc
index 837485d32de1..6c70033056a7 100644
--- a/src/s_tir/analysis/verify_gpu_code.cc
+++ b/src/s_tir/analysis/verify_gpu_code.cc
@@ -76,22 +76,20 @@ class GPUCodeVerifier : public StmtExprVisitor {
         break;
       }
     }
-    DLDataType dtype = op->buffer->dtype->dtype;
+    PrimType dtype_ty = op->buffer->dtype;
+    TVM_FFI_ICHECK(!dtype_ty.IsScalableVector())
+        << "Cannot verify GPU memory usage for scalable vector dtype " << dtype_ty;
     if (storage_scope.rank == runtime::StorageRank::kLocal) {
-      local_memory_per_block_ += static_cast<size_t>(const_size) * (((dtype).bits + 7) / 8) *
-                                 static_cast<int16_t>((dtype).lanes);
+      local_memory_per_block_ += static_cast<size_t>(const_size) * ElementBytes(dtype_ty);
     } else if (storage_scope.rank == runtime::StorageRank::kShared) {
-      shared_memory_per_block_ += static_cast<size_t>(const_size) * (((dtype).bits + 7) / 8) *
-                                  static_cast<int16_t>((dtype).lanes);
+      shared_memory_per_block_ += static_cast<size_t>(const_size) * ElementBytes(dtype_ty);
     }
-    if ((static_cast<int16_t>((dtype).lanes) > 1)) {
-      if (static_cast<size_t>(static_cast<int16_t>((dtype).lanes) * (((dtype).bits + 7) / 8)) >
-          max_vector_bytes_) {
+    if (dtype_ty.IsFixedLengthVector()) {
+      if (ElementBytes(dtype_ty) > max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << static_cast<int16_t>((dtype).lanes)
-          << ") times number of bytes (" << (((dtype).bits + 7) / 8) << ") for dtype "
-          << op->buffer->dtype << " is greater than the maximum number of vector bytes ("
-          << max_vector_bytes_ << ")";
+        s << "Number of lanes (" << dtype_ty.lanes() << ") times number of bytes ("
+          << ((dtype_ty.bits() + 7) / 8) << ") for dtype " << dtype_ty
+          << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
     }
@@ -205,15 +203,12 @@ class GPUCodeVerifier : public StmtExprVisitor {
     for (const auto index : indices) {
       if (const auto* ramp = index.as<RampNode>()) {
         PrimType ramp_ty = ramp->ty();
-        DLDataType ramp_dtype = ramp_ty->dtype;
         if (!is_one(ramp->stride) && ramp_ty.IsFixedLengthVector() &&
-            static_cast<size_t>(static_cast<int16_t>((ramp_dtype).lanes) *
-                                (((ramp_dtype).bits + 7) / 8)) > max_vector_bytes_) {
+            ElementBytes(ramp_ty) > max_vector_bytes_) {
           std::stringstream s;
-          s << "Number of lanes (" << static_cast<int16_t>((ramp_dtype).lanes)
-            << ") times number of bytes (" << (((ramp_dtype).bits + 7) / 8) << ") for dtype "
-            << ramp_dtype << " is greater than the maximum number of vector bytes ("
-            << max_vector_bytes_ << ")";
+          s << "Number of lanes (" << ramp_ty.lanes() << ") times number of bytes ("
+            << ((ramp_ty.bits() + 7) / 8) << ") for dtype " << ramp_ty
+            << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
           errors_.push_back(s.str());
         }
       }
@@ -222,15 +217,12 @@ class GPUCodeVerifier : public StmtExprVisitor {
 
   void VisitExpr_(const CastNode* op) {
     PrimType op_ty = op->ty();
-    DLDataType op_dtype = op_ty->dtype;
     if (op_ty.IsFixedLengthVector()) {
-      if (static_cast<size_t>(static_cast<int16_t>((op_dtype).lanes) *
-                              (((op_dtype).bits + 7) / 8)) > max_vector_bytes_) {
+      if (ElementBytes(op_ty) > max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << static_cast<int16_t>((op_dtype).lanes)
-          << ") times number of bytes (" << (((op_dtype).bits + 7) / 8) << ") for dtype "
-          << op_dtype << " is greater than the maximum number of vector bytes ("
-          << max_vector_bytes_ << ")";
+        s << "Number of lanes (" << op_ty.lanes() << ") times number of bytes ("
+          << ((op_ty.bits() + 7) / 8) << ") for dtype " << op_ty
+          << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
     }
@@ -239,15 +231,12 @@ class GPUCodeVerifier : public StmtExprVisitor {
 
   void VisitExpr_(const BufferLoadNode* op) {
     PrimType op_ty = op->ty();
-    DLDataType op_dtype = op_ty->dtype;
     if (op_ty.IsFixedLengthVector()) {
-      if (static_cast<size_t>(static_cast<int16_t>((op_dtype).lanes) *
-                              (((op_dtype).bits + 7) / 8)) > max_vector_bytes_) {
+      if (ElementBytes(op_ty) > max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << static_cast<int16_t>((op_dtype).lanes)
-          << ") times number of bytes (" << (((op_dtype).bits + 7) / 8) << ") for dtype "
-          << op_dtype << " is greater than the maximum number of vector bytes ("
-          << max_vector_bytes_ << ")";
+        s << "Number of lanes (" << op_ty.lanes() << ") times number of bytes ("
+          << ((op_ty.bits() + 7) / 8) << ") for dtype " << op_ty
+          << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
       CheckBufferIndicesVectorizable(op->indices);
@@ -257,15 +246,12 @@ class GPUCodeVerifier : public StmtExprVisitor {
 
   void VisitStmt_(const BufferStoreNode* op) {
     PrimType value_ty = op->value.ty();
-    DLDataType value_dtype = value_ty->dtype;
     if (value_ty.IsFixedLengthVector()) {
-      if (static_cast<size_t>(static_cast<int16_t>((value_dtype).lanes) *
-                              (((value_dtype).bits + 7) / 8)) > max_vector_bytes_) {
+      if (ElementBytes(value_ty) > max_vector_bytes_) {
         std::stringstream s;
-        s << "Number of lanes (" << static_cast<int16_t>((value_dtype).lanes)
-          << ") times number of bytes (" << (((value_dtype).bits + 7) / 8) << ") for dtype "
-          << value_dtype << " is greater than the maximum number of vector bytes ("
-          << max_vector_bytes_ << ")";
+        s << "Number of lanes (" << value_ty.lanes() << ") times number of bytes ("
+          << ((value_ty.bits() + 7) / 8) << ") for dtype " << value_ty
+          << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")";
         errors_.push_back(s.str());
       }
       CheckBufferIndicesVectorizable(op->indices);
@@ -294,6 +280,10 @@ class GPUCodeVerifier : public StmtExprVisitor {
 
   std::vector<ffi::String> errors_;
 
+  static size_t ElementBytes(const PrimType& ty) {
+    return static_cast<size_t>(ty.lanes()) * ((ty.bits() + 7) / 8);
+  }
+
   void Reset_() {
     local_memory_per_block_ = 0;
     shared_memory_per_block_ = 0;

From 7e665892be2a0c5062b226f157f2e4af6bf9a545 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 23 Jun 2026 16:55:47 +0000
Subject: [PATCH 3/8] [REFACTOR][IR] Address PrimType review follow-ups

---
 python/tvm/relax/op/_op_gradient.py         |  3 +++
 src/arith/bound_deducer.cc                  |  2 +-
 src/arith/const_fold.h                      |  6 +++---
 src/arith/int_constraints.cc                |  6 ++----
 src/arith/int_set.cc                        |  8 ++++----
 src/arith/transitive_comparison_analyzer.cc |  4 ++--
 src/relax/op/distributed/nn.cc              |  2 +-
 src/relax/op/distributed/unary.h            |  2 +-
 src/relax/op/nn/nn.cc                       |  8 ++++----
 src/relax/op/tensor/index.cc                |  3 +--
 src/relax/op/tensor/search.cc               |  2 +-
 src/relax/transform/gradient.cc             |  2 +-
 tests/python/te/test_te_create_primfunc.py  | 10 +++++-----
 13 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/python/tvm/relax/op/_op_gradient.py b/python/tvm/relax/op/_op_gradient.py
index 5aae26b75f20..809a2c19ee9e 100644
--- a/python/tvm/relax/op/_op_gradient.py
+++ b/python/tvm/relax/op/_op_gradient.py
@@ -22,6 +22,7 @@
 
 from tvm import relax
 from tvm.arith import Analyzer
+from tvm.ir import PrimType
 from tvm.relax.type import ShapeType
 
 from ...tirx import PrimExpr
@@ -81,6 +82,8 @@ def _get_dtype(expr: Expr) -> str:
         raise RuntimeError(
             f"Get the dtype of {expr} failed. Please normalize it first and ensure it is a Tensor."
         ) from error
+    if isinstance(dtype, PrimType):
+        dtype = dtype.dtype
     return dtype
 
 
diff --git a/src/arith/bound_deducer.cc b/src/arith/bound_deducer.cc
index bceeb4eafa2e..01d50da56e41 100644
--- a/src/arith/bound_deducer.cc
+++ b/src/arith/bound_deducer.cc
@@ -97,7 +97,7 @@ class BoundDeducer : public ExprFunctor<void(const PrimExpr&)> {
 
   SignType GetSignType(const PrimExpr& e) {
     PrimType e_ty = e.ty();
-    if (e_ty.code() == DLDataTypeCode::kDLUInt) {
+    if (e_ty.MatchesCode(DLDataTypeCode::kDLUInt)) {
       return kPositive;
     }
     return expr_map_[e].GetSignType();
diff --git a/src/arith/const_fold.h b/src/arith/const_fold.h
index ed1fc2d1a7a6..4793538316a3 100644
--- a/src/arith/const_fold.h
+++ b/src/arith/const_fold.h
@@ -94,7 +94,7 @@ inline int64_t GetFoldResultInt64Repr(int64_t x, const PrimType& dtype) {
   if (dtype.bits() < 64) {
     x &= (1LL << dtype.bits()) - 1;
   }
-  if (dtype.code() == DLDataTypeCode::kDLInt) {
+  if (dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     int64_t m = 1LL << (dtype.bits() - 1);
     x = (x ^ m) - m;
   }
@@ -164,8 +164,8 @@ inline ffi::Optional<PrimExpr> TryConstFold<tirx::Add>(PrimExpr a, PrimExpr b) {
 template <>
 inline ffi::Optional<PrimExpr> TryConstFold<tirx::Sub>(PrimExpr a, PrimExpr b) {
   TVM_ARITH_CONST_PROPAGATION({
-    TVM_FFI_ICHECK(!((pa && pa->ty().code() == DLDataTypeCode::kDLUInt && pa->value == 0U) &&
-                     (pb && pb->ty().code() == DLDataTypeCode::kDLUInt && pb->value > 0U)))
+    TVM_FFI_ICHECK(!((pa && pa->ty().MatchesCode(DLDataTypeCode::kDLUInt) && pa->value == 0U) &&
+                     (pb && pb->ty().MatchesCode(DLDataTypeCode::kDLUInt) && pb->value > 0U)))
         << "Checked failed. Minuend 's value is 0U and it's dtype is uint "
         << "while Subtrahend's dtype is uint; which will cause a negative uint";
     PrimType result_ty = a.ty();
diff --git a/src/arith/int_constraints.cc b/src/arith/int_constraints.cc
index bcd957aac0f2..b517324f378d 100644
--- a/src/arith/int_constraints.cc
+++ b/src/arith/int_constraints.cc
@@ -75,8 +75,7 @@ ffi::Array<PrimExpr> AsConditions(const ffi::Array<Var>& variables,
 IntGroupBounds::IntGroupBounds(PrimExpr coef, ffi::Array<PrimExpr> lower,
                                ffi::Array<PrimExpr> equal, ffi::Array<PrimExpr> upper) {
   PrimType coef_ty = coef.ty();
-  TVM_FFI_ICHECK(coef_ty.code() == DLDataTypeCode::kDLInt ||
-                 coef_ty.code() == DLDataTypeCode::kDLUInt)
+  TVM_FFI_ICHECK(coef_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))
       << "Coefficient in IntGroupBounds must be integers";
   ffi::ObjectPtr<IntGroupBoundsNode> node = ffi::make_object<IntGroupBoundsNode>();
   node->coef = std::move(coef);
@@ -235,8 +234,7 @@ IntConstraints::IntConstraints(ffi::Array<Var> variables, ffi::Map<Var, Range> r
   TVM_FFI_ICHECK(relations.defined());
   for (const auto& var : variables) {
     PrimType var_ty = var.ty();
-    TVM_FFI_ICHECK(var_ty.code() == DLDataTypeCode::kDLInt ||
-                   var_ty.code() == DLDataTypeCode::kDLUInt)
+    TVM_FFI_ICHECK(var_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt))
         << "Variables in IntConstraints must be integers";
   }
   node->variables = std::move(variables);
diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc
index ac966582e766..b3d111ffa7a8 100644
--- a/src/arith/int_set.cc
+++ b/src/arith/int_set.cc
@@ -74,8 +74,8 @@ IntervalSet Intersect(AnalyzerObj* analyzer, IntervalSet a, IntervalSet b) {
   PrimExpr min_value = max(a->min_value, b->min_value);
   PrimType max_ty = max_value.ty();
   PrimType min_ty = min_value.ty();
-  if ((max_ty.code() == DLDataTypeCode::kDLInt || max_ty.code() == DLDataTypeCode::kDLUInt) &&
-      (min_ty.code() == DLDataTypeCode::kDLInt || min_ty.code() == DLDataTypeCode::kDLUInt) &&
+  if (max_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt) &&
+      min_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt) &&
       analyzer->CanProve(max_value < min_value)) {
     return IntervalSet::Empty();
   } else {
@@ -583,7 +583,7 @@ class IntervalSetEvaluator : public ExprFunctor<IntervalSet(const PrimExpr&)> {
 
   IntervalSet VisitExpr_(const BufferLoadNode* op) final {
     PrimType op_ty = op->ty();
-    if (!(op_ty.code() == DLDataTypeCode::kDLInt || op_ty.code() == DLDataTypeCode::kDLUInt)) {
+    if (!op_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
       DLOG(WARNING) << "cannot evaluate set BufferLoad which loads from a " << op_ty->dtype
                     << " buffer";
       return IntervalSet::Everything();
@@ -1073,7 +1073,7 @@ IntSet EvalSet(PrimExpr e, const std::unordered_map<const VarNode*, IntSet>& dom
 IntSet EvalSet(Range r, const ffi::Map<Var, IntSet>& dom_map) {
   Analyzer ana;
   PrimType min_ty = r->min.ty();
-  if ((min_ty.code() == DLDataTypeCode::kDLInt || min_ty.code() == DLDataTypeCode::kDLUInt) &&
+  if (min_ty.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt) &&
       ana->CanProveEqual(r->extent, 1)) {
     return EvalSet(r->min, dom_map);
   }
diff --git a/src/arith/transitive_comparison_analyzer.cc b/src/arith/transitive_comparison_analyzer.cc
index e6465ad3cf93..7b740d6229c2 100644
--- a/src/arith/transitive_comparison_analyzer.cc
+++ b/src/arith/transitive_comparison_analyzer.cc
@@ -615,8 +615,8 @@ CompareResult TransitiveComparisonAnalyzer::Impl::TryCompare(const PrimExpr& lhs
                                                              const PrimExpr& rhs_expr,
                                                              bool propagate_inequalities) const {
   // Currently only supports integer checks
-  if (lhs_expr.ty().code() != DLDataTypeCode::kDLInt ||
-      rhs_expr.ty().code() != DLDataTypeCode::kDLInt) {
+  if (!lhs_expr.ty().MatchesCode(DLDataTypeCode::kDLInt) ||
+      !rhs_expr.ty().MatchesCode(DLDataTypeCode::kDLInt)) {
     return CompareResult::kUnknown;
   }
 
diff --git a/src/relax/op/distributed/nn.cc b/src/relax/op/distributed/nn.cc
index 386401521974..fcdc37c54046 100644
--- a/src/relax/op/distributed/nn.cc
+++ b/src/relax/op/distributed/nn.cc
@@ -35,7 +35,7 @@ Type InferDistTypeSoftmax(const Call& call, const BlockBuilder& ctx) {
   }
   PrimType input_dtype = input_tensor_ty->dtype;
   // Softmax validation preserves the old float-kind check; lanes do not affect this policy.
-  if (!input_tensor_ty->IsUnknownDtype() && input_dtype.code() != DLDataTypeCode::kDLFloat) {
+  if (!input_tensor_ty->IsUnknownDtype() && !input_dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     TVM_FFI_VISIT_THROW(TypeError, call) << "Softmax requires the input tensor to have float "
                                             "dtype. However, the given input dtype is "
                                          << input_tensor_ty->dtype;
diff --git a/src/relax/op/distributed/unary.h b/src/relax/op/distributed/unary.h
index be7ca27d3ade..58e0a41e27cb 100644
--- a/src/relax/op/distributed/unary.h
+++ b/src/relax/op/distributed/unary.h
@@ -43,7 +43,7 @@ Type InferDistTypeUnary(const Call& call, const BlockBuilder& ctx, FType f_compu
   PrimType input_dtype = input_tensor_ty->dtype;
   // Unary op validation preserves the old float-kind check; lanes do not affect this policy.
   if (require_float_dtype && !input_tensor_ty->IsUnknownDtype() &&
-      input_dtype.code() != DLDataTypeCode::kDLFloat) {
+      !input_dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << call->op
         << " requires the input tensor to have float dtype. However, the given input dtype is "
diff --git a/src/relax/op/nn/nn.cc b/src/relax/op/nn/nn.cc
index c34f7afbc79d..5deb6db937bb 100644
--- a/src/relax/op/nn/nn.cc
+++ b/src/relax/op/nn/nn.cc
@@ -124,7 +124,7 @@ Type InferTypePRelu(const Call& call, const BlockBuilder& ctx) {
   }
   PrimType data_dtype = data_ty->dtype;
   // PRelu preserves the old float-kind check; vector lanes are irrelevant to this check.
-  if (!data_ty->IsUnknownDtype() && data_dtype.code() != DLDataTypeCode::kDLFloat) {
+  if (!data_ty->IsUnknownDtype() && !data_dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     TVM_FFI_VISIT_THROW(TypeError, call) << "Prelu requires the input tensor to have float "
                                             "dtype. However, the given input dtype is "
                                          << data_ty->dtype;
@@ -191,7 +191,7 @@ Type InferTypeSoftmax(const Call& call, const BlockBuilder& ctx) {
   if (!data_ty->IsUnknownDtype()) {
     PrimType data_dtype = data_ty->dtype;
     // Softmax only requires a floating element kind; lane encoding is irrelevant to the check.
-    if (data_dtype.code() != kDLFloat && data_dtype.code() != kDLBfloat) {
+    if (!data_dtype.MatchesCode(kDLFloat, kDLBfloat)) {
       TVM_FFI_VISIT_THROW(TypeError, call) << "Softmax requires the input tensor to have float "
                                               "dtype. However, the given input dtype is "
                                            << data_ty->dtype;
@@ -389,7 +389,7 @@ bool NormCheckDtypeAndShape(const Call& call, const BlockBuilder& ctx,
   if (!data_ty->IsUnknownDtype()) {
     PrimType data_dtype = data_ty->dtype;
     // Norm ops only require a floating element kind; lane encoding is irrelevant to the check.
-    if (data_dtype.code() != kDLFloat && data_dtype.code() != kDLBfloat) {
+    if (!data_dtype.MatchesCode(kDLFloat, kDLBfloat)) {
       TVM_FFI_VISIT_THROW(TypeError, call)
           << op << " requires the input data to have float dtype. However, the given data dtype is "
           << data_ty->dtype;
@@ -632,7 +632,7 @@ Type InferTypeGroupNorm(const Call& call, const BlockBuilder& ctx) {
   }
   PrimType data_dtype = data_ty->dtype;
   // GroupNorm preserves the old float-kind check; vector lanes are irrelevant to this check.
-  if (!data_ty->IsUnknownDtype() && data_dtype.code() != DLDataTypeCode::kDLFloat) {
+  if (!data_ty->IsUnknownDtype() && !data_dtype.MatchesCode(DLDataTypeCode::kDLFloat)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << op << " expects that data must be float, but got " << data_ty->dtype;
   }
diff --git a/src/relax/op/tensor/index.cc b/src/relax/op/tensor/index.cc
index e42feb0ae06c..5321798b8e48 100644
--- a/src/relax/op/tensor/index.cc
+++ b/src/relax/op/tensor/index.cc
@@ -86,8 +86,7 @@ Type InferTypeTake(const Call& call, const BlockBuilder& ctx) {
     LOG(WARNING) << "Data type of indices has not been specified. Assume it has an integer type.";
   } else {
     PrimType indices_dtype = indices_ty->dtype;
-    if (!indices_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
-        !indices_dtype.MatchesCode(DLDataTypeCode::kDLUInt)) {
+    if (!indices_dtype.MatchesCode(DLDataTypeCode::kDLInt, DLDataTypeCode::kDLUInt)) {
       TVM_FFI_VISIT_THROW(TypeError, call)
           << "Take op requires the input indices to have integer dtype. However, the "
              "given indices dtype is "
diff --git a/src/relax/op/tensor/search.cc b/src/relax/op/tensor/search.cc
index 635879db2be3..c5021f6f5aef 100644
--- a/src/relax/op/tensor/search.cc
+++ b/src/relax/op/tensor/search.cc
@@ -120,7 +120,7 @@ Type InferTypeWhere(const Call& call, const BlockBuilder& ctx) {
 
   PrimType cond_dtype = cond_ty->dtype;
   // Where condition validation only checks the boolean element kind; lanes are irrelevant here.
-  if (cond_dtype.code() != DLDataTypeCode::kDLBool) {
+  if (!cond_dtype.MatchesCode(DLDataTypeCode::kDLBool)) {
     TVM_FFI_VISIT_THROW(TypeError, call)
         << "Where requires the input condition tensor to have boolean dtype. However, "
            "the given condition dtype is "
diff --git a/src/relax/transform/gradient.cc b/src/relax/transform/gradient.cc
index e23524388435..992103de7d91 100644
--- a/src/relax/transform/gradient.cc
+++ b/src/relax/transform/gradient.cc
@@ -708,7 +708,7 @@ class GradientMutator : private ExprMutator {
   static bool IsFloatTensorType(const Type& ty) {
     auto* tensor_ty = ty.as<TensorTypeNode>();
     // Gradient eligibility preserves the old float-kind check; lanes do not affect this policy.
-    return tensor_ty && tensor_ty->dtype.code() == DLDataTypeCode::kDLFloat;
+    return tensor_ty && tensor_ty->dtype.MatchesCode(DLDataTypeCode::kDLFloat);
   }
 
   // When the return value is a Var, it is the target;
diff --git a/tests/python/te/test_te_create_primfunc.py b/tests/python/te/test_te_create_primfunc.py
index 6aa5689ad10d..2249b5bd4ab6 100644
--- a/tests/python/te/test_te_create_primfunc.py
+++ b/tests/python/te/test_te_create_primfunc.py
@@ -353,8 +353,8 @@ def test_constant():
 
     func = te.create_prim_func([C, A])
     func = tvm.compile(func)
-    a_np = np.random.uniform(size=(M,)).astype(A.dtype)
-    c = tvm.runtime.tensor(np.zeros(M, dtype=C.dtype))
+    a_np = np.random.uniform(size=(M,)).astype(A.dtype.dtype)
+    c = tvm.runtime.tensor(np.zeros(M, dtype=C.dtype.dtype))
     x = func(c, tvm.runtime.tensor(a_np))
     tvm.testing.assert_allclose(a_np + 2, c.numpy())
 
@@ -393,9 +393,9 @@ def test_data_dependent_access():
     func = te.create_prim_func([C, A, B])
     func = tvm.compile(func)
 
-    a_np = np.random.uniform(size=(10,)).astype(A.dtype)
-    b_np = np.arange(10, dtype=B.dtype)
-    c = tvm.runtime.tensor(np.zeros(10, dtype=C.dtype))
+    a_np = np.random.uniform(size=(10,)).astype(A.dtype.dtype)
+    b_np = np.arange(10, dtype=B.dtype.dtype)
+    c = tvm.runtime.tensor(np.zeros(10, dtype=C.dtype.dtype))
     func(c, tvm.runtime.tensor(a_np), tvm.runtime.tensor(b_np))
     tvm.testing.assert_allclose(a_np[b_np], c.numpy())
 

From e130c820824a183660f8164ce0a1c144fc7408f7 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 23 Jun 2026 17:15:52 +0000
Subject: [PATCH 4/8] [REFACTOR][Runtime] Use raw DLDataType in contrib
 backends

---
 src/backend/cuda/runtime/cuda_device_api.cc   | 21 +++---
 src/runtime/extra/contrib/cblas/cblas.cc      | 64 ++++++++---------
 src/runtime/extra/contrib/cblas/dnnl_blas.cc  |  3 +-
 src/runtime/extra/contrib/cblas/mkl.cc        | 55 +++++++-------
 src/runtime/extra/contrib/cublas/cublas.cc    | 71 ++++++++++---------
 .../extra/contrib/cudnn/conv_backward.cc      |  2 -
 .../extra/contrib/cudnn/conv_forward.cc       |  2 -
 src/runtime/extra/contrib/cudnn/softmax.cc    |  2 -
 src/runtime/extra/contrib/hipblas/hipblas.cc  | 54 +++++++-------
 src/runtime/extra/contrib/random/random.cc    |  2 -
 10 files changed, 135 insertions(+), 141 deletions(-)

diff --git a/src/backend/cuda/runtime/cuda_device_api.cc b/src/backend/cuda/runtime/cuda_device_api.cc
index 6e30df29aa91..44d1acff4937 100644
--- a/src/backend/cuda/runtime/cuda_device_api.cc
+++ b/src/backend/cuda/runtime/cuda_device_api.cc
@@ -478,13 +478,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
     auto l2_promotion_kind = static_cast<CUtensorMapL2promotion>(args[arg_cnt++].cast<int>());
     auto oob_fill_kind = static_cast<CUtensorMapFloatOOBfill>(args[arg_cnt++].cast<int>());
 
-    TVM_FFI_ICHECK_EQ(tensor_dtype.lanes(), 1)
+    TVM_FFI_ICHECK_EQ(tensor_dtype.lanes, 1)
         << "Expect tensor_dtype to have lanes=1, but get " << tensor_dtype;
+    uint64_t tensor_dtype_bytes = (static_cast<uint64_t>(tensor_dtype.bits) + 7) / 8;
     CUtensorMapDataType cu_dtype;
-    switch (tensor_dtype.code()) {
+    switch (tensor_dtype.code) {
       case kDLInt:
         // int
-        switch (tensor_dtype.bits()) {
+        switch (tensor_dtype.bits) {
           case 8:
             cu_dtype = CU_TENSOR_MAP_DATA_TYPE_UINT8;
             break;
@@ -501,7 +502,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         break;
       case kDLUInt:
         // unsigned int
-        switch (tensor_dtype.bits()) {
+        switch (tensor_dtype.bits) {
           case 8:
             cu_dtype = CU_TENSOR_MAP_DATA_TYPE_UINT8;
             break;
@@ -521,7 +522,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         break;
       case kDLFloat:
         // float
-        switch (tensor_dtype.bits()) {
+        switch (tensor_dtype.bits) {
           case 16:
             cu_dtype = CU_TENSOR_MAP_DATA_TYPE_FLOAT16;
             break;
@@ -538,7 +539,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         break;
       case kDLBfloat:
         // bfloat
-        switch (tensor_dtype.bits()) {
+        switch (tensor_dtype.bits) {
           case 16:
             cu_dtype = CU_TENSOR_MAP_DATA_TYPE_BFLOAT16;
             break;
@@ -674,7 +675,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
           << "globalDim[0] must be a multiple of 2 for packed 16U4 align8 format";
     }
     if (interleaved_kind == CU_TENSOR_MAP_INTERLEAVE_NONE && !is_packed_dtype) {
-      uint64_t inner_box_bytes = static_cast<uint64_t>(box_dim[0]) * tensor_dtype.bytes();
+      uint64_t inner_box_bytes = static_cast<uint64_t>(box_dim[0]) * tensor_dtype_bytes;
       TVM_FFI_ICHECK_EQ(inner_box_bytes % 16, 0)
           << "boxDim[0] * elementSizeInBytes(tensorDataType) must be a multiple of 16 bytes";
     }
@@ -694,15 +695,15 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
     if (interleaved_kind == CU_TENSOR_MAP_INTERLEAVE_NONE && !is_packed_dtype &&
         swizzle_kind == CU_TENSOR_MAP_SWIZZLE_32B) {
-      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype.bytes(), 32)
+      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype_bytes, 32)
           << "CU_TENSOR_MAP_SWIZZLE_32B implies the bounding box inner dimension will be <= 32.";
     } else if (interleaved_kind == CU_TENSOR_MAP_INTERLEAVE_NONE && !is_packed_dtype &&
                swizzle_kind == CU_TENSOR_MAP_SWIZZLE_64B) {
-      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype.bytes(), 64)
+      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype_bytes, 64)
           << "CU_TENSOR_MAP_SWIZZLE_64B implies the bounding box inner dimension will be <= 64.";
     } else if (interleaved_kind == CU_TENSOR_MAP_INTERLEAVE_NONE && !is_packed_dtype &&
                is_128b_swizzle) {
-      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype.bytes(), 128)
+      TVM_FFI_ICHECK_LE(box_dim[0] * tensor_dtype_bytes, 128)
           << "CU_TENSOR_MAP_SWIZZLE_128B implies the bounding box inner dimension will be <= "
              "128.";
     }
diff --git a/src/runtime/extra/contrib/cblas/cblas.cc b/src/runtime/extra/contrib/cblas/cblas.cc
index aae0a5acce1c..a19ccc99bb3f 100644
--- a/src/runtime/extra/contrib/cblas/cblas.cc
+++ b/src/runtime/extra/contrib/cblas/cblas.cc
@@ -35,7 +35,6 @@ extern "C" {
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
 inline CBLAS_TRANSPOSE CBLASBooleanToTranspose(bool trans) {
   return trans ? CblasTrans : CblasNoTrans;
 }
@@ -128,38 +127,39 @@ struct CblasDgemmBatchIterativeOp {
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef()
-      .def_packed(
-          "tvm.contrib.cblas.matmul",
-          [](ffi::PackedArgs args, ffi::Any* ret) {
-            auto A = args[0].cast<DLTensor*>();
-            TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
+      .def_packed("tvm.contrib.cblas.matmul",
+                  [](ffi::PackedArgs args, ffi::Any* ret) {
+                    auto A = args[0].cast<DLTensor*>();
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                    A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-            if (TypeMatch(A->dtype, kDLFloat, 32))
-              CallGemm(args, ret, CblasSgemmOp());
-            else
-              CallGemm(args, ret, CblasDgemmOp());
-          })
-      .def_packed(
-          "tvm.contrib.cblas.batch_matmul",
-          [](ffi::PackedArgs args, ffi::Any* ret) {
-            auto A = args[0].cast<DLTensor*>();
-            TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
-            if (TypeMatch(A->dtype, kDLFloat, 32)) {
-              CallBatchGemm(args, ret, CblasSgemmBatchOp());
-            } else {
-              CallBatchGemm(args, ret, CblasDgemmBatchOp());
-            }
-          })
-      .def_packed(
-          "tvm.contrib.cblas.batch_matmul_iterative", [](ffi::PackedArgs args, ffi::Any* ret) {
-            auto A = args[0].cast<DLTensor*>();
-            TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
-            if (TypeMatch(A->dtype, kDLFloat, 32)) {
-              CallBatchGemm(args, ret, CblasSgemmBatchIterativeOp());
-            } else {
-              CallBatchGemm(args, ret, CblasDgemmBatchIterativeOp());
-            }
-          });
+                    if (A->dtype == DLDataType{kDLFloat, 32, 1})
+                      CallGemm(args, ret, CblasSgemmOp());
+                    else
+                      CallGemm(args, ret, CblasDgemmOp());
+                  })
+      .def_packed("tvm.contrib.cblas.batch_matmul",
+                  [](ffi::PackedArgs args, ffi::Any* ret) {
+                    auto A = args[0].cast<DLTensor*>();
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                    A->dtype == DLDataType{kDLFloat, 64, 1}));
+                    if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
+                      CallBatchGemm(args, ret, CblasSgemmBatchOp());
+                    } else {
+                      CallBatchGemm(args, ret, CblasDgemmBatchOp());
+                    }
+                  })
+      .def_packed("tvm.contrib.cblas.batch_matmul_iterative",
+                  [](ffi::PackedArgs args, ffi::Any* ret) {
+                    auto A = args[0].cast<DLTensor*>();
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                    A->dtype == DLDataType{kDLFloat, 64, 1}));
+                    if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
+                      CallBatchGemm(args, ret, CblasSgemmBatchIterativeOp());
+                    } else {
+                      CallBatchGemm(args, ret, CblasDgemmBatchIterativeOp());
+                    }
+                  });
 }
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/runtime/extra/contrib/cblas/dnnl_blas.cc b/src/runtime/extra/contrib/cblas/dnnl_blas.cc
index c267a37aa58e..c0828c12e8b6 100644
--- a/src/runtime/extra/contrib/cblas/dnnl_blas.cc
+++ b/src/runtime/extra/contrib/cblas/dnnl_blas.cc
@@ -35,7 +35,6 @@ extern "C" {
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
 inline char DNNLBooleanToTransposeChar(bool trans) { return trans ? 'T' : 'N'; }
 
 struct DNNLSgemmOp {
@@ -52,7 +51,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def_packed("tvm.contrib.dnnl.matmul", [](ffi::PackedArgs args, ffi::Any* ret) {
     auto A = args[0].cast<DLTensor*>();
-    TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32));
+    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1}));
     CallGemm(args, ret, DNNLSgemmOp());
   });
 }
diff --git a/src/runtime/extra/contrib/cblas/mkl.cc b/src/runtime/extra/contrib/cblas/mkl.cc
index f039df8e676f..366ada41d2f1 100644
--- a/src/runtime/extra/contrib/cblas/mkl.cc
+++ b/src/runtime/extra/contrib/cblas/mkl.cc
@@ -35,7 +35,6 @@ extern "C" {
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
 inline CBLAS_TRANSPOSE MKLBooleanToTranspose(bool trans) {
   return trans ? CblasTrans : CblasNoTrans;
 }
@@ -160,9 +159,10 @@ TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def_packed("tvm.contrib.mkl.matmul", [](ffi::PackedArgs args, ffi::Any* ret) {
     auto A = args[0].cast<DLTensor*>();
-    TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
+    TVM_FFI_ICHECK(
+        (A->dtype == DLDataType{kDLFloat, 32, 1} || A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-    if (TypeMatch(A->dtype, kDLFloat, 32))
+    if (A->dtype == DLDataType{kDLFloat, 32, 1})
       CallGemm(args, ret, MKLSgemmOp());
     else
       CallGemm(args, ret, MKLDgemmOp());
@@ -178,33 +178,34 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                     auto A = args[0].cast<DLTensor*>();
                     auto B = args[1].cast<DLTensor*>();
                     auto C = args[2].cast<DLTensor*>();
-                    TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLUInt, 8) &&
-                                   TypeMatch(B->dtype, kDLInt, 8) &&
-                                   TypeMatch(C->dtype, kDLInt, 32));
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLUInt, 8, 1} &&
+                                    B->dtype == DLDataType{kDLInt, 8, 1} &&
+                                    C->dtype == DLDataType{kDLInt, 32, 1}));
 
                     CallU8S8S32Gemm(args, ret, MKLGemmU8S8S32Op());
                   })
-      .def_packed(
-          "tvm.contrib.mkl.batch_matmul",
-          [](ffi::PackedArgs args, ffi::Any* ret) {
-            auto A = args[0].cast<DLTensor*>();
-            TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
-            if (TypeMatch(A->dtype, kDLFloat, 32)) {
-              CallBatchGemm(args, ret, MKLSgemmBatchOp());
-            } else {
-              CallBatchGemm(args, ret, MKLDgemmBatchOp());
-            }
-          })
-      .def_packed(
-          "tvm.contrib.mkl.batch_matmul_iterative", [](ffi::PackedArgs args, ffi::Any* ret) {
-            auto A = args[0].cast<DLTensor*>();
-            TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 32) || TypeMatch(A->dtype, kDLFloat, 64));
-            if (TypeMatch(A->dtype, kDLFloat, 32)) {
-              CallBatchGemm(args, ret, MKLSgemmBatchIterativeOp());
-            } else {
-              CallBatchGemm(args, ret, MKLDgemmBatchIterativeOp());
-            }
-          });
+      .def_packed("tvm.contrib.mkl.batch_matmul",
+                  [](ffi::PackedArgs args, ffi::Any* ret) {
+                    auto A = args[0].cast<DLTensor*>();
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                    A->dtype == DLDataType{kDLFloat, 64, 1}));
+                    if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
+                      CallBatchGemm(args, ret, MKLSgemmBatchOp());
+                    } else {
+                      CallBatchGemm(args, ret, MKLDgemmBatchOp());
+                    }
+                  })
+      .def_packed("tvm.contrib.mkl.batch_matmul_iterative",
+                  [](ffi::PackedArgs args, ffi::Any* ret) {
+                    auto A = args[0].cast<DLTensor*>();
+                    TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                    A->dtype == DLDataType{kDLFloat, 64, 1}));
+                    if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
+                      CallBatchGemm(args, ret, MKLSgemmBatchIterativeOp());
+                    } else {
+                      CallBatchGemm(args, ret, MKLDgemmBatchIterativeOp());
+                    }
+                  });
 }
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/runtime/extra/contrib/cublas/cublas.cc b/src/runtime/extra/contrib/cublas/cublas.cc
index f114cfa6e939..461bbee1f86c 100644
--- a/src/runtime/extra/contrib/cublas/cublas.cc
+++ b/src/runtime/extra/contrib/cublas/cublas.cc
@@ -34,7 +34,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
 inline cublasOperation_t CUBLASBooleanToTranspose(bool item) {
   return item ? CUBLAS_OP_T : CUBLAS_OP_N;
 }
@@ -125,11 +124,11 @@ struct CublasDgemmBatchOp {
 
 // Check cublas supported mix-precision computation type and return computeType
 bool CheckMixPrecisionType(DLDataType in_dtype, DLDataType out_dtype, bool int_support = true) {
-  if (int_support && TypeMatch(out_dtype, kDLInt, 32)) {
-    return TypeMatch(in_dtype, kDLInt, 8);
-  } else if (TypeMatch(out_dtype, kDLFloat, 32)) {
-    return TypeMatch(in_dtype, kDLInt, 8) || TypeMatch(in_dtype, kDLFloat, 16) ||
-           TypeMatch(in_dtype, kDLBfloat, 16);
+  if (int_support && out_dtype == DLDataType{kDLInt, 32, 1}) {
+    return in_dtype == DLDataType{kDLInt, 8, 1};
+  } else if (out_dtype == DLDataType{kDLFloat, 32, 1}) {
+    return in_dtype == DLDataType{kDLInt, 8, 1} || in_dtype == DLDataType{kDLFloat, 16, 1} ||
+           in_dtype == DLDataType{kDLBfloat, 16, 1};
   } else {
     return false;
   }
@@ -145,7 +144,7 @@ void CallCublasLt(cublasLtHandle_t hdl, cudaStream_t stream,
                   const DLTensor* C, bool transa, bool transb, void* workspace_ptr,
                   size_t workspace_size, cublasLtEpilogue_t epilogue,
                   std::optional<float> dq_scale) {
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
   // Reversed strides indicates an in-place transpose operation.
   transa = IsInPlaceTransposed(A) ? !transa : transa;
   transb = IsInPlaceTransposed(B) ? !transb : transb;
@@ -164,26 +163,26 @@ void CallCublasLt(cublasLtHandle_t hdl, cudaStream_t stream,
   void* alpha = &alpha_value;
   void* beta = &zero_fp32;
 
-  if (TypeMatch(A->dtype, kDLFloat, 16)) {
+  if (A->dtype == DLDataType{kDLFloat, 16, 1}) {
     ab_type = CUDA_R_16F;
-  } else if (TypeMatch(A->dtype, kDLBfloat, 16)) {
+  } else if (A->dtype == DLDataType{kDLBfloat, 16, 1}) {
     ab_type = CUDA_R_16BF;
-  } else if (TypeMatch(A->dtype, kDLInt, 8)) {
+  } else if (A->dtype == DLDataType{kDLInt, 8, 1}) {
     ab_type = CUDA_R_8I;
-  } else if (TypeMatch(A->dtype, kDLFloat8_e4m3fn, 8)) {
+  } else if (A->dtype == DLDataType{kDLFloat8_e4m3fn, 8, 1}) {
 #if CUDART_VERSION >= 11080
-    TVM_FFI_ICHECK(TypeMatch(B->dtype, kDLFloat8_e4m3fn, 8));
+    TVM_FFI_ICHECK((B->dtype == DLDataType{kDLFloat8_e4m3fn, 8, 1}));
     ab_type = CUDA_R_8F_E4M3;
 #else
     TVM_FFI_THROW(InternalError) << "Float8 (E4M3) is only supported in CUDA 11.8 and above.";
 #endif
   }
 
-  if (TypeMatch(C->dtype, kDLFloat, 16)) {
+  if (C->dtype == DLDataType{kDLFloat, 16, 1}) {
     c_type = CUDA_R_16F;
-  } else if (TypeMatch(C->dtype, kDLBfloat, 16)) {
+  } else if (C->dtype == DLDataType{kDLBfloat, 16, 1}) {
     c_type = CUDA_R_16BF;
-  } else if (TypeMatch(C->dtype, kDLInt, 32)) {
+  } else if (C->dtype == DLDataType{kDLInt, 32, 1}) {
     c_type = CUDA_R_32I;
     compute_type = CUBLAS_COMPUTE_32I;
     scale_type = CUDA_R_32I;
@@ -346,9 +345,9 @@ inline void CallLtIgemm(ffi::PackedArgs args, ffi::Any* ret, cublasLtHandle_t hd
   TVM_FFI_ICHECK_EQ(ElementStride(B), 1);
   TVM_FFI_ICHECK_EQ(ElementStride(C), 1);
 
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
-  TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLInt, 8));
-  TVM_FFI_ICHECK(TypeMatch(C->dtype, kDLInt, 32));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
+  TVM_FFI_ICHECK((A->dtype == DLDataType{kDLInt, 8, 1}));
+  TVM_FFI_ICHECK((C->dtype == DLDataType{kDLInt, 32, 1}));
 
   TVM_FFI_ICHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type";
   int32_t alpha = args.size() > 5 ? args[5].cast<int32_t>() : 1;
@@ -405,7 +404,7 @@ inline void CallGemmEx(ffi::PackedArgs args, ffi::Any* ret, cublasHandle_t hdl)
   TVM_FFI_ICHECK_EQ(ElementStride(B), 1);
   TVM_FFI_ICHECK_EQ(ElementStride(C), 1);
 
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
 
   // C can never be transposed.
   TVM_FFI_ICHECK(!IsInPlaceTransposed(C));
@@ -415,9 +414,9 @@ inline void CallGemmEx(ffi::PackedArgs args, ffi::Any* ret, cublasHandle_t hdl)
   transb = IsInPlaceTransposed(B) ? !transb : transb;
 
   TVM_FFI_ICHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type";
-  TVM_FFI_ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0)
+  TVM_FFI_ICHECK((!(A->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride(A) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
-  TVM_FFI_ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0)
+  TVM_FFI_ICHECK((!(B->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride(B) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
@@ -464,7 +463,7 @@ inline void CallBatchGemmEx(ffi::PackedArgs args, ffi::Any* ret, cublasHandle_t
   TVM_FFI_ICHECK_EQ(ElementStride3D(B), 1);
   TVM_FFI_ICHECK_EQ(ElementStride3D(C), 1);
 
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
 
   // C can never be transposed.
   TVM_FFI_ICHECK(!IsInPlaceTransposed3D(C));
@@ -474,9 +473,9 @@ inline void CallBatchGemmEx(ffi::PackedArgs args, ffi::Any* ret, cublasHandle_t
   transb = IsInPlaceTransposed3D(B) ? !transb : transb;
 
   TVM_FFI_ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, true)) << "Unsupported data type";
-  TVM_FFI_ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride3D(A) % 4 == 0)
+  TVM_FFI_ICHECK((!(A->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride3D(A) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
-  TVM_FFI_ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride3D(B) % 4 == 0)
+  TVM_FFI_ICHECK((!(B->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride3D(B) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
@@ -538,13 +537,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
         CUBLASTryEnableTensorCore(entry_ptr->handle);
 
-        if (TypeEqual(A->dtype, C->dtype)) {
-          TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 16) || TypeMatch(A->dtype, kDLFloat, 32) ||
-                         TypeMatch(A->dtype, kDLFloat, 64));
+        if (A->dtype == C->dtype) {
+          TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 16, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-          if (TypeMatch(A->dtype, kDLFloat, 16))
+          if (A->dtype == DLDataType{kDLFloat, 16, 1})
             CallGemm(args, ret, CublasHgemmOp(entry_ptr->handle));
-          else if (TypeMatch(A->dtype, kDLFloat, 32))
+          else if (A->dtype == DLDataType{kDLFloat, 32, 1})
             CallGemm(args, ret, CublasSgemmOp(entry_ptr->handle));
           else
             CallGemm(args, ret, CublasDgemmOp(entry_ptr->handle));
@@ -565,7 +565,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
         CUBLASTryEnableTensorCore(entry_ptr->handle);
 
-        TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLInt, 8)) << "Expects dtype to be int8\n";
+        TVM_FFI_ICHECK((A->dtype == DLDataType{kDLInt, 8, 1})) << "Expects dtype to be int8\n";
         cublasLtHandle_t ltHandle;
         CHECK_CUBLAS_ERROR(cublasLtCreate(&ltHandle));
         cudaStream_t stream =
@@ -586,13 +586,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
         CuBlasThreadEntry* entry_ptr = CuBlasThreadEntry::ThreadLocal(A->device);
 
         CUBLASTryEnableTensorCore(entry_ptr->handle);
-        if (TypeEqual(A->dtype, C->dtype)) {
-          TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 16) || TypeMatch(A->dtype, kDLFloat, 32) ||
-                         TypeMatch(A->dtype, kDLFloat, 64));
+        if (A->dtype == C->dtype) {
+          TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 16, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-          if (TypeMatch(A->dtype, kDLFloat, 16))
+          if (A->dtype == DLDataType{kDLFloat, 16, 1})
             CallBatchGemm(args, ret, CublasHgemmBatchOp(entry_ptr->handle));
-          else if (TypeMatch(A->dtype, kDLFloat, 32))
+          else if (A->dtype == DLDataType{kDLFloat, 32, 1})
             CallBatchGemm(args, ret, CublasSgemmBatchOp(entry_ptr->handle));
           else
             CallBatchGemm(args, ret, CublasDgemmBatchOp(entry_ptr->handle));
diff --git a/src/runtime/extra/contrib/cudnn/conv_backward.cc b/src/runtime/extra/contrib/cudnn/conv_backward.cc
index 97832248fe53..47b8ab50cdbf 100644
--- a/src/runtime/extra/contrib/cudnn/conv_backward.cc
+++ b/src/runtime/extra/contrib/cudnn/conv_backward.cc
@@ -32,8 +32,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
-
 void ConvolutionBackwardData(int mode, int format, int algo, int dims, int groups, const int pad[],
                              const int stride[], const int dilation[], DLTensor* dy, DLTensor* w,
                              DLTensor* dx, const std::string& conv_dtype) {
diff --git a/src/runtime/extra/contrib/cudnn/conv_forward.cc b/src/runtime/extra/contrib/cudnn/conv_forward.cc
index b7257d35f2b5..aba57b7a9de7 100644
--- a/src/runtime/extra/contrib/cudnn/conv_forward.cc
+++ b/src/runtime/extra/contrib/cudnn/conv_forward.cc
@@ -32,8 +32,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
-
 void ConvolutionForward(int mode, int format, int algo, int dims, int groups, const int pad[],
                         const int stride[], const int dilation[], const DLTensor* x,
                         const DLTensor* w, const DLTensor* y, const std::string& conv_dtype) {
diff --git a/src/runtime/extra/contrib/cudnn/softmax.cc b/src/runtime/extra/contrib/cudnn/softmax.cc
index fde7d5e4e182..50b4f69f7383 100644
--- a/src/runtime/extra/contrib/cudnn/softmax.cc
+++ b/src/runtime/extra/contrib/cudnn/softmax.cc
@@ -31,8 +31,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
-
 void softmax_impl(cudnnSoftmaxAlgorithm_t alg, ffi::PackedArgs args, ffi::Any* ret) {
   auto x = args[0].cast<DLTensor*>();
   auto y = args[1].cast<DLTensor*>();
diff --git a/src/runtime/extra/contrib/hipblas/hipblas.cc b/src/runtime/extra/contrib/hipblas/hipblas.cc
index eae6f7241cc7..18e136b0fdec 100644
--- a/src/runtime/extra/contrib/hipblas/hipblas.cc
+++ b/src/runtime/extra/contrib/hipblas/hipblas.cc
@@ -33,7 +33,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
 inline hipblasOperation_t HIPBLASBooleanToTranspose(bool item) {
   return item ? HIPBLAS_OP_T : HIPBLAS_OP_N;
 }
@@ -117,10 +116,10 @@ struct HipblasDgemmBatchOp {
 
 // Check supported mix-precision computation type and return computeType
 bool CheckMixPrecisionType(DLDataType in_dtype, DLDataType out_dtype, bool int_support = true) {
-  if (int_support && TypeMatch(out_dtype, kDLInt, 32)) {
-    return TypeMatch(in_dtype, kDLInt, 8);
-  } else if (TypeMatch(out_dtype, kDLFloat, 32)) {
-    return TypeMatch(in_dtype, kDLInt, 8) || TypeMatch(in_dtype, kDLFloat, 16);
+  if (int_support && out_dtype == DLDataType{kDLInt, 32, 1}) {
+    return in_dtype == DLDataType{kDLInt, 8, 1};
+  } else if (out_dtype == DLDataType{kDLFloat, 32, 1}) {
+    return in_dtype == DLDataType{kDLInt, 8, 1} || in_dtype == DLDataType{kDLFloat, 16, 1};
   } else {
     return false;
   }
@@ -131,7 +130,7 @@ void CallHipblasLt(hipblasLtHandle_t hdl, hipStream_t stream,
                    const DLTensor* B, const DLTensor* bias, const DLTensor* C, bool transa,
                    bool transb, void* workspace_ptr, size_t workspace_size,
                    hipblasLtEpilogue_t epilogue) {
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
   // Reversed strides indicates an in-place transpose operation.
   transa = IsInPlaceTransposed(A) ? !transa : transa;
   transb = IsInPlaceTransposed(B) ? !transb : transb;
@@ -147,15 +146,15 @@ void CallHipblasLt(hipblasLtHandle_t hdl, hipStream_t stream,
   void* alpha = &one_fp32;
   void* beta = &zero_fp32;
 
-  if (TypeMatch(A->dtype, kDLFloat, 16)) {
+  if (A->dtype == DLDataType{kDLFloat, 16, 1}) {
     ab_type = HIP_R_16F;
-  } else if (TypeMatch(A->dtype, kDLInt, 8)) {
+  } else if (A->dtype == DLDataType{kDLInt, 8, 1}) {
     ab_type = HIP_R_8I;
   }
 
-  if (TypeMatch(C->dtype, kDLFloat, 16)) {
+  if (C->dtype == DLDataType{kDLFloat, 16, 1}) {
     c_type = HIP_R_16F;
-  } else if (TypeMatch(C->dtype, kDLInt, 32)) {
+  } else if (C->dtype == DLDataType{kDLInt, 32, 1}) {
     c_type = HIP_R_32I;
     compute_type = HIPBLAS_COMPUTE_32I;
     scale_type = HIP_R_32I;
@@ -288,7 +287,7 @@ inline void CallGemmEx(ffi::PackedArgs args, ffi::Any* ret, hipblasHandle_t hdl)
   TVM_FFI_ICHECK_EQ(ElementStride(B), 1);
   TVM_FFI_ICHECK_EQ(ElementStride(C), 1);
 
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
 
   // C can never be transposed.
   TVM_FFI_ICHECK(!IsInPlaceTransposed(C));
@@ -298,9 +297,9 @@ inline void CallGemmEx(ffi::PackedArgs args, ffi::Any* ret, hipblasHandle_t hdl)
   transb = IsInPlaceTransposed(B) ? !transb : transb;
 
   TVM_FFI_ICHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type";
-  TVM_FFI_ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0)
+  TVM_FFI_ICHECK((!(A->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride(A) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
-  TVM_FFI_ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0)
+  TVM_FFI_ICHECK((!(B->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride(B) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
@@ -347,7 +346,7 @@ inline void CallBatchGemmEx(ffi::PackedArgs args, ffi::Any* ret, hipblasHandle_t
   TVM_FFI_ICHECK_EQ(ElementStride3D(B), 1);
   TVM_FFI_ICHECK_EQ(ElementStride3D(C), 1);
 
-  TVM_FFI_ICHECK(TypeEqual(A->dtype, B->dtype));
+  TVM_FFI_ICHECK(A->dtype == B->dtype);
 
   // C can never be transposed.
   TVM_FFI_ICHECK(!IsInPlaceTransposed3D(C));
@@ -357,9 +356,9 @@ inline void CallBatchGemmEx(ffi::PackedArgs args, ffi::Any* ret, hipblasHandle_t
   transb = IsInPlaceTransposed3D(B) ? !transb : transb;
 
   TVM_FFI_ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, true)) << "Unsupported data type";
-  TVM_FFI_ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride3D(A) % 4 == 0)
+  TVM_FFI_ICHECK((!(A->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride3D(A) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
-  TVM_FFI_ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride3D(B) % 4 == 0)
+  TVM_FFI_ICHECK((!(B->dtype == DLDataType{kDLInt, 8, 1}) || ColumnStride3D(B) % 4 == 0))
       << "leading dimension must divide 4 for int8 gemm";
   double alpha = args.size() > 5 ? args[5].cast<double>() : 1.0;
   double beta = args.size() > 6 ? args[6].cast<double>() : 0.0;
@@ -419,14 +418,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
                     HipBlasThreadEntry* entry_ptr = HipBlasThreadEntry::ThreadLocal(A->device);
 
-                    if (TypeEqual(A->dtype, C->dtype)) {
-                      TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 16) ||
-                                     TypeMatch(A->dtype, kDLFloat, 32) ||
-                                     TypeMatch(A->dtype, kDLFloat, 64));
+                    if (A->dtype == C->dtype) {
+                      TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 16, 1} ||
+                                      A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                                      A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-                      if (TypeMatch(A->dtype, kDLFloat, 16)) {
+                      if (A->dtype == DLDataType{kDLFloat, 16, 1}) {
                         CallGemm(args, ret, HipblasHgemmOp(entry_ptr->handle));
-                      } else if (TypeMatch(A->dtype, kDLFloat, 32)) {
+                      } else if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
                         CallGemm(args, ret, HipblasSgemmOp(entry_ptr->handle));
                       } else {
                         CallGemm(args, ret, HipblasDgemmOp(entry_ptr->handle));
@@ -441,13 +440,14 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
         HipBlasThreadEntry* entry_ptr = HipBlasThreadEntry::ThreadLocal(A->device);
 
-        if (TypeEqual(A->dtype, C->dtype)) {
-          TVM_FFI_ICHECK(TypeMatch(A->dtype, kDLFloat, 16) || TypeMatch(A->dtype, kDLFloat, 32) ||
-                         TypeMatch(A->dtype, kDLFloat, 64));
+        if (A->dtype == C->dtype) {
+          TVM_FFI_ICHECK((A->dtype == DLDataType{kDLFloat, 16, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 32, 1} ||
+                          A->dtype == DLDataType{kDLFloat, 64, 1}));
 
-          if (TypeMatch(A->dtype, kDLFloat, 16)) {
+          if (A->dtype == DLDataType{kDLFloat, 16, 1}) {
             CallBatchGemm(args, ret, HipblasHgemmBatchOp(entry_ptr->handle));
-          } else if (TypeMatch(A->dtype, kDLFloat, 32)) {
+          } else if (A->dtype == DLDataType{kDLFloat, 32, 1}) {
             CallBatchGemm(args, ret, HipblasSgemmBatchOp(entry_ptr->handle));
           } else {
             CallBatchGemm(args, ret, HipblasDgemmBatchOp(entry_ptr->handle));
diff --git a/src/runtime/extra/contrib/random/random.cc b/src/runtime/extra/contrib/random/random.cc
index 81db658cb86e..0a96185933e3 100644
--- a/src/runtime/extra/contrib/random/random.cc
+++ b/src/runtime/extra/contrib/random/random.cc
@@ -69,8 +69,6 @@
 namespace tvm {
 namespace contrib {
 
-using namespace runtime;
-
 struct RandomThreadLocalEntry {
   RandomEngine random_engine;
   static RandomThreadLocalEntry* ThreadLocal();

From 7e3e9ce29129f07beeb26b0e39e05c251f24ce1c Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 23 Jun 2026 18:29:03 +0000
Subject: [PATCH 5/8] [FIX][IR] Repair PrimType dtype CI fallout

---
 include/tvm/ir/base_expr.h                    |   9 ++
 include/tvm/topi/detail/broadcast.h           |   6 +-
 python/tvm/ir/expr.py                         |  12 +-
 python/tvm/script/parser/core/evaluator.py    |   6 +-
 python/tvm/tirx/buffer.py                     |  13 +-
 python/tvm/tirx/expr.py                       |  24 ++--
 python/tvm/tirx/layout.py                     |   4 +-
 python/tvm/tirx/op.py                         | 114 +++++++++++-------
 .../tirx/script/builder/external_kernel.py    |   7 +-
 python/tvm/tirx/script/builder/ir.py          |  41 ++++---
 python/tvm/tirx/script/parser/operation.py    |   6 +-
 python/tvm/tirx/script/parser/parser.py       |  10 +-
 python/tvm/tirx/stmt.py                       |   4 +-
 .../hexagon/codegen/llvm/codegen_hexagon.cc   |   2 +-
 src/backend/vulkan/codegen/ir_builder.h       |   2 +-
 src/ir/type.cc                                |  10 ++
 .../contrib/codegen_json/codegen_json.h       |  10 +-
 src/relax/op/memory/view.cc                   |   3 +-
 src/relax/transform/lower_alloc_tensor.cc     |   2 +-
 .../transform/static_plan_block_memory.cc     |   4 +-
 src/s_tir/analysis/verify_gpu_code.cc         |   4 +-
 src/s_tir/transform/bound_checker.cc          |   3 +-
 .../merge_shared_memory_allocations.cc        |   6 +-
 src/tirx/transform/ir_utils.h                 |   2 +-
 src/tirx/transform/lower_tvm_builtin.cc       |   2 +-
 src/tirx/transform/split_host_device.cc       |   2 +-
 src/tirx/transform/tvm_ffi_binder.cc          |   2 +-
 27 files changed, 183 insertions(+), 127 deletions(-)

diff --git a/include/tvm/ir/base_expr.h b/include/tvm/ir/base_expr.h
index fbde9ec26aca..0a844bb3ba8e 100644
--- a/include/tvm/ir/base_expr.h
+++ b/include/tvm/ir/base_expr.h
@@ -29,6 +29,7 @@
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/source_map.h>
 
+#include <cstddef>
 #include <cstdint>
 
 namespace tvm {
@@ -207,6 +208,14 @@ class PrimType final : public Type {
     return static_cast<int16_t>(get()->dtype.lanes) > 1;
   }
 
+  /*!
+   * \brief Return the number of bytes needed to store one value of this type.
+   *
+   * This uses the same packed sub-byte dtype sizing rule as runtime tensors.
+   * Scalable vector types have no compile-time storage size and are rejected.
+   */
+  TVM_DLL size_t StorageBytes() const;
+
   /*! \brief Return the same type with a different dtype code, preserving bits and lanes. */
   TVM_FFI_INLINE PrimType WithCode(DLDataTypeCode code) const {
     DLDataType dtype = get()->dtype;
diff --git a/include/tvm/topi/detail/broadcast.h b/include/tvm/topi/detail/broadcast.h
index e5984fd1d787..7c990c5c6e1a 100644
--- a/include/tvm/topi/detail/broadcast.h
+++ b/include/tvm/topi/detail/broadcast.h
@@ -43,9 +43,7 @@ struct BroadcastHelper {
 };
 
 static inline PrimType CommonType(const PrimType& type1, const PrimType& type2) {
-  TVM_FFI_ICHECK(!type1.IsScalableVector() && !type2.IsScalableVector());
-  TVM_FFI_ICHECK_EQ(type1.lanes(), 1);
-  TVM_FFI_ICHECK_EQ(type2.lanes(), 1);
+  TVM_FFI_ICHECK(type1.IsScalar() && type2.IsScalar());
   TVM_FFI_ICHECK(type1.code() == type2.code());
   return type1.bits() < type2.bits() ? type1.WithBits(type2.bits()) : type1;
 }
@@ -59,7 +57,7 @@ inline BroadcastHelper BroadcastShape(const tvm::ffi::Array<tvm::PrimExpr>& shap
   int i;
 
   auto cast_if_needed = [](PrimType to_type, PrimExpr expr) {
-    return to_type->dtype == expr.ty()->dtype ? expr : cast(to_type, expr);
+    return to_type == expr.ty() ? expr : cast(to_type, expr);
   };
 
   for (i = 1; i <= std::min(s1_size, s2_size); ++i) {
diff --git a/python/tvm/ir/expr.py b/python/tvm/ir/expr.py
index e6a33ac9b4a6..dd463150fd51 100644
--- a/python/tvm/ir/expr.py
+++ b/python/tvm/ir/expr.py
@@ -45,8 +45,16 @@ class PrimExpr(BaseExpr):
 
     @property
     def dtype(self):
-        """Return the runtime dtype represented by this expression's PrimType."""
-        return self.ty.dtype
+        """Compatibility alias for the runtime dtype of scalar PrimExpr.
+
+        Returns ``None`` if ``ty`` is unset, ``ty.dtype`` if ``ty`` has a ``dtype``, else
+        ``"handle"``.  New code should inspect ``expr.ty`` (``expr.ty.dtype`` for scalars).
+        """
+        if self.ty is None:
+            return None
+        if hasattr(self.ty, "dtype"):
+            return self.ty.dtype
+        return "handle"
 
 
 @tvm_ffi.register_object("ir.RelaxExpr")
diff --git a/python/tvm/script/parser/core/evaluator.py b/python/tvm/script/parser/core/evaluator.py
index dec30f29a114..0461e56ec984 100644
--- a/python/tvm/script/parser/core/evaluator.py
+++ b/python/tvm/script/parser/core/evaluator.py
@@ -396,7 +396,11 @@ def _eval_if_exp(self, fields: dict[str, Any]) -> Any:
         orelse = self._eval_expr(fields["orelse"])
         if isinstance(test, bool):
             return body if test else orelse
-        elif isinstance(test, tvm.tirx.PrimExpr) and test.dtype.type_code == tvm.DataTypeCode.BOOL:
+        elif (
+            isinstance(test, tvm.tirx.PrimExpr)
+            and isinstance(test.ty, tvm.ir.PrimType)
+            and test.ty.matches_code(tvm.DataTypeCode.BOOL)
+        ):
             return tvm.tirx.op.if_then_else(test, body, orelse)
         else:
             raise TypeError(f"Expected Python bool or TIR bool, but got {type(test)}")
diff --git a/python/tvm/tirx/buffer.py b/python/tvm/tirx/buffer.py
index d021bb317220..43023b4c3cb9 100644
--- a/python/tvm/tirx/buffer.py
+++ b/python/tvm/tirx/buffer.py
@@ -352,7 +352,7 @@ def _infer_shape(shape):
             shape = args
             assert all(
                 isinstance(arg, int)
-                or (isinstance(arg, PrimExpr) and arg.dtype in ["int32", "int64"])
+                or (isinstance(arg, PrimExpr) and arg.ty.dtype in ["int32", "int64"])
                 for arg in shape
             ), "shape must be a list of integers or PrimExprs with dtype int32 or int64"
             # Safely get optional keyword arguments
@@ -462,7 +462,7 @@ def permute(self, *dims) -> "Buffer":
 
     def __getitem__(self, indices):
         from ..arith import Analyzer  # pylint: disable=import-outside-toplevel
-        from .expr import BufferLoad, Ramp, const  # pylint: disable=import-outside-toplevel
+        from .expr import BufferLoad, Ramp  # pylint: disable=import-outside-toplevel
         from .stmt import BufferRegion  # pylint: disable=import-outside-toplevel
 
         if not isinstance(indices, tuple | list):
@@ -483,7 +483,8 @@ def __getitem__(self, indices):
                 else:
                     region.append(
                         Range.from_min_extent(
-                            index, const(1, index.dtype) if isinstance(index, PrimExpr) else 1
+                            index,
+                            tvm.tirx.expr.IntImm(index.ty, 1) if isinstance(index, PrimExpr) else 1,
                         )
                     )
             if has_implicit_slice:
@@ -499,7 +500,7 @@ def __getitem__(self, indices):
                     step = 1 if index.step is None else index.step
                     # We should ensure the dtype of start is the same with that of step.
                     if isinstance(start, tvm.tirx.expr.PrimExpr) and isinstance(step, int):
-                        step = tvm.tirx.expr.IntImm(start.dtype, step)
+                        step = tvm.tirx.expr.IntImm(start.ty, step)
                     lanes = analyzer.simplify((stop - start + step - 1) // step)
                     if lanes == 1:
                         expr_indices.append(start)
@@ -540,8 +541,8 @@ def decl_buffer(
         layout = TileLayout(S[tuple(shape)]) if shape else None
 
     if offset_factor != 0 and elem_offset is None:
-        shape_dtype = shape[0].dtype if shape and hasattr(shape[0], "dtype") else "int32"
-        elem_offset = Var(f"{name}_elem_offset", shape_dtype)
+        shape_ty = shape[0].ty if shape and isinstance(shape[0], PrimExpr) else "int32"
+        elem_offset = Var(f"{name}_elem_offset", shape_ty)
     if data is None:
         # Bool is represented as uint1 in the IR, but stored as int8
         storage_type = dtype if isinstance(dtype, PrimType) else PrimType(dtype)
diff --git a/python/tvm/tirx/expr.py b/python/tvm/tirx/expr.py
index 2e01f0b6d556..ec744acf5093 100644
--- a/python/tvm/tirx/expr.py
+++ b/python/tvm/tirx/expr.py
@@ -132,7 +132,7 @@ def __rmod__(self, other: PrimExpr) -> PrimExpr:
         return _ffi_api._OpFloorMod(other, self, None)  # type: ignore
 
     def __neg__(self) -> PrimExpr:
-        neg_one = const(-1, self.dtype)  # type: ignore
+        neg_one = const(-1, self.expr_ty().dtype)
         return self.__mul__(neg_one)
 
     def __lshift__(self, other: PrimExpr) -> PrimExpr:
@@ -215,7 +215,7 @@ def equal(self, other: PrimExpr, span: Span | None = None) -> bool:
         """
         return _ffi_api._OpEQ(self, other, span)  # type: ignore
 
-    def astype(self, dtype: str, span: Span | None = None) -> PrimExpr:
+    def astype(self, dtype: str | ir.PrimType, span: Span | None = None) -> PrimExpr:
         """Cast the expression to other type.
 
         Parameters
@@ -477,12 +477,10 @@ def __init__(
                 raise TypeError("dom need to be Range")
 
         name = var if var is not None else "iter"
-        dtype = "int32" if dom is None else dom.extent.dtype
+        dtype = "int32" if dom is None else dom.extent.ty
         var = Var(name, dtype=dtype, span=span) if not isinstance(var, Var) else var
         if dom is not None:
-            assert var.dtype == dom.extent.dtype, (
-                "IterVar's Var dtype must match its domain's extent's dtype"
-            )
+            assert var.ty == dom.extent.ty, "IterVar's Var type must match its domain's extent type"
         self.__init_handle_by_constructor__(
             _ffi_api.IterVar,
             dom,
@@ -618,7 +616,9 @@ class FloatImm(ConstExpr):
 
     value: float
 
-    def __init__(self, dtype: str, value: float, span: Span | None = None) -> None:
+    def __init__(self, dtype: str | ir.PrimType, value: float, span: Span | None = None) -> None:
+        if isinstance(dtype, ir.PrimType):
+            dtype = dtype.dtype
         self.__init_handle_by_constructor__(
             tvm.ir._ffi_api.FloatImm,
             dtype,
@@ -648,7 +648,9 @@ class IntImm(ConstExpr):
 
     value: int
 
-    def __init__(self, dtype: str, value: int, span: Span | None = None) -> None:
+    def __init__(self, dtype: str | ir.PrimType, value: int, span: Span | None = None) -> None:
+        if isinstance(dtype, ir.PrimType):
+            dtype = dtype.dtype
         self.__init_handle_by_constructor__(
             tvm.ir._ffi_api.IntImm,
             dtype,
@@ -725,7 +727,9 @@ class Cast(PrimExprWithOp):
 
     value: PrimExpr
 
-    def __init__(self, dtype, value, span: Span | None = None) -> None:
+    def __init__(self, dtype: str | ir.PrimType, value, span: Span | None = None) -> None:
+        if isinstance(dtype, ir.PrimType):
+            dtype = dtype.dtype
         self.__init_handle_by_constructor__(_ffi_api.Cast, dtype, value, span)  # type: ignore
 
 
@@ -1336,7 +1340,7 @@ class Call(PrimExprWithOp):
 
     def __init__(
         self,
-        dtype: str,
+        dtype: str | ir.PrimType,
         op: Op | str,
         args: list[PrimExpr],
         attrs: ir.Attrs | dict | None = None,
diff --git a/python/tvm/tirx/layout.py b/python/tvm/tirx/layout.py
index 29a19d746dee..11d1e140ae16 100644
--- a/python/tvm/tirx/layout.py
+++ b/python/tvm/tirx/layout.py
@@ -332,10 +332,10 @@ def _get_default_strides(data: list[int | PrimExpr], stride: int = 1) -> tuple:
         # produce for int64-shaped buffers (otherwise the last stride stays a
         # Python ``int`` -> int32 IntImm and breaks structural-equal).
         for t in data:
-            if isinstance(t, PrimExpr) and t.dtype != "int32":
+            if isinstance(t, PrimExpr) and t.ty.dtype != "int32":
                 from .expr import IntImm  # pylint: disable=import-outside-toplevel
 
-                stride = IntImm(t.dtype, stride)
+                stride = IntImm(t.ty, stride)
                 break
         res = list()
         for t in reversed(data):
diff --git a/python/tvm/tirx/op.py b/python/tvm/tirx/op.py
index a7a2889c444b..9a54e915bb0b 100644
--- a/python/tvm/tirx/op.py
+++ b/python/tvm/tirx/op.py
@@ -31,7 +31,7 @@
 
 from . import _ffi_api
 from .buffer import Buffer
-from .expr import BufferLoad, Call, CommReducer, IntImm, PrimExprWithOp, Var
+from .expr import BufferLoad, Call, CommReducer, ExprOp, IntImm, PrimExprWithOp, Var
 
 tir = tirx  # alias for backward compat with upstream tir.convert() calls
 
@@ -57,6 +57,24 @@ def _canonical_device_intrin_name(func_name: str) -> str:
     return func_name
 
 
+def _primexpr_ty(expr):
+    """Return ``expr.ty`` when it is a PrimType, else ``expr.expr_ty()`` for ExprOp values."""
+    ty = getattr(expr, "ty", None)
+    if isinstance(ty, tvm.ir.PrimType):
+        return ty
+    if isinstance(expr, ExprOp):
+        return expr.expr_ty()
+    raise TypeError(f"Cannot determine PrimExpr type for {type(expr).__name__}")
+
+
+def _primexpr_dtype(expr):
+    """Return the runtime dtype of a primitive expression without using PrimExpr.dtype."""
+    ty = _primexpr_ty(expr)
+    if not isinstance(ty, tvm.ir.PrimType):
+        raise TypeError(f"Expected PrimType for {type(expr).__name__}, but got {ty}")
+    return ty.dtype
+
+
 def _pack_buffer(buf, span=None):
     """Build intrinsics that packs the buffer."""
     shape = Call("handle", "tirx.tvm_stack_make_shape", buf.shape, span=span)
@@ -187,7 +205,7 @@ def call_cpacked(*args, span=None):
     return Call("int32", Op.get("tirx.tvm_call_cpacked"), call_args, span=span)
 
 
-def call_intrin(dtype, func_name, *args, attrs=None, span=None):
+def call_intrin(dtype: str | tvm.ir.PrimType, func_name, *args, attrs=None, span=None):
     """Build expression by calling an intrinsic function.
 
     Intrinsics can be overloaded with multiple data types via
@@ -272,8 +290,9 @@ def call_extern(dtype, func_name, *args, span=None):
 
 def _require_float_arg(op_name, x):
     x = tirx.convert(x)
-    if "float" not in x.dtype and "bfloat" not in x.dtype:
-        raise TypeError(f"tirx.{op_name} only supports floating-point inputs, but got {x.dtype}")
+    dtype = _primexpr_dtype(x)
+    if "float" not in dtype:  # substring test; "bfloat16" also contains "float"
+        raise TypeError(f"tirx.{op_name} only supports floating-point inputs, but got {dtype}")
     return x
 
 
@@ -476,8 +495,8 @@ def call_tir(global_var: tvm.ir.GlobalVar, *args):
     dtype = "void"
     if global_var.ty is not None:
         ret_ty = global_var.ty.ret
-        if hasattr(ret_ty, "dtype"):
-            dtype = ret_ty.dtype
+        if isinstance(ret_ty, tvm.ir.PrimType):
+            dtype = ret_ty
 
     return Call(dtype=dtype, op=global_var, args=args)
 
@@ -680,7 +699,7 @@ def tvm_thread_invariant(cond):
         The call expression.
     """
     assert isinstance(cond, PrimExpr)
-    return call_intrin(cond.dtype, "tirx.tvm_thread_invariant", cond)
+    return call_intrin(_primexpr_ty(cond), "tirx.tvm_thread_invariant", cond)
 
 
 def tvm_storage_sync(storage_scope, is_load=False, num_blocks=-1):
@@ -742,7 +761,9 @@ def tvm_warp_shuffle(mask, value, warp_id, width, warp_size):
     call : PrimExpr
         The call expression.
     """
-    return call_intrin(value.dtype, "tirx.tvm_warp_shuffle", mask, value, warp_id, width, warp_size)
+    return call_intrin(
+        _primexpr_ty(value), "tirx.tvm_warp_shuffle", mask, value, warp_id, width, warp_size
+    )
 
 
 def tvm_warp_shuffle_up(mask, value, offset, width, warp_size):
@@ -768,7 +789,7 @@ def tvm_warp_shuffle_up(mask, value, offset, width, warp_size):
         The call expression.
     """
     return call_intrin(
-        value.dtype, "tirx.tvm_warp_shuffle_up", mask, value, offset, width, warp_size
+        _primexpr_ty(value), "tirx.tvm_warp_shuffle_up", mask, value, offset, width, warp_size
     )
 
 
@@ -795,7 +816,7 @@ def tvm_warp_shuffle_down(mask, value, offset, width, warp_size):
         The call expression.
     """
     return call_intrin(
-        value.dtype, "tirx.tvm_warp_shuffle_down", mask, value, offset, width, warp_size
+        _primexpr_ty(value), "tirx.tvm_warp_shuffle_down", mask, value, offset, width, warp_size
     )
 
 
@@ -821,7 +842,7 @@ def tvm_warp_shuffle_xor(mask, value, lane_mask, width, warp_size):
         The call expression.
     """
     return call_intrin(
-        value.dtype, "tirx.tvm_warp_shuffle_xor", mask, value, lane_mask, width, warp_size
+        _primexpr_ty(value), "tirx.tvm_warp_shuffle_xor", mask, value, lane_mask, width, warp_size
     )
 
 
@@ -1208,7 +1229,8 @@ def trace(args, trace_action="tvm.default_trace_action"):
         raise Exception("tvm.tirx.trace consumes the args as list type")
     call_args = [_pack_buffer(x) if isinstance(x, Buffer) else x for x in args]
     call_args.insert(0, trace_action)
-    return tvm.tirx.Call(args[-1].dtype, Op.get("tirx.tvm_call_trace_packed"), call_args)
+    dtype = _primexpr_ty(args[-1]) if isinstance(args[-1], PrimExpr) else args[-1].dtype
+    return tvm.tirx.Call(dtype, Op.get("tirx.tvm_call_trace_packed"), call_args)
 
 
 def min_value(dtype, span=None):
@@ -1304,7 +1326,7 @@ def exp(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.exp", x)
+    return call_intrin(_primexpr_ty(x), "tirx.exp", x)
 
 
 def exp2(x):
@@ -1321,7 +1343,7 @@ def exp2(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.exp2", x)
+    return call_intrin(_primexpr_ty(x), "tirx.exp2", x)
 
 
 def exp10(x):
@@ -1338,7 +1360,7 @@ def exp10(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.exp10", x)
+    return call_intrin(_primexpr_ty(x), "tirx.exp10", x)
 
 
 def fma(x, y, z):
@@ -1363,7 +1385,7 @@ def fma(x, y, z):
     x = tir.convert(x)
     y = tir.convert(y)
     z = tir.convert(z)
-    return call_intrin(x.dtype, "tirx.fma", x, y, z)
+    return call_intrin(_primexpr_ty(x), "tirx.fma", x, y, z)
 
 
 def erf(x):
@@ -1380,7 +1402,7 @@ def erf(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.erf", x)
+    return call_intrin(_primexpr_ty(x), "tirx.erf", x)
 
 
 def tanh(x):
@@ -1397,7 +1419,7 @@ def tanh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.tanh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.tanh", x)
 
 
 def sigmoid(x):
@@ -1414,7 +1436,7 @@ def sigmoid(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.sigmoid", x)
+    return call_intrin(_primexpr_ty(x), "tirx.sigmoid", x)
 
 
 def log(x):
@@ -1431,7 +1453,7 @@ def log(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.log", x)
+    return call_intrin(_primexpr_ty(x), "tirx.log", x)
 
 
 def log2(x):
@@ -1448,7 +1470,7 @@ def log2(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.log2", x)
+    return call_intrin(_primexpr_ty(x), "tirx.log2", x)
 
 
 def log10(x):
@@ -1465,7 +1487,7 @@ def log10(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.log10", x)
+    return call_intrin(_primexpr_ty(x), "tirx.log10", x)
 
 
 def log1p(x):
@@ -1482,7 +1504,7 @@ def log1p(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.log1p", x)
+    return call_intrin(_primexpr_ty(x), "tirx.log1p", x)
 
 
 def tan(x):
@@ -1499,7 +1521,7 @@ def tan(x):
         The result.
     """
     x = _require_float_arg("tan", x)
-    return call_intrin(x.dtype, "tirx.tan", x)
+    return call_intrin(_primexpr_ty(x), "tirx.tan", x)
 
 
 def cos(x):
@@ -1516,7 +1538,7 @@ def cos(x):
         The result.
     """
     x = _require_float_arg("cos", x)
-    return call_intrin(x.dtype, "tirx.cos", x)
+    return call_intrin(_primexpr_ty(x), "tirx.cos", x)
 
 
 def cosh(x):
@@ -1533,7 +1555,7 @@ def cosh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.cosh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.cosh", x)
 
 
 def acos(x):
@@ -1550,7 +1572,7 @@ def acos(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.acos", x)
+    return call_intrin(_primexpr_ty(x), "tirx.acos", x)
 
 
 def acosh(x):
@@ -1567,7 +1589,7 @@ def acosh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.acosh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.acosh", x)
 
 
 def sin(x):
@@ -1584,7 +1606,7 @@ def sin(x):
         The result.
     """
     x = _require_float_arg("sin", x)
-    return call_intrin(x.dtype, "tirx.sin", x)
+    return call_intrin(_primexpr_ty(x), "tirx.sin", x)
 
 
 def sinh(x):
@@ -1601,7 +1623,7 @@ def sinh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.sinh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.sinh", x)
 
 
 def asin(x):
@@ -1618,7 +1640,7 @@ def asin(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.asin", x)
+    return call_intrin(_primexpr_ty(x), "tirx.asin", x)
 
 
 def asinh(x):
@@ -1635,7 +1657,7 @@ def asinh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.asinh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.asinh", x)
 
 
 def atan(x):
@@ -1652,7 +1674,7 @@ def atan(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.atan", x)
+    return call_intrin(_primexpr_ty(x), "tirx.atan", x)
 
 
 def atanh(x):
@@ -1669,7 +1691,7 @@ def atanh(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.atanh", x)
+    return call_intrin(_primexpr_ty(x), "tirx.atanh", x)
 
 
 def atan2(x1, x2):
@@ -1690,7 +1712,7 @@ def atan2(x1, x2):
     """
     x1 = tir.convert(x1)
     x2 = tir.convert(x2)
-    return call_intrin(x1.dtype, "tirx.atan2", x1, x2)
+    return call_intrin(_primexpr_ty(x1), "tirx.atan2", x1, x2)
 
 
 def sqrt(x):
@@ -1707,7 +1729,7 @@ def sqrt(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.sqrt", x)
+    return call_intrin(_primexpr_ty(x), "tirx.sqrt", x)
 
 
 def rsqrt(x):
@@ -1724,7 +1746,7 @@ def rsqrt(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.rsqrt", x)
+    return call_intrin(_primexpr_ty(x), "tirx.rsqrt", x)
 
 
 def clz(x):
@@ -1971,7 +1993,7 @@ def nextafter(x1, x2):
     """
     x1 = tir.convert(x1)
     x2 = tir.convert(x2)
-    return call_intrin(x1.dtype, "tirx.nextafter", x1, x2)  # type: ignore
+    return call_intrin(_primexpr_ty(x1), "tirx.nextafter", x1, x2)  # type: ignore
 
 
 def hypot(x1, x2):
@@ -1992,7 +2014,7 @@ def hypot(x1, x2):
     """
     x1 = tir.convert(x1)
     x2 = tir.convert(x2)
-    return call_intrin(x1.dtype, "tirx.hypot", x1, x2)  # type: ignore
+    return call_intrin(_primexpr_ty(x1), "tirx.hypot", x1, x2)  # type: ignore
 
 
 def copysign(x1, x2):
@@ -2013,7 +2035,7 @@ def copysign(x1, x2):
     """
     x1 = tir.convert(x1)
     x2 = tir.convert(x2)
-    return call_intrin(x1.dtype, "tirx.copysign", x1, x2)  # type: ignore
+    return call_intrin(_primexpr_ty(x1), "tirx.copysign", x1, x2)  # type: ignore
 
 
 def ldexp(x1, x2):
@@ -2034,7 +2056,7 @@ def ldexp(x1, x2):
     """
     x1 = tir.convert(x1)
     x2 = tir.convert(x2)
-    return call_intrin(x1.dtype, "tirx.ldexp", x1, x2)  # type: ignore
+    return call_intrin(_primexpr_ty(x1), "tirx.ldexp", x1, x2)  # type: ignore
 
 
 def likely(cond, span=None):
@@ -2086,7 +2108,7 @@ def selector(var, pred, span=None):
     active domain for which ``pred`` is true. It is intended for compiler
     metadata and should not survive to executable codegen.
     """
-    return call_intrin(var.dtype, "tirx.selector", var, pred, span=span)
+    return call_intrin(_primexpr_ty(var), "tirx.selector", var, pred, span=span)
 
 
 def isnan(x, span=None):
@@ -2223,7 +2245,7 @@ def popcount(x):
         The result.
     """
     x = tir.convert(x)
-    return call_intrin(x.dtype, "tirx.popcount", x)
+    return call_intrin(_primexpr_ty(x), "tirx.popcount", x)
 
 
 def q_multiply_shift(x, y, q, s):
@@ -2356,7 +2378,7 @@ def fmod(x, y):
     """
     x = tir.convert(x)
     y = tir.convert(y)
-    return call_intrin(x.dtype, "tirx.fmod", x, y)
+    return call_intrin(_primexpr_ty(x), "tirx.fmod", x, y)
 
 
 def if_then_else(cond, t, f, span=None):
@@ -2667,7 +2689,7 @@ def _make_reduce(expr, axis, where=None, init=None):
             rhs = []
             dtypes = []
             for i in range(size):
-                dtype = expr[i].dtype
+                dtype = _primexpr_dtype(expr[i])
                 dtypes.append(dtype)
                 lname = code.co_varnames[0] + "_" + str(i)
                 lhs.append(Var(lname, dtype))
@@ -2680,7 +2702,7 @@ def _make_reduce(expr, axis, where=None, init=None):
         else:
             assert isinstance(expr, tvm.ir.PrimExpr)
             size = 1
-            dtype = expr.dtype
+            dtype = _primexpr_dtype(expr)
             lvar = Var(code.co_varnames[0], dtype)
             rvar = Var(code.co_varnames[1], dtype)
             result = [fcombine(lvar, rvar)]
diff --git a/python/tvm/tirx/script/builder/external_kernel.py b/python/tvm/tirx/script/builder/external_kernel.py
index d56ed9ea0384..68e597d3f8ff 100644
--- a/python/tvm/tirx/script/builder/external_kernel.py
+++ b/python/tvm/tirx/script/builder/external_kernel.py
@@ -28,6 +28,7 @@
 
 from tvm import __version__ as tvm_version
 from tvm import tirx
+from tvm.ir import PrimExpr
 from tvm.runtime import Module, const
 from tvm.support import nvcc
 
@@ -136,8 +137,10 @@ def compile_to_device_module(  # pylint: disable=arguments-differ
             "threadIdx.y",
             "threadIdx.z",
         ][: len(grid[1])]
-        runtime_args = [arg if hasattr(arg, "dtype") else const(arg) for arg in args]
-        kernel_arg_types = [arg.dtype for arg in runtime_args]
+        runtime_args = [arg if isinstance(arg, PrimExpr) else const(arg) for arg in args]
+        kernel_arg_types = [
+            str(arg.ty.dtype) if isinstance(arg, PrimExpr) else arg.dtype for arg in runtime_args
+        ]
         runtime_args = runtime_args + list(grid[0]) + list(grid[1])
 
         # Reuse compilation path from SourceModule
diff --git a/python/tvm/tirx/script/builder/ir.py b/python/tvm/tirx/script/builder/ir.py
index 2c18c61136b8..12db12aa99db 100644
--- a/python/tvm/tirx/script/builder/ir.py
+++ b/python/tvm/tirx/script/builder/ir.py
@@ -520,7 +520,7 @@ def match_buffer(
             raise ValueError("Shape must be specified when binding input param")
     shape = (shape,) if isinstance(shape, PrimExpr | Integral) else shape
     if strides is not None:
-        idx_dtype = shape[0].dtype if isinstance(shape[0], PrimExpr) else "int32"
+        idx_dtype = shape[0].ty if isinstance(shape[0], PrimExpr) else "int32"
         strides = [Var(s, idx_dtype) if isinstance(s, str) else s for s in strides]
     else:
         strides = []
@@ -1012,8 +1012,8 @@ def _as_range(dom: ir.Range | list[PrimExpr]) -> ir.Range:
         if isinstance(extent, tir.IntImm):
             return ir.Range.from_min_extent(dom[0], extent)
         return ir.Range(dom[0], dom[1])
-    if hasattr(dom, "dtype"):
-        return ir.Range(IntImm(dom.dtype, 0), dom)
+    if isinstance(dom, PrimExpr):
+        return ir.Range(IntImm(dom.ty, 0), dom)
     return ir.Range(0, dom)
 
 
@@ -1204,8 +1204,8 @@ def serial(
             annotations["disable_unroll"] = True
     if stop is None:
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     return _ffi_api.Serial(start, stop, annotations, step)  # type: ignore[attr-defined] # pylint: disable=no-member
@@ -1241,8 +1241,8 @@ def parallel(
     """
     if stop is None:
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     return _ffi_api.Parallel(start, stop, annotations, step)  # type: ignore[attr-defined] # pylint: disable=no-member
@@ -1278,8 +1278,8 @@ def vectorized(
     """
     if stop is None:
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     return _ffi_api.Vectorized(start, stop, annotations, step)  # type: ignore[attr-defined] # pylint: disable=no-member
@@ -1315,8 +1315,8 @@ def unroll(
     """
     if stop is None:
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     return _ffi_api.Unroll(start, stop, annotations, step)  # type: ignore[attr-defined] # pylint: disable=no-member
@@ -1355,14 +1355,14 @@ def thread_binding(
             raise ValueError("Thread cannot be None for thread_binding")
         thread = stop
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     elif stop is None:
         stop = start
-        if hasattr(start, "dtype"):
-            start = IntImm(start.dtype, 0)
+        if isinstance(start, PrimExpr):
+            start = IntImm(start.ty, 0)
         else:
             start = 0
     return _ffi_api.ThreadBinding(  # type: ignore[attr-defined] # pylint: disable=no-member
@@ -1502,7 +1502,8 @@ def as_var(self, rhs_dtype=None):
             else:
                 raise TypeError(f"Invalid type for T.let: {self.type_spec}")
         elif rhs_dtype is not None:
-            return Var("", ir.PrimType(rhs_dtype))
+            rhs_ty = rhs_dtype if isinstance(rhs_dtype, Type) else ir.PrimType(rhs_dtype)
+            return Var("", rhs_ty)
         else:
             raise TypeError("T.let requires either a type or an RHS value")
 
@@ -2799,7 +2800,7 @@ def comm_reducer(combiner: Callable, identity: list[PrimExpr]) -> CommReducer:
         if isinstance(i, int):
             args.append(Var(name, "int32"))
         else:
-            args.append(Var(name, i.dtype))
+            args.append(Var(name, i.ty))
     res = combiner(*args)
     if not isinstance(res, tuple):
         res = (res,)
@@ -2986,19 +2987,19 @@ class WebGPUNamespace:
     def subgroup_shuffle(var, lane):
         if isinstance(var, Buffer):
             var = var[0]
-        return _tir_op.call_intrin(var.dtype, "tirx.webgpu.subgroup_shuffle", var, lane)
+        return _tir_op.call_intrin(var.ty, "tirx.webgpu.subgroup_shuffle", var, lane)
 
     @staticmethod
     def subgroup_shuffle_up(var, delta):
         if isinstance(var, Buffer):
             var = var[0]
-        return _tir_op.call_intrin(var.dtype, "tirx.webgpu.subgroup_shuffle_up", var, delta)
+        return _tir_op.call_intrin(var.ty, "tirx.webgpu.subgroup_shuffle_up", var, delta)
 
     @staticmethod
     def subgroup_shuffle_down(var, delta):
         if isinstance(var, Buffer):
             var = var[0]
-        return _tir_op.call_intrin(var.dtype, "tirx.webgpu.subgroup_shuffle_down", var, delta)
+        return _tir_op.call_intrin(var.ty, "tirx.webgpu.subgroup_shuffle_down", var, delta)
 
 
 webgpu = WebGPUNamespace()
diff --git a/python/tvm/tirx/script/parser/operation.py b/python/tvm/tirx/script/parser/operation.py
index c6cb50f291af..4f362b7d3acf 100644
--- a/python/tvm/tirx/script/parser/operation.py
+++ b/python/tvm/tirx/script/parser/operation.py
@@ -28,7 +28,9 @@ def _register_expr_op(ty: type):  # pylint: disable=invalid-name
     ty._dispatch_type = ty  # pylint: disable=protected-access
 
     def _expr_ty(expr):
-        ty = expr.expr_ty()
+        ty = expr.ty if isinstance(expr, tirx.PrimExpr) else None  # PrimExpr: read .ty first
+        if not isinstance(ty, PrimType):
+            ty = expr.expr_ty()  # otherwise (or if .ty is not a PrimType) ask expr_ty()
         if not isinstance(ty, PrimType):
             raise TypeError(f"Expected a PrimType expression, but got {ty}")
         return ty
@@ -62,7 +64,7 @@ def _get_type_str(ty: PrimType):
 
     def _auto_broadcast(a, b, op):
         if isinstance(a, int):
-            if hasattr(b, "dtype"):
+            if isinstance(b, tirx.PrimExpr) or hasattr(b, "expr_ty"):
                 b_ty = _expr_ty(b)
                 if b_ty.matches_code(DataTypeCode.INT, DataTypeCode.UINT, DataTypeCode.BOOL):
                     a = IntImm(_get_type_str(b_ty), a)
diff --git a/python/tvm/tirx/script/parser/parser.py b/python/tvm/tirx/script/parser/parser.py
index 54c18db374d8..b2f2b30063a8 100644
--- a/python/tvm/tirx/script/parser/parser.py
+++ b/python/tvm/tirx/script/parser/parser.py
@@ -225,13 +225,13 @@ def bind_assign_value(self: Parser, node: doc.expr, var_name: str, value: Any) -
             value = tvm.tirx.const(value)
         if not isinstance(value, tvm.tirx.StringImm):
             # x = expr -> scalar (auto-typed from value)
-            scalar = T.local_scalar(dtype=str(value.dtype))
+            scalar = T.local_scalar(dtype=str(value.ty.dtype))
             IRBuilder.name(var_name, scalar.scalar.buffer)
             T.buffer_store(scalar.scalar.buffer, value, [0])
             return scalar.scalar
         else:
             # StringImm: x = expr -> immutable Bind var
-            ann_var = tvm.tirx.Var(var_name, value.dtype)
+            ann_var = tvm.tirx.Var(var_name, value.ty)
             IRBuilder.name(var_name, ann_var)
             T.Bind(value, var=ann_var)
             return ann_var
@@ -539,7 +539,7 @@ def visit_ann_assign(self: Parser, node: doc.AnnAssign) -> None:
         if raw_ann.type_spec is not None:
             ann_var = raw_ann.as_var()
         else:
-            ann_var = raw_ann.as_var(rhs_dtype=rhs.dtype)
+            ann_var = raw_ann.as_var(rhs_dtype=rhs.ty)
         if not isinstance(ann_var, Var):
             self.report_error(node.annotation, "Annotation should resolve to Var")
         self.eval_assign(target=lhs, source=ann_var, bind_value=bind_assign_value)
@@ -619,7 +619,7 @@ def visit_function_def(self: Parser, node: doc.FunctionDef) -> None:
             if node.returns is not None:
                 ret_type = self.eval_expr(node.returns)
                 if callable(ret_type):
-                    ret_type = PrimType(ret_type().dtype)
+                    ret_type = ret_type().ty
                 T.func_ret(ret_type)
             with self.with_dispatch_token("tirx"):
                 # TODO: handle different types of arguments:
@@ -888,7 +888,7 @@ def visit_tvm_declare_function(self: Parser, node: doc.FunctionDef) -> GlobalVar
         if node.returns is not None:
             ret_type = self.eval_expr(node.returns)
             if callable(ret_type):
-                ret_type = PrimType(ret_type().dtype)
+                ret_type = ret_type().ty
 
         arg_annotations = []
         for arg in node.args.args:
diff --git a/python/tvm/tirx/stmt.py b/python/tvm/tirx/stmt.py
index 532bf35b254a..543ff99fed66 100644
--- a/python/tvm/tirx/stmt.py
+++ b/python/tvm/tirx/stmt.py
@@ -35,7 +35,7 @@
 
 from tvm.ir import Op, PrimExpr, Range, Span
 from tvm.runtime import Object, Scriptable, const
-from tvm.tirx import FloatImm
+from tvm.tirx import FloatImm, IntImm
 
 from . import _ffi_api
 from .buffer import Buffer
@@ -656,7 +656,7 @@ def __getitem__(self, indices):
                 new_min = old_range.min + index
                 new_region.append(
                     Range.from_min_extent(
-                        new_min, const(1, index.dtype) if isinstance(index, PrimExpr) else 1
+                        new_min, IntImm(index.ty, 1) if isinstance(index, PrimExpr) else 1
                     )
                 )
         # Fill remaining dimensions with their original ranges
diff --git a/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc b/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc
index 60959f2aa9fe..17aba2d3fc40 100644
--- a/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc
+++ b/src/backend/hexagon/codegen/llvm/codegen_hexagon.cc
@@ -68,7 +68,7 @@ namespace codegen {
 
 TVM_FFI_INLINE int GetVectorBytes(const PrimType& dtype) {
   TVM_FFI_ICHECK(dtype.IsFixedLengthVector() || dtype.IsScalar());
-  return dtype.bits() * dtype.lanes() / 8;
+  return static_cast<int>(dtype.StorageBytes());  // NOTE(review): verify sub-byte type rounding
 }
 
 // Hexagon code generation
diff --git a/src/backend/vulkan/codegen/ir_builder.h b/src/backend/vulkan/codegen/ir_builder.h
index 85dbdc00cff4..7e8844682c4e 100644
--- a/src/backend/vulkan/codegen/ir_builder.h
+++ b/src/backend/vulkan/codegen/ir_builder.h
@@ -50,7 +50,7 @@ struct SType {
   /*! \brief The Id to represent type */
   uint32_t id{0};
   /*! \brief corresponding TVM type */
-  tvm::PrimType type;
+  tvm::PrimType type{tvm::PrimType::Void()};
   /*! \brief content type id if it is a pointer/struct-array class */
   uint32_t element_type_id{0};
   /*! \brief The storage class, if it is a pointer */
diff --git a/src/ir/type.cc b/src/ir/type.cc
index 2464f6faa659..20bbe9c0e58a 100644
--- a/src/ir/type.cc
+++ b/src/ir/type.cc
@@ -21,6 +21,7 @@
  * \file src/ir/type.cc
  * \brief Common type system AST nodes throughout the IR.
  */
+#include <tvm/ffi/container/tensor.h>
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ir/type.h>
@@ -135,6 +136,15 @@ PrimType PrimType::ScalableVector(DLDataTypeCode code, int bits, int lanes) {
   return PrimType(ScalableVectorDType(code, bits, lanes));
 }
 
+size_t PrimType::StorageBytes() const {  // throws InternalError for non-fixed vector types
+  int16_t encoded_lanes = static_cast<int16_t>(get()->dtype.lanes);
+  if (TVM_FFI_PREDICT_FALSE(encoded_lanes < 0)) {  // negative signed lanes => not fixed-length
+    TVM_FFI_THROW(InternalError)
+        << "Cannot compute compile-time storage bytes for non-fixed vector type " << get()->dtype;
+  }
+  return ffi::GetDataSize(1, get()->dtype);  // presumably bytes for one element; see GetDataSize
+}
+
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef().def("ir.PrimType", [](DLDataType dtype) { return PrimType(dtype); });
diff --git a/src/relax/backend/contrib/codegen_json/codegen_json.h b/src/relax/backend/contrib/codegen_json/codegen_json.h
index edebb7593fca..03133599a58a 100644
--- a/src/relax/backend/contrib/codegen_json/codegen_json.h
+++ b/src/relax/backend/contrib/codegen_json/codegen_json.h
@@ -89,8 +89,8 @@ class OpAttrExtractor {
     }
   }
 
-  void Visit(const char* key, DataType* value) {
-    if (!value->is_void()) {
+  void Visit(const char* key, DLDataType* value) {
+    if (!(value->code == kDLOpaqueHandle && value->bits == 0 && value->lanes == 0)) {
       SetNodeAttr(key, ffi::String(ffi::DLDataTypeToString(*value)));
     } else {
       SetNodeAttr(key, ffi::String(""));
@@ -201,7 +201,7 @@ class OpAttrExtractor {
           break;
         }
         case ffi::TypeIndex::kTVMFFIDataType: {
-          DataType value(field_value.cast<DLDataType>());
+          DLDataType value = field_value.cast<DLDataType>();
           this->Visit(field_info->name.data, &value);
           break;
         }
@@ -282,7 +282,7 @@ class JSONSerializer : public relax::MemoizedExprTranslator<NodeEntries> {
         ShapeExpr output_shape = tensor_ty->shape.value().as_or_throw<ShapeExpr>();
         ret.push_back(JSONGraphNodeEntry(node_id, i));
         shape.emplace_back(GetIntShape(output_shape->values));
-        dtype.emplace_back(DType2String(tensor_ty->dtype));
+        dtype.emplace_back(DType2String(tensor_ty->dtype->dtype));
       }
       node->SetNumOutput(tuple_ty->fields.size());
     } else {
@@ -292,7 +292,7 @@ class JSONSerializer : public relax::MemoizedExprTranslator<NodeEntries> {
       ShapeExpr output_shape = tensor_ty->shape.value().as_or_throw<ShapeExpr>();
 
       shape.emplace_back(GetIntShape(output_shape->values));
-      dtype.emplace_back(DType2String(tensor_ty->dtype));
+      dtype.emplace_back(DType2String(tensor_ty->dtype->dtype));
       ret.push_back(JSONGraphNodeEntry(node_id, 0));
     }
     node->SetShape(shape);
diff --git a/src/relax/op/memory/view.cc b/src/relax/op/memory/view.cc
index d1674ef92d2d..828eba4950f0 100644
--- a/src/relax/op/memory/view.cc
+++ b/src/relax/op/memory/view.cc
@@ -176,8 +176,7 @@ Type InferTypeView(const Call& call, const BlockBuilder& ctx) {
     if (ty.IsVoid() || ty.IsScalableVector()) {
       return std::nullopt;
     } else {
-      auto size_bits = ty.bits() * ty.lanes();
-      return IntImm::Int64((size_bits + 7) / 8);
+      return IntImm::Int64(static_cast<int64_t>(ty.StorageBytes()));
     }
   };
 
diff --git a/src/relax/transform/lower_alloc_tensor.cc b/src/relax/transform/lower_alloc_tensor.cc
index f76649164bed..52bca3e707eb 100644
--- a/src/relax/transform/lower_alloc_tensor.cc
+++ b/src/relax/transform/lower_alloc_tensor.cc
@@ -75,7 +75,7 @@ class Mutator : public ExprMutator {
         PrimType dtype_ty(dtype->value);
         TVM_FFI_ICHECK(!dtype_ty.IsScalableVector())
             << "Cannot statically compute allocation size for scalable vector dtype " << dtype_ty;
-        PrimExpr nbytes = IntImm::Int64(((dtype_ty.bits() * dtype_ty.lanes()) + 7) / 8);
+        PrimExpr nbytes = IntImm::Int64(static_cast<int64_t>(dtype_ty.StorageBytes()));
         for (const auto& dim : shape) {
           nbytes *= dim;
         }
diff --git a/src/relax/transform/static_plan_block_memory.cc b/src/relax/transform/static_plan_block_memory.cc
index cb672986e6e5..2a04461555d0 100644
--- a/src/relax/transform/static_plan_block_memory.cc
+++ b/src/relax/transform/static_plan_block_memory.cc
@@ -141,7 +141,7 @@ class StorageToken : public ffi::ObjectRef {
     PrimType dtype_ty(dtype);
     TVM_FFI_ICHECK(!dtype_ty.IsScalableVector())
         << "Cannot statically plan storage size for scalable vector dtype " << dtype_ty;
-    int64_t const_coeff = (((dtype_ty.bits() * dtype_ty.lanes()) + 7) / 8);
+    int64_t const_coeff = static_cast<int64_t>(dtype_ty.StorageBytes());
     PrimExpr size = IntImm::Int64(1);
     bool size_computed = false;
 
@@ -983,7 +983,7 @@ class StorageAllocationRewriter : public ExprMutator {
         PrimType dtype_ty(dtype);
         TVM_FFI_ICHECK(!dtype_ty.IsScalableVector())
             << "Cannot statically plan storage size for scalable vector dtype " << dtype_ty;
-        bytes *= (((dtype_ty.bits() * dtype_ty.lanes()) + 7) / 8);
+        bytes *= IntImm::Int64(static_cast<int64_t>(dtype_ty.StorageBytes()));
         Call alloc_storage(mem_alloc_storage,
                            {/*size=*/ShapeExpr({bytes}),
                             /*virtual_device_index=*/call->args[2].as_or_throw<PrimValue>(),
diff --git a/src/s_tir/analysis/verify_gpu_code.cc b/src/s_tir/analysis/verify_gpu_code.cc
index 6c70033056a7..8155fd791e4b 100644
--- a/src/s_tir/analysis/verify_gpu_code.cc
+++ b/src/s_tir/analysis/verify_gpu_code.cc
@@ -280,9 +280,7 @@ class GPUCodeVerifier : public StmtExprVisitor {
 
   std::vector<ffi::String> errors_;
 
-  static size_t ElementBytes(const PrimType& ty) {
-    return static_cast<size_t>(ty.lanes()) * ((ty.bits() + 7) / 8);
-  }
+  static size_t ElementBytes(const PrimType& ty) { return ty.StorageBytes(); }
 
   void Reset_() {
     local_memory_per_block_ = 0;
diff --git a/src/s_tir/transform/bound_checker.cc b/src/s_tir/transform/bound_checker.cc
index 364f4b3794c4..86086da945a8 100644
--- a/src/s_tir/transform/bound_checker.cc
+++ b/src/s_tir/transform/bound_checker.cc
@@ -178,8 +178,7 @@ class BoundChecker : public StmtExprMutator {
 
   bool IsValidScalar(const PrimExpr& expr) const {
     if (!expr.defined()) return false;
-    PrimType ty = expr.ty();
-    return !ty.IsFixedLengthVector() && !ty.IsScalableVector();
+    return expr.ty().IsScalar();  // NOTE(review): confirm IsScalar() == not any vector kind
   }
 
   bool CanInstrument(const ffi::Array<PrimExpr>& indices, const Var& buffer_var) const {
diff --git a/src/s_tir/transform/merge_shared_memory_allocations.cc b/src/s_tir/transform/merge_shared_memory_allocations.cc
index 2ce0295d1675..89d472087331 100644
--- a/src/s_tir/transform/merge_shared_memory_allocations.cc
+++ b/src/s_tir/transform/merge_shared_memory_allocations.cc
@@ -648,8 +648,7 @@ class SharedMemoryRewriter : public StmtExprMutator {
       for (int i = 0; i < static_cast<int>(e->allocs.size()); i++) {
         for (const VarNode* buffer : e->allocs[i]) {
           const Buffer& buf = scope.shmem_allocs.at(buffer);
-          int elem_bytes =
-              (static_cast<int>(buf->dtype.bits()) * static_cast<int>(buf->dtype.lanes()) + 7) / 8;
+          int elem_bytes = static_cast<int>(buf->dtype.StorageBytes());
           align[i] = std::max(align[i], elem_bytes);
         }
       }
@@ -662,8 +661,7 @@ class SharedMemoryRewriter : public StmtExprMutator {
         for (const VarNode* buffer : e->allocs[i]) {
           const Buffer& buf = scope.shmem_allocs.at(buffer);
           ffi::Array<PrimExpr> alloc_shape = GetBufferAllocationShape(buf);
-          int elem_bytes =
-              (static_cast<int>(buf->dtype.bits()) * static_cast<int>(buf->dtype.lanes()) + 7) / 8;
+          int elem_bytes = static_cast<int>(buf->dtype.StorageBytes());
           int align_bytes = std::max(align[i], elem_bytes);
           if (buf->data_alignment > 0) {
             TVM_FFI_ICHECK(buf->data_alignment % align_bytes == 0)
diff --git a/src/tirx/transform/ir_utils.h b/src/tirx/transform/ir_utils.h
index 556f77e0085f..d103ff9f583a 100644
--- a/src/tirx/transform/ir_utils.h
+++ b/src/tirx/transform/ir_utils.h
@@ -177,7 +177,7 @@ inline PrimType APIType(const PrimType& t) {
 inline int GetTempAllocaAlignment(const PrimType& type, int32_t const_size) {
   int align = runtime::kTempAllocaAlignment;
   if (const_size > 0) {
-    int64_t const_s = static_cast<int64_t>(const_size) * type.bits() * type.lanes() / 8;
+    int64_t const_s = static_cast<int64_t>(const_size) * type.StorageBytes();
     while (align > const_s) {
       align = align / 2;
     }
diff --git a/src/tirx/transform/lower_tvm_builtin.cc b/src/tirx/transform/lower_tvm_builtin.cc
index 23ca5951ce86..606bfbea52aa 100644
--- a/src/tirx/transform/lower_tvm_builtin.cc
+++ b/src/tirx/transform/lower_tvm_builtin.cc
@@ -42,7 +42,7 @@ namespace tirx {
 namespace {
 
 TVM_FFI_INLINE int GetVectorBytes(const PrimType& dtype) {
-  return (dtype.bits() * dtype.lanes() + 7) / 8;
+  return static_cast<int>(dtype.StorageBytes());  // size_t result narrowed to int
 }
 
 }  // namespace
diff --git a/src/tirx/transform/split_host_device.cc b/src/tirx/transform/split_host_device.cc
index 42d1fb424801..6eaa75d57f6c 100644
--- a/src/tirx/transform/split_host_device.cc
+++ b/src/tirx/transform/split_host_device.cc
@@ -353,7 +353,7 @@ class DeviceInfoCollector : public StmtVisitor {
       for (const auto& extent : op->buffer->shape) {
         dyn_size *= extent;
       }
-      dyn_size *= (op->buffer->dtype.bits() * op->buffer->dtype.lanes() + 7) / 8;
+      dyn_size *= IntImm::Int64(static_cast<int64_t>(op->buffer->dtype.StorageBytes()));
 
       // Inline any locally-bound variables (e.g. from CSE).
       if (bind_map_.size()) {
diff --git a/src/tirx/transform/tvm_ffi_binder.cc b/src/tirx/transform/tvm_ffi_binder.cc
index 2535e0db15e0..fd659b53d7f0 100644
--- a/src/tirx/transform/tvm_ffi_binder.cc
+++ b/src/tirx/transform/tvm_ffi_binder.cc
@@ -705,7 +705,7 @@ void TVMFFIABIBuilder::DecodeParamDLTensor(const Buffer& buffer, const PrimExpr&
   }
 
   // ── Section: byte_offset ─────────────────────────────────────
-  int data_bytes = ((buffer->dtype.bits() * buffer->dtype.lanes()) + 7) / 8;
+  int data_bytes = static_cast<int>(buffer->dtype.StorageBytes());
   ffi::reflection::AccessPath byte_offset_path = param_path->Attr(ffi::String("byte_offset"));
   if (const auto* const_offset = buffer->elem_offset.as<IntImmNode>()) {
     BindScalar(IntImm(PrimType::UInt(64), const_offset->value * data_bytes),

From 1313eb9407cc3705cae4974b987514058563c07a Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 23 Jun 2026 19:29:03 +0000
Subject: [PATCH 6/8] [FIX][Vulkan] Avoid default PrimType construction

---
 src/backend/vulkan/codegen/ir_builder.cc | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/backend/vulkan/codegen/ir_builder.cc b/src/backend/vulkan/codegen/ir_builder.cc
index ca82b06b0554..e986454a7f75 100644
--- a/src/backend/vulkan/codegen/ir_builder.cc
+++ b/src/backend/vulkan/codegen/ir_builder.cc
@@ -394,13 +394,11 @@ Value IRBuilder::GetBuiltInValue(spv::BuiltIn built_in, uint32_t index, const st
     }
   }
 
-  PrimType data_type;
-  PrimType global_arr_type;
+  PrimType data_type = PrimType::Int(32);
+  PrimType global_arr_type = data_type.WithLanes(3);
   switch (built_in) {
     case spv::BuiltInLocalInvocationId:
     case spv::BuiltInWorkgroupId:
-      data_type = PrimType::Int(32);
-      global_arr_type = data_type.WithLanes(3);
       break;
 
     default:

From 35d4a00e39c58e6038ebd8d9a171ef6f80ec8b2c Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 23 Jun 2026 20:14:46 +0000
Subject: [PATCH 7/8] [FIX][Relax] Update reverse_sequence dtype checks

---
 src/relax/op/tensor/manipulate.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/relax/op/tensor/manipulate.cc b/src/relax/op/tensor/manipulate.cc
index 8fe14c78555f..f0c7947b5ba2 100644
--- a/src/relax/op/tensor/manipulate.cc
+++ b/src/relax/op/tensor/manipulate.cc
@@ -2103,14 +2103,15 @@ Type InferTypeReverseSequence(const Call& call, const BlockBuilder& ctx) {
         << "ReverseSequence requires seq_lengths to be 1-D. However, seq_lengths has ndim "
         << seq_lengths_ty->ndim;
   }
-  if (!seq_lengths_ty->dtype.is_void() && !seq_lengths_ty->dtype.is_int()) {
+  PrimType seq_lengths_dtype = seq_lengths_ty->dtype;
+  if (!seq_lengths_ty->IsUnknownDtype() && !seq_lengths_dtype.MatchesCode(DLDataTypeCode::kDLInt)) {
     TVM_FFI_VISIT_THROW(ValueError, call)
         << "ReverseSequence requires seq_lengths to have dtype int32 or int64. However, "
            "seq_lengths has dtype "
         << seq_lengths_ty->dtype;
   }
-  if (seq_lengths_ty->dtype.is_int() && seq_lengths_ty->dtype.bits() != 32 &&
-      seq_lengths_ty->dtype.bits() != 64) {
+  if (seq_lengths_dtype.MatchesCode(DLDataTypeCode::kDLInt) &&
+      seq_lengths_dtype->dtype.bits != 32 && seq_lengths_dtype->dtype.bits != 64) {
     TVM_FFI_VISIT_THROW(ValueError, call)
         << "ReverseSequence requires seq_lengths to have dtype int32 or int64. However, "
            "seq_lengths has dtype "

From 9789895ea21ddfa871f39db4b8d37a9392f2385e Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Tue, 23 Jun 2026 20:46:34 +0000
Subject: [PATCH 8/8] [FIX][Relax] Update CUTLASS dtype codegen

---
 src/relax/backend/contrib/cutlass/codegen.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/relax/backend/contrib/cutlass/codegen.cc b/src/relax/backend/contrib/cutlass/codegen.cc
index 03621c400551..dfe4b24e4f12 100644
--- a/src/relax/backend/contrib/cutlass/codegen.cc
+++ b/src/relax/backend/contrib/cutlass/codegen.cc
@@ -167,9 +167,9 @@ class CodegenCutlass : public relax::MemoizedExprTranslator<OutputType>,
     for (const auto& arg : ext_func_args_) {
       auto ty = GetType(arg);
       if (const auto* tensor_ty = ty.as<TensorTypeNode>()) {
-        arg_types.emplace_back(backend::DType2String(tensor_ty->dtype));
+        arg_types.emplace_back(backend::DType2String(tensor_ty->dtype->dtype));
       } else if (const auto* shape_ty = ty.as<ShapeTypeNode>()) {
-        arg_types.emplace_back(backend::DType2String(shape_ty->values.value()[0]->dtype));
+        arg_types.emplace_back(backend::DType2String(shape_ty->values.value()[0].ty()->dtype));
       } else {
         TVM_FFI_THROW(InternalError) << "Unimplemented";
       }
@@ -302,7 +302,7 @@ class CodegenCutlass : public relax::MemoizedExprTranslator<OutputType>,
 
     std::vector<std::string> out_types;
     if (const auto* tensor_ty = ty.as<TensorTypeNode>()) {
-      out_types.emplace_back(backend::DType2String(tensor_ty->dtype));
+      out_types.emplace_back(backend::DType2String(tensor_ty->dtype->dtype));
     } else {
       TVM_FFI_THROW(InternalError) << "Unimplemented ty type: " << ty;
     }