diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index 934c2756f69d..e219cc684657 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -398,6 +398,7 @@ TVM_DLL Pass ForceNarrowIndexToInt32(); /*! * \brief Legalize bf16 compute Ops. Add a cast to fp32 * before Ops, then add a cast back to bf16. + * \param target The target used for checking native bf16 support * \return The pass. */ TVM_DLL Pass BF16ComputeLegalize(); @@ -405,10 +406,11 @@ TVM_DLL Pass BF16ComputeLegalize(); /*! * \brief Legalize fp8 compute Ops. Add a cast to fp16/fp32 * before Ops, then add a cast back to fp8. + * \param target The target used for checking native fp8 support * \param promote_dtype_str The data type used for type promotion, defaults to float16 * \return The pass. */ -TVM_DLL Pass FP8ComputeLegalize(String promote_dtype_str = "float16"); +TVM_DLL Pass FP8ComputeLegalize(Target target, String promote_dtype_str = "float16"); /*! * \brief Legalize bf16 storage types to u16. @@ -420,7 +422,7 @@ TVM_DLL Pass BF16StorageLegalize(); * \brief Legalize fp8 storage types to u8. * \return The pass. */ -TVM_DLL Pass FP8StorageLegalize(); +TVM_DLL Pass FP8StorageLegalize(Target target); /*! * \brief Inline calls to private functions diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py index d203007dd182..b1f042c1a597 100644 --- a/python/tvm/contrib/nvcc.py +++ b/python/tvm/contrib/nvcc.py @@ -270,6 +270,7 @@ def callback_libdevice_path(arch): return "" +@tvm._ffi.register_func("tvm.contrib.nvcc.get_compute_version") def get_target_compute_version(target=None): """Utility function to get compute capability of compilation target. @@ -406,6 +407,7 @@ def have_cudagraph(): return False +@tvm._ffi.register_func("tvm.contrib.nvcc.supports_bf16") def have_bf16(compute_version): """Either bf16 support is provided in the compute capability or not @@ -421,6 +423,7 @@ def have_bf16(compute_version): return False +@tvm._ffi.register_func("tvm.contrib.nvcc.supports_fp8") def have_fp8(compute_version): """Whether fp8 support is provided in the specified compute capability or not diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index bdadb6db0fb4..33b4514e6b29 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -216,7 +216,6 @@ Array CreatePassList(bool disable_loop_partition) { pass_list.push_back(tir::transform::TransformMmaBufferLayout()); pass_list.push_back(tir::transform::LowerOpaqueBlock()); pass_list.push_back(tir::transform::FlattenBuffer()); - pass_list.push_back(tir::transform::FP8ComputeLegalize()); pass_list.push_back(tir::transform::BF16ComputeLegalize()); pass_list.push_back(tir::transform::NarrowDataType(32)); pass_list.push_back(tir::transform::Simplify()); @@ -570,6 +569,8 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) Array mixed_pass_list; + mixed_pass_list.push_back(tir::transform::FP8ComputeLegalize(target)); + // VerifyVTCMLimit must occur before LowerVtcmAlloc mixed_pass_list.push_back(tir::transform::VerifyVTCMLimit(target)); // LowerVtcmAlloc must occur after any transformations that modify memory allocation locations @@ -619,7 +620,7 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) } else { mixed_pass_list.push_back(tir::transform::MakePackedAPI()); } - mixed_pass_list.push_back(tir::transform::FP8StorageLegalize()); + mixed_pass_list.push_back(tir::transform::FP8StorageLegalize(target)); 
mixed_pass_list.push_back(tir::transform::BF16StorageLegalize()); mixed_pass_list.push_back(tir::transform::LowerDeviceKernelLaunch()); diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 60c102ceaa59..3f85316afdc4 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -586,6 +586,8 @@ llvm::Type* CodeGenLLVM::DTypeToLLVMType(const DataType& dtype) const { default: LOG(FATAL) << "do not support " << dtype; } + } else if (dtype.code() == DataType::kE4M3Float || dtype.code() == DataType::kE5M2Float) { + etype = llvm::Type::getInt8Ty(*ctx); } if (dtype.lanes() != 1) { #if TVM_LLVM_VERSION >= 110 diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc index 15905b030433..d352616f55fa 100644 --- a/src/target/source/codegen_cuda.cc +++ b/src/target/source/codegen_cuda.cc @@ -41,6 +41,31 @@ namespace tvm { namespace codegen { +std::string GetFP8Type(DataType type) { + std::stringstream stream; + int32_t lanes = type.lanes(); + std::string vec; + if (type.is_scalar()) { + vec = ""; + } else if (lanes == 2) { + vec = "_2"; + } else if (lanes == 4) { + vec = "_4"; + } else if (lanes == 8) { + vec = "_8"; + } else { + LOG(FATAL) << "Only support scalar and vector types of width (2, 4, 8) for FP8"; + } + if (type.code() == DataType::kE4M3Float) { + stream << "fp8_e4" << vec << "_t"; + } else if (type.code() == DataType::kE5M2Float) { + stream << "fp8_e5" << vec << "_t"; + } else { + LOG(FATAL) << "Unsupported FP8 type in CUDA codegen"; + } + return stream.str(); +} + CodeGenCUDA::CodeGenCUDA() { restrict_keyword_ = "__restrict__"; } void CodeGenCUDA::Init(bool output_ssa) { @@ -121,8 +146,15 @@ std::string CodeGenCUDA::Finish() { if (enable_fp8_) { decl_stream << "#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)\n"; decl_stream << "#include <cuda_fp8.h>\n"; + decl_stream << "using fp8_e4_t = __nv_fp8_e4m3;\n"; + decl_stream << "using fp8_e4_2_t = __nv_fp8x2_e4m3;\n"; + decl_stream << "using fp8_e4_4_t = __nv_fp8x4_e4m3;\n"; + decl_stream << "using fp8_e5_t = __nv_fp8_e5m2;\n"; + decl_stream << "using fp8_e5_2_t = __nv_fp8x2_e5m2;\n"; + decl_stream << "using fp8_e5_4_t = __nv_fp8x4_e5m2;\n"; decl_stream << "#endif\n\n"; } + declare_vector_type_extensions(decl_stream, enable_fp16_, enable_fp8_); if (enable_warp_shuffle_) { decl_stream << _cuda_warp_intrinsic_util; @@ -214,17 +246,12 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) if (t.is_scalar()) { os << "half"; } else if (lanes <= 8) { - // Emit CUDA code to access fp16 vector elements.
- // - // half4 is stored as uint2 - // - // h4.x is emitted as *(half2*)(&(u2.x)).x - // h4.y is emitted as *(half2*)(&(u2.x)).y - // h4.z is emitted as *(half2*)(&(u2.y)).x - // h4.w is emitted as *(half2*)(&(u2.y)).y - // - ICHECK_EQ(lanes % 2, 0) << "only support even lane for half type"; - os << "uint" << lanes / 2; + ICHECK_EQ(lanes % 2, 0) << "Only support an even number of lanes for half type"; + if (lanes <= 4) { + os << "half" << lanes; + } else { + os << "uint" << lanes / 2; + } } else { fail = true; } @@ -271,16 +298,9 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) } if (!fail) return; } else if (t.is_float8()) { - if (t.is_scalar()) { - os << "unsigned char"; // __nv_fp8_storage_t is an alias of unsigned char - } else if (lanes == 2) { - os << "unsigned short int"; // __nv_fp8x2_storage_t is an alias of unsigned short - } else if (lanes == 4) { - os << "unsigned int"; // __nv_fp8x4_storage_t is an alias of unsigned int - } else { - fail = true; - } - if (!fail) return; + enable_fp8_ = true; + os << GetFP8Type(t); + return; } else if (t == DataType::Bool()) { os << "bool"; return; @@ -446,7 +466,7 @@ void CodeGenCUDA::PrintVecConstructor(DataType t, std::ostream& os) { void CodeGenCUDA::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, PrimExpr rhs, std::ostream& os) { // NOLINT(*) - // Delcare the result. + // Declare the result. std::string sret = name_supply_->FreshName("_"); this->PrintIndent(); this->PrintType(t, stream); @@ -497,7 +517,11 @@ void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DataType t, int i, os << "((" << type_name << ")(" << ac << " >> " << i % 4 * 8 << "))"; } } else if (t.is_float16()) { - os << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2]; + if (t.lanes() <= 4) { + os << vec << "." << access[i]; + } else { + os << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2]; + } } else if (t.is_bfloat16()) { os << "((nv_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2]; } else if (t.lanes() > 4 && t.lanes() <= 8) { @@ -543,8 +567,13 @@ void CodeGenCUDA::PrintVecElemStore(const std::string& vec, DataType t, int i, stream << "(" << value << " << " << i % 4 * 8 << ");\n"; } } else if (t.is_float16()) { - stream << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2] << " = " - << value << ";\n"; + if (t.lanes() <= 4) { + stream << vec << "." << access[i] << " = " << value << ";\n"; + } else { + stream << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2] << " = " + << value << ";\n"; + } + } else if (t.is_bfloat16()) { stream << "((nv_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2] << " = " << value << ";\n"; @@ -648,6 +677,16 @@ void CodeGenCUDA::VisitExpr_(const CastNode* op, std::ostream& os) { // Emit simple C-style type conversion. if (from_ty.is_scalar()) return CodeGenC::VisitExpr_(op, os); + if (target_ty.code() == DataType::kE4M3Float || target_ty.code() == DataType::kE5M2Float || + from_ty.code() == DataType::kE4M3Float || from_ty.code() == DataType::kE5M2Float) { + std::ostringstream val; + val << "("; + PrintType(target_ty, val); + val << ")(" << PrintExpr(op->value) << ")"; + os << val.str(); + return; + } + // We could emit make_float4 like calls, but the emitted code looks // too compact to read. Emit this as vectorized unary ops. 
std::string sret = name_supply_->FreshName("_"); @@ -1194,9 +1233,16 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) { // NO std::string v = PrintExpr(op->value); PrintVecConstructor(op->dtype, os); os << '('; - for (int i = 0; i < lanes / 2; ++i) { - if (i != 0) os << ", "; - os << "__pack_half2(" << v << ", " << v << ")"; + if (lanes <= 4) { + for (int i = 0; i < lanes / 2; ++i) { + if (i != 0) os << ", "; + os << v << ", " << v; + } + } else { + for (int i = 0; i < lanes / 2; ++i) { + if (i != 0) os << ", "; + os << "__pack_half2(" << v << ", " << v << ")"; + } } os << ')'; return; @@ -1448,15 +1494,10 @@ void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& val PrintVecConstructor(t, os); os << '('; } - if (i % 2 == 0) { - os << "__pack_half2(" << value; + if (i == t.lanes() - 1) { + os << value << ")"; } else { - os << "," << value << ")"; - if (i != t.lanes() - 1) { - os << ","; - } else { - os << ")"; - } + os << value << ","; } return; } diff --git a/src/target/source/literal/cuda_half_t.h b/src/target/source/literal/cuda_half_t.h index 67471daf82c4..bf3e83928ed7 100644 --- a/src/target/source/literal/cuda_half_t.h +++ b/src/target/source/literal/cuda_half_t.h @@ -24,6 +24,8 @@ #ifndef TVM_TARGET_SOURCE_LITERAL_CUDA_HALF_T_H_ #define TVM_TARGET_SOURCE_LITERAL_CUDA_HALF_T_H_ +#include + static constexpr const char* _cuda_half_t_def = R"( typedef unsigned short uint16_t; typedef unsigned char uint8_t; @@ -379,4 +381,44 @@ static constexpr const char* _cuda_warp_intrinsic_util = R"( )"; +void declare_vector_type_extensions(std::ostringstream& stream, bool enable_fp16, bool enable_fp8) { + if (enable_fp16 || enable_fp8) { + stream << R"( +struct __align__(8) half4 { + __half x, y, z, w; + __host__ __device__ half4() : x(__half(0)), y(__half(0)), z(__half(0)), w(__half(0)) {} + __host__ __device__ half4(__half x, __half y, __half z, __half w) : x(x), y(y), z(z), w(w) {} +)"; + if (enable_fp8) { + stream << R"( + __host__ __device__ explicit half4(const __nv_fp8x4_e4m3& fp8x4) { + __nv_fp8x2_e4m3 lo_part, hi_part; + lo_part.__x = static_cast<__nv_fp8x2_storage_t>(fp8x4.__x & 0xFFFF); + hi_part.__x = static_cast<__nv_fp8x2_storage_t>((fp8x4.__x >> 16) & 0xFFFF); + __half2 lo_half2 = static_cast<__half2>(lo_part); + __half2 hi_half2 = static_cast<__half2>(hi_part); + x = reinterpret_cast<__half*>(&lo_half2)[0]; + y = reinterpret_cast<__half*>(&lo_half2)[1]; + z = reinterpret_cast<__half*>(&hi_half2)[0]; + w = reinterpret_cast<__half*>(&hi_half2)[1]; + } + __host__ __device__ explicit operator __nv_fp8x4_e4m3() const { + __nv_fp8x4_e4m3 result; + __half2 lo_half2 = *reinterpret_cast(&x); + __half2 hi_half2 = *reinterpret_cast(&z); + __nv_fp8x2_e4m3 lo_part(lo_half2), hi_part(hi_half2); + result.__x = + (static_cast<__uint32_t>(lo_part.__x) | (static_cast<__uint32_t>(hi_part.__x) << 16)); + return result; + })"; + } + stream << R"( +}; +__host__ __device__ half4 make_half4(__half x, __half y, __half z, __half w) { + return half4(x, y, z, w); +} +)"; + } +} + #endif // TVM_TARGET_SOURCE_LITERAL_CUDA_HALF_T_H_ diff --git a/src/tir/transforms/unsupported_dtype_legalize.cc b/src/tir/transforms/unsupported_dtype_legalize.cc index 030dbd01badf..c0378790740f 100644 --- a/src/tir/transforms/unsupported_dtype_legalize.cc +++ b/src/tir/transforms/unsupported_dtype_legalize.cc @@ -693,6 +693,20 @@ class FP8StorageLegalizer : public StorageLegalizer { namespace transform { +bool CheckDataTypeSupport(const Target& target, const std::string& 
support_func_name) { + bool has_native_support = false; + if (target->kind->name == "cuda") { + if (const PackedFunc* get_cv = + tvm::runtime::Registry::Get("tvm.contrib.nvcc.get_compute_version")) { + std::string compute_version = (*get_cv)(target); + if (const PackedFunc* check_support = tvm::runtime::Registry::Get(support_func_name)) { + has_native_support = (*check_support)(compute_version); + } + } + } + return has_native_support; +} + Pass BF16ComputeLegalize() { auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { // TODO(tvm-team): skip if the target supports bf16 @@ -713,9 +727,11 @@ Pass BF16StorageLegalize() { TVM_REGISTER_GLOBAL("tir.transform.BF16StorageLegalize").set_body_typed(BF16StorageLegalize); -Pass FP8ComputeLegalize(String promote_dtype_str) { +Pass FP8ComputeLegalize(Target target, String promote_dtype_str) { auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { - // TODO(tvm-team): skip if the target supports fp8 + if (CheckDataTypeSupport(target, "tvm.contrib.nvcc.supports_fp8")) { + return f; + } return FP8ComputeLegalizer(DataType(String2DLDataType(promote_dtype_str))).Legalize(f); }; return CreatePrimFuncPass(pass_func, 0, "tir.FP8ComputeLegalize", {}); @@ -723,9 +739,11 @@ Pass FP8ComputeLegalize(String promote_dtype_str) { TVM_REGISTER_GLOBAL("tir.transform.FP8ComputeLegalize").set_body_typed(FP8ComputeLegalize); -Pass FP8StorageLegalize() { - auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { - // TODO(tvm-team): skip if the target supports fp8 +Pass FP8StorageLegalize(Target target) { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + if (CheckDataTypeSupport(target, "tvm.contrib.nvcc.supports_fp8")) { + return f; + } return FP8StorageLegalizer().Legalize(f); }; return CreatePrimFuncPass(pass_func, 0, "tir.FP8StorageLegalize", {}); diff --git a/tests/python/codegen/test_target_codegen_cuda_fp8.py b/tests/python/codegen/test_target_codegen_cuda_fp8.py new file mode 100644 index 000000000000..dade970418f9 --- /dev/null +++ b/tests/python/codegen/test_target_codegen_cuda_fp8.py @@ -0,0 +1,803 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys +import pytest + +import tvm +from tvm.script import tir as T +import numpy as np +import tvm.testing + + +from typing import List, Tuple +from tvm import DataType, DataTypeCode, IRModule +from tvm import dlight as dl +from tvm import relax, te, tir, topi +from tvm.relax.frontend import nn +from tvm.runtime import NDArray +from tvm.target import Target +from tvm.topi.utils import get_const_tuple + + +@tvm.testing.requires_cuda_compute_version(9) +def test_e4m3_conversions(): + dtype = "e4m3_float8" + + @T.prim_func + def add( + A: T.Buffer((64,), dtype), + B: T.Buffer((64,), dtype), + C: T.Buffer((64,), dtype), + ): + T.func_attr({"tir.noalias": T.bool(True)}) + # with T.block("root"): + for i in range(64): + with T.block("C"): + v_i = T.axis.spatial(64, i) + T.reads(A[v_i], B[v_i]) + T.writes(C[v_i]) + C[v_i] = T.Cast(dtype, T.Cast("float16", A[v_i]) + T.Cast("float16", B[v_i])) + + sch = tvm.tir.Schedule(add) + block = sch.get_block("C") + b = sch.get_loops(block) + bx, tx = sch.split(b[0], factors=[None, 32]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + target = "cuda" + fadd = tvm.build(sch.mod, target=target) + + cuda_src = fadd.imported_modules[0].get_source() + assert "fp8_e4_t" in cuda_src, "FP8E4M3 (fp8_e4_t) datatype not found in generated CUDA" + + dev = tvm.device(target, 0) + + numpytype = "float8_e4m3fn" + a = tvm.nd.array(np.random.uniform(low=0, high=5, size=64).astype(numpytype), dev) + b = tvm.nd.array(np.random.uniform(low=0, high=5, size=64).astype(numpytype), dev) + c = tvm.nd.array(np.zeros(64, dtype=numpytype), dev) + fadd(a, b, c) + + tvm.testing.assert_allclose( + c.numpy().astype("float16"), (a.numpy() + b.numpy()).astype("float16") + ) + + +@tvm.testing.requires_cuda_compute_version(9) +def test_e4m3_packing(): + length = 64 + vector_length = 4 + native_dtype, packed_dtype = ("e4m3_float8x4", "uint32") + + @T.prim_func + def add( + A: T.Buffer((length,), native_dtype), + R: T.Buffer((length,), packed_dtype), + B: T.Buffer((length,), native_dtype), + ): + T.func_attr({"tir.noalias": T.bool(True)}) + # with T.block("root"): + for i in range(length): + with T.block("R"): + v_i = T.axis.spatial(length, i) + T.reads(A[v_i]) + T.writes(R[v_i]) + R[v_i] = T.reinterpret(packed_dtype, A[v_i]) + for i in range(length): + with T.block("B"): + v_i = T.axis.spatial(length, i) + T.reads(R[v_i]) + T.writes(B[v_i]) + B[v_i] = T.reinterpret(native_dtype, R[v_i]) + + sch = tvm.tir.Schedule(add) + block = sch.get_block("R") + b = sch.get_loops(block) + bx, tx = sch.split(b[0], factors=[None, 32]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + block = sch.get_block("B") + b = sch.get_loops(block) + bx, tx = sch.split(b[0], factors=[None, 32]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + target = "cuda" + f = tvm.build(sch.mod, target=target) + dev = tvm.device(target, 0) + + numpytype = "float8_e4m3fn" + np_shape = (length, vector_length) + a_np = np.random.uniform(low=0, high=5, size=np_shape).astype(numpytype) + a = tvm.nd.empty(shape=(length,), dtype=native_dtype, device=dev) + r = tvm.nd.empty(shape=(length,), dtype=packed_dtype, device=dev) + b = tvm.nd.empty(shape=(length,), dtype=native_dtype, device=dev) + a.copyfrom(a_np) + f(a, r, b) + tvm.testing.assert_allclose(a.numpy().astype("float16"), b.numpy().astype("float16")) + + +native_dtype, promoted_dtype = tvm.testing.parameters( + ("e4m3_float8", "float32"), + ("e4m3_float8", "float16"), + ("e4m3_float8x2", "float32x2"), + ("e4m3_float8x2", "float16x2"), + 
("e4m3_float8x4", "float32x4"), + # Supported via half4 vector type extension in codegen + ("e4m3_float8x4", "float16x4"), +) + + +@tvm.testing.requires_cuda_compute_version(9) +def test_e4m3_vector_conversions(native_dtype, promoted_dtype): + vector_length = 64 + + @T.prim_func + def add( + A: T.Buffer((vector_length,), native_dtype), + B: T.Buffer((vector_length,), native_dtype), + C: T.Buffer((vector_length,), native_dtype), + ): + T.func_attr({"tir.noalias": T.bool(True)}) + # with T.block("root"): + for i in range(vector_length): + with T.block("C"): + v_i = T.axis.spatial(vector_length, i) + T.reads(A[v_i], B[v_i]) + T.writes(C[v_i]) + C[v_i] = T.Cast( + native_dtype, T.Cast(promoted_dtype, A[v_i]) + T.Cast(promoted_dtype, B[v_i]) + ) + + sch = tvm.tir.Schedule(add) + block = sch.get_block("C") + b = sch.get_loops(block) + bx, tx = sch.split(b[0], factors=[None, 32]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + target = "cuda" + fadd = tvm.build(sch.mod, target=target) + cuda_src = fadd.imported_modules[0].get_source() + dev = tvm.device(target, 0) + + numpytype = "float8_e4m3fn" + if "x" in native_dtype: + lanes = int(native_dtype.split("x")[-1]) + else: + lanes = 1 + + if "x" in promoted_dtype: + promoted_base_dtype = promoted_dtype.split("x")[0] + else: + promoted_base_dtype = promoted_dtype + + np_shape = (vector_length, lanes) if lanes > 1 else (vector_length,) + a_np = np.random.uniform(low=0, high=5, size=np_shape).astype(numpytype) + a = tvm.nd.empty(shape=(vector_length,), dtype=native_dtype, device=dev) + a.copyfrom(a_np) + b_np = np.random.uniform(low=0, high=5, size=np_shape).astype(numpytype) + b = tvm.nd.empty(shape=(vector_length,), dtype=native_dtype, device=dev) + b.copyfrom(b_np) + c = tvm.nd.empty(shape=(vector_length,), dtype=native_dtype, device=dev) + fadd(a, b, c) + + tvm.testing.assert_allclose( + c.numpy().astype(promoted_base_dtype), (a_np + b_np).astype(promoted_base_dtype) + ) + + +bcast_length = tvm.testing.parameter(2, 4, 6, 8) + + +@tvm.testing.requires_cuda_compute_version(8) +def test_half_broadcast(bcast_length): + dtype = "float16" + + @T.prim_func + def vector_broadcast(a: T.Buffer[(), dtype], vec: T.Buffer[(bcast_length,), dtype]): + for t in range(1): + with T.block("broadcast"): + vec[0:bcast_length] = T.broadcast(a[()], bcast_length) + + sch = tvm.tir.Schedule(vector_broadcast) + block = sch.get_block("broadcast") + b = sch.get_loops(block) + bx, tx = sch.split(b[0], factors=[None, 1]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + target = "cuda" + func = tvm.build(sch.mod, target=target) + dev = tvm.device(target, 0) + + a_np = np.random.uniform(low=0, high=4, size=()).astype(dtype) + a = tvm.nd.array(a_np, device=dev) + b = tvm.nd.empty((bcast_length,), dtype=dtype, device=dev) + + func(a, b) + + b_np = np.full((bcast_length,), a_np) + + tvm.testing.assert_allclose(b.numpy(), b_np) + + +vector_length = tvm.testing.parameter(2, 4) + + +@tvm.testing.requires_cuda_compute_version(8) +def test_half_misaligned_vector_load(vector_length): + dtype = "float16" + vec_dtype = dtype + "x" + str(vector_length) + length = 256 + + @T.prim_func + def vector_load( + A: T.Buffer[(length,), dtype], B: T.Buffer[(length // vector_length,), vec_dtype] + ): + for b in T.thread_binding(1, thread="blockIdx.x"): + for i in T.thread_binding(length // vector_length, thread="threadIdx.x"): + vec_index = T.ramp((i + 1) * vector_length - 1, -1, vector_length) + B[i] = A[vec_index] + + target = "cuda" + f = tvm.build(vector_load, 
target=target) + + dev = tvm.device(target, 0) + a_np = np.random.uniform(low=0, high=1, size=(length,)).astype(dtype) + a = tvm.nd.array(a_np, device=dev) + + b = tvm.nd.empty((length // vector_length,), dtype=vec_dtype, device=dev) + + f(a, b) + + b_np = np.empty((length // vector_length, vector_length), dtype=dtype) + + for i in range(length // vector_length): + start_index = (i + 1) * vector_length - 1 + b_np[i, :] = a_np[start_index - vector_length + 1 : start_index + 1][::-1] + + tvm.testing.assert_allclose(b.numpy(), b_np) + + +@tvm.testing.requires_cuda_compute_version(8) +def test_half4_vector_add(): + dtype = "float16" + length = 64 + vector_length = 4 + vec_dtype = dtype + "x" + str(vector_length) + + @T.prim_func + def add( + A: T.Buffer((length,), vec_dtype), + B: T.Buffer((length,), vec_dtype), + C: T.Buffer((length,), vec_dtype), + ): + T.func_attr({"tir.noalias": T.bool(True)}) + # with T.block("root"): + for i in range(length): + with T.block("C"): + v_i = T.axis.spatial(length, i) + T.reads(A[v_i], B[v_i]) + T.writes(C[v_i]) + C[v_i] = A[v_i] + B[v_i] + + sch = tvm.tir.Schedule(add) + block = sch.get_block("C") + b = sch.get_loops(block) + bx, tx = sch.split(b[0], factors=[None, 32]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + target = "cuda" + fadd = tvm.build(sch.mod, target=target) + dev = tvm.device(target, 0) + + a_np = np.random.uniform(-1, 1, (length, vector_length)).astype(dtype) + a = tvm.nd.empty(shape=(length,), dtype=vec_dtype, device=dev) + a.copyfrom(a_np) + b_np = np.random.uniform(-1, 1, (length, vector_length)).astype(dtype) + b = tvm.nd.empty(shape=(length,), dtype=vec_dtype, device=dev) + b.copyfrom(b_np) + c = tvm.nd.empty(shape=(length,), dtype=vec_dtype, device=dev) + + fadd(a, b, c) + c_expected = a_np + b_np + tvm.testing.assert_allclose(c.numpy(), c_expected, atol=1e-5, rtol=1e-5) + + +class BaseFP8E4M3QuantScaleOnly: + @classmethod + def create_quantize_func( + cls, + weight_shape, + model_dtype, + quantize_dtype, + storage_dtype, + group_size, + num_elem_per_storage, + max_int_value, + axis, + output_transpose, + ) -> IRModule: + if DataType(quantize_dtype).type_code == DataTypeCode.E4M3Float: + quantize_func = cls.quantize_fp8x4_e4m3 + else: + assert NotImplementedError() + + bb = relax.BlockBuilder() # pylint: disable=invalid-name + weight_var = relax.Var("weight", relax.TensorStructInfo(weight_shape, model_dtype)) + compute_scale, compute_quantize, compute_transpose = quantize_func( + weight_shape, + model_dtype, + quantize_dtype, + storage_dtype, + group_size, + num_elem_per_storage, + max_int_value, + axis, + output_transpose, + ) + with bb.function(name="main", params=[weight_var]): + with bb.dataflow(): + lv_scale = bb.emit_te(compute_scale, weight_var) + lv_quantized_weight = compute_quantize(bb, (weight_var, lv_scale)) + if compute_transpose: + lv_output = bb.emit_te(compute_transpose, lv_quantized_weight, lv_scale) + lv_quantized_weight = lv_output[0] + lv_scale = lv_output[1] + tuple_output = bb.emit((lv_quantized_weight, lv_scale)) + gv = bb.emit_output(tuple_output) + bb.emit_func_output(gv) + return bb.finalize() + + @classmethod + def create_dequantize_func( + cls, + packed_weight_shape, + scale_shape, + dequantized_shape, + model_dtype, + quantize_dtype, + storage_dtype, + group_size, + num_elem_per_storage, + axis, + ) -> IRModule: + if DataType(quantize_dtype).type_code == DataTypeCode.E4M3Float: + dequantize_func = cls.dequantize_fp8x4_e4m3 + else: + assert NotImplementedError() + + bb = relax.BlockBuilder() 
# pylint: disable=invalid-name + packed_weight_var = relax.Var( + "weight", relax.TensorStructInfo(packed_weight_shape, storage_dtype) + ) + scale_var = relax.Var("scale", relax.TensorStructInfo(scale_shape, model_dtype)) + compute_dequantize = dequantize_func( + packed_weight_shape, + scale_shape, + dequantized_shape, + model_dtype, + quantize_dtype, + storage_dtype, + group_size, + num_elem_per_storage, + axis, + ) + with bb.function(name="main", params=[packed_weight_var, scale_var]): + with bb.dataflow(): + lv = compute_dequantize(bb, (packed_weight_var, scale_var)) + gv = bb.emit_output(lv) + bb.emit_func_output(gv) + return bb.finalize() + + @classmethod + def quantize_fp8x4_e4m3( # pylint: disable=too-many-locals + cls, + weight_shape: List[tir.PrimExpr], + model_dtype, + quantize_dtype, + storage_dtype, + group_size, + num_elem_per_storage, + max_int_value, + axis: int = -1, + output_transpose: bool = False, + ) -> Tuple[te.Tensor, te.Tensor]: + """Group quantization for weight tensor, defined in tensor expression.""" + max_int = tir.const(max_int_value, model_dtype) + shape = weight_shape # pylint: disable=invalid-name + axis = axis if axis >= 0 else len(shape) + axis + k = shape[axis] + quantize_dtype = DataType(quantize_dtype) + # compute scale per group + r = te.reduce_axis((0, group_size), name="r") # pylint: disable=invalid-name + num_group = tir.ceildiv(k, group_size) + # (4096, 4096) -> quantize axis = 0, group size = 32 -> (128, 4096) + # for channel quant group_size = 4096 -> (1, 4096) + scale_shape = (*shape[:axis], num_group, *shape[axis + 1 :]) + + def compute_scale(weight: te.Tensor): + min_scaling_factor = tir.const(1.0 / (max_int_value * 512.0), model_dtype) + max_abs = te.compute( + shape=scale_shape, + fcompute=lambda *idx: te.max( + tir.if_then_else( + idx[axis] * group_size + r < k, + te.abs(weight(*idx[:axis], idx[axis] * group_size + r, *idx[axis + 1 :])), + te.min_value(model_dtype), + ), + axis=r, + ), + name="max_abs_value", + ) + scale = te.compute( + scale_shape, + lambda *idx: te.max( + max_abs(*idx).astype(model_dtype) / max_int, min_scaling_factor + ), + name="scale", + ) + return scale + + def compute_quantize_weight(bb: relax.BlockBuilder, args: relax.expr.Expr): + # compute scaled weight + packed_shape = (weight_shape[0], weight_shape[1] // num_elem_per_storage) + quant = cls.quant_and_pack_fp8x4_e4m3_sm90( + weight_shape, + packed_shape, + scale_shape, + group_size, + axis, + model_dtype, + storage_dtype, + quantize_dtype, + ) + # quant.show() + + global_var = bb.add_func(quant, "quantized_weight") + lv_quantized_weight = bb.emit( + relax.call_tir( + global_var, args, relax.TensorStructInfo(packed_shape, storage_dtype) + ) + ) + return lv_quantized_weight + + compute_transpose = None + if output_transpose: + + def compute_transpose(quantized_weight: te.Tensor, scale: te.Tensor): + if len(quantized_weight.shape) != 2 or len(scale.shape) != 2: + raise ValueError( + "Does not support transpose output quantized weight with ndim != 2" + ) + + quantized_weight = topi.transpose(quantized_weight) + scale = topi.transpose(scale) + return quantized_weight, scale + + return compute_scale, compute_quantize_weight, compute_transpose + + @classmethod + def dequantize_fp8x4_e4m3( # pylint: disable=too-many-locals + cls, + packed_weight_shape: List[tir.PrimExpr], + scale_shape, + dequant_shape, + model_dtype, + quantize_dtype, + storage_dtype, + group_size, + num_elem_per_storage, + axis: int = -1, + ) -> Tuple[te.Tensor, te.Tensor]: + """Group quantization for 
weight tensor, defined in tensor expression.""" + axis = axis if axis >= 0 else len(shape) + axis + + def compute_dequantize_weight(bb: relax.BlockBuilder, args: relax.expr.Expr): + dequant = cls.dequant_fp8x4_e4m3_sm90( + packed_weight_shape, + scale_shape, + dequant_shape, + group_size, + axis, + model_dtype, + storage_dtype, + quantize_dtype, + ) + + global_var = bb.add_func(dequant, "dequantize_weight") + lv_dequantized_weight = bb.emit( + relax.call_tir(global_var, args, relax.TensorStructInfo(dequant_shape, model_dtype)) + ) + return lv_dequantized_weight + + return compute_dequantize_weight + + @classmethod + def quant_and_pack_fp8x4_e4m3_sm90( + cls, + weight_shape, + packed_shape, + scale_shape, + group_size, + axis, + model_dtype, + storage_dtype, + quantized_dtype, + ): + vector_length = 4 + vec_quantized_dtype = f"{quantized_dtype}x{vector_length}" + vec_model_dtype = f"{model_dtype}x{vector_length}" + num_elem_per_storage = vector_length + # TODO(csullivan) assert on storage dtype / quantize type bytes == vector length + assert ( + group_size % vector_length == 0 + ), f"Number of elements in a group must be divisible by fp8 vector length {vector_length}" + + @T.prim_func(private=True) + def quant_pack( + A: T.Buffer(weight_shape, model_dtype), + scale: T.Buffer(scale_shape, model_dtype), + compute: T.Buffer( + packed_shape, + storage_dtype, + ), + ): + # with T.block("root"): + # test = T.alloc_buffer(1, dtype=vec_model_dtype, scope="local") + for i0, i1 in T.grid( + T.int64(weight_shape[0]), T.int64(weight_shape[1] // vector_length) + ): + with T.block("compute"): + v_i0, v_i1 = T.axis.remap("SS", [i0, i1]) + T.reads( + A[v_i0, v_i1 : v_i1 + vector_length], + scale[v_i0, v_i1 * T.int64(vector_length) // T.int64(group_size)], + ) + T.writes(compute[v_i0, v_i1 * vector_length]) + compute[v_i0, v_i1] = T.reinterpret( + storage_dtype, + T.Cast( + vec_quantized_dtype, + A[v_i0, T.ramp(v_i1 * vector_length, 1, vector_length)] + / scale[v_i0, v_i1 * T.int64(vector_length) // T.int64(group_size)], + ), + ) + + return quant_pack + + @classmethod + def dequant_fp8x4_e4m3_sm90( + cls, + packed_weight_shape, + scale_shape, + out_shape, + group_size, + axis, + model_dtype, + storage_dtype, + quantized_dtype, + ): + vector_length = 4 + vec_quantized_dtype = f"{quantized_dtype}x{vector_length}" + vec_model_dtype = f"{model_dtype}x{vector_length}" + num_elem_per_storage = vector_length + + @T.prim_func + def dequant( + packed_weight: T.Buffer(packed_weight_shape, storage_dtype), + scale: T.Buffer(scale_shape, model_dtype), + dequantize: T.Buffer(out_shape, model_dtype), + ): + T.func_attr({"tir.noalias": T.bool(True)}) + # with T.block("root"): + for i0, i1 in T.grid(T.int64(packed_weight_shape[0]), T.int64(packed_weight_shape[1])): + with T.block("dequantize"): + v_i0 = T.axis.spatial(T.int64(packed_weight_shape[0]), i0) + v_i1 = T.axis.spatial(T.int64(packed_weight_shape[1]), i1) + T.reads( + packed_weight[v_i0, v_i1], + scale[v_i0, v_i1 * T.int64(vector_length) // T.int64(group_size)], + ) + + dequantize[v_i0, T.ramp(v_i1 * vector_length, 1, vector_length)] = T.Cast( + vec_model_dtype, + T.reinterpret(vec_quantized_dtype, packed_weight[v_i0, v_i1]), + ) * T.Broadcast( + scale[v_i0, v_i1 * T.int64(vector_length) // T.int64(group_size)], + vector_length, + ) + + return dequant + + @classmethod + def compile_quant_and_dequant_by_scale( + cls, + weight_shape, + scales_shape, + quant_weight_shape, + model_dtype, + quantize_dtype, + storage_dtype, + group_size, + num_el_per_storage, + 
max_int_value, + axis, + target_str, + dev, + ): + quant_mod = cls.create_quantize_func( + weight_shape, + model_dtype, + quantize_dtype, + storage_dtype, + group_size, + num_el_per_storage, + max_int_value, + axis, + output_transpose=False, + ) + # quant_mod.show() + + target = tvm.target.Target(target_str) + with target: + quant_mod = dl.ApplyDefaultSchedule( + dl.gpu.Reduction(), + dl.gpu.GeneralReduction(), + dl.gpu.Fallback(), + )(quant_mod) + ex_1 = relax.build(quant_mod, target=target) + vm_1 = relax.VirtualMachine(ex_1, dev) + + dequant_mod = cls.create_dequantize_func( + quant_weight_shape, + scales_shape, + weight_shape, + model_dtype, + quantize_dtype, + storage_dtype, + group_size, + num_el_per_storage, + axis, + ) + # dequant_mod.show() + + with target: + dequant_mod = dl.ApplyDefaultSchedule( + dl.gpu.Reduction(), + dl.gpu.GeneralReduction(), + dl.gpu.Fallback(), + )(dequant_mod) + dequant_mod.show() + + ex_2 = relax.build(dequant_mod, target=target) + vm_2 = relax.VirtualMachine(ex_2, dev) + + def print_cuda(target, mod, name=None): + if name: + mod = mod[name] + f = tvm.build(mod, target=target) + cuda_src = f.imported_modules[0].get_source() + print(cuda_src) + + print_cuda(target, dequant_mod, name="dequant") + + return vm_1["main"], vm_2["main"] + + +class TestFP8e4x4QuantDequantScale(BaseFP8E4M3QuantScaleOnly): + # weight_shape = tvm.testing.parameter((32000, 4096), (4096, 14336)) + weight_shape = tvm.testing.parameter((128, 256), (128, 64)) + + @tvm.testing.fixture + def group_size(self): + return 64 + + @tvm.testing.fixture + def axis(self): + return 1 + + @tvm.testing.fixture + def model_dtype(self): + return "float16" + + @tvm.testing.fixture + def storage_dtype(self): + return "uint32" + + @tvm.testing.fixture + def quantize_dtype(self): + return "e4m3_float8" + + @tvm.testing.fixture + def num_el_per_storage(self): + return 4 + + @tvm.testing.fixture + def max_int_value(self): + return 448 + + @tvm.testing.fixture + def target_str(self): + return "cuda" + + @tvm.testing.fixture + def scale_shape(self, weight_shape, group_size, axis): + return [ + (d + group_size - 1) // group_size if axis == i else d + for i, d in enumerate(weight_shape) + ] + + @tvm.testing.fixture + def quant_weight_shape(self, weight_shape, num_el_per_storage, axis): + return [ + (d + num_el_per_storage - 1) // num_el_per_storage if axis == i else d + for i, d in enumerate(weight_shape) + ] + + @tvm.testing.fixture + def compiled_functions( + self, + weight_shape, + scale_shape, + quant_weight_shape, + model_dtype, + quantize_dtype, + storage_dtype, + group_size, + num_el_per_storage, + max_int_value, + axis, + target_str, + ): + dev = tvm.device(target_str, 0) + return self.compile_quant_and_dequant_by_scale( + weight_shape, + scale_shape, + quant_weight_shape, + model_dtype, + quantize_dtype, + storage_dtype, + group_size, + num_el_per_storage, + max_int_value, + axis, + target_str, + dev, + ) + + @tvm.testing.requires_cuda_compute_version(9) + def test_main(self, weight_shape, model_dtype, target_str, compiled_functions): + quant, dequant = compiled_functions + dev = tvm.device(target_str, 0) + + weight_np = np.random.uniform(-100, 100, weight_shape).astype(model_dtype) + weight = tvm.nd.array(weight_np, device=dev) + quant_weight, scales = quant(weight) + quant_weight_np, scales_np = quant_weight.numpy(), scales.numpy() + + dequant_weight = dequant(quant_weight, scales) + dequant_weight_np = dequant_weight.numpy() + tvm.testing.assert_allclose(weight_np, dequant_weight_np, atol=10, 
rtol=5e-2) + + +if __name__ == "__main__": + tvm.testing.main()
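
The target-gated legalization above works through the FFI registry: `CheckDataTypeSupport` looks up `tvm.contrib.nvcc.get_compute_version` and `tvm.contrib.nvcc.supports_fp8`, which are the Python helpers registered with `@tvm._ffi.register_func` in `nvcc.py`; if either lookup fails (for example, the module was never imported), the pass conservatively falls back to legalizing. A minimal sketch of exercising the same hooks from Python, assuming a CUDA-enabled TVM build and using `sm_89` purely as an illustrative architecture string:

```python
import tvm
import tvm.contrib.nvcc  # importing this module runs the register_func decorators

target = tvm.target.Target("cuda -arch=sm_89")

get_cv = tvm.get_global_func("tvm.contrib.nvcc.get_compute_version")
supports_fp8 = tvm.get_global_func("tvm.contrib.nvcc.supports_fp8")

compute_version = get_cv(target)            # "8.9" for an sm_89 target
print(bool(supports_fp8(compute_version)))  # True here, so FP8 legalization is skipped
```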
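With the driver change, `FP8ComputeLegalize` no longer runs in the target-agnostic `CreatePassList`; it runs at the top of `MixedModulePassManager`, where the `Target` is available. On targets without native FP8 it promotes fp8 arithmetic to the requested dtype (`float16` by default) and casts back, and `FP8StorageLegalize(target)` later rewrites fp8 storage to u8. A hedged sketch of the fallback path, calling the pass through the global registered in this diff because the Python wrapper update is not shown here; the PrimFunc is purely illustrative:

```python
import tvm
from tvm.script import tir as T

@T.prim_func
def f(A: T.Buffer((4,), "e4m3_float8"), B: T.Buffer((4,), "e4m3_float8")):
    for i in range(4):
        with T.block("B"):
            v = T.axis.spatial(4, i)
            B[v] = A[v] + A[v]

mod = tvm.IRModule({"main": f})
make_pass = tvm.get_global_func("tir.transform.FP8ComputeLegalize")

# sm_80 has no native FP8, so the fp8 add is promoted to float16 and cast back.
legalized = make_pass(tvm.target.Target("cuda -arch=sm_80"), "float16")(mod)
legalized.show()
```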
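When the target does support FP8 natively, the CUDA codegen path takes over: `GetFP8Type` maps `e4m3_float8`/`e5m2_float8` (and their x2/x4 forms) to the `fp8_e4*_t`/`fp8_e5*_t` aliases declared in `Finish()`, and `declare_vector_type_extensions` emits a `half4` struct with explicit `__nv_fp8x4_e4m3` conversions so `e4m3_float8x4 <-> float16x4` casts vectorize. A hedged sketch mirroring the assertions in the tests added above; it needs an FP8-capable GPU (sm_89 or newer) to build:

```python
import tvm
from tvm.script import tir as T

@T.prim_func
def cast4(A: T.Buffer((64,), "e4m3_float8x4"), B: T.Buffer((64,), "float16x4")):
    T.func_attr({"tir.noalias": T.bool(True)})
    for i in range(64):
        with T.block("B"):
            v = T.axis.spatial(64, i)
            B[v] = T.Cast("float16x4", A[v])

sch = tvm.tir.Schedule(cast4)
bx, tx = sch.split(sch.get_loops(sch.get_block("B"))[0], factors=[None, 32])
sch.bind(bx, "blockIdx.x")
sch.bind(tx, "threadIdx.x")

f = tvm.build(sch.mod, target="cuda")
src = f.imported_modules[0].get_source()
assert "fp8_e4_4_t" in src  # alias of __nv_fp8x4_e4m3 declared in Finish()
assert "half4" in src       # struct emitted by declare_vector_type_extensions
```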