forked from NVIDIA/cuda-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
NsightEclipse.xml
81 lines (81 loc) · 2.49 KB
/
NsightEclipse.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<!--
  Sample metadata entry for bf16TensorCoreGemm.
  The element vocabulary comes from SamplesInfo.dtd (see the DOCTYPE above).
  NOTE(review): the DTD is not visible from this file, so the original element
  order and values are preserved exactly; only indentation, these comments, and
  a spelling fix in the description differ.
-->
<entry>
  <name>bf16TensorCoreGemm</name>
  <cflags>
    <flag>--std=c++11</flag>
  </cflags>
  <!-- CUDA API names associated with this sample
       (presumably the calls made by the primary file; verify against it). -->
  <cuda_api_list>
    <toolkit>cudaMemcpy</toolkit>
    <toolkit>cudaFree</toolkit>
    <toolkit>cudaGetErrorString</toolkit>
    <toolkit>cudaGetLastError</toolkit>
    <toolkit>cudaEventSynchronize</toolkit>
    <toolkit>cudaFuncSetAttribute</toolkit>
    <toolkit>cudaEventRecord</toolkit>
    <toolkit>cudaMemset</toolkit>
    <toolkit>cudaMalloc</toolkit>
    <toolkit>cudaEventElapsedTime</toolkit>
    <toolkit>cudaGetDeviceProperties</toolkit>
    <toolkit>cudaEventCreate</toolkit>
  </cuda_api_list>
  <!-- Kept on a single line: this is text content, so added whitespace would become data. -->
  <description><![CDATA[A CUDA sample demonstrating __nv_bfloat16 (e8m7) GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API introduced with CUDA 11 in Ampere chip family tensor cores for faster matrix operations. This sample also uses async copy provided by cuda pipeline interface for gmem to shmem async loads which improves kernel performance and reduces register pressure.]]></description>
  <devicecompilation>whole</devicecompilation>
  <includepaths>
    <path>./</path>
    <path>../</path>
    <path>../../../Common</path>
  </includepaths>
  <keyconcepts>
    <concept level="basic">Matrix Multiply</concept>
    <concept level="advanced">WMMA</concept>
    <concept level="advanced">Tensor Cores</concept>
  </keyconcepts>
  <keywords>
    <keyword>matrix multiply</keyword>
    <keyword>Async copy</keyword>
    <keyword>CPP11</keyword>
    <keyword>GCC 5.1.0</keyword>
  </keywords>
  <!-- No extra libraries or library search paths are listed for this sample. -->
  <libraries>
  </libraries>
  <librarypaths>
  </librarypaths>
  <nsight_eclipse>true</nsight_eclipse>
  <primary_file>bf16TensorCoreGemm.cu</primary_file>
  <required_dependencies>
    <dependency>CPP11</dependency>
  </required_dependencies>
  <scopes>
    <scope>1:CUDA Basic Topics</scope>
  </scopes>
  <!-- One sm-arch element per listed SM architecture. -->
  <sm-arch>sm80</sm-arch>
  <sm-arch>sm86</sm-arch>
  <sm-arch>sm87</sm-arch>
  <sm-arch>sm89</sm-arch>
  <sm-arch>sm90</sm-arch>
  <!-- Some env entries give only an arch or only a platform. They are kept as
       found. NOTE(review): confirm against the DTD/consumer whether an omitted
       field means "any". -->
  <supported_envs>
    <env>
      <arch>x86_64</arch>
      <platform>linux</platform>
    </env>
    <env>
      <arch>aarch64</arch>
    </env>
    <env>
      <arch>sbsa</arch>
    </env>
    <env>
      <platform>windows7</platform>
    </env>
    <env>
      <arch>ppc64le</arch>
      <platform>linux</platform>
    </env>
  </supported_envs>
  <!-- Only a lower bound (from) is given; no upper bound element is present. -->
  <supported_sm_architectures>
    <from>8.0</from>
  </supported_sm_architectures>
  <title>bfloat16 Tensor Core GEMM</title>
  <type>exe</type>
</entry>