Build and Develop CUTLASS CUDA Kernels

Introduction

CUTLASS is a header-only library of CUDA C++ template abstractions for implementing high-performance matrix-matrix multiplication (GEMM) and related computations at all levels and scales within CUDA.

In this blog post, we will build CUTLASS and CuTe CUDA kernels using CMake in a CUDA Docker container.

CUDA Docker Container

When creating a CUDA Docker container for CUTLASS kernel development, there are two options: either git clone the CUTLASS header-only library inside the Docker container, or make the CUTLASS header-only library part of the CUDA kernel source code.

In the beginning, I cloned the CUTLASS header-only library inside the Docker container. However, this became inconvenient when I wanted to inspect the header-only library implementation, because the sources only existed inside the container. Although I could still browse the CUTLASS headers from inside the container if it were a VS Code Dev Container, this approach is unfriendly if I want to modify and contribute to the CUTLASS header-only library. Therefore, I decided to treat the CUTLASS header-only library as part of the CUDA kernel source code.
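For example, assuming the kernel source lives in a Git repository, one way to vendor CUTLASS as part of the source code is to add it as a Git submodule at the repository root. The cutlass directory name here is an assumption chosen to match the include-path hints used in the CMake configuration shown later.

$ git submodule add https://github.com/NVIDIA/cutlass.git cutlass
$ git submodule update --init --recursive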

Build Docker Image

The following CUDA Dockerfile will be used for CUTLASS kernel development. It can also be found in my CUTLASS Examples GitHub repository.

cuda.Dockerfile
FROM nvcr.io/nvidia/cuda:12.4.1-devel-ubuntu22.04

ARG CMAKE_VERSION=3.30.5
ARG GOOGLETEST_VERSION=1.15.2
ARG NUM_JOBS=8

ENV DEBIAN_FRONTEND=noninteractive

# Install package dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    software-properties-common \
    autoconf \
    automake \
    libtool \
    pkg-config \
    ca-certificates \
    locales \
    locales-all \
    python3 \
    python3-dev \
    python3-pip \
    python3-setuptools \
    wget \
    git && \
    apt-get clean

# System locale
# Important for UTF-8
ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8

# Install CMake
RUN cd /tmp && \
    wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh && \
    bash cmake-${CMAKE_VERSION}-linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \
    rm -rf /tmp/*

# Install GoogleTest
RUN cd /tmp && \
    wget https://github.com/google/googletest/archive/refs/tags/v${GOOGLETEST_VERSION}.tar.gz && \
    tar -xzf v${GOOGLETEST_VERSION}.tar.gz && \
    cd googletest-${GOOGLETEST_VERSION} && \
    mkdir build && \
    cd build && \
    cmake .. && \
    make -j${NUM_JOBS} && \
    make install && \
    rm -rf /tmp/*

RUN cd /usr/local/bin && \
    ln -s /usr/bin/python3 python && \
    ln -s /usr/bin/pip3 pip && \
    pip install --upgrade pip setuptools wheel

To build the CUTLASS Docker image locally, please run the following command.

$ docker build -f docker/cuda.Dockerfile --no-cache --tag cuda:12.4.1 .

Run Docker Container

To run the CUTLASS Docker container, please run the following command.

$ docker run -it --rm --gpus device=0 -v $(pwd):/mnt -w /mnt cuda:12.4.1

CUTLASS Examples

To show that the CUTLASS setup works inside the Docker container, we will build and run two CUTLASS C++ examples copied, without any modification, from the CUTLASS GitHub repository.

CUTLASS is header-only. There are two key header directories to include for each CUTLASS build target: cutlass/include and cutlass/tools/util/include.
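For illustration, below is a minimal main.cu sketch, not one of the two copied examples, that pulls headers from both directories: the device-level GEMM template from cutlass/include and HostTensor from cutlass/tools/util/include. It follows the CUTLASS 2.x device API; the data types and problem size are arbitrary choices for illustration.

#include <iostream>

#include <cuda_runtime.h>

#include <cutlass/gemm/device/gemm.h>  // from cutlass/include
#include <cutlass/util/host_tensor.h>  // from cutlass/tools/util/include

int main()
{
    // Single-precision GEMM with column-major operands, default configuration.
    using Gemm = cutlass::gemm::device::Gemm<
        float, cutlass::layout::ColumnMajor,   // A
        float, cutlass::layout::ColumnMajor,   // B
        float, cutlass::layout::ColumnMajor>;  // C and D

    int const M{128}, N{128}, K{128};
    float const alpha{1.0f}, beta{0.0f};

    // HostTensor allocates matching host and device buffers. The data is left
    // uninitialized because this sketch only checks the launch status.
    cutlass::HostTensor<float, cutlass::layout::ColumnMajor> A({M, K});
    cutlass::HostTensor<float, cutlass::layout::ColumnMajor> B({K, N});
    cutlass::HostTensor<float, cutlass::layout::ColumnMajor> C({M, N});

    // D = alpha * A * B + beta * C, written in place into C.
    Gemm gemm_op;
    cutlass::Status status{gemm_op({{M, N, K},
                                    A.device_ref(),
                                    B.device_ref(),
                                    C.device_ref(),
                                    C.device_ref(),
                                    {alpha, beta}})};
    cudaDeviceSynchronize();

    std::cout << (status == cutlass::Status::kSuccess ? "Passed" : "Failed")
              << std::endl;

    return status == cutlass::Status::kSuccess ? 0 : 1;
}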

CMakeLists.txt
cmake_minimum_required(VERSION 3.28)

project(CUTLASS-Examples VERSION 0.0.1 LANGUAGES CXX CUDA)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Find CUDA Toolkit
find_package(CUDAToolkit REQUIRED)

# Set CUTLASS include directories
find_path(CUTLASS_INCLUDE_DIR cutlass/cutlass.h HINTS cutlass/include)
find_path(CUTLASS_UTILS_INCLUDE_DIR cutlass/util/host_tensor.h HINTS cutlass/tools/util/include)

add_subdirectory(examples)
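The examples directory referenced by add_subdirectory(examples) then registers each example as its own subproject. A sketch of what this examples/CMakeLists.txt might look like, assuming the two example directories used later in this post:

add_subdirectory(gemm_api_v2)
add_subdirectory(gemm_api_v3)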

For each build target, the experimental flag --expt-relaxed-constexpr is needed so that the NVCC compiler allows constexpr functions and variables defined in host code to be used in device code.

CMakeLists.txt
cmake_minimum_required(VERSION 3.28)

project(CUTLASS-GEMM-API-V3 VERSION 0.0.1 LANGUAGES CXX CUDA)

# Set the CUDA architecture to compile the code for
# https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html
add_executable(${PROJECT_NAME} main.cu)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_INCLUDE_DIR} ${CUTLASS_UTILS_INCLUDE_DIR})
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES native)
target_compile_options(${PROJECT_NAME} PRIVATE --expt-relaxed-constexpr)
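As a standalone illustration of what the flag enables, the following hypothetical kernel, which is not taken from the CUTLASS examples, calls a constexpr function declared only for the host. NVCC rejects this unless --expt-relaxed-constexpr is passed.

#include <cstdio>

#include <cuda_runtime.h>

// A host constexpr function without a __device__ annotation. Calling it from a
// kernel is an error unless NVCC is given --expt-relaxed-constexpr.
constexpr int num_elements(int rows, int cols) { return rows * cols; }

__global__ void fill(float* data)
{
    int const n{num_elements(16, 16)};  // host constexpr used in device code
    int const idx{static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x)};
    if (idx < n)
    {
        data[idx] = 1.0f;
    }
}

int main()
{
    float* data{nullptr};
    cudaMalloc(&data, sizeof(float) * num_elements(16, 16));
    fill<<<1, 256>>>(data);
    cudaDeviceSynchronize();
    cudaFree(data);
    std::printf("Done\n");
    return 0;
}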

Build Examples

To build the CUTLASS examples using CMake, please run the following command.

$ cmake -B build
$ cmake --build build --config Release --parallel

Run Examples

To run the CUTLASS examples, please run the following commands.

$ ./build/examples/gemm_api_v2/CUTLASS-GEMM-API-V2
$ echo $?
0
$ ./build/examples/gemm_api_v3/CUTLASS-GEMM-API-V3
10000 timing iterations of 2048 x 2048 x 2048 matrix-matrix multiply

Basic data-parallel GEMM
Disposition: Passed
Avg runtime: 0.175606 ms
GFLOPs: 97831.9

StreamK GEMM with default load-balancing
Disposition: Passed
Avg runtime: 0.149729 ms
GFLOPs: 114740
Speedup vs Basic-DP: 1.173

StreamK emulating basic data-parallel GEMM
Disposition: Passed
Avg runtime: 0.177553 ms
GFLOPs: 96759.2
Speedup vs Basic-DP: 0.989

Basic split-K GEMM with tile-splitting factor 2
Disposition: Passed
Avg runtime: 0.183542 ms
GFLOPs: 93601.7

StreamK emulating Split-K GEMM with tile-splitting factor 2
Disposition: Passed
Avg runtime: 0.173763 ms
GFLOPs: 98869.8
Speedup vs Basic-SplitK: 1.056

Author: Lei Mao

Posted on: 11-12-2024

Updated on: 11-12-2024
