Flash Attention - 3

Adding tests on top of the CS149 assignment

Introduction

cs149

My repository

The CS149 version leaves several things to optimize:

  1. safe softmax and the 1/sqrt(d_k) scaling (see the sketch after this list)
  2. parallelism in CUDA
  3. the optimizations introduced in v2 and later

To go further, I add branches such as FA-1 and FA-2, each implementing a complete version of FA.
The official flash-attention implementation is used as the test baseline to verify correctness and performance.
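For item 1, here is a minimal sketch of what safe softmax plus the 1/sqrt(d_k) scaling means, in plain PyTorch (reference code only; the function name is mine):

import torch

def safe_softmax_attention(Q, K, V):
    # Scale the logits by 1/sqrt(d_k) so their variance stays O(1)
    d_k = Q.size(-1)
    S = Q @ K.transpose(-2, -1) / (d_k ** 0.5)
    # "Safe" softmax: subtract the row max before exp so exp() never
    # overflows; the shift cancels in the normalization, so the result
    # is mathematically identical to plain softmax
    S = S - S.max(dim=-1, keepdim=True).values
    P = S.exp()
    return (P / P.sum(dim=-1, keepdim=True)) @ V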

Installing flash-attention

cuda11

Installing with CUDA 12 hit the error below during testing, so I switched to CUDA 11:

ImportError: /home/libo/cs149/cs149gpt/module_ref.so: undefined symbol: _ZN2at4_ops5zeros4callEN3c108ArrayRefINS2_6SymIntEEENS2_8optionalINS2_10ScalarTypeEEENS6_INS2_6LayoutEEENS6_INS2_6DeviceEEENS6_IbEE
https://github.com/stanford-cs149/cs149gpt/issues/2#issuecomment-2439791718

conda create -n gpt149
conda activate gpt149
conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=11.8 python=3.10 numpy=1.26 ninja tiktoken -c pytorch -c conda-forge -c nvidia
# https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu11torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install flash_attn-2.7.0.post2+cu11torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
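The cxx11abiFALSE in the wheel name must match how the installed PyTorch was built. A quick Python check (the expected values in the comments assume the environment above):

import torch
print(torch.__version__)                # expect 2.1.2
print(torch.version.cuda)               # expect 11.8
print(torch.compiled_with_cxx11_abi())  # expect False, matching cxx11abiFALSE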

cuda12

https://blog.csdn.net/MurphyStar/article/details/138523803

https://github.com/Dao-AILab/flash-attention/releases/

conda create -n fa python=3.12
conda activate fa
pip install torch==2.3.1+cu121 torchvision==0.18.1+cu121 torchaudio==2.3.1 --extra-index-url https://download.pytorch.org/whl/cu121 

# Check the environment
python -c "
import torch;
print('PyTorch version:', torch.__version__);
print('CUDA available:', torch.cuda.is_available());
print('CXX11 ABI:', torch.compiled_with_cxx11_abi())
"

# Install the matching wheel
pip install flash_attn-2.7.4.post1+cu12torch2.2cxx11abiFALSE-cp312-cp312-linux_x86_64.whl

# If PyTorch needs updating
conda install pytorch torchvision torchaudio -c pytorch

# If you hit: undefined symbol: _ZN3c104cuda9SetDeviceEi
# see https://github.com/Dao-AILab/flash-attention/issues/620
# and https://github.com/EleutherAI/lm-evaluation-harness/issues/1342
pip install flash_attn -U --force-reinstall
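That undefined-symbol error typically means an ABI mismatch: the wheel above was built against torch 2.2 while torch 2.3.1 is installed. A small sketch that prints the pieces of the flash-attn wheel filename that must match the current environment (the filename pattern is inferred from the release assets linked above):

import sys
import torch

# Pieces of the flash_attn wheel name that must match this environment
py = f"cp{sys.version_info.major}{sys.version_info.minor}"
cu = torch.version.cuda.split('.')[0]                          # CUDA major, e.g. '12'
tv = '.'.join(torch.__version__.split('+')[0].split('.')[:2])  # torch major.minor
abi = str(torch.compiled_with_cxx11_abi()).upper()             # 'TRUE' or 'FALSE'
print(f"flash_attn-<ver>+cu{cu}torch{tv}cxx11abi{abi}-{py}-{py}-linux_x86_64.whl")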

Quick test

import torch
from flash_attn import flash_attn_func

# Create QKV tensors: batch=2, seq_len=128, num_heads=8, head_dim=64
batch_size = 2
seq_len = 128
num_heads = 8
head_dim = 64
dtype = torch.float16
device = 'cuda'

q = torch.randn(batch_size, seq_len, num_heads, head_dim, device=device, dtype=dtype)
k = torch.randn_like(q)
v = torch.randn_like(q)

# Call FlashAttention
out = flash_attn_func(q, k, v)

print("Output shape:", out.shape)
A second script prints the versions of everything involved:

import torch
import flash_attn
import sys

def get_versions():
    versions = {}

    # Python version
    versions["Python version"] = sys.version.split(" ")[0]

    # PyTorch version
    versions["PyTorch version"] = torch.__version__

    # Whether CUDA is available
    versions["Is CUDA available"] = torch.cuda.is_available()

    # CUDA version (as reported by PyTorch)
    if torch.cuda.is_available():
        versions["CUDA version (from PyTorch)"] = torch.version.cuda
        versions["GPU device name"] = torch.cuda.get_device_name(0)
    else:
        versions["CUDA version (from PyTorch)"] = "N/A"
        versions["GPU device name"] = "N/A"

    # FlashAttention version
    versions["FlashAttention version"] = flash_attn.__version__

    # Whether PyTorch was built with the new C++ ABI
    # (compiled_with_cxx11_abi is a function on the torch module, not a submodule)
    try:
        versions["Compiled with new C++ ABI (cxx11abi)"] = torch.compiled_with_cxx11_abi()
    except AttributeError:
        versions["Compiled with new C++ ABI (cxx11abi)"] = "Unknown / Not available"

    return versions

if __name__ == "__main__":
    versions = get_versions()
    for key, value in versions.items():
        print(f"{key}: {value}")

Debugging with CUDA-GDB

Manual debugging

# Force synchronous kernel launches
export CUDA_LAUNCH_BLOCKING=1
cuda-gdb python3  # load the Python interpreter as the debug target
(cuda-gdb) b kernel.cu:helper
(cuda-gdb) run test.py
  1. The debugger cannot follow the asynchronous launch flow and only debugs device code, so breakpoints only take effect inside __global__ or __device__ functions (see the snippet after this list).
  2. Avoid stepping into code that has no debug info: single-stepping host functions can descend into PyTorch or pybind11 internals and crash the session.
  3. Use cudaDeviceSynchronize() rather than cudaDeviceReset(): cudaDeviceReset() forcibly destroys the current device context, so every subsequent CUDA operation (cudaMemcpy, torch.Tensor ops, ...) fails.
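On note 1: as an alternative to exporting CUDA_LAUNCH_BLOCKING in the shell, it can be set from the test script itself, as long as that happens before CUDA is initialized (a sketch):

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before the first CUDA call

import torch  # safe: CUDA initializes lazily, on the first CUDA operation
x = torch.ones(3, device="cuda")  # kernel launches are now synchronous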

Debugging in VS Code (did not work)

References:
https://zhuanlan.zhihu.com/p/716342383
https://zhuanlan.zhihu.com/p/607188244 - does not apply to pybind11
https://zhuanlan.zhihu.com/p/713265254 - attaching succeeds and Python breakpoints are hit, but it cannot step into the CUDA code.

My code:

.
├── kernel.cu
├── kernel.h
├── module.cpp
├── setup.py
└── test.py

0 directories, 5 files
// kernel.cu
#include "kernel.h"
#include <cuda_runtime.h>

// Element-wise vector add: one thread per element
__global__ void addKernelGPU(int *a, int *b, int *c, int N) {
    int i = threadIdx.x;
    if (i < N) {
        c[i] = a[i] + b[i];
    }
}

// Host wrapper: launches a single block of N threads (assumes N <= 1024)
void addKernel(int *a, int *b, int *c, int N) {
    addKernelGPU<<<1, N>>>(a, b, c, N);
    cudaDeviceSynchronize();  // wait for the kernel so launch errors surface here
}
// kernel.h
#ifndef KERNEL_H
#define KERNEL_H

void addKernel(int *a, int *b, int *c, int N);

#endif
// module.cpp
#include <torch/extension.h>
#include "kernel.h"

// Assumes a and b are contiguous int32 CUDA tensors of equal length
torch::Tensor add(torch::Tensor a, torch::Tensor b) {
    int N = a.size(0);
    auto options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
    auto c = torch::zeros({N}, options);

    int *a_data = (int *)a.data_ptr();
    int *b_data = (int *)b.data_ptr();
    int *c_data = (int *)c.data_ptr();

    addKernel(a_data, b_data, c_data, N);

    return c;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("add", &add, "Vector addition on GPU");
}
# setup.py
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
import os

setup(
    name='custom_module',
    ext_modules=[
        CUDAExtension(
            name='custom_module',
            sources=['module.cpp', 'kernel.cu'],
            extra_compile_args={
                'cxx': ['-g'],
                'cuda': [
                    # -G: device debug info (disables device optimizations; required for cuda-gdb source-line breakpoints)
                    '-G', '-g',
                    f'-I{os.path.dirname(os.path.abspath(__file__))}'  # include path for kernel.h
                ]
            }
        )
    ],
    cmdclass={'build_ext': BuildExtension},
    zip_safe=False,
)
# test.py
import argparse
import torch
import custom_module

# Parse the debug flag
parser = argparse.ArgumentParser()
parser.add_argument("--ptvsd", action="store_true", help="wait for a debugger to attach")
args = parser.parse_args()

if args.ptvsd:
    import ptvsd
    print("Waiting for the debugger to attach...")
    ptvsd.enable_attach(address=('0.0.0.0', 5678))  # port is arbitrary
    ptvsd.wait_for_attach()

# Test
a = torch.tensor([1, 2, 3], dtype=torch.int32, device='cuda')
b = torch.tensor([4, 5, 6], dtype=torch.int32, device='cuda')
c = custom_module.add(a, b)
print("Result:", c)

Build and run:

pip install -e .
python3 test.py
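If the build succeeded, the script should print something like:

Result: tensor([5, 7, 9], device='cuda:0', dtype=torch.int32)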