简介
CS149存在若干待优化内容:
- safe softmax, sqrt(d_k)
- CUDA中的并行
- v2及之后的优化
为了进一步优化,增加FA-1、FA-2等分支,以完整实现各版本的FlashAttention(FA)。
使用官方flash attention实现作为test以验证正确性和效率。
安装flash-attention
cuda11
安装cuda12后在测试中出现如下问题,因此改为cuda11:
`ImportError: /home/libo/cs149/cs149gpt/module_ref.so: undefined symbol: _ZN2at4_ops5zeros4callEN3c108ArrayRefINS2_6SymIntEEENS2_8optionalINS2_10ScalarTypeEEENS6_INS2_6LayoutEEENS6_INS2_6DeviceEEENS6_IbEE`
https://github.com/stanford-cs149/cs149gpt/issues/2#issuecomment-2439791718
conda create -n gpt149
conda activate gpt149
conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=11.8 python=3.10 numpy=1.26 ninja tiktoken -c pytorch -c conda-forge -c nvidia
# https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu11torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install flash_attn-2.7.0.post2+cu11torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
cuda12
https://blog.csdn.net/MurphyStar/article/details/138523803
https://github.com/Dao-AILab/flash-attention/releases/
conda create -n fa python=3.12
conda activate fa
pip install torch==2.3.1+cu121 torchvision==0.18.1+cu121 torchaudio==2.3.1 --extra-index-url https://download.pytorch.org/whl/cu121
# 检查环境
python -c "
import torch;
print('PyTorch version:', torch.__version__);
print('CUDA available:', torch.cuda.is_available());
print('CXX11 ABI:', torch.compiled_with_cxx11_abi())
"
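# 安装对应版本wheel:文件名中的cuXX、torchX.Y、cxx11abi、cpXXX须与上面检查到的环境一致(例如torch 2.3.1应选torch2.3的wheel),版本不匹配时可能出现下文的undefined symbol错误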
pip install flash_attn-2.7.4.post1+cu12torch2.2cxx11abiFALSE-cp312-cp312-linux_x86_64.whl
# 如果需要更新pytorch
conda install pytorch torchvision torchaudio -c pytorch
# 显示undefined symbol: _ZN3c104cuda9SetDeviceEi
# 参考https://github.com/Dao-AILab/flash-attention/issues/620
# https://github.com/EleutherAI/lm-evaluation-harness/issues/1342
pip install flash_attn -U --force-reinstall
简单测试
import torch
from flash_attn import flash_attn_func
# Smoke test: run FlashAttention once on random float16 inputs.
# Q/K/V are built as (batch=2, seq_len=128, num_heads=8, head_dim=64).
batch_size, seq_len, num_heads, head_dim = 2, 128, 8, 64
dtype, device = torch.float16, 'cuda'

qkv_shape = (batch_size, seq_len, num_heads, head_dim)
q = torch.randn(*qkv_shape, device=device, dtype=dtype)
# k and v take their shape, dtype and device from q.
k, v = torch.randn_like(q), torch.randn_like(q)

# Call FlashAttention and report the shape of the result.
out = flash_attn_func(q, k, v)
print("Output shape:", out.shape)
import torch
import flash_attn
import sys
def get_versions():
    """Collect version information about the Python / PyTorch / CUDA stack.

    Returns:
        dict: Maps a human-readable label to its value, in insertion order:
        Python version, PyTorch version, CUDA availability, the CUDA version
        reported by PyTorch, the name of GPU 0, the FlashAttention version,
        and whether PyTorch was compiled with the new C++ ABI. Entries that
        cannot be determined hold a descriptive placeholder string instead.
    """
    cuda_available = torch.cuda.is_available()
    versions = {
        "Python version": sys.version.split(" ")[0],
        "PyTorch version": torch.__version__,
        "Is CUDA available": cuda_available,
        # CUDA version as reported by PyTorch; only queried when CUDA works.
        "CUDA version (from PyTorch)": torch.version.cuda if cuda_available else "N/A",
        "GPU device name": torch.cuda.get_device_name(0) if cuda_available else "N/A",
    }

    # Import locally and guarded: a missing or ABI-mismatched flash_attn build
    # raises ImportError, and the rest of the report is still worth returning.
    try:
        import flash_attn as _flash_attn
    except ImportError as exc:
        versions["FlashAttention version"] = f"Not available ({exc})"
    else:
        versions["FlashAttention version"] = _flash_attn.__version__

    # torch.compiled_with_cxx11_abi is a function on the torch module, not an
    # importable submodule; importing it always raised ImportError, so the
    # real value was never reported. Call it directly instead.
    abi_check = getattr(torch, "compiled_with_cxx11_abi", None)
    versions["Compiled with new C++ ABI (cxx11abi)"] = (
        abi_check() if callable(abi_check) else "Unknown / Not available"
    )
    return versions
if __name__ == "__main__":
    # Script entry point: print one "label: value" line per collected entry.
    versions = get_versions()
    for label, reported in versions.items():
        print(f"{label}: {reported}")
使用CUDA-GDB调试
手动调试
# 设置同步执行
export CUDA_LAUNCH_BLOCKING=1
cuda-gdb python3 # 加载Python解释器作为调试目标
(cuda-gdb) b kernel.cu:helper
(cuda-gdb) run test.py
- 调试器无法跟踪异步执行流程,只能调试设备代码,故断点只能设在__global__或__device__函数中
- 尽量避免进入无调试信息的代码,单步执行主机函数可能进入 PyTorch 或 pybind11 内部,导致崩溃
- 要用cudaDeviceSynchronize()而非cudaDeviceReset():cudaDeviceReset()会强制销毁当前设备上下文,导致后续任何CUDA操作(如cudaMemcpy、torch.Tensor操作)都失败。
在vscode中调试(未成功)
参考:
https://zhuanlan.zhihu.com/p/716342383
https://zhuanlan.zhihu.com/p/607188244 - 不适用于pybind11
https://zhuanlan.zhihu.com/p/713265254 - 能够attach成功,并在python的断点处暂停,但是不能进入cuda部分。
我的代码:
.
├── kernel.cu
├── kernel.h
├── module.cpp
├── setup.py
└── test.py
0 directories, 5 files
// kernel.cu
#include "kernel.h"
#include <cuda_runtime.h>
// Element-wise integer add: c[i] = a[i] + b[i] for every i in [0, N).
// Each thread handles one element, addressed by its global index so the
// kernel is correct for a grid of any number of 1-D blocks.
// The launcher guarantees N > 0, which makes the unsigned comparison safe.
__global__ void addKernelGPU(int *a, int *b, int *c, int N) {
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < static_cast<unsigned int>(N)) {
        c[i] = a[i] + b[i];
    }
}

// Host-side launcher for addKernelGPU.
//   a, b : input arrays of at least N ints, dereferenced in device code
//   c    : output array of at least N ints, dereferenced in device code
//   N    : number of elements to add; N <= 0 is a no-op
// Blocks until the kernel has finished (cudaDeviceSynchronize).
void addKernel(int *a, int *b, int *c, int N) {
    if (N <= 0) {
        return;  // a launch with zero threads is an invalid configuration
    }
    // A single block cannot hold more than 1024 threads, so split the work
    // across as many fixed-size blocks as needed instead of <<<1, N>>>.
    const int threadsPerBlock = 256;
    const int blocks = (N - 1) / threadsPerBlock + 1;  // ceil(N / tpb), overflow-free
    addKernelGPU<<<blocks, threadsPerBlock>>>(a, b, c, N);
    cudaDeviceSynchronize();
}
// kernel.h
#ifndef KERNEL_H
#define KERNEL_H
// Host-callable launcher implemented in kernel.cu.
// Computes c[i] = a[i] + b[i] for i < N on the GPU and waits for the kernel
// to finish before returning. a, b and c are dereferenced inside device code,
// so they must point to device-accessible memory holding at least N ints.
void addKernel(int *a, int *b, int *c, int N);
#endif
// module.cpp
#include <torch/extension.h>
#include "kernel.h"
// Element-wise sum of two 1-D int32 CUDA tensors, computed by addKernel.
//
// Args:
//   a, b: 1-D int32 tensors of equal length on the same CUDA device.
// Returns:
//   A new int32 tensor of the same length, allocated on a's device, holding
//   a[i] + b[i].
// Throws:
//   c10::Error (via TORCH_CHECK) when an input is not a CUDA tensor, is not
//   int32, is not 1-D, or when the lengths or devices differ. Without these
//   checks the kernel would read the raw buffers with the wrong type/extent.
// NOTE(review): the kernel is launched on the current CUDA device; callers
// using several GPUs presumably need a device guard — confirm.
torch::Tensor add(torch::Tensor a, torch::Tensor b) {
    TORCH_CHECK(a.is_cuda() && b.is_cuda(), "add: both inputs must be CUDA tensors");
    TORCH_CHECK(a.device() == b.device(), "add: inputs must be on the same device");
    TORCH_CHECK(a.scalar_type() == torch::kInt32 && b.scalar_type() == torch::kInt32,
                "add: both inputs must have dtype int32");
    TORCH_CHECK(a.dim() == 1 && b.dim() == 1, "add: both inputs must be 1-D");
    TORCH_CHECK(a.size(0) == b.size(0), "add: inputs must have the same length");

    // addKernel takes an int count; reject lengths that do not survive the
    // narrowing instead of silently truncating them.
    const int64_t n64 = a.size(0);
    const int N = static_cast<int>(n64);
    TORCH_CHECK(static_cast<int64_t>(N) == n64, "add: input is too long for an int element count");

    // The kernel indexes the raw buffers linearly, so it needs dense storage.
    auto a_dense = a.contiguous();
    auto b_dense = b.contiguous();
    // Allocate the result next to the inputs rather than on the default CUDA device.
    auto c = torch::zeros({n64}, a_dense.options());

    addKernel(a_dense.data_ptr<int>(), b_dense.data_ptr<int>(), c.data_ptr<int>(), N);
    return c;
}
// Python binding: registers the C++ function `add` on the extension module
// under the Python name "add", with a one-line docstring.
// TORCH_EXTENSION_NAME is not defined in this file; it is expected to be
// supplied by the build (setup.py names the extension 'custom_module').
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("add", &add, "Vector addition on GPU");
}
# setup.py
# setup.py
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
import os
# Debug build of the extension: '-g' adds host debug info and nvcc's '-G'
# adds device debug info, which is what cuda-gdb needs to stop inside kernels.
host_debug_flags = ['-g']
device_debug_flags = [
    '-G', '-g',
    f'-I{os.path.dirname(os.path.abspath(__file__))}'  # header include path
]

debug_extension = CUDAExtension(
    name='custom_module',
    sources=['module.cpp', 'kernel.cu'],
    extra_compile_args={'cxx': host_debug_flags, 'cuda': device_debug_flags},
)

setup(
    name='custom_module',
    ext_modules=[debug_extension],
    cmdclass={'build_ext': BuildExtension},
    zip_safe=False,
)
# test.py
import argparse
import torch
import custom_module
# Parse the optional debugger flag.
cli = argparse.ArgumentParser()
cli.add_argument("--ptvsd", action="store_true", help="启用调试模式")
args = cli.parse_args()

if args.ptvsd:
    import ptvsd
    print("等待调试器连接...")
    # The port is arbitrary and can be changed.
    # NOTE(review): binding to 0.0.0.0 exposes the debug port on every
    # network interface; prefer 127.0.0.1 unless remote attach is required.
    ptvsd.enable_attach(address=('0.0.0.0', 5678))
    ptvsd.wait_for_attach()

# Exercise the extension with two small int32 CUDA vectors.
int32_on_gpu = {"dtype": torch.int32, "device": 'cuda'}
a = torch.tensor([1, 2, 3], **int32_on_gpu)
b = torch.tensor([4, 5, 6], **int32_on_gpu)
c = custom_module.add(a, b)
print("Result:", c)
运行:
pip install -e .
python3 test.py