diff --git a/03_nf4_dequant/.gitignore b/03_nf4_dequant/.gitignore new file mode 100644 index 00000000..90a56e8b --- /dev/null +++ b/03_nf4_dequant/.gitignore @@ -0,0 +1,4 @@ +data/qlora_test.bin +data/cpp_output.bin +data/py_output.bin +__pycache__/ \ No newline at end of file diff --git "a/03_nf4_dequant/NF4\345\217\215\351\207\217\345\214\226\347\256\227\345\255\220\344\274\230\345\214\226.pdf" "b/03_nf4_dequant/NF4\345\217\215\351\207\217\345\214\226\347\256\227\345\255\220\344\274\230\345\214\226.pdf" new file mode 100644 index 00000000..f577b5e1 Binary files /dev/null and "b/03_nf4_dequant/NF4\345\217\215\351\207\217\345\214\226\347\256\227\345\255\220\344\274\230\345\214\226.pdf" differ diff --git a/03_nf4_dequant/README.md b/03_nf4_dequant/README.md index 234e9229..2f543da3 100644 --- a/03_nf4_dequant/README.md +++ b/03_nf4_dequant/README.md @@ -1,16 +1,76 @@ -## 简介 +## 快速开始性能测试 +`conda activate nf4`激活conda环境 -凭借你在 InfiniTensor 训练营的经历和人工智能与并行计算方面的出色成绩,你受雇于一家领先的 AI 大模型公司,负责为下一代对话系统开发高效的推理引擎。用户抱怨现有的聊天机器人响应速度太慢,无法支持流畅的实时交互——尤其是在消费级显卡上,大模型的显存占用导致批处理规模受限,吞吐量始终提不上去。 +`CUDA_VISIBLE_DEVICES=2 python main.py`运行测试程序,终端输出误差以及性能分析报告 -你发现问题的症结在于模型参数量过大。公司采用了 QLoRA 技术,将基座模型量化为 4-bit NF4 格式,显存占用降至原来的 1/4,理论上可以在单卡上运行更大的模型或更大的批次。然而,现有实现的解量化过程成了新的瓶颈:每个矩阵乘法前都需要将 4-bit 权重实时解压回 16-bit,这个操作拖慢了整体计算速度。 +`CUDA_VISIBLE_DEVICES=2`指定运行的GPU -## 任务内容 -开发一个 CUDA 程序,实现单核的 NF4 解量化算子,将压缩后的 4-bit 权重实时解压为 16-bit 浮点(BF16 或 FP16)输出。 +结果示例: +``` +--- 正在执行: 编译 C++ CUDA 内核 --- +--- 正在执行: 生成随机量化权重 --- +维度: 32768x32768 | Block: 64 | Group: 256 +数据已写入: ./data/qlora_test.bin +--- 运行反量化 --- +耗时: 24.4132 ms | 吞吐: 1106.48 GB/s +基准输出已保存: ./data/py_output.bin +--- 正在执行: 运行自定义 CUDA 内核 --- +原始元素: 1073741824 +对齐元素: 1073741824 +Launch Kernel... +Kernel 耗时: 21.6289 ms +有效带宽: 1248.91 GB/s +--- 正在执行: 对比精度与性能表现 --- +测试维度: 32768 x 32768 +Python: ./data/py_output.bin +C++: ./data/cpp_output.bin -## 📬 有疑问? +================================================== +误差分析报告 +================================================== +平均绝对误差 (MAE):6.5369757067e-05 +均方误差 (MSE):2.4582243441e-07 +最大误差 (Max Diff):1.5625000000e-02 +-------------------------------------------------- +完全一致元素数:1043509311 / 1073741824 +一致率:97.1844% +================================================== -更多详细信息和要求请参考本季度项目文档。 + 随机数据采样对比 +--------------------------------------------------------------------------- +Index | Python (BF16) | C++ (BF16) | Diff +--------------------------------------------------------------------------- +0 | 0.394531 | 0.394531 | 0.0000e+00 +1 | 1.171875 | 1.171875 | 0.0000e+00 +2 | -0.215820 | -0.215820 | 0.0000e+00 +446162790 | -0.277344 | -0.277344 | 0.0000e+00 +1066319814 | 0.000000 | 0.000000 | 0.0000e+00 +230426597 | 0.205078 | 0.205078 | 0.0000e+00 +951028163 | 0.322266 | 0.322266 | 0.0000e+00 +857049811 | -0.566406 | -0.566406 | 0.0000e+00 +527929503 | 0.718750 | 0.718750 | 0.0000e+00 +23305680 | 0.000000 | 0.000000 | 0.0000e+00 +168636035 | 0.134766 | 0.134766 | 0.0000e+00 +660748068 | -0.625000 | -0.625000 | 0.0000e+00 +801264382 | 0.098633 | 0.098633 | 0.0000e+00 +--------------------------------------------------------------------------- +bnb耗时:24.41320ms, 带宽:1106.47730GB/s +nf4 kernel耗时:21.62890ms, 带宽:1248.91000GB/s +``` -可以在项目群里直接询问导师和助教! +## data目录 +data目录存放测试文件以及反量化的输出结果。 +在param.txt中可以修改测试用矩阵大小等参数。 +data/log目录存放输出的性能日志。 + +## kernel_test目录 +kernel_test目录中是kernel逐步优化的过程。 + +## src目录 + +generate_data.py 用于生成随机矩阵,并使用bnb库量化后输出文件qlora_test.bin供nf4_kernel.cu读取进行反量化。同时generate_data.py将量化后的矩阵进行反量化并计时,性能日志输出到data/log/log_py.txt,运算结果输出到data/py_output.bin + +nf4_kernel.cu是CUDA实现的nf4反量化算子,从qlora_test.bin中读取量化后的矩阵进行反量化,并将结果输出到data/cpp_output.bin,性能日志输出到data/log/log_cpp.txt + +benchmark.py 是测试程序。读取cpp_output.bin与py_output.bin,对二者反量化结果进行精度对比。 -Good luck and happy coding! 🚀 diff --git a/03_nf4_dequant/data/log/log_cpp.txt b/03_nf4_dequant/data/log/log_cpp.txt new file mode 100644 index 00000000..591b51c0 --- /dev/null +++ b/03_nf4_dequant/data/log/log_cpp.txt @@ -0,0 +1 @@ +21.6248,1249.15 \ No newline at end of file diff --git a/03_nf4_dequant/data/log/log_py.txt b/03_nf4_dequant/data/log/log_py.txt new file mode 100644 index 00000000..b6ca524d --- /dev/null +++ b/03_nf4_dequant/data/log/log_py.txt @@ -0,0 +1 @@ +21.2040,1273.9424 diff --git a/03_nf4_dequant/data/param.txt b/03_nf4_dequant/data/param.txt new file mode 100644 index 00000000..719c007c --- /dev/null +++ b/03_nf4_dequant/data/param.txt @@ -0,0 +1,6 @@ +blocksize = 64 +groupsize = 256 +compute_type = "bf16" +target_gpu = "T4" +rows = 32768 +cols = 32768 \ No newline at end of file diff --git a/03_nf4_dequant/kernel_test/ALU_optim.cu b/03_nf4_dequant/kernel_test/ALU_optim.cu new file mode 100644 index 00000000..3e60a6e0 --- /dev/null +++ b/03_nf4_dequant/kernel_test/ALU_optim.cu @@ -0,0 +1,243 @@ +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_CUDA(call) \ + { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA Error: " << cudaGetErrorString(err) \ + << " at line " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + +void read_file( + const std::string file_path, std::vector& packed_weights, + std::vector& absmax_q, + std::vector& absmax2, + std::vector& code2, float& offset, + int64_t& rows, int64_t& cols, int32_t& blocksize +) +{ + std::ifstream file(file_path, std::ios::binary); + if (!file) { std::cerr << "无法打开文件" << std::endl; exit(1); } + + file.read(reinterpret_cast(&rows), 8); + file.read(reinterpret_cast(&cols), 8); + file.read(reinterpret_cast(&blocksize), 4); + + int64_t total_original = rows * cols; + int32_t groupsize = 256; + int32_t alignment = blocksize * groupsize; + int64_t total_aligned = ((total_original + alignment - 1) / alignment) * alignment; + + size_t num_blocks = total_aligned / blocksize; + size_t num_groups = total_aligned / (blocksize * groupsize); + + std::cout << "原始元素: " << total_original << std::endl; + std::cout << "对齐元素: " << total_aligned << std::endl; + + packed_weights.resize(total_aligned / 2); + absmax_q.resize(num_blocks); + absmax2.resize(num_groups); + code2.resize(256); + + file.read(reinterpret_cast(packed_weights.data()), packed_weights.size()); + file.read(reinterpret_cast(absmax_q.data()), absmax_q.size()); + file.read(reinterpret_cast(absmax2.data()), absmax2.size() * 2); + file.read(reinterpret_cast(code2.data()), 256 * 2); + file.read(reinterpret_cast(&offset), 4); +} + + +__constant__ float c_code2[256]; +// NF4 table +__constant__ float c_nf4[16] = +{ + -1.00000000f, -0.69619280f, -0.52507305f, -0.39491749f, + -0.28444138f, -0.18477343f, -0.09105004f, 0.00000000f, + 0.07958030f, 0.16093020f, 0.24611230f, 0.33791524f, + 0.44070983f, 0.56261700f, 0.72295684f, 1.00000000f +}; + +__global__ void dequantize_nf4_kernel_ALU_optim +( + const uint8_t* __restrict__ packed_weights, + const uint8_t* __restrict__ absmax_q, + const half* __restrict__ absmax2, + uint32_t* __restrict__ output_packed, + int num_bytes, + int block_shift, + int group_shift, + float offset +) +{ + __shared__ float s_nf4[16]; + + int tx = threadIdx.x; + if (tx < 16) + { + s_nf4[tx] = c_nf4[tx]; + } + __syncthreads(); + + // 每个线程处理 4 个输入字节(即 8 个 NF4 权重) + // 所以总线程数只需要是 num_bytes / 4 + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid * 4 >= num_bytes) return; + + // 向量化读取:一次吞下 4 个字节 (32-bit) + uint32_t packed_4bytes = reinterpret_cast(packed_weights)[tid]; + + // 准备 128-bit 的输出容器 (4 个 uint32_t,每个 uint32_t 包含 2 个 bf16) + uint32_t out_vec[4]; + +#pragma unroll + for (int i = 0; i < 4; ++i) + { + uint8_t byte_val = (packed_4bytes >> (i * 8)) & 0xFF; + + // 计算当前权重的全局索引 + int element_idx = (tid * 4 + i) * 2; + int block_idx = element_idx >> block_shift; + int group_idx = block_idx >> group_shift; + + float scale_1 = c_code2[absmax_q[block_idx]]; + + float scale_2 = __half2float(absmax2[group_idx]); + float final_scale = scale_1 * scale_2; + + // 解码两个 NF4 + uint8_t idx_0 = byte_val >> 4; + uint8_t idx_1 = byte_val & 0x0F; + + float v0 = s_nf4[idx_0] * final_scale; + float v1 = s_nf4[idx_1] * final_scale; + + __nv_bfloat16 b0 = __float2bfloat16(v0); + __nv_bfloat16 b1 = __float2bfloat16(v1); + + uint16_t bits_0 = *reinterpret_cast(&b0); + uint16_t bits_1 = *reinterpret_cast(&b1); + + // 打包结果存入临时数组 + out_vec[i] = ((uint32_t)bits_1 << 16) | (uint32_t)bits_0; + } + + // 向量化写入 + reinterpret_cast(output_packed)[tid] = *reinterpret_cast(out_vec); +} + +void nf4_dequantize_cuda +( + std::vector& h_packed_weights, + std::vector& h_absmax_q, + std::vector& h_absmax2, + std::vector& h_code2, + int64_t rows, int64_t cols, int32_t blocksize, int32_t groupsize,float offset +) +{ + size_t num_bytes = h_packed_weights.size(); + size_t out_size = num_bytes * sizeof(uint32_t); + + float h_code2_f32[256]; + for(int i = 0; i < 256; ++i) + { + __half h_val = *reinterpret_cast<__half*>(&h_code2[i]); + h_code2_f32[i] = (float)h_val; + } + CHECK_CUDA(cudaMemcpyToSymbol(c_code2, h_code2_f32, sizeof(h_code2_f32))); + + uint8_t *d_packed, *d_absmax_q; + half *d_absmax2; + uint32_t *d_output; + + CHECK_CUDA(cudaMalloc(&d_packed, h_packed_weights.size())); + CHECK_CUDA(cudaMalloc(&d_absmax_q, h_absmax_q.size())); + CHECK_CUDA(cudaMalloc(&d_absmax2, h_absmax2.size() * 2)); + CHECK_CUDA(cudaMalloc(&d_output, out_size)); + + CHECK_CUDA(cudaMemcpy(d_packed, h_packed_weights.data(), h_packed_weights.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax_q, h_absmax_q.data(), h_absmax_q.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax2, h_absmax2.data(), h_absmax2.size() * 2, cudaMemcpyHostToDevice)); + + int threadsPerBlock = 256; + int num_elements_vec = (num_bytes + 3) / 4; + int blocksPerGrid = (num_elements_vec + threadsPerBlock - 1) / threadsPerBlock; + + int block_shift = log2(blocksize); + int group_shift = log2(groupsize); + + // warm up + for (int i = 0; i < 5; ++i) + { + dequantize_nf4_kernel_ALU_optim<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, block_shift, group_shift, offset); + } + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); // 开始记录 + std::cout << "Launch Kernel...\n"; + for (int i = 0; i < 10; ++i) + { + dequantize_nf4_kernel_ALU_optim<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, block_shift, group_shift, offset); + } + cudaEventRecord(stop); // 结束记录 + + CHECK_CUDA(cudaEventSynchronize(stop)); // 等待 Event 完成 + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + // 计算带宽 + // 读取: Packed(1) + Indices(1) + Scales(2) + Code2(忽略不计) + size_t total_read = h_packed_weights.size() + h_absmax_q.size() + h_absmax2.size() * 2; + // 写入: Output(4) (因为每个packed byte生成一个uint32) + size_t total_write = num_bytes * 4; + + double total_bytes = (double)(total_read + total_write); + double gb_per_sec = (total_bytes * 10 / 1e9) / (milliseconds / 1000.0); + + std::cout << "Kernel 耗时: " << milliseconds << " ms" << std::endl; + std::cout << "有效带宽: " << gb_per_sec << " GB/s" << std::endl; + + // 保存时间与带宽 + std::ofstream timefile("./data/log/log_cpp.txt"); + timefile << milliseconds << "," << gb_per_sec; + timefile.close(); + + std::vector h_output(num_bytes); + CHECK_CUDA(cudaMemcpy(h_output.data(), d_output, out_size, cudaMemcpyDeviceToHost)); + + const std::string output_path = "./data/cpp_output.bin"; + std::ofstream outfile(output_path, std::ios::binary); + outfile.write(reinterpret_cast(h_output.data()), out_size); + outfile.close(); + + cudaFree(d_packed); cudaFree(d_absmax_q); cudaFree(d_absmax2); cudaFree(d_output); + cudaEventDestroy(start); cudaEventDestroy(stop); +} + +int main() +{ + const std::string file_path = "../data/qlora_test.bin"; + std::vector packed_weights, absmax_q; + std::vector absmax2, code2; + float offset; + int64_t rows, cols; + int32_t blocksize; + int32_t groupsize = 256; + read_file(file_path, packed_weights, absmax_q, absmax2, code2, offset, rows, cols, blocksize); + nf4_dequantize_cuda(packed_weights, absmax_q, absmax2, code2, rows, cols, blocksize, groupsize, offset); + + return 0; +} \ No newline at end of file diff --git a/03_nf4_dequant/kernel_test/naive.cu b/03_nf4_dequant/kernel_test/naive.cu new file mode 100644 index 00000000..50987bd3 --- /dev/null +++ b/03_nf4_dequant/kernel_test/naive.cu @@ -0,0 +1,225 @@ +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_CUDA(call) \ + { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA Error: " << cudaGetErrorString(err) \ + << " at line " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + +void read_file( + const std::string file_path, std::vector& packed_weights, + std::vector& absmax_q, + std::vector& absmax2, + std::vector& code2, float& offset, + int64_t& rows, int64_t& cols, int32_t& blocksize +) +{ + std::ifstream file(file_path, std::ios::binary); + if (!file) { std::cerr << "无法打开文件" << std::endl; exit(1); } + + file.read(reinterpret_cast(&rows), 8); + file.read(reinterpret_cast(&cols), 8); + file.read(reinterpret_cast(&blocksize), 4); + + int64_t total_original = rows * cols; + int32_t groupsize = 256; + int32_t alignment = blocksize * groupsize; + int64_t total_aligned = ((total_original + alignment - 1) / alignment) * alignment; + + size_t num_blocks = total_aligned / blocksize; + size_t num_groups = total_aligned / (blocksize * groupsize); + + std::cout << "原始元素: " << total_original << std::endl; + std::cout << "对齐元素: " << total_aligned << std::endl; + + packed_weights.resize(total_aligned / 2); + absmax_q.resize(num_blocks); + absmax2.resize(num_groups); + code2.resize(256); + + file.read(reinterpret_cast(packed_weights.data()), packed_weights.size()); + file.read(reinterpret_cast(absmax_q.data()), absmax_q.size()); + file.read(reinterpret_cast(absmax2.data()), absmax2.size() * 2); + file.read(reinterpret_cast(code2.data()), 256 * 2); + file.read(reinterpret_cast(&offset), 4); +} + + +__constant__ float c_code2[256]; +// NF4 table +__device__ __forceinline__ float get_nf4_value(uint8_t idx) +{ + switch(idx) { + case 0: return -1.00000000f; + case 1: return -0.69619280f; + case 2: return -0.52507305f; + case 3: return -0.39491749f; + case 4: return -0.28444138f; + case 5: return -0.18477343f; + case 6: return -0.09105004f; + case 7: return 0.00000000f; + case 8: return 0.07958030f; + case 9: return 0.16093020f; + case 10: return 0.24611230f; + case 11: return 0.33791524f; + case 12: return 0.44070983f; + case 13: return 0.56261700f; + case 14: return 0.72295684f; + case 15: return 1.00000000f; + default: return 0.0f; + } +} + +__global__ void dequantize_nf4_kernel_naive +( + const uint8_t* __restrict__ packed_weights, + const uint8_t* __restrict__ absmax_q, + const half* __restrict__ absmax2, + uint32_t* __restrict__ output_packed, + int num_bytes, + int block_size, + int group_size, + float offset +) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= num_bytes) return; + + int element_idx = tid * 2; + int block_idx = element_idx / block_size; + int group_idx = block_idx / group_size; + + float scale_1 = c_code2[absmax_q[block_idx]] + offset; + float scale_2 = __half2float(absmax2[group_idx]); + float final_scale = scale_1 * scale_2; + + uint8_t byte_val = packed_weights[tid]; + uint8_t idx_0 = byte_val >> 4; + uint8_t idx_1 = byte_val & 0x0F; + + float v0_fp32 = get_nf4_value(idx_0) * final_scale; + float v1_fp32 = get_nf4_value(idx_1) * final_scale; + + __nv_bfloat16 v0_bf16 = __float2bfloat16(v0_fp32); + __nv_bfloat16 v1_bf16 = __float2bfloat16(v1_fp32); + + uint16_t bits_0 = *reinterpret_cast(&v0_bf16); + uint16_t bits_1 = *reinterpret_cast(&v1_bf16); + + output_packed[tid] = ((uint32_t)bits_1 << 16) | (uint32_t)bits_0; +} + +void nf4_dequantize_cuda +( + std::vector& h_packed_weights, + std::vector& h_absmax_q, + std::vector& h_absmax2, + std::vector& h_code2, + int64_t rows, int64_t cols, int32_t blocksize, int32_t goupsize,float offset +) +{ + size_t num_bytes = h_packed_weights.size(); + size_t out_size = num_bytes * sizeof(uint32_t); + + float h_code2_f32[256]; + for(int i = 0; i < 256; ++i) + { + __half h_val = *reinterpret_cast<__half*>(&h_code2[i]); + h_code2_f32[i] = (float)h_val; + } + CHECK_CUDA(cudaMemcpyToSymbol(c_code2, h_code2_f32, sizeof(h_code2_f32))); + + uint8_t *d_packed, *d_absmax_q; + half *d_absmax2; + uint32_t *d_output; + + CHECK_CUDA(cudaMalloc(&d_packed, h_packed_weights.size())); + CHECK_CUDA(cudaMalloc(&d_absmax_q, h_absmax_q.size())); + CHECK_CUDA(cudaMalloc(&d_absmax2, h_absmax2.size() * 2)); + CHECK_CUDA(cudaMalloc(&d_output, out_size)); + + CHECK_CUDA(cudaMemcpy(d_packed, h_packed_weights.data(), h_packed_weights.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax_q, h_absmax_q.data(), h_absmax_q.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax2, h_absmax2.data(), h_absmax2.size() * 2, cudaMemcpyHostToDevice)); + + int threadsPerBlock = 256; + int num_elements_vec = num_bytes; + int blocksPerGrid = (num_elements_vec + threadsPerBlock - 1) / threadsPerBlock; + + // warm up + for (int i = 0; i < 5; ++i) + { + dequantize_nf4_kernel_naive<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, blocksize, goupsize, offset); + } + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); // 开始记录 + std::cout << "Launch Kernel...\n"; + for (int i = 0; i < 10; ++i) + { + dequantize_nf4_kernel_naive<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, blocksize, goupsize, offset); + } + cudaEventRecord(stop); // 结束记录 + + CHECK_CUDA(cudaEventSynchronize(stop)); // 等待 Event 完成 + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + // 计算带宽 + // 读取: Packed(1) + Indices(1) + Scales(2) + Code2(忽略不计) + size_t total_read = h_packed_weights.size() + h_absmax_q.size() + h_absmax2.size() * 2; + // 写入: Output(4) (因为每个packed byte生成一个uint32) + size_t total_write = num_bytes * 4; + + double total_bytes = (double)(total_read + total_write); + double gb_per_sec = (total_bytes * 10 / 1e9) / (milliseconds / 1000.0); + + std::cout << "Kernel 耗时: " << milliseconds << " ms" << std::endl; + std::cout << "有效带宽: " << gb_per_sec << " GB/s" << std::endl; + + // 保存时间与带宽 + std::ofstream timefile("./data/log/log_cpp.txt"); + timefile << milliseconds << "," << gb_per_sec; + timefile.close(); + + std::vector h_output(num_bytes); + CHECK_CUDA(cudaMemcpy(h_output.data(), d_output, out_size, cudaMemcpyDeviceToHost)); + + const std::string output_path = "./data/cpp_output.bin"; + std::ofstream outfile(output_path, std::ios::binary); + outfile.write(reinterpret_cast(h_output.data()), out_size); + outfile.close(); + + cudaFree(d_packed); cudaFree(d_absmax_q); cudaFree(d_absmax2); cudaFree(d_output); + cudaEventDestroy(start); cudaEventDestroy(stop); +} + +int main() +{ + const std::string file_path = "../data/qlora_test.bin"; + std::vector packed_weights, absmax_q; + std::vector absmax2, code2; + float offset; + int64_t rows, cols; + int32_t blocksize; + int32_t groupsize = 256; + read_file(file_path, packed_weights, absmax_q, absmax2, code2, offset, rows, cols, blocksize); + nf4_dequantize_cuda(packed_weights, absmax_q, absmax2, code2, rows, cols, blocksize, groupsize, offset); + + return 0; +} \ No newline at end of file diff --git a/03_nf4_dequant/kernel_test/naive_v2.cu b/03_nf4_dequant/kernel_test/naive_v2.cu new file mode 100644 index 00000000..0e3fe72a --- /dev/null +++ b/03_nf4_dequant/kernel_test/naive_v2.cu @@ -0,0 +1,211 @@ +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_CUDA(call) \ + { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA Error: " << cudaGetErrorString(err) \ + << " at line " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + +void read_file( + const std::string file_path, std::vector& packed_weights, + std::vector& absmax_q, + std::vector& absmax2, + std::vector& code2, float& offset, + int64_t& rows, int64_t& cols, int32_t& blocksize +) +{ + std::ifstream file(file_path, std::ios::binary); + if (!file) { std::cerr << "无法打开文件" << std::endl; exit(1); } + + file.read(reinterpret_cast(&rows), 8); + file.read(reinterpret_cast(&cols), 8); + file.read(reinterpret_cast(&blocksize), 4); + + int64_t total_original = rows * cols; + int32_t groupsize = 256; + int32_t alignment = blocksize * groupsize; + int64_t total_aligned = ((total_original + alignment - 1) / alignment) * alignment; + + size_t num_blocks = total_aligned / blocksize; + size_t num_groups = total_aligned / (blocksize * groupsize); + + std::cout << "原始元素: " << total_original << std::endl; + std::cout << "对齐元素: " << total_aligned << std::endl; + + packed_weights.resize(total_aligned / 2); + absmax_q.resize(num_blocks); + absmax2.resize(num_groups); + code2.resize(256); + + file.read(reinterpret_cast(packed_weights.data()), packed_weights.size()); + file.read(reinterpret_cast(absmax_q.data()), absmax_q.size()); + file.read(reinterpret_cast(absmax2.data()), absmax2.size() * 2); + file.read(reinterpret_cast(code2.data()), 256 * 2); + file.read(reinterpret_cast(&offset), 4); +} + + +__constant__ float c_code2[256]; +// NF4 table +__constant__ float c_nf4[16] = +{ + -1.00000000f, -0.69619280f, -0.52507305f, -0.39491749f, + -0.28444138f, -0.18477343f, -0.09105004f, 0.00000000f, + 0.07958030f, 0.16093020f, 0.24611230f, 0.33791524f, + 0.44070983f, 0.56261700f, 0.72295684f, 1.00000000f +}; + + +__global__ void dequantize_nf4_kernel_naive_v2 +( + const uint8_t* __restrict__ packed_weights, + const uint8_t* __restrict__ absmax_q, + const half* __restrict__ absmax2, + uint32_t* __restrict__ output_packed, + int num_bytes, + int block_size, + int group_size, + float offset +) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= num_bytes) return; + + int element_idx = tid * 2; + int block_idx = element_idx / block_size; + int group_idx = block_idx / group_size; + + float scale_1 = c_code2[absmax_q[block_idx]] + offset; + float scale_2 = __half2float(absmax2[group_idx]); + float final_scale = scale_1 * scale_2; + + uint8_t byte_val = packed_weights[tid]; + uint8_t idx_0 = byte_val >> 4; + uint8_t idx_1 = byte_val & 0x0F; + + float v0_fp32 = c_nf4[idx_0] * final_scale; + float v1_fp32 = c_nf4[idx_1] * final_scale; + + __nv_bfloat16 v0_bf16 = __float2bfloat16(v0_fp32); + __nv_bfloat16 v1_bf16 = __float2bfloat16(v1_fp32); + + uint16_t bits_0 = *reinterpret_cast(&v0_bf16); + uint16_t bits_1 = *reinterpret_cast(&v1_bf16); + + output_packed[tid] = ((uint32_t)bits_1 << 16) | (uint32_t)bits_0; +} + +void nf4_dequantize_cuda +( + std::vector& h_packed_weights, + std::vector& h_absmax_q, + std::vector& h_absmax2, + std::vector& h_code2, + int64_t rows, int64_t cols, int32_t blocksize, int32_t goupsize,float offset +) +{ + size_t num_bytes = h_packed_weights.size(); + size_t out_size = num_bytes * sizeof(uint32_t); + + float h_code2_f32[256]; + for(int i = 0; i < 256; ++i) + { + __half h_val = *reinterpret_cast<__half*>(&h_code2[i]); + h_code2_f32[i] = (float)h_val; + } + CHECK_CUDA(cudaMemcpyToSymbol(c_code2, h_code2_f32, sizeof(h_code2_f32))); + + uint8_t *d_packed, *d_absmax_q; + half *d_absmax2; + uint32_t *d_output; + + CHECK_CUDA(cudaMalloc(&d_packed, h_packed_weights.size())); + CHECK_CUDA(cudaMalloc(&d_absmax_q, h_absmax_q.size())); + CHECK_CUDA(cudaMalloc(&d_absmax2, h_absmax2.size() * 2)); + CHECK_CUDA(cudaMalloc(&d_output, out_size)); + + CHECK_CUDA(cudaMemcpy(d_packed, h_packed_weights.data(), h_packed_weights.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax_q, h_absmax_q.data(), h_absmax_q.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax2, h_absmax2.data(), h_absmax2.size() * 2, cudaMemcpyHostToDevice)); + + int threadsPerBlock = 256; + int num_elements_vec = num_bytes; + int blocksPerGrid = (num_elements_vec + threadsPerBlock - 1) / threadsPerBlock; + + // warm up + for (int i = 0; i < 5; ++i) + { + dequantize_nf4_kernel_naive_v2<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, blocksize, goupsize, offset); + } + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); // 开始记录 + std::cout << "Launch Kernel...\n"; + for (int i = 0; i < 10; ++i) + { + dequantize_nf4_kernel_naive_v2<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, blocksize, goupsize, offset); + } + cudaEventRecord(stop); // 结束记录 + + CHECK_CUDA(cudaEventSynchronize(stop)); // 等待 Event 完成 + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + // 计算带宽 + // 读取: Packed(1) + Indices(1) + Scales(2) + Code2(忽略不计) + size_t total_read = h_packed_weights.size() + h_absmax_q.size() + h_absmax2.size() * 2; + // 写入: Output(4) (因为每个packed byte生成一个uint32) + size_t total_write = num_bytes * 4; + + double total_bytes = (double)(total_read + total_write); + double gb_per_sec = (total_bytes * 10 / 1e9) / (milliseconds / 1000.0); + + std::cout << "Kernel 耗时: " << milliseconds << " ms" << std::endl; + std::cout << "有效带宽: " << gb_per_sec << " GB/s" << std::endl; + + // 保存时间与带宽 + std::ofstream timefile("./data/log/log_cpp.txt"); + timefile << milliseconds << "," << gb_per_sec; + timefile.close(); + + std::vector h_output(num_bytes); + CHECK_CUDA(cudaMemcpy(h_output.data(), d_output, out_size, cudaMemcpyDeviceToHost)); + + const std::string output_path = "./data/cpp_output.bin"; + std::ofstream outfile(output_path, std::ios::binary); + outfile.write(reinterpret_cast(h_output.data()), out_size); + outfile.close(); + + cudaFree(d_packed); cudaFree(d_absmax_q); cudaFree(d_absmax2); cudaFree(d_output); + cudaEventDestroy(start); cudaEventDestroy(stop); +} + +int main() +{ + const std::string file_path = "../data/qlora_test.bin"; + std::vector packed_weights, absmax_q; + std::vector absmax2, code2; + float offset; + int64_t rows, cols; + int32_t blocksize; + int32_t groupsize = 256; + read_file(file_path, packed_weights, absmax_q, absmax2, code2, offset, rows, cols, blocksize); + nf4_dequantize_cuda(packed_weights, absmax_q, absmax2, code2, rows, cols, blocksize, groupsize, offset); + + return 0; +} \ No newline at end of file diff --git a/03_nf4_dequant/kernel_test/shared_memory.cu b/03_nf4_dequant/kernel_test/shared_memory.cu new file mode 100644 index 00000000..e8bbc1e2 --- /dev/null +++ b/03_nf4_dequant/kernel_test/shared_memory.cu @@ -0,0 +1,240 @@ +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_CUDA(call) \ + { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA Error: " << cudaGetErrorString(err) \ + << " at line " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + +void read_file( + const std::string file_path, std::vector& packed_weights, + std::vector& absmax_q, + std::vector& absmax2, + std::vector& code2, float& offset, + int64_t& rows, int64_t& cols, int32_t& blocksize +) +{ + std::ifstream file(file_path, std::ios::binary); + if (!file) { std::cerr << "无法打开文件" << std::endl; exit(1); } + + file.read(reinterpret_cast(&rows), 8); + file.read(reinterpret_cast(&cols), 8); + file.read(reinterpret_cast(&blocksize), 4); + + int64_t total_original = rows * cols; + int32_t groupsize = 256; + int32_t alignment = blocksize * groupsize; + int64_t total_aligned = ((total_original + alignment - 1) / alignment) * alignment; + + size_t num_blocks = total_aligned / blocksize; + size_t num_groups = total_aligned / (blocksize * groupsize); + + std::cout << "原始元素: " << total_original << std::endl; + std::cout << "对齐元素: " << total_aligned << std::endl; + + packed_weights.resize(total_aligned / 2); + absmax_q.resize(num_blocks); + absmax2.resize(num_groups); + code2.resize(256); + + file.read(reinterpret_cast(packed_weights.data()), packed_weights.size()); + file.read(reinterpret_cast(absmax_q.data()), absmax_q.size()); + file.read(reinterpret_cast(absmax2.data()), absmax2.size() * 2); + file.read(reinterpret_cast(code2.data()), 256 * 2); + file.read(reinterpret_cast(&offset), 4); +} + + +__constant__ float c_code2[256]; +// NF4 table +__constant__ float c_nf4[16] = +{ + -1.00000000f, -0.69619280f, -0.52507305f, -0.39491749f, + -0.28444138f, -0.18477343f, -0.09105004f, 0.00000000f, + 0.07958030f, 0.16093020f, 0.24611230f, 0.33791524f, + 0.44070983f, 0.56261700f, 0.72295684f, 1.00000000f +}; + +__global__ void dequantize_nf4_kernel_smem +( + const uint8_t* __restrict__ packed_weights, + const uint8_t* __restrict__ absmax_q, + const half* __restrict__ absmax2, + uint32_t* __restrict__ output_packed, + int num_bytes, + int block_size, + int group_size, + float offset +) +{ + __shared__ float s_nf4[16]; + + int tx = threadIdx.x; + if (tx < 16) + { + s_nf4[tx] = c_nf4[tx]; + } + __syncthreads(); + + // 每个线程处理 4 个输入字节(即 8 个 NF4 权重) + // 所以总线程数只需要是 num_bytes / 4 + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid * 4 >= num_bytes) return; + + // 向量化读取:一次吞下 4 个字节 (32-bit) + uint32_t packed_4bytes = reinterpret_cast(packed_weights)[tid]; + + // 准备 128-bit 的输出容器 (4 个 uint32_t,每个 uint32_t 包含 2 个 bf16) + uint32_t out_vec[4]; + +#pragma unroll + for (int i = 0; i < 4; ++i) + { + uint8_t byte_val = (packed_4bytes >> (i * 8)) & 0xFF; + + // 计算当前权重的全局索引 + int element_idx = (tid * 4 + i) * 2; + int block_idx = element_idx / block_size; + int group_idx = block_idx / group_size; + + float scale_1 = c_code2[absmax_q[block_idx]]; + + float scale_2 = __half2float(absmax2[group_idx]); + float final_scale = scale_1 * scale_2; + + // 解码两个 NF4 + uint8_t idx_0 = byte_val >> 4; + uint8_t idx_1 = byte_val & 0x0F; + + float v0 = s_nf4[idx_0] * final_scale; + float v1 = s_nf4[idx_1] * final_scale; + + __nv_bfloat16 b0 = __float2bfloat16(v0); + __nv_bfloat16 b1 = __float2bfloat16(v1); + + uint16_t bits_0 = *reinterpret_cast(&b0); + uint16_t bits_1 = *reinterpret_cast(&b1); + + // 打包结果存入临时数组 + out_vec[i] = ((uint32_t)bits_1 << 16) | (uint32_t)bits_0; + } + + // 向量化写入 + reinterpret_cast(output_packed)[tid] = *reinterpret_cast(out_vec); +} + +void nf4_dequantize_cuda +( + std::vector& h_packed_weights, + std::vector& h_absmax_q, + std::vector& h_absmax2, + std::vector& h_code2, + int64_t rows, int64_t cols, int32_t blocksize, int32_t goupsize,float offset +) +{ + size_t num_bytes = h_packed_weights.size(); + size_t out_size = num_bytes * sizeof(uint32_t); + + float h_code2_f32[256]; + for(int i = 0; i < 256; ++i) + { + __half h_val = *reinterpret_cast<__half*>(&h_code2[i]); + h_code2_f32[i] = (float)h_val; + } + CHECK_CUDA(cudaMemcpyToSymbol(c_code2, h_code2_f32, sizeof(h_code2_f32))); + + uint8_t *d_packed, *d_absmax_q; + half *d_absmax2; + uint32_t *d_output; + + CHECK_CUDA(cudaMalloc(&d_packed, h_packed_weights.size())); + CHECK_CUDA(cudaMalloc(&d_absmax_q, h_absmax_q.size())); + CHECK_CUDA(cudaMalloc(&d_absmax2, h_absmax2.size() * 2)); + CHECK_CUDA(cudaMalloc(&d_output, out_size)); + + CHECK_CUDA(cudaMemcpy(d_packed, h_packed_weights.data(), h_packed_weights.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax_q, h_absmax_q.data(), h_absmax_q.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax2, h_absmax2.data(), h_absmax2.size() * 2, cudaMemcpyHostToDevice)); + + int threadsPerBlock = 256; + int num_elements_vec = (num_bytes + 3) / 4; + int blocksPerGrid = (num_elements_vec + threadsPerBlock - 1) / threadsPerBlock; + + // warm up + for (int i = 0; i < 5; ++i) + { + dequantize_nf4_kernel_smem<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, blocksize, goupsize, offset); + } + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); // 开始记录 + std::cout << "Launch Kernel...\n"; + for (int i = 0; i < 10; ++i) + { + dequantize_nf4_kernel_smem<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, blocksize, goupsize, offset); + } + cudaEventRecord(stop); // 结束记录 + + CHECK_CUDA(cudaEventSynchronize(stop)); // 等待 Event 完成 + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + // 计算带宽 + // 读取: Packed(1) + Indices(1) + Scales(2) + Code2(忽略不计) + size_t total_read = h_packed_weights.size() + h_absmax_q.size() + h_absmax2.size() * 2; + // 写入: Output(4) (因为每个packed byte生成一个uint32) + size_t total_write = num_bytes * 4; + + double total_bytes = (double)(total_read + total_write); + double gb_per_sec = (total_bytes * 10 / 1e9) / (milliseconds / 1000.0); + + std::cout << "Kernel 耗时: " << milliseconds << " ms" << std::endl; + std::cout << "有效带宽: " << gb_per_sec << " GB/s" << std::endl; + + // 保存时间与带宽 + std::ofstream timefile("./data/log/log_cpp.txt"); + timefile << milliseconds << "," << gb_per_sec; + timefile.close(); + + std::vector h_output(num_bytes); + CHECK_CUDA(cudaMemcpy(h_output.data(), d_output, out_size, cudaMemcpyDeviceToHost)); + + const std::string output_path = "./data/cpp_output.bin"; + std::ofstream outfile(output_path, std::ios::binary); + outfile.write(reinterpret_cast(h_output.data()), out_size); + outfile.close(); + + cudaFree(d_packed); cudaFree(d_absmax_q); cudaFree(d_absmax2); cudaFree(d_output); + cudaEventDestroy(start); cudaEventDestroy(stop); +} + +int main() +{ + const std::string file_path = "../data/qlora_test.bin"; + std::vector packed_weights, absmax_q; + std::vector absmax2, code2; + float offset; + int64_t rows, cols; + int32_t blocksize; + int32_t groupsize = 256; + read_file(file_path, packed_weights, absmax_q, absmax2, code2, offset, rows, cols, blocksize); + nf4_dequantize_cuda(packed_weights, absmax_q, absmax2, code2, rows, cols, blocksize, groupsize, offset); + + return 0; +} \ No newline at end of file diff --git a/03_nf4_dequant/kernel_test/vector.cu b/03_nf4_dequant/kernel_test/vector.cu new file mode 100644 index 00000000..c9f9d066 --- /dev/null +++ b/03_nf4_dequant/kernel_test/vector.cu @@ -0,0 +1,230 @@ +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_CUDA(call) \ + { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA Error: " << cudaGetErrorString(err) \ + << " at line " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + +void read_file( + const std::string file_path, std::vector& packed_weights, + std::vector& absmax_q, + std::vector& absmax2, + std::vector& code2, float& offset, + int64_t& rows, int64_t& cols, int32_t& blocksize +) +{ + std::ifstream file(file_path, std::ios::binary); + if (!file) { std::cerr << "无法打开文件" << std::endl; exit(1); } + + file.read(reinterpret_cast(&rows), 8); + file.read(reinterpret_cast(&cols), 8); + file.read(reinterpret_cast(&blocksize), 4); + + int64_t total_original = rows * cols; + int32_t groupsize = 256; + int32_t alignment = blocksize * groupsize; + int64_t total_aligned = ((total_original + alignment - 1) / alignment) * alignment; + + size_t num_blocks = total_aligned / blocksize; + size_t num_groups = total_aligned / (blocksize * groupsize); + + std::cout << "原始元素: " << total_original << std::endl; + std::cout << "对齐元素: " << total_aligned << std::endl; + + packed_weights.resize(total_aligned / 2); + absmax_q.resize(num_blocks); + absmax2.resize(num_groups); + code2.resize(256); + + file.read(reinterpret_cast(packed_weights.data()), packed_weights.size()); + file.read(reinterpret_cast(absmax_q.data()), absmax_q.size()); + file.read(reinterpret_cast(absmax2.data()), absmax2.size() * 2); + file.read(reinterpret_cast(code2.data()), 256 * 2); + file.read(reinterpret_cast(&offset), 4); +} + + +__constant__ float c_code2[256]; +// NF4 table +__constant__ float c_nf4[16] = +{ + -1.00000000f, -0.69619280f, -0.52507305f, -0.39491749f, + -0.28444138f, -0.18477343f, -0.09105004f, 0.00000000f, + 0.07958030f, 0.16093020f, 0.24611230f, 0.33791524f, + 0.44070983f, 0.56261700f, 0.72295684f, 1.00000000f +}; + +__global__ void dequantize_nf4_kernel_vec +( + const uint8_t* __restrict__ packed_weights, + const uint8_t* __restrict__ absmax_q, + const half* __restrict__ absmax2, + uint32_t* __restrict__ output_packed, // 注意:这里虽然是指针,但我们会强转写 int4 + int num_bytes, // 这里的 num_bytes 指的是 input 的字节数 + int block_size, + int group_size, + float offset +) +{ + // 每个线程处理 4 个输入字节(即 8 个 NF4 权重) + // 所以总线程数只需要是 num_bytes / 4 + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // 越界检查:注意现在的粒度是 4 字节 + if (tid * 4 >= num_bytes) return; + + // 向量化读取:一次吞下 4 个字节 (32-bit) + uint32_t packed_4bytes = reinterpret_cast(packed_weights)[tid]; + + // 准备 128-bit 的输出容器 (4 个 uint32_t,每个 uint32_t 包含 2 个 bf16) + uint32_t out_vec[4]; + +#pragma unroll + for (int i = 0; i < 4; ++i) { + uint8_t byte_val = (packed_4bytes >> (i * 8)) & 0xFF; + + // 计算当前权重的全局索引 + int element_idx = (tid * 4 + i) * 2; + int block_idx = element_idx / block_size; + int group_idx = block_idx / group_size; + + float scale_1 = c_code2[absmax_q[block_idx]]; + float scale_2 = __half2float(absmax2[group_idx]); + float final_scale = scale_1 * scale_2; + + // 解码两个 NF4 + uint8_t idx_0 = byte_val >> 4; + uint8_t idx_1 = byte_val & 0x0F; + + float v0 = c_nf4[idx_0] * final_scale; + float v1 = c_nf4[idx_1] * final_scale; + + __nv_bfloat16 b0 = __float2bfloat16(v0); + __nv_bfloat16 b1 = __float2bfloat16(v1); + + uint16_t bits_0 = *reinterpret_cast(&b0); + uint16_t bits_1 = *reinterpret_cast(&b1); + + // 打包结果存入临时数组 + out_vec[i] = ((uint32_t)bits_1 << 16) | (uint32_t)bits_0; + } + + // 向量化写入 + reinterpret_cast(output_packed)[tid] = *reinterpret_cast(out_vec); +} + +void nf4_dequantize_cuda +( + std::vector& h_packed_weights, + std::vector& h_absmax_q, + std::vector& h_absmax2, + std::vector& h_code2, + int64_t rows, int64_t cols, int32_t blocksize, int32_t goupsize,float offset +) +{ + size_t num_bytes = h_packed_weights.size(); + size_t out_size = num_bytes * sizeof(uint32_t); + + float h_code2_f32[256]; + for(int i = 0; i < 256; ++i) + { + __half h_val = *reinterpret_cast<__half*>(&h_code2[i]); + h_code2_f32[i] = (float)h_val; + } + CHECK_CUDA(cudaMemcpyToSymbol(c_code2, h_code2_f32, sizeof(h_code2_f32))); + + uint8_t *d_packed, *d_absmax_q; + half *d_absmax2; + uint32_t *d_output; + + CHECK_CUDA(cudaMalloc(&d_packed, h_packed_weights.size())); + CHECK_CUDA(cudaMalloc(&d_absmax_q, h_absmax_q.size())); + CHECK_CUDA(cudaMalloc(&d_absmax2, h_absmax2.size() * 2)); + CHECK_CUDA(cudaMalloc(&d_output, out_size)); + + CHECK_CUDA(cudaMemcpy(d_packed, h_packed_weights.data(), h_packed_weights.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax_q, h_absmax_q.data(), h_absmax_q.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax2, h_absmax2.data(), h_absmax2.size() * 2, cudaMemcpyHostToDevice)); + + int threadsPerBlock = 256; + int num_elements_vec = (num_bytes + 3) / 4; + int blocksPerGrid = (num_elements_vec + threadsPerBlock - 1) / threadsPerBlock; + + // warm up + for (int i = 0; i < 5; ++i) + { + dequantize_nf4_kernel_vec<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, blocksize, goupsize, offset); + } + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); // 开始记录 + std::cout << "Launch Kernel...\n"; + for (int i = 0; i < 10; ++i) + { + dequantize_nf4_kernel_vec<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, blocksize, goupsize, offset); + } + cudaEventRecord(stop); // 结束记录 + + CHECK_CUDA(cudaEventSynchronize(stop)); // 等待 Event 完成 + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + // 计算带宽 + // 读取: Packed(1) + Indices(1) + Scales(2) + Code2(忽略不计) + size_t total_read = h_packed_weights.size() + h_absmax_q.size() + h_absmax2.size() * 2; + // 写入: Output(4) (因为每个packed byte生成一个uint32) + size_t total_write = num_bytes * 4; + + double total_bytes = (double)(total_read + total_write); + double gb_per_sec = (total_bytes * 10 / 1e9) / (milliseconds / 1000.0); + + std::cout << "Kernel 耗时: " << milliseconds << " ms" << std::endl; + std::cout << "有效带宽: " << gb_per_sec << " GB/s" << std::endl; + + // 保存时间与带宽 + std::ofstream timefile("./data/log/log_cpp.txt"); + timefile << milliseconds << "," << gb_per_sec; + timefile.close(); + + std::vector h_output(num_bytes); + CHECK_CUDA(cudaMemcpy(h_output.data(), d_output, out_size, cudaMemcpyDeviceToHost)); + + const std::string output_path = "./data/cpp_output.bin"; + std::ofstream outfile(output_path, std::ios::binary); + outfile.write(reinterpret_cast(h_output.data()), out_size); + outfile.close(); + + cudaFree(d_packed); cudaFree(d_absmax_q); cudaFree(d_absmax2); cudaFree(d_output); + cudaEventDestroy(start); cudaEventDestroy(stop); +} + +int main() +{ + const std::string file_path = "../data/qlora_test.bin"; + std::vector packed_weights, absmax_q; + std::vector absmax2, code2; + float offset; + int64_t rows, cols; + int32_t blocksize; + int32_t groupsize = 256; + read_file(file_path, packed_weights, absmax_q, absmax2, code2, offset, rows, cols, blocksize); + nf4_dequantize_cuda(packed_weights, absmax_q, absmax2, code2, rows, cols, blocksize, groupsize, offset); + + return 0; +} \ No newline at end of file diff --git a/03_nf4_dequant/main.py b/03_nf4_dequant/main.py new file mode 100644 index 00000000..c04b5dfc --- /dev/null +++ b/03_nf4_dequant/main.py @@ -0,0 +1,29 @@ +import subprocess +import sys + +def run_command(command, description): + print(f"--- 正在执行: {description} ---") + try: + subprocess.run(command, shell=True, check=True) + except subprocess.CalledProcessError as e: + print(f"错误: {description} 失败!") + sys.exit(1) + +def main(): + + # 编译 CUDA Kernel + compile_cmd = "nvcc -O3 ./src/nf4_kernel.cu -o nf4_kernel" + run_command(compile_cmd, "编译 C++ CUDA 内核") + + # 生成测试数据 + run_command("python ./src/generate_data.py", "生成随机量化权重") + + # 运行 C++ 反量化 + run_command("./nf4_kernel", "运行自定义 CUDA 内核") + + # 运行对比分析脚本 + run_command("python ./src/benchmark.py", "对比精度与性能表现") + + run_command("ncu --set full -o nf4 -f ./nf4_kernel", "生成ncu-rep文件") +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/03_nf4_dequant/nf4_kernel b/03_nf4_dequant/nf4_kernel new file mode 100755 index 00000000..2734ca3c Binary files /dev/null and b/03_nf4_dequant/nf4_kernel differ diff --git a/03_nf4_dequant/src/benchmark.py b/03_nf4_dequant/src/benchmark.py new file mode 100644 index 00000000..c1a9f894 --- /dev/null +++ b/03_nf4_dequant/src/benchmark.py @@ -0,0 +1,95 @@ +import torch +import numpy as np +import random +from get_param import parse_params + +CONFIG_FILE = "./data/param.txt" +PY_FILE = "./data/py_output.bin" +CPP_FILE = "./data/cpp_output.bin" +PY_LOG = "./data/log/log_py.txt" +CPP_LOG = "./data/log/log_cpp.txt" + +def load_bf16_tensor(file_path, rows, cols): + """ + 读取二进制文件,并根据原始维度进行裁剪和重塑 + """ + raw_data = np.fromfile(file_path, dtype=np.int16) + total_original = rows * cols + + # 裁剪掉末尾的 Padding 元素 + if raw_data.size > total_original: + raw_data = raw_data[:total_original] + elif raw_data.size < total_original: + print(f"错误: 文件数据量 ({raw_data.size}) 小于原始维度需求 ({total_original})") + return None + + tensor_bf16 = torch.from_numpy(raw_data).view(torch.bfloat16) + return tensor_bf16.reshape(rows, cols).to(torch.float32) + +def main(): + config = parse_params(CONFIG_FILE) + ROWS = config["rows"] + COLS = config["cols"] + + print(f"测试维度: {ROWS} x {COLS}") + print(f"Python: {PY_FILE}") + print(f"C++: {CPP_FILE}") + + t_py = load_bf16_tensor(PY_FILE, ROWS, COLS) + t_cpp = load_bf16_tensor(CPP_FILE, ROWS, COLS) + + diff = t_py - t_cpp + abs_diff = torch.abs(diff) + mae = torch.mean(abs_diff).item() + mse = torch.mean(diff ** 2).item() + max_diff = torch.max(abs_diff).item() + + exact_match_count = torch.sum(t_py == t_cpp).item() + total_elements = t_py.numel() + match_rate = (exact_match_count / total_elements) * 100 + + print("\n" + "="*50) + print("误差分析报告") + print("="*50) + print(f"平均绝对误差 (MAE):{mae:.10e}") + print(f"均方误差 (MSE):{mse:.10e}") + print(f"最大误差 (Max Diff):{max_diff:.10e}") + print("-" * 50) + print(f"完全一致元素数:{exact_match_count} / {total_elements}") + print(f"一致率:{match_rate:.4f}%") + print("="*50) + + print("\n 随机数据采样对比") + print("-" * 75) + print(f"{'Index':<10} | {'Python (BF16)':<18} | {'C++ (BF16)':<18} | {'Diff':<15}") + print("-" * 75) + + flat_py = t_py.flatten() + flat_cpp = t_cpp.flatten() + + random_indices = random.sample(range(total_elements), 10) + + check_indices = [0, 1, 2] + random_indices + for idx in check_indices: + val_py = flat_py[idx].item() + val_cpp = flat_cpp[idx].item() + val_diff = abs(val_py - val_cpp) + + print(f"{idx:<10} | {val_py:<18.6f} | {val_cpp:<18.6f} | {val_diff:<15.4e}") + print("-" * 75) + + + with open(PY_LOG, 'r', encoding='utf-8') as f: + line = f.readline() + py_time, py_bandwidth = [float(x.strip()) for x in line.split(',')] + print(f"bnb耗时:{py_time:.5f}ms, 带宽:{py_bandwidth:.5f}GB/s") + + with open(CPP_LOG, 'r', encoding='utf-8') as f: + line = f.readline() + cpp_time, cpp_bandwidth = [float(x.strip()) for x in line.split(',')] + print(f"nf4 kernel耗时:{cpp_time:.5f}ms, 带宽:{cpp_bandwidth:.5f}GB/s") + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/03_nf4_dequant/src/generate_data.py b/03_nf4_dequant/src/generate_data.py new file mode 100644 index 00000000..b45311a6 --- /dev/null +++ b/03_nf4_dequant/src/generate_data.py @@ -0,0 +1,100 @@ +import torch +from bitsandbytes import functional as F +import struct +import os +import math +from get_param import parse_params + +def generate_data(config_file="./data/param.txt"): + if not os.path.exists("./data"): os.makedirs("./data") + if not os.path.exists("./data/log"): os.makedirs("./data/log") + + # 获取动态配置 + config = parse_params(config_file) + blocksize, groupsize = config["blocksize"], config["groupsize"] + rows, cols = config["rows"], config["cols"] + device = "cuda" if torch.cuda.is_available() else "cpu" + dtype = torch.bfloat16 if config["compute_type"] == "bf16" else torch.float16 + + print(f"维度: {rows}x{cols} | Block: {blocksize} | Group: {groupsize}") + + # 权重生成与 Padding 对齐 + original_weights = torch.randn(rows, cols, dtype=dtype, device=device) * 0.5 + total_original = rows * cols + alignment = blocksize * groupsize + padded_total = math.ceil(total_original / alignment) * alignment + + if padded_total > total_original: + print(f"填充 {padded_total - total_original} 个元素以对齐") + flat_weights = original_weights.reshape(-1) + padding = torch.zeros(padded_total - total_original, dtype=dtype, device=device) + aligned_weights = torch.cat([flat_weights, padding], dim=0).reshape(1, -1) + else: + aligned_weights = original_weights.reshape(1, -1) + + # 一级量化 (NF4) + q_weight, state_1 = F.quantize_4bit(aligned_weights, blocksize=blocksize, quant_type='nf4') + # 二级量化 (FP8) + absmax_1 = state_1.absmax + absmax_q, state_2 = F.quantize_blockwise(absmax_1, blocksize=groupsize, nested=False) + + # 写入二进制文件 + code2_np = state_2.code.to(torch.float16).cpu().numpy() + absmax2_np = state_2.absmax.to(torch.float16).cpu().numpy() + + bin_path = "./data/qlora_test.bin" + with open(bin_path, "wb") as f: + f.write(struct.pack(" +#include +#include +#include +#include +#include +#include + +#define CHECK_CUDA(call) \ + { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA Error: " << cudaGetErrorString(err) \ + << " at line " << __LINE__ << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } + +void read_file( + const std::string file_path, std::vector& packed_weights, + std::vector& absmax_q, + std::vector& absmax2, + std::vector& code2, float& offset, + int64_t& rows, int64_t& cols, int32_t& blocksize +) +{ + std::ifstream file(file_path, std::ios::binary); + if (!file) { std::cerr << "无法打开文件" << std::endl; exit(1); } + + file.read(reinterpret_cast(&rows), 8); + file.read(reinterpret_cast(&cols), 8); + file.read(reinterpret_cast(&blocksize), 4); + + int64_t total_original = rows * cols; + int32_t groupsize = 256; + int32_t alignment = blocksize * groupsize; + int64_t total_aligned = ((total_original + alignment - 1) / alignment) * alignment; + + size_t num_blocks = total_aligned / blocksize; + size_t num_groups = total_aligned / (blocksize * groupsize); + + std::cout << "原始元素: " << total_original << std::endl; + std::cout << "对齐元素: " << total_aligned << std::endl; + + packed_weights.resize(total_aligned / 2); + absmax_q.resize(num_blocks); + absmax2.resize(num_groups); + code2.resize(256); + + file.read(reinterpret_cast(packed_weights.data()), packed_weights.size()); + file.read(reinterpret_cast(absmax_q.data()), absmax_q.size()); + file.read(reinterpret_cast(absmax2.data()), absmax2.size() * 2); + file.read(reinterpret_cast(code2.data()), 256 * 2); + file.read(reinterpret_cast(&offset), 4); +} + + +__constant__ float c_code2[256]; +// NF4 table +__constant__ float c_nf4[16] = +{ + -1.00000000f, -0.69619280f, -0.52507305f, -0.39491749f, + -0.28444138f, -0.18477343f, -0.09105004f, 0.00000000f, + 0.07958030f, 0.16093020f, 0.24611230f, 0.33791524f, + 0.44070983f, 0.56261700f, 0.72295684f, 1.00000000f +}; + +__global__ void dequantize_nf4_kernel +( + const uint8_t* __restrict__ packed_weights, + const uint8_t* __restrict__ absmax_q, + const half* __restrict__ absmax2, + uint32_t* __restrict__ output_packed, + int num_bytes, + int block_shift, + int group_shift, + float offset +) +{ + __shared__ float s_nf4[16]; + + int tx = threadIdx.x; + if (tx < 16) + { + s_nf4[tx] = c_nf4[tx]; + } + __syncthreads(); + + // 每个线程处理 4 个输入字节(即 8 个 NF4 权重) + // 所以总线程数只需要是 num_bytes / 4 + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid * 4 >= num_bytes) return; + + // 向量化读取:一次吞下 4 个字节 (32-bit) + uint32_t packed_4bytes = reinterpret_cast(packed_weights)[tid]; + + // 准备 128-bit 的输出容器 (4 个 uint32_t,每个 uint32_t 包含 2 个 bf16) + uint32_t out_vec[4]; + +#pragma unroll + for (int i = 0; i < 4; ++i) + { + uint8_t byte_val = (packed_4bytes >> (i * 8)) & 0xFF; + + // 计算当前权重的全局索引 + int element_idx = (tid * 4 + i) * 2; + int block_idx = element_idx >> block_shift; + int group_idx = block_idx >> group_shift; + + float scale_1 = c_code2[absmax_q[block_idx]]; + + float scale_2 = __half2float(absmax2[group_idx]); + float final_scale = scale_1 * scale_2; + + // 解码两个 NF4 + uint8_t idx_0 = byte_val >> 4; + uint8_t idx_1 = byte_val & 0x0F; + + float v0 = s_nf4[idx_0] * final_scale; + float v1 = s_nf4[idx_1] * final_scale; + + __nv_bfloat16 b0 = __float2bfloat16(v0); + __nv_bfloat16 b1 = __float2bfloat16(v1); + + uint16_t bits_0 = *reinterpret_cast(&b0); + uint16_t bits_1 = *reinterpret_cast(&b1); + + // 打包结果存入临时数组 + out_vec[i] = ((uint32_t)bits_1 << 16) | (uint32_t)bits_0; + } + + // 向量化写入 + reinterpret_cast(output_packed)[tid] = *reinterpret_cast(out_vec); +} +void nf4_dequantize_cuda +( + std::vector& h_packed_weights, + std::vector& h_absmax_q, + std::vector& h_absmax2, + std::vector& h_code2, + int64_t rows, int64_t cols, int32_t blocksize, int32_t groupsize,float offset +) +{ + size_t num_bytes = h_packed_weights.size(); + size_t out_size = num_bytes * sizeof(uint32_t); + + float h_code2_f32[256]; + for(int i = 0; i < 256; ++i) + { + __half h_val = *reinterpret_cast<__half*>(&h_code2[i]); + h_code2_f32[i] = (float)h_val; + } + CHECK_CUDA(cudaMemcpyToSymbol(c_code2, h_code2_f32, sizeof(h_code2_f32))); + + uint8_t *d_packed, *d_absmax_q; + half *d_absmax2; + uint32_t *d_output; + + CHECK_CUDA(cudaMalloc(&d_packed, h_packed_weights.size())); + CHECK_CUDA(cudaMalloc(&d_absmax_q, h_absmax_q.size())); + CHECK_CUDA(cudaMalloc(&d_absmax2, h_absmax2.size() * 2)); + CHECK_CUDA(cudaMalloc(&d_output, out_size)); + + CHECK_CUDA(cudaMemcpy(d_packed, h_packed_weights.data(), h_packed_weights.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax_q, h_absmax_q.data(), h_absmax_q.size(), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_absmax2, h_absmax2.data(), h_absmax2.size() * 2, cudaMemcpyHostToDevice)); + + int threadsPerBlock = 256; + int num_elements_vec = (num_bytes + 3) / 4; + int blocksPerGrid = (num_elements_vec + threadsPerBlock - 1) / threadsPerBlock; + + int block_shift = log2(blocksize); + int group_shift = log2(groupsize); + + // warm up + for (int i = 0; i < 5; ++i) + { + dequantize_nf4_kernel<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, block_shift, group_shift, offset); + } + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); // 开始记录 + std::cout << "Launch Kernel...\n"; + for (int i = 0; i < 10; ++i) + { + dequantize_nf4_kernel<<>> + (d_packed, d_absmax_q, d_absmax2, d_output, num_bytes, block_shift, group_shift, offset); + } + cudaEventRecord(stop); // 结束记录 + + CHECK_CUDA(cudaEventSynchronize(stop)); // 等待 Event 完成 + + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + // 计算带宽 + // 读取: Packed(1) + Indices(1) + Scales(2) + Code2(忽略不计) + size_t total_read = h_packed_weights.size() + h_absmax_q.size() + h_absmax2.size() * 2; + // 写入: Output(4) (因为每个packed byte生成一个uint32) + size_t total_write = num_bytes * 4; + + double total_bytes = (double)(total_read + total_write); + double gb_per_sec = (total_bytes * 10 / 1e9) / (milliseconds / 1000.0); + + std::cout << "Kernel 耗时: " << milliseconds << " ms" << std::endl; + std::cout << "有效带宽: " << gb_per_sec << " GB/s" << std::endl; + + // 保存时间与带宽 + std::ofstream timefile("./data/log/log_cpp.txt"); + timefile << milliseconds << "," << gb_per_sec; + timefile.close(); + + std::vector h_output(num_bytes); + CHECK_CUDA(cudaMemcpy(h_output.data(), d_output, out_size, cudaMemcpyDeviceToHost)); + + const std::string output_path = "./data/cpp_output.bin"; + std::ofstream outfile(output_path, std::ios::binary); + outfile.write(reinterpret_cast(h_output.data()), out_size); + outfile.close(); + + cudaFree(d_packed); cudaFree(d_absmax_q); cudaFree(d_absmax2); cudaFree(d_output); + cudaEventDestroy(start); cudaEventDestroy(stop); +} + +int main() +{ + const std::string file_path = "./data/qlora_test.bin"; + std::vector packed_weights, absmax_q; + std::vector absmax2, code2; + float offset; + int64_t rows, cols; + int32_t blocksize; + int32_t groupsize = 256; + read_file(file_path, packed_weights, absmax_q, absmax2, code2, offset, rows, cols, blocksize); + nf4_dequantize_cuda(packed_weights, absmax_q, absmax2, code2, rows, cols, blocksize, groupsize, offset); + + return 0; +} \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 52c5e5f5..00000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2026 InfiniTensor - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md deleted file mode 100644 index df3785a5..00000000 --- a/README.md +++ /dev/null @@ -1,101 +0,0 @@ -# Learning-CUDA - -本项目为 2025 年冬季 InfiniTensor 大模型与人工智能系统训练营 CUDA 方向专业阶段的作业与项目系统。 - -本页面为 2025 年冬季 InfiniTensor 大模型与人工智能系统训练营 CUDA 方向的项目仓库。 - -## 📁 项目结构 - -```text -learning-CUDA/ -├── 03_nf4_dequant -│ └── README.md -├── 04_hadamard_tc -│ └── README.md -├── 05_seq_align -│ └── README.md -├── 06_photon_sim -│ └── README.md -├── 07_trade_planning -│ └── README.md -├── 08_bilateral_filter -│ └── README.md -├── 09_particle_sim -│ └── README.md -├── LICENSE -└── README.md -``` - -## 🧠 项目阶段信息和简介 - -### 关于项目阶段 -项目阶段相比较于课程阶段的作业会更加 **open-ended (开放式题目)**,不会像作业一样提供测例和标准答案。项目阶段的目标不仅仅是更全面的检验和提升学员对 CUDA 编程的掌握能力,同时还是更全面的检验包括问题分析、方案与系统设计、工具使用和文档能力在内的**综合能力**。能优秀完成项目阶段的学员能具备工业级的开发能力。因此项目阶段的评审会综合和全面的考察和评判。 - -### 项目选题是什么? -项目阶段有多个选题,每个选题的方向和内容可能不尽相同,学员可以根据自己的兴趣进行选择。每位学员需**至少选择一个**选题且欢迎有兴趣和能力的同学同时完成多个选题。如果学员对选题有自己的想法,也欢迎与导师讨论。 - -### 项目阶段其他重要信息 - - - 项目阶段时间:2026.2.10 - 2026.3.16 - - - 项目提交方式:大部分选题与专业阶段作业相同,在官网提交 fork 仓库的链接与 commit 链接。具体提交方式各个选题可能有所不同,具体见项目选题要求。 - -### 注意事项 -1. 最终提交需整理代码:命名方式统一(不限定具体某种风格,全程序统一即可)、代码格式化(无具体某种格式要求,但可以默认使用默认配置的 clang-format)、无测试代码并且关键函数和代码块进行适当的注释。但与其他项目交叉的情况需遵循其他项目的规范; -2. 取决于具体选题,大体上是以总体性能为评判标准,因此代码的某些功能在适宜的情况下可以在 CPU 上完成,但必须仍有较为显著的部分在 CUDA 上完成。总体性能需超越 CPU 同级别实现; -3. 如有需要,可以使用合适的 CUDA 官方库。 -4. 如果选题的提交地址为 `Learning-CUDA` 仓库,则需要 fork `Learning-CUDA` 的 `project` 分支,并在其下对应的选题文件夹中放置自己的代码,即放置路径为 `project` 分支下 `/<选题>/<你的ID>/`。提交方式为 PR。E.g., ID 为 lzm 并且做了 path_finding 的选题,则提交的代码位于 project 分支下 `/path_finding/lzm/`. - -需注意,各个选题的基础难度各不相同,在晋级评定时会有影响。同时,是否完成额外目标或进阶优化也会直接影响评定。 - - -## 🌳 环境配置 - -### > 英伟达(NVIDIA) - -- 如果你使用的是训练营所提供的服务器,遵照英伟达算力文档中的步骤配置好环境即可。 - -- 如果为本地或其他环境,请确保系统已安装以下工具: - - 1. **CUDA Toolkit**(版本11.0及以上): - - 验证安装:运行 `nvcc --version`。 - - 安装:从[NVIDIA CUDA Toolkit下载页](https://developer.nvidia.com/cuda-downloads)获取。 - 2. **GNU Make**: - - 验证安装:运行 `make --version`(大多数 Linux/macOS 已预装)。 - 3. **C++ 版本**: - - 本次作业在英伟达上默认需支持 C++17 - -### > 天数智芯(Iluvatar CoreX) - -- 如果你使用的是训练营所提供的服务器,遵照天数 BI-100 算力文档中的步骤配置好环境即可。 - -- 对于非训练营所提供的天数算力,请配置标准的天数 GPU 开放环境。本次作业在天数上默认需支持 C++17,且**本次作业的配置不保证能在所有其他天数环境上无修改直接运行**。 - -### > 沐曦集成电路(MetaX) - -- 如果你使用的是训练营所提供的服务器,遵照沐曦 (C500) 算力文档中的步骤配置好环境即可。 - - 镜像可以选择 PyTorch 的最新镜像,即 PyTorch 2.8.0, Python 3.1.2,maca 3.3.0.4 - -- 对于非训练营所提供的沐曦算力,请配置标准的沐曦 GPU 开放环境。本次作业在沐曦上默认需支持 C++17,且**本次作业的配置不保证能在所有其他沐曦环境上无修改直接运行**。 - -### > 摩尔线程(Moore Threads) - -- 如果你使用的是训练营所提供的服务器,请先遵照摩尔 (S5000) 算力文档中的步骤配置环境。 - - 在此基础上,请确保在 `.bashrc` 中添加了以下环境变量: - - ```bash - export MUSA_ROOT=/usr/local/musa - export PATH="$MUSA_ROOT/bin:$PATH" - export LD_LIBRARY_PATH="$MUSA_ROOT/lib:$LD_LIBRARY_PATH" - export CPLUS_INCLUDE_PATH=/usr/include/c++/11:/usr/include/x86_64-linux-gnu/c++/11 - ``` - -- 对于非训练营所提供的摩尔算力,请配置标准的摩尔 GPU 开放环境。本次作业在摩尔上默认需支持 C++11,且**本次作业的配置不保证能在所有其他摩尔环境上无修改直接运行**。 - - -## 📬 有疑问? - -如有任何疑问或不确定的地方,欢迎随时询问导师或助教! - -Good luck and happy coding! 🚀