#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>

// Device-side kernel: element-wise vector addition C = A + B
__global__ void vectorAdd(const float* A, const float* B, float* C, int n)
{
    // Global thread index; each thread handles one element
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        C[i] = A[i] + B[i];
    }
}

int main()
{
    int n = 1000;
    size_t size = n * sizeof(float);

    // Allocate host memory
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);

    // Initialize input data
    for (int i = 0; i < n; i++)
    {
        h_A[i] = i;
        h_B[i] = i;
    }

    // Allocate device memory
    float* d_A;
    float* d_B;
    float* d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    // Copy the input data from host to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Configure block and grid sizes
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;

    // Launch the kernel on the device
    vectorAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);

    // Copy the result from device back to host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Print the result
    for (int i = 0; i < n; i++)
    {
        std::cout << h_C[i] << " ";
    }
    std::cout << std::endl;

    // Free host and device memory
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return 0;
}
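
// --- Optional: error checking (a minimal sketch, not part of the example above) ---
// The program above ignores the status codes returned by the CUDA runtime calls.
// A common pattern is to wrap each call in a checking macro; the name CHECK_CUDA
// below is our own choice, not a CUDA API.
#include <cstdio>

#define CHECK_CUDA(call)                                                       \
    do {                                                                       \
        cudaError_t err_ = (call);                                             \
        if (err_ != cudaSuccess) {                                             \
            std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",                 \
                         cudaGetErrorString(err_), __FILE__, __LINE__);        \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)

// Usage sketch: kernel launches return void, so query the launch status
// explicitly with cudaGetLastError() and synchronize to surface errors from
// asynchronous execution:
//
//   CHECK_CUDA(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
//   vectorAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
//   CHECK_CUDA(cudaGetLastError());
//   CHECK_CUDA(cudaDeviceSynchronize());
//
// Compile and run with nvcc, assuming the file is saved as vector_add.cu:
//   nvcc vector_add.cu -o vector_add && ./vector_add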