• CUDA实例练习(四):矩阵转置


     1 #include <stdio.h>
     2 #include "gputimer.h"
     3 #include "cuda_runtime.h"
     4 #include "device_launch_parameters.h"
     5 #include <stdlib.h>
     6 
     // Edge length of the square matrix: every host and device buffer in this
     // program holds N * N floats.
     const int N = 1 << 10;   // 1024
     // Tile edge for the 2-D launch: blocks are K x K threads and the grid has
     // N / K blocks along each axis.
     const int K = 1 << 5;    // 32
     9 
    // Fills the N * N floats starting at `mat` with the ascending sequence
    // 0, 1, 2, ...: element k receives the value k converted to float.
    void fill_matrix(float * mat){
        const int count = N * N;
        int k = 0;
        while (k < count) {
            mat[k] = static_cast<float>(k);
            ++k;
        }
    }
    14 
    // Prints the N x N matrix stored at `mat` to stdout, one matrix row per
    // output line. The value for output row j, column i is read from
    // mat[i + j*N] and formatted with "%4.4g " (a space follows every value,
    // including the last one on the line).
    void print_matrix(float *mat)
    {
        for (int j = 0; j < N; j++)
        {
            for (int i = 0; i < N; i++) { printf("%4.4g ", mat[i + j*N]); }
            // The row terminator has to be the "\n" escape sequence; a string
            // literal that contains a raw line break does not compile.
            printf("\n");
        }
    }
    23 
    // Transposes the N x N matrix `in` into `out` with two nested loops.
    // The kernel reads no thread or block index, so every thread that runs it
    // performs the complete transpose: for each (r, c) in [0, N) x [0, N),
    // out[r + c*N] = in[c + r*N].
    __global__ void transpose_serial(float in[], float out[]){
        for (int r = 0; r < N; ++r) {
            // Start of the r-th run of N consecutive input elements.
            const float *src = in + r * N;
            for (int c = 0; c < N; ++c) {
                out[r + c * N] = src[c];
            }
        }
    }
    29 
    // Transposes the N x N matrix `in` into `out` with one thread per index i:
    // thread i copies in[i + j*N] to out[j + i*N] for every j in [0, N).
    //
    // The index is built from both the block and the thread index, so the
    // kernel gives the same result for a single-block <<<1, N>>> launch
    // (blockIdx.x is 0 there) and also works with any 1-D grid whose total
    // thread count is at least N. Threads whose index lies outside the matrix
    // return without touching memory.
    __global__ void transpose_parallel_per_row(float in[], float out[]){
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= N)
            return;

        for (int j = 0; j < N; j++)
            out[j + i*N] = in[i + j*N];
    }
    36 
    // Transposes the N x N matrix `in` into `out` with one thread per element:
    // the thread at (i, j) performs out[j + i*N] = in[i + j*N].
    //
    // Precondition: the launch uses K x K thread blocks, because the global
    // index is computed with K as the block extent on both axes.
    // Threads that land outside the matrix (possible when the grid is rounded
    // up for an N that is not a multiple of K) do nothing, so an over-covering
    // grid cannot read or write out of bounds.
    __global__ void transpose_parallel_per_element(float in[], float out[]){
        int i = blockIdx.x * K + threadIdx.x;
        int j = blockIdx.y * K + threadIdx.y;
        if (i < N && j < N)
            out[j + i*N] = in[i + j*N];
    }
    // Reports a failed CUDA runtime call on stderr and terminates the program.
    // `what` names the operation for the diagnostic message.
    static void check_cuda(cudaError_t status, const char *what){
        if (status != cudaSuccess){
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    }

    // Times the three transpose kernels on an N x N float matrix.
    //
    // Steps: allocate and fill the host input, copy it to the device, then for
    // each kernel launch it between timer.Start()/timer.Stop(), copy the result
    // back to `out`, and print the elapsed time. GpuTimer is declared in
    // gputimer.h, which is not shown here; its Elapsed() value is printed as
    // milliseconds, following the original messages.
    //
    // Returns 0 on success. A failed host allocation returns EXIT_FAILURE; a
    // failed CUDA call terminates the program through check_cuda().
    int main(void){
        // size_t keeps the byte count in the type malloc/cudaMalloc/cudaMemcpy take.
        size_t numbytes = (size_t)N * N * sizeof(float);

        float *in = (float *)malloc(numbytes);
        float *out = (float *)malloc(numbytes);
        if (in == NULL || out == NULL){
            fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)numbytes);
            free(in);   // free(NULL) is a no-op, so both calls are safe
            free(out);
            return EXIT_FAILURE;
        }
        fill_matrix(in);

        float *d_in, *d_out;

        check_cuda(cudaMalloc((void **)&d_in, numbytes), "cudaMalloc(d_in)");
        check_cuda(cudaMalloc((void **)&d_out, numbytes), "cudaMalloc(d_out)");
        check_cuda(cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice), "copy of input to device");

        GpuTimer timer;

        // Launch errors are queried after timer.Stop() so that the timed region
        // contains exactly what it contained before: the launch alone.
        timer.Start();
        transpose_serial<<<1, 1>>>(d_in, d_out);
        timer.Stop();
        check_cuda(cudaGetLastError(), "transpose_serial launch");
        check_cuda(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost), "copy of transpose_serial result");
        printf("transpose_serial:%g ms.\n", timer.Elapsed());

        timer.Start();
        transpose_parallel_per_row<<<1, N>>>(d_in, d_out);
        timer.Stop();
        check_cuda(cudaGetLastError(), "transpose_parallel_per_row launch");
        check_cuda(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost), "copy of transpose_parallel_per_row result");
        printf("transpose_parallel_per_row:%g ms.\n", timer.Elapsed());

        // N / K blocks of K x K threads per axis; the division is exact for the
        // file's N = 1024 and K = 32, so the grid covers the matrix exactly.
        dim3 blocks(N / K, N / K);
        dim3 threads(K, K);
        timer.Start();
        transpose_parallel_per_element<<<blocks, threads>>>(d_in, d_out);
        timer.Stop();
        check_cuda(cudaGetLastError(), "transpose_parallel_per_element launch");
        check_cuda(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost), "copy of transpose_parallel_per_element result");
        printf("transpose_parallel_per_element: %g ms.\n", timer.Elapsed());

        check_cuda(cudaFree(d_in), "cudaFree(d_in)");
        check_cuda(cudaFree(d_out), "cudaFree(d_out)");

        // The host buffers were previously never released.
        free(in);
        free(out);

        return 0;
    }
  • 相关阅读:
    【MyBatis】STDOUT_LOGGING和log4j日志
    【MyBatis】resultType与类的字段不同解决办法
    【MyBatis】SqlSession相关的生命周期及作用域
    【MyBatis】XML配置的官网说明
    【Junit】程序员都应会的白盒测试
    【MyBatis】增删改查语句入门
    【MyBatis】我的mybatis第一个程序
    【网络】HTTP知识点(1)
    InitializeComponent报错(提示不存在)
    leetcode 860柠檬水找零问题
  • 原文地址:https://www.cnblogs.com/zhangshuwen/p/7263567.html
Copyright © 2020-2023  润新知