Parallel-Computing/memory_examples.cu at master · IonesioJunior/Parallel-Computing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#include <stdio.h>

#define ARRAY_SIZE 128
/*
 * Running-average kernel: thread i stores in value_out[i] the mean of
 * value_in[0] .. value_in[i].
 *
 * Launch layout: only threadIdx.x is used for indexing, so the kernel is
 * meant for a single 1-D block with one thread per element. Threads whose
 * index is >= ARRAY_SIZE do no memory access, so an oversized block can no
 * longer overrun the shared buffer or the global arrays.
 *
 * Shared memory: ARRAY_SIZE floats, statically allocated.
 *
 * value_in  : device pointer, read at indices [0, min(blockDim.x, ARRAY_SIZE)).
 * value_out : device pointer, written at the same indices.
 */
__global__ void avarage_list(float * value_in,float * value_out){
	// Per-thread variables
	const int index = threadIdx.x;
	const bool in_range = index < ARRAY_SIZE;
	float sum = 0.0f; // float literal: avoids an implicit double constant

	// Static shared buffer, visible to every thread of the block
	__shared__ float sh_arr[ARRAY_SIZE];

	// Global memory -> shared memory (one element per in-range thread)
	if(in_range){
		sh_arr[index] = value_in[index];
	}

	// Barrier: guarantees every element has been copied before any thread
	// starts reading its neighbours' slots. It is deliberately kept outside
	// the bounds check so that all threads of the block reach it.
	__syncthreads();

	if(in_range){
		// Shared memory operation: thread i only reads slots 0..i, all of
		// which were written by threads 0..i before the barrier.
		for(int i = 0; i <= index;i++){
			sum += sh_arr[i];
		}

		// Per-thread result -> global memory
		value_out[index] = sum / (index + 1);
	}
}


// Returns true when `status` is cudaSuccess; otherwise prints the CUDA error
// string, tagged with `what`, to stderr and returns false.
static bool cuda_ok(cudaError_t status,const char *what){
	if(status == cudaSuccess){
		return true;
	}
	fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(status));
	return false;
}

/*
 * Fills a host array with 0, 2, 4, ..., copies it to the device, runs
 * avarage_list on it with one block of ARRAY_SIZE threads, copies the result
 * back and prints both arrays.
 *
 * Returns 0 on success and 1 if any CUDA runtime call or the kernel launch
 * reports an error (in which case the result array is not printed).
 */
int main(int argc,char** argv){
	const int BYTE_SIZE = ARRAY_SIZE * sizeof(float);

	//Host var
	float h_values_in[ARRAY_SIZE];
	float h_avarage_out[ARRAY_SIZE];

	printf("Array Values : \n");
	for(int i = 0 ; i < ARRAY_SIZE;i++){
		h_values_in[i] = float(i * 2);
		printf("%.2f " , h_values_in[i]);
	}
	printf("\n");


	//Device var (NULL-initialised so the cudaFree calls below are safe even
	//when an allocation failed or was never attempted)
	float *d_values_in = NULL;
	float *d_avarage_out = NULL;

	// Short-circuit chain: the first failing call stops the sequence.
	bool ok = cuda_ok(cudaMalloc((void**) &d_values_in,BYTE_SIZE),"cudaMalloc(d_values_in)")
		&& cuda_ok(cudaMalloc((void**) &d_avarage_out,BYTE_SIZE),"cudaMalloc(d_avarage_out)")
		&& cuda_ok(cudaMemcpy(d_values_in,h_values_in,BYTE_SIZE,cudaMemcpyHostToDevice),"cudaMemcpy(host to device)");

	if(ok){
		avarage_list<<<1,ARRAY_SIZE>>>(d_values_in,d_avarage_out);

		// A kernel launch returns nothing: configuration errors are fetched
		// with cudaGetLastError(), and errors raised while the kernel runs
		// are reported by the following cudaMemcpy.
		ok = cuda_ok(cudaGetLastError(),"avarage_list launch")
			&& cuda_ok(cudaMemcpy(h_avarage_out,d_avarage_out,BYTE_SIZE,cudaMemcpyDeviceToHost),"cudaMemcpy(device to host)");
	}

	if(ok){
		printf("Avarage Array: \n");
		for(int i = 0 ; i < ARRAY_SIZE;i++){
			printf("%.2f ",h_avarage_out[i]);
		}
		printf("\n");
	}

	// Always release device memory, on both the success and failure paths.
	cudaFree(d_values_in);
	cudaFree(d_avarage_out);
	return ok ? 0 : 1;
}