/*
 * --------------------------------------------------------------------------------
 *  CUDA kernels — Bootstrap (resampling) kernels
 * --------------------------------------------------------------------------------
 */

/**
 * Normalize random values into the range [0, nbElements).
 *
 * Launch layout: 2D grid of 2D blocks; each thread owns one slice of
 * `randoms` of length nbElements starting at index*nbElements. The caller
 * must size `randoms` to (total threads)*nbElements — TODO confirm.
 *
 * @param data       Device buffer of nbElements values, reduced modulo
 *                   nbElements in place by thread 0 only (a single writer
 *                   avoids redundant, racy writes from every thread).
 * @param randoms    Device buffer of per-thread random values; each thread
 *                   normalizes its own slice [index*nbElements,
 *                   (index+1)*nbElements) modulo nbElements.
 * @param nbElements Number of elements per series; must be > 0 (used as a
 *                   modulus).
 */
__global__
void kernel_bs_normalizeRandoms(
    unsigned int* data,
    unsigned int* randoms,
    unsigned int nbElements )
{
    // Flat global thread index. NOTE: the original used (y*blockDim.x)+x,
    // which collides across blocks whenever gridDim.x > 1; the row width of
    // the flattened grid is gridDim.x*blockDim.x.
    const unsigned int x     = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y     = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int index = y * ( gridDim.x * blockDim.x ) + x;

    // Start of this thread's slice of the randoms buffer.
    const unsigned int randIndex = index * nbElements;

    // Unsigned loop counter: the original compared a signed i against the
    // unsigned nbElements.
    for( unsigned int i = 0; i < nbElements; ++i )
    {
        // Single-writer normalization of the shared data buffer.
        if( index == 0 )
        {
            data[i] = data[i] % nbElements;
        }
        randoms[randIndex + i] = randoms[randIndex + i] % nbElements;
    }
}

/**
 * Bootstrap resampling: each thread draws nbElements samples from `data`
 * (indices taken from its slice of `randoms`) and stores their mean in
 * `series[index]`.
 *
 * Launch layout: 2D grid of 2D blocks; one thread per output series.
 *
 * @param data       Device buffer of nbElements sample values.
 * @param randoms    Device buffer of resampling indices; thread `index`
 *                   reads [index*nbElements, (index+1)*nbElements).
 * @param series     Output buffer of nbSeries means.
 * @param nbSeries   Number of output series (bounds the active threads).
 * @param nbElements Number of samples per series; must be > 0 (divisor).
 */
__global__
void kernel_bs_resample(
    unsigned int* data,
    unsigned int* randoms,
    float* series,
    unsigned int nbSeries,
    unsigned int nbElements )
{
    // Flat global thread index; row width of the flattened grid is
    // gridDim.x*blockDim.x (the original (y*blockDim.x)+x collides across
    // blocks when gridDim.x > 1).
    const unsigned int x     = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y     = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int index = y * ( gridDim.x * blockDim.x ) + x;

    // Guard FIRST: the original only guarded the final store, so threads
    // beyond nbSeries still read randoms[] out of bounds.
    if( index >= nbSeries )
        return;

    const unsigned int randIndex = index * nbElements;

    // Accumulate the resampled sum.
    unsigned int sum = 0;
    for( unsigned int i = 0; i < nbElements; ++i )
    {
        // r is unsigned, so only the upper bound needs checking (the
        // original's r>=0 test on a signed copy was redundant after
        // normalization).
        const unsigned int r = randoms[randIndex + i];
        if( r < nbElements )
        {
            sum += data[r];
        }
    }

    // Store the mean. Float division: the original integer division
    // truncated the average before the float store.
    series[index] = (float)sum / (float)nbElements;
}

/**
 * Histogram the resampled series: each thread scales its series value and
 * atomically increments the corresponding distribution bucket.
 *
 * Launch layout: 2D grid of 2D blocks; one thread per series element.
 *
 * @param series           Device buffer of nbElements series values.
 * @param scale            Multiplier mapping a series value to a bucket
 *                         position.
 * @param nbElements       Length of `series` (bounds the active threads).
 * @param distribution     Output histogram of distributionSize buckets;
 *                         updated with atomicAdd.
 * @param distributionSize Number of histogram buckets.
 */
__global__
void kernel_bs_distribute(
    float* series,
    float scale,
    unsigned int nbElements,
    unsigned int* distribution,
    unsigned int distributionSize )
{
    // Flat global thread index; row width of the flattened grid is
    // gridDim.x*blockDim.x (the original (y*blockDim.x)+x collides across
    // blocks when gridDim.x > 1).
    const unsigned int x     = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y     = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int index = y * ( gridDim.x * blockDim.x ) + x;

    // Bounds guard: the original read series[index] unguarded — nbElements
    // was accepted but never used, so out-of-range threads read past the
    // end of the buffer.
    if( index >= nbElements )
        return;

    const float position = series[index] * scale;

    // >= 0 so values mapping exactly to bucket 0 are counted (the original
    // `position > 0` silently dropped them).
    if( position >= 0.0f && position < (float)distributionSize )
    {
        // TODO: can be optimized with a per-block shared-memory histogram
        // flushed to global memory once per block.
        atomicAdd( &distribution[(unsigned int)position], 1 );
    }
}