Cyrille Favreau: Superfast Bootstrapping

Sunday, December 18, 2011

Superfast Bootstrapping

In statistics, bootstrapping is a computer-based method for assigning measures of accuracy to sample estimates (Efron and Tibshirani 1994). This technique allows estimation of the sample distribution of almost any statistic using only very simple methods. Generally, it falls in the broader class of resampling methods.

Introducing GPU acceleration in MS Excel user-defined functions offers an easy-to-use solution to the masses.

Cuda Kernels

* --------------------------------------------------------------------------------

* Bootstrap kernels

* --------------------------------------------------------------------------------

__global__ void kernel_bs_normalizeRandoms(

unsigned int* data,

unsigned int* randoms,

unsigned int nbElements )

{

// Compute the index

unsigned int x = blockIdx.x*blockDim.x+threadIdx.x;

unsigned int y = blockIdx.y*blockDim.y+threadIdx.y;

unsigned int index = (y*blockDim.x) + x;

unsigned int randIndex = index*nbElements;

// Compute performance

for( int i(0); i<nbElements; ++i )

{

if( index==0 )

{

data[i] = data[i]%nbElements;

}

randoms[randIndex+i] = randoms[randIndex+i]%nbElements;

}

__global__ void kernel_bs_resample(

unsigned int* data,

unsigned int* randoms,

float* series,

unsigned int nbSeries,

unsigned int nbElements )

{

// Compute the index

unsigned int x = blockIdx.x*blockDim.x+threadIdx.x;

unsigned int y = blockIdx.y*blockDim.y+threadIdx.y;

unsigned int index = (y*blockDim.x) + x;

unsigned int randIndex = index*nbElements;

// Compute average

int sum(0);

for( int i(0); i<nbElements; ++i )

{

int r = randoms[randIndex+i];

if( r>=0 && r<nbElements )

{

sum += data[r];

}

// Store average

if( index < nbSeries )

{

series[index] = (sum/nbElements);

}

__global__ void kernel_bs_distribute(

float* series,

float scale,

unsigned int nbElements,

unsigned int* distribution,

unsigned int distributionSize )

{

// Compute the index

unsigned int x = blockIdx.x*blockDim.x+threadIdx.x;

unsigned int y = blockIdx.y*blockDim.y+threadIdx.y;

unsigned int index = (y*blockDim.x) + x;

float position = series[index] * scale;

if( position>0 && position<distributionSize)

{

// TODO: Can be optimized with shared memory within each block

atomicAdd(&distribution[(unsigned int)position],1);

}