Sunday, December 18, 2011

Superfast Bootstrapping

In statistics, bootstrapping is a computer-based method for assigning measures of accuracy to sample estimates (Efron and Tibshirani 1994). This technique allows estimation of the sample distribution of almost any statistic using only very simple methods. Generally, it falls in the broader class of resampling methods.

Introducing GPU acceleration in MS Excel user-defined functions offers an easy-to-use solution to the masses.

Cuda Kernels


/*
* --------------------------------------------------------------------------------
* Bootstrap kernels
* --------------------------------------------------------------------------------
*/


__global__ void kernel_bs_normalizeRandoms(
unsigned int* data,
unsigned int* randoms,
unsigned int nbElements )
{
  // Compute the index
  unsigned int x = blockIdx.x*blockDim.x+threadIdx.x;
  unsigned int y = blockIdx.y*blockDim.y+threadIdx.y;
  unsigned int index = (y*blockDim.x) + x;

  unsigned int randIndex = index*nbElements;

  // Compute performance
  for( int i(0); i<nbElements; ++i )
  {
    if( index==0 )
    {
      data[i] = data[i]%nbElements;
    }
    randoms[randIndex+i] = randoms[randIndex+i]%nbElements;
  }
}

__global__ void kernel_bs_resample(
  unsigned int* data,
  unsigned int* randoms,
  float* series,
  unsigned int nbSeries,
  unsigned int nbElements )
{
  // Compute the index
  unsigned int x = blockIdx.x*blockDim.x+threadIdx.x;
  unsigned int y = blockIdx.y*blockDim.y+threadIdx.y;
  unsigned int index = (y*blockDim.x) + x;

  unsigned int randIndex = index*nbElements;

  // Compute average
  int sum(0);
  for( int i(0); i<nbElements; ++i )
  {
    int r = randoms[randIndex+i];
    if( r>=0 && r<nbElements )
    {
      sum += data[r];
    }
  }
  // Store average
  if( index < nbSeries )
  {
    series[index] = (sum/nbElements);
  }
}

__global__ void kernel_bs_distribute(
  float* series,
  float scale,
  unsigned int nbElements,
  unsigned int* distribution,
  unsigned int distributionSize )
{
  // Compute the index
  unsigned int x = blockIdx.x*blockDim.x+threadIdx.x;
  unsigned int y = blockIdx.y*blockDim.y+threadIdx.y;
  unsigned int index = (y*blockDim.x) + x;

  float position = series[index] * scale;

  if( position>0 && position<distributionSize)
  {
    // TODO: Can be optimized with shared memory within each block
    atomicAdd(&distribution[(unsigned int)position],1);
  }
}