Bootstrapping
In statistics, bootstrapping is a computer-based method for assigning measures of accuracy to sample estimates (Efron and Tibshirani 1994). The technique allows the sampling distribution of almost any statistic to be estimated using only very simple methods; it falls into the broader class of resampling methods.
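As a rough illustration (a CPU-only sketch, not part of the downloadable project; bootstrapMeans and the fixed seed are hypothetical names chosen here), each of nbSeries resamples draws the same number of values from the original sample with replacement and recomputes the statistic of interest, here the mean:

#include <vector>
#include <random>

// Resample the data with replacement nbSeries times and collect the mean of
// each resample; the spread of these means estimates the accuracy of the
// sample mean.
std::vector<double> bootstrapMeans(const std::vector<double>& data, int nbSeries)
{
    std::mt19937 rng(42);
    std::uniform_int_distribution<size_t> pick(0, data.size() - 1);
    std::vector<double> means(nbSeries);
    for (int s = 0; s < nbSeries; ++s)
    {
        double sum = 0.0;
        for (size_t i = 0; i < data.size(); ++i)
            sum += data[pick(rng)];          // draw with replacement
        means[s] = sum / data.size();
    }
    return means;
}

The GPU version below follows the same pattern: generate the resampling indices, compute one statistic per series, then build the distribution of those statistics.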
Download the code for Windows Visual Studio 2010: https://github.com/cyrillefavreau/GPUPoweredExcel
CUDA Kernels
#include <cuda.h>
#include <curand.h>
#include "BootStrap.h"

/*
 * --------------------------------------------------------------------------------
 * Bootstrap kernels
 * --------------------------------------------------------------------------------
 */
__global__ void
kernel_bs_normalizeRandoms(
    unsigned int* data,
    unsigned int* randoms,
    unsigned int nbSeries,
    unsigned int nbObjects )
{
    // Compute the index of the series handled by this thread
    unsigned int index = blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int randIndex = index*nbObjects;
    if( randIndex>=nbSeries*nbObjects ) return;

    // Map the raw 32-bit random values to valid object indices [0..nbObjects-1].
    // The first thread also normalizes the synthetic data values themselves.
    for( int i(0); i<nbObjects; ++i )
    {
        if( index==0 )
        {
            data[i] = data[i]%nbObjects;
        }
        randoms[randIndex+i] = randoms[randIndex+i]%nbObjects;
    }
}
__global__ void
kernel_bs_resample(
    unsigned int* data,
    unsigned int* randoms,
    float* series,
    unsigned int nbSeries,
    unsigned int nbObjects )
{
    // Compute the index of the series handled by this thread
    unsigned int index = blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int randIndex = index*nbObjects;
    if( randIndex>=nbSeries*nbObjects ) return;

    // Sum the values drawn by this resample
    int sum(0);
    for( int i(0); i<nbObjects; ++i )
    {
        int r = randoms[randIndex+i];
        if( r>=0 && r<nbObjects )
        {
            sum += data[r];
        }
    }
    // Store the average of this resample
    if( index < nbSeries )
    {
        series[index] = float(sum)/nbObjects;
    }
}
__global__ void
kernel_bs_distribute(
    float* series,
    float range,
    unsigned int nbSeries,
    unsigned int* distribution,
    unsigned int distributionSize )
{
    // Compute the index of the series handled by this thread
    unsigned int index = blockIdx.x*blockDim.x+threadIdx.x;
    if( index>=nbSeries ) return;

    // Find the histogram bucket of this resample average and increment it atomically
    float v = series[index];
    int position = int(v/(range/distributionSize));
    if( position>=0 && position<distributionSize )
    {
        atomicAdd(&distribution[position],1);
    }
}
void ExcelCUDA_Boostrap(
    unsigned int nbObjects, unsigned int nbSeries,
    unsigned int distributionSize, unsigned int* distribution)
{
    unsigned long long seed = rand();
    float range = 100.f;

    // Device buffers
    unsigned int* _dData;
    unsigned int* _dRandoms;
    float* _dSeries;
    unsigned int* _dDistribution;

    // Device allocation
    cudaMalloc( (void**)&_dData, nbObjects*nbSeries*sizeof(unsigned int) );
    cudaMalloc( (void**)&_dRandoms, nbObjects*nbSeries*sizeof(unsigned int) );
    cudaMalloc( (void**)&_dSeries, nbSeries*sizeof(float) );
    cudaMalloc( (void**)&_dDistribution, distributionSize*sizeof(unsigned int) );

    // Create pseudo-random number generator and fill both the synthetic data
    // and the resampling indices with raw random values
    curandGenerator_t gen;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, seed);
    curandGenerate(gen, _dData, nbObjects*nbSeries);
    curandGenerate(gen, _dRandoms, nbObjects*nbSeries);
    curandDestroyGenerator(gen);

    // One thread per series, rounded up to a whole number of blocks
    int blockSize(32);
    int gridSize = (nbSeries + blockSize - 1)/blockSize;

    // Normalize randoms
    kernel_bs_normalizeRandoms<<<gridSize,blockSize>>>(
        _dData, _dRandoms, nbSeries, nbObjects );

    // Resample
    kernel_bs_resample<<<gridSize,blockSize>>>(
        _dData, _dRandoms, _dSeries, nbSeries, nbObjects );

    // ----------------------------------------
    // Compute distribution
    // ----------------------------------------
    // Reset memory
    cudaMemset( _dDistribution, 0, distributionSize*sizeof(unsigned int) );
    kernel_bs_distribute<<<gridSize,blockSize>>>(
        _dSeries, range, nbSeries, _dDistribution, distributionSize );

    cudaMemcpy( distribution, _dDistribution, distributionSize*sizeof(unsigned int),
        cudaMemcpyDeviceToHost);

    cudaFree( _dData );
    cudaFree( _dRandoms );
    cudaFree( _dSeries );
    cudaFree( _dDistribution );
}
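A minimal host-side driver for this function might look as follows (an illustrative sketch only; the sizes are arbitrary assumptions and the declaration is taken from the CUDA source above rather than from the Excel add-in wiring of the downloadable project):

#include <cstdio>
#include <vector>

// Declaration as defined in the CUDA source above (normally provided by BootStrap.h)
void ExcelCUDA_Boostrap(unsigned int nbObjects, unsigned int nbSeries,
                        unsigned int distributionSize, unsigned int* distribution);

int main()
{
    // Hypothetical sizes: 10,000 resamples of 100 objects, 100 histogram bins
    const unsigned int nbObjects = 100;
    const unsigned int nbSeries = 10000;
    const unsigned int distributionSize = 100;
    std::vector<unsigned int> distribution(distributionSize, 0);

    ExcelCUDA_Boostrap(nbObjects, nbSeries, distributionSize, distribution.data());

    // With the hard-coded range of 100, the resample averages land around bin 50
    for (unsigned int i = 0; i < distributionSize; ++i)
        printf("bin %3u: %u\n", i, distribution[i]);
    return 0;
}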
Monte-Carlo simulations
Half a million Monte Carlo simulations per second! Release the power of your GPU from MS Excel 2010. Implementing user-defined functions with CUDA or OpenCL opens a whole new dimension of supercomputing in Microsoft Office.
The algorithm is composed of three parts:
1. Generate random numbers for n series.
2. Compute the performance of each of these series from the random numbers generated in step 1.
3. Distribute the performances into the resulting frequency set.
Step 1: This is done using NVIDIA's curand library.
Step 2: This is performed by the performanceStorageKernel function described below; a short CPU sketch of the performance computation follows after these step notes.
Step 3: The trick here is to use an atomic operation (atomicAdd) to build the resulting Gauss-like curve. Note that atomic operations are only available on architectures sm_13 or higher. This is performed by the frequenciesKernel function.
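As a rough illustration of step 2 (a CPU-only sketch; seriesPerformance is a hypothetical name, not part of the project), the performance of one series is obtained by compounding its returns, which is exactly what performanceStorageKernel does per thread on the GPU:

#include <vector>

// Illustrative CPU version of the per-series performance computation:
// compound the returns of one series and report the total performance.
float seriesPerformance(const std::vector<float>& returns)
{
    float performance = 1.0f;
    for (float r : returns)
        performance *= (1.0f + r);   // compound each periodic return
    return performance - 1.0f;       // e.g. {0.10, -0.05} -> 1.1*0.95 - 1 = 0.045
}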
CUDA Kernel
#include <curand.h>
#include <cuda.h>
#include <math.h>
#include <windows.h>

cudaDeviceProp _dDeviceProp;

// Device buffers
float* _dObjects;
float* _dSeries;
float* _dPerformances;
int* _dFrequencies;

// Variables
int _dNbObjects;
int _dNbFrequencies;
int _dNbSeries;
/**
 * @brief Initializes the testcase by allocating the GPU memory.
 * This function is executed on the Host.
 * @param NbSeries Number of Monte Carlo series
 * @param NbObjects Number of objects (returns) per series
 * @param NbFrequencies Number of buckets in the resulting frequency distribution
 */
extern "C"
void initialize_device( int NbSeries, int NbObjects, int NbFrequencies )
{
    // Testcase initialization
    _dNbObjects = NbObjects;
    _dNbFrequencies = NbFrequencies;
    _dNbSeries = NbSeries;

    // Device allocation
    cudaMalloc( (void**)&_dObjects, NbSeries*NbObjects*sizeof(float) );
    cudaMalloc( (void**)&_dSeries, NbSeries*sizeof(float) );
    cudaMalloc( (void**)&_dFrequencies, NbFrequencies*sizeof(int) );
    cudaMalloc( (void**)&_dPerformances, NbObjects*sizeof(float) );
}
/**
 * @brief Computes the compounded performance of each series.
 * One thread handles one series: it multiplies the (1+return) factors of its
 * nbObjects returns and stores the result in series[]. The first thread also
 * stores the intermediate performances for reporting.
 */
__global__ void
performanceStorageKernel( float* series, float* objects, int nbObjects, float* performances )
{
    // Compute the global index of the series handled by this thread
    unsigned int x = blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y+threadIdx.y;
    unsigned int index = y*(gridDim.x*blockDim.x) + x;
    int objectsIndex = index*nbObjects;

    // Compound the returns of this series (kept in per-thread local variables;
    // a shared array here would be raced on by all threads of the block)
    float currentPerformance = 1.0f;  // current performance
    float previousPerformance = 0.0f; // previous performance
    for( int i(0); i<nbObjects; ++i )
    {
        previousPerformance = currentPerformance;
        currentPerformance = (1.0f+objects[objectsIndex+i])*previousPerformance;
        // The first thread also keeps the intermediate performances
        if( index == 0 ) performances[i] = currentPerformance;
    }

    // Store the final performance of this series
    series[index] = currentPerformance - 1.0f;
}
/**
 * @brief Kernel function to be executed on the GPU.
 * Finds the frequency bucket of each series performance and increments it
 * atomically to build the resulting distribution.
 * @param series Pointer to the array of series performances stored in GPU memory
 */
__global__ void
frequenciesKernel( float* series, float range, int* frequencies, int nbFrequencies )
{
    // Compute the global index of the series handled by this thread
    unsigned int x = blockIdx.x*blockDim.x+threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y+threadIdx.y;
    unsigned int index = y*(gridDim.x*blockDim.x) + x;

    // Shift the performance so that the distribution is centered on zero
    float v = series[index]-(-range/2);
    int position = int(v/(range/nbFrequencies));
    if( position>=0 && position<nbFrequencies )
    {
        atomicAdd(&frequencies[position],1);
    }
}
/**
 * @brief Run the kernels on the GPU.
 * This function is executed on the Host.
 */
extern "C"
void run_kernel( float range, float mu, float sigma, int random, int nbThreadsPerBlock )
{
    // Create pseudo-random number generator and fill the objects with
    // normally distributed returns
    curandGenerator_t gen;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(gen, random);
    curandGenerateNormal(gen, _dObjects, _dNbSeries*_dNbObjects, mu, sigma );
    curandDestroyGenerator(gen);

    // Launch one thread per series on a 2D grid
    dim3 block(8, 8, 1);
    int gridDim = (int)sqrt(float(_dNbSeries));
    dim3 gridPerformances( gridDim/block.x, gridDim/block.y, 1);
    performanceStorageKernel<<<gridPerformances,block>>>(
        _dSeries, _dObjects, _dNbObjects, _dPerformances );

    // Reset memory
    cudaMemset( _dFrequencies, 0, _dNbFrequencies*sizeof(int) );

    // Compute frequencies
    dim3 gridFrequencies( gridDim/block.x, gridDim/block.y, 1);
    frequenciesKernel<<<gridFrequencies,block>>>(
        _dSeries, range, _dFrequencies, _dNbFrequencies );
}
/**
 * @brief Copies the results back to the Host.
 * This function is executed on the Host.
 * @param hOutFrequencies Pointer to the array of frequencies in host memory
 * @param hOutPerformances Pointer to the array of intermediate performances in host memory
 * @param hOutSeries Pointer to the array of series performances in host memory
 */
extern "C"
void device_to_host( int* hOutFrequencies, float* hOutPerformances, float* hOutSeries )
{
    cudaMemcpy( hOutFrequencies, _dFrequencies, _dNbFrequencies*sizeof(int),
        cudaMemcpyDeviceToHost);
    cudaMemcpy( hOutPerformances, _dPerformances, _dNbObjects*sizeof(float),
        cudaMemcpyDeviceToHost);
    cudaMemcpy( hOutSeries, _dSeries, _dNbSeries*sizeof(float),
        cudaMemcpyDeviceToHost);
}
/**
 * @brief Releases the buffers on the GPU device.
 * This function is executed on the Host.
 */
extern "C"
void destroy_device()
{
    cudaFree( _dObjects );
    cudaFree( _dFrequencies );
    cudaFree( _dSeries );
    cudaFree( _dPerformances );
}
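To tie the pieces together, here is a minimal host-side sketch of the call sequence (illustrative only; the sizes, seed, mu and sigma are arbitrary assumptions, and the real project drives these functions from the Excel 2010 add-in rather than from a standalone main):

#include <cstdio>
#include <vector>

// Declarations as defined in the CUDA source above
extern "C" void initialize_device( int NbSeries, int NbObjects, int NbFrequencies );
extern "C" void run_kernel( float range, float mu, float sigma, int random, int nbThreadsPerBlock );
extern "C" void device_to_host( int* hOutFrequencies, float* hOutPerformances, float* hOutSeries );
extern "C" void destroy_device();

int main()
{
    const int nbSeries = 65536;     // a perfect square, so the 2D grid divides evenly by the 8x8 blocks
    const int nbObjects = 12;       // e.g. 12 monthly returns per series
    const int nbFrequencies = 100;  // buckets of the resulting distribution

    initialize_device(nbSeries, nbObjects, nbFrequencies);
    // Returns drawn from N(0.5%, 3%); performances bucketed over a range of 100%
    run_kernel(1.0f, 0.005f, 0.03f, 42, 64);

    std::vector<int> frequencies(nbFrequencies);
    std::vector<float> performances(nbObjects);
    std::vector<float> series(nbSeries);
    device_to_host(frequencies.data(), performances.data(), series.data());
    destroy_device();

    for (int i = 0; i < nbFrequencies; ++i)
        printf("bucket %3d: %d\n", i, frequencies[i]);
    return 0;
}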