Cyrille Favreau: Half a million Monte Carlo simulations per second in Microsoft Excel 2010

Tuesday, December 13, 2011

Half a million Monte Carlo simulations per second in Microsoft Excel 2010

Half a million Monte Carlo simulations per second! Release the power of your GPU from MS Excel 2010. Implementing user defined functions using CUDA ot OpenCL opens a whole new dimension in supercomputing in Microsoft Office.

The algorithm is composed of three parts:

1. Generate random numbers for n series

2. Compute the performance for each of these series using

3. Distribute performances into the resulting set.

Step 1: This is done using the curand nVidia librairy.

Step 2: This is performed by the performanceStorageKernel function described bellow.

Step 3: The trick here is to use an atomic operation (atomicAdd) to generate the resulting Gauss-like curve. Note that atomic operations are only availble on architectures equals or higher to sm_13. This is performed by the frequenciesKernel function.

Cuda Kernel

#include <curand.h>

#include <cuda.h>

#include <math.h>

#include <windows.h>

cudaDeviceProp _dDeviceProp;

// Device buffers

float* _dObjects;

float* _dSeries;

float* _dPerformances;

int* _dFrequencies;

// Variables

int _dNbObjects;

int _dNbFrequencies;

int _dNbSeries;

/**

* @brief Initializes the testcase by

* - Allocating the GPU memory

* - Copying the buffer from RAM to GPU memory.

* This function is executed on the Host

* @ param hBuffer Pointer to array of floats in host memory

* @ param N Number of objects in the array (an Objects consists of 6 floats)

extern "C"

void initialize_device( int NbSeries, int NbObjects, int NbFrequencies )

{

// Testcase initialization

_dNbObjects = NbObjects;

_dNbFrequencies = NbFrequencies;

_dNbSeries = NbSeries;

// Device allocation

cudaMalloc( (void**)&_dObjects, NbSeries*NbObjects*sizeof(float) );

cudaMalloc( (void**)&_dSeries, NbSeries*sizeof(float) );

cudaMalloc( (void**)&_dFrequencies, NbFrequencies*sizeof(int) );

cudaMalloc( (void**)&_dPerformances, NbObjects*sizeof(float) );

}

/**

* @brief

__global__ void performanceStorageKernel( float* series, float* objects, int nbObjects, float* performances )

{

// Compute the index

unsigned int x = blockIdx.x*blockDim.x+threadIdx.x;

unsigned int y = blockIdx.y*blockDim.y+threadIdx.y;

unsigned int index = (y*blockDim.x) + x;

int objectsIndex = index*nbObjects;

// Compute performance

__shared__ float localPerformance[2];

localPerformance[0] = 1.0; // current performance

localPerformance[1] = 0.0; // previous performance

for( int i(0); i<nbObjects; ++i ) {

localPerformance[1] = localPerformance[0];

localPerformance[0] = (1.0+objects[objectsIndex+i])*localPerformance[1];

if( index == 0 ) performances[i] = localPerformance[0];

}

// Store performance

series[index] = localPerformance[0] - 1.0;

}

/**

* @brief Kernel function to be executed on the GPU

* @param ptr Pointer to an array of floats stored in GPU memory

__global__ void frequenciesKernel( float* series, float range, int* frequencies, int nbFrequencies )

{

// Compute the index

unsigned int x = blockIdx.x*blockDim.x+threadIdx.x;

unsigned int y = blockIdx.y*blockDim.y+threadIdx.y;

unsigned int index = (y*blockDim.x) + x;

float v = series[index]-(-range/2);

int position = (v/(range/nbFrequencies));

atomicAdd(&frequencies[position],1);

}

/**

* @brief Run the kernel on the GPU.

* This function is executed on the Host.

extern "C"

void run_kernel( float range, float mu, float sigma, int random, int nbThreadsPerBlock )

{

// Create pseudo-random number generator

curandGenerator_t gen;

curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);

curandSetPseudoRandomGeneratorSeed(gen, random);

curandGenerateNormal(gen, _dObjects, _dNbSeries*_dNbObjects, mu, sigma );

curandDestroyGenerator(gen);

dim3 block(8, 8, 1);

int gridDim = (int)sqrt(float(_dNbSeries));

dim3 gridPerformances( gridDim/block.x, gridDim/block.y, 1);

performanceStorageKernel<<<gridPerformances,block>>>(_dSeries, _dObjects, _dNbObjects, _dPerformances );

// Reset memory

cudaMemset( _dFrequencies, 0, _dNbFrequencies*sizeof(int) );

// compute Frequencies

gridDim = (int)sqrt(float(_dNbSeries));

dim3 gridFrequencies( gridDim/block.x, gridDim/block.y, 1);

frequenciesKernel<<<gridFrequencies,block>>>( _dSeries, range, _dFrequencies, _dNbFrequencies );

}

* This function is executed on the Host.

* @brief Copy the data back to the Host and releases the buffer on

* GPU device.

* This function is executed on the Host

* @ param hBuffer Pointer to array of floats in host memory

* @ param N Number of objects in the array (an Objects consists of 6 floats)

extern "C"

void device_to_host( int* hOutFrequencies, float* hOutPerformances, float* hOutSeries )

{

cudaMemcpy( hOutFrequencies, _dFrequencies, _dNbFrequencies*sizeof(int), cudaMemcpyDeviceToHost);

cudaMemcpy( hOutPerformances, _dPerformances, _dNbObjects*sizeof(float), cudaMemcpyDeviceToHost);

}

* This function is executed on the Host.

* @brief Releases the buffer on GPU device.

* This function is executed on the Host

extern "C"

void destroy_device()

{

cudaFree( _dObjects );

cudaFree( _dFrequencies );

cudaFree( _dSeries );

cudaFree( _dPerformances );

}