[Halld-offline] updates to AmpTools for newer GPU hardware

Richard Jones richard.t.jones at uconn.edu
Tue Mar 27 15:06:12 EDT 2012


Matt,

I have tested the latest commodity-priced GPU hardware from NVIDIA:

  * GTX 580, 1.5 GB RAM, 750 MHz clock, 512 cores ($430 from Newegg)
      o factor 100 speedup in gen_3pi, current trunk release
  * GTX 680, 2.0 GB RAM, 1000 MHz clock, 1536 cores ($495 from Newegg)
      o factor 150 speedup in gen_3pi, current trunk release

The current AmpTools code works without modification, but it reports the wrong hardware information in the output log, in case anyone cares.  Attached is a revision to GPUManager.cc that correctly reports the features of the above hardware (sdkHelper.h is a modified form of a header with the same name in the NVIDIA SDK release 4.1).

A first release of the gen_5pi code for CUDA amplitude generation will follow shortly.

-Richard Jones

-------------- next part --------------
//******************************************************************************
// This file is part of AmpTools, a package for performing Amplitude Analysis
// 
// Copyright Trustees of Indiana University 2010, all rights reserved
// 
// This software written by Matthew Shepherd, Ryan Mitchell, and 
//                  Hrayr Matevosyan at Indiana University, Bloomington
// 
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice and author attribution, this list of conditions and the
//    following disclaimer. 
// 2. Redistributions in binary form must reproduce the above copyright
//    notice and author attribution, this list of conditions and the
//    following disclaimer in the documentation and/or other materials
//    provided with the distribution.
// 3. Neither the name of the University nor the names of its contributors
//    may be used to endorse or promote products derived from this software
//    without specific prior written permission.
// 
// Creation of derivative forms of this software for commercial
// utilization may be subject to restriction; written permission may be
// obtained from the Trustees of Indiana University.
// 
// INDIANA UNIVERSITY AND THE AUTHORS MAKE NO REPRESENTATIONS OR WARRANTIES, 
// EXPRESS OR IMPLIED.  By way of example, but not limitation, INDIANA 
// UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR 
// FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THIS SOFTWARE OR 
// DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS, 
// OR OTHER RIGHTS.  Neither Indiana University nor the authors shall be 
// held liable for any liability with respect to any claim by the user or 
// any other party arising from use of the program.
//******************************************************************************

#ifdef USE_MPI
#include <mpi.h>
#endif

#include <iostream>
#include <math.h>
#include <string.h>
#include <sys/time.h>
#include "cuda_runtime.h"

#include "IUAmpTools/AmplitudeManager.h"
#include "IUAmpTools/AmpVecs.h"

#include "GPUManager/GPUKernel.h"
#include "GPUManager/GPUManager.h"
#include "GPUManager/sdkHelper.h"

bool GPUManager::m_cudaDisplay = false;

template <class T>
void reduce(int size, int threads, int blocks,
                 int whichKernel, T *d_idata, T *d_odata);

GPUManager::GPUManager() : 
  m_ampCalcOnly( false )
{
	m_iNEvents=0;
	m_iNTrueEvents=0;

	m_iNAmps=0;
	m_iNAmpsH=0;
	m_iAmpArrSize=0;

	m_iEventArrSize=0;
	m_iVArrSize=0;
	
	//Host Arrays
	m_pfAmpRe=0;
	m_pfAmpIm=0;
	m_pfVRe=0;
	m_pfVIm=0;
	m_pfRes=0;
	
	//Device Arrays	
	m_pfDevData=0;
	m_pcDevCalcAmp=0;
  m_piDevPerm=0;
	m_pfDevAmpRe=0;
	m_pfDevAmpIm=0;
  m_pfDevWeights=0;
	m_pfDevVRe=0;
	m_pfDevVIm=0;
	m_pfDevRes=0;
	m_pfDevREDUCE=0;
	
	//CUDA Thread and Grid sizes
	m_iDimGridX=0;
	m_iDimGridY=0;
	m_iDimThreadX=0;
	m_iDimThreadY=0;
  
  int thisDevice = 0;

  if( !m_cudaDisplay )
    cout<<"\n################### CUDA DEVICE ##################\n";    
  
#ifdef USE_MPI
  
  // Note that a better algorithm would be to utilize the job's "local
  // rank" on the machine instead of its global rank -- this needs
  // development.  The obvious problem with the technique below is that,
  // e.g., on a cluster with two GPUs per node, all even-rank jobs will
  // use device zero.  If two even-rank jobs land on the same node,
  // device zero will be overloaded and device 1 will be unused.  (A
  // sketch of one way to compute the local rank follows this constructor.)
  
  int rank, devs;
  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
  cudaGetDeviceCount(&devs);
  thisDevice = rank % devs;
  
  if( !m_cudaDisplay ) {
    cout << "\nParallel GPU configuration requested." << endl;
    cout << "\nNumber of CUDA devices available on this node:  " << devs << endl;
    cout << "\nMPI process " << rank << " is using device " << thisDevice << endl;
  }
#endif
  
	///////CUDA INITIALIZATION
  cudaSetDevice( thisDevice );
  cudaDeviceProp devProp;
  cudaGetDeviceProperties( &devProp, thisDevice );
  int cudaCoresPerMP = _ConvertSMVer2Cores(devProp.major,devProp.minor);
  
	if( ! m_cudaDisplay ){
    
    cout<<"Current GPU Properites:\n";
    cout<<"\t Name: "<<devProp.name<<endl; 
    cout<<"\t Total global memory: "<<devProp.totalGlobalMem/((float)1024*1024)<<" MB"<<endl; 
    cout<<"\t Number of cores: " << cudaCoresPerMP*devProp.multiProcessorCount << endl;
    cout<<"\t Rev.: "<<devProp.major<<"."<<devProp.minor<<endl;
    cout<<"\t Precision (size of GDouble): " << sizeof(GDouble) << " bytes" << endl;	
    cout<<"##################################################\n\n";
    ///////END OF CUDA INITIALIZATION
    m_cudaDisplay = true;
  }
  
  if( ( devProp.major == 1 ) && devProp.minor < 3 ){
    
    // double precision operations require compute capability 1.3 or
    // higher hardware; otherwise GDouble must be single precision
    assert( sizeof( GDouble ) <= 4 );
  }
  
	cudaError_t cerrKernel=cudaGetLastError();
	if( cerrKernel != cudaSuccess  ){
    
		cout<<"\n\nDEVICE INIT ERROR: "<< cudaGetErrorString(cerrKernel) << endl;  
    assert( false );
  }
}
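
#ifdef USE_MPI
// A minimal sketch of the "local rank" assignment suggested in the comment
// above, assuming host names are unique per node.  This helper is purely
// illustrative and is not called anywhere in this file: it gathers every
// rank's host name and counts how many lower ranks share this host.
static int localRankOnNode()
{
  char name[MPI_MAX_PROCESSOR_NAME];
  int len, rank, size;
  MPI_Get_processor_name( name, &len );
  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
  MPI_Comm_size( MPI_COMM_WORLD, &size );

  char* allNames = new char[size * MPI_MAX_PROCESSOR_NAME];
  MPI_Allgather( name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
                 allNames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
                 MPI_COMM_WORLD );

  // ranks with the same host name and a lower global rank come first
  int localRank = 0;
  for( int i = 0; i < rank; ++i )
    if( strcmp( &allNames[i * MPI_MAX_PROCESSOR_NAME], name ) == 0 )
      ++localRank;

  delete[] allNames;
  return localRank;
}
#endif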

// delegate to the default constructor (C++11) so that the CUDA device
// setup above runs before init() is called
GPUManager::GPUManager( const AmpVecs& a ) :
  GPUManager()
{
	init( a );
}

GPUManager::~GPUManager()
{
	clearAll();
}


// Initialization routines:

void 
GPUManager::init( const AmpVecs& a, bool ampCalcOnly )
{
  clearAll();
  
  m_ampCalcOnly = ampCalcOnly;
  
  // copy over some info from the AmpVecs object for array dimensions
	m_iNTrueEvents = a.m_iNTrueEvents;
	m_iNEvents     = a.m_iNEvents;
  m_iNParticles  = a.m_iNParticles;
	m_iNAmps       = a.m_iNAmps;
  
  // the rest of the data are derived:
	m_iEventArrSize     = sizeof(GDouble) * m_iNEvents;
  m_iTrueEventArrSize = sizeof(GDouble) * m_iNTrueEvents;

	// size of upper half of AiAj* matrix
  m_iNAmpsH = m_iNAmps * ( m_iNAmps + 1 ) / 2;
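  // (e.g. with m_iNAmps = 3 the packed upper half holds six elements,
  // with element (i,j), j <= i, stored at index i*(i+1)/2 + j)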
  
  // size needed to store amplitudes for each event
	m_iAmpArrSize     = sizeof(GDouble) * m_iNEvents * m_iNAmpsH;

	// size of upper half of ViVj* matrix
  m_iVArrSize = sizeof(GDouble) * m_iNAmpsH;
  
  // save memory by not allocating it if we are only using the GPU
  // to do amplitude calculations (as in normalization integrals)
  if( !m_ampCalcOnly ){
  
    // host memory needed for intensity calculation
    cudaMallocHost( (void**)&m_pfAmpRe , m_iAmpArrSize   );
    cudaMallocHost( (void**)&m_pfAmpIm , m_iAmpArrSize   );
    cudaMallocHost( (void**)&m_pfVRe   , m_iVArrSize     );
    cudaMallocHost( (void**)&m_pfVIm   , m_iVArrSize     );
    cudaMallocHost( (void**)&m_pfRes   , m_iEventArrSize );

    // device memory needed for intensity calculation and sum
    cudaMalloc( (void**)&m_pfDevAmpRe   , m_iAmpArrSize                       );
    cudaMalloc( (void**)&m_pfDevAmpIm   , m_iAmpArrSize                       );
    cudaMalloc( (void**)&m_pfDevVRe     , m_iVArrSize                         );
    cudaMalloc( (void**)&m_pfDevVIm     , m_iVArrSize                         );
    cudaMalloc( (void**)&m_pfDevWeights , m_iEventArrSize                     );
    cudaMalloc( (void**)&m_pfDevRes     , m_iEventArrSize                     );
    cudaMalloc( (void**)&m_pfDevREDUCE  , m_iEventArrSize                     );
  }
  
  
  // allocate device memory needed for amplitude calculations
  cudaMalloc( (void**)&m_pfDevData    , 4 * m_iNParticles * m_iEventArrSize );
  cudaMalloc( (void**)&m_pcDevCalcAmp , 2 * m_iEventArrSize                 );
  cudaMalloc( (void**)&m_piDevPerm    , m_iNParticles * sizeof( int )       );
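  // (m_pfDevData holds one four-vector -- 4 GDoubles -- per particle per
  // event; m_pcDevCalcAmp holds one complex number -- 2 GDoubles -- per
  // event and is cast to WCUComplex* in the amplitude kernels below)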
	
	cudaError_t cerrKernel = cudaGetLastError();
	if( cerrKernel!= cudaSuccess  ){
        
    // an error here probably means we don't have enough
    // RAM in the GPU for the number of events and amplitudes
    // we plan to work with
    
		cout << "\n\nMEMORY ALLOCATION ERROR: " 
         << cudaGetErrorString( cerrKernel ) << endl;
    
    assert( false );
  }
    
  cout << "GPU memory allocated for " << m_iNAmps << " amplitudes and "
       << m_iNEvents << " events (" << m_iNTrueEvents << " actual events)"
       << endl;
  
  // check for errors
  cerrKernel = cudaGetLastError();
	if( cerrKernel!= cudaSuccess  ){
    
		cout << "\nError initializing constant memory: " 
         << cudaGetErrorString( cerrKernel ) 
         << endl;
    assert( false );
  }
  
	//CUDA Dims
	calcCUDADims();
}


void
GPUManager::copyDataToGPU( const AmpVecs& a )
{		
  
  // make sure AmpVecs has been loaded with data
  assert( a.m_pdData );
    
  // copy the data into the device
  cudaMemcpy( m_pfDevData, a.m_pdData, 4 * m_iNParticles * m_iEventArrSize, 
              cudaMemcpyHostToDevice );
   
  // check for errors
 	cudaError_t cerrKernel = cudaGetLastError();
	if( cerrKernel!= cudaSuccess  ){
    
		cout << "\nError copying data to device: " << cudaGetErrorString( cerrKernel ) 
         << endl;
    assert( false );
  }
}


void
GPUManager::copyAmpsToGPU( const AmpVecs& a )
{
	if(!m_pfAmpRe) {
    
		cout << "GPUManager::copyAmpsToGPU called without initialization, or this\n" 
         << "instance of GPUManager is for amplitude calculation only." << endl;
    assert( false );
	}
	   
	unsigned int i,j,iEvent;
	for( iEvent = 0; iEvent < m_iNTrueEvents; iEvent++ )
	{
		//Saving only the upper half of the AiAj*
		for( i = 0; i < m_iNAmps; i++ )
			for( j = 0; j <= i; j++ )
			{
				m_pfAmpRe[iEvent+m_iNEvents*(i*(i+1)/2+j)] = 
          a.m_pdAmps[2*m_iNEvents*i+2*iEvent]   * a.m_pdAmps[2*m_iNEvents*j+2*iEvent] + 
          a.m_pdAmps[2*m_iNEvents*i+2*iEvent+1] * a.m_pdAmps[2*m_iNEvents*j+2*iEvent+1];
				
				m_pfAmpIm[iEvent+m_iNEvents*(i*(i+1)/2+j)] = 
          -a.m_pdAmps[2*m_iNEvents*i+2*iEvent]   * a.m_pdAmps[2*m_iNEvents*j+2*iEvent+1] + 
           a.m_pdAmps[2*m_iNEvents*i+2*iEvent+1] * a.m_pdAmps[2*m_iNEvents*j+2*iEvent];
				
				//Doubling the off-diagonal elements to sum over only upper triangle
				if(j !=i )
				{
					m_pfAmpRe[iEvent+m_iNEvents*(i*(i+1)/2+j)]*=2.;
					m_pfAmpIm[iEvent+m_iNEvents*(i*(i+1)/2+j)]*=2.;
				}	
			}
	}
  
	//Now pad the remaining events so that there are no NaNs in the log
	for( ; iEvent < m_iNEvents; iEvent++ ) {
		for( i = 0; i < m_iNAmps; i++ ) {
			for( j = 0; j <= i; j++ ) {
				m_pfAmpRe[iEvent+m_iNEvents*(i*(i+1)/2+j)] = 1.;
				m_pfAmpIm[iEvent+m_iNEvents*(i*(i+1)/2+j)] = 0.;
      }
    }
  }

  
  /* // useful block for debugging:
  for( iEvent = 0; iEvent < m_iNEvents; iEvent++ ){
    
    cout << "Event " << iEvent << endl;
    for( i = 0; i < m_iNAmps; i++ ){
      
      cout << "Amp " << i << ":\t";
			for( j = 0; j <= i; j++ ){
			
        cout << "(" <<
        m_pfAmpRe[iEvent+m_iNEvents*(i*(i+1)/2+j)]
        << ", " <<
				m_pfAmpIm[iEvent+m_iNEvents*(i*(i+1)/2+j)]
        << ")\t";
      }
      cout << endl;
    }
  }
   */
   
	cudaMemcpy( m_pfDevAmpRe, m_pfAmpRe, m_iAmpArrSize, cudaMemcpyHostToDevice );
	cudaMemcpy( m_pfDevAmpIm, m_pfAmpIm, m_iAmpArrSize, cudaMemcpyHostToDevice );

  // copy the weights to the GPU
  cudaMemcpy( m_pfDevWeights, a.m_pdWeights, m_iEventArrSize, cudaMemcpyHostToDevice );
  
  cudaError_t cerrKernel = cudaGetLastError();
	if( cerrKernel!= cudaSuccess  ){
    
		cout << "\nError copying amplitudes or weights to GPU: " 
    << cudaGetErrorString(cerrKernel) << endl;
    assert( false );
  }
  
}

void
GPUManager::setParamPtrs(  const vector< const complex< double >* >& pvpFitPars )
{
	m_vpFitPars = pvpFitPars;
}

void
GPUManager::setCoherenceMatrix( const vector< vector < bool > >& cohMtx ){
  
  m_vbSumCoherently = cohMtx;
}

void 
GPUManager::calcAmplitudeAll( const Amplitude* amp, GDouble* pcResAmp, 
                              const vector< vector< int > >* pvPermutations )
{
  
	dim3 dimBlock( m_iDimThreadX, m_iDimThreadY );
	dim3 dimGrid( m_iDimGridX, m_iDimGridY );

  // do the computation for all events for each permutation in the
  // vector of permutations
  vector< vector< int > >::const_iterator permItr = pvPermutations->begin();

  // if this is not true, AmplitudeManager hasn't been setup properly
  assert( permItr->size() == m_iNParticles );
  
  int permOffset = 0;
  for( ; permItr != pvPermutations->end(); ++permItr ){
    
    // copy the permutation to global memory
    cudaMemcpy( m_piDevPerm, &((*permItr)[0]), m_iNParticles * sizeof( int ),
                cudaMemcpyHostToDevice );

    // check for errors
    cudaError_t cerrPreKernel = cudaGetLastError();
    if( cerrPreKernel != cudaSuccess  ){
      cout << "\nError copying permutation to GPU: " 
           << cudaGetErrorString( cerrPreKernel ) << endl;
      assert( false );
    }
    
    // calculate amplitude factor for all events --
    // casting amp array to WCUComplex for a single 8- or 16-byte write 
    // of both the real and imaginary parts at once
    
    amp->calcAmplitudeGPU( dimGrid, dimBlock, m_pfDevData, 
                          (WCUComplex*)m_pcDevCalcAmp, 
                          m_piDevPerm, m_iNParticles, m_iNEvents,
                          *permItr );

    cudaThreadSynchronize();    

    // check to be sure kernel execution was OK
    cudaError_t cerrKernel=cudaGetLastError();
    if( cerrKernel!= cudaSuccess  ){
      
      cout << "\nKERNEL LAUNCH ERROR [" << amp->name() << "]: " 
           << cudaGetErrorString( cerrKernel ) << endl;
      assert( false );
    }
    
    // now copy the result out of the GPU into the correct place in the
    // pcResAmp array for this particular permutation
    cudaMemcpy( &(pcResAmp[permOffset]), m_pcDevCalcAmp, 2 * m_iEventArrSize, 
               cudaMemcpyDeviceToHost);
    
    cerrKernel = cudaGetLastError();
    if( cerrKernel!= cudaSuccess  ){
      
      cout << "\nError copying amplitudes from GPU [" << amp->name() << "]: " 
      << cudaGetErrorString(cerrKernel) << endl;
      assert( false );
    }
    
    // increment the offset so that we place the computation for the
    // next permutation after the previous in pcResAmp
    permOffset += 2 * m_iNEvents;
  }    
}

double GPUManager::calcSumLogIntensity()
{
  
  // be sure memory has been allocated for intensity computation
  assert( !m_ampCalcOnly );
  
	unsigned int i,j;
  
  // precompute the real and imaginary parts of ViVj* and copy to 
  // GPU global memory
	complex< double > cdFij;
	for( i = 0; i< m_iNAmps; i++) {
		for( j = 0; j <= i; j++ ) {
      
			cdFij = (*(m_vpFitPars[i])) * conj(*(m_vpFitPars[j]));
      
      // here is the transition from double -> GDouble
			m_pfVRe[i*(i+1)/2+j] = 
        ( m_vbSumCoherently[i][j] ? static_cast< GDouble >( cdFij.real() ) : 0 );
			m_pfVIm[i*(i+1)/2+j] = 
        ( m_vbSumCoherently[i][j] ? static_cast< GDouble >( cdFij.imag() ) : 0 );
    }
  }
	  
  //Init global memory on GPU
	cudaMemcpyToSymbol( "da_pfDevVRe" , m_pfVRe     , m_iVArrSize );
	cudaMemcpyToSymbol( "da_pfDevVIm" , m_pfVIm     , m_iVArrSize );
	cudaMemcpyToSymbol( "da_iNAmpsH"  , &m_iNAmpsH  , sizeof(int) );
	cudaMemcpyToSymbol( "da_iNEvents" , &m_iNEvents , sizeof(int) );

  cudaError_t cerrPreKernel=cudaGetLastError();
  if( cerrPreKernel != cudaSuccess  ){
    cout << "\nError copying weights or parameters to GPU: " 
    << cudaGetErrorString( cerrPreKernel ) << endl;
    assert( false );
  }
  
  // compute the intensities
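  // in effect the kernel evaluates, for each event k, the weighted log
  // intensity  w_k * log( sum_{i>=j} [ VRe_ij*ARe_ij(k) - VIm_ij*AIm_ij(k) ] ),
  // which, with the off-diagonal doubling applied in copyAmpsToGPU,
  // equals  w_k * log| sum_i V_i A_i(k) |^2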
	dim3 dimBlock( m_iDimThreadX, m_iDimThreadY );
	dim3 dimGrid( m_iDimGridX, m_iDimGridY );
	GPU_ExecAmpKernel( dimGrid, dimBlock, m_pfDevAmpRe, m_pfDevAmpIm, 
                     m_pfDevWeights, m_pfDevRes );  
  
	// Now the summation of the results	-- do this on the CPU for small
  // numbers of events or cases where double precision GPU is not enabled
	double dGPUResult = 0;
	if( m_iNTrueEvents <= m_iNBlocks || sizeof( GDouble ) <= 4 )
	{
		cudaMemcpy(m_pfRes,m_pfDevRes,m_iTrueEventArrSize,cudaMemcpyDeviceToHost);
		for(i=0; i<m_iNTrueEvents; i++)
			dGPUResult += m_pfRes[i];
	}
	else
	{
		cudaThreadSynchronize();		
		//Zero out the padding so as not to alter the results of the reduction
		cudaMemset(m_pfDevRes+m_iNTrueEvents,0,sizeof(GDouble)*(m_iNEvents-m_iNTrueEvents));
		int whichKernel = 6;   		
		// launch the reduction kernel, leaving one partial sum per block
		reduce<GDouble>(m_iNEvents, m_iNThreads, m_iNBlocks, whichKernel, m_pfDevRes, m_pfDevREDUCE);
		// copy result from device to host
		cudaMemcpy( m_pfRes, m_pfDevREDUCE, m_iNBlocks*sizeof(GDouble), cudaMemcpyDeviceToHost);
		for(i=0; i<m_iNBlocks; i++) 
			dGPUResult += m_pfRes[i];
	}
	return dGPUResult;
}

// Methods to clear memory:

void GPUManager::clearAll()
{
	clearAmpCalc();
  
	if( !m_ampCalcOnly ) clearLikeCalc();
}

void GPUManager::clearAmpCalc()
{
	m_iNParticles=0;
	m_iNEvents=0;
	m_iNTrueEvents=0;
	m_iEventArrSize=0;
	m_iTrueEventArrSize=0;
		
	//Device Memory	
	if(m_pfDevData)
		cudaFree(m_pfDevData);
	m_pfDevData=0;
	
	if(m_pcDevCalcAmp)
		cudaFree(m_pcDevCalcAmp);
	m_pcDevCalcAmp=0;
  
  if(m_piDevPerm)
    cudaFree(m_piDevPerm);
  m_piDevPerm=0;
	
}

void GPUManager::clearLikeCalc()
{
	m_iNEvents=0;
	m_iNTrueEvents=0;
	m_iNAmps=0;
	m_iNAmpsH=0;

	m_iEventArrSize=0;
  m_iTrueEventArrSize=0;
	m_iAmpArrSize=0;
	m_iVArrSize=0;
	
	//Host Memory
	
	//Allocated pointers
	if(m_pfAmpRe)
		cudaFreeHost(m_pfAmpRe);
	m_pfAmpRe=0;
	
	if(m_pfAmpIm)
		cudaFreeHost(m_pfAmpIm);
	m_pfAmpIm=0;
	
	if(m_pfVRe)
		cudaFreeHost(m_pfVRe);
	m_pfVRe=0;
	
	if(m_pfVIm)
		cudaFreeHost(m_pfVIm);
	m_pfVIm=0;
	  
	if(m_pfRes)
		cudaFreeHost(m_pfRes);
	m_pfRes=0;
	
	//Device Memory	
		
	if(m_pfDevAmpRe)
		cudaFree(m_pfDevAmpRe);
	m_pfDevAmpRe=0;
	
	if(m_pfDevAmpIm)
		cudaFree(m_pfDevAmpIm);
	m_pfDevAmpIm=0;
	
	if(m_pfDevVRe)
		cudaFree(m_pfDevVRe);
	m_pfDevVRe=0;
	
	if(m_pfDevVIm)
		cudaFree(m_pfDevVIm);
	m_pfDevVIm=0;
	
  if(m_pfDevWeights)
    cudaFree(m_pfDevWeights);
  m_pfDevWeights=0;
  
	if(m_pfDevRes)
		cudaFree(m_pfDevRes);
	m_pfDevRes=0;
	
	if(m_pfDevREDUCE)
		cudaFree(m_pfDevREDUCE);
	m_pfDevREDUCE=0;
	
	//CUDA Thread and Grid sizes
	m_iDimGridX=0;
	m_iDimGridY=0;
	m_iDimThreadX=0;
	m_iDimThreadY=0;
}

// Internal utilities:
void GPUManager::calcCUDADims()
{
	if(m_iNEvents<1)
		return;
	
	m_iDimThreadX=GPU_BLOCK_SIZE_X;
	m_iDimThreadY=GPU_BLOCK_SIZE_Y;	
	
	unsigned int iBlockSizeSq=GPU_BLOCK_SIZE_SQ;
	unsigned int iNBlocks=m_iNEvents/iBlockSizeSq;
	if(iNBlocks<=1)
	{
		m_iDimGridX=1;
		m_iDimGridY=1;
	}
	else
	{
		unsigned int iDivLo=1,iDivHi=iNBlocks;
		for(iDivLo=static_cast<int>(sqrt(iNBlocks));iDivLo>=1;iDivLo--)
		{
			iDivHi=iNBlocks/iDivLo;
			if(iDivLo*iDivHi==iNBlocks)
				break;
		}
		m_iDimGridX=iDivLo;
		m_iDimGridY=iDivHi;
	}
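	// (for illustration: iNBlocks = 48 exits the loop above with iDivLo = 6
	// and iDivHi = 8, giving a 6 x 8 grid of blocks)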

	// cout<<"\tThread dimensions:  ("<<m_iDimThreadX<<","<<m_iDimThreadY<<")\n";
	// cout<<"\tGrid dimensions:  ("<<m_iDimGridX<<","<<m_iDimGridY<<")\n";
	
	//Reduction Parameters
//	unsigned int maxThreads = 256;  // number of threads per block
//	unsigned int maxBlocks = 256;		
	unsigned int maxThreads = 1024;  // number of threads per block
	unsigned int maxBlocks = 1024;		
	
  if (m_iNEvents == 1) 
		m_iNThreads = 1;
	else
		m_iNThreads = (m_iNEvents < maxThreads*2) ? m_iNEvents / 2 : maxThreads;
	
	m_iNBlocks = m_iNEvents / (m_iNThreads * 2);	
	m_iNBlocks = min(maxBlocks, m_iNBlocks);
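	// (each reduction thread first sums a pair of elements, hence the
	// factor of 2 in the block count; this sizing appears to follow the
	// block/thread choices used with kernel 6 of the SDK reduction example)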
	
	 cout<<"Reduction:\n";
	 cout<<"\tNumber of threads:  "<<m_iNThreads<<"\n";
         cout<<"\tNumber of blocks:   "<<m_iNBlocks<<"\n\n\n"<<flush;	
}

-------------- next part --------------
/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#ifndef SDK_HELPER_H
#define SDK_HELPER_H

// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to
    // determine the number of cores per SM)
    typedef struct {
       int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
       int Cores;
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] = 
    { { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
      { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
      { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
      { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
      { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
      { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
      { 0x30,192 }, // Kepler Generation (SM 3.0) GK10x class
      {   -1, -1 }
    };
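    // (e.g. a GTX 680 is SM 3.0 -> 0x30 -> 192 cores per SM; with its
    // 8 multiprocessors that reproduces the 1536 cores quoted above)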

    int index = 0;
    while (nGpuArchCoresPerSM[index].SM != -1) {
       if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
          return nGpuArchCoresPerSM[index].Cores;
       }	
       index++;
    }
    cerr << "MapSMtoCores undefined SM " << major << "." << minor
         << " is undefined (please update to the latest SDK)!" << endl;
    return -1;
}
// end of GPU Architecture definitions

#endif