diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cu index 9a11b45f2fb4b..5cc15e3576aae 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cu @@ -102,14 +102,14 @@ namespace gpu { template -GPUd() void printOnThread(const int tId, const char* str, Args... args) +GPUd() void printOnThread(const unsigned int tId, const char* str, Args... args) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { printf(str, args...); } } -GPUd() void printVectorOnThread(const char* name, Vector& vector, size_t size, const int tId = 0) +GPUd() void printVectorOnThread(const char* name, Vector& vector, size_t size, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { printf("vector %s :", name); @@ -120,7 +120,7 @@ GPUd() void printVectorOnThread(const char* name, Vector& vector, size_t si } } -GPUg() void printVectorKernel(DeviceStoreVertexerGPU& store, const int threadId) +GPUg() void printVectorKernel(DeviceStoreVertexerGPU& store, const unsigned int threadId) { if (blockIdx.x * blockDim.x + threadIdx.x == threadId) { for (int i{0}; i < store.getConfig().histConf.nBinsXYZ[0] - 1; ++i) { @@ -138,7 +138,7 @@ GPUg() void printVectorKernel(DeviceStoreVertexerGPU& store, const int threadId) } } -GPUg() void dumpMaximaKernel(DeviceStoreVertexerGPU& store, const int threadId) +GPUg() void dumpMaximaKernel(DeviceStoreVertexerGPU& store, const unsigned int threadId) { if (blockIdx.x * blockDim.x + threadIdx.x == threadId) { printf("XmaxBin: %d at index: %d | YmaxBin: %d at index: %d | ZmaxBin: %d at index: %d\n", @@ -232,8 +232,8 @@ GPUg() void trackletSelectionKernel( GPUg() void computeCentroidsKernel(DeviceStoreVertexerGPU& store, const float pairCut) { - const int nLines = store.getNExclusiveFoundLines()[store.getClusters()[1].size() - 1] + store.getNFoundLines()[store.getClusters()[1].size() - 
1]; - const int maxIterations{nLines * (nLines - 1) / 2}; + const size_t nLines = store.getNExclusiveFoundLines()[store.getClusters()[1].size() - 1] + store.getNFoundLines()[store.getClusters()[1].size() - 1]; + const size_t maxIterations{nLines * (nLines - 1) / 2}; for (size_t currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < maxIterations; currentThreadIndex += blockDim.x * gridDim.x) { int iFirstLine = currentThreadIndex / nLines; int iSecondLine = currentThreadIndex % nLines; diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ClusterLines.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ClusterLines.h index b843a6b7383fa..12709ccb97b8c 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ClusterLines.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ClusterLines.h @@ -183,7 +183,7 @@ GPUhdi() void Line::getDCAComponents(const Line& line, const float point[3], flo inline bool Line::operator==(const Line& rhs) const { - bool val; + bool val{true}; for (int i{0}; i < 3; ++i) { val &= this->originPoint[i] == rhs.originPoint[i]; } diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h index 7b9b6986dc10b..859e85a9e6242 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h @@ -81,7 +81,7 @@ class VertexerTraits { public: VertexerTraits() = default; - ~VertexerTraits() = default; + virtual ~VertexerTraits() = default; GPUhd() static constexpr int4 getEmptyBinsRect() { diff --git a/GPU/GPUbenchmark/Shared/Kernels.h b/GPU/GPUbenchmark/Shared/Kernels.h index 9723f0584035e..b16cfcde162ee 100644 --- a/GPU/GPUbenchmark/Shared/Kernels.h +++ b/GPU/GPUbenchmark/Shared/Kernels.h @@ -63,7 +63,7 @@ class GPUbenchmark final float runDistributed(void (*kernel)(chunk_t**, size_t*, T...), std::vector>& chunkRanges, int
nLaunches, - int nBlocks, + size_t nBlocks, int nThreads, T&... args); diff --git a/GPU/GPUbenchmark/cuda/Kernels.cu b/GPU/GPUbenchmark/cuda/Kernels.cu index 44816f2e28d1d..8c2bddc7634ae 100644 --- a/GPU/GPUbenchmark/cuda/Kernels.cu +++ b/GPU/GPUbenchmark/cuda/Kernels.cu @@ -42,7 +42,7 @@ bool checkTestChunks(std::vector>& chunks, size_t availM bool check{false}; sort(chunks.begin(), chunks.end()); - for (auto iChunk{0}; iChunk < chunks.size(); ++iChunk) { // Check boundaries + for (size_t iChunk{0}; iChunk < chunks.size(); ++iChunk) { // Check boundaries if (chunks[iChunk].first + chunks[iChunk].second > availMemSizeGB) { check = false; break; @@ -465,20 +465,20 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, for (auto iStream{0}; iStream < dimStreams; ++iStream) { GPUCHECK(cudaStreamCreate(&(streams.at(iStream)))); // round-robin on stream pool } - for (auto iChunk{0}; iChunk < nChunks; ++iChunk) { + for (size_t iChunk{0}; iChunk < nChunks; ++iChunk) { GPUCHECK(cudaEventCreate(&(starts[iChunk]))); GPUCHECK(cudaEventCreate(&(stops[iChunk]))); } // Warm up on every chunk - for (auto iChunk{0}; iChunk < nChunks; ++iChunk) { + for (size_t iChunk{0}; iChunk < nChunks; ++iChunk) { auto& chunk = chunkRanges[iChunk]; chunk_t* chunkPtr = getCustomPtr(mState.scratchPtr, chunk.first); (*kernel)<<>>(chunkPtr, getBufferCapacity(chunk.second, mOptions.prime), args...); } auto start = std::chrono::high_resolution_clock::now(); - for (auto iChunk{0}; iChunk < nChunks; ++iChunk) { + for (size_t iChunk{0}; iChunk < nChunks; ++iChunk) { auto& chunk = chunkRanges[iChunk]; chunk_t* chunkPtr = getCustomPtr(mState.scratchPtr, chunk.first); GPUCHECK(cudaEventRecord(starts[iChunk], streams[iChunk % dimStreams])); @@ -488,7 +488,7 @@ std::vector GPUbenchmark::runConcurrent(void (*kernel)(chunk_t*, GPUCHECK(cudaEventRecord(stops[iChunk], streams[iChunk % dimStreams])); } - for (auto iChunk{0}; iChunk < nChunks; ++iChunk) { + for (size_t iChunk{0}; iChunk < nChunks; 
++iChunk) { GPUCHECK(cudaEventSynchronize(stops[iChunk])); GPUCHECK(cudaEventElapsedTime(&(results.at(iChunk)), starts[iChunk], stops[iChunk])); GPUCHECK(cudaEventDestroy(starts[iChunk])); @@ -512,7 +512,7 @@ template float GPUbenchmark::runDistributed(void (*kernel)(chunk_t**, size_t*, T...), std::vector>& chunkRanges, int nLaunches, - int nBlocks, + size_t nBlocks, int nThreads, T&... args) { @@ -521,7 +521,7 @@ float GPUbenchmark::runDistributed(void (*kernel)(chunk_t**, size_t*, T std::vector perBlockCapacity(nBlocks); // Capacity of sub-buffer for block float totChunkGB{0.f}; - int totComputedBlocks{0}; + size_t totComputedBlocks{0}; for (size_t iChunk{0}; iChunk < chunkRanges.size(); ++iChunk) { chunkPtrs[iChunk] = getCustomPtr(mState.scratchPtr, chunkRanges[iChunk].first); @@ -666,10 +666,11 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) } nThreads *= mOptions.threadPoolFraction; - void (*kernel)(chunk_t*, size_t); - void (*kernel_distributed)(chunk_t**, size_t*); - void (*kernel_rand)(chunk_t*, size_t, int); - void (*kernel_rand_distributed)(chunk_t**, size_t*, int); + void (*kernel)(chunk_t*, size_t) = &gpu::read_k; // Initialising to a default value + void (*kernel_distributed)(chunk_t**, size_t*) = &gpu::read_dist_k; // Initialising to a default value + void (*kernel_rand)(chunk_t*, size_t, int) = &gpu::rand_read_k; // Initialising to a default value + void (*kernel_rand_distributed)(chunk_t**, size_t*, int) = &gpu::rand_read_dist_k; // Initialising to a default value + bool is_random{false}; if (mode != Mode::Distributed) { @@ -744,7 +745,7 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) if (!mOptions.raw) { std::cout << " │ - per chunk throughput:\n"; } - for (auto iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) { // loop over single chunks separately + for (size_t iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) { // loop over single chunks separately auto& chunk = 
mState.testChunks[iChunk]; float result{0.f}; if (!is_random) { @@ -793,7 +794,7 @@ void GPUbenchmark::runTest(Test test, Mode mode, KernelConfig config) mOptions.prime); } float sum{0}; - for (auto iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) { + for (size_t iChunk{0}; iChunk < mState.testChunks.size(); ++iChunk) { auto& chunk = mState.testChunks[iChunk]; float chunkSize = (float)getBufferCapacity(chunk.second, mOptions.prime) * sizeof(chunk_t) / (float)GB; auto throughput = computeThroughput(test, results[iChunk], chunkSize, mState.getNKernelLaunches());