From d1e320d9dd5c4186c1ce0acec5bc293c37a09b5e Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Sat, 14 Mar 2026 21:43:47 +0100 Subject: [PATCH 1/7] Write benchmark to CSV file --- GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 81 +++++++ GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Definitions/Parameters/GPUParameters.csv | 226 +++++++++--------- GPU/GPUTracking/Standalone/cmake/config.cmake | 6 +- dependencies/FindO2GPU.cmake | 10 +- 5 files changed, 205 insertions(+), 119 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 409c28b8bf328..8491974bda331 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -35,6 +35,8 @@ #include #include +#include +#include #ifndef _WIN32 #include @@ -212,6 +214,38 @@ int32_t GPUReconstructionCPU::ExitDevice() return 0; } +namespace { + void write_header(std::ostream& stream) { + stream << "type,count,name,kernel (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; + } + + struct Row { + std::string type = ""; + std::string name = ""; + double kernel_time = -1.0; + double cpu_time = -1.0; + double total_time = -1.0; + size_t memSize = 0; + uint32_t count = 0; + + void write(std::ostream& stream, uint32_t statNEvents) { + double scale = 1000000.0 / statNEvents; + stream << type << ","; + if (count != 0) stream << count; + stream << "," << name << "," << uint32_t(kernel_time * scale) << ","; + if (cpu_time != -1.0) stream << uint32_t(cpu_time * scale); + stream << ","; + if (cpu_time != -1.0 && total_time != -1.0) stream << uint32_t(cpu_time / total_time *100) / 100.0; + stream << ","; + if (total_time != -1.0) stream << uint32_t(total_time * scale); + stream << ","; + if (memSize != 0 && count != 0) stream << uint32_t(memSize / kernel_time * 1e-6) * 1e-3 << "," << memSize / statNEvents << "," << memSize / statNEvents / count; + else stream << ",,"; + stream << std::endl; + } + }; +} + int32_t GPUReconstructionCPU::RunChains() { mMemoryScalers->temporaryFactor = 1.; @@ -264,6 +298,16 @@ int32_t GPUReconstructionCPU::RunChains() double kernelTotal = 0; std::vector kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.); + std::ofstream benchmarkCSV; + if (!GetProcessingSettings().timingCSV.empty()) { + benchmarkCSV.open(GetProcessingSettings().timingCSV, std::ios::out | std::ios::app); + if (!benchmarkCSV.is_open()) { + GPUError("Could not open timing CSV file '%s' for writing", GetProcessingSettings().timingCSV.c_str()); + } else if (mNEventsProcessed == 1) { + write_header(benchmarkCSV); + } + } + if (GetProcessingSettings().debugLevel >= 1) { for (uint32_t i = 0; i < mTimers.size(); i++) { double time = 0; @@ -285,9 +329,16 @@ int32_t GPUReconstructionCPU::RunChains() kernelStepTimes[stepNum] += time; } char bandwidth[256] = ""; + Row task_row; + task_row.type = 'K'; + task_row.name = mTimers[i]->name.c_str(); + task_row.kernel_time = time; + task_row.count = mTimers[i]->count; if (mTimers[i]->memSize && mStatNEvents && time != 0.) { + task_row.memSize = mTimers[i]->memSize; snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count); } + if (benchmarkCSV.is_open()) task_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth); if (GetProcessingSettings().resetTimers) { mTimers[i]->count = 0; @@ -298,14 +349,34 @@ int32_t GPUReconstructionCPU::RunChains() if (GetProcessingSettings().recoTaskTiming) { for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) { if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) { + Row reco_step_row; + reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)"; + reco_step_row.kernel_time = kernelStepTimes[i]; + reco_step_row.cpu_time = mTimersRecoSteps[i].timerCPU; + reco_step_row.total_time = mTimersRecoSteps[i].timerTotal.GetElapsedTime(); + if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks", gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime()); } if (mTimersRecoSteps[i].bytesToGPU) { + Row reco_step_row; + reco_step_row.type = 'D'; + reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)"; + reco_step_row.kernel_time = mTimersRecoSteps[i].timerToGPU.GetElapsedTime(); + reco_step_row.memSize = mTimersRecoSteps[i].bytesToGPU; + reco_step_row.count = mTimersRecoSteps[i].countToGPU; + if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU); } if (mTimersRecoSteps[i].bytesToHost) { + Row reco_step_row; + reco_step_row.type = 'D'; + reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)"; + reco_step_row.kernel_time = mTimersRecoSteps[i].timerToHost.GetElapsedTime(); + reco_step_row.memSize = mTimersRecoSteps[i].bytesToHost; + reco_step_row.count = mTimersRecoSteps[i].countToHost; + if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost); } @@ -321,13 +392,23 @@ int32_t GPUReconstructionCPU::RunChains() } for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) { if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) { + Row general_step_row; + general_step_row.name = gpudatatypes::GENERAL_STEP_NAMES[i]; + general_step_row.kernel_time = mTimersGeneralSteps[i].GetElapsedTime(); + if (benchmarkCSV.is_open()) general_step_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents); } } + Row wall_row; + wall_row.name = "Wall"; if (GetProcessingSettings().debugLevel >= 1) { + wall_row.kernel_time = kernelTotal; mStatKernelTime = kernelTotal * 1000000 / mStatNEvents; printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str()); } + wall_row.cpu_time = mStatCPUTime; + wall_row.total_time = mStatWallTime * mStatNEvents / 1000000; + if (benchmarkCSV.is_open()) wall_row.write(benchmarkCSV, mStatNEvents); printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str()); } else if (GetProcessingSettings().debugLevel >= 0) { GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str()); diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 57cb1371a4aa0..06c0d8f344af1 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -329,6 +329,7 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)") AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file") AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6") +AddOption(timingCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.") AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures") AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks") AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1)) diff --git a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv index fc27de72ea2f1..f240402acc19c 100644 --- a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv +++ b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv @@ -1,113 +1,113 @@ -Architecture,default,default_cpu,MI100,VEGA,TAHITI,TESLA,FERMI,PASCAL,KEPLER,AMPERE,TURING -,,,,,,,,,,, -CORE:,,,,,,,,,,, -WARP_SIZE,32,,64,64,32,32,32,32,32,32,32 -THREAD_COUNT_DEFAULT,256,,256,256,,,,,,512,512 -,,,,,,,,,,, -LB:,,,,,,,,,,, -GPUTPCCreateTrackingData,256,,"[256, 7]","[192, 2]",,,,,,384,256 -GPUTPCTrackletConstructor,256,,"[768, 8]","[512, 10]","[256, 2]","[256, 1]","[256, 2]","[1024, 2]","[512, 4]","[256, 2]","[256, 2]" -GPUTPCTrackletSelector,256,,"[384, 5]","[192, 10]","[256, 3]","[256, 1]","[256, 3]","[512, 4]","[256, 3]","[192, 3]","[192, 3]" -GPUTPCNeighboursFinder,256,,"[192, 8]","[960, 8]",256,256,256,512,256,"[640, 1]","[640, 1]" -GPUTPCNeighboursCleaner,256,,"[128, 5]","[384, 9]",256,256,256,256,256,512,512 -GPUTPCExtrapolationTracking,256,,"[256, 7]","[256, 2]",,,,,,"[128, 4]","[192, 2]" -GPUTRDTrackerKernels_gpuVersion,512,,,,,,,,,, -GPUTPCCreateOccupancyMap_fill,256,,,,,,,,,, -GPUTPCCreateOccupancyMap_fold,256,,,,,,,,,, -GPUTRDTrackerKernels_o2Version,512,,,,,,,,,, -GPUTPCCompressionKernels_step0attached,256,,"[128, 1]","[64, 2]",,,,,,"[64, 2]",128 -GPUTPCCompressionKernels_step1unattached,256,,"[512, 2]","[512, 2]",,,,,,"[512, 3]","[512, 2]" -GPUTPCDecompressionKernels_step0attached,256,,"[128, 2]","[128, 2]",,,,,,"[32, 1]","[32, 1]" -GPUTPCDecompressionKernels_step1unattached,256,,"[64, 2]","[64, 2]",,,,,,"[32, 1]","[32, 1]" -GPUTPCDecompressionUtilKernels_sortPerSectorRow,256,,,,,,,,,, -GPUTPCDecompressionUtilKernels_countFilteredClusters,256,,,,,,,,,, -GPUTPCDecompressionUtilKernels_storeFilteredClusters,256,,,,,,,,,, -GPUTPCCFDecodeZS,"[128, 4]",,"[64, 4]","[64, 1]",,,,,,"[64, 10]","[64, 8]" -GPUTPCCFDecodeZSLink,"""GPUCA_WARP_SIZE""",,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" -GPUTPCCFDecodeZSDenseLink,"""GPUCA_WARP_SIZE""",,"[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 14]",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" -GPUTPCCFGather,"[1024, 1]",,"[1024, 5]","[1024, 1]",,,,,,"[1024, 1]","[1024, 1]" -COMPRESSION_GATHER,1024,,1024,1024,,,,,,1024,1024 -GPUTPCGMMergerTrackFit,256,,"[192, 2]","[64, 7]",,,,,,"[64, 4]","[32, 8]" -GPUTPCGMMergerFollowLoopers,256,,"[256, 5]","[256, 4]",,,,,,"[64, 12]","[128, 4]" -GPUTPCGMMergerSectorRefit,256,,"[64, 4]","[256, 2]",,,,,,"[32, 6]","[64, 5]" -GPUTPCGMMergerUnpackResetIds,256,,256,256,,,,,,256,256 -GPUTPCGMMergerUnpackGlobal,256,,256,256,,,,,,256,256 -GPUTPCGMMergerResolve_step0,256,,512,256,,,,,,256,256 -GPUTPCGMMergerResolve_step1,256,,512,256,,,,,,256,256 -GPUTPCGMMergerResolve_step2,256,,512,256,,,,,,256,256 -GPUTPCGMMergerResolve_step3,256,,512,256,,,,,,256,256 -GPUTPCGMMergerResolve_step4,256,,512,256,,,,,,"[256, 4]","[256, 4]" -GPUTPCGMMergerClearLinks,256,,256,256,,,,,,256,256 -GPUTPCGMMergerMergeWithinPrepare,256,,256,256,,,,,,256,256 -GPUTPCGMMergerMergeSectorsPrepare,256,,256,256,,,,,,"[256, 2]","[256, 2]" -GPUTPCGMMergerMergeBorders_step0,256,,512,256,,,,,,192,192 -GPUTPCGMMergerMergeBorders_step2,256,,512,256,,,,,,"[64, 2]",256 -GPUTPCGMMergerMergeCE,256,,512,256,,,,,,256,256 -GPUTPCGMMergerLinkExtrapolatedTracks,256,,256,256,,,,,,256,256 -GPUTPCGMMergerCollect,256,,"[768, 1]","[1024, 1]",,,,,,"[256, 2]","[128, 2]" -GPUTPCGMMergerSortTracksPrepare,256,,256,256,,,,,,256,256 -GPUTPCGMMergerPrepareForFit_step0,256,,256,256,,,,,,256,256 -GPUTPCGMMergerPrepareForFit_step1,256,,256,256,,,,,,256,256 -GPUTPCGMMergerPrepareForFit_step2,256,,256,256,,,,,,256,256 -GPUTPCGMMergerFinalize_step0,256,,,256,,,,,,, -GPUTPCGMMergerFinalize_step1,256,,,256,,,,,,, -GPUTPCGMMergerFinalize_step2,256,,,256,,,,,,, -GPUTPCGMMergerMergeLoopers_step0,256,,,,,,,,,, -GPUTPCGMMergerMergeLoopers_step1,256,,,,,,,,,, -GPUTPCGMMergerMergeLoopers_step2,256,,,,,,,,,, -GPUTPCGMO2Output_prepare,256,,,,,,,,,, -GPUTPCGMO2Output_output,256,,,,,,,,,, -GPUTPCStartHitsFinder,256,,"[1024, 2]","[1024, 7]",256,256,256,256,256,512,512 -GPUTPCStartHitsSorter,256,,"[1024, 5]","[512, 7]",256,256,256,256,256,"[512, 1]","[512, 1]" -GPUTPCCFCheckPadBaseline,576,,"[576, 2]","[576, 2]",,,,,,"[576, 2]", -GPUTPCCFChargeMapFiller_fillIndexMap,512,,512,512,,,,,,448, -GPUTPCCFChargeMapFiller_fillFromDigits,512,,512,512,,,,,,448, -GPUTPCCFChargeMapFiller_findFragmentStart,512,,512,512,,,,,,448, -GPUTPCCFPeakFinder,512,,"[512, 9]","[512, 4]",,,,,,128, -GPUTPCCFNoiseSuppression,512,,512,512,,,,,,448, -GPUTPCCFDeconvolution,512,,"[512, 5]","[512, 5]",,,,,,384, -GPUTPCCFClusterizer,512,,"[448, 3]","[512, 2]",,,,,,448, -GPUTPCNNClusterizerKernels,512,,,,,,,,,, -GPUTrackingRefitKernel_mode0asGPU,256,,,,,,,,,, -GPUTrackingRefitKernel_mode1asTrackParCov,256,,,,,,,,,, -GPUMemClean16,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,, -GPUitoa,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,, -GPUTPCCFNoiseSuppression_noiseSuppression,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,, -GPUTPCCFNoiseSuppression_updatePeaks,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,, -GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_fillInputNNCPU,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_fillInputNNGPU,1024,,,,,,,,,, -GPUTPCNNClusterizerKernels_determineClass1Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_determineClass2Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_publishClass1Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_publishClass2Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCNNClusterizerKernels_publishDeconvolutionFlags,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, -GPUTPCCFStreamCompaction_scanStart,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, -GPUTPCCFStreamCompaction_scanUp,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, -GPUTPCCFStreamCompaction_scanTop,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, -GPUTPCCFStreamCompaction_scanDown,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, -GPUTPCCFStreamCompaction_compactDigits,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, -GPUTPCCompressionGatherKernels_unbuffered,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered32,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered64,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered128,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, -GPUTPCCompressionGatherKernels_multiBlock,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, -GPUTPCGMMergerFinalize_0,256,,256,,,,,,,256,256 -GPUTPCGMMergerFinalize_1,256,,256,,,,,,,256,256 -GPUTPCGMMergerFinalize_2,256,,256,,,,,,,256,256 -,,,,,,,,,,, -PAR:,,,,,,,,,,, -AMD_EUS_PER_CU,0,0,4,4,,,,,,, -SORT_STARTHITS,1,0,,,,,,,,, -NEIGHBOURS_FINDER_MAX_NNEIGHUP,6,0,10,4,,,,,,4,4 -NEIGHBOURS_FINDER_UNROLL_GLOBAL,4,0,4,2,,,,,,, -NEIGHBOURS_FINDER_UNROLL_SHARED,1,0,0,0,,,,,,, -TRACKLET_SELECTOR_HITS_REG_SIZE,12,0,9,27,,,,,,20,20 -ALTERNATE_BORDER_SORT,0,0,1,1,,,,,,1,1 -SORT_BEFORE_FIT,0,0,1,1,,,,,,1,1 -NO_ATOMIC_PRECHECK,0,0,1,1,,,,,,1,1 -DEDX_STORAGE_TYPE,"""float""","""float""","""uint16_t""","""uint16_t""",,,,,,"""uint16_t""","""uint16_t""" -MERGER_INTERPOLATION_ERROR_TYPE,"""float""","""float""","""half""","""half""",,,,,,"""half""","""half""" -COMP_GATHER_KERNEL,0,0,4,4,,,,,,4,4 -COMP_GATHER_MODE,2,0,3,3,,,,,,3,3 -CF_SCAN_WORKGROUP_SIZE,512,0,,,,,,,,, +Architecture,default,default_cpu,MI100,MI210,RDNA3,VEGA,TAHITI,TESLA,FERMI,PASCAL,KEPLER,AMPERE,TURING,HOPPER +,,,,,,,,,,,,,, +CORE:,,,,,,,,,,,,,, +WARP_SIZE,32,,64,64,32,64,32,32,32,32,32,32,32,32 +THREAD_COUNT_DEFAULT,256,,256,256,256,256,,,,,,512,512,512 +,,,,,,,,,,,,,, +LB:,,,,,,,,,,,,,, +GPUTPCCreateTrackingData,256,,"[256, 7]","[256, 7]","[256, 7]","[192, 2]",,,,,,384,256,256 +GPUTPCTrackletConstructor,256,,"[768, 8]","[768, 8]","[768, 8]","[512, 10]","[256, 2]","[256, 1]","[256, 2]","[1024, 2]","[512, 4]","[256, 2]","[256, 2]","[256, 2]" +GPUTPCTrackletSelector,256,,"[384, 5]","[384, 5]","[384, 5]","[192, 10]","[256, 3]","[256, 1]","[256, 3]","[512, 4]","[256, 3]","[192, 3]","[192, 3]","[192, 3]" +GPUTPCNeighboursFinder,256,,"[192, 8]","[192, 8]","[192, 8]","[960, 8]",256,256,256,512,256,"[640, 1]","[640, 1]","[640, 1]" +GPUTPCNeighboursCleaner,256,,"[128, 5]","[128, 5]","[128, 5]","[384, 9]",256,256,256,256,256,512,512,512 +GPUTPCExtrapolationTracking,256,,"[256, 7]","[256, 7]","[256, 7]","[256, 2]",,,,,,"[128, 4]","[192, 2]","[192, 2]" +GPUTRDTrackerKernels_gpuVersion,512,,,,,,,,,,,,, +GPUTPCCreateOccupancyMap_fill,256,,,,,,,,,,,,, +GPUTPCCreateOccupancyMap_fold,256,,,,,,,,,,,,, +GPUTRDTrackerKernels_o2Version,512,,,,,,,,,,,,, +GPUTPCCompressionKernels_step0attached,256,,"[128, 1]","[128, 1]","[128, 1]","[64, 2]",,,,,,"[64, 2]",128,128 +GPUTPCCompressionKernels_step1unattached,256,,"[512, 2]","[512, 2]","[512, 2]","[512, 2]",,,,,,"[512, 3]","[512, 2]","[512, 2]" +GPUTPCDecompressionKernels_step0attached,256,,"[128, 2]","[128, 2]","[128, 2]","[128, 2]",,,,,,"[32, 1]","[32, 1]","[32, 1]" +GPUTPCDecompressionKernels_step1unattached,256,,"[64, 2]","[64, 2]","[64, 2]","[64, 2]",,,,,,"[32, 1]","[32, 1]","[32, 1]" +GPUTPCDecompressionUtilKernels_sortPerSectorRow,256,,,,,,,,,,,,, +GPUTPCDecompressionUtilKernels_countFilteredClusters,256,,,,,,,,,,,,, +GPUTPCDecompressionUtilKernels_storeFilteredClusters,256,,,,,,,,,,,,, +GPUTPCCFDecodeZS,"[128, 4]",,"[64, 4]","[64, 4]","[64, 4]","[64, 1]",,,,,,"[64, 10]","[64, 8]","[64, 8]" +GPUTPCCFDecodeZSLink,"""GPUCA_WARP_SIZE""",,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" +GPUTPCCFDecodeZSDenseLink,"""GPUCA_WARP_SIZE""",,"[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 14]",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" +GPUTPCCFGather,"[1024, 1]",,"[1024, 5]","[1024, 5]","[1024, 5]","[1024, 1]",,,,,,"[1024, 1]","[1024, 1]","[1024, 1]" +COMPRESSION_GATHER,1024,,1024,1024,1024,1024,,,,,,1024,1024,1024 +GPUTPCGMMergerTrackFit,256,,"[192, 2]","[192, 2]","[192, 2]","[64, 7]",,,,,,"[64, 4]","[32, 8]","[32, 8]" +GPUTPCGMMergerFollowLoopers,256,,"[256, 5]","[256, 5]","[256, 5]","[256, 4]",,,,,,"[64, 12]","[128, 4]","[128, 4]" +GPUTPCGMMergerSectorRefit,256,,"[64, 4]","[64, 4]","[64, 4]","[256, 2]",,,,,,"[32, 6]","[64, 5]","[64, 5]" +GPUTPCGMMergerUnpackResetIds,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerUnpackGlobal,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerResolve_step0,256,,512,512,512,256,,,,,,256,256,256 +GPUTPCGMMergerResolve_step1,256,,512,512,512,256,,,,,,256,256,256 +GPUTPCGMMergerResolve_step2,256,,512,512,512,256,,,,,,256,256,256 +GPUTPCGMMergerResolve_step3,256,,512,512,512,256,,,,,,256,256,256 +GPUTPCGMMergerResolve_step4,256,,512,512,512,256,,,,,,"[256, 4]","[256, 4]","[256, 4]" +GPUTPCGMMergerClearLinks,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerMergeWithinPrepare,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerMergeSectorsPrepare,256,,256,256,256,256,,,,,,"[256, 2]","[256, 2]","[256, 2]" +GPUTPCGMMergerMergeBorders_step0,256,,512,512,512,256,,,,,,192,192,192 +GPUTPCGMMergerMergeBorders_step2,256,,512,512,512,256,,,,,,"[64, 2]",256,256 +GPUTPCGMMergerMergeCE,256,,512,512,512,256,,,,,,256,256,256 +GPUTPCGMMergerLinkExtrapolatedTracks,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerCollect,256,,"[768, 1]","[768, 1]","[768, 1]","[1024, 1]",,,,,,"[256, 2]","[128, 2]","[128, 2]" +GPUTPCGMMergerSortTracksPrepare,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerPrepareForFit_step0,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerPrepareForFit_step1,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerPrepareForFit_step2,256,,256,256,256,256,,,,,,256,256,256 +GPUTPCGMMergerFinalize_step0,256,,,,,256,,,,,,,, +GPUTPCGMMergerFinalize_step1,256,,,,,256,,,,,,,, +GPUTPCGMMergerFinalize_step2,256,,,,,256,,,,,,,, +GPUTPCGMMergerMergeLoopers_step0,256,,,,,,,,,,,,, +GPUTPCGMMergerMergeLoopers_step1,256,,,,,,,,,,,,, +GPUTPCGMMergerMergeLoopers_step2,256,,,,,,,,,,,,, +GPUTPCGMO2Output_prepare,256,,,,,,,,,,,,, +GPUTPCGMO2Output_output,256,,,,,,,,,,,,, +GPUTPCStartHitsFinder,256,,"[1024, 2]","[1024, 2]","[1024, 2]","[1024, 7]",256,256,256,256,256,512,512,512 +GPUTPCStartHitsSorter,256,,"[1024, 5]","[1024, 5]","[1024, 5]","[512, 7]",256,256,256,256,256,"[512, 1]","[512, 1]","[512, 1]" +GPUTPCCFCheckPadBaseline,576,,"[576, 2]","[576, 2]","[576, 2]","[576, 2]",,,,,,"[576, 2]",, +GPUTPCCFChargeMapFiller_fillIndexMap,512,,512,512,512,512,,,,,,448,, +GPUTPCCFChargeMapFiller_fillFromDigits,512,,512,512,512,512,,,,,,448,, +GPUTPCCFChargeMapFiller_findFragmentStart,512,,512,512,512,512,,,,,,448,, +GPUTPCCFPeakFinder,512,,"[512, 9]","[512, 9]","[512, 9]","[512, 4]",,,,,,128,, +GPUTPCCFNoiseSuppression,512,,512,512,512,512,,,,,,448,, +GPUTPCCFDeconvolution,512,,"[512, 5]","[512, 5]","[512, 5]","[512, 5]",,,,,,384,, +GPUTPCCFClusterizer,512,,"[448, 3]","[448, 3]","[448, 3]","[512, 2]",,,,,,448,, +GPUTPCNNClusterizerKernels,512,,,,,,,,,,,,, +GPUTrackingRefitKernel_mode0asGPU,256,,,,,,,,,,,,, +GPUTrackingRefitKernel_mode1asTrackParCov,256,,,,,,,,,,,,, +GPUMemClean16,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,, +GPUitoa,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,, +GPUTPCCFNoiseSuppression_noiseSuppression,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,, +GPUTPCCFNoiseSuppression_updatePeaks,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_fillInputNNCPU,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_fillInputNNGPU,1024,,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_determineClass1Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_determineClass2Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_publishClass1Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_publishClass2Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCNNClusterizerKernels_publishDeconvolutionFlags,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, +GPUTPCCFStreamCompaction_scanStart,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, +GPUTPCCFStreamCompaction_scanUp,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, +GPUTPCCFStreamCompaction_scanTop,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, +GPUTPCCFStreamCompaction_scanDown,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, +GPUTPCCFStreamCompaction_compactDigits,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, +GPUTPCCompressionGatherKernels_unbuffered,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered32,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered64,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered128,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, +GPUTPCCompressionGatherKernels_multiBlock,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, +GPUTPCGMMergerFinalize_0,256,,256,256,256,,,,,,,256,256,256 +GPUTPCGMMergerFinalize_1,256,,256,256,256,,,,,,,256,256,256 +GPUTPCGMMergerFinalize_2,256,,256,256,256,,,,,,,256,256,256 +,,,,,,,,,,,,,, +PAR:,,,,,,,,,,,,,, +AMD_EUS_PER_CU,0,0,4,4,4,4,,,,,,,, +SORT_STARTHITS,1,0,,,,,,,,,,,, +NEIGHBOURS_FINDER_MAX_NNEIGHUP,6,0,10,10,10,4,,,,,,4,4,4 +NEIGHBOURS_FINDER_UNROLL_GLOBAL,4,0,4,4,4,2,,,,,,,, +NEIGHBOURS_FINDER_UNROLL_SHARED,1,0,0,0,0,0,,,,,,,, +TRACKLET_SELECTOR_HITS_REG_SIZE,12,0,9,9,9,27,,,,,,20,20,20 +ALTERNATE_BORDER_SORT,0,0,1,1,1,1,,,,,,1,1,1 +SORT_BEFORE_FIT,0,0,1,1,1,1,,,,,,1,1,1 +NO_ATOMIC_PRECHECK,0,0,1,1,1,1,,,,,,1,1,1 +DEDX_STORAGE_TYPE,"""float""","""float""","""uint16_t""","""uint16_t""","""uint16_t""","""uint16_t""",,,,,,"""uint16_t""","""uint16_t""","""uint16_t""" +MERGER_INTERPOLATION_ERROR_TYPE,"""float""","""float""","""half""","""half""","""half""","""half""",,,,,,"""half""","""half""","""half""" +COMP_GATHER_KERNEL,0,0,4,4,4,4,,,,,,4,4,4 +COMP_GATHER_MODE,2,0,3,3,3,3,,,,,,3,3,3 +CF_SCAN_WORKGROUP_SIZE,512,0,,,,,,,,,,,, diff --git a/GPU/GPUTracking/Standalone/cmake/config.cmake b/GPU/GPUTracking/Standalone/cmake/config.cmake index 9355311db617c..abdfc52c460e4 100644 --- a/GPU/GPUTracking/Standalone/cmake/config.cmake +++ b/GPU/GPUTracking/Standalone/cmake/config.cmake @@ -19,7 +19,7 @@ set(GPUCA_CONFIG_VC 1) set(GPUCA_CONFIG_FMT 1) set(GPUCA_CONFIG_ROOT 1) set(GPUCA_CONFIG_ONNX 0) -set(GPUCA_BUILD_EVENT_DISPLAY 1) +set(GPUCA_BUILD_EVENT_DISPLAY 0) set(GPUCA_BUILD_EVENT_DISPLAY_FREETYPE 1) set(GPUCA_BUILD_EVENT_DISPLAY_VULKAN 1) set(GPUCA_BUILD_EVENT_DISPLAY_WAYLAND 1) @@ -32,8 +32,8 @@ set(GPUCA_BUILD_DEBUG_HOSTONLY 0) set(GPUCA_DETERMINISTIC_MODE 0) # OFF / NO_FAST_MATH / OPTO2 / GPU / WHOLEO2 #set(GPUCA_CUDA_GCCBIN c++-14) #set(GPUCA_OPENCL_CLANGBIN clang-20) -set(HIP_AMDGPUTARGET "default") # "gfx906;gfx908;gfx90a" -set(CUDA_COMPUTETARGET "default") # 86 89 +set(HIP_AMDGPUTARGET "gfx1100") # "gfx906;gfx908;gfx90a" +#set(CUDA_COMPUTETARGET "default") # 86 89 #set(GPUCA_CUDA_COMPILE_MODE perkernel) # onefile / perkernel / rtc #set(GPUCA_HIP_COMPILE_MODE perkernel) #set(GPUCA_RTC_NO_COMPILED_KERNELS 1) diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index 3e8f012fea4b5..0aeae438b7187 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -52,7 +52,9 @@ function(detect_gpu_arch backend) # Detect GPU architecture, optionally filterri set(CUDA_FIRST_TARGET 86) message(STATUS "CUDA_COMPUTETARGET not set, defaulting CUDA optimization for architecture ${CUDA_FIRST_TARGET}") endif() - if(CUDA_FIRST_TARGET GREATER_EQUAL 86) + if(CUDA_FIRST_TARGET GREATER_EQUAL 89) + set(CUDA_TARGET HOPPER) + elseif(CUDA_FIRST_TARGET GREATER_EQUAL 86) set(CUDA_TARGET AMPERE) elseif(CUDA_FIRST_TARGET GREATER_EQUAL 75) set(CUDA_TARGET TURING) @@ -75,7 +77,9 @@ function(detect_gpu_arch backend) # Detect GPU architecture, optionally filterri endif() string(TOLOWER "${HIP_FIRST_TARGET}" HIP_FIRST_TARGET) string(REGEX MATCH "....$" HIP_FIRST_TARGET_PADDED "0000${HIP_FIRST_TARGET}") - if(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1000") + if(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1100") + set(HIP_TARGET RDNA3) + elseif(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1000") set(HIP_TARGET RDNA) elseif(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "090a") set(HIP_TARGET MI210) @@ -400,4 +404,4 @@ endif() set(O2GPU_FOUND TRUE) if (NOT GPUCA_FINDO2GPU_CHECK_ONLY) include("${CMAKE_CURRENT_LIST_DIR}/../GPU/GPUTracking/cmake/kernel_helpers.cmake") -endif() +endif() \ No newline at end of file From 61a9e88d4a0269c0b1d78dddbdf2e714257bd069 Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Tue, 17 Mar 2026 15:20:55 +0100 Subject: [PATCH 2/7] Write markdown style to terminal --- GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 155 +++++++++++------- 1 file changed, 98 insertions(+), 57 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 8491974bda331..6db667ea0886f 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -35,8 +35,10 @@ #include #include -#include #include +#include +#include + #ifndef _WIN32 #include @@ -215,35 +217,62 @@ int32_t GPUReconstructionCPU::ExitDevice() } namespace { - void write_header(std::ostream& stream) { - stream << "type,count,name,kernel (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; +void writeHeaderMarkdown(std::ostream& stream) { + stream << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; + stream << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; +} + +void writeHeaderCSV(std::ostream& stream) { + stream << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; +} + +struct Row { + char type = ' '; + std::string name; + uint32_t count = 0; + double gpu_time = -1.0; + double cpu_time = -1.0; + double total_time = -1.0; + uint32_t memSize = 0; + uint32_t statNEvents; + + void writeMarkdown(std::ostream& stream) { + double scale = 1000000.0 / statNEvents; + stream << "| " << type << " | "; + if (count != 0) stream << std::format("{:6} |", count); + else stream << " |"; + stream << std::format(" {:42}|", name); + if (gpu_time != -1.0) stream << std::format("{:10.0f} |", gpu_time * scale); + else stream << " |"; + if (cpu_time != -1.0) stream << std::format("{:10.0f} |", cpu_time * scale); + else stream << " |"; + if (cpu_time != -1.0 && total_time != -1.0) stream << std::format("{:8.2f} |", cpu_time / total_time); + else stream << " |"; + if (total_time != -1.0) stream << std::format("{:10.0f} |", total_time * scale); + else stream << " |"; + if (memSize != 0 && count != 0) stream << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); + else stream << " | | |"; + stream << std::endl; } - struct Row { - std::string type = ""; - std::string name = ""; - double kernel_time = -1.0; - double cpu_time = -1.0; - double total_time = -1.0; - size_t memSize = 0; - uint32_t count = 0; - - void write(std::ostream& stream, uint32_t statNEvents) { - double scale = 1000000.0 / statNEvents; - stream << type << ","; - if (count != 0) stream << count; - stream << "," << name << "," << uint32_t(kernel_time * scale) << ","; - if (cpu_time != -1.0) stream << uint32_t(cpu_time * scale); - stream << ","; - if (cpu_time != -1.0 && total_time != -1.0) stream << uint32_t(cpu_time / total_time *100) / 100.0; - stream << ","; - if (total_time != -1.0) stream << uint32_t(total_time * scale); - stream << ","; - if (memSize != 0 && count != 0) stream << uint32_t(memSize / kernel_time * 1e-6) * 1e-3 << "," << memSize / statNEvents << "," << memSize / statNEvents / count; - else stream << ",,"; - stream << std::endl; - } - }; + void writeCSV(std::ostream& stream) { + double scale = 1000000.0 / statNEvents; + stream << type << ","; + if (count != 0) stream << count; + stream << "," << name << ","; + if (gpu_time != -1.0) stream << std::format("{:.0f}", gpu_time * scale); + stream << ","; + if (cpu_time != -1.0) stream << std::format("{:.0f}", cpu_time * scale); + stream << ","; + if (cpu_time != -1.0 && total_time != -1.0) stream << std::format("{:.2f}", cpu_time / total_time); + stream << ","; + if (total_time != -1.0) stream << std::format("{:.0f}", total_time * scale); + stream << ","; + if (memSize != 0 && count != 0) stream << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); + else stream << ",,"; + stream << std::endl; + } +}; } int32_t GPUReconstructionCPU::RunChains() @@ -290,7 +319,7 @@ int32_t GPUReconstructionCPU::RunChains() PrintMemoryOverview(); } - mStatWallTime = (mTimerTotal.GetElapsedTime() * 1000000. / mStatNEvents); + mStatWallTime = mTimerTotal.GetElapsedTime(); std::string nEventReport; if (GetProcessingSettings().debugLevel >= 0 && mStatNEvents > 1) { nEventReport += " (avergage of " + std::to_string(mStatNEvents) + " runs)"; @@ -304,11 +333,12 @@ int32_t GPUReconstructionCPU::RunChains() if (!benchmarkCSV.is_open()) { GPUError("Could not open timing CSV file '%s' for writing", GetProcessingSettings().timingCSV.c_str()); } else if (mNEventsProcessed == 1) { - write_header(benchmarkCSV); + writeHeaderCSV(benchmarkCSV); } } if (GetProcessingSettings().debugLevel >= 1) { + writeHeaderMarkdown(std::cout); for (uint32_t i = 0; i < mTimers.size(); i++) { double time = 0; if (mTimers[i] == nullptr) { @@ -328,18 +358,19 @@ int32_t GPUReconstructionCPU::RunChains() int32_t stepNum = getRecoStepNum(mTimers[i]->step); kernelStepTimes[stepNum] += time; } - char bandwidth[256] = ""; Row task_row; task_row.type = 'K'; task_row.name = mTimers[i]->name.c_str(); - task_row.kernel_time = time; + task_row.gpu_time = time; task_row.count = mTimers[i]->count; + task_row.statNEvents = mStatNEvents; if (mTimers[i]->memSize && mStatNEvents && time != 0.) { task_row.memSize = mTimers[i]->memSize; - snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count); } - if (benchmarkCSV.is_open()) task_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth); + if (benchmarkCSV.is_open()) { + task_row.writeCSV(benchmarkCSV); + } + task_row.writeMarkdown(std::cout); if (GetProcessingSettings().resetTimers) { mTimers[i]->count = 0; mTimers[i]->memSize = 0; @@ -351,34 +382,40 @@ int32_t GPUReconstructionCPU::RunChains() if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) { Row reco_step_row; reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)"; - reco_step_row.kernel_time = kernelStepTimes[i]; + reco_step_row.gpu_time = kernelStepTimes[i]; reco_step_row.cpu_time = mTimersRecoSteps[i].timerCPU; reco_step_row.total_time = mTimersRecoSteps[i].timerTotal.GetElapsedTime(); - if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", "Tasks", - gpudatatypes::RECO_STEP_NAMES[i], kernelStepTimes[i] * 1000000 / mStatNEvents, "", mTimersRecoSteps[i].timerTotal.GetElapsedTime() * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU * 1000000 / mStatNEvents, mTimersRecoSteps[i].timerCPU / mTimersRecoSteps[i].timerTotal.GetElapsedTime()); + reco_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + reco_step_row.writeCSV(benchmarkCSV); + } + reco_step_row.writeMarkdown(std::cout); } if (mTimersRecoSteps[i].bytesToGPU) { Row reco_step_row; reco_step_row.type = 'D'; reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)"; - reco_step_row.kernel_time = mTimersRecoSteps[i].timerToGPU.GetElapsedTime(); + reco_step_row.gpu_time = mTimersRecoSteps[i].timerToGPU.GetElapsedTime(); reco_step_row.memSize = mTimersRecoSteps[i].bytesToGPU; reco_step_row.count = mTimersRecoSteps[i].countToGPU; - if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToGPU, "DMA to GPU", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1000000 / mStatNEvents, - mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].timerToGPU.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToGPU / mStatNEvents, mTimersRecoSteps[i].bytesToGPU / mTimersRecoSteps[i].countToGPU); + reco_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + reco_step_row.writeCSV(benchmarkCSV); + } + reco_step_row.writeMarkdown(std::cout); } if (mTimersRecoSteps[i].bytesToHost) { Row reco_step_row; reco_step_row.type = 'D'; reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)"; - reco_step_row.kernel_time = mTimersRecoSteps[i].timerToHost.GetElapsedTime(); + reco_step_row.gpu_time = mTimersRecoSteps[i].timerToHost.GetElapsedTime(); reco_step_row.memSize = mTimersRecoSteps[i].bytesToHost; reco_step_row.count = mTimersRecoSteps[i].countToHost; - if (benchmarkCSV.is_open()) reco_step_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", gpudatatypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents, - mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost); + reco_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + reco_step_row.writeCSV(benchmarkCSV); + } + reco_step_row.writeMarkdown(std::cout); } if (GetProcessingSettings().resetTimers) { mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0; @@ -394,24 +431,28 @@ int32_t GPUReconstructionCPU::RunChains() if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) { Row general_step_row; general_step_row.name = gpudatatypes::GENERAL_STEP_NAMES[i]; - general_step_row.kernel_time = mTimersGeneralSteps[i].GetElapsedTime(); - if (benchmarkCSV.is_open()) general_step_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: General Step : %50s Time: %'10.0f us\n", gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime() * 1000000 / mStatNEvents); + general_step_row.gpu_time = mTimersGeneralSteps[i].GetElapsedTime(); + general_step_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + general_step_row.writeCSV(benchmarkCSV); + } + general_step_row.writeMarkdown(std::cout); } } Row wall_row; wall_row.name = "Wall"; if (GetProcessingSettings().debugLevel >= 1) { - wall_row.kernel_time = kernelTotal; - mStatKernelTime = kernelTotal * 1000000 / mStatNEvents; - printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", mStatKernelTime, nEventReport.c_str()); + wall_row.gpu_time = kernelTotal; } wall_row.cpu_time = mStatCPUTime; - wall_row.total_time = mStatWallTime * mStatNEvents / 1000000; - if (benchmarkCSV.is_open()) wall_row.write(benchmarkCSV, mStatNEvents); - printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", mStatWallTime, mStatCPUTime * 1000000 / mStatNEvents, mStatCPUTime / mTimerTotal.GetElapsedTime(), nEventReport.c_str()); + wall_row.total_time = mStatWallTime; + wall_row.statNEvents = mStatNEvents; + if (benchmarkCSV.is_open()) { + wall_row.writeCSV(benchmarkCSV); + } + wall_row.writeMarkdown(std::cout); } else if (GetProcessingSettings().debugLevel >= 0) { - GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str()); + GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime * 1000000 / mStatNEvents, nEventReport.c_str()); } if (GetProcessingSettings().resetTimers) { mStatNEvents = 0; From 0a93b3af81d74b66263a8c6080816820aef6593a Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Tue, 17 Mar 2026 16:01:29 +0100 Subject: [PATCH 3/7] Fix overwriting of --PROCresetTimers --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- GPU/GPUTracking/Standalone/Benchmark/standalone.cxx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 06c0d8f344af1..3209b98547d75 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -335,7 +335,7 @@ AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole re AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1)) AddOption(showOutputStat, bool, false, "", 0, "Print some track output statistics") AddOption(runCompressionStatistics, bool, false, "compressionStat", 0, "Run statistics and verification for cluster compression") -AddOption(resetTimers, int8_t, 1, "", 0, "Reset timers every event") +AddOption(resetTimers, int8_t, 0, "", 0, "Reset timers every event") AddOption(deviceTimers, bool, true, "", 0, "Use device timers instead of host-based time measurement") AddOption(keepAllMemory, bool, false, "", 0, "Allocate all memory on both device and host, and do not reuse") AddOption(keepDisplayMemory, bool, false, "", 0, "Like keepAllMemory, but only for memory required for event display") diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index a2e74c45fcb86..ed35bf0b281bc 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -627,7 +627,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU if (configStandalone.runs > 1) { printf("Run %d (thread %d)\n", iteration + 1, threadId); } - recUse->SetResetTimers(iRun < configStandalone.runsInit); + recUse->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers); if (configStandalone.outputcontrolmem) { recUse->SetOutputControl(threadId ? outputmemoryPipeline.get() : outputmemory.get(), configStandalone.outputcontrolmem); } @@ -685,7 +685,7 @@ int32_t RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingU chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0; } chainTrackingAsync->mIOPtrs.clustersNative = nullptr; - recAsync->SetResetTimers(iRun < configStandalone.runsInit); + recAsync->SetResetTimers(iRun < configStandalone.runsInit || configStandalone.proc.resetTimers); tmpRetVal = recAsync->RunChains(); if (tmpRetVal == 0 || tmpRetVal == 2) { OutputStat(chainTrackingAsync, nullptr, nullptr); From d1456c5a673858c7801be39f34815ade008f5c8d Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Tue, 17 Mar 2026 16:21:06 +0100 Subject: [PATCH 4/7] Revert GPU Paramter and CMake Files --- .../Definitions/Parameters/GPUParameters.csv | 226 +++++++++--------- GPU/GPUTracking/Standalone/cmake/config.cmake | 6 +- dependencies/FindO2GPU.cmake | 10 +- 3 files changed, 119 insertions(+), 123 deletions(-) diff --git a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv index f240402acc19c..fc27de72ea2f1 100644 --- a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv +++ b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv @@ -1,113 +1,113 @@ -Architecture,default,default_cpu,MI100,MI210,RDNA3,VEGA,TAHITI,TESLA,FERMI,PASCAL,KEPLER,AMPERE,TURING,HOPPER -,,,,,,,,,,,,,, -CORE:,,,,,,,,,,,,,, -WARP_SIZE,32,,64,64,32,64,32,32,32,32,32,32,32,32 -THREAD_COUNT_DEFAULT,256,,256,256,256,256,,,,,,512,512,512 -,,,,,,,,,,,,,, -LB:,,,,,,,,,,,,,, -GPUTPCCreateTrackingData,256,,"[256, 7]","[256, 7]","[256, 7]","[192, 2]",,,,,,384,256,256 -GPUTPCTrackletConstructor,256,,"[768, 8]","[768, 8]","[768, 8]","[512, 10]","[256, 2]","[256, 1]","[256, 2]","[1024, 2]","[512, 4]","[256, 2]","[256, 2]","[256, 2]" -GPUTPCTrackletSelector,256,,"[384, 5]","[384, 5]","[384, 5]","[192, 10]","[256, 3]","[256, 1]","[256, 3]","[512, 4]","[256, 3]","[192, 3]","[192, 3]","[192, 3]" -GPUTPCNeighboursFinder,256,,"[192, 8]","[192, 8]","[192, 8]","[960, 8]",256,256,256,512,256,"[640, 1]","[640, 1]","[640, 1]" -GPUTPCNeighboursCleaner,256,,"[128, 5]","[128, 5]","[128, 5]","[384, 9]",256,256,256,256,256,512,512,512 -GPUTPCExtrapolationTracking,256,,"[256, 7]","[256, 7]","[256, 7]","[256, 2]",,,,,,"[128, 4]","[192, 2]","[192, 2]" -GPUTRDTrackerKernels_gpuVersion,512,,,,,,,,,,,,, -GPUTPCCreateOccupancyMap_fill,256,,,,,,,,,,,,, -GPUTPCCreateOccupancyMap_fold,256,,,,,,,,,,,,, -GPUTRDTrackerKernels_o2Version,512,,,,,,,,,,,,, -GPUTPCCompressionKernels_step0attached,256,,"[128, 1]","[128, 1]","[128, 1]","[64, 2]",,,,,,"[64, 2]",128,128 -GPUTPCCompressionKernels_step1unattached,256,,"[512, 2]","[512, 2]","[512, 2]","[512, 2]",,,,,,"[512, 3]","[512, 2]","[512, 2]" -GPUTPCDecompressionKernels_step0attached,256,,"[128, 2]","[128, 2]","[128, 2]","[128, 2]",,,,,,"[32, 1]","[32, 1]","[32, 1]" -GPUTPCDecompressionKernels_step1unattached,256,,"[64, 2]","[64, 2]","[64, 2]","[64, 2]",,,,,,"[32, 1]","[32, 1]","[32, 1]" -GPUTPCDecompressionUtilKernels_sortPerSectorRow,256,,,,,,,,,,,,, -GPUTPCDecompressionUtilKernels_countFilteredClusters,256,,,,,,,,,,,,, -GPUTPCDecompressionUtilKernels_storeFilteredClusters,256,,,,,,,,,,,,, -GPUTPCCFDecodeZS,"[128, 4]",,"[64, 4]","[64, 4]","[64, 4]","[64, 1]",,,,,,"[64, 10]","[64, 8]","[64, 8]" -GPUTPCCFDecodeZSLink,"""GPUCA_WARP_SIZE""",,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" -GPUTPCCFDecodeZSDenseLink,"""GPUCA_WARP_SIZE""",,"[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 14]",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" -GPUTPCCFGather,"[1024, 1]",,"[1024, 5]","[1024, 5]","[1024, 5]","[1024, 1]",,,,,,"[1024, 1]","[1024, 1]","[1024, 1]" -COMPRESSION_GATHER,1024,,1024,1024,1024,1024,,,,,,1024,1024,1024 -GPUTPCGMMergerTrackFit,256,,"[192, 2]","[192, 2]","[192, 2]","[64, 7]",,,,,,"[64, 4]","[32, 8]","[32, 8]" -GPUTPCGMMergerFollowLoopers,256,,"[256, 5]","[256, 5]","[256, 5]","[256, 4]",,,,,,"[64, 12]","[128, 4]","[128, 4]" -GPUTPCGMMergerSectorRefit,256,,"[64, 4]","[64, 4]","[64, 4]","[256, 2]",,,,,,"[32, 6]","[64, 5]","[64, 5]" -GPUTPCGMMergerUnpackResetIds,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerUnpackGlobal,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerResolve_step0,256,,512,512,512,256,,,,,,256,256,256 -GPUTPCGMMergerResolve_step1,256,,512,512,512,256,,,,,,256,256,256 -GPUTPCGMMergerResolve_step2,256,,512,512,512,256,,,,,,256,256,256 -GPUTPCGMMergerResolve_step3,256,,512,512,512,256,,,,,,256,256,256 -GPUTPCGMMergerResolve_step4,256,,512,512,512,256,,,,,,"[256, 4]","[256, 4]","[256, 4]" -GPUTPCGMMergerClearLinks,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerMergeWithinPrepare,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerMergeSectorsPrepare,256,,256,256,256,256,,,,,,"[256, 2]","[256, 2]","[256, 2]" -GPUTPCGMMergerMergeBorders_step0,256,,512,512,512,256,,,,,,192,192,192 -GPUTPCGMMergerMergeBorders_step2,256,,512,512,512,256,,,,,,"[64, 2]",256,256 -GPUTPCGMMergerMergeCE,256,,512,512,512,256,,,,,,256,256,256 -GPUTPCGMMergerLinkExtrapolatedTracks,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerCollect,256,,"[768, 1]","[768, 1]","[768, 1]","[1024, 1]",,,,,,"[256, 2]","[128, 2]","[128, 2]" -GPUTPCGMMergerSortTracksPrepare,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerPrepareForFit_step0,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerPrepareForFit_step1,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerPrepareForFit_step2,256,,256,256,256,256,,,,,,256,256,256 -GPUTPCGMMergerFinalize_step0,256,,,,,256,,,,,,,, -GPUTPCGMMergerFinalize_step1,256,,,,,256,,,,,,,, -GPUTPCGMMergerFinalize_step2,256,,,,,256,,,,,,,, -GPUTPCGMMergerMergeLoopers_step0,256,,,,,,,,,,,,, -GPUTPCGMMergerMergeLoopers_step1,256,,,,,,,,,,,,, -GPUTPCGMMergerMergeLoopers_step2,256,,,,,,,,,,,,, -GPUTPCGMO2Output_prepare,256,,,,,,,,,,,,, -GPUTPCGMO2Output_output,256,,,,,,,,,,,,, -GPUTPCStartHitsFinder,256,,"[1024, 2]","[1024, 2]","[1024, 2]","[1024, 7]",256,256,256,256,256,512,512,512 -GPUTPCStartHitsSorter,256,,"[1024, 5]","[1024, 5]","[1024, 5]","[512, 7]",256,256,256,256,256,"[512, 1]","[512, 1]","[512, 1]" -GPUTPCCFCheckPadBaseline,576,,"[576, 2]","[576, 2]","[576, 2]","[576, 2]",,,,,,"[576, 2]",, -GPUTPCCFChargeMapFiller_fillIndexMap,512,,512,512,512,512,,,,,,448,, -GPUTPCCFChargeMapFiller_fillFromDigits,512,,512,512,512,512,,,,,,448,, -GPUTPCCFChargeMapFiller_findFragmentStart,512,,512,512,512,512,,,,,,448,, -GPUTPCCFPeakFinder,512,,"[512, 9]","[512, 9]","[512, 9]","[512, 4]",,,,,,128,, -GPUTPCCFNoiseSuppression,512,,512,512,512,512,,,,,,448,, -GPUTPCCFDeconvolution,512,,"[512, 5]","[512, 5]","[512, 5]","[512, 5]",,,,,,384,, -GPUTPCCFClusterizer,512,,"[448, 3]","[448, 3]","[448, 3]","[512, 2]",,,,,,448,, -GPUTPCNNClusterizerKernels,512,,,,,,,,,,,,, -GPUTrackingRefitKernel_mode0asGPU,256,,,,,,,,,,,,, -GPUTrackingRefitKernel_mode1asTrackParCov,256,,,,,,,,,,,,, -GPUMemClean16,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,, -GPUitoa,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,, -GPUTPCCFNoiseSuppression_noiseSuppression,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,, -GPUTPCCFNoiseSuppression_updatePeaks,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_fillInputNNCPU,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_fillInputNNGPU,1024,,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_determineClass1Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_determineClass2Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_publishClass1Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_publishClass2Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCNNClusterizerKernels_publishDeconvolutionFlags,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,, -GPUTPCCFStreamCompaction_scanStart,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, -GPUTPCCFStreamCompaction_scanUp,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, -GPUTPCCFStreamCompaction_scanTop,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, -GPUTPCCFStreamCompaction_scanDown,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, -GPUTPCCFStreamCompaction_compactDigits,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,,,,, -GPUTPCCompressionGatherKernels_unbuffered,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered32,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered64,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, -GPUTPCCompressionGatherKernels_buffered128,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, -GPUTPCCompressionGatherKernels_multiBlock,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,,,,, -GPUTPCGMMergerFinalize_0,256,,256,256,256,,,,,,,256,256,256 -GPUTPCGMMergerFinalize_1,256,,256,256,256,,,,,,,256,256,256 -GPUTPCGMMergerFinalize_2,256,,256,256,256,,,,,,,256,256,256 -,,,,,,,,,,,,,, -PAR:,,,,,,,,,,,,,, -AMD_EUS_PER_CU,0,0,4,4,4,4,,,,,,,, -SORT_STARTHITS,1,0,,,,,,,,,,,, -NEIGHBOURS_FINDER_MAX_NNEIGHUP,6,0,10,10,10,4,,,,,,4,4,4 -NEIGHBOURS_FINDER_UNROLL_GLOBAL,4,0,4,4,4,2,,,,,,,, -NEIGHBOURS_FINDER_UNROLL_SHARED,1,0,0,0,0,0,,,,,,,, -TRACKLET_SELECTOR_HITS_REG_SIZE,12,0,9,9,9,27,,,,,,20,20,20 -ALTERNATE_BORDER_SORT,0,0,1,1,1,1,,,,,,1,1,1 -SORT_BEFORE_FIT,0,0,1,1,1,1,,,,,,1,1,1 -NO_ATOMIC_PRECHECK,0,0,1,1,1,1,,,,,,1,1,1 -DEDX_STORAGE_TYPE,"""float""","""float""","""uint16_t""","""uint16_t""","""uint16_t""","""uint16_t""",,,,,,"""uint16_t""","""uint16_t""","""uint16_t""" -MERGER_INTERPOLATION_ERROR_TYPE,"""float""","""float""","""half""","""half""","""half""","""half""",,,,,,"""half""","""half""","""half""" -COMP_GATHER_KERNEL,0,0,4,4,4,4,,,,,,4,4,4 -COMP_GATHER_MODE,2,0,3,3,3,3,,,,,,3,3,3 -CF_SCAN_WORKGROUP_SIZE,512,0,,,,,,,,,,,, +Architecture,default,default_cpu,MI100,VEGA,TAHITI,TESLA,FERMI,PASCAL,KEPLER,AMPERE,TURING +,,,,,,,,,,, +CORE:,,,,,,,,,,, +WARP_SIZE,32,,64,64,32,32,32,32,32,32,32 +THREAD_COUNT_DEFAULT,256,,256,256,,,,,,512,512 +,,,,,,,,,,, +LB:,,,,,,,,,,, +GPUTPCCreateTrackingData,256,,"[256, 7]","[192, 2]",,,,,,384,256 +GPUTPCTrackletConstructor,256,,"[768, 8]","[512, 10]","[256, 2]","[256, 1]","[256, 2]","[1024, 2]","[512, 4]","[256, 2]","[256, 2]" +GPUTPCTrackletSelector,256,,"[384, 5]","[192, 10]","[256, 3]","[256, 1]","[256, 3]","[512, 4]","[256, 3]","[192, 3]","[192, 3]" +GPUTPCNeighboursFinder,256,,"[192, 8]","[960, 8]",256,256,256,512,256,"[640, 1]","[640, 1]" +GPUTPCNeighboursCleaner,256,,"[128, 5]","[384, 9]",256,256,256,256,256,512,512 +GPUTPCExtrapolationTracking,256,,"[256, 7]","[256, 2]",,,,,,"[128, 4]","[192, 2]" +GPUTRDTrackerKernels_gpuVersion,512,,,,,,,,,, +GPUTPCCreateOccupancyMap_fill,256,,,,,,,,,, +GPUTPCCreateOccupancyMap_fold,256,,,,,,,,,, +GPUTRDTrackerKernels_o2Version,512,,,,,,,,,, +GPUTPCCompressionKernels_step0attached,256,,"[128, 1]","[64, 2]",,,,,,"[64, 2]",128 +GPUTPCCompressionKernels_step1unattached,256,,"[512, 2]","[512, 2]",,,,,,"[512, 3]","[512, 2]" +GPUTPCDecompressionKernels_step0attached,256,,"[128, 2]","[128, 2]",,,,,,"[32, 1]","[32, 1]" +GPUTPCDecompressionKernels_step1unattached,256,,"[64, 2]","[64, 2]",,,,,,"[32, 1]","[32, 1]" +GPUTPCDecompressionUtilKernels_sortPerSectorRow,256,,,,,,,,,, +GPUTPCDecompressionUtilKernels_countFilteredClusters,256,,,,,,,,,, +GPUTPCDecompressionUtilKernels_storeFilteredClusters,256,,,,,,,,,, +GPUTPCCFDecodeZS,"[128, 4]",,"[64, 4]","[64, 1]",,,,,,"[64, 10]","[64, 8]" +GPUTPCCFDecodeZSLink,"""GPUCA_WARP_SIZE""",,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" +GPUTPCCFDecodeZSDenseLink,"""GPUCA_WARP_SIZE""",,"[""GPUCA_WARP_SIZE"", 4]","[""GPUCA_WARP_SIZE"", 14]",,,,,,"""GPUCA_WARP_SIZE""","""GPUCA_WARP_SIZE""" +GPUTPCCFGather,"[1024, 1]",,"[1024, 5]","[1024, 1]",,,,,,"[1024, 1]","[1024, 1]" +COMPRESSION_GATHER,1024,,1024,1024,,,,,,1024,1024 +GPUTPCGMMergerTrackFit,256,,"[192, 2]","[64, 7]",,,,,,"[64, 4]","[32, 8]" +GPUTPCGMMergerFollowLoopers,256,,"[256, 5]","[256, 4]",,,,,,"[64, 12]","[128, 4]" +GPUTPCGMMergerSectorRefit,256,,"[64, 4]","[256, 2]",,,,,,"[32, 6]","[64, 5]" +GPUTPCGMMergerUnpackResetIds,256,,256,256,,,,,,256,256 +GPUTPCGMMergerUnpackGlobal,256,,256,256,,,,,,256,256 +GPUTPCGMMergerResolve_step0,256,,512,256,,,,,,256,256 +GPUTPCGMMergerResolve_step1,256,,512,256,,,,,,256,256 +GPUTPCGMMergerResolve_step2,256,,512,256,,,,,,256,256 +GPUTPCGMMergerResolve_step3,256,,512,256,,,,,,256,256 +GPUTPCGMMergerResolve_step4,256,,512,256,,,,,,"[256, 4]","[256, 4]" +GPUTPCGMMergerClearLinks,256,,256,256,,,,,,256,256 +GPUTPCGMMergerMergeWithinPrepare,256,,256,256,,,,,,256,256 +GPUTPCGMMergerMergeSectorsPrepare,256,,256,256,,,,,,"[256, 2]","[256, 2]" +GPUTPCGMMergerMergeBorders_step0,256,,512,256,,,,,,192,192 +GPUTPCGMMergerMergeBorders_step2,256,,512,256,,,,,,"[64, 2]",256 +GPUTPCGMMergerMergeCE,256,,512,256,,,,,,256,256 +GPUTPCGMMergerLinkExtrapolatedTracks,256,,256,256,,,,,,256,256 +GPUTPCGMMergerCollect,256,,"[768, 1]","[1024, 1]",,,,,,"[256, 2]","[128, 2]" +GPUTPCGMMergerSortTracksPrepare,256,,256,256,,,,,,256,256 +GPUTPCGMMergerPrepareForFit_step0,256,,256,256,,,,,,256,256 +GPUTPCGMMergerPrepareForFit_step1,256,,256,256,,,,,,256,256 +GPUTPCGMMergerPrepareForFit_step2,256,,256,256,,,,,,256,256 +GPUTPCGMMergerFinalize_step0,256,,,256,,,,,,, +GPUTPCGMMergerFinalize_step1,256,,,256,,,,,,, +GPUTPCGMMergerFinalize_step2,256,,,256,,,,,,, +GPUTPCGMMergerMergeLoopers_step0,256,,,,,,,,,, +GPUTPCGMMergerMergeLoopers_step1,256,,,,,,,,,, +GPUTPCGMMergerMergeLoopers_step2,256,,,,,,,,,, +GPUTPCGMO2Output_prepare,256,,,,,,,,,, +GPUTPCGMO2Output_output,256,,,,,,,,,, +GPUTPCStartHitsFinder,256,,"[1024, 2]","[1024, 7]",256,256,256,256,256,512,512 +GPUTPCStartHitsSorter,256,,"[1024, 5]","[512, 7]",256,256,256,256,256,"[512, 1]","[512, 1]" +GPUTPCCFCheckPadBaseline,576,,"[576, 2]","[576, 2]",,,,,,"[576, 2]", +GPUTPCCFChargeMapFiller_fillIndexMap,512,,512,512,,,,,,448, +GPUTPCCFChargeMapFiller_fillFromDigits,512,,512,512,,,,,,448, +GPUTPCCFChargeMapFiller_findFragmentStart,512,,512,512,,,,,,448, +GPUTPCCFPeakFinder,512,,"[512, 9]","[512, 4]",,,,,,128, +GPUTPCCFNoiseSuppression,512,,512,512,,,,,,448, +GPUTPCCFDeconvolution,512,,"[512, 5]","[512, 5]",,,,,,384, +GPUTPCCFClusterizer,512,,"[448, 3]","[512, 2]",,,,,,448, +GPUTPCNNClusterizerKernels,512,,,,,,,,,, +GPUTrackingRefitKernel_mode0asGPU,256,,,,,,,,,, +GPUTrackingRefitKernel_mode1asTrackParCov,256,,,,,,,,,, +GPUMemClean16,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,, +GPUitoa,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,, +GPUTPCCFNoiseSuppression_noiseSuppression,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,, +GPUTPCCFNoiseSuppression_updatePeaks,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,, +GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_fillInputNNCPU,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_fillInputNNGPU,1024,,,,,,,,,, +GPUTPCNNClusterizerKernels_determineClass1Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_determineClass2Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_publishClass1Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_publishClass2Regression,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCNNClusterizerKernels_publishDeconvolutionFlags,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,, +GPUTPCCFStreamCompaction_scanStart,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, +GPUTPCCFStreamCompaction_scanUp,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, +GPUTPCCFStreamCompaction_scanTop,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, +GPUTPCCFStreamCompaction_scanDown,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, +GPUTPCCFStreamCompaction_compactDigits,"""GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE""",,,,,,,,,, +GPUTPCCompressionGatherKernels_unbuffered,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered32,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered64,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, +GPUTPCCompressionGatherKernels_buffered128,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, +GPUTPCCompressionGatherKernels_multiBlock,"""GPUCA_LB_COMPRESSION_GATHER""",,,,,,,,,, +GPUTPCGMMergerFinalize_0,256,,256,,,,,,,256,256 +GPUTPCGMMergerFinalize_1,256,,256,,,,,,,256,256 +GPUTPCGMMergerFinalize_2,256,,256,,,,,,,256,256 +,,,,,,,,,,, +PAR:,,,,,,,,,,, +AMD_EUS_PER_CU,0,0,4,4,,,,,,, +SORT_STARTHITS,1,0,,,,,,,,, +NEIGHBOURS_FINDER_MAX_NNEIGHUP,6,0,10,4,,,,,,4,4 +NEIGHBOURS_FINDER_UNROLL_GLOBAL,4,0,4,2,,,,,,, +NEIGHBOURS_FINDER_UNROLL_SHARED,1,0,0,0,,,,,,, +TRACKLET_SELECTOR_HITS_REG_SIZE,12,0,9,27,,,,,,20,20 +ALTERNATE_BORDER_SORT,0,0,1,1,,,,,,1,1 +SORT_BEFORE_FIT,0,0,1,1,,,,,,1,1 +NO_ATOMIC_PRECHECK,0,0,1,1,,,,,,1,1 +DEDX_STORAGE_TYPE,"""float""","""float""","""uint16_t""","""uint16_t""",,,,,,"""uint16_t""","""uint16_t""" +MERGER_INTERPOLATION_ERROR_TYPE,"""float""","""float""","""half""","""half""",,,,,,"""half""","""half""" +COMP_GATHER_KERNEL,0,0,4,4,,,,,,4,4 +COMP_GATHER_MODE,2,0,3,3,,,,,,3,3 +CF_SCAN_WORKGROUP_SIZE,512,0,,,,,,,,, diff --git a/GPU/GPUTracking/Standalone/cmake/config.cmake b/GPU/GPUTracking/Standalone/cmake/config.cmake index abdfc52c460e4..9355311db617c 100644 --- a/GPU/GPUTracking/Standalone/cmake/config.cmake +++ b/GPU/GPUTracking/Standalone/cmake/config.cmake @@ -19,7 +19,7 @@ set(GPUCA_CONFIG_VC 1) set(GPUCA_CONFIG_FMT 1) set(GPUCA_CONFIG_ROOT 1) set(GPUCA_CONFIG_ONNX 0) -set(GPUCA_BUILD_EVENT_DISPLAY 0) +set(GPUCA_BUILD_EVENT_DISPLAY 1) set(GPUCA_BUILD_EVENT_DISPLAY_FREETYPE 1) set(GPUCA_BUILD_EVENT_DISPLAY_VULKAN 1) set(GPUCA_BUILD_EVENT_DISPLAY_WAYLAND 1) @@ -32,8 +32,8 @@ set(GPUCA_BUILD_DEBUG_HOSTONLY 0) set(GPUCA_DETERMINISTIC_MODE 0) # OFF / NO_FAST_MATH / OPTO2 / GPU / WHOLEO2 #set(GPUCA_CUDA_GCCBIN c++-14) #set(GPUCA_OPENCL_CLANGBIN clang-20) -set(HIP_AMDGPUTARGET "gfx1100") # "gfx906;gfx908;gfx90a" -#set(CUDA_COMPUTETARGET "default") # 86 89 +set(HIP_AMDGPUTARGET "default") # "gfx906;gfx908;gfx90a" +set(CUDA_COMPUTETARGET "default") # 86 89 #set(GPUCA_CUDA_COMPILE_MODE perkernel) # onefile / perkernel / rtc #set(GPUCA_HIP_COMPILE_MODE perkernel) #set(GPUCA_RTC_NO_COMPILED_KERNELS 1) diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index 0aeae438b7187..3e8f012fea4b5 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -52,9 +52,7 @@ function(detect_gpu_arch backend) # Detect GPU architecture, optionally filterri set(CUDA_FIRST_TARGET 86) message(STATUS "CUDA_COMPUTETARGET not set, defaulting CUDA optimization for architecture ${CUDA_FIRST_TARGET}") endif() - if(CUDA_FIRST_TARGET GREATER_EQUAL 89) - set(CUDA_TARGET HOPPER) - elseif(CUDA_FIRST_TARGET GREATER_EQUAL 86) + if(CUDA_FIRST_TARGET GREATER_EQUAL 86) set(CUDA_TARGET AMPERE) elseif(CUDA_FIRST_TARGET GREATER_EQUAL 75) set(CUDA_TARGET TURING) @@ -77,9 +75,7 @@ function(detect_gpu_arch backend) # Detect GPU architecture, optionally filterri endif() string(TOLOWER "${HIP_FIRST_TARGET}" HIP_FIRST_TARGET) string(REGEX MATCH "....$" HIP_FIRST_TARGET_PADDED "0000${HIP_FIRST_TARGET}") - if(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1100") - set(HIP_TARGET RDNA3) - elseif(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1000") + if(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "1000") set(HIP_TARGET RDNA) elseif(HIP_FIRST_TARGET_PADDED STRGREATER_EQUAL "090a") set(HIP_TARGET MI210) @@ -404,4 +400,4 @@ endif() set(O2GPU_FOUND TRUE) if (NOT GPUCA_FINDO2GPU_CHECK_ONLY) include("${CMAKE_CURRENT_LIST_DIR}/../GPU/GPUTracking/cmake/kernel_helpers.cmake") -endif() \ No newline at end of file +endif() From 408e62473290956202f2f6d350015eb337c98cbd Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Tue, 17 Mar 2026 16:58:35 +0100 Subject: [PATCH 5/7] clang-format --- GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 81 ++++++++++++------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 6db667ea0886f..dc96c2a238d1a 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -39,7 +39,6 @@ #include #include - #ifndef _WIN32 #include #endif @@ -216,14 +215,17 @@ int32_t GPUReconstructionCPU::ExitDevice() return 0; } -namespace { -void writeHeaderMarkdown(std::ostream& stream) { - stream << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; - stream << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; +namespace +{ +void writeHeaderMarkdown(std::ostream& stream) +{ + stream << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; + stream << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; } -void writeHeaderCSV(std::ostream& stream) { - stream << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; +void writeHeaderCSV(std::ostream& stream) +{ + stream << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; } struct Row { @@ -236,44 +238,65 @@ struct Row { uint32_t memSize = 0; uint32_t statNEvents; - void writeMarkdown(std::ostream& stream) { + void writeMarkdown(std::ostream& stream) + { double scale = 1000000.0 / statNEvents; stream << "| " << type << " | "; - if (count != 0) stream << std::format("{:6} |", count); - else stream << " |"; + if (count != 0) + stream << std::format("{:6} |", count); + else + stream << " |"; stream << std::format(" {:42}|", name); - if (gpu_time != -1.0) stream << std::format("{:10.0f} |", gpu_time * scale); - else stream << " |"; - if (cpu_time != -1.0) stream << std::format("{:10.0f} |", cpu_time * scale); - else stream << " |"; - if (cpu_time != -1.0 && total_time != -1.0) stream << std::format("{:8.2f} |", cpu_time / total_time); - else stream << " |"; - if (total_time != -1.0) stream << std::format("{:10.0f} |", total_time * scale); - else stream << " |"; - if (memSize != 0 && count != 0) stream << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); - else stream << " | | |"; + if (gpu_time != -1.0) + stream << std::format("{:10.0f} |", gpu_time * scale); + else + stream << " |"; + if (cpu_time != -1.0) + stream << std::format("{:10.0f} |", cpu_time * scale); + else + stream << " |"; + if (cpu_time != -1.0 && total_time != -1.0) + stream << std::format("{:8.2f} |", cpu_time / total_time); + else + stream << " |"; + if (total_time != -1.0) + stream << std::format("{:10.0f} |", total_time * scale); + else + stream << " |"; + if (memSize != 0 && count != 0) + stream << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); + else + stream << " | | |"; stream << std::endl; } - void writeCSV(std::ostream& stream) { + void writeCSV(std::ostream& stream) + { double scale = 1000000.0 / statNEvents; stream << type << ","; - if (count != 0) stream << count; + if (count != 0) + stream << count; stream << "," << name << ","; - if (gpu_time != -1.0) stream << std::format("{:.0f}", gpu_time * scale); + if (gpu_time != -1.0) + stream << std::format("{:.0f}", gpu_time * scale); stream << ","; - if (cpu_time != -1.0) stream << std::format("{:.0f}", cpu_time * scale); + if (cpu_time != -1.0) + stream << std::format("{:.0f}", cpu_time * scale); stream << ","; - if (cpu_time != -1.0 && total_time != -1.0) stream << std::format("{:.2f}", cpu_time / total_time); + if (cpu_time != -1.0 && total_time != -1.0) + stream << std::format("{:.2f}", cpu_time / total_time); stream << ","; - if (total_time != -1.0) stream << std::format("{:.0f}", total_time * scale); + if (total_time != -1.0) + stream << std::format("{:.0f}", total_time * scale); stream << ","; - if (memSize != 0 && count != 0) stream << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); - else stream << ",,"; + if (memSize != 0 && count != 0) + stream << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / statNEvents, memSize / statNEvents / count); + else + stream << ",,"; stream << std::endl; } }; -} +} // namespace int32_t GPUReconstructionCPU::RunChains() { From 5c23211f87f508284dc39e3d7dbda8ec9f03ef50 Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Fri, 20 Mar 2026 20:05:37 +0100 Subject: [PATCH 6/7] Move standlone debug output to GPUReconstructionCPU.cxx --- GPU/GPUTracking/Base/GPUReconstruction.h | 11 +++ GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 90 +++---------------- .../Base/GPUReconstructionDebug.cxx | 84 +++++++++++++++++ GPU/GPUTracking/Definitions/GPUSettingsList.h | 3 +- 4 files changed, 110 insertions(+), 78 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 9a337c02ad26d..7be1413650119 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -423,6 +424,16 @@ class GPUReconstruction void debugInit(); void debugExit(); + struct debugWriter { + debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents); + void header(); + void row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport = ""); + private: + std::ofstream streamCSV; + bool mMarkdown; + uint32_t mStatNEvents; + }; + static GPUReconstruction* GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg); }; diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index dc96c2a238d1a..edfdad34ed54b 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -215,6 +215,7 @@ int32_t GPUReconstructionCPU::ExitDevice() return 0; } +<<<<<<< Updated upstream namespace { void writeHeaderMarkdown(std::ostream& stream) @@ -298,6 +299,8 @@ struct Row { }; } // namespace +======= +>>>>>>> Stashed changes int32_t GPUReconstructionCPU::RunChains() { mMemoryScalers->temporaryFactor = 1.; @@ -342,7 +345,6 @@ int32_t GPUReconstructionCPU::RunChains() PrintMemoryOverview(); } - mStatWallTime = mTimerTotal.GetElapsedTime(); std::string nEventReport; if (GetProcessingSettings().debugLevel >= 0 && mStatNEvents > 1) { nEventReport += " (avergage of " + std::to_string(mStatNEvents) + " runs)"; @@ -350,18 +352,10 @@ int32_t GPUReconstructionCPU::RunChains() double kernelTotal = 0; std::vector kernelStepTimes(gpudatatypes::N_RECO_STEPS, 0.); - std::ofstream benchmarkCSV; - if (!GetProcessingSettings().timingCSV.empty()) { - benchmarkCSV.open(GetProcessingSettings().timingCSV, std::ios::out | std::ios::app); - if (!benchmarkCSV.is_open()) { - GPUError("Could not open timing CSV file '%s' for writing", GetProcessingSettings().timingCSV.c_str()); - } else if (mNEventsProcessed == 1) { - writeHeaderCSV(benchmarkCSV); - } - } + debugWriter writer(GetProcessingSettings().debugCSV, GetProcessingSettings().debugMarkdown, mStatNEvents); if (GetProcessingSettings().debugLevel >= 1) { - writeHeaderMarkdown(std::cout); + writer.header(); for (uint32_t i = 0; i < mTimers.size(); i++) { double time = 0; if (mTimers[i] == nullptr) { @@ -381,19 +375,7 @@ int32_t GPUReconstructionCPU::RunChains() int32_t stepNum = getRecoStepNum(mTimers[i]->step); kernelStepTimes[stepNum] += time; } - Row task_row; - task_row.type = 'K'; - task_row.name = mTimers[i]->name.c_str(); - task_row.gpu_time = time; - task_row.count = mTimers[i]->count; - task_row.statNEvents = mStatNEvents; - if (mTimers[i]->memSize && mStatNEvents && time != 0.) { - task_row.memSize = mTimers[i]->memSize; - } - if (benchmarkCSV.is_open()) { - task_row.writeCSV(benchmarkCSV); - } - task_row.writeMarkdown(std::cout); + writer.row('K', mTimers[i]->count, mTimers[i]->name.c_str(), time, -1.0, -1.0, mTimers[i]->memSize); if (GetProcessingSettings().resetTimers) { mTimers[i]->count = 0; mTimers[i]->memSize = 0; @@ -403,42 +385,13 @@ int32_t GPUReconstructionCPU::RunChains() if (GetProcessingSettings().recoTaskTiming) { for (int32_t i = 0; i < gpudatatypes::N_RECO_STEPS; i++) { if (kernelStepTimes[i] != 0. || mTimersRecoSteps[i].timerTotal.GetElapsedTime() != 0.) { - Row reco_step_row; - reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)"; - reco_step_row.gpu_time = kernelStepTimes[i]; - reco_step_row.cpu_time = mTimersRecoSteps[i].timerCPU; - reco_step_row.total_time = mTimersRecoSteps[i].timerTotal.GetElapsedTime(); - reco_step_row.statNEvents = mStatNEvents; - if (benchmarkCSV.is_open()) { - reco_step_row.writeCSV(benchmarkCSV); - } - reco_step_row.writeMarkdown(std::cout); + writer.row(' ', 0, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (Tasks)", kernelStepTimes[i], mTimersRecoSteps[i].timerCPU, mTimersRecoSteps[i].timerTotal.GetElapsedTime(), 0); } if (mTimersRecoSteps[i].bytesToGPU) { - Row reco_step_row; - reco_step_row.type = 'D'; - reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)"; - reco_step_row.gpu_time = mTimersRecoSteps[i].timerToGPU.GetElapsedTime(); - reco_step_row.memSize = mTimersRecoSteps[i].bytesToGPU; - reco_step_row.count = mTimersRecoSteps[i].countToGPU; - reco_step_row.statNEvents = mStatNEvents; - if (benchmarkCSV.is_open()) { - reco_step_row.writeCSV(benchmarkCSV); - } - reco_step_row.writeMarkdown(std::cout); + writer.row('D', mTimersRecoSteps[i].countToGPU, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to GPU)", mTimersRecoSteps[i].timerToGPU.GetElapsedTime(), -1.0, -1.0, mTimersRecoSteps[i].bytesToGPU); } if (mTimersRecoSteps[i].bytesToHost) { - Row reco_step_row; - reco_step_row.type = 'D'; - reco_step_row.name = std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)"; - reco_step_row.gpu_time = mTimersRecoSteps[i].timerToHost.GetElapsedTime(); - reco_step_row.memSize = mTimersRecoSteps[i].bytesToHost; - reco_step_row.count = mTimersRecoSteps[i].countToHost; - reco_step_row.statNEvents = mStatNEvents; - if (benchmarkCSV.is_open()) { - reco_step_row.writeCSV(benchmarkCSV); - } - reco_step_row.writeMarkdown(std::cout); + writer.row('D', mTimersRecoSteps[i].countToHost, std::string(gpudatatypes::RECO_STEP_NAMES[i]) + " (DMA to Host)", mTimersRecoSteps[i].timerToHost.GetElapsedTime(), -1.0, -1.0, mTimersRecoSteps[i].bytesToHost); } if (GetProcessingSettings().resetTimers) { mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0; @@ -452,30 +405,13 @@ int32_t GPUReconstructionCPU::RunChains() } for (int32_t i = 0; i < gpudatatypes::N_GENERAL_STEPS; i++) { if (mTimersGeneralSteps[i].GetElapsedTime() != 0.) { - Row general_step_row; - general_step_row.name = gpudatatypes::GENERAL_STEP_NAMES[i]; - general_step_row.gpu_time = mTimersGeneralSteps[i].GetElapsedTime(); - general_step_row.statNEvents = mStatNEvents; - if (benchmarkCSV.is_open()) { - general_step_row.writeCSV(benchmarkCSV); - } - general_step_row.writeMarkdown(std::cout); + writer.row(' ', 0, gpudatatypes::GENERAL_STEP_NAMES[i], mTimersGeneralSteps[i].GetElapsedTime(), -1.0, -1.0, 0); } } - Row wall_row; - wall_row.name = "Wall"; - if (GetProcessingSettings().debugLevel >= 1) { - wall_row.gpu_time = kernelTotal; - } - wall_row.cpu_time = mStatCPUTime; - wall_row.total_time = mStatWallTime; - wall_row.statNEvents = mStatNEvents; - if (benchmarkCSV.is_open()) { - wall_row.writeCSV(benchmarkCSV); - } - wall_row.writeMarkdown(std::cout); + double gpu_time = GetProcessingSettings().debugLevel >= 1 ? kernelTotal : -1.0; + writer.row(' ', 0, "Wall", gpu_time, mStatCPUTime, mTimerTotal.GetElapsedTime(), 0, nEventReport); } else if (GetProcessingSettings().debugLevel >= 0) { - GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime * 1000000 / mStatNEvents, nEventReport.c_str()); + GPUInfo("Total Wall Time: %10.0f us%s", mTimerTotal.GetElapsedTime() * 1000000 / mStatNEvents, nEventReport.c_str()); } if (GetProcessingSettings().resetTimers) { mStatNEvents = 0; diff --git a/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx b/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx index c1c31eedde1b2..abc65d333ea09 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx @@ -23,6 +23,8 @@ #include #include #include +#include +#include using namespace o2::gpu; @@ -186,3 +188,85 @@ bool GPUReconstruction::triggerDebugDump() } return false; } + +GPUReconstruction::debugWriter::debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents) : mMarkdown{markdown}, mStatNEvents{statNEvents} { + if (!filenameCSV.empty()) { + streamCSV.open(filenameCSV, std::ios::out | std::ios::app); + } +} + +void GPUReconstruction::debugWriter::header() { + if (streamCSV.is_open() && !streamCSV.tellp()) { + streamCSV << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; + } + + if (mMarkdown) { + std::cout << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; + std::cout << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; + } +} + +void GPUReconstruction::debugWriter::row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport) { + double scale = 1000000.0 / mStatNEvents; + + if (streamCSV.is_open()) { + streamCSV << type << ","; + if (count != 0) streamCSV << count; + streamCSV << "," << name << ","; + if (gpu_time != -1.0) streamCSV << std::format("{:.0f}", gpu_time * scale); + streamCSV << ","; + if (cpu_time != -1.0) streamCSV << std::format("{:.0f}", cpu_time * scale); + streamCSV << ","; + if (cpu_time != -1.0 && total_time != -1.0) streamCSV << std::format("{:.2f}", cpu_time / total_time); + streamCSV << ","; + if (total_time != -1.0) streamCSV << std::format("{:.0f}", total_time * scale); + streamCSV << ","; + if (memSize != 0 && count != 0) streamCSV << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); + else streamCSV << ",,"; + streamCSV << std::endl; + } + + if (mMarkdown) { + std::cout << "| " << type << " | "; + if (count != 0) std::cout << std::format("{:6} |", count); + else std::cout << " |"; + std::cout << std::format(" {:42}|", name); + if (gpu_time != -1.0) std::cout << std::format("{:10.0f} |", gpu_time * scale); + else std::cout << " |"; + if (cpu_time != -1.0) std::cout << std::format("{:10.0f} |", cpu_time * scale); + else std::cout << " |"; + if (cpu_time != -1.0 && total_time != -1.0) std::cout << std::format("{:8.2f} |", cpu_time / total_time); + else std::cout << " |"; + if (total_time != -1.0) std::cout << std::format("{:10.0f} |", total_time * scale); + else std::cout << " |"; + if (memSize != 0 && count != 0) std::cout << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); + else std::cout << " | | |"; + std::cout << std::endl; + } else { + if (name.substr(0, 3) == "GPU") { + char bandwidth[256] = ""; + if (memSize && mStatNEvents && gpu_time != 0.0) { + snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); + } + printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type, count, name.c_str(), gpu_time * scale, bandwidth); + } else if (name.substr(0, 3) == "TPC") { + std::size_t n = name.find('('); + std::string basename = name.substr(0, n - 1); + std::string postfix = name.substr(n + 1, name.size() - n - 2); + if (total_time != -1.0) { + printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", postfix.c_str(), + basename.c_str(), gpu_time * scale, "", total_time * scale, cpu_time * scale, cpu_time / total_time); + } else { + printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", count, postfix.c_str(), basename.c_str(), gpu_time * scale, + memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); + } + } else if (name == "Prepare") { + printf("Execution Time: General Step : %50s Time: %'10.0f us\n", name.c_str(), gpu_time * scale); + } else if (name == "Wall") { + if (gpu_time != -1.0) { + printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", gpu_time * scale, nEventReport.c_str()); + } + printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", total_time * scale, cpu_time * scale, cpu_time / total_time, nEventReport.c_str()); + } + } +} diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 3209b98547d75..f83dc7471fed0 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -329,7 +329,8 @@ AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)") AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file") AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6") -AddOption(timingCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.") +AddOption(debugCSV, std::string, "", "", 0, "CSV filename to append the benchmark results. Verbosity determined by parameter --debug.") +AddOption(debugMarkdown, bool, false, "", 0, "Print the results of standlaone benchmarks in markdown format") AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures") AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks") AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6 or deterministic compile flag set", def(1)) From 38568a60220b58ffacd60e0373f421c0f8da1973 Mon Sep 17 00:00:00 2001 From: Oliver Rietmann Date: Fri, 20 Mar 2026 20:17:57 +0100 Subject: [PATCH 7/7] clang-format --- GPU/GPUTracking/Base/GPUReconstruction.h | 1 + .../Base/GPUReconstructionDebug.cxx | 150 ++++++++++-------- 2 files changed, 87 insertions(+), 64 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 7be1413650119..33dd5b9e6bb89 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -428,6 +428,7 @@ class GPUReconstruction debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents); void header(); void row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport = ""); + private: std::ofstream streamCSV; bool mMarkdown; diff --git a/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx b/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx index abc65d333ea09..3ae30dfe50f91 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionDebug.cxx @@ -189,84 +189,106 @@ bool GPUReconstruction::triggerDebugDump() return false; } -GPUReconstruction::debugWriter::debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents) : mMarkdown{markdown}, mStatNEvents{statNEvents} { +GPUReconstruction::debugWriter::debugWriter(std::string filenameCSV, bool markdown, uint32_t statNEvents) : mMarkdown{markdown}, mStatNEvents{statNEvents} +{ if (!filenameCSV.empty()) { streamCSV.open(filenameCSV, std::ios::out | std::ios::app); } } -void GPUReconstruction::debugWriter::header() { +void GPUReconstruction::debugWriter::header() +{ if (streamCSV.is_open() && !streamCSV.tellp()) { - streamCSV << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; + streamCSV << "type,count,name,gpu (us),cpu (us),cpu/total,total (us),GB/s,bytes,bytes/call\n"; } if (mMarkdown) { - std::cout << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; - std::cout << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; + std::cout << "| | count | name | gpu (us) | cpu (us) | cpu/tot | tot (us) | GB/s | bytes | bytes/call |\n"; + std::cout << "|---|--------|-------------------------------------------|-----------|-----------|---------|-----------|-----------|---------------|---------------|\n"; } } -void GPUReconstruction::debugWriter::row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport) { - double scale = 1000000.0 / mStatNEvents; +void GPUReconstruction::debugWriter::row(char type, uint32_t count, std::string name, double gpu_time, double cpu_time, double total_time, std::size_t memSize, std::string nEventReport) +{ + double scale = 1000000.0 / mStatNEvents; - if (streamCSV.is_open()) { - streamCSV << type << ","; - if (count != 0) streamCSV << count; - streamCSV << "," << name << ","; - if (gpu_time != -1.0) streamCSV << std::format("{:.0f}", gpu_time * scale); - streamCSV << ","; - if (cpu_time != -1.0) streamCSV << std::format("{:.0f}", cpu_time * scale); - streamCSV << ","; - if (cpu_time != -1.0 && total_time != -1.0) streamCSV << std::format("{:.2f}", cpu_time / total_time); - streamCSV << ","; - if (total_time != -1.0) streamCSV << std::format("{:.0f}", total_time * scale); - streamCSV << ","; - if (memSize != 0 && count != 0) streamCSV << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); - else streamCSV << ",,"; - streamCSV << std::endl; - } + if (streamCSV.is_open()) { + streamCSV << type << ","; + if (count != 0) + streamCSV << count; + streamCSV << "," << name << ","; + if (gpu_time != -1.0) + streamCSV << std::format("{:.0f}", gpu_time * scale); + streamCSV << ","; + if (cpu_time != -1.0) + streamCSV << std::format("{:.0f}", cpu_time * scale); + streamCSV << ","; + if (cpu_time != -1.0 && total_time != -1.0) + streamCSV << std::format("{:.2f}", cpu_time / total_time); + streamCSV << ","; + if (total_time != -1.0) + streamCSV << std::format("{:.0f}", total_time * scale); + streamCSV << ","; + if (memSize != 0 && count != 0) + streamCSV << std::format("{:.3f},{},{}", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); + else + streamCSV << ",,"; + streamCSV << std::endl; + } - if (mMarkdown) { - std::cout << "| " << type << " | "; - if (count != 0) std::cout << std::format("{:6} |", count); - else std::cout << " |"; - std::cout << std::format(" {:42}|", name); - if (gpu_time != -1.0) std::cout << std::format("{:10.0f} |", gpu_time * scale); - else std::cout << " |"; - if (cpu_time != -1.0) std::cout << std::format("{:10.0f} |", cpu_time * scale); - else std::cout << " |"; - if (cpu_time != -1.0 && total_time != -1.0) std::cout << std::format("{:8.2f} |", cpu_time / total_time); - else std::cout << " |"; - if (total_time != -1.0) std::cout << std::format("{:10.0f} |", total_time * scale); - else std::cout << " |"; - if (memSize != 0 && count != 0) std::cout << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); - else std::cout << " | | |"; - std::cout << std::endl; - } else { - if (name.substr(0, 3) == "GPU") { - char bandwidth[256] = ""; - if (memSize && mStatNEvents && gpu_time != 0.0) { - snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); - } - printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type, count, name.c_str(), gpu_time * scale, bandwidth); - } else if (name.substr(0, 3) == "TPC") { - std::size_t n = name.find('('); - std::string basename = name.substr(0, n - 1); - std::string postfix = name.substr(n + 1, name.size() - n - 2); - if (total_time != -1.0) { - printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", postfix.c_str(), - basename.c_str(), gpu_time * scale, "", total_time * scale, cpu_time * scale, cpu_time / total_time); - } else { - printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", count, postfix.c_str(), basename.c_str(), gpu_time * scale, - memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); - } - } else if (name == "Prepare") { - printf("Execution Time: General Step : %50s Time: %'10.0f us\n", name.c_str(), gpu_time * scale); - } else if (name == "Wall") { - if (gpu_time != -1.0) { - printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", gpu_time * scale, nEventReport.c_str()); - } - printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", total_time * scale, cpu_time * scale, cpu_time / total_time, nEventReport.c_str()); + if (mMarkdown) { + std::cout << "| " << type << " | "; + if (count != 0) + std::cout << std::format("{:6} |", count); + else + std::cout << " |"; + std::cout << std::format(" {:42}|", name); + if (gpu_time != -1.0) + std::cout << std::format("{:10.0f} |", gpu_time * scale); + else + std::cout << " |"; + if (cpu_time != -1.0) + std::cout << std::format("{:10.0f} |", cpu_time * scale); + else + std::cout << " |"; + if (cpu_time != -1.0 && total_time != -1.0) + std::cout << std::format("{:8.2f} |", cpu_time / total_time); + else + std::cout << " |"; + if (total_time != -1.0) + std::cout << std::format("{:10.0f} |", total_time * scale); + else + std::cout << " |"; + if (memSize != 0 && count != 0) + std::cout << std::format("{:10.3f} |{:14} |{:14} |", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); + else + std::cout << " | | |"; + std::cout << std::endl; + } else { + if (name.substr(0, 3) == "GPU") { + char bandwidth[256] = ""; + if (memSize && mStatNEvents && gpu_time != 0.0) { + snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); } + printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type, count, name.c_str(), gpu_time * scale, bandwidth); + } else if (name.substr(0, 3) == "TPC") { + std::size_t n = name.find('('); + std::string basename = name.substr(0, n - 1); + std::string postfix = name.substr(n + 1, name.size() - n - 2); + if (total_time != -1.0) { + printf("Execution Time: Step : %11s %38s Time: %'10.0f us %64s ( Total Time : %'14.0f us, CPU Time : %'14.0f us, %'7.2fx )\n", postfix.c_str(), + basename.c_str(), gpu_time * scale, "", total_time * scale, cpu_time * scale, cpu_time / total_time); + } else { + printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", count, postfix.c_str(), basename.c_str(), gpu_time * scale, + memSize / gpu_time * 1e-9, memSize / mStatNEvents, memSize / mStatNEvents / count); + } + } else if (name == "Prepare") { + printf("Execution Time: General Step : %50s Time: %'10.0f us\n", name.c_str(), gpu_time * scale); + } else if (name == "Wall") { + if (gpu_time != -1.0) { + printf("Execution Time: Total : %50s Time: %'10.0f us%s\n", "Total Kernel", gpu_time * scale, nEventReport.c_str()); + } + printf("Execution Time: Total : %50s Time: %'10.0f us ( CPU Time : %'10.0f us, %7.2fx ) %s\n", "Total Wall", total_time * scale, cpu_time * scale, cpu_time / total_time, nEventReport.c_str()); } + } }