29 #ifndef OCLCPAENGINE_HPP 30 #define OCLCPAENGINE_HPP 43 template<
class Tc,
class Tt,
class Tp>
48 static const char * m_programCode;
49 unsigned int m_samplesPerTrace;
50 unsigned int m_noOfCandidates;
51 unsigned int m_noOfTraces;
56 cl_mem m_predictions_mem;
59 cl_mem m_predsAvg_mem;
60 cl_mem m_predsMSum_mem;
61 cl_mem m_tracesAvg_mem;
62 cl_mem m_tracesMSum_mem;
63 cl_mem m_predsTracesCSum_mem;
67 cl_kernel m_kernel_computeTracesAvgMSum;
68 cl_kernel m_kernel_computePredsAvgMSum;
69 cl_kernel m_kernel_computeCSum;
74 OclCpaEngine(
unsigned int platform,
unsigned int device,
unsigned int samplesPerTrace,
unsigned int noOfCandidates,
unsigned int noOfTraces);
90 template<
class Tc,
class Tt,
class Tp>
92 :
OclEngine<Tc>(platform, device), m_samplesPerTrace(samplesPerTrace), m_noOfCandidates(noOfCandidates), m_noOfTraces(noOfTraces), m_compiled(false) {
97 m_predictions_mem = clCreateBuffer(this->m_context, CL_MEM_READ_ONLY, noOfCandidates * noOfTraces *
sizeof(Tp), NULL, &ret);
99 throw RuntimeException(
"Couldn't allocate a data buffer on the device", ret);
102 m_traces_mem = clCreateBuffer(this->m_context, CL_MEM_READ_ONLY, samplesPerTrace * noOfTraces *
sizeof(Tt), NULL, &ret);
104 clReleaseMemObject(m_predictions_mem);
105 throw RuntimeException(
"Couldn't allocate a data buffer on the device", ret);
108 m_predsAvg_mem = clCreateBuffer(this->m_context, CL_MEM_READ_WRITE, noOfCandidates *
sizeof(Tc), NULL, &ret);
110 clReleaseMemObject(m_predictions_mem);
111 clReleaseMemObject(m_traces_mem);
112 throw RuntimeException(
"Couldn't allocate a working context buffer on the device", ret);
115 m_predsMSum_mem = clCreateBuffer(this->m_context, CL_MEM_READ_WRITE, noOfCandidates *
sizeof(Tc), NULL, &ret);
117 clReleaseMemObject(m_predictions_mem);
118 clReleaseMemObject(m_traces_mem);
119 clReleaseMemObject(m_predsAvg_mem);
120 throw RuntimeException(
"Couldn't allocate a working context buffer on the device", ret);
123 m_tracesAvg_mem = clCreateBuffer(this->m_context, CL_MEM_READ_WRITE, samplesPerTrace *
sizeof(Tc), NULL, &ret);
125 clReleaseMemObject(m_predictions_mem);
126 clReleaseMemObject(m_traces_mem);
127 clReleaseMemObject(m_predsAvg_mem);
128 clReleaseMemObject(m_predsMSum_mem);
129 throw RuntimeException(
"Couldn't allocate a working context buffer on the device", ret);
132 m_tracesMSum_mem = clCreateBuffer(this->m_context, CL_MEM_READ_WRITE, samplesPerTrace *
sizeof(Tc), NULL, &ret);
134 clReleaseMemObject(m_predictions_mem);
135 clReleaseMemObject(m_traces_mem);
136 clReleaseMemObject(m_predsAvg_mem);
137 clReleaseMemObject(m_predsMSum_mem);
138 clReleaseMemObject(m_tracesAvg_mem);
139 throw RuntimeException(
"Couldn't allocate a working context buffer on the device", ret);
142 m_predsTracesCSum_mem = clCreateBuffer(this->m_context, CL_MEM_READ_WRITE, noOfCandidates * samplesPerTrace *
sizeof(Tc), NULL, &ret);
144 clReleaseMemObject(m_predictions_mem);
145 clReleaseMemObject(m_traces_mem);
146 clReleaseMemObject(m_predsAvg_mem);
147 clReleaseMemObject(m_predsMSum_mem);
148 clReleaseMemObject(m_tracesAvg_mem);
149 clReleaseMemObject(m_tracesMSum_mem);
150 throw RuntimeException(
"Couldn't allocate a working context buffer on the device", ret);
156 template<
class Tc,
class Tt,
class Tp>
160 clReleaseKernel(m_kernel_computeCSum);
161 clReleaseKernel(m_kernel_computePredsAvgMSum);
162 clReleaseKernel(m_kernel_computeTracesAvgMSum);
163 clReleaseProgram(m_program);
166 clReleaseMemObject(m_predictions_mem);
167 clReleaseMemObject(m_traces_mem);
168 clReleaseMemObject(m_predsAvg_mem);
169 clReleaseMemObject(m_predsMSum_mem);
170 clReleaseMemObject(m_tracesAvg_mem);
171 clReleaseMemObject(m_tracesMSum_mem);
172 clReleaseMemObject(m_predsTracesCSum_mem);
177 template<
class Tc,
class Tt,
class Tp>
180 if(m_compiled)
return;
188 std::string code(
"");
191 if (!(this->getTypeName(dummyTc)).compare(
"double") || !(this->getTypeName(dummyTt)).compare(
"double") || !(this->getTypeName(dummyTp)).compare(
"double")) {
192 code.append(
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
195 code.append(
"typedef ").append(this->getTypeName(dummyTc)).append(
" Tc;\n");
196 code.append(
"typedef ").append(this->getTypeName(dummyTt)).append(
" Tt;\n");
197 code.append(
"typedef ").append(this->getTypeName(dummyTp)).append(
" Tp;\n");
199 code.append(m_programCode);
202 const char * codePtr = code.c_str();
203 const size_t codeSize = code.size();
205 m_program = clCreateProgramWithSource(this->m_context, 1, (
const char **)&codePtr, (
const size_t *)&codeSize, &ret);
206 if (ret)
throw RuntimeException(
"Couldn't create the ocl program from source", ret);
209 ret = clBuildProgram(m_program, 1, &((this->m_devices)[this->m_device]), NULL, NULL, NULL);
211 clReleaseProgram(m_program);
216 m_kernel_computeTracesAvgMSum = clCreateKernel(m_program,
"computeTracesAvgMSum", &ret);
218 clReleaseProgram(m_program);
222 m_kernel_computePredsAvgMSum = clCreateKernel(m_program,
"computePredsAvgMSum", &ret);
224 clReleaseKernel(m_kernel_computeTracesAvgMSum);
225 clReleaseProgram(m_program);
229 m_kernel_computeCSum = clCreateKernel(m_program,
"computeCSum", &ret);
231 clReleaseKernel(m_kernel_computePredsAvgMSum);
232 clReleaseKernel(m_kernel_computeTracesAvgMSum);
233 clReleaseProgram(m_program);
240 ret = clSetKernelArg(m_kernel_computeTracesAvgMSum, 0,
sizeof(cl_mem), (
void *)&m_traces_mem);
243 ret = clSetKernelArg(m_kernel_computeTracesAvgMSum, 1,
sizeof(cl_mem), (
void *)&m_tracesAvg_mem);
246 ret = clSetKernelArg(m_kernel_computeTracesAvgMSum, 2,
sizeof(cl_mem), (
void *)&m_tracesMSum_mem);
249 ret = clSetKernelArg(m_kernel_computeTracesAvgMSum, 3,
sizeof(
unsigned int), (
void *)&m_samplesPerTrace);
253 ret = clSetKernelArg(m_kernel_computePredsAvgMSum, 0,
sizeof(cl_mem), (
void *)&m_predictions_mem);
256 ret = clSetKernelArg(m_kernel_computePredsAvgMSum, 1,
sizeof(cl_mem), (
void *)&m_predsAvg_mem);
259 ret = clSetKernelArg(m_kernel_computePredsAvgMSum, 2,
sizeof(cl_mem), (
void *)&m_predsMSum_mem);
262 ret = clSetKernelArg(m_kernel_computePredsAvgMSum, 3,
sizeof(
unsigned int), (
void *)&m_noOfCandidates);
266 ret = clSetKernelArg(m_kernel_computeCSum, 0,
sizeof(cl_mem), (
void *)&m_traces_mem);
269 ret = clSetKernelArg(m_kernel_computeCSum, 1,
sizeof(cl_mem), (
void *)&m_predictions_mem);
272 ret = clSetKernelArg(m_kernel_computeCSum, 2,
sizeof(cl_mem), (
void *)&m_tracesAvg_mem);
275 ret = clSetKernelArg(m_kernel_computeCSum, 3,
sizeof(cl_mem), (
void *)&m_predsAvg_mem);
278 ret = clSetKernelArg(m_kernel_computeCSum, 4,
sizeof(cl_mem), (
void *)&m_predsTracesCSum_mem);
281 ret = clSetKernelArg(m_kernel_computeCSum, 5,
sizeof(
unsigned int), (
void *)&m_samplesPerTrace);
284 ret = clSetKernelArg(m_kernel_computeCSum, 6,
sizeof(
unsigned int), (
void *)&m_noOfCandidates);
290 template<
class Tc,
class Tt,
class Tp>
293 if (m_noOfTraces * m_noOfCandidates *
sizeof(Tp) != pp.
size())
294 throw RuntimeException(
"Number of traces and/or number of candidates conflicts with values set within construction of the ocl engine");
296 cl_int ret = clEnqueueWriteBuffer(this->m_command_queue, m_predictions_mem, blocking ? CL_TRUE : CL_FALSE, 0, pp.
size(), pp.
data(), 0, NULL, NULL);
297 if (ret)
throw RuntimeException(
"Couldn't enqueue a data transmit to the device", ret);
302 template<
class Tc,
class Tt,
class Tp>
305 if (m_noOfTraces * m_samplesPerTrace *
sizeof(Tt) != pt.
size())
306 throw RuntimeException(
"Number of traces and/or number of samples per trace conflicts with values set within construction of the ocl engine");
308 cl_int ret = clEnqueueWriteBuffer(this->m_command_queue, m_traces_mem, blocking ? CL_TRUE : CL_FALSE, 0, pt.
size(), pt.
data(), 0, NULL, NULL);
309 if (ret)
throw RuntimeException(
"Couldn't enqueue a data transmit to the device", ret);
314 template<
class Tc,
class Tt,
class Tp>
317 corrContext.init(m_samplesPerTrace, m_noOfCandidates, 1, 2, 1);
319 unsigned int noOfSlices = m_noOfTraces / sliceSize;
320 unsigned int remaindingSliceSize = m_noOfTraces - noOfSlices * sliceSize;
324 size_t traces_global_item_size = (((m_samplesPerTrace - 1) / 64) + 1) * 64;
325 size_t traces_local_item_size = 64;
327 ret = clSetKernelArg(m_kernel_computeTracesAvgMSum, 4,
sizeof(
unsigned int), (
void *)&sliceSize);
330 for (
unsigned int i = 0; i < noOfSlices; i++) {
332 offset = i * sliceSize;
334 ret = clSetKernelArg(m_kernel_computeTracesAvgMSum, 5,
sizeof(
unsigned int), (
void *)&offset);
337 ret = clEnqueueNDRangeKernel(this->m_command_queue, m_kernel_computeTracesAvgMSum, 1, NULL, &traces_global_item_size, &traces_local_item_size, 0, NULL, NULL);
338 if (ret)
throw RuntimeException(
"Couldn't enqueue a kernel to the device", ret);
340 ret = clFinish(this->m_command_queue);
345 offset = noOfSlices * sliceSize;
347 ret = clSetKernelArg(m_kernel_computeTracesAvgMSum, 4,
sizeof(
unsigned int), (
void *)&remaindingSliceSize);
350 ret = clSetKernelArg(m_kernel_computeTracesAvgMSum, 5,
sizeof(
unsigned int), (
void *)&offset);
353 ret = clEnqueueNDRangeKernel(this->m_command_queue, m_kernel_computeTracesAvgMSum, 1, NULL, &traces_global_item_size, &traces_local_item_size, 0, NULL, NULL);
354 if (ret)
throw RuntimeException(
"Couldn't enqueue a kernel to the device", ret);
356 ret = clFinish(this->m_command_queue);
361 size_t preds_global_item_size = (((m_noOfCandidates - 1) / 64) + 1) * 64;
362 size_t preds_local_item_size = 64;
364 ret = clSetKernelArg(m_kernel_computePredsAvgMSum, 4,
sizeof(
unsigned int), (
void *)&sliceSize);
367 for (
unsigned int i = 0; i < noOfSlices; i++) {
369 offset = i * sliceSize;
371 ret = clSetKernelArg(m_kernel_computePredsAvgMSum, 5,
sizeof(
unsigned int), (
void *)&offset);
374 ret = clEnqueueNDRangeKernel(this->m_command_queue, m_kernel_computePredsAvgMSum, 1, NULL, &preds_global_item_size, &preds_local_item_size, 0, NULL, NULL);
375 if (ret)
throw RuntimeException(
"Couldn't enqueue a kernel to the device", ret);
377 ret = clFinish(this->m_command_queue);
382 offset = noOfSlices * sliceSize;
384 ret = clSetKernelArg(m_kernel_computePredsAvgMSum, 4,
sizeof(
unsigned int), (
void *)&remaindingSliceSize);
387 ret = clSetKernelArg(m_kernel_computePredsAvgMSum, 5,
sizeof(
unsigned int), (
void *)&offset);
390 ret = clEnqueueNDRangeKernel(this->m_command_queue, m_kernel_computePredsAvgMSum, 1, NULL, &preds_global_item_size, &preds_local_item_size, 0, NULL, NULL);
391 if (ret)
throw RuntimeException(
"Couldn't enqueue a kernel to the device", ret);
393 ret = clFinish(this->m_command_queue);
398 size_t csum_global_item_size[2];
399 csum_global_item_size[0] = (((m_samplesPerTrace - 1) / 16) + 1) * 16;
400 csum_global_item_size[1] = (((m_noOfCandidates - 1) / 16) + 1) * 16;
401 size_t csum_local_item_size[] = { 16, 16 };
403 ret = clSetKernelArg(m_kernel_computeCSum, 7,
sizeof(
unsigned int), (
void *)&sliceSize);
406 for (
unsigned int i = 0; i < noOfSlices; i++) {
408 offset = i * sliceSize;
410 ret = clSetKernelArg(m_kernel_computeCSum, 8,
sizeof(
unsigned int), (
void *)&offset);
413 ret = clEnqueueNDRangeKernel(this->m_command_queue, m_kernel_computeCSum, 2, NULL, csum_global_item_size, csum_local_item_size, 0, NULL, NULL);
414 if (ret)
throw RuntimeException(
"Couldn't enqueue a kernel to the device", ret);
416 ret = clFinish(this->m_command_queue);
421 offset = noOfSlices * sliceSize;
423 ret = clSetKernelArg(m_kernel_computeCSum, 7,
sizeof(
unsigned int), (
void *)&remaindingSliceSize);
426 ret = clSetKernelArg(m_kernel_computeCSum, 8,
sizeof(
unsigned int), (
void *)&offset);
429 ret = clEnqueueNDRangeKernel(this->m_command_queue, m_kernel_computeCSum, 2, NULL, csum_global_item_size, csum_local_item_size, 0, NULL, NULL);
430 if (ret)
throw RuntimeException(
"Couldn't enqueue a kernel to the device", ret);
432 ret = clFinish(this->m_command_queue);
437 ret = clEnqueueReadBuffer(this->m_command_queue, m_predsAvg_mem, CL_TRUE, 0, corrContext.
p2M(1).size(), corrContext.
p2M(1).data(), 0, NULL, NULL);
438 if (ret)
throw RuntimeException(
"Couldn't enqueue a data transmit from the device", ret);
440 ret = clEnqueueReadBuffer(this->m_command_queue, m_predsMSum_mem, CL_TRUE, 0, corrContext.
p2CS(2).size(), corrContext.
p2CS(2).data(), 0, NULL, NULL);
441 if (ret)
throw RuntimeException(
"Couldn't enqueue a data transmit from the device", ret);
443 ret = clEnqueueReadBuffer(this->m_command_queue, m_tracesAvg_mem, CL_TRUE, 0, corrContext.
p1M(1).size(), corrContext.
p1M(1).data(), 0, NULL, NULL);
444 if (ret)
throw RuntimeException(
"Couldn't enqueue a data transmit from the device", ret);
446 ret = clEnqueueReadBuffer(this->m_command_queue, m_tracesMSum_mem, CL_TRUE, 0, corrContext.
p1CS(2).size(), corrContext.
p1CS(2).data(), 0, NULL, NULL);
447 if (ret)
throw RuntimeException(
"Couldn't enqueue a data transmit from the device", ret);
449 ret = clEnqueueReadBuffer(this->m_command_queue, m_predsTracesCSum_mem, CL_TRUE, 0, corrContext.
p12ACS(1).size(), corrContext.
p12ACS(1).data(), 0, NULL, NULL);
450 if (ret)
throw RuntimeException(
"Couldn't enqueue a data transmit from the device", ret);
452 ret = clFinish(this->m_command_queue);
455 corrContext.
p1Card() = m_noOfTraces;
461 template<
class Tc,
class Tt,
class Tp>
464 __kernel void computeCSum(__global const Tt *traces, __global const Tp *predictions, __global const Tc *tracesAvg, __global const Tc *predsAvg, __global Tc *CSums, unsigned int samplesPerTrace, unsigned int noOfCandidates, unsigned int noOfTraces, unsigned int traceOffset) { \ 466 unsigned int sample = get_global_id(0); \ 467 unsigned int candidate = get_global_id(1);\ 469 Tc localTracesAvg = (sample < samplesPerTrace) ? tracesAvg[sample] : (Tc)0;\ 470 Tc localPredsAvg = (candidate < noOfCandidates) ? predsAvg[candidate] : (Tc)0;\ 471 Tc localCSum = ((traceOffset > 0) && (sample < samplesPerTrace) && (candidate < noOfCandidates)) ? CSums[candidate * samplesPerTrace + sample] : (Tc)0;\ 476 for (int trace = traceOffset; trace < (traceOffset + noOfTraces); trace++) {\ 478 val1 = (sample < samplesPerTrace) ? traces[trace * samplesPerTrace + sample] : (Tc)0;\ 479 val2 = (candidate < noOfCandidates) ? predictions[trace * noOfCandidates + candidate] : (Tc)0;\ 481 localCSum += (val1 - localTracesAvg) * (val2 - localPredsAvg);\ 483 barrier(CLK_LOCAL_MEM_FENCE);\ 487 if ((sample < samplesPerTrace) && (candidate < noOfCandidates)) {\ 489 CSums[candidate * samplesPerTrace + sample] = localCSum;\ 496 __kernel void computeTracesAvgMSum(__global const Tt *traces, __global Tc *avgs, __global Tc *msums, unsigned int samplesPerTrace, unsigned int noOfTraces, unsigned int traceOffset) {\ 498 unsigned int sample = get_global_id(0);\ 500 if (sample >= samplesPerTrace)\ 503 unsigned int trace = traceOffset;\ 504 Tc ctrace = (Tc)traceOffset;\ 506 Tc localAvg = (traceOffset > 0) ? avgs[sample] : (Tc)0;\ 507 Tc localMSum = (traceOffset > 0) ? msums[sample] : (Tc)0;\ 512 for (; trace < (traceOffset + noOfTraces); trace++) {\ 514 val = traces[trace * samplesPerTrace + sample];\ 516 temp = val - localAvg;\ 517 ctrace = ctrace + (Tc)1;\ 518 localAvg += temp / ctrace;\ 519 localMSum += temp * (val - localAvg);\ 523 avgs[sample] = localAvg;\ 524 msums[sample] = localMSum;\ 529 __kernel void computePredsAvgMSum(__global const Tp *preds, __global Tc *avgs, __global Tc *msums, unsigned int noOfCandidates, unsigned int noOfTraces, unsigned int traceOffset) {\ 531 unsigned int candidate = get_global_id(0);\ 533 if (candidate >= noOfCandidates)\ 536 unsigned int trace = traceOffset;\ 537 Tc ctrace = (Tc)traceOffset;\ 539 Tc localAvg = (traceOffset > 0) ? avgs[candidate] : (Tc)0;\ 540 Tc localMSum = (traceOffset > 0) ? msums[candidate] : (Tc)0;\ 545 for (; trace < (traceOffset + noOfTraces); trace++) {\ 547 val = preds[trace * noOfCandidates + candidate];\ 549 temp = val - localAvg;\ 550 ctrace = ctrace + (Tc)1;\ 551 localAvg += temp / ctrace;\ 552 localMSum += temp * (val - localAvg);\ 556 avgs[candidate] = localAvg;\ 557 msums[candidate] = localMSum;\ OpenCL base class template for SICAK plugins.
virtual T * data()
Returns a pointer to the contained data.
Definition: types_basic.hpp:343
A class representing a Two-population Univariate Moment-based statistical context.
Definition: types_stat.hpp:43
virtual Matrix< T > & p12ACS(size_t order)
Adjusted central moment sum both populations, order 1 upto acsOrder.
Definition: types_stat.hpp:220
This header file contains exceptions.
OclCpaEngine(unsigned int platform, unsigned int device, unsigned int samplesPerTrace, unsigned int noOfCandidates, unsigned int noOfTraces)
Initialize given platform and device, create command queue and allocate device memory buffers.
Definition: oclcpaengine.hpp:91
OpenCL base class template used in other SICAK plugins.
Definition: oclengine.hpp:51
virtual size_t & p1Card()
Cardinality of the first population.
Definition: types_stat.hpp:190
Definition: oclcpaengine.hpp:44
virtual Vector< T > & p2CS(size_t order)
Central moment sum of the second population, order 2 upto csOrder.
Definition: types_stat.hpp:215
This header file contains class templates of power traces and power consumption containers.
A class representing a Matrix with 'noOfTraces' power predictions, with 'noOfCandidates' key candidat...
Definition: types_power.hpp:82
void loadPredictionsToDevice(const PowerPredictions< Tp > &pp, bool blocking=false)
Load power predictions from local memory to device buffers.
Definition: oclcpaengine.hpp:291
virtual size_t & p2Card()
Cardinality of the second population.
Definition: types_stat.hpp:195
void loadTracesToDevice(const PowerTraces< Tt > &pt, bool blocking=false)
Load power traces from local memory to device buffers.
Definition: oclcpaengine.hpp:303
virtual Vector< T > & p1CS(size_t order)
Central moment sum of the first population, order 2 upto csOrder.
Definition: types_stat.hpp:210
void buildProgram()
Build the OpenCL kernels.
Definition: oclcpaengine.hpp:178
virtual Vector< T > & p2M(size_t order)
Raw moment of the second population, order 1 upto mOrder.
Definition: types_stat.hpp:205
An exception which cannot be directly influenced by the user, or predicted beforehand.
Definition: exceptions.hpp:76
void compute(UnivariateContext< Tc > &context, unsigned int sliceSize)
Launch the computation kernel, divide the work by sliceSize (long running GPU kernel is not good),...
Definition: oclcpaengine.hpp:315
This header file contains class templates of statistical computational contexts.
virtual size_t size() const
Returns the size of the contained data (i.e. length * sizeof(T))
Definition: types_basic.hpp:347
A class representing a Matrix with 'noOfTraces' power traces, with 'samplesPerTrace' samples per powe...
Definition: types_power.hpp:44
virtual Vector< T > & p1M(size_t order)
Raw moment of the first population, order 1 upto mOrder.
Definition: types_stat.hpp:200