Warning, /HeterogeneousCore/CUDACore/test/cudaTimeMeasurement.cu is written in an unsupported language. File is not indexed.
#include <algorithm>
#include <chrono>  //Time
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <random>
#include <string>
#include <utility>
#include <vector>

#include <cuda.h>
#include <thrust/device_vector.h>
#include <unistd.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0016
// Global run settings. The defaults below are overwritten by the command-line
// options parsed in main().
int sizeOfVector{200};              // number of elements per vector (option -s).
int averg{10};                      // number of repetitions averaged per part (option -a).
int extra{2};                       // extra slots at the end of each timing vector for calculation.
int task{100};                      // number of repeated tasks inside the kernel (option -t).
int part{1};                        // user's choice of part(s), as typed (option -p).
int saveFile{0};                    // non-zero: save the results into a file (option -f).
int printStander{0};                // non-zero: print the standard deviation (option -q).
std::vector<int> partNumber(1, 1);  // user's choice split into single parts; one element, 1, by default.
0026
// Data structure for the times measured by one part.
//
// Every member has a defined initial value: objects of this type are copied
// around (reset by assignment, returned by value), so no member may be left
// indeterminate.
struct Timing {
  int partChosen = 0;  //number of the part these measurements belong to.

  // Host clock readings; index 0 is taken before and index 1 after the measured step.
  std::chrono::steady_clock::time_point copyToDevice[2];
  std::chrono::steady_clock::time_point operationOnDeviceByHost[2];  //device operation seen from the host.
  std::chrono::steady_clock::time_point copyToHost[2];

  // One duration per repetition, in microseconds. The vectors are sized with two extra
  // slots: [averg] receives the average and [averg + 1] the standard deviation.
  std::vector<std::chrono::duration<double, std::micro>> timeCopyToDevice;
  std::vector<std::chrono::duration<double, std::micro>> timeOperationOnDeviceByHost;
  std::vector<std::chrono::duration<double, std::micro>> timeCopyToHost;

  cudaEvent_t start = nullptr;          //event recorded before the device operation.
  cudaEvent_t stop = nullptr;           //event recorded after the device operation.
  float operationOnDeviceByDevice = 0;  //last value read with cudaEventElapsedTime.
  std::vector<float> operationOnDeviceByDeviceAcc;  //per-repetition device time, same slot layout as above.
};
0042
// Host-side vectors used by every part.
struct Vectors {
  std::vector<float> vect1, vect2;  //the two input vectors that get added.
  std::vector<float> vect3Cpu;      //sum computed by the host, only used to verify.
  std::vector<float> vect3Gpu;      //sum produced by the device.
};
0050
// Raw buffer pointers handed to the CUDA allocation calls of the individual parts.
// All of them start out as nullptr so that an unallocated buffer is recognizable
// and never holds an indeterminate address.
struct Pointers {
  float *dVect1 = nullptr;  //first input buffer.
  float *dVect2 = nullptr;  //second input buffer.
  float *dVect3 = nullptr;  //result buffer.

  // Second set of buffers; not referenced by the parts visible in this section of the file.
  float *dVect1Extra = nullptr;
  float *dVect2Extra = nullptr;
  float *dVect3Extra = nullptr;
};
0061
// Kernel: called from the Host (CPU) and executed on the Device (GPU).
// Adds two vectors element by element and stores the result in a third one:
// vect3[j] = vect2[j] + vect1[j] for every j < size.
//
// Each thread starts at its global index (blockDim.x * blockIdx.x + threadIdx.x) and
// advances by the number of threads in the grid (blockDim.x * gridDim.x), so any
// 1D launch configuration covers all `size` elements (grid-stride loop).
// The complete sweep is repeated taskN times; every repetition writes the same values.
__global__ void addVectorsGpu(float *vect1, float *vect2, float *vect3, int size, int taskN) {
  const int threadsInGrid = blockDim.x * gridDim.x;
  const int firstIndex = blockDim.x * blockIdx.x + threadIdx.x;

  for (int remaining = taskN; remaining > 0; --remaining) {
    int j = firstIndex;
    while (j < size) {
      vect3[j] = vect2[j] + vect1[j];
      j += threadsInGrid;
    }
  }
}
0075
// ---- Forward declarations of the host functions defined further down ----

// Vector helpers.
// Called and executed on the Host: adds two vectors and saves the result into a third vector.
void addVectorsCpu(std::vector<float> &vect1, std::vector<float> &vect2, std::vector<float> &vect3);
// Fills a vector with uniform random numbers.
void randomGenerator(std::vector<float> &vect);
// Compares host and device results; prints them out when they differ.
bool checkingResultsPrintout(std::vector<float> &vectCpu, std::vector<float> &vectGpu);

// Statistics.
// Stores the durations of repetition i.
void calculateTimeDuration(Timing &timing, int i);
// Calculates the average and the standard deviation over all repetitions.
void calculateAverageDeviation(Timing &timing);

// Appends the results to the file "<name>.txt".
bool saveToFile(const std::string &name, const Timing &timing);

// Converts an integer into the vector of its digits.
const std::vector<int> chooseFunction(int toInteger);

// The individual parts of the CUDA time measurement.
Timing cudaTimePart0(Timing &timing, Vectors &vect, Pointers &dvect, int size);
Timing cudaTimePart1(Timing &timing, Vectors &vect, Pointers &dvect, int size, int startSave);
Timing cudaTimePart2(Timing &timing, Vectors &vect, int size);
Timing cudaTimePart3(Timing &timing, Vectors &vect, Pointers &dvect, int size);
Timing cudaTimePart4(Timing &timing, Vectors &vect, Pointers &dvect, int size);
Timing cudaTimePart5(Timing &timing, Vectors &vect, Pointers &dvect, int size);

// Printout of the results table.
void printoutAll(std::vector<Timing> &timing, bool standerDeviationPrint);
int getNumberofDigits(double number);
void newLineTitle(int line, const std::string &title);
void printResultEach(std::vector<Timing> &timing, int type, bool standerDeviationPrint);
0113
namespace {
  // Parses the argument of a command-line option as an int and stores it in `value`.
  // The base is auto-detected by std::stoi (decimal, 0x.. hexadecimal, 0.. octal).
  // Returns false, after reporting the problem on stdout, when the text is not a number,
  // does not fit into an int, or is smaller than `minimum`; `value` is then left untouched.
  bool readIntOption(char option, const char *text, int minimum, int &value) {
    try {
      const int parsed = std::stoi(text, nullptr, 0);
      if (parsed < minimum) {
        std::cout << "\n\tError Argument of -" << option << " must be >= " << minimum << std::endl;
        return false;
      }
      value = parsed;
      return true;
    } catch (std::exception &err) {
      std::cout << "\n\tError Must be integer Argument!";
      std::cout << "\n\t" << err.what() << std::endl;
      return false;
    }
  }
}  // namespace

// Entry point: parses the options, prepares the host vectors and timing storage, runs
// every chosen part in the order given by the user and prints the results table.
//
// Options: -s <vector size>  -a <repetitions>  -t <tasks>  -p <parts, e.g. 123>
//          -f (save results into files)  -q (print the standard deviation)
//
// Returns EXIT_FAILURE on an invalid command line, 0 otherwise.
int main(int argc, char *argv[]) {
  cms::cudatest::requireDevices();

  int c;  //option character returned by getopt.
  while ((c = getopt(argc, argv, "s:a:t:p:fq")) != -1) {
    switch (c) {
      case 's':
        if (!readIntOption('s', optarg, 1, sizeOfVector)) {
          return EXIT_FAILURE;
        }
        break;
      case 'a':
        // at least one repetition is needed, the averages divide by this number.
        if (!readIntOption('a', optarg, 1, averg)) {
          return EXIT_FAILURE;
        }
        break;
      case 't':
        if (!readIntOption('t', optarg, 0, task)) {
          return EXIT_FAILURE;
        }
        std::cout << "\nNumber of repeated Task is " << task << std::endl;
        break;
      case 'p':
        if (!readIntOption('p', optarg, 1, part)) {
          return EXIT_FAILURE;
        }
        partNumber = chooseFunction(part);
        if (partNumber.empty()) {
          std::cout << "\n\tError no valid Part was chosen!" << std::endl;
          return EXIT_FAILURE;
        }
        std::cout << "\nyou have chosen Part ";
        for (unsigned int j = 0; j < partNumber.size(); ++j) {
          std::cout << partNumber[j] << " ,";
        }
        std::cout << "\n";
        break;
      case 'f':
        saveFile = 1;
        std::cout << "\nyou have chosen to save file." << std::endl;
        break;
      case 'q':
        printStander = 1;
        std::cout << "\nyou have chosen to print stander Deviation." << std::endl;
        break;
      default:
        // getopt has already reported the unknown option or the missing argument.
        std::cout << "\n\tUsage: " << argv[0] << " [-s size] [-a average] [-t task] [-p parts] [-f] [-q]"
                  << std::endl;
        return EXIT_FAILURE;
    }
  }

  // The byte size and the timing vector length are handled as int below, keep them in range.
  const int maxInt = std::numeric_limits<int>::max();
  if (sizeOfVector > maxInt / static_cast<int>(sizeof(float)) || averg > maxInt - extra) {
    std::cout << "\n\tError Argument is too large!" << std::endl;
    return EXIT_FAILURE;
  }

  // Reject an invalid choice before anything is measured. Valid choices are 1..6,
  // where 6 selects Part 0.
  for (int chosen : partNumber) {
    if (chosen < 1 || chosen > 6) {
      std::cout << "\n\n\tError the User has not chose any number of Function!\n";
      return EXIT_FAILURE;
    }
  }

  const int size = sizeOfVector * static_cast<int>(sizeof(float));  //get size in byte for vectors.
  int startSave = 0;  //counts the runs of Part 1 (to warm up the GPU before saving).

  Timing timing{};     //times variables, all members zero-initialized.
  Timing resetTime{};  //pristine copy used to reset `timing` after every part.
  Vectors vect;        //vectors variables.
  Pointers dvect{};    //pointers for device vectors.

  //Initialize vectors:
  vect.vect1.resize(sizeOfVector);
  vect.vect2.resize(sizeOfVector);
  vect.vect3Cpu.resize(sizeOfVector);
  vect.vect3Gpu.resize(sizeOfVector);

  //Setup vectors for taking the average and standard deviation,
  //`extra` slots are appended for saving those two results.
  resetTime.timeCopyToDevice.resize(averg + extra);
  resetTime.timeOperationOnDeviceByHost.resize(averg + extra);
  resetTime.timeCopyToHost.resize(averg + extra);
  resetTime.operationOnDeviceByDeviceAcc.resize(averg + extra);
  timing = resetTime;

  // Every entry starts as a correctly sized, zeroed Timing so that the printout
  // can always read the result slots.
  std::vector<Timing> allTiming(partNumber.size(), resetTime);

  //generate random numbers.
  randomGenerator(vect.vect1);
  randomGenerator(vect.vect2);

  for (std::size_t i = 0; i < partNumber.size(); ++i) {
    switch (partNumber[i]) {
      case 6:
        allTiming[i] = cudaTimePart0(timing, vect, dvect, size);
        break;
      case 1:
        allTiming[i] = cudaTimePart1(timing, vect, dvect, size, startSave++);
        break;
      case 2:
        allTiming[i] = cudaTimePart2(timing, vect, size);
        break;
      case 3:
        allTiming[i] = cudaTimePart3(timing, vect, dvect, size);
        break;
      case 4:
        allTiming[i] = cudaTimePart4(timing, vect, dvect, size);
        break;
      case 5:
        allTiming[i] = cudaTimePart5(timing, vect, dvect, size);
        break;
      default:
        break;  //not reachable, the choices were validated above.
    }
    timing = resetTime;  //reset timing.
  }

  printoutAll(allTiming, printStander);
  return 0;
}
0247
// Splits a positive integer into its decimal digits, most significant first,
// e.g. 312 -> {3, 1, 2}. Every digit selects one part of the measurement.
//
// Only the digits 1..6 are valid choices (main() dispatches 1..5 to Part 1..5 and
// 6 to Part 0). If any other digit is found an error is printed and an empty vector
// is returned. An empty vector is also returned for toInteger <= 0.
const std::vector<int> chooseFunction(int toInteger) {
  const int firstPart = 1;
  const int lastPart = 6;

  std::vector<int> digits;

  while (toInteger > 0) {
    const int digit = toInteger % 10;
    if (digit < firstPart || digit > lastPart) {
      std::cout << "\n\tError Each digit of the Part Argument must be between " << firstPart << " and " << lastPart
                << ", got " << digit << std::endl;
      return std::vector<int>();
    }
    digits.push_back(digit);
    toInteger /= 10;
  }
  // the digits were collected starting from the least significant one.
  std::reverse(digits.begin(), digits.end());
  return digits;
}
0266
// Overwrites every element of `vect` with a random number drawn from a uniform
// distribution over [0, 1). The engine is seeded from std::random_device, so the
// values differ from run to run.
void randomGenerator(std::vector<float> &vect) {
  std::random_device seedSource;
  std::default_random_engine engine(seedSource());
  std::uniform_real_distribution<> unitInterval(0., 1.);
  for (float &element : vect) {
    element = unitInterval(engine);
  }
}
// Host reference: vect3[i] = vect2[i] + vect1[i] for every element of vect1.
// vect2 and vect3 must hold at least vect1.size() elements.
void addVectorsCpu(std::vector<float> &vect1, std::vector<float> &vect2, std::vector<float> &vect3) {
  std::transform(vect1.begin(), vect1.end(), vect2.begin(), vect3.begin(), [](float first, float second) {
    return second + first;
  });
}
0281
// Verifies the device result against the host result.
//
// Returns true when both vectors have the same size and every pair of elements is
// equal. Otherwise a table with the host value, the device value, their difference
// and the relative error in percent of every element is printed, followed by the
// total error, and false is returned.
//
// The total is the sum of the absolute percentage errors, so errors of opposite sign
// cannot cancel each other. Elements that are equal contribute no error, also when
// both are zero.
bool checkingResultsPrintout(std::vector<float> &vectCpu, std::vector<float> &vectGpu) {
  if (vectCpu.size() != vectGpu.size()) {
    std::cout << "\n-Error the Cpu vector has " << vectCpu.size() << " elements but the Gpu vector has "
              << vectGpu.size() << std::endl;
    return false;
  }

  // Relative error of one element in percent, with the host value as reference.
  // A non-zero difference against a zero host value yields +-infinity, which still
  // marks the element as wrong.
  const auto percentError = [](float cpu, float gpu) -> float {
    const float difference = cpu - gpu;
    if (difference == 0.f) {
      return 0.f;
    }
    return (difference / cpu) * 100;
  };

  const std::size_t size = vectCpu.size();
  float totalError{0.0};
  for (std::size_t j = 0; j < size; j++) {
    totalError += std::fabs(percentError(vectCpu[j], vectGpu[j]));
  }
  // written as a comparison so that a NaN total (NaN in the inputs) counts as a failure.
  if (totalError == 0.f) {
    return true;
  }

  std::cout << "\n------------------------------------\n";
  std::cout << "| CpuSum | GpuSum | Error | Error %| ";
  std::cout << "\n------------------------------------\n";
  std::cout.flags(std::ios::fixed | std::ios::showpoint);
  std::cout.precision(4);
  for (std::size_t j = 0; j < size; j++) {
    std::cout << "| " << vectCpu[j] << " | " << vectGpu[j] << " | " << vectCpu[j] - vectGpu[j] << " | "
              << percentError(vectCpu[j], vectGpu[j]) << " |\n";
  }
  std::cout << "-------------------------------------\n";
  std::cout << "-Total Error is " << totalError << std::endl;
  return false;
}
// Stores the durations of repetition `i`.
//
// The three host-side durations are the differences of the time points taken by the
// caller, converted to microseconds. The device-side duration is read from the events
// `timing.start` and `timing.stop`; cudaEventElapsedTime reports milliseconds, the value
// is stored in microseconds like the host-side ones.
//
// `i` must be a valid index into the four duration vectors of `timing`, and both
// events must have been recorded.
void calculateTimeDuration(Timing &timing, int i) {
  timing.timeCopyToDevice[i] = (timing.copyToDevice[1] - timing.copyToDevice[0]);
  timing.timeOperationOnDeviceByHost[i] = (timing.operationOnDeviceByHost[1] - timing.operationOnDeviceByHost[0]);
  timing.timeCopyToHost[i] = (timing.copyToHost[1] - timing.copyToHost[0]);

  // The elapsed time can only be read once the stop event has completed. Not every
  // caller waits for it, so wait here; all host time points are already taken.
  cudaCheck(cudaEventSynchronize(timing.stop));
  cudaCheck(cudaEventElapsedTime(&timing.operationOnDeviceByDevice, timing.start, timing.stop));
  timing.operationOnDeviceByDeviceAcc[i] = (timing.operationOnDeviceByDevice * 1000);  //milliseconds to microseconds.
}
namespace {
  // Read a stored sample as a plain number (microseconds).
  inline double sampleValue(const std::chrono::duration<double, std::micro> &sample) { return sample.count(); }
  inline double sampleValue(float sample) { return sample; }

  // Write a plain number (microseconds) into a result slot.
  inline void storeValue(std::chrono::duration<double, std::micro> &slot, double value) {
    slot = std::chrono::duration<double, std::micro>(value);
  }
  inline void storeValue(float &slot, double value) { slot = static_cast<float>(value); }

  // Computes the average and the standard deviation of samples[0 .. count-1] and stores
  // them in samples[count] and samples[count + 1]. Does nothing when count is not
  // positive or the vector has no room for the two result slots.
  template <typename Sample>
  void storeAverageAndDeviation(std::vector<Sample> &samples, int count) {
    if (count <= 0 || samples.size() < static_cast<std::size_t>(count) + 2) {
      return;
    }

    double sum = 0.;
    for (int i = 0; i < count; ++i) {
      sum += sampleValue(samples[i]);
    }
    const double average = sum / count;

    double squares = 0.;
    for (int i = 0; i < count; ++i) {
      const double difference = sampleValue(samples[i]) - average;
      squares += difference * difference;
    }

    storeValue(samples[count], average);
    storeValue(samples[count + 1], std::sqrt(squares / count));
  }
}  // namespace

// Calculates the average and the standard deviation of the `averg` repetitions stored
// in the four duration vectors of `timing`.
//
// For every vector the average is written to slot [averg] and the standard deviation
// (square root of the mean squared difference from the average, divided by `averg`)
// to slot [averg + 1]. The samples in slots [0 .. averg-1] are left unchanged and the
// previous content of the two result slots is ignored, so the function gives the same
// result when called more than once. All sums are accumulated in double precision.
void calculateAverageDeviation(Timing &timing) {
  storeAverageAndDeviation(timing.timeCopyToDevice, averg);
  storeAverageAndDeviation(timing.timeOperationOnDeviceByHost, averg);
  storeAverageAndDeviation(timing.timeCopyToHost, averg);
  storeAverageAndDeviation(timing.operationOnDeviceByDeviceAcc, averg);
}
0362
// Prints the results table of all measured parts.
//
// Layout: a header row with one "Part N" cell per entry of `timing`, a second header
// row with the " AverTime " (and optionally " StDeviation ") column titles, then four
// rows, one per kind of measured duration, each preceded by a horizontal rule.
// Numbers are printed in fixed notation with 4 decimals; these stream settings stay
// in effect on std::cout after the call.
void printoutAll(std::vector<Timing> &timing, bool standerDeviationPrint) {
  const std::string gpuReadCpu = " Duration Time Read from Host To Device ";
  const std::string timeCpu = " Duration Time operation on Host point View ";
  const std::string timeGpu = " Duration Time operation on Device point View ";
  const std::string cpuReadGpu = " Duration Time Read from Device To Host ";

  const std::string averageTime = " AverTime ";
  const std::string standerDeviation = " StDeviation ";
  const std::string nameTiming = " Name Timing ";
  const std::string partsNumberall = "Part ";

  // Width taken by one part: its column title(s) plus 3 more characters.
  const std::size_t perPart =
      standerDeviationPrint ? averageTime.size() + standerDeviation.size() + 3 : averageTime.size() + 3;
  // Total length of the horizontal rules.
  const int totalFix = static_cast<int>(timeGpu.size() + timing.size() * perPart);
  // Padding on both sides of the " Name Timing " title.
  const int titlePad = static_cast<int>((timeGpu.size() - nameTiming.size()) / 2);
  // Padding on both sides of every "Part N" label.
  const int partPad =
      standerDeviationPrint
          ? static_cast<int>(((averageTime.size() + standerDeviation.size()) - partsNumberall.size() + 1) / 2)
          : static_cast<int>(((averageTime.size()) - partsNumberall.size()) / 2);

  // Draws an empty line break followed by a rule of totalFix dashes.
  const auto drawRule = [totalFix]() {
    std::cout << '\n';
    std::cout << std::setw(totalFix) << std::setfill('-') << '-' << '\n';
    std::cout << std::setfill(' ');
  };

  std::cout.flags(std::ios::fixed | std::ios::showpoint);
  std::cout.precision(4);

  drawRule();

  // First header row: the title cell followed by one "Part N" cell per entry.
  std::cout << "|" << std::setw(titlePad) << std::setfill(' ') << " " << nameTiming;
  std::cout << std::setw(titlePad) << std::setfill(' ') << " "
            << " |";
  for (const Timing &entry : timing) {
    std::cout << std::setw(partPad) << " " << partsNumberall << entry.partChosen;
    std::cout << std::setw(partPad) << " "
              << "|";
  }

  // Second header row: an empty title cell followed by the column titles.
  std::cout << '\n' << "|";
  std::cout << std::setw(static_cast<int>(gpuReadCpu.size() + 3)) << std::setfill(' ') << "|";
  for (std::size_t i = 0; i < timing.size(); ++i) {
    std::cout << averageTime << "|";
    if (standerDeviationPrint) {
      std::cout << standerDeviation << "|";
    }
  }

  // The four result rows; the number passed to printResultEach selects the duration.
  newLineTitle(totalFix, gpuReadCpu);
  printResultEach(timing, 1, standerDeviationPrint);
  newLineTitle(totalFix, timeCpu);
  printResultEach(timing, 2, standerDeviationPrint);
  newLineTitle(totalFix, timeGpu);
  printResultEach(timing, 3, standerDeviationPrint);
  newLineTitle(totalFix, cpuReadGpu);
  printResultEach(timing, 4, standerDeviationPrint);

  drawRule();
}
// Returns the number of digits of the integer part of `number` plus 4.
//
// The sign is ignored. Values whose magnitude is below 1 (including zero) have a
// one-digit integer part, "0", so they give 5. The same value is returned for NaN
// and infinity, for which a digit count is not defined.
int getNumberofDigits(double number) {
  const double magnitude = std::fabs(number);
  if (!std::isfinite(magnitude) || magnitude < 1.) {
    return 1 + 4;
  }
  return (static_cast<int>(std::log10(magnitude)) + 1) + 4;
}
// Starts a new table row: prints a line break, a rule of `line` dashes and, on the
// next line, the row title framed as "| <title> |". The fill character of std::cout
// is a blank afterwards.
void newLineTitle(int line, const std::string &title) {
  std::cout << '\n' << std::setw(line) << std::setfill('-') << '-' << '\n' << std::setfill(' ');
  std::cout << "| " << title << " |";
}
// Prints the result cells of one table row: for every entry of `timing` the average
// (slot [averg]), right-aligned in 10 characters and closed by "|", and, when
// `standerDeviationPrint` is set, the standard deviation (slot [averg + 1]) in 13
// characters, closed by "|" as well.
//
// `type` selects the duration: 1 copy to device, 2 device operation seen from the host,
// 3 device operation seen from the device, 4 copy to host. Nothing is printed for any
// other value.
void printResultEach(std::vector<Timing> &timing, int type, bool standerDeviationPrint) {
  const int averageTimeWidth = 10;
  const int standerDeviationWidth = 13;

  if (type < 1 || type > 4) {
    return;
  }

  // Reads the requested slot of the duration selected by `type`.
  const auto valueAt = [type](const Timing &entry, int slot) -> double {
    switch (type) {
      case 1:
        return entry.timeCopyToDevice[slot].count();
      case 2:
        return entry.timeOperationOnDeviceByHost[slot].count();
      case 3:
        return entry.operationOnDeviceByDeviceAcc[slot];
      default:  //type 4
        return entry.timeCopyToHost[slot].count();
    }
  };

  for (const Timing &entry : timing) {
    std::cout << std::setw(averageTimeWidth) << std::setfill(' ') << valueAt(entry, averg) << "|";
    if (standerDeviationPrint) {
      std::cout << std::setw(standerDeviationWidth) << std::setfill(' ') << valueAt(entry, averg + 1) << "|";
    }
  }
}
0517
// Appends the results of one part to the text file "<name>.txt", creating the file if
// needed. Written, one item per line: the vector size, the number of repetitions, the
// number of tasks, then for each of the four durations (copy to device, device
// operation seen from the host, device operation seen from the device, copy to host)
// the average and the standard deviation separated by a blank.
//
// Returns true on success. Returns false, after printing a message, when the file
// cannot be opened or the stream is in a bad state after writing.
bool saveToFile(const std::string &name, const Timing &timing) {
  std::ofstream file(name + ".txt", std::ios::out | std::ios::app);
  if (!file.is_open()) {
    std::cout << "\nCannot open File nor Create File!" << std::endl;
    return false;
  }

  const int average = averg;        //slot holding the average.
  const int deviation = averg + 1;  //slot holding the standard deviation.

  file << sizeOfVector << std::endl << averg << std::endl << task << std::endl;
  file << timing.timeCopyToDevice[average].count() << " " << timing.timeCopyToDevice[deviation].count()
       << std::endl;
  file << timing.timeOperationOnDeviceByHost[average].count() << " "
       << timing.timeOperationOnDeviceByHost[deviation].count() << std::endl;
  file << timing.operationOnDeviceByDeviceAcc[average] << " " << timing.operationOnDeviceByDeviceAcc[deviation]
       << std::endl;
  file << timing.timeCopyToHost[average].count() << " " << timing.timeCopyToHost[deviation].count() << std::endl;

  file.close();
  if (file.good()) {
    return true;
  }
  std::cout << "\n*ERROR While Writing The " + name + " file!!" << std::endl;
  return false;
}
0543
// Part 0: the kernel is launched directly on the storage of the host vectors; no
// device buffer is allocated and nothing is copied, so the two copy durations only
// measure two consecutive clock readings. `dvect` and `size` are not used.
//
// NOTE(review): the kernel receives plain std::vector storage that is neither
// registered nor allocated through CUDA, and the banner below mentions CudaMalloc
// although none is made. Presumably this part is only meant for systems where the
// device can access pageable host memory -- confirm.
//
// The measurement is repeated `averg` times. Afterwards the result is verified against
// the host sum; only if it matches are average and deviation calculated and, when
// requested, saved to "dataPart0.txt". Returns a copy of `timing`.
Timing cudaTimePart0(Timing &timing, Vectors &vect, Pointers &dvect, int size) {
  std::cout << "\nCudaMalloc is applied Part 0.\n";
  timing.partChosen = 0;

  // Launch configuration, identical for every repetition.
  const int threads = 512;                                                 //arbitrary number.
  const int blocks = std::min((sizeOfVector + threads - 1) / threads, 8);  //ceiling number of blocks, at most 8.

  //////////// Start Average From Here /////////////////////
  for (int i = 0; i < averg; i++) {
    std::fill(vect.vect3Gpu.begin(), vect.vect3Gpu.end(), 0);  //clear each value of vector's elements

    cudaCheck(cudaEventCreate(&timing.start));  //initialize Event.
    cudaCheck(cudaEventCreate(&timing.stop));

    ////////////////////////// Copy From Host To Device //////////////////////////////////
    timing.copyToDevice[0] = std::chrono::steady_clock::now();  //nothing is copied in this part.
    timing.copyToDevice[1] = std::chrono::steady_clock::now();

    ////////////////////////// Call Device Kernel //////////////////////////////////
    cudaCheck(cudaEventRecord(timing.start));
    timing.operationOnDeviceByHost[0] = std::chrono::steady_clock::now();

    //call device function to add two vectors and save into vect3Gpu.
    addVectorsGpu<<<blocks, threads>>>(
        vect.vect1.data(), vect.vect2.data(), vect.vect3Gpu.data(), sizeOfVector, task);
    cudaCheck(cudaGetLastError());

    cudaCheck(cudaDeviceSynchronize());
    timing.operationOnDeviceByHost[1] = std::chrono::steady_clock::now();
    cudaCheck(cudaEventRecord(timing.stop));

    ////////////////////////// Copy From Device To Host //////////////////////////////////
    timing.copyToHost[0] = std::chrono::steady_clock::now();  //nothing is copied in this part.
    timing.copyToHost[1] = std::chrono::steady_clock::now();

    // Nothing after the stop record waits for the device in this part, so wait for the
    // event explicitly before its elapsed time is read.
    cudaCheck(cudaEventSynchronize(timing.stop));
    calculateTimeDuration(timing, i);

    cudaCheck(cudaEventDestroy(timing.start));
    cudaCheck(cudaEventDestroy(timing.stop));
  }
  //////////////////// End Average //////////////////////

  addVectorsCpu(vect.vect1, vect.vect2, vect.vect3Cpu);  //Host is adding vectors too.
  //Checking the results, if error then Print out to the user.
  if (checkingResultsPrintout(vect.vect3Cpu, vect.vect3Gpu)) {
    calculateAverageDeviation(timing);
    // report success only when the file was really written; saveToFile prints the error otherwise.
    if (saveFile && saveToFile("dataPart0", timing)) {
      std::cout << "\nThe File is saved successfuly.\n";
    }
  }

  return timing;
}
// Part 1: classic scheme with device buffers allocated by cudaMalloc.
// Every repetition copies both inputs to the device, runs the kernel on the device
// buffers and copies the result back with blocking cudaMemcpy calls.
//
// `size` is the size of one vector in bytes. The measurement is repeated `averg` times.
// Afterwards the result is verified against the host sum; only if it matches are
// average and deviation calculated. The results are saved to "dataPart1.txt" only when
// saving was requested and `startSave` is greater than 2. The device buffers are freed
// before returning a copy of `timing`.
Timing cudaTimePart1(Timing &timing, Vectors &vect, Pointers &dvect, int size, int startSave) {
  std::cout << "\nCudaMalloc is applied Part 1.\n";
  timing.partChosen = 1;

  //allocate memory space for the vectors in the global memory of the Device.
  cudaCheck(cudaMalloc((void **)&dvect.dVect1, size));
  cudaCheck(cudaMalloc((void **)&dvect.dVect2, size));
  cudaCheck(cudaMalloc((void **)&dvect.dVect3, size));

  // Launch configuration, identical for every repetition.
  const int threads = 512;                                                 //arbitrary number.
  const int blocks = std::min((sizeOfVector + threads - 1) / threads, 8);  //ceiling number of blocks, at most 8.

  //////////// Start Average From Here /////////////////////
  for (int i = 0; i < averg; i++) {
    std::fill(vect.vect3Gpu.begin(), vect.vect3Gpu.end(), 0);

    cudaCheck(cudaEventCreate(&timing.start));  //initialize Event.
    cudaCheck(cudaEventCreate(&timing.stop));

    ////////////////////////// Copy From Host To Device //////////////////////////////////
    timing.copyToDevice[0] = std::chrono::steady_clock::now();

    //copy the random vectors from host to device.
    cudaCheck(cudaMemcpy(dvect.dVect1, vect.vect1.data(), size, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(dvect.dVect2, vect.vect2.data(), size, cudaMemcpyHostToDevice));

    timing.copyToDevice[1] = std::chrono::steady_clock::now();

    ////////////////////////// Call Device Kernel //////////////////////////////////
    cudaCheck(cudaEventRecord(timing.start));
    timing.operationOnDeviceByHost[0] = std::chrono::steady_clock::now();

    //call device function to add two vectors and save into dVect3.
    addVectorsGpu<<<blocks, threads>>>(dvect.dVect1, dvect.dVect2, dvect.dVect3, sizeOfVector, task);
    cudaCheck(cudaGetLastError());

    cudaCheck(cudaDeviceSynchronize());

    timing.operationOnDeviceByHost[1] = std::chrono::steady_clock::now();
    cudaCheck(cudaEventRecord(timing.stop));

    ////////////////////////// Copy From Device To Host //////////////////////////////////
    timing.copyToHost[0] = std::chrono::steady_clock::now();

    //copy summing result vector from Device to Host.
    cudaCheck(cudaMemcpy(vect.vect3Gpu.data(), dvect.dVect3, size, cudaMemcpyDeviceToHost));

    timing.copyToHost[1] = std::chrono::steady_clock::now();

    calculateTimeDuration(timing, i);

    cudaCheck(cudaEventDestroy(timing.start));
    cudaCheck(cudaEventDestroy(timing.stop));
  }
  //////////////////// End Average //////////////////////

  addVectorsCpu(vect.vect1, vect.vect2, vect.vect3Cpu);  //Host is adding vectors too.
  //Checking the results, if error then Print out to the user.
  if (checkingResultsPrintout(vect.vect3Cpu, vect.vect3Gpu)) {
    calculateAverageDeviation(timing);
    // report success only when the file was really written; saveToFile prints the error otherwise.
    if (saveFile && startSave > 2 && saveToFile("dataPart1", timing)) {
      std::cout << "\nThe File is saved successfuly.\n";
    }
  }

  cudaCheck(cudaFree(dvect.dVect1));
  cudaCheck(cudaFree(dvect.dVect2));
  cudaCheck(cudaFree(dvect.dVect3));

  return timing;
}
// Part 2: the storage of the host vectors is page-locked with cudaHostRegister and
// the kernel works directly on it; no device buffer is allocated.
//
// In every repetition the "copy to device" duration measures the three
// cudaHostRegister calls and the "copy to host" duration the three cudaHostUnregister
// calls. `size` is the size of one vector in bytes.
//
// NOTE(review): the kernel is given the host addresses themselves. Presumably this
// relies on the device being able to use the host pointer of registered memory --
// confirm, otherwise cudaHostGetDevicePointer would be needed.
//
// The measurement is repeated `averg` times. Afterwards the result is verified against
// the host sum; only if it matches are average and deviation calculated and, when
// requested, saved to "dataPart2.txt". Returns a copy of `timing`.
Timing cudaTimePart2(Timing &timing, Vectors &vect, int size) {
  std::cout << "\nCudaHostRegister is Part 2.\n";
  timing.partChosen = 2;

  // Launch configuration, identical for every repetition.
  const int threads = 512;                                                 //arbitrary number.
  const int blocks = std::min((sizeOfVector + threads - 1) / threads, 8);  //ceiling number of blocks, at most 8.

  //////////// Start Average From Here /////////////////////
  for (int i = 0; i < averg; i++) {
    std::fill(vect.vect3Gpu.begin(), vect.vect3Gpu.end(), 0);

    cudaCheck(cudaEventCreate(&timing.start));  //initialize Event.
    cudaCheck(cudaEventCreate(&timing.stop));

    ////////////////////////// Register the Host memory //////////////////////////////////
    timing.copyToDevice[0] = std::chrono::steady_clock::now();

    cudaCheck(cudaHostRegister(vect.vect1.data(), size, cudaHostRegisterDefault));
    cudaCheck(cudaHostRegister(vect.vect2.data(), size, cudaHostRegisterDefault));
    cudaCheck(cudaHostRegister(vect.vect3Gpu.data(), size, cudaHostRegisterDefault));

    timing.copyToDevice[1] = std::chrono::steady_clock::now();

    ////////////////////////// Call Device Kernel //////////////////////////////////
    timing.operationOnDeviceByHost[0] = std::chrono::steady_clock::now();
    cudaCheck(cudaEventRecord(timing.start));
    cudaCheck(cudaEventSynchronize(timing.start));  //wait until the start event has completed.

    //call device function to add two vectors and save into vect3Gpu.
    addVectorsGpu<<<blocks, threads>>>(
        vect.vect1.data(), vect.vect2.data(), vect.vect3Gpu.data(), sizeOfVector, task);
    cudaCheck(cudaGetLastError());

    cudaCheck(cudaDeviceSynchronize());
    cudaCheck(cudaEventRecord(timing.stop));
    cudaCheck(cudaEventSynchronize(timing.stop));

    timing.operationOnDeviceByHost[1] = std::chrono::steady_clock::now();

    ////////////////////////// Unregister the Host memory //////////////////////////////////
    timing.copyToHost[0] = std::chrono::steady_clock::now();

    cudaCheck(cudaHostUnregister(vect.vect1.data()));
    cudaCheck(cudaHostUnregister(vect.vect2.data()));
    cudaCheck(cudaHostUnregister(vect.vect3Gpu.data()));

    timing.copyToHost[1] = std::chrono::steady_clock::now();

    calculateTimeDuration(timing, i);

    cudaCheck(cudaEventDestroy(timing.start));
    cudaCheck(cudaEventDestroy(timing.stop));
  }
  //////////////////// End Average //////////////////////

  addVectorsCpu(vect.vect1, vect.vect2, vect.vect3Cpu);  //Host is adding vectors too.
  //Checking the results, if error then Print out to the user.
  if (checkingResultsPrintout(vect.vect3Cpu, vect.vect3Gpu)) {
    calculateAverageDeviation(timing);
    // report success only when the file was really written; saveToFile prints the error otherwise.
    if (saveFile && saveToFile("dataPart2", timing)) {
      std::cout << "\nThe File is saved successfuly.\n";
    }
  }

  return timing;
}
0757
// Part 3: the working buffers are page-locked host memory allocated with
// cudaMallocHost. Every repetition copies both inputs into these buffers, runs the
// kernel on them and copies the result back into the host vector.
//
// NOTE(review): the buffers in `dvect` are host allocations, yet the copies are issued
// with cudaMemcpyHostToDevice / cudaMemcpyDeviceToHost. The CUDA documentation
// describes a copy whose pointers do not match the stated direction as undefined;
// cudaMemcpyDefault would let the runtime infer it -- confirm the intent before changing.
//
// `size` is the size of one vector in bytes. The measurement is repeated `averg` times.
// Afterwards the result is verified against the host sum; only if it matches are
// average and deviation calculated and, when requested, saved to "dataPart3.txt". The
// buffers are freed before returning a copy of `timing`.
Timing cudaTimePart3(Timing &timing, Vectors &vect, Pointers &dvect, int size) {
  std::cout << "\nCudaMallocHost is applied Part 3.\n";
  timing.partChosen = 3;

  //allocate memory space for the vectors in the host memory.
  cudaCheck(cudaMallocHost((void **)&dvect.dVect1, size));
  cudaCheck(cudaMallocHost((void **)&dvect.dVect2, size));
  cudaCheck(cudaMallocHost((void **)&dvect.dVect3, size));

  // Launch configuration, identical for every repetition.
  const int threads = 512;                                                 //arbitrary number.
  const int blocks = std::min((sizeOfVector + threads - 1) / threads, 8);  //ceiling number of blocks, at most 8.

  //////////// Start Average From Here /////////////////////
  for (int i = 0; i < averg; i++) {
    std::fill(vect.vect3Gpu.begin(), vect.vect3Gpu.end(), 0);

    cudaCheck(cudaEventCreate(&timing.start));  //initialize Event.
    cudaCheck(cudaEventCreate(&timing.stop));

    ////////////////////////// Copy the inputs into the allocated buffers //////////////////////////////////
    timing.copyToDevice[0] = std::chrono::steady_clock::now();

    cudaCheck(cudaMemcpy(dvect.dVect1, vect.vect1.data(), size, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(dvect.dVect2, vect.vect2.data(), size, cudaMemcpyHostToDevice));

    timing.copyToDevice[1] = std::chrono::steady_clock::now();

    ////////////////////////// Call Device Kernel //////////////////////////////////
    timing.operationOnDeviceByHost[0] = std::chrono::steady_clock::now();
    cudaCheck(cudaEventRecord(timing.start));
    cudaCheck(cudaEventSynchronize(timing.start));  //wait until the start event has completed.

    //call device function to add two vectors and save into dVect3.
    addVectorsGpu<<<blocks, threads>>>(dvect.dVect1, dvect.dVect2, dvect.dVect3, sizeOfVector, task);
    cudaCheck(cudaGetLastError());

    cudaCheck(cudaDeviceSynchronize());

    cudaCheck(cudaEventRecord(timing.stop));
    cudaCheck(cudaEventSynchronize(timing.stop));

    timing.operationOnDeviceByHost[1] = std::chrono::steady_clock::now();

    ////////////////////////// Copy the result back into the host vector //////////////////////////////////
    timing.copyToHost[0] = std::chrono::steady_clock::now();

    cudaCheck(cudaMemcpy(vect.vect3Gpu.data(), dvect.dVect3, size, cudaMemcpyDeviceToHost));

    timing.copyToHost[1] = std::chrono::steady_clock::now();

    calculateTimeDuration(timing, i);

    cudaCheck(cudaEventDestroy(timing.start));
    cudaCheck(cudaEventDestroy(timing.stop));
  }
  //////////////////// End Average //////////////////////

  addVectorsCpu(vect.vect1, vect.vect2, vect.vect3Cpu);  //Host is adding vectors too.
  //Checking the results, if error then Print out to the user.
  if (checkingResultsPrintout(vect.vect3Cpu, vect.vect3Gpu)) {
    calculateAverageDeviation(timing);
    // report success only when the file was really written; saveToFile prints the error otherwise.
    if (saveFile && saveToFile("dataPart3", timing)) {
      std::cout << "\nThe File is saved successfuly.\n";
    }
  }

  cudaCheck(cudaFreeHost(dvect.dVect1));
  cudaCheck(cudaFreeHost(dvect.dVect2));
  cudaCheck(cudaFreeHost(dvect.dVect3));
  return timing;
}
// Part 4: time the vector addition when host data is staged through pinned buffers on its way to the device.
//
// dvect.dVect1/2/3 are pinned host buffers (cudaMallocHost); dvect.dVect1Extra/2Extra/3Extra are device
// buffers (cudaMalloc). The measurement is repeated `averg` times. Each repetition timestamps
//   1. memcpy of vect1/vect2 into the pinned buffers followed by cudaMemcpy to the device buffers,
//   2. the kernel launch plus synchronisation (host clock and CUDA events),
//   3. cudaMemcpy of the result into the pinned buffer followed by memcpy into vect.vect3Gpu,
// and forwards the timestamps to calculateTimeDuration(timing, i).
// After the loop the last GPU result is compared against addVectorsCpu's output; only when
// checkingResultsPrintout accepts it are the statistics computed and, if saveFile is set, written out.
//
// Parameters:
//   timing - receives the part number, the time points and the CUDA events of each repetition.
//   vect   - host vectors: vect1/vect2 are the inputs, vect3Gpu receives the GPU result and vect3Cpu the
//            CPU result used for the comparison.
//   dvect  - pointer holder; all six pointers named above are overwritten here and released before returning.
//   size   - byte count used for every allocation and copy.
// Returns a copy of `timing`.
Timing cudaTimePart4(Timing &timing, Vectors &vect, Pointers &dvect, int size) {
  std::cout << "\nCudaMallocHost is applied Part 4\n";
  timing.partChosen = 4;

  // Pinned (page-locked) staging buffers on the host.
  cudaCheck(cudaMallocHost((void **)&dvect.dVect1, size));
  cudaCheck(cudaMallocHost((void **)&dvect.dVect2, size));
  cudaCheck(cudaMallocHost((void **)&dvect.dVect3, size));

  // Buffers on the device that the kernel works on.
  cudaCheck(cudaMalloc((void **)&dvect.dVect1Extra, size));
  cudaCheck(cudaMalloc((void **)&dvect.dVect2Extra, size));
  cudaCheck(cudaMalloc((void **)&dvect.dVect3Extra, size));

  // Launch configuration does not change between repetitions, so compute it once.
  const int threads = 512;  // arbitrary block size.
  // Ceiling division, capped at 8 blocks (the original author's bound for small NVIDIA GPUs).
  const int blocks = std::min((sizeOfVector + threads - 1) / threads, 8);

  //////////// Start Average From Here /////////////////////
  for (int i = 0; i < averg; i++) {
    std::fill(vect.vect3Gpu.begin(), vect.vect3Gpu.end(), 0);  // discard the previous repetition's result.

    cudaCheck(cudaEventCreate(&timing.start));  // fresh events for this repetition.
    cudaCheck(cudaEventCreate(&timing.stop));

    timing.copyToDevice[0] = std::chrono::steady_clock::now();

    // Stage 1: host vector -> pinned host buffer (plain host copy, included in the measured interval).
    memcpy(dvect.dVect1, vect.vect1.data(), size);
    memcpy(dvect.dVect2, vect.vect2.data(), size);

    // Stage 2: pinned host buffer -> device buffer.
    cudaCheck(cudaMemcpy(dvect.dVect1Extra, dvect.dVect1, size, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(dvect.dVect2Extra, dvect.dVect2, size, cudaMemcpyHostToDevice));

    timing.copyToDevice[1] = std::chrono::steady_clock::now();

    timing.operationOnDeviceByHost[0] = std::chrono::steady_clock::now();
    cudaCheck(cudaEventRecord(timing.start));
    // Block the host until the start event has actually been reached on the device.
    cudaCheck(cudaEventSynchronize(timing.start));

    // Add the two input buffers on the device; the result lands in dvect.dVect3Extra.
    addVectorsGpu<<<blocks, threads>>>(
        dvect.dVect1Extra, dvect.dVect2Extra, dvect.dVect3Extra, sizeOfVector, task);
    cudaCheck(cudaGetLastError());  // launch-configuration errors.

    cudaCheck(cudaDeviceSynchronize());  // wait for the kernel; surfaces execution errors.
    cudaCheck(cudaEventRecord(timing.stop));
    cudaCheck(cudaEventSynchronize(timing.stop));

    timing.operationOnDeviceByHost[1] = std::chrono::steady_clock::now();

    timing.copyToHost[0] = std::chrono::steady_clock::now();

    // Reverse path: device buffer -> pinned host buffer -> host vector.
    cudaCheck(cudaMemcpy(dvect.dVect3, dvect.dVect3Extra, size, cudaMemcpyDeviceToHost));
    memcpy(vect.vect3Gpu.data(), dvect.dVect3, size);

    timing.copyToHost[1] = std::chrono::steady_clock::now();

    // Must run before the events are destroyed (presumably it reads them - confirm in calculateTimeDuration).
    calculateTimeDuration(timing, i);

    cudaCheck(cudaEventDestroy(timing.start));
    cudaCheck(cudaEventDestroy(timing.stop));
  }
  //////////////////// End Average //////////////////////

  addVectorsCpu(vect.vect1, vect.vect2, vect.vect3Cpu);  // host computes the same sum for comparison.

  // Compares CPU and GPU results; reports mismatches to the user itself.
  const bool resultsMatch = checkingResultsPrintout(vect.vect3Cpu, vect.vect3Gpu);

  if (resultsMatch) {
    calculateAverageDeviation(timing);
    // Announce success only when saveToFile reports it; previously the message was printed regardless
    // of the returned value. NOTE(review): assumes a true return means the file was written - confirm.
    if (saveFile && saveToFile("dataPart4", timing)) {
      std::cout << "\nThe File is saved successfuly.\n";
    }
  }

  cudaCheck(cudaFreeHost(dvect.dVect1));
  cudaCheck(cudaFreeHost(dvect.dVect2));
  cudaCheck(cudaFreeHost(dvect.dVect3));
  cudaCheck(cudaFree(dvect.dVect1Extra));
  cudaCheck(cudaFree(dvect.dVect2Extra));
  cudaCheck(cudaFree(dvect.dVect3Extra));

  return timing;
}
0920
// Part 5: time the vector addition when the host vectors themselves are page-locked with cudaHostRegister.
//
// dvect.dVect1/2/3 are device buffers (cudaMalloc). The measurement is repeated `averg` times. In each
// repetition
//   1. the copy-to-device interval covers registering vect1, vect2 and vect3Gpu with cudaHostRegister
//      and copying vect1/vect2 to the device,
//   2. the operation interval covers the kernel launch plus synchronisation (host clock and CUDA events),
//   3. the copy-to-host interval covers copying the result into vect.vect3Gpu and unregistering all
//      three vectors,
// and the timestamps are forwarded to calculateTimeDuration(timing, i).
// After the loop the last GPU result is compared against addVectorsCpu's output; only when
// checkingResultsPrintout accepts it are the statistics computed and, if saveFile is set, written out.
//
// Parameters:
//   timing - receives the part number, the time points and the CUDA events of each repetition.
//   vect   - host vectors: vect1/vect2 are the inputs, vect3Gpu receives the GPU result and vect3Cpu the
//            CPU result used for the comparison.
//   dvect  - pointer holder; dVect1..dVect3 are overwritten here and released before returning.
//   size   - byte count used for every allocation, registration and copy.
// Returns a copy of `timing`.
Timing cudaTimePart5(Timing &timing, Vectors &vect, Pointers &dvect, int size) {
  std::cout << "\nCudaHostRegister is applied Part 5.\n";
  timing.partChosen = 5;

  // Buffers in the device's global memory.
  cudaCheck(cudaMalloc((void **)&dvect.dVect1, size));
  cudaCheck(cudaMalloc((void **)&dvect.dVect2, size));
  cudaCheck(cudaMalloc((void **)&dvect.dVect3, size));

  // Launch configuration does not change between repetitions, so compute it once.
  const int threads = 512;  // arbitrary block size.
  // Ceiling division, capped at 8 blocks (the original author's bound for small NVIDIA GPUs).
  const int blocks = std::min((sizeOfVector + threads - 1) / threads, 8);

  //////////// Start Average From Here /////////////////////
  for (int i = 0; i < averg; i++) {
    std::fill(vect.vect3Gpu.begin(), vect.vect3Gpu.end(), 0);  // discard the previous repetition's result.

    cudaCheck(cudaEventCreate(&timing.start));  // fresh events for this repetition.
    cudaCheck(cudaEventCreate(&timing.stop));

    timing.copyToDevice[0] = std::chrono::steady_clock::now();

    // Page-lock the existing host vectors in place; the registration cost is part of the measured interval.
    cudaCheck(cudaHostRegister(vect.vect1.data(), size, cudaHostRegisterDefault));
    cudaCheck(cudaHostRegister(vect.vect2.data(), size, cudaHostRegisterDefault));
    cudaCheck(cudaHostRegister(vect.vect3Gpu.data(), size, cudaHostRegisterDefault));

    // Registered host vector -> device buffer.
    cudaCheck(cudaMemcpy(dvect.dVect1, vect.vect1.data(), size, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemcpy(dvect.dVect2, vect.vect2.data(), size, cudaMemcpyHostToDevice));

    timing.copyToDevice[1] = std::chrono::steady_clock::now();

    timing.operationOnDeviceByHost[0] = std::chrono::steady_clock::now();
    cudaCheck(cudaEventRecord(timing.start));
    // Block the host until the start event has actually been reached on the device.
    cudaCheck(cudaEventSynchronize(timing.start));

    // Add the two input buffers on the device; the result lands in dvect.dVect3.
    addVectorsGpu<<<blocks, threads>>>(dvect.dVect1, dvect.dVect2, dvect.dVect3, sizeOfVector, task);
    cudaCheck(cudaGetLastError());  // launch-configuration errors.

    cudaCheck(cudaDeviceSynchronize());  // wait for the kernel; surfaces execution errors.
    cudaCheck(cudaEventRecord(timing.stop));
    cudaCheck(cudaEventSynchronize(timing.stop));

    timing.operationOnDeviceByHost[1] = std::chrono::steady_clock::now();

    timing.copyToHost[0] = std::chrono::steady_clock::now();

    // Device buffer -> registered host vector.
    cudaCheck(cudaMemcpy(vect.vect3Gpu.data(), dvect.dVect3, size, cudaMemcpyDeviceToHost));

    // Undo the page-locking so the next repetition can register the same ranges again; the
    // unregistration cost is part of the measured interval.
    cudaCheck(cudaHostUnregister(vect.vect1.data()));
    cudaCheck(cudaHostUnregister(vect.vect2.data()));
    cudaCheck(cudaHostUnregister(vect.vect3Gpu.data()));

    timing.copyToHost[1] = std::chrono::steady_clock::now();

    // Must run before the events are destroyed (presumably it reads them - confirm in calculateTimeDuration).
    calculateTimeDuration(timing, i);

    cudaCheck(cudaEventDestroy(timing.start));
    cudaCheck(cudaEventDestroy(timing.stop));
  }
  //////////////////// End Average //////////////////////

  addVectorsCpu(vect.vect1, vect.vect2, vect.vect3Cpu);  // host computes the same sum for comparison.

  // Compares CPU and GPU results; reports mismatches to the user itself.
  const bool resultsMatch = checkingResultsPrintout(vect.vect3Cpu, vect.vect3Gpu);

  if (resultsMatch) {
    calculateAverageDeviation(timing);
    // Announce success only when saveToFile reports it; previously the message was printed regardless
    // of the returned value. NOTE(review): assumes a true return means the file was written - confirm.
    if (saveFile && saveToFile("dataPart5", timing)) {
      std::cout << "\nThe File is saved successfuly.\n";
    }
  }

  cudaCheck(cudaFree(dvect.dVect1));
  cudaCheck(cudaFree(dvect.dVect2));
  cudaCheck(cudaFree(dvect.dVect3));
  return timing;
}