0001 #include <iostream>
0002 #include <fstream>
0003 #include <iomanip>
0004 #include <cstdlib>
0005 #include <string>
0006 #include <algorithm>
0007 #include <vector>
0008 #include <random>
0009 #include <utility>
0010 #include <mpi.h>
0011 #include <unistd.h>
0012 //////////////////////////////////////////// C U D A /////////////////////////////////////////
0013 #include <cuda.h>
0014 #include <thrust/device_vector.h>
0015 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0017
0018 //called from the Host and executed on the Device (GPU)
0019 __global__ void addVectorsGpu(float *vect1, float *vect2, float *vect3, int size, int taskN) {
0020 //blockDim.x gives the number of threads in a block, in the x direction.
0021 //gridDim.x gives the number of blocks in a grid, in the x direction.
0022 //blockDim.x * gridDim.x gives the number of threads in a grid (in the x direction, in this case).
0023 int first = blockDim.x * blockIdx.x + threadIdx.x;
0024 int stride = blockDim.x * gridDim.x;
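//Grid-stride loop: each thread handles elements first, first + stride, first + 2*stride, ... so any grid size covers the whole vector.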
0025 for (int i = 0; i < taskN; ++i) {
0026 for (int j = first; j < size; j += stride) {
0027 vect3[j] = vect2[j] + vect1[j];
0028 }
0029 }
0030 } //add two vectors and save the result into the third vector.
0031 //////////////////////////////////////////////////////////////////////////////////////////////////
0032
0033 /////////////////////////////////////////// Global Variables /////////////////////////////////////
0034 unsigned int sizeVector = 2000;
0035 int average = 5;
0036 int task = 1;
0037 int partsToRun = 1;
0038 bool printStander = false;
0039 bool saveFile = false;
0040 bool help = false;
0041 //////////////////////////////////////////////////////////////////////////////////////////////////
0042
0043 /////////////////////////////////////////// Data Structure /////////////////////////////////////
0044 //Data For User's Choices Only
0045 struct UserChoises {
0046 // unsigned int sizeVector;
0047 unsigned int sizeVectorBytes; //size of the vectors in bytes (elements are float by default)
0048 // unsigned int average;
0049 unsigned int extra;
0050 // unsigned int task;
0051 // unsigned int partsToRun;
0052 int root;
0053 // int numberProcess;
0054 int averageVectorSend;
0055 std::vector<int> partsToRunVector; //vector for user's choice of part.
0056 };
0057
0058 //Data For MPI Only
0059 struct MPIData {
0060 int num_procs;
0061 int rank;
0062
0063 std::pair<int, int> workSplit;
0064 float *mVect1; //declare vector 1.
0065 float *mVect2; //declare vector 2.
0066 float *mVect3; //declare vector filled only by the root to gather the results from the workers.
0067 float *mVectChecking; //declare vector to verify the results from each process.
0068 float *mVectWorker1; //declare vector 1 for workers only.
0069 float *mVectWorker2; //declare vector 2 for workers only.
0070 float *mVectWorker3; //declare vector 3 for workers only.
0071 std::vector<int> displacement; //declare vector for selecting location of each element to be sent.
0072 std::vector<int> numberToSend;
0073 };
0074
0075 //Data For Cuda Only
0076 struct Pointers {
0077 float *vect1; //pointers only for Host
0078 float *vect2;
0079 float *vect3;
0080
0081 float *dVect1; //pointers only for device
0082 float *dVect2;
0083 float *dVect3;
0084
0085 float *dVect1Extra; //pointers only for device
0086 float *dVect2Extra;
0087 float *dVect3Extra;
0088 };
0089
0090 //Data for Time Measurements Only
0091 struct Timing {
0092 int partChosen;
0093 int unitChoice;
0094 double inputPreparationRoot[2]; // get time points from start and end on Root Side
0095 double inputPreparationHost[2]; // get time points from start and end on Host Side.
0096 double operationOnDeviceByHost[2]; //get time duration in Device with Host perspective.
0097
0098 double outputPreparationRoot[2];
0099 double outputPreparationHost[2];
0100
0101 std::vector<double> timeInputPreparationRoot; //Save the Duration time.
0102 std::vector<double> timeInputPreparationHost;
0103 std::vector<double> timeOperationOnDeviceByRootHost;
0104 std::vector<double> timeOutputPreparationRoot;
0105 std::vector<double> timeOutputPreparationHost;
0106
0107 cudaEvent_t start, stop; //get time points in Device.
0108 float operationOnDeviceByDevice = 0; //get time duration in Device with device perspective.
0109 std::vector<float> operationOnDeviceByDeviceAcc; //get accumulating time duration in Device with device perspective.
0110 std::vector<float> averageResults; //declare vector for sending the average calculations from the Hosts/Device to the Root.
0111 };
0112
0113 //////////////////////////////////////////////////////////////////////////////////////////////////
0114 const std::vector<int> chooseFunction(int toInteger);
0115 std::pair<int, int> splitProcess(int works, int numberOfProcess);
0116 const std::vector<int> numberDataSend(int numberOfProcess, std::pair<int, int> splitWorks);
0117 void setupMPIAndVectors(
0118 MPIData &mpiData,
0119 UserChoises &user); //initialize the MPI communicator environment, resize the vectors and generate the random input.
0120 void setupTime(Timing &timing, UserChoises &user); //Resize the timing vectors.
0121 void calculateTimeDuration(Timing &timing, int i, int &root);
0122 void addVectorsHost(float *vect1, float *vect2, float *vect3);
0123 void cleanBuffer(float *vect);
0124 bool checkingResultsPrintout(float *vectCpu, float *vectGpu);
0125 void calculateAverageDeviation(Timing &timing, int averg, int &root);
0126 bool sendAverageToRoot(Timing &timing, UserChoises &user, int &rank);
0127
0128 Timing blockSendPart1(MPIData &mpidata, Timing &timing, Pointers &pointer, UserChoises &user);
0129 Timing blockSendPart2(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user);
0130 Timing blockSendPart3(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user);
0131
0132 void printTable(std::vector<Timing> &timing, bool standerDeviationPrint);
0133 int getNumberofDigits(double number);
0134 void newLineTitle(int line, const std::string &title);
0135 void printResultEach(std::vector<Timing> &timing, int type, bool standerDeviationPrint);
0136 bool saveToFile(const std::string &name, const Timing &timing);
0137
0138 void printHelp(void);
0139 int main(int argc, char *argv[]) {
0140 int c; //to get parameters from user.
0141
0142 UserChoises user; //Set up the user's input variables
0143 user.extra = 2;
0144 user.root = 0;
0145 user.averageVectorSend = 8;
0146
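//Parse the command-line options; see printHelp() for the meaning of each flag.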
0147 while ((c = getopt(argc, argv, "s:a:t:p:qfh")) != -1) {
0148 switch (c) {
0149 case 's':
0150 try {
0151 sizeVector = std::stoll(optarg, nullptr, 0);
0152 } catch (std::exception &err) {
0153 std::cout << "\n\tError Must be integer Argument!";
0154 std::cout << "\n\t" << err.what() << std::endl;
0155 return 0;
0156 }
0157 break;
0158 case 'a':
0159 try {
0160 average = std::stoll(optarg, nullptr, 0);
0161
0162 } catch (std::exception &err) {
0163 std::cout << "\n\tError Must be integer Argument!";
0164 std::cout << "\n\t" << err.what() << std::endl;
0165 return 0;
0166 }
0167 break;
0168 case 't':
0169 try {
0170 task = std::stoll(optarg, nullptr, 0);
0171 //std::cout << "\nNumber of repeated Task is " << task << std::endl;
0172 } catch (std::exception &err) {
0173 std::cout << "\n\tError Must be integer Argument!";
0174 std::cout << "\n\t" << err.what() << std::endl;
0175 return 0;
0176 }
0177 break;
0178 case 'p':
0179 try {
0180 partsToRun = std::stoll(optarg, nullptr, 0);
0181 user.partsToRunVector = chooseFunction(partsToRun);
0182 //std::cout << "\nyou have chosen Part ";
0183 for (unsigned int j = 0; j < user.partsToRunVector.size(); ++j) {
0184 std::cout << user.partsToRunVector[j] << " ,";
0185 }
0186 std::cout << "\n";
0187 } catch (std::exception &err) {
0188 std::cout << "\n\tError Must be integer Argument!";
0189 std::cout << "\n\t" << err.what() << std::endl;
0190 return 0;
0191 }
0192 break;
0193 case 'q': //boolean flags below cannot throw, so no try/catch is needed.
0194 printStander = true;
0195 break;
0196 case 'f':
0197 saveFile = true;
0198 break;
0199 case 'h':
0200 help = true;
0201 break;
0220
0221 default:
0222 abort();
0223 }
0224 }
0225
0226 MPIData mpiData;
0227 Timing timing;
0228 Timing resetTime;
0229 Pointers pointer;
0230 timing.unitChoice = 1000000; //1M
0231 resetTime.unitChoice = 1000000; //1M
0232
0233 std::vector<Timing> allTiming;
0234 allTiming.resize(user.partsToRunVector.size());
0235
0236 MPI_Init(&argc, &argv); //initialize communicator environment.
0237
0238 if (help) {
0239 printHelp();
0240 MPI::Finalize();
0241 exit(0);
0242 }
0243 setupMPIAndVectors(mpiData, user);
0244
0245 setupTime(timing, user);
0246 setupTime(resetTime, user);
0247
0248 for (long unsigned int i = 0; i < user.partsToRunVector.size(); ++i) {
0249 if (user.partsToRunVector[i] == 1) {
0250 //setupTime(allTiming[i], user);
0251 //blockSendPart1(mpiData, allTiming[i], pointer, user);
0252 allTiming[i] = blockSendPart1(mpiData, timing, pointer, user);
0253 timing = resetTime;
0254
0255 } else if (user.partsToRunVector[i] == 2) {
0256 //setupTime(allTiming[i], user);
0257 //blockSendPart2(mpiData, allTiming[i], pointer, user);
0258 allTiming[i] = blockSendPart2(mpiData, timing, pointer, user);
0259 timing = resetTime;
0260
0261 } else if (user.partsToRunVector[i] == 3) {
0262 allTiming[i] = blockSendPart3(mpiData, timing, pointer, user);
0263 timing = resetTime;
0264 // } else if (user.partsToRunVector[i] == 4) {
0265 // allTiming[i] = cudaTimePart4(timing, vect, dvect, size);
0266
0267 // } else if (user.partsToRunVector[i] == 5) {
0268 // allTiming[i] = cudaTimePart5(timing, vect, dvect, size);
0269
0270 } else {
0271 std::cout << "\n\n\tError the User has not chose any number of Function!\n";
0272 break;
0273 }
0274 }
0275
0276 if (!mpiData.rank)
0277 printTable(allTiming, printStander);
0278
0279 MPI::Finalize();
0280 return 0;
0281 }
0282 const std::vector<int> chooseFunction(int toInteger) {
0283 std::vector<int> digits(0, 0);
0284 std::vector<int> ERROR(0, 0);
0285
0286 int digit{1};
0287
0288 while (toInteger > 0) {
0289 digit = toInteger % 10;
0290 if (digit > 7) {
0291 std::cout << "\n\tError Must be integer Argument <= " << toInteger << std::endl;
0292 return ERROR;
0293 }
0294 digits.push_back(digit);
0295 toInteger /= 10;
0296 }
0297 std::reverse(digits.begin(), digits.end());
0298 return digits;
0299 }
0300
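//Split 'works' elements among (numberOfProcess - 1) workers; rank 0 is the root and does no computation.
//Example: works = 10 and numberOfProcess = 4 -> {3, 1}: 3 elements per worker, with 1 leftover element
//(assigned to the first worker by numberDataSend).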
0301 std::pair<int, int> splitProcess(int works, int numberOfProcess) {
0302 std::pair<int, int> Return{0, 0};
0303 if (numberOfProcess > 1 && numberOfProcess <= works) {
0304 Return.first = works / (numberOfProcess - 1); //number of elements for each worker process.
0305 Return.second = works % (numberOfProcess - 1); //leftover elements, given to the first workers.
0306 } else {
0307 std::cout << "\tError Either No worker are found OR Number Processes Larger than Length!!!\n";
0308 }
0309
0310 return Return;
0311 }
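//Build the per-rank element counts: index 0 (the root) gets nothing, every worker gets splitWorks.first elements,
//and the first splitWorks.second workers get one extra element each.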
0312 const std::vector<int> numberDataSend(int numberOfProcess, std::pair<int, int> splitWorks) {
0313 std::vector<int> dataSend(numberOfProcess, splitWorks.first);
0314 dataSend[0] = 0;
0315 for (int i = 1; i < splitWorks.second + 1; i++) //neglect root
0316 {
0317 dataSend[i] += 1; //one extra element for each of the first 'splitWorks.second' workers.
0318 }
0319 return dataSend;
0320 }
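//Compute where each worker's chunk starts inside the full vectors; these offsets are used by the root's
//MPI_Send/MPI_Recv calls (a manual scatter/gather layout in which the root owns no chunk).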
0321 const std::vector<int> displacmentData(int numberOfProcess,
0322 std::pair<int, int> splitWorks,
0323 const std::vector<int> &numberDataSend) {
0324 std::vector<int> displacment(numberOfProcess, splitWorks.first);
0325
0326 displacment[0] = 0;
0327 displacment[1] = 0; //start Here.
0328
0329 for (int i = 2; i < numberOfProcess; i++) //neglect root
0330 {
0331 displacment[i] = numberDataSend[i - 1] + displacment[i - 1]; //each worker starts right after the previous worker's chunk.
0332 }
0333 return displacment;
0334 }
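//Fill 'vect' with sizeVector uniform random floats in [0, 1).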
0335 void randomGenerator(float *vect) {
0336 std::random_device rand;
0337 std::default_random_engine gener(rand());
0338 std::uniform_real_distribution<> dis(0., 1.);
0339 for (unsigned int i = 0; i < sizeVector; ++i) {
0340 vect[i] = dis(gener);
0341 }
0342 }
0343 void setupMPIAndVectors(MPIData &mpiData, UserChoises &user) {
0344 mpiData.num_procs = MPI::COMM_WORLD.Get_size(); //get the total number of processes.
0345 mpiData.rank = MPI::COMM_WORLD.Get_rank(); //get each process number.
0346
0347 user.sizeVectorBytes = sizeVector * sizeof(float); //get size in byte for vectors.
0348
0349 mpiData.mVect1 = (float *)malloc(user.sizeVectorBytes); //initialize size.
0350 mpiData.mVect2 = (float *)malloc(user.sizeVectorBytes);
0351 mpiData.mVect3 = (float *)malloc(user.sizeVectorBytes);
0352 mpiData.mVectChecking = (float *)malloc(user.sizeVectorBytes);
0353
0354 //mpiData.mVectWorker1 = (float*) malloc(user.sizeVectorBytes);
0355 //mpiData.mVectWorker2 = (float*) malloc(user.sizeVectorBytes);
0356 mpiData.mVectWorker3 = (float *)malloc(user.sizeVectorBytes);
0357
0358 mpiData.workSplit = splitProcess(sizeVector, mpiData.num_procs);
0359
0360 if (!mpiData.workSplit.first) {
0361 MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
0362 exit(-1);
0363 }
0364
0365 mpiData.numberToSend = numberDataSend(mpiData.num_procs, mpiData.workSplit);
0366 mpiData.displacement = displacmentData(mpiData.num_procs, mpiData.workSplit, mpiData.numberToSend);
0367
0368 // mpiData.mVectWorker1.resize(mpiData.numberToSend[mpiData.rank]); //Resizing each process with appropriate Receiving Data.
0369 // mpiData.mVectWorker2.resize(mpiData.numberToSend[mpiData.rank]);
0370 mpiData.mVectWorker1 = (float *)malloc(mpiData.numberToSend[mpiData.rank] * sizeof(float));
0371 mpiData.mVectWorker2 = (float *)malloc(mpiData.numberToSend[mpiData.rank] * sizeof(float));
0372
0373 if (!mpiData.rank) //Only for root
0374 {
0375 randomGenerator(mpiData.mVect1); //generate random floating numbers from(0,1) Only in the root.
0376 randomGenerator(mpiData.mVect2);
0377 std::cout << "\n\tNumber of Processes " << mpiData.num_procs << std::endl;
0378 std::cout << "\tNumber of workSplit First " << mpiData.workSplit.first << std::endl;
0379 std::cout << "\tNumber of workSplit Second " << mpiData.workSplit.second << std::endl;
0380 std::cout << "\tTotal size of a Buffer " << user.sizeVectorBytes << " B" << std::endl;
0381 }
0382 }
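//Each timing vector holds 'average' samples followed by user.extra (= 2) slots:
//index [average] stores the mean and index [average + 1] stores the standard deviation.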
0383 void setupTime(Timing &timing, UserChoises &user) {
0384 //Setup vectors for computing the average and the standard deviation
0385 timing.timeInputPreparationRoot.resize(average + user.extra); //extra slots store the mean and the standard deviation.
0386 timing.timeInputPreparationHost.resize(average + user.extra);
0387 timing.timeOperationOnDeviceByRootHost.resize(average + user.extra);
0388 timing.timeOutputPreparationRoot.resize(average + user.extra);
0389 timing.timeOutputPreparationHost.resize(average + user.extra);
0390 timing.operationOnDeviceByDeviceAcc.resize(average + user.extra);
0391 timing.averageResults.resize(user.averageVectorSend);
0392 }
0393
0394 void calculateTimeDuration(Timing &timing, int i, int &root) {
0395 if (!root) {
0396 timing.timeInputPreparationRoot[i] =
0397 (timing.inputPreparationRoot[1] - timing.inputPreparationRoot[0]); //MPI_Wtime() returns seconds; converted to microseconds later via unitChoice.
0398 timing.timeOperationOnDeviceByRootHost[i] = (timing.outputPreparationRoot[0] - timing.inputPreparationRoot[1]);
0399 timing.timeOutputPreparationRoot[i] = (timing.outputPreparationRoot[1] - timing.outputPreparationRoot[0]);
0400 } else {
0401 timing.timeInputPreparationHost[i] = (timing.inputPreparationHost[1] - timing.inputPreparationHost[0]);
0402 timing.timeOperationOnDeviceByRootHost[i] =
0403 (timing.operationOnDeviceByHost[1] -
0404 timing.operationOnDeviceByHost[0]); //time taking for Device operation with respect of Host.
0405 cudaEventElapsedTime(&timing.operationOnDeviceByDevice,
0406 timing.start,
0407 timing.stop); //elapsed time of the Device operation from the device's perspective.
0408 timing.operationOnDeviceByDeviceAcc[i] = (timing.operationOnDeviceByDevice * 1000); //cudaEventElapsedTime returns milliseconds; convert to microseconds.
0409 timing.timeOutputPreparationHost[i] = (timing.outputPreparationHost[1] - timing.outputPreparationHost[0]);
0410 }
0411 }
0412 void addVectorsHost(float *vect1, float *vect2, float *vect3) {
0413 for (unsigned int i = 0; i < sizeVector; ++i) {
0414 vect3[i] = vect2[i] + vect1[i];
0415 }
0416 }
0417 void cleanBuffer(float *vect) {
0418 for (unsigned int i = 0; i < sizeVector; ++i) {
0419 vect[i] = 0;
0420 }
0421 }
0422 bool checkingResultsPrintout(float *vectCpu, float *vectGpu) {
0423 float percent{0.0};
0424 float totalError{0.0};
0425
0426 for (unsigned int j = 0; j < sizeVector; j++) {
0427 percent = ((vectCpu[j] - vectGpu[j]) / vectCpu[j]) * 100;
0428 totalError += percent;
0429 }
0430 if (totalError) {
0431 std::cout << "\n------------------------------------\n";
0432 std::cout << "| CpuSum | GpuSum | Error | Error %| ";
0433 std::cout << "\n------------------------------------\n";
0434 //std::cout.precision(4);
0435 for (unsigned int j = 0; j < sizeVector; j++) {
0436 std::cout.flags(std::ios::fixed | std::ios::showpoint);
0437 std::cout.precision(4);
0438 percent = ((vectCpu[j] - vectGpu[j]) / vectCpu[j]) * 100; //recompute the per-element error; otherwise the stale value from the loop above is printed on every row.
0439 std::cout << "| " << vectCpu[j] << " | " << vectGpu[j] << " | " << vectCpu[j] - vectGpu[j] << " | " << percent << " |\n";
0440 }
0441 std::cout << "-------------------------------------\n";
0442 std::cout << "-Total Error is " << totalError << std::endl;
0443 return false;
0444 }
0445 return true;
0446 }
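//Compute, in place, the mean (stored at index averg) and the population standard deviation (at index averg + 1)
//of each timing vector; the MPI_Wtime-based results are then converted to microseconds via timing.unitChoice
//(the CUDA-event results are already in microseconds).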
0447 void calculateAverageDeviation(Timing &timing, int averg, int &root) {
0448 //Average
0449 for (int i = 0; i < averg; ++i) {
0450 if (!root) {
0451 timing.timeInputPreparationRoot[averg] += timing.timeInputPreparationRoot[i];
0452 timing.timeOperationOnDeviceByRootHost[averg] += timing.timeOperationOnDeviceByRootHost[i];
0453 timing.timeOutputPreparationRoot[averg] += timing.timeOutputPreparationRoot[i];
0454 } else {
0455 timing.timeInputPreparationHost[averg] += timing.timeInputPreparationHost[i];
0456 timing.timeOperationOnDeviceByRootHost[averg] += timing.timeOperationOnDeviceByRootHost[i];
0457 timing.timeOutputPreparationHost[averg] += timing.timeOutputPreparationHost[i];
0458 timing.operationOnDeviceByDeviceAcc[averg] += timing.operationOnDeviceByDeviceAcc[i];
0459 }
0460 }
0461 if (!root) {
0462 timing.timeInputPreparationRoot[averg] = timing.timeInputPreparationRoot[averg] / averg;
0463 timing.timeOperationOnDeviceByRootHost[averg] = timing.timeOperationOnDeviceByRootHost[averg] / averg;
0464
0465 timing.timeOutputPreparationRoot[averg] = timing.timeOutputPreparationRoot[averg] / averg;
0466
0467 } else {
0468 timing.timeInputPreparationHost[averg] = timing.timeInputPreparationHost[averg] / averg;
0469
0470 timing.timeOperationOnDeviceByRootHost[averg] = timing.timeOperationOnDeviceByRootHost[averg] / averg;
0471
0472 timing.timeOutputPreparationHost[averg] = timing.timeOutputPreparationHost[averg] / averg;
0473
0474 timing.operationOnDeviceByDeviceAcc[averg] = (double)timing.operationOnDeviceByDeviceAcc[averg] / averg;
0475 }
0476
0477 //Standard deviation
0478 for (int i = 0; i < averg; ++i) {
0479 if (!root) {
0480 timing.timeInputPreparationRoot[i] -= timing.timeInputPreparationRoot[averg]; //Take the difference from the mean.
0481 timing.timeInputPreparationRoot[i] =
0482 timing.timeInputPreparationRoot[i] * timing.timeInputPreparationRoot[i]; // Square it.
0483 timing.timeInputPreparationRoot[averg + 1] +=
0484 timing.timeInputPreparationRoot[i]; //add them together; index averg+1 holds the standard deviation.
0485
0486 timing.timeOperationOnDeviceByRootHost[i] -= timing.timeOperationOnDeviceByRootHost[averg];
0487 timing.timeOperationOnDeviceByRootHost[i] *= timing.timeOperationOnDeviceByRootHost[i];
0488 timing.timeOperationOnDeviceByRootHost[averg + 1] += timing.timeOperationOnDeviceByRootHost[i];
0489
0490 timing.timeOutputPreparationRoot[i] -= timing.timeOutputPreparationRoot[averg];
0491 timing.timeOutputPreparationRoot[i] *= timing.timeOutputPreparationRoot[i];
0492 timing.timeOutputPreparationRoot[averg + 1] += timing.timeOutputPreparationRoot[i];
0493 } else {
0494 timing.timeInputPreparationHost[i] -= timing.timeInputPreparationHost[averg]; //Take the difference from the mean.
0495 timing.timeInputPreparationHost[i] =
0496 timing.timeInputPreparationHost[i] * timing.timeInputPreparationHost[i]; // Square it.
0497 timing.timeInputPreparationHost[averg + 1] +=
0498 timing.timeInputPreparationHost[i]; //add them together; index averg+1 holds the standard deviation.
0499
0500 timing.timeOperationOnDeviceByRootHost[i] -= timing.timeOperationOnDeviceByRootHost[averg];
0501 timing.timeOperationOnDeviceByRootHost[i] *= timing.timeOperationOnDeviceByRootHost[i];
0502 timing.timeOperationOnDeviceByRootHost[averg + 1] += timing.timeOperationOnDeviceByRootHost[i];
0503
0504 timing.timeOutputPreparationHost[i] -= timing.timeOutputPreparationHost[averg];
0505 timing.timeOutputPreparationHost[i] *= timing.timeOutputPreparationHost[i];
0506 timing.timeOutputPreparationHost[averg + 1] += timing.timeOutputPreparationHost[i];
0507
0508 timing.operationOnDeviceByDeviceAcc[i] -= timing.operationOnDeviceByDeviceAcc[averg];
0509 timing.operationOnDeviceByDeviceAcc[i] *= timing.operationOnDeviceByDeviceAcc[i];
0510 timing.operationOnDeviceByDeviceAcc[averg + 1] += timing.operationOnDeviceByDeviceAcc[i];
0511 }
0512 }
0513
0514 if (!root) {
0515 timing.timeInputPreparationRoot[averg + 1] = timing.timeInputPreparationRoot[averg + 1] / averg;
0516 timing.timeInputPreparationRoot[averg + 1] = sqrt(timing.timeInputPreparationRoot[averg + 1]);
0517
0518 timing.timeOperationOnDeviceByRootHost[averg + 1] = timing.timeOperationOnDeviceByRootHost[averg + 1] / averg;
0519 timing.timeOperationOnDeviceByRootHost[averg + 1] = sqrt(timing.timeOperationOnDeviceByRootHost[averg + 1]);
0520
0521 timing.timeOutputPreparationRoot[averg + 1] = timing.timeOutputPreparationRoot[averg + 1] / averg;
0522 timing.timeOutputPreparationRoot[averg + 1] = sqrt(timing.timeOutputPreparationRoot[averg + 1]);
0523
0524 } else {
0525 timing.timeInputPreparationHost[averg + 1] = timing.timeInputPreparationHost[averg + 1] / averg; //*1000000
0526 timing.timeInputPreparationHost[averg + 1] = sqrt(timing.timeInputPreparationHost[averg + 1]);
0527
0528 timing.timeOperationOnDeviceByRootHost[averg + 1] = timing.timeOperationOnDeviceByRootHost[averg + 1] / averg;
0529 timing.timeOperationOnDeviceByRootHost[averg + 1] = sqrt(timing.timeOperationOnDeviceByRootHost[averg + 1]);
0530
0531 timing.timeOutputPreparationHost[averg + 1] = timing.timeOutputPreparationHost[averg + 1] / averg;
0532 timing.timeOutputPreparationHost[averg + 1] = sqrt(timing.timeOutputPreparationHost[averg + 1]);
0533
0534 timing.operationOnDeviceByDeviceAcc[averg + 1] = (double)timing.operationOnDeviceByDeviceAcc[averg + 1] / averg;
0535 timing.operationOnDeviceByDeviceAcc[averg + 1] = sqrt(timing.operationOnDeviceByDeviceAcc[averg + 1]);
0536 }
0537
0538 if (!root) {
0539 timing.timeInputPreparationRoot[averg] *= timing.unitChoice;
0540 timing.timeOperationOnDeviceByRootHost[averg] *= timing.unitChoice;
0541 timing.timeOutputPreparationRoot[averg] *= timing.unitChoice;
0542
0543 timing.timeInputPreparationRoot[averg + 1] *= timing.unitChoice;
0544 timing.timeOperationOnDeviceByRootHost[averg + 1] *= timing.unitChoice;
0545 timing.timeOutputPreparationRoot[averg + 1] *= timing.unitChoice;
0546 } else {
0547 timing.timeInputPreparationHost[averg] *= timing.unitChoice;
0548 timing.timeOperationOnDeviceByRootHost[averg] *= timing.unitChoice;
0549 timing.timeOutputPreparationHost[averg] *= timing.unitChoice;
0550
0551 timing.timeInputPreparationHost[averg + 1] *= timing.unitChoice;
0552 timing.timeOperationOnDeviceByRootHost[averg + 1] *= timing.unitChoice;
0553 timing.timeOutputPreparationHost[averg + 1] *= timing.unitChoice;
0554 }
0555 }
0556
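//averageResults layout sent from each worker to the root:
//[0],[1] input preparation mean/std-dev, [2],[3] device operation seen by the host mean/std-dev,
//[4],[5] output preparation mean/std-dev, [6],[7] device operation measured with CUDA events mean/std-dev.
//Note: the root only receives from worker rank 1, so with more than one worker only that worker's averages are reported.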
0557 bool sendAverageToRoot(Timing &timing, UserChoises &user, int &rank) {
0558 if (rank) {
0559 timing.averageResults[0] = timing.timeInputPreparationHost[average];
0560 timing.averageResults[1] = timing.timeInputPreparationHost[average + 1]; //Standard Deviation
0561
0562 timing.averageResults[2] = timing.timeOperationOnDeviceByRootHost[average];
0563 timing.averageResults[3] = timing.timeOperationOnDeviceByRootHost[average + 1];
0564
0565 timing.averageResults[4] = timing.timeOutputPreparationHost[average];
0566 timing.averageResults[5] = timing.timeOutputPreparationHost[average + 1];
0567
0568 timing.averageResults[6] = timing.operationOnDeviceByDeviceAcc[average];
0569 timing.averageResults[7] = timing.operationOnDeviceByDeviceAcc[average + 1];
0570
0571 MPI_Send(&timing.averageResults[0], user.averageVectorSend, MPI_FLOAT, user.root, 0, MPI_COMM_WORLD);
0572
0573 } else if (!rank) {
0574 MPI_Recv(&timing.averageResults[0], user.averageVectorSend, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0575 }
0576 return true;
0577 }
0578
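//Part 1: the root distributes chunks of the input vectors with blocking MPI sends; each worker receives them into
//regular (pageable) host buffers, copies them to the device, runs the addition kernel, copies the result back and
//returns its chunk to the root with a blocking send.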
0579 Timing blockSendPart1(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) {
0580 cleanBuffer(mpiData.mVectWorker3); //clear each value of vector's elements
0581 timing.partChosen = 1;
0582
0583 if (mpiData.rank) //Only for Workers
0584 {
0585 cudaCheck(cudaMalloc((void **)&pointer.dVect1,
0586 user.sizeVectorBytes)); //allocate memory space for vector in the global memory of the Device.
0587 cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes));
0588 cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes));
0589 }
0590 ///////////////////////////// Start of Average ////////////////////////
0591 for (int a = 0; a <= average; ++a) {
0592 if (!mpiData.rank) //Only for root
0593 {
0594 ////////////////////////////////// Input Preparation for Root //////////////////////////////////
0595 timing.inputPreparationRoot[0] = MPI_Wtime();
0596 for (int i = 1; i < mpiData.num_procs; ++i) {
0597 MPI_Send(&mpiData.mVect1[mpiData.displacement[i]],
0598 mpiData.numberToSend[i],
0599 MPI_FLOAT,
0600 i,
0601 0,
0602 MPI_COMM_WORLD); //Tag is 0
0603 MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD);
0604 }
0605 timing.inputPreparationRoot[1] = MPI_Wtime();
0606 /////////////////////////////////////////////////////////////////////////////////////////////////
0607 }
0608
0609 if (mpiData.rank) //Only for Workers
0610 {
0611 ////////////////////////////////// Input Preparation for Host //////////////////////////////////
0612 MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0613 timing.inputPreparationHost[0] = MPI_Wtime();
0614 MPI_Recv(&mpiData.mVectWorker1[0],
0615 mpiData.numberToSend[mpiData.rank],
0616 MPI_FLOAT,
0617 user.root,
0618 0,
0619 MPI_COMM_WORLD,
0620 MPI_STATUS_IGNORE);
0621 MPI_Recv(&mpiData.mVectWorker2[0],
0622 mpiData.numberToSend[mpiData.rank],
0623 MPI_FLOAT,
0624 user.root,
0625 0,
0626 MPI_COMM_WORLD,
0627 MPI_STATUS_IGNORE);
0628
0629 cudaCheck(cudaMemcpy(pointer.dVect1,
0630 mpiData.mVectWorker1,
0631 user.sizeVectorBytes,
0632 cudaMemcpyHostToDevice)); //copy random vector from host to device.
0633 cudaCheck(cudaMemcpy(pointer.dVect2, mpiData.mVectWorker2, user.sizeVectorBytes, cudaMemcpyHostToDevice));
0634
0635 timing.inputPreparationHost[1] = MPI_Wtime();
0636 ///////////////////////////////////////////////////////////////////////////////////////
0637
0638 cudaCheck(cudaEventCreate(&timing.start)); //initialize Event.
0639 cudaCheck(cudaEventCreate(&timing.stop));
0640
0641 ///////////////////////////// Operation on Device with respect of Host //////////////////
0642
0643 int threads = 512; //arbitrary number.
0644 int blocks = (sizeVector + threads - 1) / threads; //get ceiling number of blocks.
0645 blocks = std::min(blocks, 8); //8 is the smallest block count available even on the lowest-end NVIDIA GPUs.
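//With the block count capped, the grid-stride loop inside addVectorsGpu still covers all sizeVector elements.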
0646
0647 ////////////////////////// Call Device Kernel //////////////////////////////////
0648 cudaCheck(cudaEventRecord(timing.start));
0649 timing.operationOnDeviceByHost[0] = MPI_Wtime();
0650
0651 addVectorsGpu<<<blocks, threads>>>(pointer.dVect1,
0652 pointer.dVect2,
0653 pointer.dVect3,
0654 sizeVector,
0655 task); //call device function to add two vectors and save into vect3Gpu.
0656
0657 cudaCheck(cudaGetLastError());
0658 cudaCheck(cudaDeviceSynchronize());
0659 cudaCheck(cudaEventRecord(timing.stop));
0660
0661 timing.operationOnDeviceByHost[1] = MPI_Wtime();
0662 /////////////////////////////////////////////////////////////////////////////////////////////
0663
0664 /////////////////////////////////// Output Preparation for the Host //////////////////////////////////////
0665 timing.outputPreparationHost[0] = MPI_Wtime();
0666 cudaCheck(cudaMemcpy(
0667 mpiData.mVectWorker3,
0668 pointer.dVect3,
0669 user.sizeVectorBytes,
0670 cudaMemcpyDeviceToHost)); //copy the summed result vector from Device to Host.
0671
0672 MPI_Send(&mpiData.mVectWorker3[0],
0673 mpiData.numberToSend[mpiData.rank],
0674 MPI_FLOAT,
0675 user.root,
0676 0,
0677 MPI_COMM_WORLD); //Tag is 0
0678 timing.outputPreparationHost[1] = MPI_Wtime();
0679 ////////////////////////////////////////////////////////////////////////////////////////////////
0680 }
0681
0682 if (!mpiData.rank) //Only for root
0683 {
0684 /////////////////////////////////// Output Preparation for the Root //////////////////////////////////////
0685 MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0686 timing.outputPreparationRoot[0] = MPI_Wtime();
0687 //MPI probe
0688 for (int i = 1; i < mpiData.num_procs; i++) {
0689 MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]],
0690 mpiData.numberToSend[i],
0691 MPI_FLOAT,
0692 i,
0693 0,
0694 MPI_COMM_WORLD,
0695 MPI_STATUS_IGNORE);
0696 }
0697 timing.outputPreparationRoot[1] = MPI_Wtime();
0698 ////////////////////////////////////////////////////////////////////////////////////////////////
0699 }
0700
0701 if (a > 0)
0702 calculateTimeDuration(timing, a - 1, mpiData.rank);
0703
0704 if (mpiData.rank) {
0705 cudaCheck(cudaEventDestroy(timing.start));
0706 cudaCheck(cudaEventDestroy(timing.stop));
0707 }
0708 }
0709 ///////////////////////////// End of Average ////////////////////////
0710 if (mpiData.rank) {
0711 cudaCheck(cudaFree(pointer.dVect1));
0712 cudaCheck(cudaFree(pointer.dVect2));
0713 cudaCheck(cudaFree(pointer.dVect3));
0714 }
0715 ///
0716 bool test = 0;
0717 if (!mpiData.rank) //Only for root
0718 {
0719 addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking); //Host is adding vectors too.
0720 test = checkingResultsPrintout(mpiData.mVectChecking,
0721 mpiData.mVectWorker3); //Checking the results, if error then Print out to the user.
0722 if (!test)
0723 exit(-1);
0724 }
0725
0726 calculateAverageDeviation(timing, average, mpiData.rank);
0727 test = sendAverageToRoot(timing, user, mpiData.rank);
0728 if (test && !mpiData.rank) {
0729 if (saveFile) {
0730 test = saveToFile("dataPart1", timing);
0731
0732 if (test)
0733 std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl;
0734 else
0735 std::cout << "Error Saving File!!" << std::endl;
0736 }
0737 std::cout << "Done Part " << timing.partChosen << std::endl;
0738 }
0739 return timing;
0740 }
0741
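//Part 2: same flow as Part 1, but the worker stages the data in pinned host memory (cudaMallocHost), which
//typically speeds up the host<->device copies.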
0742 Timing blockSendPart2(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) {
0743 cleanBuffer(mpiData.mVectWorker3); //clear each value of vector's elements
0744 timing.partChosen = 2;
0745
0746 if (mpiData.rank) //Only for Workers
0747 {
0748 cudaCheck(cudaMallocHost((void **)&pointer.vect1, user.sizeVectorBytes)); //allocate Pinned memory on the Host.
0749 cudaCheck(cudaMallocHost((void **)&pointer.vect2, user.sizeVectorBytes));
0750 cudaCheck(cudaMallocHost((void **)&pointer.vect3, user.sizeVectorBytes));
0751 cudaCheck(cudaMalloc((void **)&pointer.dVect1,
0752 user.sizeVectorBytes)); //allocate memory space for vector in the global memory of the Device.
0753 cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes));
0754 cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes));
0755 }
0756 ///////////////////////////// Start of Average ////////////////////////
0757 for (int a = 0; a <= average; ++a) {
0758 if (!mpiData.rank) //Only for root
0759 {
0760 ////////////////////////////////// Input Preparation for Root //////////////////////////////////
0761 timing.inputPreparationRoot[0] = MPI_Wtime();
0762 for (int i = 1; i < mpiData.num_procs; ++i) {
0763 MPI_Send(&mpiData.mVect1[mpiData.displacement[i]],
0764 mpiData.numberToSend[i],
0765 MPI_FLOAT,
0766 i,
0767 0,
0768 MPI_COMM_WORLD); //Tag is 0
0769 MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD);
0770 }
0771 timing.inputPreparationRoot[1] = MPI_Wtime();
0772 /////////////////////////////////////////////////////////////////////////////////////////////////
0773 }
0774
0775 if (mpiData.rank) //Only for Workers
0776 {
0777 ////////////////////////////////// Input Preparation for Host //////////////////////////////////
0778 MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0779 timing.inputPreparationHost[0] = MPI_Wtime();
0780
0781 MPI_Recv(&pointer.vect1[0],
0782 mpiData.numberToSend[mpiData.rank],
0783 MPI_FLOAT,
0784 user.root,
0785 0,
0786 MPI_COMM_WORLD,
0787 MPI_STATUS_IGNORE);
0788 MPI_Recv(&pointer.vect2[0],
0789 mpiData.numberToSend[mpiData.rank],
0790 MPI_FLOAT,
0791 user.root,
0792 0,
0793 MPI_COMM_WORLD,
0794 MPI_STATUS_IGNORE);
0795
0796 cudaCheck(cudaMemcpy(pointer.dVect1,
0797 pointer.vect1,
0798 user.sizeVectorBytes,
0799 cudaMemcpyHostToDevice)); //copy random vector from host to device.
0800 cudaCheck(cudaMemcpy(pointer.dVect2, pointer.vect2, user.sizeVectorBytes, cudaMemcpyHostToDevice));
0801
0802 timing.inputPreparationHost[1] = MPI_Wtime();
0803 ///////////////////////////////////////////////////////////////////////////////////////
0804
0805 cudaCheck(cudaEventCreate(&timing.start)); //initialize Event.
0806 cudaCheck(cudaEventCreate(&timing.stop));
0807
0808 ///////////////////////////// Operation on Device with respect of Host //////////////////
0809
0810 int threads = 512; //arbitrary number.
0811 int blocks = (sizeVector + threads - 1) / threads; //get ceiling number of blocks.
0812 blocks = std::min(blocks, 8); //8 is the smallest block count available even on the lowest-end NVIDIA GPUs.
0813
0814 ////////////////////////// Call Device Kernel //////////////////////////////////
0815 cudaCheck(cudaEventRecord(timing.start));
0816 timing.operationOnDeviceByHost[0] = MPI_Wtime();
0817
0818 addVectorsGpu<<<blocks, threads>>>(pointer.dVect1,
0819 pointer.dVect2,
0820 pointer.dVect3,
0821 sizeVector,
0822 task); //call device function to add two vectors and save into vect3Gpu.
0823
0824 cudaCheck(cudaGetLastError());
0825 cudaCheck(cudaDeviceSynchronize());
0826 cudaCheck(cudaEventRecord(timing.stop));
0827
0828 timing.operationOnDeviceByHost[1] = MPI_Wtime();
0829 /////////////////////////////////////////////////////////////////////////////////////////////
0830
0831 /////////////////////////////////// Output Preparation for the Host //////////////////////////////////////
0832 timing.outputPreparationHost[0] = MPI_Wtime();
0833
0834 cudaCheck(cudaMemcpy(
0835 pointer.vect3,
0836 pointer.dVect3,
0837 user.sizeVectorBytes,
0838 cudaMemcpyDeviceToHost)); //copy the summed result vector from Device to Host.
0839
0840 MPI_Send(&pointer.vect3[0],
0841 mpiData.numberToSend[mpiData.rank],
0842 MPI_FLOAT,
0843 user.root,
0844 0,
0845 MPI_COMM_WORLD); //Tag is 0
0846
0847 timing.outputPreparationHost[1] = MPI_Wtime();
0848 ////////////////////////////////////////////////////////////////////////////////////////////////
0849 }
0850
0851 if (!mpiData.rank) //Only for root
0852 {
0853 /////////////////////////////////// Output Preparation for the Root //////////////////////////////////////
0854 MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0855 timing.outputPreparationRoot[0] = MPI_Wtime();
0856 //MPI probe
0857 for (int i = 1; i < mpiData.num_procs; i++) {
0858 MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]],
0859 mpiData.numberToSend[i],
0860 MPI_FLOAT,
0861 i,
0862 0,
0863 MPI_COMM_WORLD,
0864 MPI_STATUS_IGNORE);
0865 }
0866 timing.outputPreparationRoot[1] = MPI_Wtime();
0867 ////////////////////////////////////////////////////////////////////////////////////////////////
0868 }
0869
0870 if (a > 0)
0871 calculateTimeDuration(timing, a - 1, mpiData.rank);
0872
0873 if (mpiData.rank) {
0874 cudaCheck(cudaEventDestroy(timing.start));
0875 cudaCheck(cudaEventDestroy(timing.stop));
0876 }
0877 }
0878 ///////////////////////////// End of Average ////////////////////////
0879 if (mpiData.rank) {
0880 cudaCheck(cudaFreeHost(pointer.vect1));
0881 cudaCheck(cudaFreeHost(pointer.vect2));
0882 cudaCheck(cudaFreeHost(pointer.vect3));
0883 cudaCheck(cudaFree(pointer.dVect1));
0884 cudaCheck(cudaFree(pointer.dVect2));
0885 cudaCheck(cudaFree(pointer.dVect3));
0886 }
0887
0888 bool test = 0;
0889 if (!mpiData.rank) //Only for root
0890 {
0891 addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking); //Host is adding vectors too.
0892 test = checkingResultsPrintout(mpiData.mVectChecking,
0893 mpiData.mVectWorker3); //Checking the results, if error then Print out to the user.
0894 if (!test)
0895 exit(-1);
0896 }
0897
0898 calculateAverageDeviation(timing, average, mpiData.rank);
0899 test = sendAverageToRoot(timing, user, mpiData.rank);
0900 if (test && !mpiData.rank) {
0901 if (saveFile) {
0902 test = saveToFile("dataPart2", timing);
0903
0904 if (test)
0905 std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl;
0906 else
0907 std::cout << "Error Saving File!!" << std::endl;
0908 }
0909 std::cout << "Done Part " << timing.partChosen << std::endl;
0910 }
0911 return timing;
0912 }
0913
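//Part 3: the worker receives the MPI messages directly into device memory and sends the result straight from
//device memory, skipping the explicit cudaMemcpy calls; this requires a CUDA-aware MPI (e.g. Open MPI with UCX).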
0914 Timing blockSendPart3(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) {
0915 cleanBuffer(mpiData.mVectWorker3); //clear each value of vector's elements
0916 timing.partChosen = 3;
0917
0918 if (mpiData.rank) //Only for Workers
0919 {
0920 cudaCheck(cudaMalloc((void **)&pointer.dVect1,
0921 user.sizeVectorBytes)); //allocate memory space for vector in the global memory of the Device.
0922 cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes));
0923 cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes));
0924 }
0925 ///////////////////////////// Start of Average ////////////////////////
0926 for (int a = 0; a <= average; ++a) {
0927 if (!mpiData.rank) //Only for root
0928 {
0929 ////////////////////////////////// Input Preparation for Root //////////////////////////////////
0930 timing.inputPreparationRoot[0] = MPI_Wtime();
0931 for (int i = 1; i < mpiData.num_procs; ++i) {
0932 MPI_Send(&mpiData.mVect1[mpiData.displacement[i]],
0933 mpiData.numberToSend[i],
0934 MPI_FLOAT,
0935 i,
0936 0,
0937 MPI_COMM_WORLD); //Tag is 0
0938 MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD);
0939 }
0940 timing.inputPreparationRoot[1] = MPI_Wtime();
0941 /////////////////////////////////////////////////////////////////////////////////////////////////
0942 }
0943
0944 if (mpiData.rank) //Only for Workers
0945 {
0946 ////////////////////////////////// Input Preparation for Host //////////////////////////////////
0947 MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0948 timing.inputPreparationHost[0] = MPI_Wtime();
0949 MPI_Recv(&pointer.dVect1[0],
0950 mpiData.numberToSend[mpiData.rank],
0951 MPI_FLOAT,
0952 user.root,
0953 0,
0954 MPI_COMM_WORLD,
0955 MPI_STATUS_IGNORE);
0956 MPI_Recv(&pointer.dVect2[0],
0957 mpiData.numberToSend[mpiData.rank],
0958 MPI_FLOAT,
0959 user.root,
0960 0,
0961 MPI_COMM_WORLD,
0962 MPI_STATUS_IGNORE);
0963
0964 // cudaCheck(cudaMemcpy(pointer.dVect1, mpiData.mVectWorker1, user.sizeVectorBytes, cudaMemcpyHostToDevice)); //copy random vector from host to device.
0965 // cudaCheck(cudaMemcpy(pointer.dVect2, mpiData.mVectWorker2, user.sizeVectorBytes, cudaMemcpyHostToDevice));
0966
0967 timing.inputPreparationHost[1] = MPI_Wtime();
0968 ///////////////////////////////////////////////////////////////////////////////////////
0969
0970 cudaCheck(cudaEventCreate(&timing.start)); //initialize Event.
0971 cudaCheck(cudaEventCreate(&timing.stop));
0972
0973 ///////////////////////////// Operation on Device with respect of Host //////////////////
0974
0975 int threads = 512; //arbitrary number.
0976 int blocks = (sizeVector + threads - 1) / threads; //get ceiling number of blocks.
0977 blocks = std::min(blocks, 8); //8 is the smallest block count available even on the lowest-end NVIDIA GPUs.
0978
0979 ////////////////////////// Call Device Kernel //////////////////////////////////
0980 cudaCheck(cudaEventRecord(timing.start));
0981 timing.operationOnDeviceByHost[0] = MPI_Wtime();
0982
0983 addVectorsGpu<<<blocks, threads>>>(pointer.dVect1,
0984 pointer.dVect2,
0985 pointer.dVect3,
0986 sizeVector,
0987 task); //call device function to add two vectors and save into vect3Gpu.
0988
0989 cudaCheck(cudaGetLastError());
0990 cudaCheck(cudaDeviceSynchronize());
0991 cudaCheck(cudaEventRecord(timing.stop));
0992
0993 timing.operationOnDeviceByHost[1] = MPI_Wtime();
0994 /////////////////////////////////////////////////////////////////////////////////////////////
0995
0996 /////////////////////////////////// Output Preparation for the Host //////////////////////////////////////
0997 timing.outputPreparationHost[0] = MPI_Wtime();
0998 //cudaCheck(cudaMemcpy(mpiData.mVectWorker3,pointer.dVect3,user.sizeVectorBytes,cudaMemcpyDeviceToHost)); //copy summing result vector from Device to Host.// Try_Regist(3) delete this
0999
1000 MPI_Send(&pointer.dVect3[0],
1001 mpiData.numberToSend[mpiData.rank],
1002 MPI_FLOAT,
1003 user.root,
1004 0,
1005 MPI_COMM_WORLD); //Tag is 0
1006 timing.outputPreparationHost[1] = MPI_Wtime();
1007 ////////////////////////////////////////////////////////////////////////////////////////////////
1008 }
1009
1010 if (!mpiData.rank) //Only for root
1011 {
1012 /////////////////////////////////// Output Preparation for the Root //////////////////////////////////////
1013 MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
1014 timing.outputPreparationRoot[0] = MPI_Wtime();
1015 //MPI probe
1016 for (int i = 1; i < mpiData.num_procs; i++) {
1017 MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]],
1018 mpiData.numberToSend[i],
1019 MPI_FLOAT,
1020 i,
1021 0,
1022 MPI_COMM_WORLD,
1023 MPI_STATUS_IGNORE);
1024 }
1025 timing.outputPreparationRoot[1] = MPI_Wtime();
1026 ////////////////////////////////////////////////////////////////////////////////////////////////
1027 }
1028
1029 if (a > 0)
1030 calculateTimeDuration(timing, a - 1, mpiData.rank);
1031
1032 if (mpiData.rank) {
1033 cudaCheck(cudaEventDestroy(timing.start));
1034 cudaCheck(cudaEventDestroy(timing.stop));
1035 }
1036 }
1037 ///////////////////////////// End of Average ////////////////////////
1038 if (mpiData.rank) {
1039 cudaCheck(cudaFree(pointer.dVect1));
1040 cudaCheck(cudaFree(pointer.dVect2));
1041 cudaCheck(cudaFree(pointer.dVect3));
1042 }
1043 ///
1044 bool test = 0;
1045 if (!mpiData.rank) //Only for root
1046 {
1047 addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking); //Host is adding vectors too.
1048 test = checkingResultsPrintout(mpiData.mVectChecking,
1049 mpiData.mVectWorker3); //Checking the results, if error then Print out to the user.
1050 if (!test)
1051 exit(-1);
1052 }
1053
1054 calculateAverageDeviation(timing, average, mpiData.rank);
1055 test = sendAverageToRoot(timing, user, mpiData.rank);
1056 if (test && !mpiData.rank) {
1057 if (saveFile) {
1058 test = saveToFile("dataPart3", timing);
1059
1060 if (test)
1061 std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl;
1062 else
1063 std::cout << "Error Saving File!!" << std::endl;
1064 }
1065 std::cout << "Done Part " << timing.partChosen << std::endl;
1066 }
1067 return timing;
1068 }
1069
1070 void printTable(std::vector<Timing> &timing, bool standerDeviationPrint) {
1071 const std::string inPrepatRoot = " Duration Time Read Input Preparations On Root ";
1072 const std::string inPrepatHost = " Duration Time Read Input Preparations On Host ";
1073 const std::string timeCpuR = " Duration Time operation on Root point View ";
1074 const std::string timeCpu = " Duration Time operation on Host point View ";
1075 const std::string timeGpu = " Duration Time operation on Device point View";
1076 const std::string outPrepatRoot = " Duration Time Read Output Preparations On Root";
1077 const std::string outPrepatHost = " Duration Time Read Output Preparations On Host";
1078
1079 const std::string averageTime = " AverTime ";
1080 const std::string standerDeviation = " StDeviation ";
1081 const std::string nameTiming = " Name Timing ";
1082 const std::string partsNumberall = "Part ";
1083
1084 int totalFix = 0;
1085
1086 if (standerDeviationPrint) {
1087 totalFix = timeGpu.size() + timing.size() * (averageTime.size() + standerDeviation.size() + 3);
1088 } else {
1089 totalFix = timeGpu.size() + timing.size() * (averageTime.size() + 3);
1090 }
1091
1092 std::cout.flags(std::ios::fixed | std::ios::showpoint);
1093 std::cout.precision(4);
1094
1095 std::cout << '\n';
1096 std::cout.width(totalFix);
1097 std::cout.fill('-');
1098 std::cout << '-' << '\n';
1099 std::cout.fill(' ');
1100
1101 std::cout << "|";
1102 std::cout.width((timeGpu.size() - nameTiming.size()) / 2);
1103 std::cout.fill(' ');
1104 std::cout << " ";
1105 std::cout << nameTiming;
1106 std::cout.width((timeGpu.size() - nameTiming.size()) / 2);
1107 std::cout.fill(' ');
1108 std::cout << " ";
1109 std::cout << " |";
1110
1111 for (unsigned int i = 0; i < timing.size(); ++i) {
1112 if (standerDeviationPrint) {
1113 std::cout.width(((averageTime.size() + standerDeviation.size()) - partsNumberall.size() + 1) / 2);
1114 } //9
1115 else {
1116 std::cout.width(((averageTime.size()) - partsNumberall.size()) / 2);
1117 } //2
1118
1119 std::cout << " ";
1120 std::cout << partsNumberall << timing[i].partChosen;
1121
1122 if (standerDeviationPrint) {
1123 std::cout.width(((averageTime.size() + standerDeviation.size()) - partsNumberall.size() + 1) / 2);
1124 } //9
1125 else {
1126 std::cout.width(((averageTime.size()) - partsNumberall.size()) / 2);
1127 }
1128 //2
1129 std::cout << " ";
1130 std::cout << "|";
1131 }
1132
1133 std::cout << '\n';
1134 std::cout << "|";
1135 std::cout.width(inPrepatHost.size() + 3);
1136 std::cout.fill(' ');
1137 std::cout << "|";
1138
1139 for (unsigned int i = 0; i < timing.size(); ++i) {
1140 std::cout << averageTime;
1141 std::cout << "|";
1142 if (standerDeviationPrint) {
1143 std::cout << standerDeviation;
1144 std::cout << "|";
1145 }
1146 }
1147
1148 newLineTitle(totalFix, inPrepatRoot);
1149 printResultEach(timing, 1, standerDeviationPrint);
1150
1151 newLineTitle(totalFix, inPrepatHost);
1152 printResultEach(timing, 2, standerDeviationPrint);
1153
1154 newLineTitle(totalFix, timeCpuR);
1155 printResultEach(timing, 3, standerDeviationPrint);
1156
1157 newLineTitle(totalFix, timeCpu);
1158 printResultEach(timing, 4, standerDeviationPrint);
1159
1160 newLineTitle(totalFix, timeGpu);
1161 printResultEach(timing, 5, standerDeviationPrint);
1162
1163 newLineTitle(totalFix, outPrepatRoot);
1164 printResultEach(timing, 6, standerDeviationPrint);
1165
1166 newLineTitle(totalFix, outPrepatHost);
1167 printResultEach(timing, 7, standerDeviationPrint);
1168
1169 std::cout << '\n';
1170 std::cout.width(totalFix);
1171 std::cout.fill('-');
1172 std::cout << '-' << '\n';
1173 std::cout.fill(' ');
1174 }
1175 int getNumberofDigits(double number) { return ((int)log10(number) + 1) + 4; }
1176 void newLineTitle(int line, const std::string &title) {
1177 std::cout << '\n';
1178 std::cout.width(line);
1179 std::cout.fill('-');
1180 std::cout << '-' << '\n';
1181 std::cout.fill(' ');
1182
1183 std::cout << "| ";
1184 std::cout << title;
1185 std::cout << " |";
1186 }
1187 void printResultEach(std::vector<Timing> &timing, int type, bool standerDeviationPrint) {
1188 int averageTimeWidth = 10;
1189 int standerDeviationWidth = 13;
1190
1191 for (unsigned int i = 0; i < timing.size(); ++i) {
1192 if (type == 1) {
1193 std::cout.width(averageTimeWidth);
1194 std::cout.fill(' ');
1195 std::cout << timing[i].timeInputPreparationRoot[average];
1196 std::cout << "|";
1197 if (standerDeviationPrint) {
1198 std::cout.width(standerDeviationWidth);
1199 std::cout.fill(' ');
1200 std::cout << timing[i].timeInputPreparationRoot[average + 1];
1201 std::cout << "|";
1202 }
1203 } else if (type == 2) {
1204 std::cout.width(averageTimeWidth);
1205 std::cout.fill(' ');
1206 std::cout << timing[i].averageResults[0];
1207 std::cout << "|";
1208 if (standerDeviationPrint) {
1209 std::cout.width(standerDeviationWidth);
1210 std::cout.fill(' ');
1211 std::cout << timing[i].averageResults[1];
1212 std::cout << "|";
1213 }
1214 } else if (type == 3) {
1215 std::cout.width(averageTimeWidth);
1216 std::cout.fill(' ');
1217 std::cout << timing[i].timeOperationOnDeviceByRootHost[average];
1218 std::cout << "|";
1219 if (standerDeviationPrint) {
1220 std::cout.width(standerDeviationWidth);
1221 std::cout.fill(' ');
1222 std::cout << timing[i].timeOperationOnDeviceByRootHost[average + 1];
1223 std::cout << "|";
1224 }
1225 } else if (type == 4) {
1226 std::cout.width(averageTimeWidth);
1227 std::cout.fill(' ');
1228 std::cout << timing[i].averageResults[2];
1229 std::cout << "|";
1230 if (standerDeviationPrint) {
1231 std::cout.width(standerDeviationWidth);
1232 std::cout.fill(' ');
1233 std::cout << timing[i].averageResults[3];
1234 std::cout << "|";
1235 }
1236 } else if (type == 5) {
1237 std::cout.width(averageTimeWidth);
1238 std::cout.fill(' ');
1239 std::cout << timing[i].averageResults[6];
1240 std::cout << "|";
1241 if (standerDeviationPrint) {
1242 std::cout.width(standerDeviationWidth);
1243 std::cout.fill(' ');
1244 std::cout << timing[i].averageResults[7];
1245 std::cout << "|";
1246 }
1247 } else if (type == 6) {
1248 std::cout.width(averageTimeWidth);
1249 std::cout.fill(' ');
1250 std::cout << timing[i].timeOutputPreparationRoot[average];
1251 std::cout << "|";
1252 if (standerDeviationPrint) {
1253 std::cout.width(standerDeviationWidth);
1254 std::cout.fill(' ');
1255 std::cout << timing[i].timeOutputPreparationRoot[average + 1];
1256 std::cout << "|";
1257 }
1258 } else if (type == 7) {
1259 std::cout.width(averageTimeWidth);
1260 std::cout.fill(' ');
1261 std::cout << timing[i].averageResults[4];
1262 std::cout << "|";
1263 if (standerDeviationPrint) {
1264 std::cout.width(standerDeviationWidth);
1265 std::cout.fill(' ');
1266 std::cout << timing[i].averageResults[5];
1267 std::cout << "|";
1268 }
1269 }
1270 }
1271 }
1272 bool saveToFile(const std::string &name, const Timing &timing) {
1273 std::ofstream file(name + ".txt", std::ios::out | std::ios::app);
1274
1275 if (!file.is_open()) {
1276 std::cout << "\nCannot open File nor Create File!" << std::endl;
1277 return 0;
1278 }
1279
1280 file << sizeVector << std::endl;
1281 file << average << std::endl;
1282 file << task << std::endl;
1283 file << timing.timeInputPreparationRoot[average] << " " << timing.timeInputPreparationRoot[average + 1] << std::endl;
1284 file << timing.averageResults[0] << " " << timing.averageResults[1] << std::endl;
1285 file << timing.timeOperationOnDeviceByRootHost[average] << " " << timing.timeOperationOnDeviceByRootHost[average + 1]
1286 << std::endl;
1287 file << timing.averageResults[2] << " " << timing.averageResults[3] << std::endl;
1288 file << timing.averageResults[6] << " " << timing.averageResults[7] << std::endl;
1289 file << timing.timeOutputPreparationRoot[average] << " " << timing.timeOutputPreparationRoot[average + 1]
1290 << std::endl;
1291 file << timing.averageResults[4] << " " << timing.averageResults[5] << std::endl;
1292
1293 file.close();
1294 if (!file.good()) {
1295 std::cout << "\n*ERROR While Writing The " + name + " file!!" << std::endl;
1296 return 0;
1297 }
1298 return 1;
1299 }
1300 void printHelp(void) {
1301 int rank = MPI::COMM_WORLD.Get_rank();
1302 if (!rank) {
1303 std::cout << "\n\n\t**************************************\n";
1304 std::cout << "\t* This is a Help for Command Opitions*";
1305 std::cout << "\n\t**************************************\n";
1306 std::cout << "\n\tYou as a user, can choose two ways to run the program:\n";
1307 std::cout << "\n\t1) mpirun -np <number of Process/ors> -s <size of Vector> -t <number of task> -a <average size> "
1308 "-p <part to run>\n";
1309 std::cout << "\n\t2) cmsenv_mpirun -np <number of Process/ors> -s <size of Vector> -t <number of task> -a <average "
1310 "size> -p <part to run>\n";
1311 std::cout << "\n\t[-np] is for number of processes or processors that you would like to run.";
1312 std::cout
1313 << "\n\t[-s] is the size of vector that you would like to send, the type is float and there are two vectors.";
1314 std::cout << "\n\t[-t] is the number of repeating of task on the Device(GPU) side.";
1315 std::cout << "\n\t[-a] is the number of repeating the part that user has chosen.";
1316 std::cout << "\n\t[-p] is the choice of what part to run in the program.";
1317 std::cout << "\n\t[-q] is to print Stander Deviation.";
1318 std::cout << "\n\t[-f] is to save the results into a file for each part.";
1319 std::cout << "\n\n\tExample for only local Machine: ";
1320 std::cout << "\n\tcmsenv_mpirun -np 2 mpiCudaGeneric -p1 -s200 -t1 -a1\n";
1321 std::cout << "\n\tExample for two Machines connected: ";
1322 std::cout
1323 << "\n\tcmsenv_mpirun -H <machine Name as Root>,<machine Name as Host> -np 2 mpiCudaGeneric -p1 -s200 -t1 -a1";
1324 std::cout << "\n\tExample for two Machines connected Using ucx: ";
1325 std::cout << "\n\tcmsenv_mpirun -H <machine Name as Root>,<machine Name as Host> -np 2 -mca pml ucx -- "
1326 "mpiCudaGeneric -p1 -s200 -t1 -a1";
1327 std::cout << "\n\n\tFor the Parts, we have in this program 4 Parts:";
1328 std::cout << "\n\t1)The Root, who does not have a GPU, using MPI Blocking send and receive to Host, The Host is "
1329 "who have a GPU, then Host:";
1330 std::cout << "\n\t uses cudaMalloc and copies the receiving values to GPU side. Next, the GPU does the compuation";
1331 std::cout << "\n\t Finaly, the Host copies the results from GPU, sends them back to The Root using MPI Blocking "
1332 "Send.\n\n";
1333 }
1334 }