CMSSW: HeterogeneousCore/CUDACore/test/mpiCudaGeneric.cu

0001 #include <iostream>
0002 #include <fstream>
0003 #include <iomanip>
0004 #include <cstdlib>
0005 #include <string>
0006 #include <algorithm>
0007 #include <vector>
0008 #include <random>
0009 #include <utility>
0010 #include <mpi.h>
0011 #include <unistd.h>
0012 //////////////////////////////////////////// C U D A  /////////////////////////////////////////
0013 #include <cuda.h>
0014 #include <thrust/device_vector.h>
0015 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
0017 
0018 //called on the Host and executed on the Device (GPU)
0019 __global__ void addVectorsGpu(float *vect1, float *vect2, float *vect3, int size, int taskN) {
0020   //blockDim.x gives the number of threads in a block, in the x direction.
0021   //gridDim.x gives the number of blocks in a grid, in the x direction.
0022   //blockDim.x * gridDim.x gives the number of threads in a grid (in the x direction, in this case).
0023   int first = blockDim.x * blockIdx.x + threadIdx.x;
0024   int stride = blockDim.x * gridDim.x;
0025   for (int i = 0; i < taskN; ++i) {
0026     for (int j = first; j < size; j += stride) {
0027       vect3[j] = vect2[j] + vect1[j];
0028     }
0029   }
0030 }  //add two vectors and save the result into the third vector.
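//Note (illustrative): addVectorsGpu uses a grid-stride loop, so the launch geometry does not need
//to cover the whole vector with one thread per element. With the configuration used in the parts
//below (threads = 512, blocks = ceiling(size / 512) capped at 8) and the default sizeVector = 2000,
//there are 2048 threads and every element is handled in a single pass; for larger vectors each
//thread simply strides forward by blockDim.x * gridDim.x. The outer loop repeats the same addition
//taskN times to lengthen the kernel for timing purposes.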
0031 //////////////////////////////////////////////////////////////////////////////////////////////////
0032 
0033 ///////////////////////////////////////////  Global Variables  /////////////////////////////////////
0034 unsigned int sizeVector = 2000;
0035 int average = 5;
0036 int task = 1;
0037 int partsToRun = 1;
0038 bool printStander = false;
0039 bool saveFile = false;
0040 bool help = false;
0041 //////////////////////////////////////////////////////////////////////////////////////////////////
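//Illustrative example of how the command-line options parsed in main() map onto these globals
//(assuming the binary is launched through mpirun or cmsenv_mpirun, as described in printHelp):
//  mpirun -np 2 mpiCudaGeneric -s 2000 -a 5 -t 1 -p 123 -q -f
//sets sizeVector = 2000, average = 5, task = 1, runs parts 1, 2 and 3, prints the standard
//deviation (-q) and saves the results of each part to a file (-f). Note that -p must be given
//explicitly: the default partsToRun is never expanded into user.partsToRunVector.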
0042 
0043 ///////////////////////////////////////////  Data Structure  /////////////////////////////////////
0044 //Data For User's Choices Only
0045 struct UserChoises {
0046   // unsigned int sizeVector;
0047   unsigned int sizeVectorBytes;  //Default vector element type is float
0048   // unsigned int average;
0049   unsigned int extra;
0050   // unsigned int task;
0051   // unsigned int partsToRun;
0052   int root;
0053   // int numberProcess;
0054   int averageVectorSend;
0055   std::vector<int> partsToRunVector;  //vector for user's choice of part.
0056 };
0057 
0058 //Data For MPI Only
0059 struct MPIData {
0060   int num_procs;
0061   int rank;
0062 
0063   std::pair<int, int> workSplit;
0064   float *mVect1;                  //declare vector 1.
0065   float *mVect2;                  //declare vector 2.
0066   float *mVect3;                  //declare vector filled only by root to gather the results from workers.
0067   float *mVectChecking;           //declare vector to verify the results from each process.
0068   float *mVectWorker1;            //declare vector 1 for workers only.
0069   float *mVectWorker2;            //declare vector 2 for workers only.
0070   float *mVectWorker3;            //declare vector 3 for workers only.
0071   std::vector<int> displacement;  //declare vector for selecting location of each element to be sent.
0072   std::vector<int> numberToSend;
0073 };
0074 
0075 //Data For Cuda Only
0076 struct Pointers {
0077   float *vect1;  //pointers only for Host
0078   float *vect2;
0079   float *vect3;
0080 
0081   float *dVect1;  //pointers only for device
0082   float *dVect2;
0083   float *dVect3;
0084 
0085   float *dVect1Extra;  //pointers only for device
0086   float *dVect2Extra;
0087   float *dVect3Extra;
0088 };
0089 
0090 //Data for Time Measurements Only
0091 struct Timing {
0092   int partChosen;
0093   int unitChoice;
0094   double inputPreparationRoot[2];     // get time points from start and end on Root Side
0095   double inputPreparationHost[2];     // get time points from start and end on Host Side.
0096   double operationOnDeviceByHost[2];  //get time duration in Device with Host perspective.
0097 
0098   double outputPreparationRoot[2];
0099   double outputPreparationHost[2];
0100 
0101   std::vector<double> timeInputPreparationRoot;  //Save the Duration time.
0102   std::vector<double> timeInputPreparationHost;
0103   std::vector<double> timeOperationOnDeviceByRootHost;
0104   std::vector<double> timeOutputPreparationRoot;
0105   std::vector<double> timeOutputPreparationHost;
0106 
0107   cudaEvent_t start, stop;                          //get time points in Device.
0108   float operationOnDeviceByDevice = 0;              //get time duration in Device with device perspective.
0109   std::vector<float> operationOnDeviceByDeviceAcc;  //get accumulating time duration in Device with device perspective.
0110   std::vector<float> averageResults;  ///declare vector holding the average calculations sent from the Hosts and device to the Root.
0111 };
0112 
0113 //////////////////////////////////////////////////////////////////////////////////////////////////
0114 const std::vector<int> chooseFunction(int toInteger);
0115 std::pair<int, int> splitProcess(int works, int numberOfProcess);
0116 const std::vector<int> numberDataSend(int numberOfProcess, std::pair<int, int> splitWorks);
0117 void setupMPIAndVectors(
0118     MPIData &mpiData,
0119     UserChoises &user);  //initialize communicator environment for MPI and Resize Vectors with Generating Random numbers.
0120 void setupTime(Timing &timing, UserChoises &user);  //Resizing Vectors of Time.
0121 void calculateTimeDuration(Timing &timing, int i, int &root);
0122 void addVectorsHost(float *vect1, float *vect2, float *vect3);
0123 void cleanBuffer(float *vect);
0124 bool checkingResultsPrintout(float *vectCpu, float *vectGpu);
0125 void calculateAverageDeviation(Timing &timing, int averg, int &root);
0126 bool sendAverageToRoot(Timing &timing, UserChoises &user, int &rank);
0127 
0128 Timing blockSendPart1(MPIData &mpidata, Timing &timing, Pointers &pointer, UserChoises &user);
0129 Timing blockSendPart2(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user);
0130 Timing blockSendPart3(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user);
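//Overview of the three measurement parts implemented below (parts 4 and 5 are only placeholders):
// - Part 1: workers receive into pageable host buffers (malloc) and cudaMemcpy them to the device.
// - Part 2: same flow, but the host buffers are pinned with cudaMallocHost, which typically speeds
//   up host<->device copies.
// - Part 3: MPI_Recv/MPI_Send operate directly on device pointers, so no explicit cudaMemcpy is
//   needed; this relies on a CUDA-aware MPI build (e.g. Open MPI with UCX, see printHelp).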
0131 
0132 void printTable(std::vector<Timing> &timing, bool standerDeviationPrint);
0133 int getNumberofDigits(double number);
0134 void newLineTitle(int line, const std::string &title);
0135 void printResultEach(std::vector<Timing> &timing, int type, bool standerDeviationPrint);
0136 bool saveToFile(const std::string &name, const Timing &timing);
0137 
0138 void printHelp(void);
0139 int main(int argc, char *argv[]) {
0140   int c;  //to get parameters from user.
0141 
0142   UserChoises user;  //Setup user's input variables
0143   user.extra = 2;
0144   user.root = 0;
0145   user.averageVectorSend = 8;
0146 
0147   while ((c = getopt(argc, argv, "s:a:t:p:qfh")) != -1) {
0148     switch (c) {
0149       case 's':
0150         try {
0151           sizeVector = std::stoll(optarg, nullptr, 0);
0152         } catch (std::exception &err) {
0153           std::cout << "\n\tError: the argument must be an integer!";
0154           std::cout << "\n\t" << err.what() << std::endl;
0155           return 0;
0156         }
0157         break;
0158       case 'a':
0159         try {
0160           average = std::stoll(optarg, nullptr, 0);
0161 
0162         } catch (std::exception &err) {
0163           std::cout << "\n\tError: the argument must be an integer!";
0164           std::cout << "\n\t" << err.what() << std::endl;
0165           return 0;
0166         }
0167         break;
0168       case 't':
0169         try {
0170           task = std::stoll(optarg, nullptr, 0);
0171           //std::cout << "\nNumber of repeated Task is " << task << std::endl;
0172         } catch (std::exception &err) {
0173           std::cout << "\n\tError: the argument must be an integer!";
0174           std::cout << "\n\t" << err.what() << std::endl;
0175           return 0;
0176         }
0177         break;
0178       case 'p':
0179         try {
0180           partsToRun = std::stoll(optarg, nullptr, 0);
0181           user.partsToRunVector = chooseFunction(partsToRun);
0182           //std::cout << "\nyou have chosen Part ";
0183           for (unsigned int j = 0; j < user.partsToRunVector.size(); ++j) {
0184             std::cout << user.partsToRunVector[j] << " ,";
0185           }
0186           std::cout << "\n";
0187         } catch (std::exception &err) {
0188           std::cout << "\n\tError: the argument must be an integer!";
0189           std::cout << "\n\t" << err.what() << std::endl;
0190           return 0;
0191         }
0192         break;
0193       case 'q':
0194         try {
0195           printStander = true;
0196         } catch (std::exception &err) {
0197           std::cout << "\n\tError: the argument must be an integer!";
0198           std::cout << "\n\t" << err.what() << std::endl;
0199           return 0;
0200         }
0201         break;
0202       case 'f':
0203         try {
0204           saveFile = true;
0205         } catch (std::exception &err) {
0206           std::cout << "\n\tError: the argument must be an integer!";
0207           std::cout << "\n\t" << err.what() << std::endl;
0208           return 0;
0209         }
0210         break;
0211       case 'h':
0212         try {
0213           help = true;
0214         } catch (std::exception &err) {
0215           std::cout << "\n\tError: the argument must be an integer!";
0216           std::cout << "\n\t" << err.what() << std::endl;
0217           return 0;
0218         }
0219         break;
0220 
0221       default:
0222         abort();
0223     }
0224   }
0225 
0226   MPIData mpiData;
0227   Timing timing;
0228   Timing resetTime;
0229   Pointers pointer;
0230   timing.unitChoice = 1000000;     //1e6: scale seconds to microseconds.
0231   resetTime.unitChoice = 1000000;  //1e6: scale seconds to microseconds.
0232 
0233   std::vector<Timing> allTiming;
0234   allTiming.resize(user.partsToRunVector.size());
0235 
0236   MPI_Init(&argc, &argv);  //initialize communicator environment.
0237 
0238   if (help) {
0239     printHelp();
0240     MPI::Finalize();
0241     exit(0);
0242   }
0243   setupMPIAndVectors(mpiData, user);
0244 
0245   setupTime(timing, user);
0246   setupTime(resetTime, user);
0247 
0248   for (long unsigned int i = 0; i < user.partsToRunVector.size(); ++i) {
0249     if (user.partsToRunVector[i] == 1) {
0250       //setupTime(allTiming[i], user);
0251       //blockSendPart1(mpiData, allTiming[i], pointer, user);
0252       allTiming[i] = blockSendPart1(mpiData, timing, pointer, user);
0253       timing = resetTime;
0254 
0255     } else if (user.partsToRunVector[i] == 2) {
0256       //setupTime(allTiming[i], user);
0257       //blockSendPart2(mpiData, allTiming[i], pointer, user);
0258       allTiming[i] = blockSendPart2(mpiData, timing, pointer, user);
0259       timing = resetTime;
0260 
0261     } else if (user.partsToRunVector[i] == 3) {
0262       allTiming[i] = blockSendPart3(mpiData, timing, pointer, user);
0263       timing = resetTime;
0264       // } else if (user.partsToRunVector[i] == 4) {
0265       //   allTiming[i] = cudaTimePart4(timing, vect, dvect, size);
0266 
0267       // } else if (user.partsToRunVector[i] == 5) {
0268       //   allTiming[i] = cudaTimePart5(timing, vect, dvect, size);
0269 
0270     } else {
0271       std::cout << "\n\n\tError: the user has not chosen any valid part number!\n";
0272       break;
0273     }
0274   }
0275 
0276   if (!mpiData.rank)
0277     printTable(allTiming, printStander);
0278 
0279   MPI::Finalize();
0280   return 0;
0281 }
0282 const std::vector<int> chooseFunction(int toInteger) {
0283   std::vector<int> digits(0, 0);
0284   std::vector<int> ERROR(0, 0);
0285 
0286   int digit{1};
0287 
0288   while (toInteger > 0) {
0289     digit = toInteger % 10;
0290     if (digit > 7) {
0291       std::cout << "\n\tError: each part digit must be <= 7, got " << digit << std::endl;
0292       return ERROR;
0293     }
0294     digits.push_back(digit);
0295     toInteger /= 10;
0296   }
0297   std::reverse(digits.begin(), digits.end());
0298   return digits;
0299 }
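//Illustrative example: chooseFunction(123) returns {1, 2, 3}, i.e. the -p argument is read digit
//by digit, and any digit larger than 7 makes the function return an empty vector.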
0300 
0301 std::pair<int, int> splitProcess(int works, int numberOfProcess) {
0302   std::pair<int, int> Return{0, 0};
0303   if (numberOfProcess > 1 && numberOfProcess <= works) {
0304     Return.first = works / (numberOfProcess - 1);   //number of cycle for each process.
0305     Return.second = works % (numberOfProcess - 1);  //extra cycle for process.
0306   } else {
0307     std::cout << "\tError: either no workers were found or the number of processes is larger than the work length!\n";
0308   }
0309 
0310   return Return;
0311 }
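//Illustrative example (the root does no vector work, so the load is split over numberOfProcess - 1
//workers): splitProcess(2000, 3) returns {1000, 0}, while splitProcess(2000, 4) returns {666, 2},
//i.e. 666 elements per worker plus 2 leftover elements still to be distributed.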
0312 const std::vector<int> numberDataSend(int numberOfProcess, std::pair<int, int> splitWorks) {
0313   std::vector<int> dataSend(numberOfProcess, splitWorks.first);
0314   dataSend[0] = 0;
0315   for (int i = 1; i < splitWorks.second + 1; i++)  //neglect root
0316   {
0317     dataSend[i] += 1;  //one extra element for each of the first 'remainder' workers.
0318   }
0319   return dataSend;
0320 }
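//Illustrative example, continuing splitProcess(2000, 4) = {666, 2}: numberDataSend(4, {666, 2})
//returns {0, 667, 667, 666} -- the root gets no elements and the first splitWorks.second (= 2)
//workers absorb one leftover element each.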
0321 const std::vector<int> displacmentData(int numberOfProcess,
0322                                        std::pair<int, int> splitWorks,
0323                                        const std::vector<int> &numberDataSend) {
0324   std::vector<int> displacment(numberOfProcess, splitWorks.first);
0325 
0326   displacment[0] = 0;
0327   displacment[1] = 0;  //start Here.
0328 
0329   for (int i = 2; i < numberOfProcess; i++)  //neglect root
0330   {
0331     displacment[i] = numberDataSend[i - 1] + displacment[i - 1];  //each worker starts where the previous one ended.
0332   }
0333   return displacment;
0334 }
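//Illustrative example, continuing the case above: displacmentData(4, {666, 2}, {0, 667, 667, 666})
//returns {0, 0, 667, 1334}, i.e. the offset of each worker's chunk inside the full vector
//(each worker starts where the previous one ended).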
0335 void randomGenerator(float *vect) {
0336   std::random_device rand;
0337   std::default_random_engine gener(rand());
0338   std::uniform_real_distribution<> dis(0., 1.);
0339   for (unsigned int i = 0; i < sizeVector; ++i) {
0340     vect[i] = dis(gener);
0341   }
0342 }
0343 void setupMPIAndVectors(MPIData &mpiData, UserChoises &user) {
0344   mpiData.num_procs = MPI::COMM_WORLD.Get_size();  //get total size of processes.
0345   mpiData.rank = MPI::COMM_WORLD.Get_rank();       //get each process number.
0346 
0347   user.sizeVectorBytes = sizeVector * sizeof(float);  //get size in byte for vectors.
0348 
0349   mpiData.mVect1 = (float *)malloc(user.sizeVectorBytes);  //initialize size.
0350   mpiData.mVect2 = (float *)malloc(user.sizeVectorBytes);
0351   mpiData.mVect3 = (float *)malloc(user.sizeVectorBytes);
0352   mpiData.mVectChecking = (float *)malloc(user.sizeVectorBytes);
0353 
0354   //mpiData.mVectWorker1 = (float*) malloc(user.sizeVectorBytes);
0355   //mpiData.mVectWorker2 = (float*) malloc(user.sizeVectorBytes);
0356   mpiData.mVectWorker3 = (float *)malloc(user.sizeVectorBytes);
0357 
0358   mpiData.workSplit = splitProcess(sizeVector, mpiData.num_procs);
0359 
0360   if (!mpiData.workSplit.first) {
0361     MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
0362     exit(-1);
0363   }
0364 
0365   mpiData.numberToSend = numberDataSend(mpiData.num_procs, mpiData.workSplit);
0366   mpiData.displacement = displacmentData(mpiData.num_procs, mpiData.workSplit, mpiData.numberToSend);
0367 
0368   // mpiData.mVectWorker1.resize(mpiData.numberToSend[mpiData.rank]);  //Resizing each process with appropriate Receiving Data.
0369   // mpiData.mVectWorker2.resize(mpiData.numberToSend[mpiData.rank]);
0370   mpiData.mVectWorker1 = (float *)malloc(mpiData.numberToSend[mpiData.rank] * sizeof(float));
0371   mpiData.mVectWorker2 = (float *)malloc(mpiData.numberToSend[mpiData.rank] * sizeof(float));
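  //Note: these worker buffers hold only this rank's chunk (numberToSend[rank] floats), while
  //Part 1 below copies user.sizeVectorBytes from them to the device; that appears to assume a
  //single worker (mpirun -np 2), where the chunk size equals the full vector size.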
0372 
0373   if (!mpiData.rank)  //Only for root
0374   {
0375     randomGenerator(mpiData.mVect1);  //generate random floating numbers from(0,1) Only in the root.
0376     randomGenerator(mpiData.mVect2);
0377     std::cout << "\n\tNumber of Processes " << mpiData.num_procs << std::endl;
0378     std::cout << "\tNumber of workSplit First " << mpiData.workSplit.first << std::endl;
0379     std::cout << "\tNumber of workSplit Second " << mpiData.workSplit.second << std::endl;
0380     std::cout << "\tTotal size of a Buffer " << user.sizeVectorBytes << " B" << std::endl;
0381   }
0382 }
0383 void setupTime(Timing &timing, UserChoises &user) {
0384   //Setup Vectors for Taking Average and Standard deviation
0385   timing.timeInputPreparationRoot.resize(average + user.extra);  //extra for saving the average.
0386   timing.timeInputPreparationHost.resize(average + user.extra);
0387   timing.timeOperationOnDeviceByRootHost.resize(average + user.extra);
0388   timing.timeOutputPreparationRoot.resize(average + user.extra);
0389   timing.timeOutputPreparationHost.resize(average + user.extra);
0390   timing.operationOnDeviceByDeviceAcc.resize(average + user.extra);
0391   timing.averageResults.resize(user.averageVectorSend);
0392 }
0393 
0394 void calculateTimeDuration(Timing &timing, int i, int &root) {
0395   if (!root) {
0396     timing.timeInputPreparationRoot[i] =
0397         (timing.inputPreparationRoot[1] - timing.inputPreparationRoot[0]);  //time difference in seconds (scaled to microseconds later)
0398     timing.timeOperationOnDeviceByRootHost[i] = (timing.outputPreparationRoot[0] - timing.inputPreparationRoot[1]);
0399     timing.timeOutputPreparationRoot[i] = (timing.outputPreparationRoot[1] - timing.outputPreparationRoot[0]);
0400   } else {
0401     timing.timeInputPreparationHost[i] = (timing.inputPreparationHost[1] - timing.inputPreparationHost[0]);
0402     timing.timeOperationOnDeviceByRootHost[i] =
0403         (timing.operationOnDeviceByHost[1] -
0404          timing.operationOnDeviceByHost[0]);  //time taking for Device operation with respect of Host.
0405     cudaEventElapsedTime(&timing.operationOnDeviceByDevice,
0406                          timing.start,
0407                          timing.stop);  //get the time elapse in Device operation with device perspective.
0408     timing.operationOnDeviceByDeviceAcc[i] = (timing.operationOnDeviceByDevice * 1000);  //ms to microseconds.
0409     timing.timeOutputPreparationHost[i] = (timing.outputPreparationHost[1] - timing.outputPreparationHost[0]);
0410   }
0411 }
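//Note on units: MPI_Wtime returns seconds, so the differences stored above are in seconds and are
//only scaled to microseconds later in calculateAverageDeviation (unitChoice = 1e6), whereas
//cudaEventElapsedTime reports milliseconds, hence the extra factor of 1000 on the device timing.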
0412 void addVectorsHost(float *vect1, float *vect2, float *vect3) {
0413   for (unsigned int i = 0; i < sizeVector; ++i) {
0414     vect3[i] = vect2[i] + vect1[i];
0415   }
0416 }
0417 void cleanBuffer(float *vect) {
0418   for (unsigned int i = 0; i < sizeVector; ++i) {
0419     vect[i] = 0;
0420   }
0421 }
0422 bool checkingResultsPrintout(float *vectCpu, float *vectGpu) {
0423   float percent{0.0};
0424   float totalError{0.0};
0425 
0426   for (unsigned int j = 0; j < sizeVector; j++) {
0427     percent = ((vectCpu[j] - vectGpu[j]) / vectCpu[j]) * 100;
0428     totalError += percent;
0429   }
0430   if (totalError) {
0431     std::cout << "\n------------------------------------\n";
0432     std::cout << "| CpuSum | GpuSum | Error  | Error %| ";
0433     std::cout << "\n------------------------------------\n";
0434     //std::cout.precision(4);
0435     for (unsigned int j = 0; j < sizeVector; j++) {
0436       std::cout.flags(std::ios::fixed | std::ios::showpoint);
0437       std::cout.precision(4);
0438       std::cout << "| " << vectCpu[j] << " | " << vectGpu[j] << " | " << vectCpu[j] - vectGpu[j] << " | " << percent
0439                 << " |\n";
0440     }
0441     std::cout << "-------------------------------------\n";
0442     std::cout << "-Total Error is " << totalError << std::endl;
0443     return false;
0444   }
0445   return true;
0446 }
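//Note: a single float addition is exactly rounded on both the CPU and the GPU, so the two sums are
//expected to match bitwise; totalError should therefore be exactly 0, and the error table above is
//printed only when something went wrong with the data movement.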
0447 void calculateAverageDeviation(Timing &timing, int averg, int &root) {
0448   //Average
0449   for (int i = 0; i < averg; ++i) {
0450     if (!root) {
0451       timing.timeInputPreparationRoot[averg] += timing.timeInputPreparationRoot[i];
0452       timing.timeOperationOnDeviceByRootHost[averg] += timing.timeOperationOnDeviceByRootHost[i];
0453       timing.timeOutputPreparationRoot[averg] += timing.timeOutputPreparationRoot[i];
0454     } else {
0455       timing.timeInputPreparationHost[averg] += timing.timeInputPreparationHost[i];
0456       timing.timeOperationOnDeviceByRootHost[averg] += timing.timeOperationOnDeviceByRootHost[i];
0457       timing.timeOutputPreparationHost[averg] += timing.timeOutputPreparationHost[i];
0458       timing.operationOnDeviceByDeviceAcc[averg] += timing.operationOnDeviceByDeviceAcc[i];
0459     }
0460   }
0461   if (!root) {
0462     timing.timeInputPreparationRoot[averg] = timing.timeInputPreparationRoot[averg] / averg;
0463     timing.timeOperationOnDeviceByRootHost[averg] = timing.timeOperationOnDeviceByRootHost[averg] / averg;
0464 
0465     timing.timeOutputPreparationRoot[averg] = timing.timeOutputPreparationRoot[averg] / averg;
0466 
0467   } else {
0468     timing.timeInputPreparationHost[averg] = timing.timeInputPreparationHost[averg] / averg;
0469 
0470     timing.timeOperationOnDeviceByRootHost[averg] = timing.timeOperationOnDeviceByRootHost[averg] / averg;
0471 
0472     timing.timeOutputPreparationHost[averg] = timing.timeOutputPreparationHost[averg] / averg;
0473 
0474     timing.operationOnDeviceByDeviceAcc[averg] = (double)timing.operationOnDeviceByDeviceAcc[averg] / averg;
0475   }
0476 
0477   //Standard deviation
0478   for (int i = 0; i < averg; ++i) {
0479     if (!root) {
0480       timing.timeInputPreparationRoot[i] -= timing.timeInputPreparationRoot[averg];  //Take the difference.
0481       timing.timeInputPreparationRoot[i] =
0482           timing.timeInputPreparationRoot[i] * timing.timeInputPreparationRoot[i];  // Square it.
0483       timing.timeInputPreparationRoot[averg + 1] +=
0484           timing.timeInputPreparationRoot[i];  //add them together; averg+1 is the location of the deviation.
0485 
0486       timing.timeOperationOnDeviceByRootHost[i] -= timing.timeOperationOnDeviceByRootHost[averg];
0487       timing.timeOperationOnDeviceByRootHost[i] *= timing.timeOperationOnDeviceByRootHost[i];
0488       timing.timeOperationOnDeviceByRootHost[averg + 1] += timing.timeOperationOnDeviceByRootHost[i];
0489 
0490       timing.timeOutputPreparationRoot[i] -= timing.timeOutputPreparationRoot[averg];
0491       timing.timeOutputPreparationRoot[i] *= timing.timeOutputPreparationRoot[i];
0492       timing.timeOutputPreparationRoot[averg + 1] += timing.timeOutputPreparationRoot[i];
0493     } else {
0494       timing.timeInputPreparationHost[i] -= timing.timeInputPreparationHost[averg];  //Take the difference.
0495       timing.timeInputPreparationHost[i] =
0496           timing.timeInputPreparationHost[i] * timing.timeInputPreparationHost[i];  // Square it.
0497       timing.timeInputPreparationHost[averg + 1] +=
0498           timing.timeInputPreparationHost[i];  //add them together; averg+1 is the location of the deviation.
0499 
0500       timing.timeOperationOnDeviceByRootHost[i] -= timing.timeOperationOnDeviceByRootHost[averg];
0501       timing.timeOperationOnDeviceByRootHost[i] *= timing.timeOperationOnDeviceByRootHost[i];
0502       timing.timeOperationOnDeviceByRootHost[averg + 1] += timing.timeOperationOnDeviceByRootHost[i];
0503 
0504       timing.timeOutputPreparationHost[i] -= timing.timeOutputPreparationHost[averg];
0505       timing.timeOutputPreparationHost[i] *= timing.timeOutputPreparationHost[i];
0506       timing.timeOutputPreparationHost[averg + 1] += timing.timeOutputPreparationHost[i];
0507 
0508       timing.operationOnDeviceByDeviceAcc[i] -= timing.operationOnDeviceByDeviceAcc[averg];
0509       timing.operationOnDeviceByDeviceAcc[i] *= timing.operationOnDeviceByDeviceAcc[i];
0510       timing.operationOnDeviceByDeviceAcc[averg + 1] += timing.operationOnDeviceByDeviceAcc[i];
0511     }
0512   }
0513 
0514   if (!root) {
0515     timing.timeInputPreparationRoot[averg + 1] = timing.timeInputPreparationRoot[averg + 1] / averg;
0516     timing.timeInputPreparationRoot[averg + 1] = sqrt(timing.timeInputPreparationRoot[averg + 1]);
0517 
0518     timing.timeOperationOnDeviceByRootHost[averg + 1] = timing.timeOperationOnDeviceByRootHost[averg + 1] / averg;
0519     timing.timeOperationOnDeviceByRootHost[averg + 1] = sqrt(timing.timeOperationOnDeviceByRootHost[averg + 1]);
0520 
0521     timing.timeOutputPreparationRoot[averg + 1] = timing.timeOutputPreparationRoot[averg + 1] / averg;
0522     timing.timeOutputPreparationRoot[averg + 1] = sqrt(timing.timeOutputPreparationRoot[averg + 1]);
0523 
0524   } else {
0525     timing.timeInputPreparationHost[averg + 1] = timing.timeInputPreparationHost[averg + 1] / averg;  //*1000000
0526     timing.timeInputPreparationHost[averg + 1] = sqrt(timing.timeInputPreparationHost[averg + 1]);
0527 
0528     timing.timeOperationOnDeviceByRootHost[averg + 1] = timing.timeOperationOnDeviceByRootHost[averg + 1] / averg;
0529     timing.timeOperationOnDeviceByRootHost[averg + 1] = sqrt(timing.timeOperationOnDeviceByRootHost[averg + 1]);
0530 
0531     timing.timeOutputPreparationHost[averg + 1] = timing.timeOutputPreparationHost[averg + 1] / averg;
0532     timing.timeOutputPreparationHost[averg + 1] = sqrt(timing.timeOutputPreparationHost[averg + 1]);
0533 
0534     timing.operationOnDeviceByDeviceAcc[averg + 1] = (double)timing.operationOnDeviceByDeviceAcc[averg + 1] / averg;
0535     timing.operationOnDeviceByDeviceAcc[averg + 1] = sqrt(timing.operationOnDeviceByDeviceAcc[averg + 1]);
0536   }
0537 
0538   if (!root) {
0539     timing.timeInputPreparationRoot[averg] *= timing.unitChoice;
0540     timing.timeOperationOnDeviceByRootHost[averg] *= timing.unitChoice;
0541     timing.timeOutputPreparationRoot[averg] *= timing.unitChoice;
0542 
0543     timing.timeInputPreparationRoot[averg + 1] *= timing.unitChoice;
0544     timing.timeOperationOnDeviceByRootHost[averg + 1] *= timing.unitChoice;
0545     timing.timeOutputPreparationRoot[averg + 1] *= timing.unitChoice;
0546   } else {
0547     timing.timeInputPreparationHost[averg] *= timing.unitChoice;
0548     timing.timeOperationOnDeviceByRootHost[averg] *= timing.unitChoice;
0549     timing.timeOutputPreparationHost[averg] *= timing.unitChoice;
0550 
0551     timing.timeInputPreparationHost[averg + 1] *= timing.unitChoice;
0552     timing.timeOperationOnDeviceByRootHost[averg + 1] *= timing.unitChoice;
0553     timing.timeOutputPreparationHost[averg + 1] *= timing.unitChoice;
0554   }
0555 }
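//Layout used above (illustrative): for each timing vector, entries [0, averg) hold the per-iteration
//samples, entry [averg] holds their mean, and entry [averg + 1] holds the standard deviation
//sqrt(sum_i (x_i - mean)^2 / averg); note that the per-iteration entries are overwritten with the
//squared differences while the deviation is being accumulated.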
0556 
0557 bool sendAverageToRoot(Timing &timing, UserChoises &user, int &rank) {
0558   if (rank) {
0559     timing.averageResults[0] = timing.timeInputPreparationHost[average];
0560     timing.averageResults[1] = timing.timeInputPreparationHost[average + 1];  //Stander Deviation
0561 
0562     timing.averageResults[2] = timing.timeOperationOnDeviceByRootHost[average];
0563     timing.averageResults[3] = timing.timeOperationOnDeviceByRootHost[average + 1];
0564 
0565     timing.averageResults[4] = timing.timeOutputPreparationHost[average];
0566     timing.averageResults[5] = timing.timeOutputPreparationHost[average + 1];
0567 
0568     timing.averageResults[6] = timing.operationOnDeviceByDeviceAcc[average];
0569     timing.averageResults[7] = timing.operationOnDeviceByDeviceAcc[average + 1];
0570 
0571     MPI_Send(&timing.averageResults[0], user.averageVectorSend, MPI_FLOAT, user.root, 0, MPI_COMM_WORLD);
0572 
0573   } else if (!rank) {
0574     MPI_Recv(&timing.averageResults[0], user.averageVectorSend, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0575   }
0576   return true;
0577 }
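//Layout of averageResults as filled above: [0]/[1] input preparation mean/std dev on the host,
//[2]/[3] device operation as seen by the host, [4]/[5] output preparation on the host, and [6]/[7]
//device operation as measured with CUDA events. Note that the root only receives the averages of
//rank 1, so the Host/Device columns printed by printTable refer to that worker.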
0578 
0579 Timing blockSendPart1(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) {
0580   cleanBuffer(mpiData.mVectWorker3);  //clear each value of vector's elements
0581   timing.partChosen = 1;
0582 
0583   if (mpiData.rank)  //Only for Workers
0584   {
0585     cudaCheck(cudaMalloc((void **)&pointer.dVect1,
0586                          user.sizeVectorBytes));  //allocate memory space for vector in the global memory of the Device.
0587     cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes));
0588     cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes));
0589   }
0590   ///////////////////////////// Start of Average ////////////////////////
0591   for (int a = 0; a <= average; ++a) {
0592     if (!mpiData.rank)  //Only for root
0593     {
0594       ////////////////////////////////// Input Preparation for Root //////////////////////////////////
0595       timing.inputPreparationRoot[0] = MPI_Wtime();
0596       for (int i = 1; i < mpiData.num_procs; ++i) {
0597         MPI_Send(&mpiData.mVect1[mpiData.displacement[i]],
0598                  mpiData.numberToSend[i],
0599                  MPI_FLOAT,
0600                  i,
0601                  0,
0602                  MPI_COMM_WORLD);  //Tag is 0
0603         MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD);
0604       }
0605       timing.inputPreparationRoot[1] = MPI_Wtime();
0606       /////////////////////////////////////////////////////////////////////////////////////////////////
0607     }
0608 
0609     if (mpiData.rank)  //Only for Workers
0610     {
0611       ////////////////////////////////// Input Preparation for Host //////////////////////////////////
0612       MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0613       timing.inputPreparationHost[0] = MPI_Wtime();
0614       MPI_Recv(&mpiData.mVectWorker1[0],
0615                mpiData.numberToSend[mpiData.rank],
0616                MPI_FLOAT,
0617                user.root,
0618                0,
0619                MPI_COMM_WORLD,
0620                MPI_STATUS_IGNORE);
0621       MPI_Recv(&mpiData.mVectWorker2[0],
0622                mpiData.numberToSend[mpiData.rank],
0623                MPI_FLOAT,
0624                user.root,
0625                0,
0626                MPI_COMM_WORLD,
0627                MPI_STATUS_IGNORE);
0628 
0629       cudaCheck(cudaMemcpy(pointer.dVect1,
0630                            mpiData.mVectWorker1,
0631                            user.sizeVectorBytes,
0632                            cudaMemcpyHostToDevice));  //copy random vector from host to device.
0633       cudaCheck(cudaMemcpy(pointer.dVect2, mpiData.mVectWorker2, user.sizeVectorBytes, cudaMemcpyHostToDevice));
0634 
0635       timing.inputPreparationHost[1] = MPI_Wtime();
0636       ///////////////////////////////////////////////////////////////////////////////////////
0637 
0638       cudaCheck(cudaEventCreate(&timing.start));  //initialize Event.
0639       cudaCheck(cudaEventCreate(&timing.stop));
0640 
0641       ///////////////////////////// Operation on Device with respect of Host //////////////////
0642 
0643       int threads = 512;                                  //arbitrary number.
0644       int blocks = (sizeVector + threads - 1) / threads;  //get ceiling number of blocks.
0645       blocks = std::min(blocks, 8);  // cap at 8 blocks, the least that can be expected even on low-end NVIDIA GPUs.
0646 
0647       ////////////////////////// Call Device Kernel //////////////////////////////////
0648       cudaCheck(cudaEventRecord(timing.start));
0649       timing.operationOnDeviceByHost[0] = MPI_Wtime();
0650 
0651       addVectorsGpu<<<blocks, threads>>>(pointer.dVect1,
0652                                          pointer.dVect2,
0653                                          pointer.dVect3,
0654                                          sizeVector,
0655                                          task);  //call the device kernel to add the two vectors and save the result into dVect3.
0656 
0657       cudaCheck(cudaGetLastError());
0658       cudaCheck(cudaDeviceSynchronize());
0659       cudaCheck(cudaEventRecord(timing.stop));
0660 
0661       timing.operationOnDeviceByHost[1] = MPI_Wtime();
0662       /////////////////////////////////////////////////////////////////////////////////////////////
0663 
0664       /////////////////////////////////// Output Preparation for the Host //////////////////////////////////////
0665       timing.outputPreparationHost[0] = MPI_Wtime();
0666       cudaCheck(cudaMemcpy(
0667           mpiData.mVectWorker3,
0668           pointer.dVect3,
0669           user.sizeVectorBytes,
0670           cudaMemcpyDeviceToHost));  //copy the summed result vector from Device to Host.
0671 
0672       MPI_Send(&mpiData.mVectWorker3[0],
0673                mpiData.numberToSend[mpiData.rank],
0674                MPI_FLOAT,
0675                user.root,
0676                0,
0677                MPI_COMM_WORLD);  //Tag is 0
0678       timing.outputPreparationHost[1] = MPI_Wtime();
0679       ////////////////////////////////////////////////////////////////////////////////////////////////
0680     }
0681 
0682     if (!mpiData.rank)  //Only for root
0683     {
0684       /////////////////////////////////// Output Preparation for the Root //////////////////////////////////////
0685       MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0686       timing.outputPreparationRoot[0] = MPI_Wtime();
0687       //MPI probe
0688       for (int i = 1; i < mpiData.num_procs; i++) {
0689         MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]],
0690                  mpiData.numberToSend[i],
0691                  MPI_FLOAT,
0692                  i,
0693                  0,
0694                  MPI_COMM_WORLD,
0695                  MPI_STATUS_IGNORE);
0696       }
0697       timing.outputPreparationRoot[1] = MPI_Wtime();
0698       ////////////////////////////////////////////////////////////////////////////////////////////////
0699     }
0700 
0701     if (a > 0)
0702       calculateTimeDuration(timing, a - 1, mpiData.rank);
0703 
0704     if (mpiData.rank) {
0705       cudaCheck(cudaEventDestroy(timing.start));
0706       cudaCheck(cudaEventDestroy(timing.stop));
0707     }
0708   }
0709   ///////////////////////////// End of Average ////////////////////////
0710   if (mpiData.rank) {
0711     cudaCheck(cudaFree(pointer.dVect1));
0712     cudaCheck(cudaFree(pointer.dVect2));
0713     cudaCheck(cudaFree(pointer.dVect3));
0714   }
0715   ///
0716   bool test = 0;
0717   if (!mpiData.rank)  //Only for root
0718   {
0719     addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking);  //Host is adding vectors too.
0720     test = checkingResultsPrintout(mpiData.mVectChecking,
0721                                    mpiData.mVectWorker3);  //Checking the results, if error then Print out to the user.
0722     if (!test)
0723       exit(-1);
0724   }
0725 
0726   calculateAverageDeviation(timing, average, mpiData.rank);
0727   test = sendAverageToRoot(timing, user, mpiData.rank);
0728   if (test && !mpiData.rank) {
0729     if (saveFile) {
0730       test = saveToFile("dataPart1", timing);
0731 
0732       if (test)
0733         std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl;
0734       else
0735         std::cout << "Error Saving File!!" << std::endl;
0736     }
0737     std::cout << "Done Part " << timing.partChosen << std::endl;
0738   }
0739   return timing;
0740 }
0741 
0742 Timing blockSendPart2(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) {
0743   cleanBuffer(mpiData.mVectWorker3);  //clear each value of vector's elements
0744   timing.partChosen = 2;
0745 
0746   if (mpiData.rank)  //Only for Workers
0747   {
0748     cudaCheck(cudaMallocHost((void **)&pointer.vect1, user.sizeVectorBytes));  //allocate Pinned memory on the Host.
0749     cudaCheck(cudaMallocHost((void **)&pointer.vect2, user.sizeVectorBytes));
0750     cudaCheck(cudaMallocHost((void **)&pointer.vect3, user.sizeVectorBytes));
0751     cudaCheck(cudaMalloc((void **)&pointer.dVect1,
0752                          user.sizeVectorBytes));  //allocate memory space for vector in the global memory of the Device.
0753     cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes));
0754     cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes));
0755   }
0756   ///////////////////////////// Start of Average ////////////////////////
0757   for (int a = 0; a <= average; ++a) {
0758     if (!mpiData.rank)  //Only for root
0759     {
0760       ////////////////////////////////// Input Preparation for Root //////////////////////////////////
0761       timing.inputPreparationRoot[0] = MPI_Wtime();
0762       for (int i = 1; i < mpiData.num_procs; ++i) {
0763         MPI_Send(&mpiData.mVect1[mpiData.displacement[i]],
0764                  mpiData.numberToSend[i],
0765                  MPI_FLOAT,
0766                  i,
0767                  0,
0768                  MPI_COMM_WORLD);  //Tag is 0
0769         MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD);
0770       }
0771       timing.inputPreparationRoot[1] = MPI_Wtime();
0772       /////////////////////////////////////////////////////////////////////////////////////////////////
0773     }
0774 
0775     if (mpiData.rank)  //Only for Workers
0776     {
0777       ////////////////////////////////// Input Preparation for Host //////////////////////////////////
0778       MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0779       timing.inputPreparationHost[0] = MPI_Wtime();
0780 
0781       MPI_Recv(&pointer.vect1[0],
0782                mpiData.numberToSend[mpiData.rank],
0783                MPI_FLOAT,
0784                user.root,
0785                0,
0786                MPI_COMM_WORLD,
0787                MPI_STATUS_IGNORE);
0788       MPI_Recv(&pointer.vect2[0],
0789                mpiData.numberToSend[mpiData.rank],
0790                MPI_FLOAT,
0791                user.root,
0792                0,
0793                MPI_COMM_WORLD,
0794                MPI_STATUS_IGNORE);
0795 
0796       cudaCheck(cudaMemcpy(pointer.dVect1,
0797                            pointer.vect1,
0798                            user.sizeVectorBytes,
0799                            cudaMemcpyHostToDevice));  //copy random vector from host to device.
0800       cudaCheck(cudaMemcpy(pointer.dVect2, pointer.vect2, user.sizeVectorBytes, cudaMemcpyHostToDevice));
0801 
0802       timing.inputPreparationHost[1] = MPI_Wtime();
0803       ///////////////////////////////////////////////////////////////////////////////////////
0804 
0805       cudaCheck(cudaEventCreate(&timing.start));  //initialize Event.
0806       cudaCheck(cudaEventCreate(&timing.stop));
0807 
0808       ///////////////////////////// Operation on Device with respect of Host //////////////////
0809 
0810       int threads = 512;                                  //arbitrary number.
0811       int blocks = (sizeVector + threads - 1) / threads;  //get ceiling number of blocks.
0812       blocks = std::min(blocks, 8);  // cap at 8 blocks, the least that can be expected even on low-end NVIDIA GPUs.
0813 
0814       ////////////////////////// Call Device Kernel //////////////////////////////////
0815       cudaCheck(cudaEventRecord(timing.start));
0816       timing.operationOnDeviceByHost[0] = MPI_Wtime();
0817 
0818       addVectorsGpu<<<blocks, threads>>>(pointer.dVect1,
0819                                          pointer.dVect2,
0820                                          pointer.dVect3,
0821                                          sizeVector,
0822                                          task);  //call the device kernel to add the two vectors and save the result into dVect3.
0823 
0824       cudaCheck(cudaGetLastError());
0825       cudaCheck(cudaDeviceSynchronize());
0826       cudaCheck(cudaEventRecord(timing.stop));
0827 
0828       timing.operationOnDeviceByHost[1] = MPI_Wtime();
0829       /////////////////////////////////////////////////////////////////////////////////////////////
0830 
0831       /////////////////////////////////// Output Preparation for the Host //////////////////////////////////////
0832       timing.outputPreparationHost[0] = MPI_Wtime();
0833 
0834       cudaCheck(cudaMemcpy(
0835           pointer.vect3,
0836           pointer.dVect3,
0837           user.sizeVectorBytes,
0838           cudaMemcpyDeviceToHost));  //copy the summed result vector from Device to Host.
0839 
0840       MPI_Send(&pointer.vect3[0],
0841                mpiData.numberToSend[mpiData.rank],
0842                MPI_FLOAT,
0843                user.root,
0844                0,
0845                MPI_COMM_WORLD);  //Tag is 0
0846 
0847       timing.outputPreparationHost[1] = MPI_Wtime();
0848       ////////////////////////////////////////////////////////////////////////////////////////////////
0849     }
0850 
0851     if (!mpiData.rank)  //Only for root
0852     {
0853       /////////////////////////////////// Output Preparation for the Root //////////////////////////////////////
0854       MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0855       timing.outputPreparationRoot[0] = MPI_Wtime();
0856       //MPI probe
0857       for (int i = 1; i < mpiData.num_procs; i++) {
0858         MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]],
0859                  mpiData.numberToSend[i],
0860                  MPI_FLOAT,
0861                  i,
0862                  0,
0863                  MPI_COMM_WORLD,
0864                  MPI_STATUS_IGNORE);
0865       }
0866       timing.outputPreparationRoot[1] = MPI_Wtime();
0867       ////////////////////////////////////////////////////////////////////////////////////////////////
0868     }
0869 
0870     if (a > 0)
0871       calculateTimeDuration(timing, a - 1, mpiData.rank);
0872 
0873     if (mpiData.rank) {
0874       cudaCheck(cudaEventDestroy(timing.start));
0875       cudaCheck(cudaEventDestroy(timing.stop));
0876     }
0877   }
0878   ///////////////////////////// End of Average ////////////////////////
0879   if (mpiData.rank) {
0880     cudaCheck(cudaFreeHost(pointer.vect1));
0881     cudaCheck(cudaFreeHost(pointer.vect2));
0882     cudaCheck(cudaFreeHost(pointer.vect3));
0883     cudaCheck(cudaFree(pointer.dVect1));
0884     cudaCheck(cudaFree(pointer.dVect2));
0885     cudaCheck(cudaFree(pointer.dVect3));
0886   }
0887 
0888   bool test = 0;
0889   if (!mpiData.rank)  //Only for root
0890   {
0891     addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking);  //Host is adding vectors too.
0892     test = checkingResultsPrintout(mpiData.mVectChecking,
0893                                    mpiData.mVectWorker3);  //Checking the results, if error then Print out to the user.
0894     if (!test)
0895       exit(-1);
0896   }
0897 
0898   calculateAverageDeviation(timing, average, mpiData.rank);
0899   test = sendAverageToRoot(timing, user, mpiData.rank);
0900   if (test && !mpiData.rank) {
0901     if (saveFile) {
0902       test = saveToFile("dataPart2", timing);
0903 
0904       if (test)
0905         std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl;
0906       else
0907         std::cout << "Error Saving File!!" << std::endl;
0908     }
0909     std::cout << "Done Part " << timing.partChosen << std::endl;
0910   }
0911   return timing;
0912 }
0913 
0914 Timing blockSendPart3(MPIData &mpiData, Timing &timing, Pointers &pointer, UserChoises &user) {
0915   cleanBuffer(mpiData.mVectWorker3);  //clear each value of vector's elements
0916   timing.partChosen = 3;
0917 
0918   if (mpiData.rank)  //Only for Workers
0919   {
0920     cudaCheck(cudaMalloc((void **)&pointer.dVect1,
0921                          user.sizeVectorBytes));  //allocate memory space for vector in the global memory of the Device.
0922     cudaCheck(cudaMalloc((void **)&pointer.dVect2, user.sizeVectorBytes));
0923     cudaCheck(cudaMalloc((void **)&pointer.dVect3, user.sizeVectorBytes));
0924   }
0925   ///////////////////////////// Start of Average ////////////////////////
0926   for (int a = 0; a <= average; ++a) {
0927     if (!mpiData.rank)  //Only for root
0928     {
0929       ////////////////////////////////// Input Preparation for Root //////////////////////////////////
0930       timing.inputPreparationRoot[0] = MPI_Wtime();
0931       for (int i = 1; i < mpiData.num_procs; ++i) {
0932         MPI_Send(&mpiData.mVect1[mpiData.displacement[i]],
0933                  mpiData.numberToSend[i],
0934                  MPI_FLOAT,
0935                  i,
0936                  0,
0937                  MPI_COMM_WORLD);  //Tag is 0
0938         MPI_Send(&mpiData.mVect2[mpiData.displacement[i]], mpiData.numberToSend[i], MPI_FLOAT, i, 0, MPI_COMM_WORLD);
0939       }
0940       timing.inputPreparationRoot[1] = MPI_Wtime();
0941       /////////////////////////////////////////////////////////////////////////////////////////////////
0942     }
0943 
0944     if (mpiData.rank)  //Only for Workers
0945     {
0946       ////////////////////////////////// Input Preparation for Host //////////////////////////////////
0947       MPI_Probe(user.root, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
0948       timing.inputPreparationHost[0] = MPI_Wtime();
0949       MPI_Recv(&pointer.dVect1[0],
0950                mpiData.numberToSend[mpiData.rank],
0951                MPI_FLOAT,
0952                user.root,
0953                0,
0954                MPI_COMM_WORLD,
0955                MPI_STATUS_IGNORE);
0956       MPI_Recv(&pointer.dVect2[0],
0957                mpiData.numberToSend[mpiData.rank],
0958                MPI_FLOAT,
0959                user.root,
0960                0,
0961                MPI_COMM_WORLD,
0962                MPI_STATUS_IGNORE);
0963 
0964       // cudaCheck(cudaMemcpy(pointer.dVect1, mpiData.mVectWorker1, user.sizeVectorBytes, cudaMemcpyHostToDevice));  //copy random vector from host to device.
0965       // cudaCheck(cudaMemcpy(pointer.dVect2, mpiData.mVectWorker2, user.sizeVectorBytes, cudaMemcpyHostToDevice));
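      //Unlike Parts 1 and 2, no explicit cudaMemcpy is needed here: the MPI_Recv calls above write
      //straight into device memory, which requires a CUDA-aware MPI build (e.g. Open MPI with the
      //UCX pml shown in the -h examples).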
0966 
0967       timing.inputPreparationHost[1] = MPI_Wtime();
0968       ///////////////////////////////////////////////////////////////////////////////////////
0969 
0970       cudaCheck(cudaEventCreate(&timing.start));  //initialize Event.
0971       cudaCheck(cudaEventCreate(&timing.stop));
0972 
0973       ///////////////////////////// Operation on Device with respect of Host //////////////////
0974 
0975       int threads = 512;                                  //arbitrary number.
0976       int blocks = (sizeVector + threads - 1) / threads;  //get ceiling number of blocks.
0977       blocks = std::min(blocks, 8);  // cap at 8 blocks, the least that can be expected even on low-end NVIDIA GPUs.
0978 
0979       ////////////////////////// Call Device Kernel //////////////////////////////////
0980       cudaCheck(cudaEventRecord(timing.start));
0981       timing.operationOnDeviceByHost[0] = MPI_Wtime();
0982 
0983       addVectorsGpu<<<blocks, threads>>>(pointer.dVect1,
0984                                          pointer.dVect2,
0985                                          pointer.dVect3,
0986                                          sizeVector,
0987                                          task);  //call the device kernel to add the two vectors and save the result into dVect3.
0988 
0989       cudaCheck(cudaGetLastError());
0990       cudaCheck(cudaDeviceSynchronize());
0991       cudaCheck(cudaEventRecord(timing.stop));
0992 
0993       timing.operationOnDeviceByHost[1] = MPI_Wtime();
0994       /////////////////////////////////////////////////////////////////////////////////////////////
0995 
0996       /////////////////////////////////// Output Preparation for the Host //////////////////////////////////////
0997       timing.outputPreparationHost[0] = MPI_Wtime();
0998       //cudaCheck(cudaMemcpy(mpiData.mVectWorker3,pointer.dVect3,user.sizeVectorBytes,cudaMemcpyDeviceToHost));  //copy summing result vector from Device to Host.// Try_Regist(3) delete this
0999 
1000       MPI_Send(&pointer.dVect3[0],
1001                mpiData.numberToSend[mpiData.rank],
1002                MPI_FLOAT,
1003                user.root,
1004                0,
1005                MPI_COMM_WORLD);  //Tag is 0
1006       timing.outputPreparationHost[1] = MPI_Wtime();
1007       ////////////////////////////////////////////////////////////////////////////////////////////////
1008     }
1009 
1010     if (!mpiData.rank)  //Only for root
1011     {
1012       /////////////////////////////////// Output Preparation for the Root //////////////////////////////////////
1013       MPI_Probe(MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
1014       timing.outputPreparationRoot[0] = MPI_Wtime();
1015       //MPI probe
1016       for (int i = 1; i < mpiData.num_procs; i++) {
1017         MPI_Recv(&mpiData.mVectWorker3[mpiData.displacement[i]],
1018                  mpiData.numberToSend[i],
1019                  MPI_FLOAT,
1020                  i,
1021                  0,
1022                  MPI_COMM_WORLD,
1023                  MPI_STATUS_IGNORE);
1024       }
1025       timing.outputPreparationRoot[1] = MPI_Wtime();
1026       ////////////////////////////////////////////////////////////////////////////////////////////////
1027     }
1028 
1029     if (a > 0)
1030       calculateTimeDuration(timing, a - 1, mpiData.rank);
1031 
1032     if (mpiData.rank) {
1033       cudaCheck(cudaEventDestroy(timing.start));
1034       cudaCheck(cudaEventDestroy(timing.stop));
1035     }
1036   }
1037   ///////////////////////////// End of Average ////////////////////////
1038   if (mpiData.rank) {
1039     cudaCheck(cudaFree(pointer.dVect1));
1040     cudaCheck(cudaFree(pointer.dVect2));
1041     cudaCheck(cudaFree(pointer.dVect3));
1042   }
1043   ///
1044   bool test = 0;
1045   if (!mpiData.rank)  //Only for root
1046   {
1047     addVectorsHost(mpiData.mVect1, mpiData.mVect2, mpiData.mVectChecking);  //Host is adding vectors too.
1048     test = checkingResultsPrintout(mpiData.mVectChecking,
1049                                    mpiData.mVectWorker3);  //Checking the results, if error then Print out to the user.
1050     if (!test)
1051       exit(-1);
1052   }
1053 
1054   calculateAverageDeviation(timing, average, mpiData.rank);
1055   test = sendAverageToRoot(timing, user, mpiData.rank);
1056   if (test && !mpiData.rank) {
1057     if (saveFile) {
1058       test = saveToFile("dataPart3", timing);
1059 
1060       if (test)
1061         std::cout << "Done Part " << timing.partChosen << " And File saved" << std::endl;
1062       else
1063         std::cout << "Error Saving File!!" << std::endl;
1064     }
1065     std::cout << "Done Part " << timing.partChosen << std::endl;
1066   }
1067   return timing;
1068 }
1069 
1070 void printTable(std::vector<Timing> &timing, bool standerDeviationPrint) {
1071   const std::string inPrepatRoot = " Duration Time Read Input Preparation On Root ";
1072   const std::string inPrepatHost = " Duration Time Read Input Preparation On Host ";
1073   const std::string timeCpuR = " Duration Time operation on Root point View   ";
1074   const std::string timeCpu = " Duration Time operation on Host point View   ";
1075   const std::string timeGpu = " Duration Time operation on Device point View ";
1076   const std::string outPrepatRoot = " Duration Time Read Output Preparation On Root";
1077   const std::string outPrepatHost = " Duration Time Read Output Preparation On Host";
1078 
1079   const std::string averageTime = " AverTime ";
1080   const std::string standerDeviation = " StDeviation ";
1081   const std::string nameTiming = " Name Timing ";
1082   const std::string partsNumberall = "Part ";
1083 
1084   int totalFix = 0;
1085 
1086   if (standerDeviationPrint) {
1087     totalFix = timeGpu.size() + timing.size() * (averageTime.size() + standerDeviation.size() + 3);
1088   } else {
1089     totalFix = timeGpu.size() + timing.size() * (averageTime.size() + 3);
1090   }
1091 
1092   std::cout.flags(std::ios::fixed | std::ios::showpoint);
1093   std::cout.precision(4);
1094 
1095   std::cout << '\n';
1096   std::cout.width(totalFix);
1097   std::cout.fill('-');
1098   std::cout << '-' << '\n';
1099   std::cout.fill(' ');
1100 
1101   std::cout << "|";
1102   std::cout.width((timeGpu.size() - nameTiming.size()) / 2);
1103   std::cout.fill(' ');
1104   std::cout << " ";
1105   std::cout << nameTiming;
1106   std::cout.width((timeGpu.size() - nameTiming.size()) / 2);
1107   std::cout.fill(' ');
1108   std::cout << " ";
1109   std::cout << "  |";
1110 
1111   for (unsigned int i = 0; i < timing.size(); ++i) {
1112     if (standerDeviationPrint) {
1113       std::cout.width(((averageTime.size() + standerDeviation.size()) - partsNumberall.size() + 1) / 2);
1114     }  //9
1115     else {
1116       std::cout.width(((averageTime.size()) - partsNumberall.size()) / 2);
1117     }  //2
1118 
1119     std::cout << " ";
1120     std::cout << partsNumberall << timing[i].partChosen;
1121 
1122     if (standerDeviationPrint) {
1123       std::cout.width(((averageTime.size() + standerDeviation.size()) - partsNumberall.size() + 1) / 2);
1124     }  //9
1125     else {
1126       std::cout.width(((averageTime.size()) - partsNumberall.size()) / 2);
1127     }
1128     //2
1129     std::cout << " ";
1130     std::cout << "|";
1131   }
1132 
1133   std::cout << '\n';
1134   std::cout << "|";
1135   std::cout.width(inPrepatHost.size() + 3);
1136   std::cout.fill(' ');
1137   std::cout << "|";
1138 
1139   for (unsigned int i = 0; i < timing.size(); ++i) {
1140     std::cout << averageTime;
1141     std::cout << "|";
1142     if (standerDeviationPrint) {
1143       std::cout << standerDeviation;
1144       std::cout << "|";
1145     }
1146   }
1147 
1148   newLineTitle(totalFix, inPrepatRoot);
1149   printResultEach(timing, 1, standerDeviationPrint);
1150 
1151   newLineTitle(totalFix, inPrepatHost);
1152   printResultEach(timing, 2, standerDeviationPrint);
1153 
1154   newLineTitle(totalFix, timeCpuR);
1155   printResultEach(timing, 3, standerDeviationPrint);
1156 
1157   newLineTitle(totalFix, timeCpu);
1158   printResultEach(timing, 4, standerDeviationPrint);
1159 
1160   newLineTitle(totalFix, timeGpu);
1161   printResultEach(timing, 5, standerDeviationPrint);
1162 
1163   newLineTitle(totalFix, outPrepatRoot);
1164   printResultEach(timing, 6, standerDeviationPrint);
1165 
1166   newLineTitle(totalFix, outPrepatHost);
1167   printResultEach(timing, 7, standerDeviationPrint);
1168 
1169   std::cout << '\n';
1170   std::cout.width(totalFix);
1171   std::cout.fill('-');
1172   std::cout << '-' << '\n';
1173   std::cout.fill(' ');
1174 }
1175 int getNumberofDigits(double number) { return ((int)log10(number) + 1) + 4; }
1176 void newLineTitle(int line, const std::string &title) {
1177   std::cout << '\n';
1178   std::cout.width(line);
1179   std::cout.fill('-');
1180   std::cout << '-' << '\n';
1181   std::cout.fill(' ');
1182 
1183   std::cout << "| ";
1184   std::cout << title;
1185   std::cout << " |";
1186 }
1187 void printResultEach(std::vector<Timing> &timing, int type, bool standerDeviationPrint) {
1188   int averageTimeWidth = 10;
1189   int standerDeviationWidth = 13;
1190 
1191   for (unsigned int i = 0; i < timing.size(); ++i) {
1192     if (type == 1) {
1193       std::cout.width(averageTimeWidth);
1194       std::cout.fill(' ');
1195       std::cout << timing[i].timeInputPreparationRoot[average];
1196       std::cout << "|";
1197       if (standerDeviationPrint) {
1198         std::cout.width(standerDeviationWidth);
1199         std::cout.fill(' ');
1200         std::cout << timing[i].timeInputPreparationRoot[average + 1];
1201         std::cout << "|";
1202       }
1203     } else if (type == 2) {
1204       std::cout.width(averageTimeWidth);
1205       std::cout.fill(' ');
1206       std::cout << timing[i].averageResults[0];
1207       std::cout << "|";
1208       if (standerDeviationPrint) {
1209         std::cout.width(standerDeviationWidth);
1210         std::cout.fill(' ');
1211         std::cout << timing[i].averageResults[1];
1212         std::cout << "|";
1213       }
1214     } else if (type == 3) {
1215       std::cout.width(averageTimeWidth);
1216       std::cout.fill(' ');
1217       std::cout << timing[i].timeOperationOnDeviceByRootHost[average];
1218       std::cout << "|";
1219       if (standerDeviationPrint) {
1220         std::cout.width(standerDeviationWidth);
1221         std::cout.fill(' ');
1222         std::cout << timing[i].timeOperationOnDeviceByRootHost[average + 1];
1223         std::cout << "|";
1224       }
1225     } else if (type == 4) {
1226       std::cout.width(averageTimeWidth);
1227       std::cout.fill(' ');
1228       std::cout << timing[i].averageResults[2];
1229       std::cout << "|";
1230       if (standerDeviationPrint) {
1231         std::cout.width(standerDeviationWidth);
1232         std::cout.fill(' ');
1233         std::cout << timing[i].averageResults[3];
1234         std::cout << "|";
1235       }
1236     } else if (type == 5) {
1237       std::cout.width(averageTimeWidth);
1238       std::cout.fill(' ');
1239       std::cout << timing[i].averageResults[6];
1240       std::cout << "|";
1241       if (standerDeviationPrint) {
1242         std::cout.width(standerDeviationWidth);
1243         std::cout.fill(' ');
1244         std::cout << timing[i].averageResults[7];
1245         std::cout << "|";
1246       }
1247     } else if (type == 6) {
1248       std::cout.width(averageTimeWidth);
1249       std::cout.fill(' ');
1250       std::cout << timing[i].timeOutputPreparationRoot[average];
1251       std::cout << "|";
1252       if (standerDeviationPrint) {
1253         std::cout.width(standerDeviationWidth);
1254         std::cout.fill(' ');
1255         std::cout << timing[i].timeOutputPreparationRoot[average + 1];
1256         std::cout << "|";
1257       }
1258     } else if (type == 7) {
1259       std::cout.width(averageTimeWidth);
1260       std::cout.fill(' ');
1261       std::cout << timing[i].averageResults[4];
1262       std::cout << "|";
1263       if (standerDeviationPrint) {
1264         std::cout.width(standerDeviationWidth);
1265         std::cout.fill(' ');
1266         std::cout << timing[i].averageResults[5];
1267         std::cout << "|";
1268       }
1269     }
1270   }
1271 }
1272 bool saveToFile(const std::string &name, const Timing &timing) {
1273   std::ofstream file(name + ".txt", std::ios::out | std::ios::app);
1274 
1275   if (!file.is_open()) {
1276     std::cout << "\nCannot open or create the file!" << std::endl;
1277     return 0;
1278   }
1279 
1280   file << sizeVector << std::endl;
1281   file << average << std::endl;
1282   file << task << std::endl;
1283   file << timing.timeInputPreparationRoot[average] << " " << timing.timeInputPreparationRoot[average + 1] << std::endl;
1284   file << timing.averageResults[0] << " " << timing.averageResults[1] << std::endl;
1285   file << timing.timeOperationOnDeviceByRootHost[average] << " " << timing.timeOperationOnDeviceByRootHost[average + 1]
1286        << std::endl;
1287   file << timing.averageResults[2] << " " << timing.averageResults[3] << std::endl;
1288   file << timing.averageResults[6] << " " << timing.averageResults[7] << std::endl;
1289   file << timing.timeOutputPreparationRoot[average] << " " << timing.timeOutputPreparationRoot[average + 1]
1290        << std::endl;
1291   file << timing.averageResults[4] << " " << timing.averageResults[5] << std::endl;
1292 
1293   file.close();
1294   if (!file.good()) {
1295     std::cout << "\n*ERROR While Writing The " + name + " file!!" << std::endl;
1296     return 0;
1297   }
1298   return 1;
1299 }
1300 void printHelp(void) {
1301   int rank = MPI::COMM_WORLD.Get_rank();
1302   if (!rank) {
1303     std::cout << "\n\n\t**************************************\n";
1304     std::cout << "\t* This is a Help for Command Options *";
1305     std::cout << "\n\t**************************************\n";
1306     std::cout << "\n\tYou as a user, can choose two ways to run the program:\n";
1307     std::cout << "\n\t1) mpirun -np <number of Process/ors> mpiCudaGeneric -s <size of Vector> -t <number of task> "
1308                  "-a <average size> -p <part to run>\n";
1309     std::cout << "\n\t2) cmsenv_mpirun -np <number of Process/ors> mpiCudaGeneric -s <size of Vector> -t <number of "
1310                  "task> -a <average size> -p <part to run>\n";
1311     std::cout << "\n\t[-np] is for number of processes or processors that you would like to run.";
1312     std::cout
1313         << "\n\t[-s] is the size of vector that you would like to send, the type is float and there are two vectors.";
1314     std::cout << "\n\t[-t] is the number of times the task is repeated on the Device (GPU) side.";
1315     std::cout << "\n\t[-a] is the number of times the chosen part is repeated (for averaging).";
1316     std::cout << "\n\t[-p] is the choice of what part to run in the program.";
1317     std::cout << "\n\t[-q] is to print the Standard Deviation.";
1318     std::cout << "\n\t[-f] is to save the results into a file for each part.";
1319     std::cout << "\n\n\tExample for only local Machine: ";
1320     std::cout << "\n\tcmsenv_mpirun -np 2 mpiCudaGeneric -p1 -s200 -t1 -a1\n";
1321     std::cout << "\n\tExample for two Machines connected: ";
1322     std::cout
1323         << "\n\tcmsenv_mpirun -H <machine Name as Root>,<machine Name as Host> -np 2 mpiCudaGeneric -p1 -s200 -t1 -a1";
1324     std::cout << "\n\tExample for two Machines connected Using ucx: ";
1325     std::cout << "\n\tcmsenv_mpirun -H <machine Name as Root>,<machine Name as Host> -np 2 -mca pml ucx -- "
1326                  "mpiCudaGeneric -p1 -s200 -t1 -a1";
1327     std::cout << "\n\n\tFor the Parts, this program currently implements 3 Parts:";
1328     std::cout << "\n\t1)The Root, which does not have a GPU, uses MPI Blocking send and receive to talk to the Host; "
1329                  "the Host, which has a GPU, then:";
1330     std::cout << "\n\t  uses cudaMalloc and copies the received values to the GPU side. Next, the GPU does the computation.";
1331     std::cout << "\n\t  Finally, the Host copies the results back from the GPU and sends them to the Root using MPI "
1332                  "Blocking Send.\n\n";
1333   }
1334 }