/////////////////////////////////////////////////////////////////////////////////////////////////
//
// FilterHpc.cc: 2-dimensional grid simulation program
// to support High Performance Computing benchmark study.
// Copyright 2006 National Center for Ecological Analysis
// and Synthesis (NCEAS)
// Author: Rick Reeves, NCEAS Scientific Programmer
//
// This program defines a 2-dimensional grid, then initializes
// the grid to a set of random numbers, then finally repalces
// each random value with the integer average of the cell
// and its surrounding FILTER_DIM * FILTERDIM  neighbors.
// The program employs Matrix 'wrapping' logic to compute the
// average for cells along the outer edges of the matrix.
// the program also uses a Clock object to generate random number
// series based on system time and also to time the program execution.
// 
// SimHpc demonstrates the High-Performance Computing (HPC) features
// available to C/Fortran/Java programmers on Linux machines: 
// 
// MPI (Message Passing Interface): Applications Progamming Interface
// (API) that enables programs (or parts of programs) to utilize multiple
// CPUs simultaneously as a means of decreasing execution time.
//
// Shared Memory: Supported under the Linux Interprocess Communication (IPC)
// API, allows multiple UNIX processes (such as used by MPI to distribute
// an application among multiple CPUs) to access the same section of physical
// computer memory. 
//
// This program's results best used in conjunction with the
// companion program SimStd.cc, which solves the same problem
// WITHOUT using multiple processors and shared memory features
// 
// Special note: This version of SimHpc is hard-coded to use 4 processors. 
// The next versions will adapt itself to different numbers of processors,
// as specified by the MPI command that is used to execute the program:
// 
//  %  mpiexec -n N (N is the desired number of processors)
//
// Another note: This program must be compiled using the mpiCC command under Linux 
// on a machine that has the MPI API installed.
//
/////////////////////////////////////////////////////////////////////////////////////////////////
//
// These three include files support Shared Memory and Interprocess Communications
//
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
//
// this include file supports the Message Passing Interface (MPI) API.
// MPI is used to execute this program 'in parallel' on multiple CPUs 
// (on computers equipped with them)
//
#include "mpi.h"
//
// this include file supports the user-defined C++ timer clock object
//
#include "Clock.h"
//
#define GRID_DIM     1000            // size of 2 dim processing grid
#define SUB_GRID_DIM (GRID_DIM / 2) // half the grid size
#define FILTER_DIM     9            // size of the filtering kernal, rows and columns.

#define FILTER_SIZE (FILTER_DIM * FILTER_DIM)
 
#define MAX_FILTER_ITERATIONS 100 // number of times to repeat the filtering operation 

#define MAX_PRINT_GRID_ROW 25      // upper limit on portion of grid that can be printed
#define MAX_PRINT_GRID_COL 25

#undef  TIME_INPUT_MATRIX_GEN
#define TIME_INPUT_MATRIX_GEN 1
//
// some data objects that support the program..
//
typedef struct  // defines the boundaries of the sub-grid allocated to each processor
{
   int iLLX; 
   int iULX; 
   int iLLY;
   int iULY;
   int iXyDim;
} 
QUAD_ASSIGN;

typedef struct // stores memory handle 'keys' used by program to access Shared Memory
{
   key_t key1;
   key_t key2;
}
RETURN_TYPE;

typedef struct // this may go away
{
  key_t key;
  char  *shmPtr;
  int   iBlockSize;
}
SHARED_MEMORY_HANDLE;

typedef enum  // specifies whether new memory is being allocated or existing memory accessed
{
  eAllocNewMem = 1,
  eUseExistMem
}
SHARED_MEM_OPTION;

char *GetSharedMemoryBlock(key_t key, SHARED_MEM_OPTION eOption);

QUAD_ASSIGN Quads[4] = {{0,(SUB_GRID_DIM-1),0,(SUB_GRID_DIM-1),SUB_GRID_DIM},
                        {(SUB_GRID_DIM),(GRID_DIM-1),0,(SUB_GRID_DIM-1),SUB_GRID_DIM},
                        {(SUB_GRID_DIM),(GRID_DIM-1),(SUB_GRID_DIM),(GRID_DIM-1),SUB_GRID_DIM},
                        {0,(SUB_GRID_DIM-1),(SUB_GRID_DIM),(GRID_DIM-1),SUB_GRID_DIM}};
//
// For passing information between MPI parent and child processes.
//
MPI_Aint displacements[2] = {0,sizeof(int)};
MPI_Datatype rectStruct;
MPI_Datatype types[2] = {MPI_INT,MPI_INT};
const int iNumFields = 2;
int blocklengths[iNumFields] = {1,1};

key_t GetNewKey(char * sPathName);

SHARED_MEMORY_HANDLE &GetSharedMemoryBlock(int iBlockSizeInBytes);

void PrintGrid(int iProcessId,key_t key);

static SHARED_MEMORY_HANDLE hOrigGrid;
//
// amount of shared memory (in bytes) to allocate in each shared memory block...
//
static int iShmSize;

char sMsgBuf[180];

Clock myClock;

int main(int argc,char *argv[])
{
   MPI_Status status;
   int iRank,iSize,iProcessId,iNprocesses,iBegX,iBegY,iEndX,iEndY,iDim,iDummyRetVal,
       iCtr,jCtr,kCtr,iCtr2,jCtr2,iPtr,jPtr,iSum,iHalfFilterDim,tag,src,dest;
   unsigned long lElapsedTimeSec,lElapsedTimeMin;
   float fAvg;

   MPI_Init(&argc,&argv);
   MPI_Comm_rank(MPI_COMM_WORLD,&iRank);
   MPI_Comm_size(MPI_COMM_WORLD,&iSize);

   RETURN_TYPE sRetData;
//
// initialize buffer to something, not important what.
//
// SHARED MEMORY STUFF
//
   key_t key;
   char *shm;
   int (*sOrigGridPtr)[(GRID_DIM)]; // pointer to the entire grid
   int (*sFiltGridPtr)[(GRID_DIM)]; // pointer to block of one quadrant ints; use as matrix

   iProcessId  = iRank;
   iNprocesses = iSize;

   iShmSize = GRID_DIM * GRID_DIM * sizeof(int);
//
fprintf(stdout,"top of main: iProcessID: %d iNProcesses: %d\n",iProcessId,iNprocesses);
fflush(stdout);
// 
   MPI_Type_struct(iNumFields,blocklengths,displacements,types,&rectStruct);
   MPI_Type_commit(&rectStruct);
//
// the processing code goes here
// This will be replicated on all four processors
//
   if (iProcessId == 0)
   {
//
// get an execution start time
//
      myClock.GetStartingTime();
//
// establish the shared memory, then send a message to the other proceses
// that it is time to begin processing. 
//
// note: other processes are blocked with a recv() call, waiting for a message from proc 0....
// 
// just in case, remove the memory segment id
//
// note: this key will be used by the other processes to communicate with the shared memory!
// 
       if ((key = GetNewKey("/dev/null")) == (key_t) -1)
       {
           printf("main: GetNewKey fails to return valid key: %d\n",hOrigGrid.key);
          MPI_Finalize();
          exit(1);
       }
//
// Precaution: delete the shared memory associated with 'key'
//
//       sprintf(sCommand,"ipcrm -M %d",key);
//       system(sCommand);
//
// create and attach to a NEW block of shared memory for the 'input' grid: 
//
       if ((shm = GetSharedMemoryBlock(key, eAllocNewMem)) == NULL)
       {
          fprintf(stdout,"GetSharedMemoryBlock call: Failed to allocate new memory block\n");
          MPI_Finalize();
          exit(1);
       }  
       else
       {
//
// initialize the transfer structure to pass the keys to the child processes
// with the keys, they can attach to the shared memory area
//
           sRetData.key1 = key;
//
// since we are about to reuse the 'shm' pointer, lets initialise
// the input grid pointer to the block of memory just allocated
// thus, process 0 will be connected.
//
           sOrigGridPtr = (int (*)[GRID_DIM]) shm;
       }
//
// create and attach to a NEW block of shared memory for the 'filtered' grid:
// need to get a new key first....
//
       if ((key = GetNewKey("/dev/null")) == (key_t) -1)
       {
           printf("main: GetNewKey fails to return valid key: %d\n",hOrigGrid.key);
          MPI_Finalize();
          exit(1);
       }
       if ((shm = GetSharedMemoryBlock(key, eAllocNewMem)) == NULL)
       {
          fprintf(stdout,"GetSharedMemoryBlock call: Failed to allocate new memory block\n");
          MPI_Finalize();
          exit(1);
       }
       else
       {
//
// initialize the transfer structure to pass to the child processes
//
           sRetData.key2 = key;
           sFiltGridPtr = (int (*)[GRID_DIM]) shm;
       }
//
// get the aproximate execution start time from the O/S
//
#ifdef TIME_INPUT_MATRIX_GEN
   myClock.GetStartingTime();
#endif
       for (iCtr = 0; iCtr < GRID_DIM; iCtr++)
       {
          for (jCtr = 0; jCtr < GRID_DIM; jCtr++)
          {
             sOrigGridPtr[iCtr][jCtr] = (int)(GRID_DIM * (myClock.GetNextRandValFromCache()/(RAND_MAX + 1.0)));
          }
       }
      fprintf(stdout,"Printing the initial matrix......\n");
      fflush(stdout);
      PrintGrid(iProcessId,(key_t)sRetData.key1);
      fprintf(stdout,"there WAS the initial matrix......\n");
      fflush(stdout);
//
#ifdef TIME_INPUT_MATRIX_GEN
//
// Elapsed time calculation: uncomment this to see how long random # matrix takes to generate
// 
   myClock.GetCurrentTime();
   lElapsedTimeSec = myClock.GetElapsedTimeInSec();
   lElapsedTimeSec = lElapsedTimeSec;
   lElapsedTimeMin = lElapsedTimeSec / 60;
   if (lElapsedTimeMin > 0)
   lElapsedTimeSec = lElapsedTimeSec % lElapsedTimeMin;

   sprintf(sMsgBuf,"(Hpc Version) elapsed time to generate input matrix: %ld minutes %ld seconds.\n",
           lElapsedTimeMin,lElapsedTimeSec);
#endif
//
// OK, now start processing...
//
       for (dest = 1; dest < 4; dest++)
       {
           fprintf(stdout,"BEFORE processing starts: proc 0 sending key %d to proc %d\n.....",sRetData.key1,dest);
           fflush(stdout);
           tag  = 0;
//
// now we really dont send any useful info via MPI, we just unblock the other processes..
//
           MPI_Send(&sRetData,1,rectStruct,dest,tag,MPI_COMM_WORLD); 
/* 
   note: how to bcast to all the other processes at once? 
*/
           fprintf(stdout,"BEFORE processing...message sent to proc %d.\n",dest);
           fflush(stdout);
       }
   }
   else /* the other processes */
   {
       fprintf(stdout,"BEFORE processing: proc %d waiting to recieve.....\n",iProcessId);
       fflush(stdout);
       tag = 0; // recieving from the host (0) process.
       src = 0;
       MPI_Recv(&sRetData,1,rectStruct,src,tag,MPI_COMM_WORLD,&status); /* waiting for the pointer and a special tag */
       fprintf(stdout,"BEFORE processing.....proc %d has RECIEVED key: %d starting processing\n",iProcessId,sRetData.key1);
       fflush(stdout);
//
// Attach to the shared memory segments that were created in process 0
//
       if ((shm = GetSharedMemoryBlock(sRetData.key1, eUseExistMem)) == NULL)
       {
          fprintf(stdout,"GetSharedMemoryBlock call: Failed to connect to existing memory block\n");
          fflush(stdout);
          MPI_Finalize();
          exit(1);
       }
       else
       {
//
// initialize the transfer structure to pass to the child processes
//
           sOrigGridPtr = (int (*)[GRID_DIM]) shm;
       }
       if ((shm = GetSharedMemoryBlock(sRetData.key2, eUseExistMem)) == NULL)
       {
          fprintf(stdout,"GetSharedMemoryBlock call: Failed to connect to existing memory block\n");
          fflush(stdout);
          MPI_Finalize();
          exit(1);
       }
       else
       {
//
// initialize the transfer structure to pass to the child processes
//
           sFiltGridPtr = (int (*)[GRID_DIM]) shm;
       }
   }
//
//  begin the actual processing now that everyone has connected to the shared memory area
// 
// create the 'virtual' 2 dim array ptr
//
   fprintf(stdout,"DURING: process %d starts grid processing..\n",iProcessId);
   fflush(stdout);
   iDim  = Quads[iProcessId].iXyDim;
   iBegX = Quads[iProcessId].iLLX;
   iEndX = Quads[iProcessId].iULX;
   iBegY = Quads[iProcessId].iLLY;
   iEndY = Quads[iProcessId].iULY;

   fprintf(stdout,"process %d, iBegX: %d iBegY: %d iEndX = %d iEndY = %d\n",iProcessId,iBegX,iBegY,iEndX,iEndY);
   fflush(stdout);
   fprintf(stdout,"before assignment: ProcessId: %d: ULC of grid: %ul / %d\n",iProcessId,&sOrigGridPtr[0][0],sOrigGridPtr[0][0]);
   fflush(stdout);
//
// run the convolution filter on the appropriate quadrant of the grid
//
   iHalfFilterDim = ((FILTER_DIM - 1) / 2);
   for (kCtr = 0; kCtr < MAX_FILTER_ITERATIONS; kCtr++)
   {
      if (!(kCtr % 100))
      {
          printf("proc %d generating FILTERED matrix iteration %d..\n",iProcessId,kCtr);
          fflush(stdout);
      }
      for (iCtr = iBegX; iCtr <= iEndX; iCtr++)
      {
//         if (iCtr == iBegX || !(iCtr % 20))
//         {
//            fprintf(stdout,"generating FILTERED matrix row %d...\n",iCtr);
//            fflush(stdout);
//         }
         for (jCtr = iBegY; jCtr <= iEndY; jCtr++)
         {
//
// the convolution is done for each cell: each cell replaced by
// the average of its FILT_GRID_SIZE square neighborhood.
//
            iSum = 0;
            fAvg = 0.0;
//
// iPtr is the URC of the local neighborhood
//
            iPtr = iCtr - iHalfFilterDim;
            if (iPtr < 0)
               iPtr = GRID_DIM - iHalfFilterDim;
            for (iCtr2 = 1; iCtr2 <= FILTER_DIM; iCtr2++)
            {
               jPtr = jCtr - iHalfFilterDim;
               if (jPtr < 0)
                  jPtr = GRID_DIM - iHalfFilterDim;
               for (jCtr2 = 1; jCtr2 <= FILTER_DIM; jCtr2++)
               {
                  iSum += sOrigGridPtr[iPtr][jPtr];
                  jPtr++;
                  if (jPtr == GRID_DIM)
                     jPtr = 0;
               }
               iPtr++;
               if (iPtr == GRID_DIM)
                 iPtr = 0;
            }
//
// convolution filter ends here
//
//            fprintf(stdout,"process %d assigning value to filtGrid[%d][%d]: %d\n",
//                    iProcessId,iCtr,jCtr,int(iSum / (float)FILTER_SIZE));
//           fflush(stdout);
              sFiltGridPtr[iCtr][jCtr] = int(iSum / (float)FILTER_SIZE);
//              sFiltGridPtr[iCtr][jCtr] = (iProcessId + 1);
         }
      }
   }
//
// Set up a blocking message to make Process 0 wait for the other processes to finish,
// then print out the grid, which should be complete.
//
   if (iProcessId > 0)
   {
      dest = 0;
      tag = 0;
      fprintf(stdout,"AFTER processing: process %d SENDING message to proc 0...\n",iProcessId);
      fflush(stdout);
      MPI_Send(&iDummyRetVal,1,MPI_INT,dest,tag,MPI_COMM_WORLD); 
//      fprintf(stderr,"process %d SENT message to proc 0\n",iProcessId);
   }
   else if (iProcessId == 0)
   {
//
// look for message from all other processes. When they all report in, print the results and end the program.
//
      fprintf(stderr,"********* AFTER processing: Proc %d waiting for messages from others....\n",iProcessId);
      for (tag = 0,iCtr = 1;iCtr < iNprocesses;iCtr++)
         MPI_Recv(&iDummyRetVal,1,MPI_INT,iCtr,tag,MPI_COMM_WORLD,&status); /* waiting for the pointer and a special tag */

      fprintf(stdout,"********* AFTER processing: Process %d has ALL subprocesses reporing in...\n",iProcessId);
      fflush(stdout);
//
// detach from the shared memory segment - one time for all processes
//
      shmdt(shm);
//
// print out the matrix
//
      fprintf(stdout,"AFTER processing: in process %d and FINISHED. here is the matrix......\n",iProcessId);
      fflush(stdout);
      PrintGrid(iProcessId,key);
      fprintf(stdout,"....THERE WAS the matrix - done.\n",iProcessId);
      fflush(stdout);
//
// Elapsed time calculation. Print out time to create input matrix if it was computed.
//
#ifdef TIME_INPUT_MATRIX_GEN
    printf("%s\n",sMsgBuf);
#endif   
      myClock.GetCurrentTime();
      lElapsedTimeSec = myClock.GetElapsedTimeInSec();
      lElapsedTimeSec = lElapsedTimeSec;
      lElapsedTimeMin = lElapsedTimeSec / 60;
      if (lElapsedTimeMin > 0)
         lElapsedTimeSec = lElapsedTimeSec % lElapsedTimeMin;
      fprintf(stdout,"(HPC Version) Program is done. Total elapsed time is %ld minutes %ld seconds.\n",
           lElapsedTimeMin,lElapsedTimeSec);
      fflush(stdout);
   }
   MPI_Finalize();
   return(0);
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// 
// PrintGrid() function prints a 2-dimensional grid to the terminal screen.
// Copyright 2006 National Center for Ecological Analysis 
// and Synthesis (NCEAS) 
// Author: Rick Reeves, NCEAS Scientific Programmer
//
// Input argument: a pointer (scaled to a one-dimenisional array) to the 
// start of the desired grid.
// Note: sometimes the grid will be larger than can be displayed on the 
// screen, so we print out only the upper left-hand MAX_PRINT_GRID_ROW(COL)
// portion of the matrix, or the entire matrix, whichever is smaller.
//
/////////////////////////////////////////////////////////////////////////////////////////////////
//
void PrintGrid(int iProcessId,key_t key)
{
   int iCtr, jCtr,iPrintRowLim,iPrintColLim;
   int shmid;
   char *shm;
   int (*vptr)[GRID_DIM]; // pointer to block of 10 ints; use as matrix

   fprintf(stdout,"PrintGrid printing the matrix:\n");
   fflush(stdout);
//
//  Attach to the shared memory segment that was created in process 0
//
   if ((shmid = shmget(key, iShmSize, 0777)) < 0)
   {
       printf ("PrintGrid (process %d): The shmget call failed, error number = %d\n",iProcessId,errno);
       perror("shmget");
       MPI_Finalize();
       exit(1);
   }
   if ((shm = (char*)shmat(shmid, (char *)NULL, 0)) == (char *) -1)
   {
       perror("shmat");
       printf ("PrintGrid (process %d): The shmat call failed, error number = %d\n",iProcessId,errno);
       exit(1); // find a better way to kill this
   }
//
// set the upper bound on grid output.
//
   if (GRID_DIM > MAX_PRINT_GRID_ROW)
      iPrintRowLim = MAX_PRINT_GRID_ROW;
   else 
      iPrintRowLim = GRID_DIM;
   if (GRID_DIM > MAX_PRINT_GRID_COL)
      iPrintColLim = MAX_PRINT_GRID_COL;
   else 
      iPrintColLim = GRID_DIM;
//
   vptr = (int (*)[GRID_DIM]) shm;

   for (iCtr = 0; iCtr <  iPrintRowLim; iCtr++)
   {
      for (jCtr = 0; jCtr <  iPrintColLim; jCtr++)
      {
        fprintf(stdout,"%3d ",vptr[iCtr][jCtr]);
        fflush(stdout);
      }
      fprintf(stdout,"\n");
      fflush(stdout);
   }
   fprintf(stdout,"PrintGrid finished printing the matrix\n");
//
// detach from the shared memory
//
   shmdt(shm);
//
   fflush(stdout);
}
//
// routine generates a new key used to connect to shared memory resources
//
/////////////////////////////////////////////////////////////////////////////////////////////////
// 
// GetNewKey() function generates a Unix shared memory key
// Copyright 2006 National Center for Ecological Analysis 
// and Synthesis (NCEAS) 
// Author: Rick Reeves, NCEAS Scientific Programmer
//
// Input argument: sPathName (char string): valid path name required by ftok() funciton
// 
// Returns: A valid and unused shared memory key.
//
///////////////////////////////////////////////////////////////////////////////////////////
//
key_t GetNewKey(char * sPathName)
{
   static char sCommand[80];
   key_t t_newKey = -1;

   static int  key_id = (int) 'A';

   t_newKey = ftok(sPathName,key_id);
   if ( t_newKey == (key_t)-1 )
   {
       printf("GetNewKey: ftok() creation for /dev/null, %d (key_id) failed...\n",key_id);
   }
   else
   {
      printf("GetNewKey: ftok() created key for /dev/null, %d\n",t_newKey);
      hOrigGrid.key = t_newKey;
   }
//
// Precaution: delete any shared memory associated with 'key'
//
   sprintf(sCommand,"ipcrm -M %d",t_newKey);
   system(sCommand);
//
// increment the key_id seed to the next ascii code. 
// At some point (apx 60 values), reset to the start.
//
   key_id++;
   if (key_id == (int)'z')
     key_id = (int) 'A'; 

   return(t_newKey);
}
//
// Routine allocates a block of shared memory OR connects to an existing block.
//   
/////////////////////////////////////////////////////////////////////////////////////////////////
// 
// GetSharedMemoryBlock() function returns pointer to a valid block of shared memory
// Copyright 2006 National Center for Ecological Analysis 
// and Synthesis (NCEAS) 
// Author: Rick Reeves, NCEAS Scientific Programmer
//
// Input arguments: key_t key: Valid UNIX shared memory key, previously created by ftok()
//
//  SHARED_MEM_OPTION eOption: Indicates whether key already points to valid shared memory block
//                             allocated by shmget(). If not, shmget() is called to allocate memory.
// Returns: Either a valid pointer to shared memory, or NULL if the allocation failed.
//
///////////////////////////////////////////////////////////////////////////////////////////
//
char *GetSharedMemoryBlock(key_t key, SHARED_MEM_OPTION eOption)
{
   int shmid;
   char *shm;

   extern int iShmSize;

   shm = (char *) NULL;
   switch(eOption)
   {
      case eAllocNewMem:
        if ((shmid = shmget(key, iShmSize, IPC_CREAT | IPC_EXCL | 0777)) < 0)
        {
           perror("shmget");
           printf ("\nGetSharedMemoryBlock (eAllocNewMem) The shmget call failed, error number = %d\n",errno);
        }
        else
        {
           printf("\nGetSharedMemoryBlock: The shmget call WORKED\n");
        }
      break;
      case eUseExistMem :
       if ((shmid = shmget(key, iShmSize, 0777)) < 0)
       {
           printf ("\nGetSharedMemoryBlock (eExistMem): The shmget call failed, error number = %d\n",errno);
           perror("shmget");
       }
      break;
   }
   if (shmid >= 0)
   {
//
//  now attach to the specified segment, getting a char pointer to the space. 
//
      if ((shm = (char*)shmat(shmid, (char *)NULL, 0)) == (char *) -1)
      {
         perror("shmat");
         printf ("\nGetSharedMemoryBlock: The shmat call failed, error number = %d\n",errno);
      }
   }
   return (shm);
}
