///////////////////////////////////////////////////////////////////////////////////////////////
//
// Program: MeanFilterMPISharedMemory.pas
//
// Abstract:
//
//  The objective of this  program is to demonstrate the use
//  of Unix Shared Memory and MPI messge passing services
//  (provided by the MPICH2 appliction library) to implement
//  a CPU-intensive algorithm with potentially large memory
//  requirements in a multiprocessor environment.
//
//  This program implements a simple Moving Window filter
//  on a small integer raster grid. Each cell in the grid
//  is replced by the average of the nine surrounding cells
//  (including, for simplicity, the 'target cell'.
//
// Instructions for building and running this program:
//
//  Three steps: 1) Compile the program
//               2) Configure the MPI run-time environment 
//               3) Run the program
//
// 1) Compile the program with this command:
//
//  /usr/bin/fpc -MObjFPC -Scgi -O1 -gl -vewnhi -l 
//  0Fi/data/scientist_share/scientist/FreePascalMPI/ 
//  ww-Fu/data/computer/reeves/FreePascalStuff/mpich2/lib/
//  x86_64-linux/ -Fu/usr/lib/lazarus/0.9.28.2/
//  packager/units/x86_64-linux/
//  -Fu/data/scientist_share/scientist/FreePascalMPI/ -Fu. 
//  -FE/data/scientist_share/scientist/FreePascalMPI/
//  -oMpiSharedMemoryEXE  MeanFilterMPISharedMemory.pas
//
// The script file BuildMeanFilterMpi.sh contains this command;
// just run it to compile this program: 
//
//    user@eos: ./BuildMeanFilterMpi.sh
//
// 2) Configure the MPI run-time environment:
//
// This program is configured to run  on NCEAS 'eos'
// multiprocessor server, which must have the MPICH2 'mpd' 
// daemon running as a background process. Feel free to 
// consult the author of this program for help with configuring
// the MPICH run-time environment. Or, you can check for the
// mpd daemon by entering the command: 
//
//   user@eos: mpdtrace  
//
//   If this command does not return the value: 'eos',
//   enter the command:
//
//   user@eos: mpd --ncpus=4 &
//
//   This will start the mpd daemon, supporting 4 processors, 
//   as a background process. Then, re-enter the 'mpdtrace'
//   command to confirm the status of the mpd daemon. 
//
//  3) Run the program: enter the command:
//
//      mpiexec -n 4 ./MeanFilterMPISharedMemoryEXE
//
// Author: Rick Reeves, Scientific Programmer,
// National Center for Ecological Analysis and Synthesis
// July, 2010
//
///////////////////////////////////////////////////////////////////////////////////////////////
program MeanFilterMPISharedMemory;

{$mode objfpc}{$H+}
{$IFDEF Unix}
{$Linklib c}
{$ENDIF}
// if you get the error: undefined reference to 'pthread_getspcific' then enable the following:
{off $Linklib pthread}

uses
 MPI, IPC; //Baseunix;

type

QuadAssign = record
     iLLX : integer;
     iURX : integer;
     iLLY : integer;
     iURY : integer;
     iXyDim : integer;
end;

type sharedMemKeys = record
     key1 : Tkey;
     key2 : Tkey;
end;

type PInt = ^Integer;

 var
    iRank,
    iSize,
    shmidIn,
    shmidOut,
    iSegSize,
    iTag,
    iSrc,
    iDest,
    iDummyRetVal,
    iHalfWindowDim,
    iBegX,iBegY,iEndX,iEndY,iOneDimIndex,
    iSubBegX,iSubBegY,iSubEndX,iSubEndY,iSubOneDimIndex,
    iOff,iSubNet,iCtr,jCtr,iSubCtr,jSubCtr,iCount,iSum: integer;
//
    fCount, fSum, fMean : real;
//
    key: Tkey;
//
    pSegInPtr,
    pSegFiltPtr: PInt;

// for shmctl (shared memory control) call:

    PShmid_DS: ^TShmid_ds;

// Note: Currently set up for a four processor demo case.
//       A full-scale program would detect and assign the optimal number of processors,
//       or at the very least, accept 'n_proc' as a command line parameter.

Const Quads : array [1..4] of QuadAssign = ((iLLX:1; iURX:4; iLLY:1; iURY:4; iXyDim:4),
                                            (iLLX:5; iURX:8; iLLY:1; iURY:4; iXyDim:4),
                                            (iLLX:1; iURX:4; iLLY:5; iURY:8; iXyDim:4),
                                            (iLLX:5; iURX:8; iLLY:5; iURY:8; iXyDim:4));

Const ftokpath = '.'#0;
Const cGridDim = 64;
Const cGridOneDim = 8;
Const cFilterDim  = 3; // square, three rows, three columns
const cNumProcs   = 4;
const cSizeOfInt  = 4;
const iUpperRandBound = 1000;

// This is the global data structure used to assign
// portions of the solution matrix to the sub processes

var mpiRectStruct: MPI_Datatype;
var vDisplacements: array [1..2] of MPI_Datatype;
var vDatatypes: array [1..2] of MPI_Datatype;
var vBlocklengths: array[1..2] of MPI_Aint;
var mpiStatus: MPI_Status;
var sRetData : sharedMemKeys;

{$IFDEF WINDOWS}{$R MeanFilterMPISharedMemory.rc}{$ENDIF}

 begin

//
// mpi process and data management set up -
//
    vDatatypes[1] := MPI_INT;
    vDatatypes[2] := MPI_INT;

    vBlocklengths[1] := 1;
    vBlocklengths[2] := 1;

    vDisplacements[1] := 0;
    vDisplacements[2] := 4;

    MPI_Init(@argc,@argv);
    MPI_Comm_rank(MPI_COMM_WORLD,@iRank);
    MPI_Comm_size(MPI_COMM_WORLD,@iSize);
    if (iRank = 0) then writeln('*** Very Top of program Our NEW MPI rank= ',iRank,' mysize= ',iSize,' cFilterDim= ',cFilterDim,' ');

// last MPI setup calls, then we move to 'process-specific' sections
    MPI_Type_struct(2,@vBlocklengths,@vDisplacements,@vDatatypes,@mpiRectStruct);
    MPI_Type_commit(@mpiRectStruct);

    if (iRank = 0) then  // The 'Parent' process
    begin

// Here are things that only the 'driving' process (with rank of 0) does
// such as, allocate the shared memory block for all processes

// Obtain blocks of Shared Memory for the 'input' and 'filtered' grids

       iSegSize := cGridDim * cSizeOfInt;
       writeln('**** Process ZERO top: iSegSize: ',iSegSize);

// shared memory blocks allocated for a given 'key' value remain in memory between runs.

       key := ftok (pchar(@ftokpath[1]),ord('y'));
       shmidIn := shmget(key,iSegSize,IPC_CREAT or IPC_EXCL or 438);
       writeln('Process 0 - shmidIn: ',shmidIn,' key: ',key);

// create the (new) shared memory block

       writeln ('Creating NEW shared memory segment.');
       pSegInPtr:=shmat(shmidIn,nil,0);
       pSegInPtr[1] := 100; // test the assignment
       sRetData.key1 := key;
       Writeln ('Proc 0 Shmat : NEW Shared INPUT memory allocated. key is: ',sRetData.key1,' First Cell is: ',pSegInPtr[1]);

// Create a second shared memory block for the (filtered) output grid

       key := ftok (pchar(@ftokpath[1]),ord('z'));
       shmidOut := shmget(key,iSegSize,IPC_CREAT or IPC_EXCL or 438);
       writeln('Process 0 - shmidOut: ',shmidOut,' key: ',key);

// Now, create the (output) shared memory block

       pSegFiltPtr:=shmat(shmidOut,nil,0);
       pSegFiltPtr[1] := 200; // test the assignment
       sRetData.key2 := key;
       Writeln ('Proc 0 Shmat : NEW Shared OUTPUT memory allocated. key is: ',sRetData.key2,' First Cell is: ',pSegFiltPtr[1]);

// Initialize the input  and output grids

       Randomize;
       For iCtr:=1 to cGridDim do
       begin
         pSegInPtr[iCtr] := integer(Random(iUpperRandBound));
         pSegFiltPtr[iCtr] := -999;
       end;
       Writeln ('Proc 0 : Input grid memory has been randomized. First Cell: ',pSegInPtr[1]);
       Writeln ('Proc 0 : Filtered grid memory has been initialized. First Cell: ',pSegFiltPtr[1]);

// control the child processes (to process 'their' portions of the grid)
// send message to child processes: start working on quadrants

       for iDest := 1 to (cNumProcs - 1) do
       begin
          writeln(' From Parent Process: ',iRank,' to Child Process: ',iDest,' Sending key1/key2: ',sRetData.key1,'/',sRetData.key2);
          iTag := 0;
          MPI_Send(@sRetData,2,MPI_INT,iDest,iTag,MPI_COMM_WORLD);
          writeln(' Just sent From Parent Process to Child Process: ',iDest);
       end;
    end // of iRank == 0 block
    else // things done by the 'child processes' (id = 1 through 'nProcesses -1' )
    begin
           writeln(' Inside Child Process: ',iRank,' Waiting for Instructions from Parent....');
           iTag := 0; // sent from parent
           iSrc := 0;
           MPI_Recv(@sRetData,2,MPI_INT,iSrc,iTag,MPI_COMM_WORLD,@mpiStatus);
           writeln('.......Inside Child Process: ',iRank,' RECIEVED for Instructions from Parent');

// Now, connect to the shared memory blocks - use the 'key' passed from the parent procss via MPI

           shmidIn := shmget(sRetData.key1,iSegSize,IPC_CREAT or IPC_EXCL or 438);  // incoming grid
           if (iRank = 1) then writeln('Child Process ',iRank,' key1: ',sRetData.key1,' key2: ',sRetData.key2,' ShmidIn is: ',shmidIn);
           If shmidIn <> -1 then
           begin
              writeln(' Inside Child Process: ',iRank,' INCOMING shared memory segment does not exist. Exiting..');
              MPI_Finalize();
              exit();
           end
           else
           begin
              writeln('Existing Shared INPUT Memory - attach as client');
              shmidIn := shmget(sRetData.key1,iSegSize,0);
              pSegInPtr := shmat(shmidIn,Nil,0);
              if (iRank = 1) then writeln('Child Process ',iRank,' Got INPUT ptr first value: ',pSegInPtr[1]);
           end;

           shmidOut := shmget(sRetData.key2,iSegSize,0777);  // filtered grid
           if (iRank = 1) then writeln('Child Process ',iRank,' ShmidOut is: ',shmidOut);
           If shmidOut <> -1 then
           begin
              writeln(' Inside Child Process: ',iRank,' Filtered shared memory segment does not exist. Exiting..');
              MPI_Finalize();
              exit();
           end
           else
           begin
              writeln('Existing Shared OUTPUT Memory - attach as client');
              shmidOut := shmget(sRetData.key2,iSegSize,0);
              pSegFiltPtr := shmat(shmidOut,Nil,0);
              if (iRank = 1) then writeln('Child Process ',iRank,' Got FILTERED ptr first value: ',pSegFiltPtr[1]);
           end;
    end;

// OK, all processes are connected to the input and output shared memory grids
// Actual processing code is the same for all processes

    iBegX := Quads[iRank+1].iLLX;
    iEndX := Quads[iRank+1].iURX;
    iBegY := Quads[iRank+1].iLLY;
    iEndY := Quads[iRank+1].iURY;

if (iRank = 0) then writeln('Proc 0 Top of Conv loop: iBegX/iEndX: ',iBegX,' / ',iEndX,' iBegY/iEndY: ', iBegY,' / ',iEndY);

// convolution filter processing: Use many redundant iterations to build up a timing sample

    iHalfWindowDim :=  1; //Integer((cFilterDim - 1.0) / 2.0);

// core processing: replace each cell by the mean of its kernal.

writeln('iHalfWindowDim: ',iHalfWindowDim,' cGridOneDim: ',cGridOneDim);

    For iCtr:= iBegY to iEndY do
    begin
       For jCtr:= iBegX to iEndX do
       begin

// compute mean of the local neighborhood around each cell.

// get the 'kernal coords' surrounding current cell
// note: around the edges, kernal may not be 'full size'
// That is OK for this basic demo.

if (iRank = 0) then writeln('......In Proc 0 About to compute new cell iCtr / jCtr: ',iCtr,' / ',jCtr);

         iSubBegY := iCtr - iHalfWindowDim;
         if (iSubBegY < 1) then iSubBegY := 1;
         iSubBegX := jCtr - iHalfWindowDim;
         if (iSubBegX < 1) then iSubBegX := 1;

         iSubEndY := iCtr + iHalfWindowDim;
         if (iSubEndY > cGridOneDim) then iSubEndY := cGridOneDim;
         iSubEndX := jCtr + iHalfWindowDim;
         if (iSubEndX > cGridOneDim) then iSubEndX := cGridOneDim;

if (iRank = 0) then writeln('......In loop Prc 0: SubBegX/SubEndX: ',iSubBegX,' / ',iSubEndX,' SubBegY/SubEndY ', iSubBegY,' / ',iSubEndY);

// compute the sum, and then the mean

         iSum := 0;
         fSum := 0.0;
         fMean := 0.0;
         iCount := 0;
         For iSubCtr:= iSubBegY to iSubEndY do
         begin
            For jSubCtr:= iSubBegX to iSubEndX do
            begin
               iSubNet := iSubCtr - 1;
               iSubOneDimIndex := (iSubNet * cGridOneDim) + jSubCtr;
               iSum := iSum + pSegInPtr[iSubOneDimIndex];
if (iRank = 0) then writeln('..........Sum loop: iSubCtr / jSubCtr: ',iSubCtr,' / ',jSubCtr,' iSubNet: ',iSubNet,' iSubOneDimIndex: ',iSubOneDimIndex,' Cell: ',pSegInPtr[iSubOneDimIndex],' Sum: ',iSum);
               iCount := iCount + 1;
            end
         end;
         fCount := iCount;
         fSum := iSum;
         fMean := fSum / fCount;

         // Assign mean to output grid

          iSubNet := iCtr - 1;
          iOneDimIndex := (iSubNet * cGridOneDim) + jCtr;
if (iRank = 0) then writeln('.......Proc 0 Output Grid: Row / Col / iSubNet / OneDimIndex: ',iCtr,' / ',jCtr,' / ',iSubNet,' / ',iOneDimIndex,' fSum / fCount / fMean: ',iSum,' / ',iCount,' / ',fMean,' ');

//if (iRank = 0) then writeln('.........Proc 0 About to Assign fMean: ',fMean,' to cell ', iOneDimIndex);
          pSegFiltPtr[iOneDimIndex] := trunc(fMean);
//if (iRank = 10) then writeln('Proc 0 Just assigned fMean: to cell ', iOneDimIndex);
       end; // main image column
    end;    // main image row

// OK, at the end of this, we have processed all of the grid cells assigned to this process
// set up 'blocking' mechanism to make parent wait for child processes to be complete.

    if iRank > 0 then
    begin
      shmdt(pSegInPtr);
      shmdt(pSegFiltPtr);
      iDest := 0;
      iTag := 0;
      MPI_Send(@iDummyRetVal,1,MPI_INT,iDest,iTag,MPI_COMM_WORLD);
      writeln(' The Child Process ',iRank,' Detached from Shared Memory, sent *Done* to Parent');
    end
    else
    begin

// Parent process waits for all children to report in

      writeln(' Parent Process waiting to hear back from all ',(cNumProcs - 1),' Child Proceses');
      iTag := 0;
      For iCtr:=1 to (cNumProcs - 1) do
      begin
         MPI_Recv(@iDummyRetVal,1,MPI_INT,iCtr,iTag,MPI_COMM_WORLD,@mpiStatus);
         writeln('Parent Process hears back from Child Process: ',iCtr);
      end;
      writeln(' Parent Process *HAS* heard back from all ',(cNumProcs - 1),' Child Proceses');

// this would be a good place to print the two grids.

      writeln('Here is the input grid: ');
      iOff := 0;
      For iCtr:= 1 to cGridOneDim do
      begin
        writeln(pSegInPtr[iOff+1],' ',pSegInPtr[iOff+2],' ',pSegInPtr[iOff+3],' ',pSegInPtr[iOff+4],' ',pSegInPtr[iOff+5],' ',pSegInPtr[iOff+6],' ',pSegInPtr[iOff+7],' ',pSegInPtr[iOff+8],' ');
        iOff := iOff + cGridOneDim;
      end;
      writeln(' ');
      writeln('Here is the FILTERED grid: ');
      iOff := 0;
      For iCtr:= 1 to cGridOneDim do
      begin
        writeln(pSegFiltPtr[iOff+1],' ',pSegFiltPtr[iOff+2],' ',pSegFiltPtr[iOff+3],' ',pSegFiltPtr[iOff+4],' ',pSegFiltPtr[iOff+5],' ',pSegFiltPtr[iOff+6],' ',pSegFiltPtr[iOff+7],' ',pSegFiltPtr[iOff+8],' ');
        iOff := iOff + cGridOneDim;
      end;

//  Detach from shared memory, send a 'delete' message for the shared memory, end the MPI Session...

      writeln('In Process Zero, detaching from memory: shmIdIn: ',shmidIn,' shmidOut: ',shmidOut);
      shmdt(pSegInPtr);
      shmdt(pSegFiltPtr);
      shmctl(shmidIn,IPC_RMID,PShmid_DS);
      shmctl(shmidOut,IPC_RMID,PShmid_DS);
      writeln(' ');
      writeln(' Thats it from Process Zero!. Program Ends');
    end;
    MPI_Finalize;
end.
