/* Implement the game of Life, as invented by J.H. Conway.

Unfortunately, C99 variable length arrays bring C up to only about the
array-handling expressiveness of Fortran 77.  So it uses some horrible
macros to imitate the main array-handling code in the Fortran 90
original.  Note that array storage is C-like (i.e. zero-based and second
subscript varying fastest), not a Fortran clone, and the code has been
improved where feasible.

Some unnecessary code is present to suppress gcc's warnings, while
enabling as much checking as possible. */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>

#define CHECK       0



/* Provide a useful auxiliary. */

void crash (const char *message) {
    fprintf(stderr,"%s\n",message);
    exit(EXIT_FAILURE);
}



/* Definitions for (somewhat) cleaner matrix handling.  The main ones
are emulation of the forms of a Fortran assignment statement for the
features that we use.  No attempt has been made to optimise any of this
code.  'inline' is not used because, if you think that you know what it
means in C99, you probably don't - and it's subtly incompatible with
C++'s 'inline', too. */

typedef signed char byte;

typedef struct {
    byte *truebase, *base;
    int lwb1, lim1, lwb2, lim2, stride;
} matrix;

#define access(x,i,j) ((x).base[(i)*(x).stride+(j)])

void setup (matrix * mat, int lwb1, int lim1, int lwb2, int lim2) {
    if (lwb1 == 0 && lim1 == 0 && lwb2 == 0 && lim2 == 0)
        mat->truebase = mat->base = NULL;
    else {
        if (lwb1 >= lim1 || lwb2 >= lim2)
            crash("Invalid arguments for function setup");
        mat->truebase = mat->base =
            (byte *)malloc((size_t)((lim1-lwb1)*(lim2-lwb2)));
        if (mat->base == NULL) crash("Unable to allocate array space");
    }
    mat->lwb1 = lwb1;
    mat->lim1 = lim1;
    mat->lwb2 = lwb2;
    mat->lim2 = lim2;
    mat->stride = lim1-lwb1;
    mat->base -= lwb1*mat->stride+lwb2;
}

void cleanup (matrix * mat) {
    if (mat->truebase != NULL) free(mat->truebase);
    mat->truebase = mat->base = NULL;
}

void toseq (byte * target, matrix source, int from1, int lim1,
        int from2, int lim2) {
    int i, j, k;

#if CHECK
    if (from1 > lim1 || from2 > lim2 || from1 < source.lwb1 ||
            lim1 > source.lim1 || from2 < source.lwb2 ||
            lim2 > source.lim2)
        crash("Bounds error in function toseq");
#endif
    k = 0;
    for (i = from1; i < lim1; ++i)
        for (j = from2; j < lim2; ++j)
            target[k++] = access(source,i,j);
}

void fromseq (matrix target, int from1, int lim1,
        int from2, int lim2, const byte * source) {
    int i, j, k;

#if CHECK
    if (from1 > lim1 || from2 > lim2 || from1 < target.lwb1 ||
            lim1 > target.lim1 || from2 < target.lwb2 ||
            lim2 > target.lim2)
        crash("Bounds error in function fromseq");
#endif
    k = 0;
    for (i = from1; i < lim1; ++i)
        for (j = from2; j < lim2; ++j)
            access(target,i,j) = source[k++];
}



/* These procedures are a gratuitously inefficient way of performing
these tasks, but are used to ensure that there is a chance of wrong
answers if the program has race conditions.  The obvious and more
efficient code will work even in that case, primarily because of
atomicity.  Despite my best attempts, some compilers will optimise these
to uselessness anyway.  'volatile' is not used for the same reasons that
'inline' isn't, except redoubled in spades. */

int Get (byte * value) {
    int x;

    x = *value;
    *value = x^0x3f;
    return x;
}

void Set (byte * location, int value) {
    *location = (*location>>6)+value;
}

void Zero (matrix target, int from1, int lim1, int from2, int lim2) {
    int i, j;

#if CHECK
    if (from1 > lim1 || from2 > lim2 || from1 < target.lwb1 ||
            lim1 > target.lim1 || from2 < target.lwb2 ||
            lim2 > target.lim2)
        crash("Bounds error in function zero");
#endif
    for (i = from1; i < lim1; ++i)
        for (j = from2; j < lim2; ++j)
            access(target,i,j) = access(target,i,j)^0x3f;
    for (i = from1; i < lim1; ++i)
        for (j = from2; j < lim2; ++j) Set(&access(target,i,j),0);
}



/* Variables that are passed by host association in Fortran. */

static int steps, size;



/* This performs one step in the game on the local grid.  It assumes
that the grid is valid, both in shape and content.  One of the higher
bits is used to store what the value will be next step (note that the
neighbour count can reach 8).  Using a second grid would be needed when
using a bitmap and provide the compiler more scope for
autoparallelisation, but that's an issue for an OpenMP course, not a MPI
one! */

void Iterate (matrix grid) {
#define GRID(i,j) access(grid,i,j)
    int i, j, k, m;

#pragma omp single
{ /* This matters */
    for (i = grid.lwb1+1; i < grid.lim1-1; ++i)
        for (j = grid.lwb2+1; j < grid.lim2-1; ++j) {
            m = Get(&GRID(i,j));
            k = GRID(i-1,j-1)+GRID(i-1,j+1)+GRID(i+1,j-1)+
                    GRID(i+1,j+1)+GRID(i-1,j)+GRID(i+1,j)+
                    GRID(i,j-1)+GRID(i,j+1);
            switch (k&15) {
case 2:         m *= 17;
                break;
case 3:         m += 16;
            }
            Set(&GRID(i,j),m);
        }
    for (i = grid.lwb1+1; i < grid.lim1-1; ++i)
        for (j = grid.lwb2+1; j < grid.lim2-1; ++j)
            Set(&GRID(i,j),(Get(&GRID(i,j)) >> 4));
} /* This matters */
#undef GRID
}



/* And here is the main program. */

int main (void) {
    int i, k, n;
    double wall, cpu;
    struct timeval tv;
    clock_t stamp;
    char filename[100];
    FILE *file;
    matrix total;

/* Read in the parameters. */

    if (scanf("%99s",filename) != 1 || filename[0] == '\0')
        crash("Invalid filename");
    if (scanf("%d",&steps) != 1 || steps < 1 || steps > 1000000)
        crash("Invalid number of steps");

/* Read in the size and allocate the grid. */

    if ((file = fopen(filename,"r")) == NULL)
        crash("Unable to open input file");
    if (fscanf(file,"%d",&size) != 1 || size < 1 || size > 1000)
        crash("Invalid grid size");
    setup(&total,-1,size+1,-1,size+1);
    Zero(total,-1,0,-1,size+1);
    Zero(total,-1,size+1,-1,0);
    Zero(total,size,size+1,-1,size+1);
    Zero(total,-1,size+1,size,size+1);
    if (getc(file) != '\n') crash("Invalid input file format");

/* Read in the data, and display the initial grid.  This is not very
efficient in memory use, but so what?  For real programs, this often
needs to be handled more subtly, which can be tricky. */

    k = 0;
    for (n = 0; n < size; ++n) {
        for (i = 0; i < size; ++i) {
            if ((k = getc(file)) != '.' && k != '*')
                crash("Invalid input grid");
            access(total,n,i) = (byte)(k == '*');
            putchar(k);
        }
        if (getc(file) != '\n') crash("Invalid input grid");
        putchar('\n');
    }
    if (getc(file) != EOF) crash("Invalid input grid");
    fclose(file);    /* This rarely detects errors, anyway */
    putchar('\n');

/* Now run the game.  We need to pass down most of the variables, so we
make them file-scope static.  We need to keep the boundary empty for
repeatable results. */

    if (gettimeofday(&tv,NULL) != 0 ||
            (stamp = clock()) == (clock_t)-1)
        crash("Unable to get times");
    wall = tv.tv_sec+1.0e-6*tv.tv_usec;
    cpu = stamp/(double)CLOCKS_PER_SEC;
/* The private(n) is needed. */
#pragma omp parallel default(none), shared(total,size,steps), private(n)
    for (n = 0; n < steps; ++n) {
        (void)Iterate(total);
#pragma omp single
{ /* This matters */
        Zero(total,-1,0,-1,size+1);
        Zero(total,-1,size+1,-1,0);
        Zero(total,size,size+1,-1,size+1);
        Zero(total,-1,size+1,size,size+1);
} /* This matters */
    }
    if (gettimeofday(&tv,NULL) != 0 ||
            (stamp = clock()) == (clock_t)-1)
        crash("Unable to get times");
    wall = tv.tv_sec+1.0e-6*tv.tv_usec-wall;
    cpu = stamp/(double)CLOCKS_PER_SEC-cpu;

/* Print the result and time taken. */

    for (n = 0; n < size; ++n) {
        for (i = 0; i < size; ++i) putchar(".*"[access(total,n,i)]);
        putchar('\n');
    }
    printf("\nTime = %.2f seconds, CPU = %.2f seconds\n",wall,cpu);
    cleanup(&total);
    return EXIT_SUCCESS;
}
