#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>

#ifdef __cplusplus
extern "C" void dpotrf_(const char *, const int *, double *,
    const int *, int *);
extern "C" void dpotrs_(const char *, const int *, const int *,
    const double *, const int *, double *, const int *, int *);
#else
extern void dpotrf_(const char *, const int *, double *,
    const int *, int *);
extern void dpotrs_(const char *, const int *, const int *,
    const double *, const int *, double *, const int *, int *);
#endif


static int size;    /* To emulate Fortran's assumed-shape arrays */


void fail (const char *message) {
    printf("%s\n",message);
    exit(EXIT_FAILURE);
}


void times (const char *which) {
/* If which is not empty, print the times since the previous call. */
    static double last_wall = 0.0, last_cpu = 0.0;
    double wall, cpu;
    struct timeval tv;
    clock_t stamp;

    wall = last_wall;
    cpu = last_cpu;
    if (gettimeofday(&tv,NULL) != 0 ||
            (stamp = clock()) == (clock_t)-1)
        fail("Unable to get times");
    last_wall = tv.tv_sec+1.0e-6*tv.tv_usec;
    last_cpu = stamp/(double)CLOCKS_PER_SEC;
    if (strlen(which) > 0) {
        wall = last_wall-wall;
        cpu = last_cpu-cpu;
        printf("%s time = %.2f seconds, CPU = %.2f seconds\n",which,wall,cpu);
    }
}


double check (const double *a, const double *b, const double *c,
    int transpose) {
/* This calculates MAXVAL(ABS(c - MATMUL(a, b))), with b optionally
transposed.  There is some optimisation, because the simple code is
SO slow - Fortran is OK, though. */
    int i, j, k;
    double x, y = 0.0, *temp;

    if ((temp = (double *)malloc(size*sizeof(double))) == NULL)
        fail("Unable to get workspace");
    for (i = 0; i < size; ++i) {
        for (k = 0; k < size; ++k) temp[k] = a[i+k*size];
        for (j = 0; j < size; ++j) {
            x = 0.0;
            for (k = 0; k < size; ++k)
                x += temp[k] * (transpose ? b[j+k*size] : b[k+j*size]);
            x = fabs(c[i+j*size]-x);
            if (x > y) y = x;
         }
    }
    return y;
}

void cholesky (double *a) {
    int i, j, k;
    double x, y;

    for (j = 0; j < size; ++j) {
        x = 0.0;
/* Note that most of the clauses aren't needed here, because the default
rules do what is wanted.  The reduction is. */
#pragma omp parallel for default(none), schedule(static), \
    shared(a,j,size), private(i), reduction(+:x)
        for (i = 0; i < j; ++i) {
            a[i+j*size] = 0.0;
            x += a[j+i*size] * a[j+i*size];
        }
        x = sqrt(a[j+j*size]-x);
        a[j+j*size] = x;
/* Note that most of the clauses aren't needed here, because the default
rules do what is wanted. The private(k,y) is. */
#pragma omp parallel for default(none), schedule(static), \
    shared(a,j,x,size), private(i,k,y)
        for (i = j+1; i < size; ++i) {
            y = 0.0;
            for (k = 0; k < j; ++k)
                y += a[i+k*size] * a[j+k*size];
           a[i+j*size] = (a[i+j*size] - y) / x;
        }
    }
}

void solve (const double *a, double *b) {
    int i, j, k;
    double x;

    for (i = 0; i < size; ++i) {
        for (j = 0; j < size; ++j) {
            b[i+j*size] /= a[i+i*size];
/* Note that most of the clauses aren't needed here, because the default
rules do what is wanted. */
#pragma omp parallel for default(none), schedule(static), \
    shared(a,b,i,j,size), private(k)
            for (k = i+1; k < size; ++k)
                b[k+j*size] -= b[i+j*size] * a[k+i*size];
         }
    }
    for (j = 0; j < size; ++j) {
        for (i = size-1; i >= 0; --i) {
            x = 0.0;
/* Note that most of the clauses aren't needed here, because the default
rules do what is wanted.  The reduction is. */
#pragma omp parallel for default(none), schedule(static), \
    shared(a,b,i,j,size), private(k), reduction(+:x)
            for (k = i+1; k < size; ++k)
               x += a[k+i*size] * b[k+j*size];
            b[i+j*size] = (b[i+j*size]-x)/a[i+i*size];
        }
    }
}


int main (int argc, char *argv[]) {
    double *a, *a1, *b, *b1;
    int info, i, j;
    FILE *f;

/* Read the file name from the argument and set up the data. */
    if (argc != 2) fail("Wrong number of arguments");
    if ((f = fopen(argv[1],"rb")) == NULL)
        fail("Unable to open input file");
    if (fread(&size,sizeof(int),1,f) != 1 || size < 1 || size > 1000000)
        fail("Invalid value of size");
    if ((a = (double *)malloc(size*size*sizeof(double))) == NULL ||
            (b = (double *)malloc(size*size*sizeof(double))) == NULL ||
            (a1 = (double *)malloc(size*size*sizeof(double))) == NULL ||
            (b1 = (double *)malloc(size*size*sizeof(double))) == NULL)
        fail("Unable to allocate space");
    if ((int)fread(a,sizeof(double),size*size,f) != size*size ||
            (int)fread(b,sizeof(double),size*size,f) != size*size ||
            ferror(f) || fclose(f))
        fail("Unable to read data");
    memcpy(a1,a,size*size*sizeof(double));
    memcpy(b1,b,size*size*sizeof(double));
 
/* Time and check the use of LAPACK. */
    times("");
    dpotrf_("l",&size,a,&size,&info);
    if (info != 0) fail("Error in calling LAPACK");
    for (i = 0; i < size; ++i)
        for (j = 0; j < i; ++j) a[j+i*size] = 0.0;
    times("LAPACK Cholesky");
    printf("Error in result = %10.2e\n",check(a,a,a1,1));
    times("");
    dpotrs_("l",&size,&size,a,&size,b,&size,&info);
    if (info != 0) fail("Error in calling LAPACK");
    times("LAPACK solver");
    printf("Error in result = %10.2e\n",check(a1,b,b1,0));
 
/* Time and check the use of the LAPACK code converted to Fortran 90. */
    memcpy(a,a1,size*size*sizeof(double));
    times("");
    cholesky(a);
    times("Coded Cholesky");
    printf("Error in result = %10.2e\n",check(a,a,a1,1));
    memcpy(b,b1,size*size*sizeof(double));
    times("");
    solve(a, b);
    times("Coded solver");
    printf("Error in result = %10.2e\n",check(a1,b,b1,0));

    return 0;
}
