#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <math.h>
#include <stdint.h>

#if !defined(SINGLECORE) && !defined(MULTICORE)
#define MULTICORE
#endif

#ifdef MULTICORE
#include <sys/mman.h>
#include <sys/wait.h>
#endif

/*
 * Per-test work sizes. Each value is both the loop bound used inside the
 * matching test function and the `max` argument handed to do_test() in
 * main(), where it is used to scale the score.
 */
/* Upper bound of the i8 counting loop. */
#define I8MAX  100
/* Upper bound of the i16 counting loop. */
#define I16MAX 10000
/* Upper bound of the i32 counting loop. */
#define I32MAX 1000000000
/* Upper bound of the i64 counting loop (does not fit in 32 bits). */
#define I64MAX 10000000000
/* Upper bound of the f32 accumulation loop. */
#define F32MAX 33000000
/* Upper bound of the f64 accumulation loop. */
#define F64MAX 3300000000
/* Recursion depth and per-level fan-out used by fcall_in(). */
#define MAXDEPTH 4
/* Number of xor/shift iterations performed by bitwop(). */
#define BITWOPS (16 * 1024)
/* Number of successive square roots taken by tsqrt(). */
#define SQRTNUM 1024

/* Element count of memarr (2^24 entries of 16 bits each). */
#define MEMARRSIZ (1 << 24)
/* Working set for the memory, bitwise and sort tests; main() fills it from /dev/urandom. */
uint16_t memarr[MEMARRSIZ];

typedef unsigned long long ull;

/* Running sums of the per-test scores printed at the end of main(). */
ull total_score_single = 0, total_score_multi = 0;
/* Number of child processes forked per test in the multi-core pass. */
int nproc;

/* Shared mapping set up in main(); child i stores its score in sharedproct[i]. */
uint64_t *sharedproct;

/*
 * Return the signed difference a - b in nanoseconds.
 *
 * The seconds field is widened to int64_t before it is scaled, so the
 * multiplication cannot overflow on platforms where time_t is narrower
 * than 64 bits.
 */
int64_t tdiff(const struct timespec *a, const struct timespec *b) {
  int64_t a_ns = (int64_t) a->tv_sec * 1000000000 + a->tv_nsec;
  int64_t b_ns = (int64_t) b->tv_sec * 1000000000 + b->tv_nsec;
  return a_ns - b_ns;
}

/*
 * 8-bit integer test: count an int8_t from 0 up to I8MAX.
 *
 * The counter is an int8_t so the loop exercises the type the test is named
 * after. I8MAX must stay at or below INT8_MAX, otherwise the counter could
 * never reach the bound. The loop body is intentionally empty.
 */
void i8() {
  for (int8_t i = 0; i < I8MAX; i++);
}

/* 16-bit integer test: step an int16_t counter from 0 until it reaches I16MAX. */
void i16() {
  int16_t counter = 0;
  while (counter < I16MAX)
    counter++;
}

/* 32-bit integer test: step an int32_t counter from 0 until it reaches I32MAX. */
void i32() {
  int32_t counter = 0;
  while (counter < I32MAX)
    counter++;
}
/* 64-bit integer test: step an int64_t counter from 0 until it reaches I64MAX. */
void i64() {
  int64_t counter = 0;
  while (counter < I64MAX)
    counter++;
}

/*
 * Single-precision test: add 1.1111f to a float accumulator, starting at 0,
 * until it is no longer below F32MAX.
 */
void f32() {
  float acc = 0;
  while (acc < F32MAX)
    acc += 1.1111f;
}

/*
 * Double-precision test: add a constant step to a double accumulator,
 * starting at 0, until it is no longer below F64MAX. The step is written as
 * the float literal 1.1111f, so it is the float value widened to double.
 */
void f64() {
  double acc = 0;
  while (acc < F64MAX)
    acc += 1.1111f;
}

/*
 * Bitwise test: repeat an xor-then-shift step BITWOPS times.
 *
 * Both operands are taken from memarr, which holds arbitrary 16-bit values.
 * Shifting a 32-bit value by 32 or more is undefined behaviour, so the shift
 * count is reduced to the range 0..31 once, up front. The xor still uses the
 * full 16-bit operand.
 */
void bitwop() {
  uint32_t x = memarr[0];
  uint32_t j = memarr[1];
  uint32_t shift = j & 31u;

  for (int i = 0; i < BITWOPS; i++)
    x = (x ^ j) << shift;

  (void) x; /* result is deliberately discarded */
}

/*
 * Sequential memory test: read every element of memarr in index order and
 * add it to an accumulator.
 *
 * The accumulator starts at 0; reading it uninitialised would be undefined
 * behaviour.
 */
void memseq() {
  uint64_t acc = 0;
  for (int i = 0; i < MEMARRSIZ; i++)
    acc += memarr[i];

  (void) acc; /* result is deliberately discarded */
}

/*
 * Dependent-load memory test: MEMARRSIZ times, use the last value read from
 * memarr as the index of the next read.
 *
 * Both locals start at 0 so the first index is well defined; indexing with an
 * uninitialised value would be undefined behaviour. Because the index is a
 * uint16_t, only the first 65536 entries of memarr can ever be visited.
 */
void memrnd() {
  uint16_t acc = 0, n = 0;
  for (int i = 0; i < MEMARRSIZ; i++) {
    acc = memarr[acc];
    n += acc;
  }

  (void) n; /* result is deliberately discarded */
}

/*
 * Square-root test: starting from F32MAX, replace a float with its own
 * square root SQRTNUM times. The call goes through the double-precision
 * sqrt() and the result is narrowed back to float on each step.
 */
void tsqrt() {
  float value = F32MAX;
  int remaining = SQRTNUM;
  while (remaining-- > 0)
    value = sqrt(value);
}

/*
 * qsort() comparator for uint16_t elements, ascending order.
 *
 * Returns a negative value, zero, or a positive value when *a is less than,
 * equal to, or greater than *b. Both operands are promoted to int before the
 * subtraction, so the difference cannot overflow. The pointers stay
 * const-qualified rather than casting the qualifier away.
 */
int qsortcmp(const void *a, const void *b) {
  const uint16_t *lhs = a;
  const uint16_t *rhs = b;
  return *lhs - *rhs;
}

/*
 * Sort test: qsort() the whole of memarr in ascending order using qsortcmp.
 * The sort is done in place, so memarr stays sorted after this returns.
 */
void tqsort() {
  size_t count = MEMARRSIZ;
  qsort(memarr, count, sizeof memarr[0], qsortcmp);
}

/*
 * Recursive helper for the function-call test.
 *
 *   d - current recursion depth
 *
 * Below MAXDEPTH, each invocation calls itself MAXDEPTH times with depth
 * d + 1. At MAXDEPTH or deeper it returns immediately.
 */
void fcall_in(int d) {
  if (d < MAXDEPTH) {
    int calls = 0;
    while (calls < MAXDEPTH) {
      fcall_in(d + 1);
      calls++;
    }
  }
}
/* Function-call test: start the fcall_in() recursion at depth 0. */
void fcall() {
  const int root_depth = 0;
  fcall_in(root_depth);
}

/*
 * do_test(fn, max): run _do_test() on `fn`, using the function's own name
 * (stringified by the preprocessor) as the printed label.
 */
#define do_test(fn, max) _do_test(fn, max, #fn)

/*
 * Time one benchmark and print its score(s).
 *
 *   fn  - benchmark body to time
 *   max - amount of work fn performs; a score is (max << 12) divided by the
 *         elapsed time in nanoseconds
 *   n   - label printed in front of every result line
 *
 * The single-core score is added to total_score_single. When built with
 * MULTICORE, fn is additionally run in nproc forked children; child i stores
 * its score in sharedproct[i], and the sum of all slots is added to
 * total_score_multi.
 */
void _do_test(void (*fn)(void), uint64_t max, const char *n) {
  // single core
  struct timespec start, end;
  clock_gettime(CLOCK_MONOTONIC, &start);
  fn();
  clock_gettime(CLOCK_MONOTONIC, &end);

  uint64_t diff = tdiff(&end, &start);
  if (!diff)
    diff = 1; // never divide by zero
  uint64_t single_score = (max << 12) / diff;
  printf("%8s |   single: %llu\n", n, (unsigned long long) single_score);

  total_score_single += single_score;

#ifdef MULTICORE
  // multi core

  // Clear every slot first so that a child which could not be forked
  // contributes 0 instead of the score left over from the previous test.
  for (int i = 0; i < nproc; i++)
    sharedproct[i] = 0;

  // Children inherit a copy of the stdio buffers; flush now so pending
  // output is not written again by each child when stdout is not a terminal.
  fflush(stdout);

  for (int i = 0; i < nproc; i++) {
    pid_t pid = fork();
    if (pid == 0) {
      clock_gettime(CLOCK_MONOTONIC, &start);
      fn();
      clock_gettime(CLOCK_MONOTONIC, &end);
      uint64_t child_diff = tdiff(&end, &start);
      if (!child_diff)
        child_diff = 1;
      sharedproct[i] = (max << 12) / child_diff;
      // Leave without running the parent's stdio/atexit cleanup in the child.
      _exit(0);
    } else if (pid < 0) {
      perror("fork");
    }
  }

  // Reap every child before reading the shared scores.
  int status = 0;
  while (wait(&status) > 0);

  uint64_t multi_score = 0;
  for (int i = 0; i < nproc; i++)
    multi_score += sharedproct[i];
  total_score_multi += multi_score;

  printf("%8s |    multi: %llu\n", n, (unsigned long long) multi_score);
  printf("%8s | weighted: %llu\n", n, (unsigned long long) (multi_score / nproc));
#endif
}

/*
 * Program entry point: print the configuration, fill memarr with random
 * data, run every test through do_test(), and print the summed scores.
 *
 * In a MULTICORE build the number of child processes per test defaults to
 * the number of online processors and can be overridden with the THREADS
 * environment variable (values below 1 are ignored).
 *
 * Returns 0 on success, or EXIT_FAILURE if the random data cannot be read or
 * the shared score buffer cannot be mapped.
 */
int main(int argc, char **argv) {
  (void) argc;
  (void) argv;

  puts("cbench 102");
  puts("configured tests:    i8 i16 i32 i64 f32 f64 bitwop memseq memrnd tsqrt tqsort fcall");

#ifdef MULTICORE
  puts("multi core:          yes");
  // sysconf() returns a long and -1 on failure; fall back to a single core.
  long detected = sysconf(_SC_NPROCESSORS_ONLN);
  int cores = detected < 1 ? 1 : (int) detected;
  printf("detected core count: %d\n", cores);
#else
  puts("multi core:          no");
#endif

  FILE *urandom = fopen("/dev/urandom", "rb");
  if (!urandom) {
    perror("/dev/urandom");
    return EXIT_FAILURE;
  }
  size_t filled = fread(memarr, sizeof(uint16_t), MEMARRSIZ, urandom);
  fclose(urandom);
  if (filled != (size_t) MEMARRSIZ) {
    fputs("/dev/urandom: short read\n", stderr);
    return EXIT_FAILURE;
  }

#ifdef MULTICORE
  nproc = cores;
  char *threads_env;
  if ((threads_env = getenv("THREADS"))) {
    nproc = atoi(threads_env);
    if (nproc < 1)
      nproc = cores;
  }
  printf("usable thread count: %d\n", nproc);

  // One score slot per child, in memory shared between parent and children.
  sharedproct = mmap(NULL, sizeof *sharedproct * (size_t) nproc, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  if (sharedproct == MAP_FAILED) {
    perror("mmap");
    return EXIT_FAILURE;
  }
#endif

  puts("starting cbench\n");

  do_test(i8, I8MAX);
  do_test(i16, I16MAX);
  do_test(i32, I32MAX);
  do_test(i64, I64MAX);
  do_test(f32, F32MAX);
  do_test(f64, F64MAX);
  do_test(bitwop, BITWOPS);
  do_test(memseq, MEMARRSIZ);
  do_test(memrnd, MEMARRSIZ);
  do_test(tsqrt, SQRTNUM);
  do_test(tqsort, MEMARRSIZ);
  do_test(fcall, MAXDEPTH * 16);

  printf("\n  cbench |   single: %llu\n", total_score_single);

#ifdef MULTICORE
  printf("  cbench |    multi: %llu\n", total_score_multi);
  printf("  cbench | weighted: %llu\n", total_score_multi / nproc);
#endif

  return 0;
}
