#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

#if !defined(SINGLECORE) && !defined(MULTICORE)
#define MULTICORE
#endif

#ifdef MULTICORE
#include <sys/mman.h>
#include <sys/wait.h>
#endif

// Exclusive upper bounds of the counters in the i8/i16/i32/i64/f32/f64 loop
// tests. main() also passes each value to do_test() as the work amount that
// the score is scaled by.
#define I8MAX  127
#define I16MAX 10000
#define I32MAX 1000000000
// I64MAX and F64MAX are larger than a 32-bit signed integer can hold.
#define I64MAX 10000000000
#define F32MAX 33000000
#define F64MAX 3300000000
// fcall_in() stops recursing at this depth and also makes this many recursive
// calls per level.
#define MAXDEPTH 4
// Number of loop iterations in bitwop().
#define BITWOPS (16 * 1024)
// Number of successive square roots taken in tsqrt().
#define SQRTNUM 1024

// Element count of memarr: 1 << 24 = 16,777,216 entries.
#define MEMARRSIZ (1 << 24)
// Shared working set: filled with rand() values in main(), read by bitwop(),
// memseq() and memrnd(), and sorted in place by tqsort().
uint16_t memarr[MEMARRSIZ];

// Shorthand for the integer type that holds the running score totals.
typedef unsigned long long ull;

// Sums of the per-test scores; _do_test() adds to them and main() prints them.
ull total_score_single = 0, total_score_multi = 0;
// Number of child processes forked per test in the multi-core pass. Assigned
// in main() only when MULTICORE is defined; otherwise it keeps its zero
// initial value.
int nproc;

// One score slot per child process. main() maps it as shared anonymous memory
// and each child forked by _do_test() stores its score in its own slot.
uint64_t *sharedproct;

// Return the processor time used by this process so far, in nanoseconds.
//
// The value comes from clock(), so it measures CPU time rather than wall-clock
// time and is only as fine-grained as CLOCKS_PER_SEC allows. Returns 0 when
// clock() reports that processor time is unavailable.
uint64_t clock_ns() {
  const uint64_t ns_per_sec = UINT64_C(1000000000);
  const uint64_t ticks_per_sec = (uint64_t) CLOCKS_PER_SEC;

  clock_t raw = clock();
  if (raw == (clock_t) -1)
    return 0;

  // Widen before scaling, and convert whole seconds and the sub-second
  // remainder separately so the multiplication cannot overflow. With
  // CLOCKS_PER_SEC == 1000000 this is exactly ticks * 1000.
  uint64_t ticks = (uint64_t) raw;
  return (ticks / ticks_per_sec) * ns_per_sec
       + (ticks % ticks_per_sec) * ns_per_sec / ticks_per_sec;
}

// 8-bit integer test: step an int8_t counter from 0 up to (not including)
// I8MAX. The loop body is an empty inline-assembly statement, presumably so
// the otherwise empty loop is not optimized away.
void i8() {
  register int8_t counter = 0;
  while (counter < I8MAX) {
    __asm__("");
    counter++;
  }
}

// 16-bit integer test: step an int16_t counter from 0 up to (not including)
// I16MAX, executing an empty inline-assembly statement on every pass.
void i16() {
  register int16_t counter = 0;
  while (counter < I16MAX) {
    __asm__("");
    counter++;
  }
}

// 32-bit integer test: step an int32_t counter from 0 up to (not including)
// I32MAX, executing an empty inline-assembly statement on every pass.
void i32() {
  register int32_t counter = 0;
  while (counter < I32MAX) {
    __asm__("");
    counter++;
  }
}
// 64-bit integer test: step an int64_t counter from 0 up to (not including)
// I64MAX, executing an empty inline-assembly statement on every pass.
void i64() {
  register int64_t counter = 0;
  while (counter < I64MAX) {
    __asm__("");
    counter++;
  }
}

// Single-precision test: starting from 0, keep adding 1.1111f to a float
// until it reaches F32MAX, executing an empty inline-assembly statement on
// every pass.
void f32() {
  register float value = 0;
  while (value < F32MAX) {
    __asm__("");
    value += 1.1111f;
  }
}

// Double-precision test: starting from 0, keep adding 1.1111 to a double
// until it reaches F64MAX, executing an empty inline-assembly statement on
// every pass.
void f64() {
  register double value = 0;
  while (value < F64MAX) {
    __asm__("");
    value += 1.1111;
  }
}

// Bitwise test: repeat an XOR followed by a left shift BITWOPS times.
//
// Both operands are seeded from memarr (elements 0 and 1), so their values are
// not compile-time constants. Returns the final accumulator.
uint32_t bitwop() {
  register uint32_t x = memarr[0];
  register uint32_t j = memarr[1];
  // memarr[1] can be anything up to 65535, but shifting a 32-bit value by 32
  // or more is undefined behaviour, so only the low five bits are used as the
  // shift count.
  register uint32_t shift = j & 31u;

  for (register int i = 0; i < BITWOPS; i++)
    x = (x ^ j) << shift;
  return x;
}

// Sequential memory test: read every element of memarr in index order and
// return the sum of all elements.
uint64_t memseq() {
  // Must start at zero; the accumulator was previously read uninitialized.
  register uint64_t acc = 0;
  for (register int i = 0; i < MEMARRSIZ; i++)
    acc += memarr[i];
  return acc;
}

// Dependent-load memory test: perform MEMARRSIZ loads from memarr, where the
// index of each load is the value returned by the previous one. Returns the
// 16-bit wrapping sum of the loaded values.
//
// Because the index is a uint16_t, only the first 65536 elements of memarr can
// ever be visited.
uint16_t memrnd() {
  // Both start at zero; they were previously read uninitialized, and `acc` is
  // used as an array index on the very first pass.
  register uint16_t acc = 0, n = 0;
  for (register int i = 0; i < MEMARRSIZ; i++) {
    acc = memarr[acc];
    n += acc;
  }
  return n;
}

// Square-root test: start from F32MAX and replace the value with its square
// root SQRTNUM times. Each step calls the double-precision sqrt() and stores
// the result back into a float. Returns the final value.
float tsqrt() {
  register float value = F32MAX;
  register int remaining = SQRTNUM;
  while (remaining-- > 0)
    value = sqrt(value);
  return value;
}

// qsort() comparator for uint16_t elements, ascending order.
//
// a, b: pointers to the two uint16_t elements being compared.
// Returns the difference *a - *b: negative, zero or positive when *a is
// smaller than, equal to or larger than *b.
int qsortcmp(const void *a, const void *b) {
  // Keep the const qualifier instead of casting it away.
  const uint16_t *lhs = a;
  const uint16_t *rhs = b;
  // Both operands are widened to int first, so the subtraction cannot wrap
  // on platforms where int is wider than 16 bits.
  return (int) *lhs - (int) *rhs;
}

// Sort test: sort the whole of memarr in place with the C library qsort(),
// using qsortcmp() as the ordering.
void tqsort() {
  const size_t elem_size = sizeof memarr[0];
  const size_t elem_count = sizeof memarr / elem_size;
  qsort(memarr, elem_count, elem_size, qsortcmp);
}

// Recursive worker for the function-call test.
//
// d: current recursion depth. Below MAXDEPTH, the function calls itself
// MAXDEPTH times with depth d + 1; at MAXDEPTH or deeper it does nothing.
void fcall_in(int d) {
  if (d < MAXDEPTH) {
    const int child_depth = d + 1;
    for (int remaining = MAXDEPTH; remaining > 0; remaining--)
      fcall_in(child_depth);
  }
}
// Function-call test entry point: start the fcall_in() recursion at depth 0.
void fcall() {
  const int root_depth = 0;
  fcall_in(root_depth);
}

// Run fn once, timing it with clock_ns(), and convert the elapsed time into a
// score of (max << 12) / elapsed. An elapsed time of zero is treated as 1 so
// the division is always defined.
static uint64_t score_one_run(void (*fn)(), uint64_t max) {
  uint64_t start = clock_ns();
  fn();
  uint64_t end = clock_ns();

  uint64_t diff = end - start;
  if (!diff)
    diff = 1;
  return (max << 12) / diff;
}

// do_test(fn, max): benchmark fn and label the output with the function's own
// name. fn is cast to void(*)() so tests with different return types can share
// one driver; any value a test returns is discarded.
#define do_test(fn, max) _do_test((void(*)()) fn, max, #fn)

// Benchmark one test function and print its scores.
//
// fn:  the test to run.
// max: amount of work the test performs; scales the score.
// n:   label printed in front of each result line.
//
// The test is first run once in this process ("single"). When MULTICORE is
// defined it is then run in nproc forked children at once; the children's
// scores are summed ("multi") and averaged per process ("weighted"). The
// single and multi scores are added to the global totals.
void _do_test(void (*fn)(), uint64_t max, const char *n) {
  // single core
  uint64_t single_score = score_one_run(fn, max);
  printf("%8s |   single: %llu\n", n, (unsigned long long) single_score);

  total_score_single += single_score;

#ifdef MULTICORE
  // multi core
  // Clear every slot first so a child that could not be forked, or that died
  // before reporting, contributes 0 instead of the previous test's score.
  for (int i = 0; i < nproc; i++)
    sharedproct[i] = 0;

  // Flush pending output before forking so the children do not inherit
  // buffered text and write it out a second time.
  fflush(stdout);

  for (int i = 0; i < nproc; i++) {
    pid_t pid = fork();
    if (pid == 0) {
      sharedproct[i] = score_one_run(fn, max);
      // Leave without running the parent's stdio flushing or exit handlers.
      _exit(0);
    } else if (pid < 0) {
      perror("fork");
    }
  }

  // Reap every child; wait() stops returning a pid once none are left.
  int status = 0;
  while (wait(&status) > 0)
    ;

  uint64_t multi_score = 0;
  for (int i = 0; i < nproc; i++)
    multi_score += sharedproct[i];
  total_score_multi += multi_score;

  // Guard the average against a non-positive process count.
  uint64_t weighted = nproc > 0 ? multi_score / (uint64_t) nproc : 0;

  printf("%8s |    multi: %llu\n", n, (unsigned long long) multi_score);
  printf("%8s | weighted: %llu\n", n, (unsigned long long) weighted);
#endif
}

// Program entry point: print the configuration, fill memarr with rand()
// values, run every test through do_test() and print the score totals.
//
// When MULTICORE is defined, the number of child processes per test defaults
// to the number of online processors and can be overridden with the THREADS
// environment variable; values that are not a positive integer fitting in an
// int are ignored. Returns 0 on success, or EXIT_FAILURE if the shared score
// buffer cannot be mapped.
int main(int argc, char **argv) {
  (void) argc;
  (void) argv;

  puts("cbench 103");
  puts("configured tests:    i8 i16 i32 i64 f32 f64 bitwop memseq memrnd tsqrt tqsort fcall");

#ifdef MULTICORE
  puts("multi core:          yes");
  // sysconf() returns a long and reports failure as -1; fall back to a single
  // core rather than carrying a negative count forward.
  long online = sysconf(_SC_NPROCESSORS_ONLN);
  int cores = 1;
  if (online >= 1 && online <= INT_MAX)
    cores = (int) online;
  else
    fputs("warning: could not detect core count, assuming 1\n", stderr);
  printf("detected core count: %d\n", cores);
#else
  puts("multi core:          no");
#endif

  for (size_t i = 0; i < MEMARRSIZ; i++)
    memarr[i] = (uint16_t) (rand() & 0xFFFF);

#ifdef MULTICORE
  nproc = cores;
  const char *threads_env = getenv("THREADS");
  if (threads_env) {
    // strtol() has defined behaviour on overflow, unlike atoi(). As before, a
    // leading number followed by other characters is still accepted.
    char *end = NULL;
    long requested = strtol(threads_env, &end, 10);
    if (end != threads_env && requested >= 1 && requested <= INT_MAX)
      nproc = (int) requested;
  }
  printf("usable thread count: %d\n", nproc);

  // One score slot per child, shared between this process and its children.
  sharedproct = mmap(NULL, sizeof(uint64_t) * (size_t) nproc, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  if (sharedproct == MAP_FAILED) {
    perror("mmap");
    return EXIT_FAILURE;
  }
#endif

  puts("starting cbench\n");

  do_test(i8, I8MAX);
  do_test(i16, I16MAX);
  do_test(i32, I32MAX);
  do_test(i64, I64MAX);
  do_test(f32, F32MAX);
  do_test(f64, F64MAX);
  do_test(bitwop, BITWOPS);
  do_test(memseq, MEMARRSIZ);
  do_test(memrnd, MEMARRSIZ);
  do_test(tsqrt, SQRTNUM);
  do_test(tqsort, MEMARRSIZ);
  do_test(fcall, MAXDEPTH * 16);

  printf("\n  cbench |   single: %llu\n", total_score_single);

#ifdef MULTICORE
  printf("  cbench |    multi: %llu\n", total_score_multi);
  printf("  cbench | weighted: %llu\n", total_score_multi / nproc);
#endif

  return 0;
}
