Multi Level Cache Benchmark With Gaussian Blur Computation (Integer Key Only)

#include <iostream>
#include <fstream>
#include <math.h>

#include "LruClockCache.h"
#include "DirectMappedCache.h"
#include "CacheThreader.h"
#include "CpuBenchmarker.h"

int findMandelbrot(double cr, double ci, int max_iterations) {
  int i = 0;
  double zr = 0.0, zi = 0.0;
  while (i < max_iterations && zr * zr + zi * zi < 4.0) {
    double temp = zr * zr - zi * zi + cr;
    zi = 2.0 * zr * zi + ci;
    zr = temp;
    ++i;
  }
  return i;
}

double mapToReal(int x, int imageWidth, double minR, double maxR) {
  double range = maxR - minR;
  return x * (range / imageWidth) + minR;
}

double mapToImaginary(int y, int imageHeight, double minI, double maxI) {
  double range = maxI - minI;
  return y * (range / imageHeight) + minI;
}

int main() {
  // mandelbrot generation + (gaussian blur X5) using a multi-level cache

  using namespace std;

  int imageWidth, imageHeight, maxN;
  double minR, maxR, minI, maxI;

  imageWidth = 1024;
  imageHeight = 1024;
  maxN = 512;
  minR = -1.5;
  maxR = 0.7;
  minI = -1.0;
  maxI = 1.0;

  size_t cacheScaling = 1;
  for (int cacheScalingIter = 0; cacheScalingIter < 8; cacheScalingIter++) {

    cacheScaling *= 2;
    size_t readmiss = 0;
    size_t writemiss = 0;
    size_t read = 0;
    size_t write = 0;
    const int LLCsize = 1024 * 128 * cacheScaling;
    std::vector < int > backingStore(5000000);
    auto LLC = std::make_shared < LruClockCache < int, int >> (LLCsize,
      [ & ](int key) {
        readmiss++;
        return backingStore[key];
      },
      [ & ](int key, int value) {
        writemiss++;
        backingStore[key] = value;
      });

    const int L2size = LLCsize / 4;
    const int L1size = L2size / 4; // L1 size has to be integer power of 2 !!!

    CacheThreader < LruClockCache, int, int > cache(LLC, L1size, L2size);

    ofstream output("output_image_scaling" + std::to_string(cacheScaling) + ".ppm");
    output << "P6" << endl;
    output << imageWidth << " " << imageHeight << endl;
    output << "255" << endl;

    for (int i = 0; i < imageHeight; i++) {
      for (int j = 0; j < imageWidth; j++) {
        double cr = mapToReal(j, imageWidth, minR, maxR);
        double ci = mapToImaginary(i, imageHeight, minI, maxI);

        cache.set(i + j * imageWidth, findMandelbrot(cr, ci, maxN));
        write++;

      }
    }

    // benchmark
    {

      const int nGauss = 9;
      const int GaussianBlurKernel[nGauss] = {
        1,        2,        1,
        2,        4,        2,
        1,        2,        1
      };
      size_t nRepeats = 5;
      size_t nReadsPerIter = nGauss;
      size_t nWritesPerIter = 1;
      size_t totalLookups = nRepeats * (imageHeight - 2) * (imageWidth - 2) * (nReadsPerIter + nWritesPerIter);
      CpuBenchmarker bench(totalLookups * sizeof(int), "L1 tags = " + std::to_string(L1size) + " L2 tags=" + std::to_string(L2size) + " LLC tags=" + std::to_string(LLCsize) + " performance", totalLookups);

      // image softening (may not be accurate)
      // column-major iteration better for the L1 but doesn't matter for L2
      // "nRepeats" iterations better for benchmarking
      // tiled-computing to make even better cache-hit ratio
      for (size_t k = 0; k < nRepeats; k++) {
        for (int tiley = 0; tiley < 32; tiley++) {
          for (int tilex = 0; tilex < 32; tilex++) {
            for (int jj = 0; jj < 32; jj++) {
              for (int ii = 0; ii < 32; ii++) {
                const int i = tilex * 32 + ii;
                const int j = tiley * 32 + jj;
                if (j >= 1 && j <= imageHeight - 2 && i >= 1 && i <= imageWidth - 2) {
                  unsigned int pixelAccumulator1 = 0;
                  unsigned int pixelAccumulator2 = 0;
                  unsigned int pixelAccumulator3 = 0;

                  pixelAccumulator1 += cache.get(i - 1 + (j - 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-1 + 1) * 3];
                  pixelAccumulator2 += cache.get(i + 0 + (j - 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-1 + 1) * 3];
                  pixelAccumulator3 += cache.get(i + 1 + (j - 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-1 + 1) * 3];

                  pixelAccumulator1 += cache.get(i - 1 + (j - 0) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-0 + 1) * 3];
                  pixelAccumulator2 += cache.get(i + 0 + (j - 0) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-0 + 1) * 3];
                  pixelAccumulator3 += cache.get(i + 1 + (j - 0) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-0 + 1) * 3];

                  pixelAccumulator1 += cache.get(i - 1 + (j + 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (+1 + 1) * 3];
                  pixelAccumulator2 += cache.get(i + 0 + (j + 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (+1 + 1) * 3];
                  pixelAccumulator3 += cache.get(i + 1 + (j + 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (+1 + 1) * 3];

                  const int n = (pixelAccumulator1 + pixelAccumulator2 + pixelAccumulator3) >> 4;
                  cache.set(i + j * imageWidth, n);

                  read += 9;
                  write++;
                }
              }
            }
          }
        }
      }

      std::cout << "LLC cache hit ratio (read)=" << (read - readmiss) / (float) read << std::endl;
      std::cout << "LLC cache hit ratio (write)=" << (write - writemiss) / (float) write << std::endl;
    }
    for (int i = 0; i < imageHeight; i++) {
      for (int j = 0; j < imageWidth; j++) {

        int n = cache.get(i + j * imageWidth);

        int r = ((int) sqrt(n) % 256);
        int gr = (2 * n % 256);
        int b = (n % 256);
        write++;
        output << (char) r << (char) gr << (char) b;
      }

    }
    cout << "Finished!" << endl;

    cache.flush();
    output.flush();
  }
  return 0;
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Multi Level Cache Benchmark With Gaussian Blur Computation (Integer Key Only)

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Clone this wiki locally