-
Notifications
You must be signed in to change notification settings - Fork 6
Multi Level Cache Benchmark With Gaussian Blur Computation (Integer Key Only)
Hüseyin Tuğrul BÜYÜKIŞIK edited this page Oct 8, 2021
·
10 revisions
#include <iostream>
#include <fstream>
#include <math.h>
#include "LruClockCache.h"
#include "DirectMappedCache.h"
#include "CacheThreader.h"
#include "CpuBenchmarker.h"
int findMandelbrot(double cr, double ci, int max_iterations) {
int i = 0;
double zr = 0.0, zi = 0.0;
while (i < max_iterations && zr * zr + zi * zi < 4.0) {
double temp = zr * zr - zi * zi + cr;
zi = 2.0 * zr * zi + ci;
zr = temp;
++i;
}
return i;
}
double mapToReal(int x, int imageWidth, double minR, double maxR) {
double range = maxR - minR;
return x * (range / imageWidth) + minR;
}
double mapToImaginary(int y, int imageHeight, double minI, double maxI) {
double range = maxI - minI;
return y * (range / imageHeight) + minI;
}
int main() {
// mandelbrot generation + (gaussian blur X5) using a multi-level cache
using namespace std;
int imageWidth, imageHeight, maxN;
double minR, maxR, minI, maxI;
imageWidth = 1024;
imageHeight = 1024;
maxN = 512;
minR = -1.5;
maxR = 0.7;
minI = -1.0;
maxI = 1.0;
size_t cacheScaling = 1;
for (int cacheScalingIter = 0; cacheScalingIter < 8; cacheScalingIter++) {
cacheScaling *= 2;
size_t readmiss = 0;
size_t writemiss = 0;
size_t read = 0;
size_t write = 0;
const int LLCsize = 1024 * 128 * cacheScaling;
std::vector < int > backingStore(5000000);
auto LLC = std::make_shared < LruClockCache < int, int >> (LLCsize,
[ & ](int key) {
readmiss++;
return backingStore[key];
},
[ & ](int key, int value) {
writemiss++;
backingStore[key] = value;
});
const int L2size = LLCsize / 4;
const int L1size = L2size / 4; // L1 size has to be integer power of 2 !!!
CacheThreader < LruClockCache, int, int > cache(LLC, L1size, L2size);
ofstream output("output_image_scaling" + std::to_string(cacheScaling) + ".ppm");
output << "P6" << endl;
output << imageWidth << " " << imageHeight << endl;
output << "255" << endl;
for (int i = 0; i < imageHeight; i++) {
for (int j = 0; j < imageWidth; j++) {
double cr = mapToReal(j, imageWidth, minR, maxR);
double ci = mapToImaginary(i, imageHeight, minI, maxI);
cache.set(i + j * imageWidth, findMandelbrot(cr, ci, maxN));
write++;
}
}
// benchmark
{
const int nGauss = 9;
const int GaussianBlurKernel[nGauss] = {
1, 2, 1,
2, 4, 2,
1, 2, 1
};
size_t nRepeats = 5;
size_t nReadsPerIter = nGauss;
size_t nWritesPerIter = 1;
size_t totalLookups = nRepeats * (imageHeight - 2) * (imageWidth - 2) * (nReadsPerIter + nWritesPerIter);
CpuBenchmarker bench(totalLookups * sizeof(int), "L1 tags = " + std::to_string(L1size) + " L2 tags=" + std::to_string(L2size) + " LLC tags=" + std::to_string(LLCsize) + " performance", totalLookups);
// image softening (may not be accurate)
// column-major iteration better for the L1 but doesn't matter for L2
// "nRepeats" iterations better for benchmarking
// tiled-computing to make even better cache-hit ratio
for (size_t k = 0; k < nRepeats; k++) {
for (int tiley = 0; tiley < 32; tiley++) {
for (int tilex = 0; tilex < 32; tilex++) {
for (int jj = 0; jj < 32; jj++) {
for (int ii = 0; ii < 32; ii++) {
const int i = tilex * 32 + ii;
const int j = tiley * 32 + jj;
if (j >= 1 && j <= imageHeight - 2 && i >= 1 && i <= imageWidth - 2) {
unsigned int pixelAccumulator1 = 0;
unsigned int pixelAccumulator2 = 0;
unsigned int pixelAccumulator3 = 0;
pixelAccumulator1 += cache.get(i - 1 + (j - 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-1 + 1) * 3];
pixelAccumulator2 += cache.get(i + 0 + (j - 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-1 + 1) * 3];
pixelAccumulator3 += cache.get(i + 1 + (j - 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-1 + 1) * 3];
pixelAccumulator1 += cache.get(i - 1 + (j - 0) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-0 + 1) * 3];
pixelAccumulator2 += cache.get(i + 0 + (j - 0) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-0 + 1) * 3];
pixelAccumulator3 += cache.get(i + 1 + (j - 0) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-0 + 1) * 3];
pixelAccumulator1 += cache.get(i - 1 + (j + 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (+1 + 1) * 3];
pixelAccumulator2 += cache.get(i + 0 + (j + 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (+1 + 1) * 3];
pixelAccumulator3 += cache.get(i + 1 + (j + 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (+1 + 1) * 3];
const int n = (pixelAccumulator1 + pixelAccumulator2 + pixelAccumulator3) >> 4;
cache.set(i + j * imageWidth, n);
read += 9;
write++;
}
}
}
}
}
}
std::cout << "LLC cache hit ratio (read)=" << (read - readmiss) / (float) read << std::endl;
std::cout << "LLC cache hit ratio (write)=" << (write - writemiss) / (float) write << std::endl;
}
for (int i = 0; i < imageHeight; i++) {
for (int j = 0; j < imageWidth; j++) {
int n = cache.get(i + j * imageWidth);
int r = ((int) sqrt(n) % 256);
int gr = (2 * n % 256);
int b = (n % 256);
write++;
output << (char) r << (char) gr << (char) b;
}
}
cout << "Finished!" << endl;
cache.flush();
output.flush();
}
return 0;
}