Here's a little demo which computes the average pixel value of an OpenGL rendering. As a sanity check I compute the average value on the cpu-side using glReadPixels
and then compare to computing the average value using a "mip-map" style, ping-pong, texture-buffer GPGPU reduction. Finally I render the buffers for fun.
I'm using yimg just to read in a .png file to render as an example.
On my weak little laptop, the GPU code is about 30 times faster. Can't shrug at that! Rookie mistake. Made timings in debug mode. In release there's hardly a speed up : - (
I was careful to compute the average in a coherent way (the images get progressively blurred out, rather than averaging the image quadrants recursively). This would be useful if, say, computing the average of 100 renderings. You could render them into a 10s x 10s texture and run the GPU reduction until the result is just a 10x10 image containing the 100 results. That'd only require a single final call to glReadPixels
(rather than calling glReadPixels
to read the final single pixel result of each reduction).
#include <YImage.hpp>
#include <YImage.cpp>
#include <GLUT/glut.h>
#include <iostream>
#include <string>
#include <iomanip>
// Size of image rounded up to next power of 2
size_t s;
// Shader program for doing reduction
GLuint reduction_prog_id;
// Need two textures and buffers to ping-pong
GLuint tex_id[] = {0,0};
GLuint fbo_id[] = {0,0};
GLuint dfbo_id[] = {0,0};
// Image (something to render in the first place)
std::string filename("hockney-512.png");
YImage yimg;
void init_render_to_texture(
const size_t width,
const size_t height,
GLuint & tex_id,
GLuint & fbo_id,
GLuint & dfbo_id)
{
using namespace std;
// Delete if already exists
glDeleteTextures(1,&tex_id);
glDeleteFramebuffersEXT(1,&fbo_id);
glDeleteFramebuffersEXT(1,&dfbo_id);
// http://www.opengl.org/wiki/Framebuffer_Object_Examples#Quick_example.2C_render_to_texture_.282D.29
glGenTextures(1, &tex_id);
glBindTexture(GL_TEXTURE_2D, tex_id);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
//NULL means reserve texture memory, but texels are undefined
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F_ARB, width, height, 0, GL_BGRA, GL_FLOAT, NULL);
glBindTexture(GL_TEXTURE_2D, 0);
glGenFramebuffersEXT(1, &fbo_id);
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo_id);
//Attach 2D texture to this FBO
glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_TEXTURE_2D, tex_id, 0);
glGenRenderbuffersEXT(1, &dfbo_id);
glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, dfbo_id);
glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_DEPTH_COMPONENT24, width, height);
//Attach depth buffer to FBO (for this example it's not really needed, but if
//drawing a 3D scene it would be necessary to attach something)
glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, GL_DEPTH_ATTACHMENT_EXT, GL_RENDERBUFFER_EXT, dfbo_id);
//Does the GPU support current FBO configuration?
GLenum status;
status = glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT);
assert(status == GL_FRAMEBUFFER_COMPLETE_EXT);
// Unbind to clean up
glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, 0);
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);
}
int main(int argc, char * argv[])
{
glutInit(&argc,argv);
if(argc>1)
{
filename = argv[1];
}
yimg.load(filename.c_str());
s = std::max(yimg.width(),yimg.height());
// http://stackoverflow.com/a/466278/148668
s--;
s |= s >> 1;
s |= s >> 2;
s |= s >> 4;
s |= s >> 8;
s |= s >> 16;
s++;
glutInitDisplayString("rgba depth double stencil");
glutInitWindowSize(2*s,s);
glutCreateWindow("texture-reduction");
glutDisplayFunc(
[]()
{
using namespace std;
// Initialize **both** buffers and set to render into first
for(int i = 1;i>=0;i--)
{
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo_id[i]);
glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, dfbo_id[i]);
glClearColor(0,0,0,1);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
}
// Render something
yimg.flip();
glMatrixMode(GL_PROJECTION);
glPushMatrix();
glLoadIdentity();
glOrtho(0,s,0,s, -10000,10000);
glViewport(0,0,s,s);
glRasterPos2f(0,0);
glDrawPixels(yimg.width(), yimg.height(), GL_RGBA, GL_UNSIGNED_BYTE, yimg.data());
glPopMatrix();
// Even the cpu code should use a buffer (rather than the screen)
GLfloat * rgb = new GLfloat[yimg.width() * yimg.height() * 3];
glReadPixels(0, 0, yimg.width(), yimg.height(), GL_RGB, GL_FLOAT, rgb);
// Gather into double: sequential add is prone to numerical error
double avg[] = {0,0,0};
for(size_t i = 0;i<yimg.width()*yimg.height()*3;i+=3)
{
avg[0] += rgb[i + 0];
avg[1] += rgb[i + 1];
avg[2] += rgb[i + 2];
}
for_each(avg,avg+3,[](double & c){c/=yimg.width()*yimg.height();});
delete[] rgb;
// Size of square being rendered
assert(((s != 0) && ((s & (~s + 1)) == s)) && "s should be power of 2");
size_t h = s/2;
// odd or even ping-pong iteration
int odd = 1;
// Tell shader about texel step size
glUseProgram(reduction_prog_id);
glUniform1f(glGetUniformLocation(reduction_prog_id,"dt"),0.5f/float(s));
glEnable(GL_TEXTURE_2D);
while(h)
{
// Select which texture to draw/compute into
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo_id[odd]);
glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, dfbo_id[odd]);
// Select which texture to draw/compute from
glBindTexture(GL_TEXTURE_2D,tex_id[1-odd]);
// Scale to smaller square
glViewport(0,0,h,h);
const float f = 2.*(float)h/(float)s;
// Draw quad filling viewport with shrinking texture coordinates
glBegin(GL_QUADS);
glTexCoord2f(0,0);
glVertex2f (-1,-1);
glTexCoord2f(f,0);
glVertex2f (1,-1);
glTexCoord2f(f,f);
glVertex2f (1,1);
glTexCoord2f(0,f);
glVertex2f (-1,1);
glEnd();
// ping-pong
odd = 1-odd;
h = h/2;
}
// Read corner pixel of last render
float px[3];
glReadPixels(0, 0, 1, 1, GL_RGB, GL_FLOAT, px);
// Correct for size not power of 2
for_each(px,px+3,[](float& c){c*=(float)s*s/yimg.width()/yimg.height();});
cout<<" gpu: "<< px[0]<<" "<< px[1]<<" "<< px[2]<<" "<<endl;
cout<<" cpu: "<<avg[0]<<" "<<avg[1]<<" "<<avg[2]<<" "<<endl;
// Purely for vanity, draw the buffers
glUseProgram(0);
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT,0);
glBindRenderbufferEXT(GL_RENDERBUFFER_EXT,0);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
for(odd = 0;odd<2;odd++)
{
glViewport(odd*s,0,s,s);
glEnable(GL_TEXTURE_2D);
glBindTexture(GL_TEXTURE_2D,tex_id[odd]);
glBegin(GL_QUADS);
glTexCoord2f(0,0);
glVertex2f (-1,-1);
glTexCoord2f(1,0);
glVertex2f (1,-1);
glTexCoord2f(1,1);
glVertex2f (1,1);
glTexCoord2f(0,1);
glVertex2f (-1,1);
glEnd();
}
glutSwapBuffers();
}
);
init_render_to_texture(s,s,tex_id[0],fbo_id[0],dfbo_id[0]);
init_render_to_texture(s,s,tex_id[1],fbo_id[1],dfbo_id[1]);
// Vertex shader is a **true** pass-through
const std::string vertex_shader = R"(
#version 120
void main()
{
gl_Position = gl_Vertex;
gl_TexCoord[0] = gl_MultiTexCoord0;
}
)";
// fragment shader sums texture of this pixel and left/bottom neighbors
const std::string fragment_shader = R"(
#version 120
// size of a half-texel in 1/pixels units: 0.5/(full size)
uniform float dt;
uniform sampler2D texture;
void main()
{
gl_FragColor = 0.25*(
texture2D(texture, gl_TexCoord[0].st - vec2( 0, 0)) +
texture2D(texture, gl_TexCoord[0].st - vec2(dt, 0)) +
texture2D(texture, gl_TexCoord[0].st - vec2(dt,dt)) +
texture2D(texture, gl_TexCoord[0].st - vec2( 0,dt)));
}
)";
// Compile and link reduction shaders into program
const auto & compile_shader = [](const GLint type, const char * str)->GLuint
{
GLuint id = glCreateShader(type);
glShaderSource(id,1,&str,NULL);
glCompileShader(id);
return id;
};
GLuint vid = compile_shader(GL_VERTEX_SHADER,vertex_shader.c_str());
GLuint fid = compile_shader(GL_FRAGMENT_SHADER,fragment_shader.c_str());
reduction_prog_id = glCreateProgram();
glAttachShader(reduction_prog_id,vid);
glAttachShader(reduction_prog_id,fid);
glLinkProgram(reduction_prog_id);
glutMainLoop();
}