Canny Edge Detection Image Processing Algorithm Acceleration using OpenCL on DE1 FPGA-SoC

Click here to download My B.Sc. Thesis

I have implemented a real-time application for canny edge detection algorithm by OpenCL on FPGA-SoC platform as my B.Sc. thesis. Thanks to the advances in Heterogeneous Computing, we can accelerate applications which use both of the parallel and sequential operations by OpenCL framework. FPGA-SoC is one of the best platforms for implementing these algorithms by OpenCL, because they utilize Linux operating system and FPGA at the same time for sequential and parallel operations respectively.

canny.cpp



#include 
#include 
#include 
#include "CL/opencl.h"
#include "iostream"
#include 
#include 
#include "AOCLUtils/aocl_utils.h"
#include "AOCLUtils/bmpfuncs.h"


using namespace aocl_utils;



cl_context context = NULL;
cl_int errNum;
static cl_device_id  deviceID;    

inline void 
checkErr(cl_int err, const char * name)
{
    if (err != CL_SUCCESS) {
        std::cerr << "ERROR: " <<  name  << " (" << err << ")" << std::endl;
        exit(EXIT_FAILURE);
    }
}

void CL_CALLBACK contextCallback(
	const char * errInfo,
	const void * private_info,
	size_t cb,
	void * user_data)
{
	std::cout << "Error occured during context use: " << errInfo << std::endl;
	// should really perform any clearup and so on at this point
	// but for simplicitly just exit.
	exit(1);
}


cl_context CreateContext()
{	

	int platform_id = 0;

    cl_int errNum;
    cl_uint numPlatforms;
    cl_platform_id firstPlatformId[1];
    cl_context context = NULL;
		
    // First, select an OpenCL platform to run on.  For this example, we
    // simply choose the first available platform.  Normally, you would
    // query for all available platforms and select the most appropriate one.
    errNum = clGetPlatformIDs(1, firstPlatformId, &numPlatforms);
    if (errNum != CL_SUCCESS || numPlatforms <= 0)
    {
        std::cerr << "Failed to find any OpenCL platforms." << std::endl;
        return NULL;
    }

	cl_int err;
	size_t size;
	err = clGetPlatformInfo(firstPlatformId[platform_id], CL_PLATFORM_NAME, 0, NULL, &size);
	char * name = (char *)alloca(sizeof(char) * size);
	err = clGetPlatformInfo(firstPlatformId[platform_id], CL_PLATFORM_NAME, size, name, NULL);
	err = clGetPlatformInfo(firstPlatformId[platform_id], CL_PLATFORM_VENDOR, 0, NULL, &size);
	char * vname = (char *)alloca(sizeof(char) * size);
	err = clGetPlatformInfo(firstPlatformId[platform_id], CL_PLATFORM_VENDOR, size, vname, NULL);
	std::cout << "Platform name: " << name << std::endl
	<< "Vendor name : " << vname << std::endl;

    // Next, create an OpenCL context on the platform.  Attempt to
    // create a GPU-based context, and if that fails, try to create
    // a CPU-based context.
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)firstPlatformId[platform_id],
        0
    };
	context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_ALL,
                                      NULL, NULL, &errNum);
	if (errNum == CL_SUCCESS){
		std::cout<< "GPU Platform Successfully Selected!"<< std::endl;
	}
	else 
    {
        std::cerr << "Failed to create an context because not finding platform." << std::endl;
        return NULL;
    }


    return context;
}


cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)
{
    cl_int errNum;
    cl_device_id *devices;
    cl_command_queue commandQueue = NULL;
    size_t deviceBufferSize = -1;

    // First get the size of the devices buffer
    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Failed call to clGetContextInfo(...,CL_CONTEXT_DEVICES,...)";
        return NULL;
    }

    if (deviceBufferSize <= 0)
    {
        std::cerr << "No devices available.";
        return NULL;
    }

    // Allocate memory for the devices buffer
    devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
    if (errNum != CL_SUCCESS)
    {
        delete [] devices;
        std::cerr << "Failed to get device IDs";
        return NULL;
    }

    // In this example, we just choose the first available device.  In a
    // real program, you would likely use all available devices or choose
    // the highest performance device based on OpenCL device queries
    commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
    if (commandQueue == NULL)
    {
        delete [] devices;
        std::cerr << "Failed to create commandQueue for device 0";
        return NULL;
    }

    *device = devices[0];
    delete [] devices;
    return commandQueue;
}

#define MY_WORK_GROUP_SIZE 16

cl_kernel CreateKernel(char* kernel_name) {

	std::string binary_file = getBoardBinaryFile("canny", deviceID);
  printf("Using AOCX: %s\n", binary_file.c_str());
  cl_program program = createProgramFromBinary(context, binary_file.c_str(), &deviceID, 1);

  // Build the program that was just created.
  cl_int status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
  checkError(status, "Failed to build program");

  // Create the kernel - name passed in here must match kernel name in the
  // original CL file, that was compiled into an AOCX file using the AOC tool  
  cl_kernel kernel = clCreateKernel(program, kernel_name, &status);
  checkError(status, "Failed to create kernel");

	// Now al
	return kernel;	
};

// you can change lowThresh and highThresh of canny algorithm in hysteresis.cl file
//	float lowThresh = 10;
//	float highThresh = 70;

cl_mem inputSignalBuffer;
cl_mem outputSignalBuffer;	
cl_uint numPlatforms;
cl_uint numDevices;
cl_platform_id * platformIDs;	
cl_command_queue queue;		
cl_mem thetaSignalBuffer;
cl_mem maskBuffer;

// using bmp file reader lib
int rows;
int cols;
unsigned char *input_bitmap;

unsigned char *output_bitmap;

void cleanup() {
  
}


int main(int argc, char** argv)
{

	input_bitmap = readImage("E:/Projects_and_Programms/OpenCL/OpenCL_for_FPGA/4_Canny_with_Emulation_True_BMP_Lib/hello_world/bin/input.bmp", &cols,
      &rows);
	output_bitmap = (uchar*)malloc(rows*cols);

	storeImage(input_bitmap, "output_in_mirro.bmp", rows, cols, "input.bmp");
	

    std::ofstream err_out("erros.txt");
    std::streambuf *coutbuf = std::cerr.rdbuf(); //save old buf
    std::cerr.rdbuf(err_out.rdbuf()); //redirect std::cout to out.tx!t


	context = CreateContext();
			if (context == NULL)
	{
		std::cerr << "Failed to create OpenCL context." << std::endl;
		return 1;
	}

	queue = CreateCommandQueue(context, &deviceID);
	if (queue == NULL)
	{
		std::cerr<< "Failed to create OpenCL Command Queue"<< std::endl;
		return 1;
	}	
	// Now allocate buffers

	cl_kernel gaussian_kernel = CreateKernel("gaussian");
	cl_kernel sobel_kernel = CreateKernel("sobel");
	cl_kernel non_max_sup_kernel = CreateKernel("non_max_supression");
	cl_kernel hysteresis_kernel = CreateKernel("hysteresis");

		
	
		inputSignalBuffer = clCreateBuffer(
			context,
			CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR| CL_MEM_COPY_HOST_PTR,
			rows * cols  ,
			static_cast(input_bitmap),
			&errNum);
	
		outputSignalBuffer = clCreateBuffer(
			context,
			CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
			rows * cols  ,
			NULL,
			&errNum);
		checkErr(errNum, "clCreateBuffer(outputSignal)");

		// chon dar theta ham neveshte mishavad ham khande mishavad, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR tarif mishavad
		// dar theta dar stage2, kernele soble.cl, neveshte mishavad va dar stage2, non_max_supp.cl az an khande mishavad

		thetaSignalBuffer= clCreateBuffer( 
			context,
			CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
			rows * cols  ,
			NULL,
			&errNum);
		checkErr(errNum, "clCreateBuffer(outputSignal)");

		//////////////////////////////////// Gaussian Filter //////////////////////////////


		errNum  = clSetKernelArg(gaussian_kernel, 0, sizeof(cl_mem), &inputSignalBuffer);	
		errNum |= clSetKernelArg(gaussian_kernel, 1, sizeof(cl_mem), &outputSignalBuffer);
		errNum |= clSetKernelArg(gaussian_kernel, 2, sizeof(cl_uint), &rows);
		errNum |= clSetKernelArg(gaussian_kernel, 3, sizeof(cl_uint), &cols);	
		checkErr(errNum, "clSetKernelArg");

		size_t localWorkSize[2] = { 16, 16 };
		size_t globalWorkSize[2] =  { rows,
									  cols };

		// Queue the kernel up for execution across the array
		errNum = clEnqueueNDRangeKernel(
			queue, 
			gaussian_kernel, 
			2, 
			NULL,
			globalWorkSize, 
			localWorkSize,
			0, 
			NULL, 
			NULL);
		checkErr(errNum, "clEnqueueNDRangeKernel");

		//////////////////////////////////////////////////////// Sobel Filter ////////////////////	

		
		errNum  = clSetKernelArg(sobel_kernel, 0, sizeof(cl_mem), &outputSignalBuffer);	
		errNum |= clSetKernelArg(sobel_kernel, 1, sizeof(cl_mem), &inputSignalBuffer);
		errNum |= clSetKernelArg(sobel_kernel, 2, sizeof(cl_mem), &thetaSignalBuffer);
		errNum |= clSetKernelArg(sobel_kernel, 3, sizeof(cl_uint), &rows);
		errNum |= clSetKernelArg(sobel_kernel, 4, sizeof(cl_uint), &cols);	
		checkErr(errNum, "clSetKernelArg");

		errNum = clEnqueueNDRangeKernel(
			queue, 
			sobel_kernel, 
			2, 
			NULL,
			globalWorkSize, 
			localWorkSize,
			0, 
			NULL, 
			NULL);
		checkErr(errNum, "clEnqueueNDRangeKernel");

		////////////////////////////////// Non-maximum Supression //////////////////////////


		errNum  = clSetKernelArg(non_max_sup_kernel, 0, sizeof(cl_mem), &inputSignalBuffer);	
		errNum |= clSetKernelArg(non_max_sup_kernel, 1, sizeof(cl_mem), &outputSignalBuffer);
		errNum |= clSetKernelArg(non_max_sup_kernel, 2, sizeof(cl_mem), &thetaSignalBuffer);
		errNum |= clSetKernelArg(non_max_sup_kernel, 3, sizeof(cl_uint), &rows);
		errNum |= clSetKernelArg(non_max_sup_kernel, 4, sizeof(cl_uint), &cols);	
		checkErr(errNum, "clSetKernelArg");

		errNum = clEnqueueNDRangeKernel(
			queue, 
			non_max_sup_kernel, 
			2, 
			NULL,
			globalWorkSize, 
			localWorkSize,
			0, 
			NULL, 
			NULL);
		checkErr(errNum, "clEnqueueNDRangeKernel");
		////////////////////////////////// Hysteresis //////////////////////////


		errNum  = clSetKernelArg(hysteresis_kernel, 0, sizeof(cl_mem), &outputSignalBuffer);	
		errNum |= clSetKernelArg(hysteresis_kernel, 1, sizeof(cl_mem), &inputSignalBuffer);	
		errNum |= clSetKernelArg(hysteresis_kernel, 2, sizeof(cl_uint), &rows);
		errNum |= clSetKernelArg(hysteresis_kernel, 3, sizeof(cl_uint), &cols);	
		checkErr(errNum, "clSetKernelArg");

		errNum = clEnqueueNDRangeKernel(
			queue, 
			hysteresis_kernel, 
			2, 
			NULL,
			globalWorkSize, 
			localWorkSize,
			0, 
			NULL, 
			NULL);
		checkErr(errNum, "clEnqueueNDRangeKernel");


		//////////////////////////////// Reading final result from output //////////////////////////


		errNum = clEnqueueReadBuffer(
			queue, 
			inputSignalBuffer, 
			CL_TRUE,
			0, 
			rows * cols  , 
			output_bitmap,
			0, 
			NULL, 
			NULL);
		checkErr(errNum, "clEnqueueReadBuffer");


	std::ofstream output_file("output_bitmap.txt");
	std::ofstream intput_file("intput_bitmap.txt");
	for(int i=0; i