#include <npp.h>

#include <ImagesCPU.h>
#include <ImagesNPP.h>
#include <ImageIO.h>
#include <Exceptions.h>

#include <iostream>

#include <assert.h>


#include <stdio.h>

#ifndef WIN32

// When WIN32 is not defined, supply stand-ins for the "_s" CRT names used in
// this file. fscanf_s is forwarded textually to the standard fscanf.
#define fscanf_s fscanf

// Stand-in for fopen_s: opens `name` with `mode` through fopen and stores the
// resulting stream in *file (a null pointer when the open fails). Unlike the
// CRT function of the same name, no error code is returned.
inline void fopen_s(FILE** file, const char* name, const char* mode)
{
    FILE* stream = fopen(name, mode);
    *file = stream;
}

#endif // !WIN32


void loadMiddleburyMRFData(const std::string &filename, int* &dataCostArray, int* &hCueTransposed, int* &vCue, int &width, int &height, int &nLabels) 
{
	FILE *fp;
	fopen_s(&fp, filename.c_str(),"rb");
    
	if( fp == 0 ) throw(new npp::Exception("File not found!"));

	fscanf_s(fp,"%d %d %d",&width,&height,&nLabels);

	int i, n, x, y;
	int gt;
	for(i = 0; i < width * height; i++)
		fscanf_s(fp,"%d",&gt);

	dataCostArray = (int*) malloc(width * height * nLabels * sizeof(int));
    n = 0;
	int v;

	for(int c=0; c < nLabels; c++) {
		for(i = 0; i < width * height; i++) {
			fscanf_s(fp,"%d",&v);
			dataCostArray[n++] = v;
		}
	}

	hCueTransposed = (int*) malloc(width * height * sizeof(int));
	vCue = (int*) malloc(width * height * sizeof(int));

	for(y = 0; y < height; y++) {
		for(x = 0; x < width-1; x++) {
			fscanf_s(fp,"%d",&v);
			hCueTransposed[x*height+y] = v;
		}
		hCueTransposed[(width-1)*height+y] = 0;
	}

	for(y = 0; y < height-1; y++) {
		for(x = 0; x < width; x++) {
			fscanf_s(fp,"%d",&v);
			vCue[y*width+x] = v;
		}
	}
	for(x = 0; x < width; x++) {
		vCue[(height-1)*width+x] = 0;
	}

	fclose(fp);

}


/**
 * Evaluates the energy of a binary labeling on the host.
 *
 * A label byte greater than zero counts as label 1, zero as label 0.
 *
 * @param E_d             out: sum of the data costs of the chosen labels
 * @param E_s             out: sum of the cues on every horizontal/vertical
 *                        neighbour pair whose labels differ
 * @param h_label         label image, one byte per pixel
 * @param step            distance in bytes between consecutive rows of
 *                        h_label (may be larger than width)
 * @param hCueTransposed  horizontal cues, indexed as [x*height + y]
 * @param vCue            vertical cues, indexed as [y*width + x]
 * @param data            data costs, indexed as [label*width*height + y*width + x]
 * @param width           image width in pixels
 * @param height          image height in pixels
 * @return E_d + E_s
 */
int computeEnergy(int &E_d, int &E_s, unsigned char* h_label, int step, int* hCueTransposed, int* vCue, int* data, int width, int height)
{
    E_d = 0;
    E_s = 0;

    for( int y=0; y<height; ++y) {
        // Every access to the label image must use the row pitch `step`,
        // not `width`: the rows may be padded.
        const int rowOffset = y*step;

        for( int x=0; x < width; ++x) {
            int myLabel = h_label[rowOffset+x] > 0 ? 1 : 0;

            // Data
            E_d += data[myLabel * width*height + y*width+x];

            // Right
            if( x < width-1 ) {
                int rightLabel = h_label[rowOffset+x+1] > 0 ? 1 : 0;
                if( rightLabel != myLabel ) {
                    E_s += hCueTransposed[x*height+y];
                }
            }

            // Bottom
            if( y < height-1 ) {
                int bottomLabel = h_label[rowOffset+step+x] > 0 ? 1 : 0;
                if( bottomLabel != myLabel ) {
                    E_s += vCue[y*width+x];
                }
            }
        }
    }

    return E_d + E_s;
}

int main(int argc, char* argv[])
{
    try
    {
        // if more than one command line arg, use the first arg as the filename,
        // otherwise assume the filename included with the sample
        std::string sFilename = "../../data/person.txt";

        if (argc >= 2)
            sFilename = argv[1];

        std::string sResultFilename = sFilename;
        
        std::string::size_type dot = sResultFilename.rfind('.');
        if (dot != std::string::npos) sResultFilename = sResultFilename.substr(0, dot);
        sResultFilename += "_segmentation.pgm";

        if (argc >= 3)
            sResultFilename = argv[2];

		// load MRF declaration
		int width, height, nLabels;
		int *hCue, *vCue, *dataCostArray;

		loadMiddleburyMRFData(sFilename, dataCostArray, hCue, vCue, width, height, nLabels);
		NPP_ASSERT(nLabels == 2);

		std::cout << "Dataset: " << sFilename << std::endl;
		std::cout << "Size: " << width << "x" << height << std::endl;

		NppiSize size;
		size.width = width;
		size.height = height;

		NppiRect roi;
		roi.x=0; roi.y=0;
		roi.width=width; roi.height=height;

		
		// Setup flow network
		int step, transposed_step;
		Npp32s *d_source, *d_sink, *d_terminals, *d_left_transposed, *d_right_transposed, *d_top, *d_bottom;

		// Setup terminal capacities
		d_source = nppiMalloc_32s_C1(width, height, &step);
		cudaMemcpy2D(d_source, step, dataCostArray, width * sizeof(int), width*sizeof(int), height, cudaMemcpyHostToDevice);
		d_sink = nppiMalloc_32s_C1(width, height, &step);
		cudaMemcpy2D(d_sink, step, &dataCostArray[width*height], width * sizeof(int), width*sizeof(int), height, cudaMemcpyHostToDevice);

		d_terminals = nppiMalloc_32s_C1(width, height, &step);
		nppiSub_32s_C1R(d_sink, step, d_source, step, d_terminals, step, size);
		

		// Setup edge capacities
		NppiSize edgeTranposedSize;
		edgeTranposedSize.width = height;
		edgeTranposedSize.height = width-1;

		NppiSize oneRowTranposedSize;
		oneRowTranposedSize.width = height;
		oneRowTranposedSize.height = 1;

		d_right_transposed = nppiMalloc_32s_C1(height, width, &transposed_step);
		cudaMemcpy2D(d_right_transposed, transposed_step, hCue, height * sizeof(int), height * sizeof(int), width, cudaMemcpyHostToDevice);
		
		d_left_transposed = nppiMalloc_32s_C1(height, width, &transposed_step);
		nppiSet_32s_C1R(0, d_left_transposed, transposed_step, oneRowTranposedSize);
		nppiCopy_32s_C1R(d_right_transposed, transposed_step, d_left_transposed + transposed_step/sizeof(int), transposed_step, edgeTranposedSize);

		NppiSize edgeSize;
		edgeSize.width = width;
		edgeSize.height = height-1;

		NppiSize oneRowSize;
		oneRowSize.width = width;
		oneRowSize.height = 1;

		d_bottom = nppiMalloc_32s_C1(width, height, &step);
		cudaMemcpy2D(d_bottom, step, vCue, width * sizeof(int), width*sizeof(int), height, cudaMemcpyHostToDevice);
		
		d_top = nppiMalloc_32s_C1(width, height, &step);
		nppiSet_32s_C1R(0, d_top, step, oneRowSize);
		nppiCopy_32s_C1R(d_bottom, step, d_top + step/sizeof(int), step, edgeSize);
		
		// Allocate temp storage for graphcut computation
		Npp8u* pBuffer;
		int bufferSize;
		nppiGraphcutGetSize(size, &bufferSize);
		cudaMalloc(&pBuffer, bufferSize);
		
		// Allocate label storage
		npp::ImageNPP_8u_C1 oDeviceDst(width, height);

		cudaEvent_t start, stop;
		cudaEventCreate(&start); cudaEventCreate(&stop);

		// Compute the graphcut
		cudaEventRecord(start,0);
		nppiGraphcut_32s8u(d_terminals, d_left_transposed, d_right_transposed, d_top, d_bottom, step, transposed_step, size, oDeviceDst.data(),
			oDeviceDst.pitch(), pBuffer);
		cudaEventRecord(stop,0);
		cudaEventSynchronize(stop);

		float time;
		cudaEventElapsedTime(&time, start, stop);
		std::cout << "Elapsed Time: " << time << " ms" << std::endl;

        // declare a host image object for an 8-bit grayscale image
        npp::ImageCPU_8u_C1 oHostDst(width, height);		
		// and copy the labeling result
		oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch());

		int E_d, E_s;
		std::cout << "Graphcut Cost: " << computeEnergy(E_d, E_s, oHostDst.data(), oHostDst.pitch(), hCue, vCue, dataCostArray, width, height ) << std::endl;
		std::cout << "(E_d = " << E_d << ", E_s = " << E_s << ")" << std::endl; 

		std::cout << "Saving segmtation result as " << sResultFilename << std::endl;
		saveImage(sResultFilename, oHostDst);

		cudaFree(pBuffer);
		cudaFree(d_top);
		cudaFree(d_bottom);
		cudaFree(d_left_transposed);
		cudaFree(d_right_transposed);
		cudaFree(d_source);
		cudaFree(d_sink);
		cudaFree(d_terminals);
    }
    catch (npp::Exception & rException)
    {
        std::cerr << "Program error! The following exception occurred: \n";
        std::cerr << rException << std::endl;
        std::cerr << "Aborting." << std::endl;
        
        return -1;
    }
    catch (...)
    {
        std::cerr << "Program error! An unknow type of exception occurred. \n";
        std::cerr << "Aborting." << std::endl;
        
        return -1;
    }
    
    return 0;
}
