// (c) MX^Add

#include "PostEfx.h"
#include "Renderer/FrameBuffer.h"
#include "BaseTypes/PicoIntrinsics.h"
#include "Renderer/Rasterizers.h"

#ifdef PI_PICO_TARGET
#include <hardware/interp.h> // Interpolators
#include <pico/time.h>
#undef MIN
#undef MAX
#else
#ifndef PICO_DO_VGA
#define PICO_DO_VGA 0
#endif
#endif

#if PICO_DO_VGA
// Swaps the 5-bit red and blue fields of two RGB565 pixels packed in one
// 32-bit word; the 6-bit green fields stay where they are.
__attribute__((always_inline)) static inline uint32 __SwapRB(uint32 v)
{
	const uint32 RedAsBlue = (v >> 11) & 0x001F001Fu; // bits 11..15 / 27..31 -> 0..4 / 16..20
	const uint32 Green     =  v        & 0x07E007E0u; // unchanged
	const uint32 BlueAsRed = (v << 11) & 0xF800F800u; // bits 0..4 / 16..20 -> 11..15 / 27..31

	return RedAsBlue | Green | BlueAsRed;
}

// Output conversion applied to every pixel pair: R/B swap when PICO_DO_VGA is set...
#define CNDREV(x) __SwapRB(x)
#else
// ...otherwise __REV16 on the packed pair.
#define CNDREV(x) __REV16(x)
#endif

// Single-pixel RGB565 field masks (red 15..11, green 10..5, blue 4..0).
static constexpr uint32 BlueMask    = uint32(0x1F);
static constexpr uint32 GreenMask   = uint32(0x3F) << 5;
static constexpr uint32 RedMask     = uint32(0x1F) << 11;

// Masks for two pixels packed in one 32-bit word.
// MaskRB / MaskG : a 5-bit / 6-bit field shifted down to bit 0 of each 16-bit half.
// MaskRBS / MaskGS: the red+blue / green fields in their native ("source") position.
// MaskRBD / MaskGD: a 5-bit field at bit 11 / a 6-bit field at bit 5 of each half.
static constexpr uint32 MaskRB      = BlueMask | (BlueMask << 16);
static constexpr uint32 MaskG       = (GreenMask >> 5) | ((GreenMask >> 5) << 16);
static constexpr uint32 MaskRBS     = (RedMask | BlueMask) | ((RedMask | BlueMask) << 16);
static constexpr uint32 MaskGS      = GreenMask | (GreenMask << 16);
static constexpr uint32 MaskRBD     = MaskRB << 11;
static constexpr uint32 MaskGD      = MaskG << 5;

// Pixel value treated as transparent by the blitters (full red + full blue, no green).
static constexpr uint16 RGBColorKey = uint16(RedMask | BlueMask);

// Applies the selected post effect to one slice of the frame and writes the
// result, passed through CNDREV, to the output buffer.
//
// PostEfx     - packed request: bits 0..7 intensity, bits 8..15 EPostEffect id,
//               bits 16..31 effect data (the RGB565 target color for FadeToColor).
// FrontBuffer - output buffer; when null the rasterizer framebuffer is converted
//               in place (Blur / BlurFade need a separate front buffer and are
//               downgraded to None without one).
// PtrOffset   - start of the slice, in 32-bit words (two pixels per word).
//
// Each call processes (RasterizerSizeX * RasterizerSizeY) >> 2 words, i.e. half
// of a frame of 16-bit pixels.
static void __not_in_flash_func(DoPostEfxInternal)(uint32 PostEfx, uint16* FrontBuffer, uint32 PtrOffset)
{
	__restrict uint32 *g = (uint32*)RasterizerFramebuffer;
	__restrict uint32 *h = (uint32*)(FrontBuffer ? FrontBuffer : RasterizerFramebuffer);
	g += PtrOffset;
	h += PtrOffset;

	EPostEffect  Effect       = EPostEffect((PostEfx >> 8) & 0xFF);
	const uint8  EfxIntensity = uint8(PostEfx & 0xFF);
	const uint16 EfxData      = uint16(PostEfx >> 16);

	// Number of 32-bit words handled by this call (loop invariant).
	const uint32 WordCount    = (RasterizerSizeX * RasterizerSizeY) >> 2;

	// The blur effects read the previous output frame, so they need a front buffer.
	if (((Effect == EPostEffect::Blur || Effect == EPostEffect::BlurFade) && !FrontBuffer))
		Effect = EPostEffect::None;

	// Zero-intensity fades degenerate to the cheaper variant.
	if (Effect == EPostEffect::FadeToBlack && EfxIntensity == 0)
		Effect = EPostEffect::None;

	if (Effect == EPostEffect::BlurFade && EfxIntensity == 0)
		Effect = EPostEffect::Blur;

	// Do the effects

	switch (Effect)
	{
		case EPostEffect::Blur: // Around 2.1ms on device...
			{
				for (uint32 i = 0; i < WordCount; i++, g++, h++)
				{
					// D: freshly rendered pixels, S: previous output converted back.
					uint32 D = *g;
					uint32 S = CNDREV(*h);

					// Per-channel average: red/blue via the halving SIMD add, green via a plain add + shift.
					*h = CNDREV((__UHADD16(S & MaskRBS, D & MaskRBS) & MaskRBS) | ((((S & MaskGS) + (D & MaskGS)) >> 1) & MaskGS));
				}
			}
		break;

		case EPostEffect::BlurFade: // Around 4.0ms on device...
			{
				// Brightness factors out of 32 (red/blue) and 64 (green), clamped below full scale.
				uint32 FRB  = uint32((0xFF-EfxIntensity) >> 3); if (FRB > 0x1E) FRB = 0x1E;
				uint32 FG   = uint32((0xFF-EfxIntensity) >> 2); if (FG  > 0x3E) FG  = 0x3E;
				uint32 FRBS = FRB << 6; // pre-shifted so the red product lands at bit 11
				uint32 FGS  = FG >> 1;  // out of 32 so the green product lands at bit 5

				for (uint32 i = 0; i < WordCount; i++, g++, h++)
				{
					uint32 D = *g;
					uint32 S = CNDREV(*h);
					uint32 R = (__UHADD16(S & MaskRBS, D & MaskRBS) & MaskRBS) | ((((S & MaskGS) + (D & MaskGS)) >> 1) & MaskGS);

					R = ((((  R        & MaskRB) * FRB) >> 5) & MaskRB)   |
					    ((((((R >> 5)  & MaskG ) * FGS)     ) & MaskGD )) |
					    ((((((R >> 11) & MaskRB) * FRBS)    ) & MaskRBD)) ;

					*h = CNDREV(R);
				}
			}
		break;

		case EPostEffect::FadeToBlack: // Around 2.7ms on device...
			{
				uint32 FRB  = uint32((0xFF-EfxIntensity) >> 3); if (FRB > 0x1E) FRB = 0x1E;
				uint32 FG   = uint32((0xFF-EfxIntensity) >> 2); if (FG  > 0x3E) FG  = 0x3E;
				uint32 FRBS = FRB << 6;
				uint32 FGS  = FG >> 1;

				for (uint32 i = 0; i < WordCount; i++, g++, h++)
				{
					uint32 v = *g;

					uint32 R = ((((  v        & MaskRB) * FRB) >> 5) & MaskRB)   |
					           ((((((v >> 5)  & MaskG ) * FGS)     ) & MaskGD )) |
					           ((((((v >> 11) & MaskRB) * FRBS)    ) & MaskRBD)) ;

					*h = CNDREV(R);
				}
			}
		break;

		case EPostEffect::FadeToColor: // Around 5.4ms on device (measured with the previous signed-SIMD blend)...
			{
				// Blend weights of the target color, out of 32 (red/blue) and 64 (green).
				uint32 FRB = uint32(EfxIntensity >> 3); if (FRB > 0x1E) FRB = 0x1E;
				uint32 FG  = uint32(EfxIntensity >> 2); if (FG  > 0x3E) FG  = 0x3E;

				// Target color channels, duplicated into both 16-bit halves.
				const uint32 DR   = EfxData >> 11;
				const uint32 DG   =(EfxData >> 5) & 0x3F;
				const uint32 DB   = EfxData       & 0x1F;
				const uint32 PDR  = DR | (DR << 16);
				const uint32 PDG  = DG | (DG << 16);
				const uint32 PDB  = DB | (DB << 16);

				// The blend is evaluated as (s * (N - F) + D * F) >> log2(N), which equals
				// s + (((D - s) * F) >> log2(N)) but never goes negative. The signed form
				// overflowed its bit field whenever the target channel was darker than
				// the pixel and set the lowest bit of the neighbouring field/pixel.
				// Each per-half sum stays below 32*32 (red/blue) or 64*64 (green), so
				// nothing carries between the two packed pixels.
				const uint32 IFRB = 32 - FRB;
				const uint32 IFG  = 64 - FG;
				const uint32 TR   = PDR * FRB;
				const uint32 TG   = PDG * FG;
				const uint32 TB   = PDB * FRB;

				for (uint32 i = 0; i < WordCount; i++, g++, h++)
				{
					uint32 v = *g;

					// Source channels of both pixels, shifted down to bit 0 of each half.
					uint32 sr = (v >> 11) & MaskRB;
					uint32 sg = (v >>  5) & MaskG;
					uint32 sb =  v        & MaskRB;

					uint32 R  = ((((sr * IFRB + TR) >> 5) & MaskRB) << 11) |
					            ((((sg * IFG  + TG) >> 6) & MaskG ) <<  5) |
					             (((sb * IFRB + TB) >> 5) & MaskRB)        ;

					*h = CNDREV(R);
				}
			}
		break;

		default:
		{
			// No effect: only the output conversion is applied.
			for (uint32 i = 0; i < WordCount; i++, g++, h++)
				*h = CNDREV(*g);
		}
	}

	return;
}

// Runs the post effect over the whole frame: the second half is handed to
// PushSecondCoreWork while this call handles the first half, then
// FlushWorkItems() is called before returning.
//
// PostEfx     - packed effect request, forwarded unchanged to DoPostEfxInternal.
// FrontBuffer - output buffer, forwarded unchanged (may be null).
void DoPostEfx(uint32 PostEfx, uint16* FrontBuffer)
{
	// Dirty hack - the first and last lines are used as temp storage elsewhere,
	// so blank them before they reach the output.
	memset(&RasterizerFramebuffer[0], 0, RasterizerSizeX * sizeof(uint16));
	memset(&RasterizerFramebuffer[RasterizerSizeX * (RasterizerSizeY - 1)], 0, RasterizerSizeX * sizeof(uint16));

	// Arguments for the queued work item, stored in the space returned by
	// RasterizerGetInSpace() (presumably what the work item receives as Arg - confirm).
	struct FPostEfxJob
	{
		uint16 *FrontBuffer;
		uint32  PostEfx;
	};

	FPostEfxJob *Job = (FPostEfxJob *)RasterizerGetInSpace();

	Job->FrontBuffer = FrontBuffer;
	Job->PostEfx     = PostEfx;

	// Second half of the frame; the offset is in 32-bit words.
	PushSecondCoreWork([](const void* Arg)
	{
		const FPostEfxJob *Queued = (FPostEfxJob *)Arg;
		DoPostEfxInternal(Queued->PostEfx, Queued->FrontBuffer, (RasterizerSizeX * RasterizerSizeY) >> 2);
	});

	// First half on the calling core, then flush the queued work.
	DoPostEfxInternal(PostEfx, FrontBuffer, 0);
	FlushWorkItems();
}

// Copies an RGB565 image into the rasterizer framebuffer, clipped to the screen.
//
// offx, offy  - destination of the image's top-left corner; may be negative or
//               beyond the screen, in which case the image is clipped or skipped.
// ImageData   - ImageSizeX * ImageSizeY pixels, row-major with no padding.
// ColorKey    - when true, pixels equal to RGBColorKey are skipped (transparent).
void BlitImage(sint32 offx, sint32 offy, const uint16* ImageData, const uint32 ImageSizeX, const uint32 ImageSizeY, bool ColorKey)
{
	// All clipping is done in signed arithmetic. Mixing the signed offset with
	// the unsigned image size promoted the sum to unsigned, so the "entirely
	// left of / above the screen" test could only fire when the sum was exactly 0.
	const sint32 SizeX   = sint32(ImageSizeX);
	const sint32 SizeY   = sint32(ImageSizeY);
	const sint32 ScreenW = sint32(RasterizerSizeX);
	const sint32 ScreenH = sint32(RasterizerSizeY);

	if (offx >= ScreenW || offx + SizeX <= 0)
		return;

	if (offy >= ScreenH || offy + SizeY <= 0)
		return;

	sint32 BltStartX = 0;
	sint32 BltSizeX  = SizeX;
	sint32 BltStartY = 0;
	sint32 BltSizeY  = SizeY;

	// Clip against the left edge: skip the hidden source columns.
	if (offx < 0)
	{
		BltStartX = -offx;
		BltSizeX -= BltStartX;
		offx      = 0;
	}

	// Clip against the top edge: skip the hidden source rows.
	if (offy < 0)
	{
		BltStartY = -offy;
		BltSizeY -= BltStartY;
		offy      = 0;
	}

	// Clip against the right / bottom edges (loop invariant).
	const sint32 EndX = MIN(offx + BltSizeX, ScreenW);
	const sint32 EndY = MIN(offy + BltSizeY, ScreenH);

	__restrict uint16 *g = (uint16*)RasterizerFramebuffer;

	g         += offy * ScreenW;
	ImageData += BltStartY * SizeX + BltStartX;

	for (sint32 Y = offy; Y < EndY; Y++, g += ScreenW, ImageData += SizeX)
	{
		for (sint32 X = offx; X < EndX; X++)
		{
			const uint16 Color = ImageData[X - offx];
			if (!ColorKey || Color != RGBColorKey)
				g[X] = Color;
		}
	}

	return;
}

// Alpha-blends an RGB565 image over the rasterizer framebuffer, clipped to the screen.
//
// offx, offy  - destination of the image's top-left corner; may be negative or
//               beyond the screen, in which case the image is clipped or skipped.
// Amount      - blend weight of the image out of 256: 0 draws nothing, values
//               above 0xF8 fall back to the opaque blit.
// ImageData   - ImageSizeX * ImageSizeY pixels, row-major with no padding.
// ColorKey    - when true, pixels equal to RGBColorKey are skipped (transparent).
void BlitImage(sint32 offx, sint32 offy, uint8 Amount, const uint16 *ImageData, const uint32 ImageSizeX, const uint32 ImageSizeY, bool ColorKey)
{
	if (Amount == 0)
		return;

	// Nearly opaque: the plain copy gives the same picture for less work.
	if (Amount > 0xF8)
	{
		BlitImage(offx, offy, ImageData, ImageSizeX, ImageSizeY, ColorKey);
		return;
	}

	// All clipping is done in signed arithmetic. Mixing the signed offset with
	// the unsigned image size promoted the sum to unsigned, so the "entirely
	// left of / above the screen" test could only fire when the sum was exactly 0.
	const sint32 SizeX   = sint32(ImageSizeX);
	const sint32 SizeY   = sint32(ImageSizeY);
	const sint32 ScreenW = sint32(RasterizerSizeX);
	const sint32 ScreenH = sint32(RasterizerSizeY);

	if (offx >= ScreenW || offx + SizeX <= 0)
		return;

	if (offy >= ScreenH || offy + SizeY <= 0)
		return;

	sint32 BltStartX = 0;
	sint32 BltSizeX  = SizeX;
	sint32 BltStartY = 0;
	sint32 BltSizeY  = SizeY;

	// Clip against the left edge: skip the hidden source columns.
	if (offx < 0)
	{
		BltStartX = -offx;
		BltSizeX -= BltStartX;
		offx      = 0;
	}

	// Clip against the top edge: skip the hidden source rows.
	if (offy < 0)
	{
		BltStartY = -offy;
		BltSizeY -= BltStartY;
		offy      = 0;
	}

	// Clip against the right / bottom edges (loop invariant).
	const sint32 EndX = MIN(offx + BltSizeX, ScreenW);
	const sint32 EndY = MIN(offy + BltSizeY, ScreenH);

	__restrict uint16 *g = (uint16*)RasterizerFramebuffer;

	g         += offy * ScreenW;
	ImageData += BltStartY * SizeX + BltStartX;

	const sint32 al = sint32(Amount);

	for (sint32 Y = offy; Y < EndY; Y++, g += ScreenW, ImageData += SizeX)
	{
		for (sint32 X = offx; X < EndX; X++)
		{
			const uint16 Color = ImageData[X - offx];
			if (!ColorKey || Color != RGBColorKey)
			{
				const uint16 dc = g[X];

				// Image (c*) and destination (d*) channels.
				const sint32 cr =  Color >> 11;
				const sint32 cg = (Color >> 5) & 0x3F;
				const sint32 cb =  Color & 0x1F;

				const sint32 dr =  dc >> 11;
				const sint32 dg = (dc >> 5) & 0x3F;
				const sint32 db =  dc & 0x1F;

				// dst + (src - dst) * Amount / 256 per channel; the result always
				// lies between the two inputs, so it cannot leave its bit field.
				g[X] = uint16(((dr + (((cr - dr) * al) >> 8)) << 11) |
				              ((dg + (((cg - dg) * al) >> 8)) <<  5) |
				               (db + (((cb - db) * al) >> 8)));
			}
		}
	}

	return;
}
