// (c) MX^Add

#include "Renderer/Rasterizers.h"
#include "Renderer/FrameBuffer.h"
#include "Renderer/SoftwareRasterizer.h"
#include "BaseTypes/PicoIntrinsics.h"

#ifdef PI_PICO_TARGET
#include <hardware/interp.h> // Interpolators
#include <pico/time.h>
#undef MIN
#undef MAX
#endif

#define Z_BUFFER_ACCESS_IS_ATOMIC   0     // Set to 1 to select the exclusive-access (ldrexh/strexh) TryUpdateZBuffer variant if Z-buffer conflicts become a real issue
#define RASTERIZER_DITHER_COLORS    1     // Dither colors on vertex-colored triangles
#define RASTERIZER_FAST_PATH_NOCLIP 1     // Skip polygon clipping when none of the three triangle vertices needs clipping
#define RASTERIZER_USE_HYBRIDS      1     // Use boxes for small triangles (as they have lower setup cost), and slopes for bigger ones
#define RASTERIZER_HYBRIDS_MARGIN   50.0f // Triangles with an area below this value use barycentric box rasterization

//
// Debug
//
// Counter that only exists in VISIBILITY_CHECKER builds.
// NOTE(review): presumably incremented through DBGINCP() in the Z-buffer helpers - confirm.
//
#if ((defined VISIBILITY_CHECKER) && VISIBILITY_CHECKER)
uint32 __VisPixelsRendered;
#endif

//
// Temp space for clipped vertices (2 * 24 * MaximumVertexSize)
//
// - TriangleClipingScratchSpace0 is used when work runs with FromSecondCore == false,
//   TriangleClipingScratchSpace1 when FromSecondCore == true, so the two never share a buffer.
// - Each buffer is split by the clipping functions into an input area (offset 0) and an
//   output area (offset TriangleClipingMaxVerts * sizeof(vertex type)) that are swapped per plane.
// - TriangleClipingScratchSpaceMainThread holds the three staged vertices of the next triangle
//   (see RasterizerGetInSpace) before they are copied into a work item or a scratch buffer.
//
static constexpr  uint32 TriangleClipingMaxVerts = 24;
alignas(4) static uint8  TriangleClipingScratchSpace0[TriangleClipingMaxVerts * 2 * MAX(sizeof(FVertexXYXW), sizeof(FVertexXYXWF), MAX(sizeof(FVertexXYXWC), sizeof(FVertexXYXWUV), sizeof(FVertexXYXWUVF)))];
alignas(4) static uint8  TriangleClipingScratchSpace1[TriangleClipingMaxVerts * 2 * MAX(sizeof(FVertexXYXW), sizeof(FVertexXYXWF), MAX(sizeof(FVertexXYXWC), sizeof(FVertexXYXWUV), sizeof(FVertexXYXWUVF)))];
alignas(4) static uint8  TriangleClipingScratchSpaceMainThread[3 * MAX(sizeof(FVertexXYXW), sizeof(FVertexXYXWF), MAX(sizeof(FVertexXYXWC), sizeof(FVertexXYXWUV), sizeof(FVertexXYXWUVF)))];

//
// Prototypes
//

// Clip + rasterize entry points. They read the triangle from the scratch buffer selected by
// FromSecondCore (true --> TriangleClipingScratchSpace1, false --> TriangleClipingScratchSpace0).
static void SecondCoreRasterizeTriangle3DWireframeConstColor(bool FromSecondCore, uint16 Color);
static void SecondCoreRasterizeTriangle3DWireframeVertexColors(bool FromSecondCore);
static void SecondCoreRasterizeTriangle3DConstColor(bool FromSecondCore, uint16 Color);
static void SecondCoreRasterizeTriangle3DVertexFactors(bool FromSecondCore, const uint16 *Lockup);
static void SecondCoreRasterizeTriangle3DVertexColors(bool FromSecondCore, bool AdditiveBlend);
static void SecondCoreRasterizeTriangle3DTexturedUnlit(bool FromSecondCore, const uint16 *Texture);
static void SecondCoreRasterizeTriangle3DTexturedLit(bool FromSecondCore, const uint16 *Texture);

// Screen-space line rasterizers (single color / per-endpoint colors)
static void RasterizeInternalLine(const FVector4Di &a, const FVector4Di &b, uint16 Color);
static void RasterizeInternalLine(const FVector4Di &a, const FColor16 &ac, const FVector4Di &b, const FColor16 &bc);

// Screen-space triangle rasterizers, one per vertex format.
// NOTE(review): __not_in_flash_func is presumably the Pico SDK macro that places the function in RAM - confirm for the host build.
static void __not_in_flash_func(RasterizeTriangleInternalSolidColor)     (const FVertexXYXW    * __restrict a, const FVertexXYXW    * __restrict b, const FVertexXYXW    * __restrict c, uint16 Color);
static void                     RasterizeTriangleInternalLerpColor       (const FVertexXYXWC   * __restrict a, const FVertexXYXWC   * __restrict b, const FVertexXYXWC   * __restrict c);
static void __not_in_flash_func(RasterizeTriangleInternalLerpColorAdd)   (const FVertexXYXWC   * __restrict a, const FVertexXYXWC   * __restrict b, const FVertexXYXWC   * __restrict c);
static void __not_in_flash_func(RasterizeTriangleInternalLerpColorFactor)(const FVertexXYXWF   * __restrict a, const FVertexXYXWF   * __restrict b, const FVertexXYXWF   * __restrict c, const uint16 * __restrict Lockup);
static void                     RasterizeTriangleInternalTextured        (const FVertexXYXWUV  * __restrict a, const FVertexXYXWUV  * __restrict b, const FVertexXYXWUV  * __restrict c, const uint16 * __restrict Texture);
static void __not_in_flash_func(RasterizeTriangleInternalTexturedLit)    (const FVertexXYXWUVF * __restrict a, const FVertexXYXWUVF * __restrict b, const FVertexXYXWUVF * __restrict c, const uint16 * __restrict Texture);

//
// Debug
//
// Counter defined in another translation unit; only declared in DEMO_EDITOR builds.
// NOTE(review): presumably the number of rasterized triangles - confirm against its definition.
//
#ifdef DEMO_EDITOR
extern uint32 DebugTrissRass;
#endif

//
// Multithreading support
//
// Pico target : the worker runs on core 1 with its own stack (Core1Stack, Core1StackSize 32-bit words).
//               SecondCoreFunctionStub is an optional user callback invoked from the worker loop.
// Host target : the worker runs on a Win32 thread started through SecondCoreFunctionProxy.
//
#ifdef PI_PICO_TARGET
#include "pico/multicore.h"
// Drop MIN/MAX macros possibly defined by the SDK headers.
// NOTE(review): presumably so that the project's own MAX is the one used in this file - confirm.
#undef MIN
#undef MAX
static constexpr  uint32 Core1StackSize = 1024;
alignas(4) static uint32 Core1Stack[Core1StackSize];
static void (*SecondCoreFunctionStub)(void);
#else
#define WIN32_LEAN_AND_MEAN
#define NOGDI
#define NOMINMAX
#include <windows.h>
static void SecondCoreFunction();

// Win32 thread entry point: adapts the CreateThread signature to SecondCoreFunction().
// SecondCoreFunction() loops forever, so the return statement is never reached in practice.
static DWORD WINAPI SecondCoreFunctionProxy(LPVOID) 
{
    SecondCoreFunction();
    return 0;
}
#endif

//
// One queued unit of work for DoWorkStepOnSlots().
//
struct FWorkItem
{
    // Selects how DoWorkStepOnSlots() interprets the item
    enum class EWorkType : uint8
    {
        Color,      // FVertexXYXW triangle, constant color in Arg (Flg == 1 --> wireframe)
        Colors,     // FVertexXYXWC triangle, per-vertex colors (Flg == 1 --> wireframe)
        ColorsAdd,  // FVertexXYXWC triangle, per-vertex colors, additive blend
        Factors,    // FVertexXYXWF triangle, Ptr is the lookup table
        UVs,        // FVertexXYXWUV triangle, Ptr is the texture
        UVFs,       // FVertexXYXWUVF triangle, Ptr is the texture
        Function    // Ptr is a void(*)(const void *) callback, called with Data as argument
    };

    // Payload: three vertices of the largest vertex format (or opaque callback data for Function)
    uint8         Data[3 * MAX(sizeof(FVertexXYXW), sizeof(FVertexXYXWF), MAX(sizeof(FVertexXYXWC), sizeof(FVertexXYXWUV), sizeof(FVertexXYXWUVF)))];
    const uint16 *Ptr;  // Lookup table / texture / callback pointer, depending on Type
    uint16        Arg;  // Constant color for EWorkType::Color, 0 otherwise
    uint8         Flg;  // 1 --> wireframe variant (Color / Colors only), 0 otherwise
    EWorkType     Type;

    // Intentionally empty: members are not initialized here, they are filled in before a slot is published
    FWorkItem() {}
};

//
// Work queue: WorkSlotItems[i] is guarded by WorkSlotFlags[i], which cycles through
//   0 = Empty --> 1 = Allocated (GetFreeSlot) --> 2 = Ready (item published)
//             --> 3 = InProgress (claimed by DoWorkStepOnSlots) --> 0 = Empty
//
static constexpr  uint32    NumerOfWorkSlots = 32; // Must be Pow2 ! (GetFreeSlot wraps the index with a bit mask)
alignas(4) static uint32    WorkSlotFlags[NumerOfWorkSlots];
alignas(4) static FWorkItem WorkSlotItems[NumerOfWorkSlots];

//
// Do work step, can be called from ANY core, will return true, if any work has been done
//
static bool DoWorkStepOnSlots(bool FromSecondCore) 
{
    bool Result = false;

    for (uint32 i = 0; i < NumerOfWorkSlots; i++)
    {
        if (__sync_bool_compare_and_swap(&WorkSlotFlags[i], 2, 3)) // Ready --> InProgress
        {
            const FWorkItem &Item = WorkSlotItems[i];

            switch (Item.Type)
            {
                case FWorkItem::EWorkType::Color:
                    memcpy(FromSecondCore ? TriangleClipingScratchSpace1 : TriangleClipingScratchSpace0, Item.Data, sizeof(FVertexXYXW)*3);
                    if (Item.Flg == 1)
                        SecondCoreRasterizeTriangle3DWireframeConstColor(FromSecondCore, Item.Arg);
                    else
                        SecondCoreRasterizeTriangle3DConstColor(FromSecondCore, Item.Arg);
                break;

                case FWorkItem::EWorkType::Colors:
                    memcpy(FromSecondCore ? TriangleClipingScratchSpace1 : TriangleClipingScratchSpace0, Item.Data, sizeof(FVertexXYXWC)*3);
                    if (Item.Flg == 1)
                        SecondCoreRasterizeTriangle3DWireframeVertexColors(FromSecondCore);
                    else
                        SecondCoreRasterizeTriangle3DVertexColors(FromSecondCore, false);
                break;

                case FWorkItem::EWorkType::ColorsAdd:
                    memcpy(FromSecondCore ? TriangleClipingScratchSpace1 : TriangleClipingScratchSpace0, Item.Data, sizeof(FVertexXYXWC)*3);
                    SecondCoreRasterizeTriangle3DVertexColors(FromSecondCore, true);
                break;

                case FWorkItem::EWorkType::Factors:
                    memcpy(FromSecondCore ? TriangleClipingScratchSpace1 : TriangleClipingScratchSpace0, Item.Data, sizeof(FVertexXYXWF)*3);
                    SecondCoreRasterizeTriangle3DVertexFactors(FromSecondCore, Item.Ptr);
                break;

                case FWorkItem::EWorkType::UVs:
                    memcpy(FromSecondCore ? TriangleClipingScratchSpace1 : TriangleClipingScratchSpace0, Item.Data, sizeof(FVertexXYXWUV)*3);
                    SecondCoreRasterizeTriangle3DTexturedUnlit(FromSecondCore, Item.Ptr);
                break;

                case FWorkItem::EWorkType::UVFs:
                    memcpy(FromSecondCore ? TriangleClipingScratchSpace1 : TriangleClipingScratchSpace0, Item.Data, sizeof(FVertexXYXWUVF)*3);
                    SecondCoreRasterizeTriangle3DTexturedLit(FromSecondCore, Item.Ptr);
                break;

                case FWorkItem::EWorkType::Function:
                    {
                        void (*Fnc)(const void *Arg) = (void (*)(const void *))Item.Ptr;
                        Fnc(Item.Data);
                    }
                break;
            }

            #ifdef PI_PICO_TARGET
            __atomic_store_n(&WorkSlotFlags[i], 0, __ATOMIC_RELAXED);
            #else
            __sync_bool_compare_and_swap(&WorkSlotFlags[i], 3, 0); // InProgress --> Empty
            #endif
            Result = true;
        }
    }

    return Result;
}

//
// Worker loop of the second core / worker thread, never returns.
// Keeps draining the work slots; on the Pico target it also calls the optional
// user stub (SecondCoreFunctionStub) after every pass over the slots.
//
static void SecondCoreFunction()
{
    for (;;)
    {
        DoWorkStepOnSlots(true);

        #ifdef PI_PICO_TARGET
        if (SecondCoreFunctionStub)
        {
            SecondCoreFunctionStub();
        }
        #endif
    }
}

void RasterizerSpawnSecondCoreFunction(void (*SecondCoreFunctionCall)(void))
{
    static bool IsSecondCoreSpawned = false;

    if (!IsSecondCoreSpawned)
    {
        IsSecondCoreSpawned = true;
        #ifdef PI_PICO_TARGET
        SecondCoreFunctionStub = SecondCoreFunctionCall;
        multicore_reset_core1(); // ensure we're in the reset state
        multicore_launch_core1_with_stack(SecondCoreFunction, (uint32_t *)Core1Stack, Core1StackSize * sizeof(uint32));
        #else
        ::CreateThread(nullptr, 0, SecondCoreFunctionProxy, nullptr, 0, nullptr);   
        #endif
    }

    return;
}

//
// Block until every work slot is empty.
//
// The caller first processes all Ready slots itself (as FromSecondCore == false), then spins
// until the remaining slots have been handed back as Empty.
//
void FlushWorkItems()
{
    //
    // Flush all works (Do it on both cores if needed)
    //
    DoWorkStepOnSlots(false);

    for (;;)
    {
        bool InProgress = false;

        for (uint32 i = 0; i < NumerOfWorkSlots; i++)
        {
            // Acquire ordering: once a slot is seen empty, the writes made while that item
            // was processed must be visible to the code that runs after the flush.
            #ifdef PI_PICO_TARGET
            if (__atomic_load_n(&WorkSlotFlags[i], __ATOMIC_ACQUIRE))   // Check if empty
            #else
            if (!__sync_bool_compare_and_swap(&WorkSlotFlags[i], 0, 0)) // Check if empty
            #endif
            {
                InProgress = true;
                break;
            }
        }

        if (!InProgress)
            break;
    }

    return;
}

//
// Z-buffer helpers. A smaller stored value wins: the test passes only if z is below the stored value
// (the host VISIBILITY_CHECKER build also lets equal values pass).
//   TryUpdateZBuffer - test and, on success, store z; returns true if the pixel passed
//   TryTestZBuffer   - test only, the buffer is left untouched
//
#ifdef PI_PICO_TARGET
#if Z_BUFFER_ACCESS_IS_ATOMIC
// Exclusive-access variant: retries the load/compare/store sequence until the store succeeds
__attribute__((always_inline)) static inline bool TryUpdateZBuffer(uint16 *ptr, uint16 z) 
{ 
    do 
    {
        uint16 old = __ldrexh(ptr); // load exclusive halfword
        if (old <= z)               
        {                           
            __clrex();              // cancel exclusive access
            return false;           
        }                           

    } while (__strexh(z, ptr));     // try to store; retry if failed

    return true;
}
#else
// Plain variant: the read and the write are two separate, unsynchronized accesses
__attribute__((always_inline)) static inline bool TryUpdateZBuffer(uint16* ptr, uint16 z)
{
    if (*ptr <= z)
        return false;
    *ptr = z;
    return true;
}
#endif
// Test only (Pico target)
__attribute__((always_inline)) static inline bool TryTestZBuffer(const uint16* ptr, uint16 z)
{
    if (*ptr <= z)
        return false;
    return true;
}
#else
// Host variant: test + store, calls DBGINCP() for every pixel that passes
static __forceinline bool TryUpdateZBuffer(uint16* ptr, uint16 z)
{
    #if ((defined VISIBILITY_CHECKER) && VISIBILITY_CHECKER)
    // Visibility checker: equal depth also passes
    if (*ptr < z)
        return false;
    #else
    if (*ptr <= z)
        return false;
    #endif
    DBGINCP();
    *ptr = z;
    return true;
}

// Host variant: test only, calls DBGINCP() for every pixel that passes
static __forceinline bool TryTestZBuffer(uint16* ptr, uint16 z)
{
    #if ((defined VISIBILITY_CHECKER) && VISIBILITY_CHECKER)
    // Visibility checker: equal depth also passes
    if (*ptr < z)
        return false;
    #else
    if (*ptr <= z)
        return false;
    #endif
    DBGINCP();
    return true;
}
#endif

//
// Reserve an empty work slot (Empty --> Allocated).
// Probing starts at a rotating offset and visits every slot at most once.
// Returns the slot index, or 0xFFFFFFFF when no slot is empty.
//
#ifdef PI_PICO_TARGET
__attribute__((always_inline)) static inline uint32 GetFreeSlot()
#else
static __forceinline uint32 GetFreeSlot()
#endif
{
    static uint32 NextFreeSlot = 0;

    uint32 Probe = 0;

    while (Probe < NumerOfWorkSlots)
    {
        // NumerOfWorkSlots is a power of two, so the mask wraps the index
        const uint32 Candidate = (Probe + NextFreeSlot) & (NumerOfWorkSlots - 1);

        if (__sync_bool_compare_and_swap(&WorkSlotFlags[Candidate], 0, 1)) // Empty --> Allocated
        {
            NextFreeSlot++;
            return Candidate;
        }

        Probe++;
    }

    return 0xFFFFFFFF;
}

//
// Bitmasks for clipping planes
// (the condition in each comment is the "outside" test used by ComputeClipCode)
//
enum EClipPlane : uint8 
{
    CLIP_LEFT   = 1 << 0, // x < -w
    CLIP_RIGHT  = 1 << 1, // x > +w
    CLIP_BOTTOM = 1 << 2, // y < -w
    CLIP_TOP    = 1 << 3, // y > +w
    CLIP_NEAR   = 1 << 4, // z <  0 (the code tests against 0, not -w)
    CLIP_FAR    = 1 << 5  // z > +w
};

//
// Build the EClipPlane bitmask of all planes the homogeneous point lies outside of.
// A result of 0 means the point is inside (or on) every plane.
// The near plane is tested as z < 0 (not z < -w).
//
#ifdef PI_PICO_TARGET
__attribute__((always_inline))
#endif
static inline uint8 ComputeClipCode(const FVector4D &p) 
{
    const bool OutLeft   = (p.x < -p.w);
    const bool OutRight  = (p.x >  p.w);
    const bool OutBottom = (p.y < -p.w);
    const bool OutTop    = (p.y >  p.w);
    const bool OutNear   = (p.z <    0);
    const bool OutFar    = (p.z >  p.w);

    uint8 Code = 0;

    Code |= OutLeft   ? CLIP_LEFT   : 0;
    Code |= OutRight  ? CLIP_RIGHT  : 0;
    Code |= OutBottom ? CLIP_BOTTOM : 0;
    Code |= OutTop    ? CLIP_TOP    : 0;
    Code |= OutNear   ? CLIP_NEAR   : 0;
    Code |= OutFar    ? CLIP_FAR    : 0;

    return Code;
}

//
// Return true if the homogeneous point lies outside at least one clip plane.
// Same tests as ComputeClipCode, but stops at the first plane that rejects the point.
//
#ifdef PI_PICO_TARGET
__attribute__((always_inline)) 
#endif
static inline bool ComputeAnyClip(const FVector4D &p) 
{
    return (p.x < -p.w)
        || (p.x >  p.w)
        || (p.y < -p.w)
        || (p.y >  p.w)
        || (p.z <    0)  // Near plane is z < 0, not z < -w
        || (p.z >  p.w);
}

//
// Return true if the point is inside or exactly on the given single plane.
// Any value that is not exactly one EClipPlane bit yields false.
//
#ifdef PI_PICO_TARGET
__attribute__((always_inline))
#endif
static inline bool ComputeIsInside(const FVector4D &v, EClipPlane plane) 
{
    if (plane == CLIP_LEFT)   return v.x >= -v.w;
    if (plane == CLIP_RIGHT)  return v.x <=  v.w;
    if (plane == CLIP_BOTTOM) return v.y >= -v.w;
    if (plane == CLIP_TOP)    return v.y <=  v.w;
    if (plane == CLIP_NEAR)   return v.z >=    0; // Near plane sits at z == 0
    if (plane == CLIP_FAR)    return v.z <=  v.w;

    return false;
}

//
// Compute intersection factor
//
// Returns the interpolation factor t along the segment p0 --> p1 at which it crosses the given
// plane: t = d0 / (d0 - d1), where d0 / d1 are the plane distances of p0 / p1 (positive = inside).
//
// p0, p1 - segment end points in homogeneous clip space (not modified)
// plane  - a single EClipPlane bit
//
// Returns 0 when the two distances are (almost) equal, and also when 'plane' is not a known
// single plane bit (the distances would otherwise be read uninitialized).
//
#ifdef PI_PICO_TARGET
__attribute__((always_inline))
#endif
static inline Scalar IntersectionFactor(const FVector4D &p0, const FVector4D &p1, uint8 plane) 
{
    Scalar dp0, dp1;

    switch (plane) 
    {
        case CLIP_LEFT:   dp0 = p0.x + p0.w; dp1 = p1.x + p1.w; break;
        case CLIP_RIGHT:  dp0 = p0.w - p0.x; dp1 = p1.w - p1.x; break;
        case CLIP_BOTTOM: dp0 = p0.y + p0.w; dp1 = p1.y + p1.w; break;
        case CLIP_TOP:    dp0 = p0.w - p0.y; dp1 = p1.w - p1.y; break;
        case CLIP_NEAR:   dp0 = p0.z;        dp1 = p1.z;        break; // Near plane is z == 0, not z == -w
        case CLIP_FAR:    dp0 = p0.w - p0.z; dp1 = p1.w - p1.z; break;
        default:          return 0;                                    // Unknown plane, no distances available
    }

    Scalar denom = dp0 - dp1;

    // Segment (almost) parallel to the plane
    if (FScalar::Abs(denom) < FixedEpsilon) 
        return 0;

    return dp0 / denom;
}

//
// Intersect line with a single plane
//
// Moves the end point that lies outside the plane onto the plane: p0 is replaced when its
// plane distance is negative, otherwise p1 is replaced.
//
// p0, p1 - segment end points in homogeneous clip space (one of them is modified)
// plane  - a single EClipPlane bit
//
// Returns false when the segment is (almost) parallel to the plane, and also when 'plane' is
// not a known single plane bit (the distances would otherwise be read uninitialized).
//
#ifdef PI_PICO_TARGET
__attribute__((always_inline)) 
#endif
static inline bool IntersectPlane(FVector4D &p0, FVector4D &p1, uint8 plane) 
{
    Scalar dp0, dp1;

    switch (plane) 
    {
        case CLIP_LEFT:   dp0 = p0.x + p0.w; dp1 = p1.x + p1.w; break;
        case CLIP_RIGHT:  dp0 = p0.w - p0.x; dp1 = p1.w - p1.x; break;
        case CLIP_BOTTOM: dp0 = p0.y + p0.w; dp1 = p1.y + p1.w; break;
        case CLIP_TOP:    dp0 = p0.w - p0.y; dp1 = p1.w - p1.y; break;
        case CLIP_NEAR:   dp0 = p0.z;        dp1 = p1.z;        break; // Near plane is z == 0, not z == -w
        case CLIP_FAR:    dp0 = p0.w - p0.z; dp1 = p1.w - p1.z; break;
        default:          return false;                                // Unknown plane, nothing to intersect with
    }

    Scalar denom = dp0 - dp1;

    if (FScalar::Abs(denom) < FixedEpsilon) 
        return false; // Parallel to plane

    const Scalar t = dp0 / denom;

    if (dp0 < 0) 
        p0.Lerp(p0, p1, t); // p0 is outside, pull it onto the plane
    else         
        p1.Lerp(p0, p1, t); // otherwise p1 is the point to move

    return true;
}

//
// Clip the segment p0 --> p1 against the six clip planes in homogeneous space.
// The end points are modified in place. Returns false if nothing of the segment remains
// (both ends outside the same plane, or a plane intersection could not be computed).
//
static bool ClipLineToFrustum(FVector4D& p0, FVector4D& p1) 
{
    for (uint8 PlaneIndex = 0; PlaneIndex < 6; ++PlaneIndex) 
    {
        // Codes are recomputed every pass, the previous pass may have moved an end point
        const uint8 CodeA = ComputeClipCode(p0);
        const uint8 CodeB = ComputeClipCode(p1);

        if ((CodeA & CodeB) != 0) 
            return false; // Trivially reject

        const uint8 Either = CodeA | CodeB;

        if (Either == 0) 
            return true;  // Trivially accept

        // Only planes that actually cut the segment need an intersection
        if ((Either & (1 << PlaneIndex)) != 0)
        {
            if (!IntersectPlane(p0, p1, 1 << PlaneIndex))
                return false;
        }
    }

    return true;
}

//
// Clip line to frustum, then rasterize it (with ZTest)
//
void RasterizeLine3D(const FVector4D& _a, const FVector4D& _b, uint16 Color, bool AlwaysTop)
{
    FVector4D a = _a;
    FVector4D b = _b;

    if (!ClipLineToFrustum(a, b)) 
        return;

    FVector4Di ia, ib;

    Scalar InvAW = 1.0f / a.w;
    Scalar InvBW = 1.0f / b.w;

	ia.x = sint32(((a.x * InvAW) * HalfScreenX + HalfScreenX));
	ib.x = sint32(((b.x * InvBW) * HalfScreenX + HalfScreenX));
    ia.y = sint32(((a.y * InvAW) * HalfScreenY - HalfScreenY));
	ib.y = sint32(((b.y * InvBW) * HalfScreenY - HalfScreenY));
    ia.z = AlwaysTop ? 0 : sint32((a.z * InvAW) * 65535.0f); // Fractional part from 0 to 0xFFFF
	ib.z = AlwaysTop ? 0 : sint32((b.z * InvBW) * 65535.0f); // Fractional part from 0 to 0xFFFF

    RasterizeInternalLine(ia, ib, Color);
	return;
}

//
// Clip a polygon against ONE clip plane.
//
// Walks the edges previous --> current of the input polygon and writes the part that is inside
// the plane to outVerts. Callers run this once per frustum plane, swapping the buffers.
//
// inVerts  - input polygon, inCount vertices
// outVerts - output buffer, must not overlap inVerts
// plane    - a single EClipPlane bit
//
// T must expose a FVector4D member 'v' and a Lerp(a, b, t) method.
// Returns the number of vertices written to outVerts.
//
template <typename T>
static uint8 ClipPolygonToPlane(const T *inVerts, uint8 inCount, T *outVerts, EClipPlane plane) 
{
    uint8 Written = 0;

    for (uint8 Index = 0; Index < inCount; Index++) 
    {
        // Predecessor of vertex 0 is the last vertex
        const uint8 PrevIndex = (Index == 0) ? uint8(inCount - 1) : uint8(Index - 1);

        T Current  = inVerts[Index];
        T Previous = inVerts[PrevIndex];

        const bool CurrentInside  = ComputeIsInside(Current.v,  plane);
        const bool PreviousInside = ComputeIsInside(Previous.v, plane);

        // The edge crosses the plane: emit the crossing point first
        if (CurrentInside != PreviousInside) 
        {
            Scalar t = IntersectionFactor(Previous.v, Current.v, plane);
            outVerts[Written++].Lerp(Previous, Current, t);
        }

        // Then keep the current vertex if it is inside
        if (CurrentInside) 
        {
            outVerts[Written++] = Current;
        }
    }

    return Written;
}

//
// Helpers
//
// Return the staging buffer (room for three vertices of the largest vertex format) that the
// RasterizeTriangle3D* / PushSecondCoreWork functions read their input from.
void *RasterizerGetInSpace()
{
    return TriangleClipingScratchSpaceMainThread;
}

void PushSecondCoreWork(void (*Fnc)(const void *Arg))
{
    uint32 i = GetFreeSlot();
    if (i != 0xFFFFFFFF)
    {
        FWorkItem &Item = WorkSlotItems[i];

        memcpy(Item.Data, TriangleClipingScratchSpaceMainThread, sizeof(TriangleClipingScratchSpaceMainThread));

        Item.Type    = FWorkItem::EWorkType::Function;
        Item.Arg     = 0;
        Item.Flg     = 0;
        Item.Ptr     = (uint16 *)Fnc;

        #ifdef PI_PICO_TARGET
        __atomic_store_n(&WorkSlotFlags[i], 2, __ATOMIC_RELEASE); // Allocated --> Ready
        #else
        __sync_bool_compare_and_swap(&WorkSlotFlags[i], 1, 2); // Allocated --> Ready
        #endif
        return;
    }

    Fnc(TriangleClipingScratchSpaceMainThread);
    return;
}

// Submit the staged FVertexXYXW triangle for wireframe rendering with a constant color.
// The triangle is queued as a work item when a slot is free; otherwise (or always in
// VISIBILITY_CHECKER builds) it is clipped and rasterized right away using scratch buffer 0.
void RasterizeTriangle3DWireframeConstColor(uint16 Color)
{
    #if (!((defined VISIBILITY_CHECKER) && VISIBILITY_CHECKER))
    const uint32 Slot = GetFreeSlot();
    if (Slot != 0xFFFFFFFF)
    {
        FWorkItem &Work = WorkSlotItems[Slot];

        Work.Type = FWorkItem::EWorkType::Color;
        Work.Flg  = 1; // Wireframe variant
        Work.Arg  = Color;
        Work.Ptr  = nullptr;

        memcpy(Work.Data, TriangleClipingScratchSpaceMainThread, 3 * sizeof(FVertexXYXW));

        // Allocated --> Ready
        #ifdef PI_PICO_TARGET
        __atomic_store_n(&WorkSlotFlags[Slot], 2, __ATOMIC_RELEASE);
        #else
        __sync_bool_compare_and_swap(&WorkSlotFlags[Slot], 1, 2);
        #endif
        return;
    }
    #endif

    // No free slot: process the triangle synchronously
    memcpy(TriangleClipingScratchSpace0, TriangleClipingScratchSpaceMainThread, 3 * sizeof(FVertexXYXW));
    SecondCoreRasterizeTriangle3DWireframeConstColor(false, Color);
}

void RasterizeTriangle3DWireframeVertexColors()
{
    #if (!((defined VISIBILITY_CHECKER) && VISIBILITY_CHECKER))
    uint32 i = GetFreeSlot();
    if (i != 0xFFFFFFFF)
    {
        FWorkItem &Item = WorkSlotItems[i];

        memcpy(Item.Data, TriangleClipingScratchSpaceMainThread, sizeof(FVertexXYXWC)*3);

        Item.Type    = FWorkItem::EWorkType::Colors;
        Item.Arg     = 0;
        Item.Flg     = 1;
        Item.Ptr     = nullptr;

        #ifdef PI_PICO_TARGET
        __atomic_store_n(&WorkSlotFlags[i], 2, __ATOMIC_RELEASE); // Allocated --> Ready
        #else
        __sync_bool_compare_and_swap(&WorkSlotFlags[i], 1, 2); // Allocated --> Ready
        #endif
        return;
    }
    #endif

    // If no free slots, then do locally
    memcpy(TriangleClipingScratchSpace0, TriangleClipingScratchSpaceMainThread, sizeof(FVertexXYXWC)*3);
    SecondCoreRasterizeTriangle3DWireframeVertexColors(false);

    return;
}

// Submit the staged FVertexXYXW triangle for solid rendering with a constant color.
// The triangle is queued as a work item when a slot is free; otherwise (or always in
// VISIBILITY_CHECKER builds) it is clipped and rasterized right away using scratch buffer 0.
void RasterizeTriangle3DConstColor(uint16 Color)
{
    #if (!((defined VISIBILITY_CHECKER) && VISIBILITY_CHECKER))
    const uint32 Slot = GetFreeSlot();
    if (Slot != 0xFFFFFFFF)
    {
        FWorkItem &Work = WorkSlotItems[Slot];

        Work.Type = FWorkItem::EWorkType::Color;
        Work.Flg  = 0; // Solid variant
        Work.Arg  = Color;
        Work.Ptr  = nullptr;

        memcpy(Work.Data, TriangleClipingScratchSpaceMainThread, 3 * sizeof(FVertexXYXW));

        // Allocated --> Ready
        #ifdef PI_PICO_TARGET
        __atomic_store_n(&WorkSlotFlags[Slot], 2, __ATOMIC_RELEASE);
        #else
        __sync_bool_compare_and_swap(&WorkSlotFlags[Slot], 1, 2);
        #endif
        return;
    }
    #endif

    // No free slot: process the triangle synchronously
    memcpy(TriangleClipingScratchSpace0, TriangleClipingScratchSpaceMainThread, 3 * sizeof(FVertexXYXW));
    SecondCoreRasterizeTriangle3DConstColor(false, Color);
}

void RasterizeTriangle3DVertexColors(bool AdditiveBlend)
{
    #if (!((defined VISIBILITY_CHECKER) && VISIBILITY_CHECKER))
    uint32 i = GetFreeSlot();
    if (i != 0xFFFFFFFF)
    {
        FWorkItem &Item = WorkSlotItems[i];

        memcpy(Item.Data, TriangleClipingScratchSpaceMainThread, sizeof(FVertexXYXWC)*3);

        Item.Type    = AdditiveBlend ? FWorkItem::EWorkType::ColorsAdd : FWorkItem::EWorkType::Colors;
        Item.Arg     = 0;
        Item.Flg     = 0;
        Item.Ptr     = nullptr;

        #ifdef PI_PICO_TARGET
        __atomic_store_n(&WorkSlotFlags[i], 2, __ATOMIC_RELEASE); // Allocated --> Ready
        #else
        __sync_bool_compare_and_swap(&WorkSlotFlags[i], 1, 2); // Allocated --> Ready
        #endif
        return;
    }
    #endif

    // If no free slots, then do locally
    memcpy(TriangleClipingScratchSpace0, TriangleClipingScratchSpaceMainThread, sizeof(FVertexXYXWC)*3);
    SecondCoreRasterizeTriangle3DVertexColors(false, AdditiveBlend);

    return;
}

void RasterizeTriangle3DVertexFactors(const uint16 *Lockup)
{
    #if (!((defined VISIBILITY_CHECKER) && VISIBILITY_CHECKER))
    uint32 i = GetFreeSlot();
    if (i != 0xFFFFFFFF)
    {
        FWorkItem &Item = WorkSlotItems[i];

        memcpy(Item.Data, TriangleClipingScratchSpaceMainThread, sizeof(FVertexXYXWF)*3);

        Item.Type    = FWorkItem::EWorkType::Factors;
        Item.Arg     = 0;
        Item.Flg     = 0;
        Item.Ptr     = Lockup;

        #ifdef PI_PICO_TARGET
        __atomic_store_n(&WorkSlotFlags[i], 2, __ATOMIC_RELEASE); // Allocated --> Ready
        #else
        __sync_bool_compare_and_swap(&WorkSlotFlags[i], 1, 2); // Allocated --> Ready
        #endif
        return;
    }
    #endif

    // If no free slots, then do locally
    memcpy(TriangleClipingScratchSpace0, TriangleClipingScratchSpaceMainThread, sizeof(FVertexXYXWF)*3);
    SecondCoreRasterizeTriangle3DVertexFactors(false, Lockup);

    return;
}

void RasterizeTriangle3DTexturedUnlit(const uint16 *Texture)
{
    #if (!((defined VISIBILITY_CHECKER) && VISIBILITY_CHECKER))
    uint32 i = GetFreeSlot();
    if (i != 0xFFFFFFFF)
    {
        FWorkItem &Item = WorkSlotItems[i];

        memcpy(Item.Data, TriangleClipingScratchSpaceMainThread, sizeof(FVertexXYXWUV)*3);

        Item.Type    = FWorkItem::EWorkType::UVs;
        Item.Arg     = 0;
        Item.Flg     = 0;
        Item.Ptr     = Texture;

        #ifdef PI_PICO_TARGET
        __atomic_store_n(&WorkSlotFlags[i], 2, __ATOMIC_RELEASE); // Allocated --> Ready
        #else
        __sync_bool_compare_and_swap(&WorkSlotFlags[i], 1, 2); // Allocated --> Ready
        #endif
        return;
    }
    #endif

    // If no free slots, then do locally
    memcpy(TriangleClipingScratchSpace0, TriangleClipingScratchSpaceMainThread, sizeof(FVertexXYXWUV)*3);
    SecondCoreRasterizeTriangle3DTexturedUnlit(false, Texture);

    return;
}

void RasterizeTriangle3DTexturedLit(const uint16 *Texture)
{
    #if (!((defined VISIBILITY_CHECKER) && VISIBILITY_CHECKER))
    uint32 i = GetFreeSlot();
    if (i != 0xFFFFFFFF)
    {
        FWorkItem &Item = WorkSlotItems[i];

        memcpy(Item.Data, TriangleClipingScratchSpaceMainThread, sizeof(FVertexXYXWUVF)*3);

        Item.Type    = FWorkItem::EWorkType::UVFs;
        Item.Arg     = 0;
        Item.Flg     = 0;
        Item.Ptr     = Texture;

        #ifdef PI_PICO_TARGET
        __atomic_store_n(&WorkSlotFlags[i], 2, __ATOMIC_RELEASE); // Allocated --> Ready
        #else
        __sync_bool_compare_and_swap(&WorkSlotFlags[i], 1, 2); // Allocated --> Ready
        #endif
        return;
    }
    #endif

    // If no free slots, then do locally
    memcpy(TriangleClipingScratchSpace0, TriangleClipingScratchSpaceMainThread, sizeof(FVertexXYXWUVF)*3);
    SecondCoreRasterizeTriangle3DTexturedLit(false, Texture);

    return;
}

//
// Clip wireframe triangle to frustum and rasterize it (with ZTest)
//
// Input: three FVertexXYXW vertices at the start of the scratch buffer selected by FromSecondCore.
// The triangle is clipped against the six planes, projected to integer screen coordinates
// in place and drawn as lines in the given Color.
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
void SecondCoreRasterizeTriangle3DWireframeConstColor(bool FromSecondCore, uint16 Color)
{
    uint8 *ScratchSpace = FromSecondCore ? &TriangleClipingScratchSpace1[0] : &TriangleClipingScratchSpace0[0];

    // Two areas of the scratch buffer, used as clip source / clip destination
    FVertexXYXW *Input  = (FVertexXYXW *)&ScratchSpace[0];
    FVertexXYXW *Output = (FVertexXYXW *)&ScratchSpace[TriangleClipingMaxVerts * sizeof(FVertexXYXW)];
    uint8        Count  = 3;

    // One pass per clip plane (bit p of EClipPlane); after the swap Input holds the latest polygon
    for (uint8 p = 0; p < 6; p++) 
    {
        Count = ClipPolygonToPlane<FVertexXYXW>(Input, Count, Output, (EClipPlane)(1 << p));

        if (Count < 3) 
            return; // Fully clipped

        SWAP(Input, Output);
    }

    // Integer view of the same memory: every vertex is converted in place.
    // NOTE(review): relies on FVector4Di and FVertexXYXW having the same size, so that writing
    // element i does not touch a vertex that has not been read yet - confirm.
    FVector4Di *InputI = (FVector4Di *)&Input[0];
    
    for (uint8 i = 0; i < Count; i++)
    {
        FVector4D &v = Input[i].v;
        Scalar InvW  = 1.0f / v.w;

        // Perspective divide + viewport mapping; the w component is not written
        InputI[i].x = sint32(((v.x * InvW) * HalfScreenX + HalfScreenX));
        InputI[i].y = sint32(((v.y * InvW) * HalfScreenY - HalfScreenY));
        InputI[i].z = sint32(( v.z * InvW) * 65535.0f);
    }

    // Walk the polygon as a fan around vertex 0: each step draws 0 --> i and i --> i+1,
    // the last step also closes the outline. For Count > 3 this includes the fan diagonals.
    for (uint8 i = 1; i < Count - 1; i++) 
    {
       RasterizeInternalLine(InputI[0], InputI[i], Color);
       RasterizeInternalLine(InputI[i], InputI[i+1], Color);
       if (i == Count-2)
           RasterizeInternalLine(InputI[i+1], InputI[0], Color);
    }

	return;
}

//
// Clip wireframe triangle with per-vertex colors to frustum and rasterize it.
//
// Input: three FVertexXYXWC vertices at the start of the scratch buffer selected by FromSecondCore.
// The triangle is clipped against the six planes, every position is projected to integer screen
// coordinates in place and the polygon is drawn as lines using the vertex colors.
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
void SecondCoreRasterizeTriangle3DWireframeVertexColors(bool FromSecondCore)
{
    uint8 *ScratchSpace = FromSecondCore ? &TriangleClipingScratchSpace1[0] : &TriangleClipingScratchSpace0[0];

    // Two areas of the scratch buffer, used as clip source / clip destination
    FVertexXYXWC *Input  = (FVertexXYXWC *)&ScratchSpace[0];
    FVertexXYXWC *Output = (FVertexXYXWC *)&ScratchSpace[TriangleClipingMaxVerts * sizeof(FVertexXYXWC)];
    uint8         Count  = 3;

    // One pass per clip plane (bit p of EClipPlane); after the swap Input holds the latest polygon
    for (uint8 p = 0; p < 6; p++) 
    {
        Count = ClipPolygonToPlane<FVertexXYXWC>(Input, Count, Output, (EClipPlane)(1 << p));

        if (Count < 3) 
            return; // Fully clipped

        SWAP(Input, Output);
    }
    
    for (uint8 i = 0; i < Count; i++)
    {
        // The position of each vertex is overwritten in place with its integer screen coordinates.
        // NOTE(review): assumes FVector4Di fits into the storage of FVector4D - confirm.
        FVector4D   &v     = Input[i].v;
        FVector4Di *InputI = (FVector4Di *)&Input[i].v;

        Scalar InvW  = 1.0f / v.w;

        // Perspective divide + viewport mapping; the w component is not written
        InputI->x = sint32(((v.x * InvW) * HalfScreenX + HalfScreenX));
        InputI->y = sint32(((v.y * InvW) * HalfScreenY - HalfScreenY));
        InputI->z = sint32(( v.z * InvW) * 65535.0f);
    }

    // Walk the polygon as a fan around vertex 0: each step draws 0 --> i and i --> i+1,
    // the last step also closes the outline. For Count > 3 this includes the fan diagonals.
    for (uint8 i = 1; i < Count - 1; i++) 
    {
       RasterizeInternalLine(*((FVector4Di *)&Input[0].v), Input[0].c, *((FVector4Di *)&Input[i].v), Input[i].c);
       RasterizeInternalLine(*((FVector4Di *)&Input[i].v), Input[i].c, *((FVector4Di *)&Input[i+1].v), Input[i+1].c);
       if (i == Count-2)
           RasterizeInternalLine(*((FVector4Di *)&Input[i+1].v), Input[i+1].c, *((FVector4Di *)&Input[0].v), Input[0].c);
    }

	return;
}

//
// Clip solid colored triangle to frustum and rasterize it (with ZTest)
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
void SecondCoreRasterizeTriangle3DConstColor(bool FromSecondCore, uint16 Color)
{
    uint8 *ScratchSpace = FromSecondCore ? &TriangleClipingScratchSpace1[0] : &TriangleClipingScratchSpace0[0];

    FVertexXYXW *Input  = (FVertexXYXW *)&ScratchSpace[0];
    FVertexXYXW *Output = (FVertexXYXW *)&ScratchSpace[TriangleClipingMaxVerts * sizeof(FVertexXYXW)];
    uint8        Count  = 3;

    #if RASTERIZER_FAST_PATH_NOCLIP
    if (!ComputeAnyClip(Input[0].v) && !ComputeAnyClip(Input[1].v) && !ComputeAnyClip(Input[2].v))
    {
        for (uint8 i = 0; i < Count; i++)
        {
            FVector4D &v = Input[i].v;
            Scalar InvW  = 1.0f / v.w;

            v.x = v.x * InvW * HalfScreenX + HalfScreenX;
            v.y = v.y * InvW * HalfScreenY - HalfScreenY;
            v.z*= 65535.0f;
        }

        RasterizeTriangleInternalSolidColor(&Input[0], &Input[1], &Input[2], Color);
        return;
    }
    #endif

    for (uint8 p = 0; p < 6; p++) 
    {
        Count = ClipPolygonToPlane<FVertexXYXW>(Input, Count, Output, (EClipPlane)(1 << p));

        if (Count < 3) 
            return; // Fully clipped

        SWAP(Input, Output);
    }

    for (uint8 i = 0; i < Count; i++)
    {
        FVector4D &v = Input[i].v;
        Scalar InvW  = 1.0f / v.w;

        v.x = v.x * InvW * HalfScreenX + HalfScreenX;
        v.y = v.y * InvW * HalfScreenY - HalfScreenY;
        v.z*= 65535.0f;
    }

    for (uint8 i = 1; i < Count - 1; i++) 
    {
        RasterizeTriangleInternalSolidColor(&Input[0], &Input[i], &Input[i+1], Color);
    }

	return;
}

//
// Clip solid factor colored triangle to frustum and rasterize it (with ZTest)
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
void SecondCoreRasterizeTriangle3DVertexFactors(bool FromSecondCore, const uint16* Lockup)
{
    uint8 *ScratchSpace = FromSecondCore ? &TriangleClipingScratchSpace1[0] : &TriangleClipingScratchSpace0[0];

    FVertexXYXWF *Input  = (FVertexXYXWF *)&ScratchSpace[0];
    FVertexXYXWF *Output = (FVertexXYXWF *)&ScratchSpace[TriangleClipingMaxVerts * sizeof(FVertexXYXWF)];
    uint8         Count  = 3;

    #if RASTERIZER_FAST_PATH_NOCLIP
    if (!ComputeAnyClip(Input[0].v) && !ComputeAnyClip(Input[1].v) && !ComputeAnyClip(Input[2].v))
    {
        for (uint8 i = 0; i < Count; i++)
        {
            FVector4D &v = Input[i].v;
            Scalar InvW  = 1.0f / v.w;

            v.x = v.x * InvW * HalfScreenX + HalfScreenX;
            v.y = v.y * InvW * HalfScreenY - HalfScreenY;
            v.z*= 65535.0f;
        }

        RasterizeTriangleInternalLerpColorFactor(&Input[0], &Input[1], &Input[2], Lockup);
        return;
    }
    #endif

    for (uint8 p = 0; p < 6; p++) 
    {
        Count = ClipPolygonToPlane<FVertexXYXWF>(Input, Count, Output, (EClipPlane)(1 << p));

        if (Count < 3) 
            return; // Fully clipped

        SWAP(Input, Output);
    }

    for (uint8 i = 0; i < Count; i++)
    {
        FVector4D &v = Input[i].v;
        Scalar InvW  = 1.0f / v.w;

        v.x = v.x * InvW * HalfScreenX + HalfScreenX;
        v.y = v.y * InvW * HalfScreenY - HalfScreenY;
        v.z*= 65535.0f;
    }

    for (uint8 i = 1; i < Count - 1; i++) 
    {
        RasterizeTriangleInternalLerpColorFactor(&Input[0], &Input[i], &Input[i+1], Lockup);
    }

    return;
}

//
// Clip solid goroud colored triangle to frustum and rasterize it (with ZTest)
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
void SecondCoreRasterizeTriangle3DVertexColors(bool FromSecondCore, bool AdditiveBlend)
{
    uint8 *ScratchSpace = FromSecondCore ? &TriangleClipingScratchSpace1[0] : &TriangleClipingScratchSpace0[0];

    FVertexXYXWC *Input  = (FVertexXYXWC *)&ScratchSpace[0];
    FVertexXYXWC *Output = (FVertexXYXWC *)&ScratchSpace[TriangleClipingMaxVerts * sizeof(FVertexXYXWC)];
    uint8         Count  = 3;

    #if RASTERIZER_FAST_PATH_NOCLIP
    if (!ComputeAnyClip(Input[0].v) && !ComputeAnyClip(Input[1].v) && !ComputeAnyClip(Input[2].v))
    {
        for (uint8 i = 0; i < Count; i++)
        {
            FVector4D &v = Input[i].v;
            Scalar InvW  = 1.0f / v.w;

            v.x = v.x * InvW * HalfScreenX + HalfScreenX;
            v.y = v.y * InvW * HalfScreenY - HalfScreenY;
            v.z*= 65535.0f;
        }


        if (AdditiveBlend)
            RasterizeTriangleInternalLerpColorAdd(&Input[0], &Input[1], &Input[2]);
        else
            RasterizeTriangleInternalLerpColor(&Input[0], &Input[1], &Input[2]);
        return;
    }
    #endif

    for (uint8 p = 0; p < 6; p++) 
    {
        Count = ClipPolygonToPlane<FVertexXYXWC>(Input, Count, Output, (EClipPlane)(1 << p));

        if (Count < 3) 
            return; // Fully clipped

        SWAP(Input, Output);
    }

    for (uint8 i = 0; i < Count; i++)
    {
        FVector4D &v = Input[i].v;
        Scalar InvW  = 1.0f / v.w;

        v.x = v.x * InvW * HalfScreenX + HalfScreenX;
        v.y = v.y * InvW * HalfScreenY - HalfScreenY;
        v.z*= 65535.0f;
    }

    for (uint8 i = 1; i < Count - 1; i++)
    {
        if (AdditiveBlend)
            RasterizeTriangleInternalLerpColorAdd(&Input[0], &Input[i], &Input[i+1]);
        else
            RasterizeTriangleInternalLerpColor(&Input[0], &Input[i], &Input[i+1]);
    }

	return;
}

//
// Clip solid textured triangle to frustum and rasterize it (with ZTest)
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
void SecondCoreRasterizeTriangle3DTexturedUnlit(bool FromSecondCore, const uint16 *Texture)
{
    uint8 *ScratchSpace = FromSecondCore ? &TriangleClipingScratchSpace1[0] : &TriangleClipingScratchSpace0[0];

    FVertexXYXWUV *Input  = (FVertexXYXWUV *)&ScratchSpace[0];
    FVertexXYXWUV *Output = (FVertexXYXWUV *)&ScratchSpace[TriangleClipingMaxVerts * sizeof(FVertexXYXWUV)];
    uint8          Count  = 3;

    #if RASTERIZER_FAST_PATH_NOCLIP
    if (!ComputeAnyClip(Input[0].v) && !ComputeAnyClip(Input[1].v) && !ComputeAnyClip(Input[2].v))
    {
        for (uint8 i = 0; i < Count; i++)
        {
            FVector4D &v = Input[i].v;
            Scalar InvW  = 1.0f / v.w;

            v.x = v.x * InvW * HalfScreenX + HalfScreenX;
            v.y = v.y * InvW * HalfScreenY - HalfScreenY;
            v.z*= 65535.0f;
        }

        RasterizeTriangleInternalTextured(&Input[0], &Input[1], &Input[2], Texture);
        return;
    }
    #endif

    for (uint8 p = 0; p < 6; p++) 
    {
        Count = ClipPolygonToPlane<FVertexXYXWUV>(Input, Count, Output, (EClipPlane)(1 << p));

        if (Count < 3) 
            return; // Fully clipped

        SWAP(Input, Output);
    }

    for (uint8 i = 0; i < Count; i++)
    {
        FVector4D &v = Input[i].v;
        Scalar InvW  = 1.0f / v.w;

        v.x = v.x * InvW * HalfScreenX + HalfScreenX;
        v.y = v.y * InvW * HalfScreenY - HalfScreenY;
        v.z*= 65535.0f;
    }

    for (uint8 i = 1; i < Count - 1; i++)
    {
        RasterizeTriangleInternalTextured(&Input[0], &Input[i], &Input[i+1], Texture);
    }

    return;
}

//
// Clip solid textured, shaded triangle to frustum and rasterize it (with ZTest)
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
void SecondCoreRasterizeTriangle3DTexturedLit(bool FromSecondCore, const uint16 *Texture)
{
    uint8 *ScratchSpace = FromSecondCore ? &TriangleClipingScratchSpace1[0] : &TriangleClipingScratchSpace0[0];

    FVertexXYXWUVF *Input  = (FVertexXYXWUVF *)&ScratchSpace[0];
    FVertexXYXWUVF *Output = (FVertexXYXWUVF *)&ScratchSpace[TriangleClipingMaxVerts * sizeof(FVertexXYXWUVF)];
    uint8           Count  = 3;

    #if RASTERIZER_FAST_PATH_NOCLIP
    if (!ComputeAnyClip(Input[0].v) && !ComputeAnyClip(Input[1].v) && !ComputeAnyClip(Input[2].v))
    {
        for (uint8 i = 0; i < Count; i++)
        {
            FVector4D &v = Input[i].v;
            Scalar InvW  = 1.0f / v.w;

            v.x = v.x * InvW * HalfScreenX + HalfScreenX;
            v.y = v.y * InvW * HalfScreenY - HalfScreenY;
            v.z*= 65535.0f;
        }

        RasterizeTriangleInternalTexturedLit(&Input[0], &Input[1], &Input[2], Texture);
        return;
    }
    #endif

    for (uint8 p = 0; p < 6; p++) 
    {
        Count = ClipPolygonToPlane<FVertexXYXWUVF>(Input, Count, Output, (EClipPlane)(1 << p));

        if (Count < 3) 
            return; // Fully clipped

        SWAP(Input, Output);
    }

    for (uint8 i = 0; i < Count; i++)
    {
        FVector4D &v = Input[i].v;
        Scalar InvW  = 1.0f / v.w;

        v.x = v.x * InvW * HalfScreenX + HalfScreenX;
        v.y = v.y * InvW * HalfScreenY - HalfScreenY;
        v.z*= 65535.0f;
    }

    for (uint8 i = 1; i < Count - 1; i++)
    {
        RasterizeTriangleInternalTexturedLit(&Input[0], &Input[i], &Input[i+1], Texture);
    }

    return;
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//                                                                                                                   //
//                                            -= LOW LEVEL RASTERIZERS =-                                            //
//                                                                                                                   //
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

//
// Rasterize, internal, coords are already preprocessed (z/w on both ends, this will couse issues with triangles !)
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
static void RasterizeInternalLine(const FVector4Di &a, const FVector4Di &b, uint16 Color)
{
    sint32 x0 = a.x;
    sint32 x1 = b.x;
    sint32 y0 = a.y;
    sint32 y1 = b.y;
    sint32 z0 = a.z;
    sint32 z1 = b.z;

    sint32 dx =  ABS(x1 - x0);
    sint32 dy = -ABS(y1 - y0);
    sint32 sx = x0 < x1 ? 1 : -1;
    sint32 sy = y0 < y1 ? 1 : -1;
    sint32 sm = MAX(dx, dy);
    sint32 sz = sm == 0 ? 0 : (z1 - z0) / sm;
    sint32 er = dx + dy;

    while (1) 
    {
        uint32 off = x0 + y0 * RasterizerSizeX;

        if (TryUpdateZBuffer(&RasterizerDepthbuffer[off], z0))
            RasterizerFramebuffer[off] = Color;

        if (x0 == x1 && y0 == y1) 
            break;

        sint32 e2 = 2 * er;

        if (e2 >= dy) 
        {
            er += dy;
            x0 += sx;
        }

        if (e2 <= dx) 
        {
            er += dx;
            y0 += sy;
        }

        z0 += sz;
    }

    return;
}

//
// Rasterize, internal, coords are already preprocessed (z/w on both ends, this will couse issues with triangles !)
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
static void RasterizeInternalLine(const FVector4Di& a, const FColor16& ac, const FVector4Di& b, const FColor16& bc)
{
    sint32 x0 = a.x;
    sint32 x1 = b.x;
    sint32 y0 = a.y;
    sint32 y1 = b.y;
    sint32 z0 = a.z;
    sint32 z1 = b.z;

    sint32 dx =  ABS(x1 - x0);
    sint32 dy = -ABS(y1 - y0);
    sint32 sx = x0 < x1 ? 1 : -1;
    sint32 sy = y0 < y1 ? 1 : -1;
    sint32 sm = MAX(dx, dy);
    sint32 sz = sm == 0 ? 0 : (z1 - z0) / sm;
    sint32 er = dx + dy;

    Scalar  cr = Scalar(sint32(ac.r));
    Scalar  cg = Scalar(sint32(ac.g));
    Scalar  cb = Scalar(sint32(ac.b));

    Scalar fsm = 1.0f / Scalar(sm);
    Scalar  dr = sm == 0 ? 0 : Scalar(sint32(bc.r) - sint32(ac.r)) * fsm;
    Scalar  dg = sm == 0 ? 0 : Scalar(sint32(bc.g) - sint32(ac.g)) * fsm;
    Scalar  db = sm == 0 ? 0 : Scalar(sint32(bc.b) - sint32(ac.b)) * fsm;

    while (1) 
    {
        uint32 off = x0 + y0 * RasterizerSizeX;

        if (TryUpdateZBuffer(&RasterizerDepthbuffer[off], z0))
            RasterizerFramebuffer[off] = FColor16(cr, cg, cb).toColorNoClip();

        if (x0 == x1 && y0 == y1) 
            break;

        sint32 e2 = 2 * er;

        if (e2 >= dy) 
        {
            er += dy;
            x0 += sx;
        }

        if (e2 <= dx) 
        {
            er += dx;
            y0 += sy;
        }

        z0 += sz;
        cr += dr;
        cg += dg;
        cb += db;
    }

    return;
}

//
// Rasterize solid colored triangle (with ZTest, perspective corrected)
//
//  v0, v1, v2 - vertices; x/y are used as screen coordinates, z and w are interpolated
//               with the barycentric weights and divided per pixel to get the depth
//  Color      - base color, pixels where (x ^ y) is odd get FColor16(Color).toBrighter() instead
//
// Triangles with Area <= 0 are rejected (backfacing or degenerated). With RASTERIZER_USE_HYBRIDS
// triangles with Area below RASTERIZER_HYBRIDS_MARGIN are filled by testing every pixel of their
// bounding box; everything else is filled scanline by scanline in two parts (v0..v1 and v1..v2,
// after sorting the vertices by y).
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
static void __not_in_flash_func(RasterizeTriangleInternalSolidColor)(const FVertexXYXW * __restrict v0, const FVertexXYXW * __restrict v1, const FVertexXYXW * __restrict v2, uint16 Color)
{
    Scalar Area = (v2->v.x - v0->v.x) * (v1->v.y - v0->v.y) - (v2->v.y - v0->v.y) * (v1->v.x - v0->v.x);

    if (Area <= 0) // Backface culling (if you want two-sided, return only if Area == 0 (degenerated triangle))
        return;

#ifdef DEMO_EDITOR
    DebugTrissRass++;
#endif

    const uint16 ColorBright = FColor16(Color).toBrighter();

    __restrict uint16 *ColorPtr = RasterizerFramebuffer;
    __restrict uint16 *DepthPtr = RasterizerDepthbuffer;

    #if RASTERIZER_USE_HYBRIDS
    if (Area < RASTERIZER_HYBRIDS_MARGIN)
    {
        //
        // For small triangles minimize setup as much as possible
        //
        const FVector4D &v0v = v0->v;
        const FVector4D &v1v = v1->v;
        const FVector4D &v2v = v2->v;

        // Bounding box of the triangle in whole pixels

        const sint32 min_x = sint32(FScalar::Min(v0v.x, FScalar::Min(v1v.x, v2v.x)));
        const sint32 min_y = sint32(FScalar::Min(v0v.y, FScalar::Min(v1v.y, v2v.y)));
        const sint32 max_x = sint32(FScalar::Max(v0v.x, FScalar::Max(v1v.x, v2v.x)));
        const sint32 max_y = sint32(FScalar::Max(v0v.y, FScalar::Max(v1v.y, v2v.y)));

        if (min_x == max_x && min_y == max_y)
        {
            // Whole triangle falls into one pixel, plot it with the depth of v0 and no coverage test

            const sint32 Offpt = min_x + min_y * RasterizerSizeX;

            if (TryUpdateZBuffer(&DepthPtr[Offpt], sint32(v0v.z / v0v.w)))
                ColorPtr[Offpt] = ((min_x ^ min_y) & 0x01) ? ColorBright : Color;

            return;
        }

        const sint32 Offpt = min_y * RasterizerSizeX;

        DepthPtr += Offpt;
        ColorPtr += Offpt;

        // Edge functions (unnormalized barycentric weights) are split into a part that depends
        // on y only (p*, stepped by dp* per row) and a part that depends on x only (qw*, stepped by dw* per pixel)

        const Scalar dp0 = (v2v.x - v1v.x);
        const Scalar dp1 = (v0v.x - v2v.x);
        const Scalar dp2 = (v1v.x - v0v.x);

        Scalar p_y = Scalar(min_y) + FixedHalf;
        Scalar p_x = Scalar(min_x) + FixedHalf;

        Scalar p0  = (p_y - v1v.y) * dp0;
        Scalar p1  = (p_y - v2v.y) * dp1;
        Scalar p2  = (p_y - v0v.y) * dp2;

        const Scalar dw0  = (v2v.y - v1v.y);
        const Scalar dw1  = (v0v.y - v2v.y);
        const Scalar dw2  = (v1v.y - v0v.y);

        const Scalar qw0  = (p_x - v1v.x) * dw0;
        const Scalar qw1  = (p_x - v2v.x) * dw1;
        const Scalar qw2  = (p_x - v0v.x) * dw2;

        for (sint32 y = min_y; y <= max_y; y++, ColorPtr += RasterizerSizeX, DepthPtr += RasterizerSizeX, p0 += dp0, p1 += dp1, p2 += dp2)
        {
            Scalar w0 = qw0 - p0, w1 = qw1 - p1, w2 = qw2 - p2;

            for (sint32 x = min_x; x <= max_x; x++, w0 += dw0, w1 += dw1, w2 += dw2)
            {
                if (FScalar::Min(w0, FScalar::Min(w1, w2)) >= 0.0f)
                {
                    // Interpolate z and w, then do perspective division every pixel (not a great thing for performance)
                    // (weights are left unnormalized here, their common scale cancels out in the division)

                    sint32 pz = sint32(((w0 * v0v.z + w1 * v1v.z + w2 * v2v.z) / (w0 * v0v.w + w1 * v1v.w + w2 * v2v.w)));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                        ColorPtr[x] = ((x ^ y) & 0x01) ? ColorBright : Color;
                }
            }
        }

        return;
    }
    #endif

    // Sort vertices by y (v0 topmost, v2 bottommost)

    if (v0->v.y > v1->v.y) SWAP(v0, v1);
    if (v0->v.y > v2->v.y) SWAP(v0, v2);
    if (v1->v.y > v2->v.y) SWAP(v1, v2);

    Scalar dyR = v2->v.y - v0->v.y;
    if (dyR < 0.001f)
        return;

    // For each scanline

    const sint32 v0y = sint32(v0->v.y);
    const sint32 v1y = sint32(v1->v.y);
    const sint32 v2y = sint32(v2->v.y);
    const sint32 Off = v0y * RasterizerSizeX;

    ColorPtr += Off;
    DepthPtr += Off;

    // Precompute barycentric setup

    const Scalar denom = 1.0f / ((v1->v.y - v2->v.y) * (v0->v.x - v2->v.x) + (v2->v.x - v1->v.x) * (v0->v.y - v2->v.y)); // NOTE::It's +/-Area, but after sorting, so sign is not preserved...
    const Scalar dw0dx = (v1->v.y - v2->v.y) * denom;
    const Scalar dw1dx = dyR * denom;
    const Scalar dw2dx = -dw0dx - dw1dx;
                 dyR   = 1.0f / dyR;

    // Top part (v0 to v1)

    if (v1y != v0y)
    {
        Scalar dyL = v1->v.y - v0->v.y;
        
        if (dyL > 0.001f)
        {
                         dyL    = 1.0f / dyL;
            const Scalar dxL    = v1->v.x - v0->v.x;
            const Scalar dxR    = v2->v.x - v0->v.x;

            // Row v1y is left for the bottom part, unless there is no bottom part at all

            for (sint32 y = v0y; y <= ((v2y != v1y) ? v1y-1 : v1y); y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX)
            {
                if (y >= RasterizerSizeY)
                    break;

                // Intersect the scanline (at pixel center) with the short edge (v0-v1) and the long edge (v0-v2)

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v0->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v0->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Compute barycentric at (xL, y)

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Precompute barycentric increments in x direction
                // (moves the weights from xL + FixedHalf to the center of the first pixel, ixL + FixedHalf)

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                        ColorPtr[x] = ((x ^ y) & 0x01) ? ColorBright : Color;
                }
            }
        }
    }

    // Bottom part (v1 to v2)

    if (v2y != v1y)
    {
        Scalar dyL = v2->v.y - v1->v.y;
        
        if (dyL > 0.001f)
        {
                         dyL    = 1.0f / dyL;
            const Scalar dxL    = v2->v.x - v1->v.x;
            const Scalar dxR    = v2->v.x - v0->v.x;

            // Re-anchor the row pointers on row v1y. The top part normally leaves them there, but when
            // v0 and v1 sit on different rows while being closer than 0.001 in y the top loop is skipped
            // and the pointers would still be on row v0y (everything below would land one row too high).

            const sint32 OffBottom = v1y * RasterizerSizeX;

            ColorPtr  = RasterizerFramebuffer;
            DepthPtr  = RasterizerDepthbuffer;
            ColorPtr += OffBottom;
            DepthPtr += OffBottom;

            for (sint32 y = v1y; y <= v2y; y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX) 
            {
                if (y >= RasterizerSizeY)
                    break;

                // Intersect the scanline (at pixel center) with the short edge (v1-v2) and the long edge (v0-v2)

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v1->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v1->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Compute barycentric at (xL, y)

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Precompute barycentric increments in x direction
                // (moves the weights from xL + FixedHalf to the center of the first pixel, ixL + FixedHalf)

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                        ColorPtr[x] = ((x ^ y) & 0x01) ? ColorBright : Color;
                }
            }
        }
    }

    return;
}

//
// Rasterize factor colored triangle (with ZTest, perspective corrected)
//
//  v0, v1, v2 - vertices; x/y are used as screen coordinates, z and w are interpolated
//               with the barycentric weights and divided per pixel to get the depth,
//               f is the per vertex factor that gets interpolated across the triangle
//  Lockup     - color table, indexed with the integer part of the interpolated factor,
//               plus 32 (1 << 5) on pixels where (x ^ y) is odd (dither pattern)
//
// Triangles with Area <= 0 are rejected (backfacing or degenerated). With RASTERIZER_USE_HYBRIDS
// triangles with Area below RASTERIZER_HYBRIDS_MARGIN are filled by testing every pixel of their
// bounding box; everything else is filled scanline by scanline in two parts (v0..v1 and v1..v2,
// after sorting the vertices by y).
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
static void __not_in_flash_func(RasterizeTriangleInternalLerpColorFactor)(const FVertexXYXWF * __restrict v0, const FVertexXYXWF * __restrict v1, const FVertexXYXWF * __restrict v2, const uint16 * __restrict Lockup)
{
    Scalar Area = (v2->v.x - v0->v.x) * (v1->v.y - v0->v.y) - (v2->v.y - v0->v.y) * (v1->v.x - v0->v.x);

    if (Area <= 0) // Backface culling (if you want two-sided, return only if Area == 0 (degenerated triangle))
        return;

#ifdef DEMO_EDITOR
    DebugTrissRass++;
#endif

    __restrict uint16 *ColorPtr = RasterizerFramebuffer;
    __restrict uint16 *DepthPtr = RasterizerDepthbuffer;

    #if RASTERIZER_USE_HYBRIDS
    if (Area < RASTERIZER_HYBRIDS_MARGIN)
    {
        //
        // For small triangles minimize setup as much as possible
        //
        const FVector4D &v0v = v0->v;
        const FVector4D &v1v = v1->v;
        const FVector4D &v2v = v2->v;

        // Bounding box of the triangle in whole pixels

        const sint32 min_x = sint32(FScalar::Min(v0v.x, FScalar::Min(v1v.x, v2v.x)));
        const sint32 min_y = sint32(FScalar::Min(v0v.y, FScalar::Min(v1v.y, v2v.y)));
        const sint32 max_x = sint32(FScalar::Max(v0v.x, FScalar::Max(v1v.x, v2v.x)));
        const sint32 max_y = sint32(FScalar::Max(v0v.y, FScalar::Max(v1v.y, v2v.y)));

        if (min_x == max_x && min_y == max_y)
        {
            // Whole triangle falls into one pixel, plot it with depth and factor of v0 and no coverage test

            const sint32 Offpt = min_x + min_y * RasterizerSizeX;

            if (TryUpdateZBuffer(&DepthPtr[Offpt], sint32(v0v.z / v0v.w)))
                ColorPtr[Offpt] = Lockup[sint32(v0->f) + (((min_x ^ min_y) & 0x01) << 5)];

            return;
        }

        const sint32 Offpt = min_y * RasterizerSizeX;

        ColorPtr += Offpt;
        DepthPtr += Offpt;

        // From here on Area holds 1 / Area, it is used to normalize the weights for the factor interpolation

        Area = 1.0f / Area;

        // Edge functions (unnormalized barycentric weights) are split into a part that depends
        // on y only (p*, stepped by dp* per row) and a part that depends on x only (qw*, stepped by dw* per pixel)

        const Scalar dp0 = (v2v.x - v1v.x);
        const Scalar dp1 = (v0v.x - v2v.x);
        const Scalar dp2 = (v1v.x - v0v.x);

        Scalar p_y = Scalar(min_y) + FixedHalf;
        Scalar p_x = Scalar(min_x) + FixedHalf;

        Scalar p0  = (p_y - v1v.y) * dp0;
        Scalar p1  = (p_y - v2v.y) * dp1;
        Scalar p2  = (p_y - v0v.y) * dp2;

        const Scalar dw0  = (v2v.y - v1v.y);
        const Scalar dw1  = (v0v.y - v2v.y);
        const Scalar dw2  = (v1v.y - v0v.y);

        const Scalar qw0  = (p_x - v1v.x) * dw0;
        const Scalar qw1  = (p_x - v2v.x) * dw1;
        const Scalar qw2  = (p_x - v0v.x) * dw2;

        for (sint32 y = min_y; y <= max_y; y++, ColorPtr += RasterizerSizeX, DepthPtr += RasterizerSizeX, p0 += dp0, p1 += dp1, p2 += dp2)
        {
            Scalar w0 = qw0 - p0, w1 = qw1 - p1, w2 = qw2 - p2;

            for (sint32 x = min_x; x <= max_x; x++, w0 += dw0, w1 += dw1, w2 += dw2)
            {
                if (FScalar::Min(w0, FScalar::Min(w1, w2)) >= 0.0f)
                {
                    // Interpolate z and w, then do perspective division every pixel (not a great thing for performance)
                    // (weights are left unnormalized here, their common scale cancels out in the division)

                    sint32 pz = sint32(((w0 * v0v.z + w1 * v1v.z + w2 * v2v.z) / (w0 * v0v.w + w1 * v1v.w + w2 * v2v.w)));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                        ColorPtr[x] = Lockup[sint32((w0 * v0->f + w1 * v1->f + w2 * v2->f) * Area) + (((x ^ y) & 0x01) << 5)]; // Lockup with dither pattern ...
                }
            }
        }

        return;
    }
    #endif

    // Sort vertices by y (v0 topmost, v2 bottommost)

    if (v0->v.y > v1->v.y) SWAP(v0, v1);
    if (v0->v.y > v2->v.y) SWAP(v0, v2);
    if (v1->v.y > v2->v.y) SWAP(v1, v2);

    Scalar dyR = v2->v.y - v0->v.y;
    if (dyR < 0.001f)
        return;

    // For each scanline

    const sint32 v0y = sint32(v0->v.y);
    const sint32 v1y = sint32(v1->v.y);
    const sint32 v2y = sint32(v2->v.y);
    const sint32 Off = v0y * RasterizerSizeX;

    ColorPtr += Off;
    DepthPtr += Off;

    // Precompute barycentric setup

    const Scalar denom = 1.0f / ((v1->v.y - v2->v.y) * (v0->v.x - v2->v.x) + (v2->v.x - v1->v.x) * (v0->v.y - v2->v.y)); // NOTE::It's +/-Area, but after sorting, so sign is not preserved...
    const Scalar dw0dx = (v1->v.y - v2->v.y) * denom;
    const Scalar dw1dx = dyR * denom;
    const Scalar dw2dx = -dw0dx - dw1dx;
                 dyR   = 1.0f / dyR;

    // Top part (v0 to v1)

    if (v1y != v0y)
    {
        Scalar dyL = v1->v.y - v0->v.y;
        
        if (dyL > 0.001f)
        {
                         dyL    = 1.0f / dyL;
            const Scalar dxL    = v1->v.x - v0->v.x;
            const Scalar dxR    = v2->v.x - v0->v.x;

            // Row v1y is left for the bottom part, unless there is no bottom part at all

            for (sint32 y = v0y; y <= ((v2y != v1y) ? v1y-1 : v1y); y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX)
            {
                if (y >= RasterizerSizeY)
                    break;

                // Intersect the scanline (at pixel center) with the short edge (v0-v1) and the long edge (v0-v2)

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v0->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v0->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Compute barycentric at (xL, y)

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Precompute barycentric increments in x direction
                // (moves the weights from xL + FixedHalf to the center of the first pixel, ixL + FixedHalf)

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                        ColorPtr[x] = Lockup[sint32(cur_w0 * v0->f + cur_w1 * v1->f + cur_w2 * v2->f) + (((x ^ y) & 0x01) << 5)]; // Lockup with dither pattern ...
                }
            }
        }
    }

    // Bottom part (v1 to v2)

    if (v2y != v1y)
    {
        Scalar dyL = v2->v.y - v1->v.y;

        if (dyL > 0.001f)
        {
                         dyL    = 1.0f / dyL;
            const Scalar dxL    = v2->v.x - v1->v.x;
            const Scalar dxR    = v2->v.x - v0->v.x;

            // Re-anchor the row pointers on row v1y. The top part normally leaves them there, but when
            // v0 and v1 sit on different rows while being closer than 0.001 in y the top loop is skipped
            // and the pointers would still be on row v0y (everything below would land one row too high).

            const sint32 OffBottom = v1y * RasterizerSizeX;

            ColorPtr  = RasterizerFramebuffer;
            DepthPtr  = RasterizerDepthbuffer;
            ColorPtr += OffBottom;
            DepthPtr += OffBottom;

            for (sint32 y = v1y; y <= v2y; y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX) 
            {
                if (y >= RasterizerSizeY)
                    break;

                // Intersect the scanline (at pixel center) with the short edge (v1-v2) and the long edge (v0-v2)

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v1->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v1->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Compute barycentric at (xL, y)

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Precompute barycentric increments in x direction
                // (moves the weights from xL + FixedHalf to the center of the first pixel, ixL + FixedHalf)

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                        ColorPtr[x] = Lockup[sint32(cur_w0 * v0->f + cur_w1 * v1->f + cur_w2 * v2->f) + (((x ^ y) & 0x01) << 5)]; // Lockup with dither pattern ...
                }
            }
        }
    }

    return;
}

//
// Rasterize vertex colored triangle (with ZTest, perspective corrected)
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
static void RasterizeTriangleInternalLerpColor(const FVertexXYXWC * __restrict v0, const FVertexXYXWC * __restrict v1, const FVertexXYXWC * __restrict v2)
{
    Scalar Area = (v2->v.x - v0->v.x) * (v1->v.y - v0->v.y) - (v2->v.y - v0->v.y) * (v1->v.x - v0->v.x);

    if (Area <= 0) // Backface culling (if you want two-sided, return only if Area == 0 (degenerated triangle))
        return;

#ifdef DEMO_EDITOR
    DebugTrissRass++;
#endif

    __restrict uint16 *ColorPtr = RasterizerFramebuffer;
    __restrict uint16 *DepthPtr = RasterizerDepthbuffer;

    #if RASTERIZER_USE_HYBRIDS
    if (Area < RASTERIZER_HYBRIDS_MARGIN)
    {
        //
        // For small triangles minimize setup as much as possible
        //
        const FVector4D &v0v = v0->v;
        const FVector4D &v1v = v1->v;
        const FVector4D &v2v = v2->v;

        const sint32 min_x = sint32(FScalar::Min(v0v.x, FScalar::Min(v1v.x, v2v.x)));
        const sint32 min_y = sint32(FScalar::Min(v0v.y, FScalar::Min(v1v.y, v2v.y)));
        const sint32 max_x = sint32(FScalar::Max(v0v.x, FScalar::Max(v1v.x, v2v.x)));
        const sint32 max_y = sint32(FScalar::Max(v0v.y, FScalar::Max(v1v.y, v2v.y)));

        if (min_x == max_x && min_y == max_y)
        {
            const sint32 Offpt = min_x + min_y * RasterizerSizeX;

            if (TryUpdateZBuffer(&DepthPtr[Offpt], sint32(v0v.z / v0v.w)))
                ColorPtr[Offpt] = v0->c.toColorNoClip();

            return;
        }

        const sint32 Offpt = min_y * RasterizerSizeX;

        ColorPtr += Offpt;
        DepthPtr += Offpt;

        Area = 1.0f / Area;

        const Scalar dp0 = (v2v.x - v1v.x);
        const Scalar dp1 = (v0v.x - v2v.x);
        const Scalar dp2 = (v1v.x - v0v.x);

        Scalar p_y = Scalar(min_y) + FixedHalf;
        Scalar p_x = Scalar(min_x) + FixedHalf;

        Scalar p0  = (p_y - v1v.y) * dp0;
        Scalar p1  = (p_y - v2v.y) * dp1;
        Scalar p2  = (p_y - v0v.y) * dp2;

        const Scalar dw0  = (v2v.y - v1v.y);
        const Scalar dw1  = (v0v.y - v2v.y);
        const Scalar dw2  = (v1v.y - v0v.y);

        const Scalar qw0  = (p_x - v1v.x) * dw0;
        const Scalar qw1  = (p_x - v2v.x) * dw1;
        const Scalar qw2  = (p_x - v0v.x) * dw2;

        for (sint32 y = min_y; y <= max_y; y++, ColorPtr += RasterizerSizeX, DepthPtr += RasterizerSizeX, p0 += dp0, p1 += dp1, p2 += dp2)
        {
            Scalar w0 = qw0 - p0, w1 = qw1 - p1, w2 = qw2 - p2;

            for (sint32 x = min_x; x <= max_x; x++, w0 += dw0, w1 += dw1, w2 += dw2)
            {
                if (FScalar::Min(w0, FScalar::Min(w1, w2)) >= 0.0f)
                {
                    // Interpolate z and w, then do perspective division every pixel (not a great thing for performance)

                    sint32 pz = sint32(((w0 * v0v.z + w1 * v1v.z + w2 * v2v.z) / (w0 * v0v.w + w1 * v1v.w + w2 * v2v.w)));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                    {
                        #if RASTERIZER_DITHER_COLORS
                        ColorPtr[x] = FColor16::BarycentricInterpolationDither(v0->c, w0 * Area, v1->c, w1 * Area, v2->c, w2 * Area, ((x ^ y) & 0x01));
                        #else
                        ColorPtr[x] = FColor16::BarycentricInterpolation(v0->c, w0 * Area, v1->c, w1 * Area, v2->c, w2 * Area);
                        #endif    
                    }
                }
            }
        }

        return;
    }
    #endif

    if (v0->v.y > v1->v.y) SWAP(v0, v1);
    if (v0->v.y > v2->v.y) SWAP(v0, v2);
    if (v1->v.y > v2->v.y) SWAP(v1, v2);

    Scalar dyR = v2->v.y - v0->v.y;
    if (dyR < 0.001f)
        return;

    // For each scanline

    const sint32 v0y = sint32(v0->v.y);
    const sint32 v1y = sint32(v1->v.y);
    const sint32 v2y = sint32(v2->v.y);
    const sint32 Off = v0y * RasterizerSizeX;

    ColorPtr += Off;
    DepthPtr += Off;

    // Precompute barycentric setup

    const Scalar denom = 1.0f / ((v1->v.y - v2->v.y) * (v0->v.x - v2->v.x) + (v2->v.x - v1->v.x) * (v0->v.y - v2->v.y)); // NOTE::It's +/-Area, but after sorting, so sign is not preserved...
    const Scalar dw0dx = (v1->v.y - v2->v.y) * denom;
    const Scalar dw1dx = dyR * denom;
    const Scalar dw2dx = -dw0dx - dw1dx;
                 dyR   = 1.0f / dyR;

    // Top part (v0 to v1)

    if (v1y != v0y)
    {
        Scalar dyL = v1->v.y - v0->v.y;
        
        if (dyL > 0.001f)
        {
                         dyL    = 1.0f / dyL;
            const Scalar dxL    = v1->v.x - v0->v.x;
            const Scalar dxR    = v2->v.x - v0->v.x;
            const Scalar slopeL = dxL * dyL;
            const Scalar slopeR = dxR * dyR;

            for (sint32 y = v0y; y <= ((v2y != v1y) ? v1y-1 : v1y); y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX)
            {
                if (y >= RasterizerSizeY)
                    break;

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v0->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v0->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Compute barycentric at (xL, y)

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Precompute barycentric increments in x direction

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                    {
                        #if RASTERIZER_DITHER_COLORS
                        ColorPtr[x] = FColor16::BarycentricInterpolationDither(v0->c, cur_w0, v1->c, cur_w1, v2->c, cur_w2, ((x ^ y) & 0x01));
                        #else
                        ColorPtr[x] = FColor16::BarycentricInterpolation(v0->c, cur_w0, v1->c, cur_w1, v2->c, cur_w2);
                        #endif
                    }
                }
            }
        }
    }

    // Bottom part (v1 to v2)

    if (v2y != v1y)
    {
        Scalar dyL = v2->v.y - v1->v.y;

        if (dyL > 0.001f)
        {
                         dyL    = 1.0f / dyL;
            const Scalar dxL    = v2->v.x - v1->v.x;
            const Scalar dxR    = v2->v.x - v0->v.x;
            const Scalar slopeL = dxL * dyL;
            const Scalar slopeR = dxR * dyR;

            for (sint32 y = v1y; y <= v2y; y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX) 
            {
                if (y >= RasterizerSizeY)
                    break;

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v1->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v1->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Compute barycentric at (xL, y)

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Precompute barycentric increments in x direction

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                    {
                        #if RASTERIZER_DITHER_COLORS
                        ColorPtr[x] = FColor16::BarycentricInterpolationDither(v0->c, cur_w0, v1->c, cur_w1, v2->c, cur_w2, ((x ^ y) & 0x01));
                        #else
                        ColorPtr[x] = FColor16::BarycentricInterpolation(v0->c, cur_w0, v1->c, cur_w1, v2->c, cur_w2);
                        #endif
                    }
                }
            }
        }
    }

    return;
}

//
// Rasterize vertex colored triangle (with ZTest, perspective corrected, additive blend)
//
// v0, v1, v2 - triangle vertices. x/y are used directly as pixel coordinates, z and w are
//              interpolated separately and divided per pixel to obtain the depth value,
//              c is the vertex colour.
//
// The pixel that is already in the framebuffer is handed to the colour helpers
// (AddSat / BarycentricInterpolation*) so the triangle is blended on top of it.
// Depth goes through TryTestZBuffer on every path.
// NOTE(review): going by the helper names this tests depth without storing it - confirm against
// the TryTestZBuffer / TryUpdateZBuffer definitions.
//
// Two code paths exist:
//  - RASTERIZER_USE_HYBRIDS and Area < RASTERIZER_HYBRIDS_MARGIN: walk the bounding box of the
//    triangle and evaluate three edge functions per pixel (cheap setup).
//  - otherwise: sort the vertices by y and walk scanlines, top half (v0..v1) then bottom half
//    (v1..v2), stepping normalized barycentric weights along x.
//
// NOTE(review): the bounding-box path does no clamping at all and the scanline path does not
// guard against y < 0, so the vertices are assumed to be clipped to the raster area already.
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
static void __not_in_flash_func(RasterizeTriangleInternalLerpColorAdd)(const FVertexXYXWC* __restrict v0, const FVertexXYXWC* __restrict v1, const FVertexXYXWC* __restrict v2)
{
    Scalar Area = (v2->v.x - v0->v.x) * (v1->v.y - v0->v.y) - (v2->v.y - v0->v.y) * (v1->v.x - v0->v.x);

    if (Area <= 0) // Backface culling (if you want two-sided, return only if Area == 0 (degenerated triangle))
        return;

#ifdef DEMO_EDITOR
    DebugTrissRass++;
#endif

    __restrict uint16 *ColorPtr = RasterizerFramebuffer;
    __restrict uint16 *DepthPtr = RasterizerDepthbuffer;

    #if RASTERIZER_USE_HYBRIDS
    if (Area < RASTERIZER_HYBRIDS_MARGIN)
    {
        //
        // For small triangles minimize setup as much as possible
        //
        const FVector4D &v0v = v0->v;
        const FVector4D &v1v = v1->v;
        const FVector4D &v2v = v2->v;

        // Bounding box of the triangle in whole pixels

        const sint32 min_x = sint32(FScalar::Min(v0v.x, FScalar::Min(v1v.x, v2v.x)));
        const sint32 min_y = sint32(FScalar::Min(v0v.y, FScalar::Min(v1v.y, v2v.y)));
        const sint32 max_x = sint32(FScalar::Max(v0v.x, FScalar::Max(v1v.x, v2v.x)));
        const sint32 max_y = sint32(FScalar::Max(v0v.y, FScalar::Max(v1v.y, v2v.y)));

        if (min_x == max_x && min_y == max_y)
        {
            // Whole triangle sits inside one pixel: use the depth and colour of v0 and skip interpolation

            const sint32 Offpt = min_x + min_y * RasterizerSizeX;

            // Same depth helper as every other pixel site of this additive rasterizer
            // (this shortcut used TryUpdateZBuffer before, unlike the rest of the function)
            if (TryTestZBuffer(&DepthPtr[Offpt], sint32(v0v.z / v0v.w)))
                ColorPtr[Offpt] = v0->c.AddSat(ColorPtr[Offpt]);

            return;
        }

        // Move both row pointers to the first row of the bounding box

        const sint32 Offpt = min_y * RasterizerSizeX;

        ColorPtr += Offpt;
        DepthPtr += Offpt;

        // From here on Area holds 1/Area and is used to normalize the edge functions for colour

        Area = 1.0f / Area;

        // Per-row change of the three edge functions (subtracted, see w0..w2 below)

        const Scalar dp0 = (v2v.x - v1v.x);
        const Scalar dp1 = (v0v.x - v2v.x);
        const Scalar dp2 = (v1v.x - v0v.x);

        // Sample position is the centre of the first pixel of the box

        Scalar p_y = Scalar(min_y) + FixedHalf;
        Scalar p_x = Scalar(min_x) + FixedHalf;

        Scalar p0  = (p_y - v1v.y) * dp0;
        Scalar p1  = (p_y - v2v.y) * dp1;
        Scalar p2  = (p_y - v0v.y) * dp2;

        // Per-pixel (x) change of the three edge functions

        const Scalar dw0  = (v2v.y - v1v.y);
        const Scalar dw1  = (v0v.y - v2v.y);
        const Scalar dw2  = (v1v.y - v0v.y);

        // x dependent part of the edge functions at the first column of the box

        const Scalar qw0  = (p_x - v1v.x) * dw0;
        const Scalar qw1  = (p_x - v2v.x) * dw1;
        const Scalar qw2  = (p_x - v0v.x) * dw2;

        for (sint32 y = min_y; y <= max_y; y++, ColorPtr += RasterizerSizeX, DepthPtr += RasterizerSizeX, p0 += dp0, p1 += dp1, p2 += dp2)
        {
            Scalar w0 = qw0 - p0, w1 = qw1 - p1, w2 = qw2 - p2;

            for (sint32 x = min_x; x <= max_x; x++, w0 += dw0, w1 += dw1, w2 += dw2)
            {
                // Pixel centre is inside the triangle only when no edge function is negative

                if (FScalar::Min(w0, FScalar::Min(w1, w2)) >= 0.0f)
                {
                    // Interpolate z and w, then do perspective division every pixel (not a great thing for performance)
                    // (w0..w2 are not normalized here, the common scale cancels out in the division)

                    sint32 pz = sint32(((w0 * v0v.z + w1 * v1v.z + w2 * v2v.z) / (w0 * v0v.w + w1 * v1v.w + w2 * v2v.w)));

                    if (TryTestZBuffer(&DepthPtr[x], pz))
                    {
                        #if RASTERIZER_DITHER_COLORS
                        ColorPtr[x] = FColor16::BarycentricInterpolationDither(v0->c, w0 * Area, v1->c, w1 * Area, v2->c, w2 * Area, ((x ^ y) & 0x01), ColorPtr[x]);
                        #else
                        ColorPtr[x] = FColor16::BarycentricInterpolation(v0->c, w0 * Area, v1->c, w1 * Area, v2->c, w2 * Area, ColorPtr[x]);
                        #endif
                    }
                }
            }
        }

        return;
    }
    #endif

    // Sort vertices so that v0.y <= v1.y <= v2.y

    if (v0->v.y > v1->v.y) SWAP(v0, v1);
    if (v0->v.y > v2->v.y) SWAP(v0, v2);
    if (v1->v.y > v2->v.y) SWAP(v1, v2);

    // Height of the long edge (v0 to v2), nothing to draw when it is (almost) zero

    Scalar dyR = v2->v.y - v0->v.y;
    if (dyR < 0.001f)
        return;

    // For each scanline

    const sint32 v0y = sint32(v0->v.y);
    const sint32 v1y = sint32(v1->v.y);
    const sint32 v2y = sint32(v2->v.y);
    const sint32 Off = v0y * RasterizerSizeX;

    ColorPtr += Off;
    DepthPtr += Off;

    // Precompute barycentric setup

    const Scalar denom = 1.0f / ((v1->v.y - v2->v.y) * (v0->v.x - v2->v.x) + (v2->v.x - v1->v.x) * (v0->v.y - v2->v.y)); // NOTE::It's +/-Area, but after sorting, so sign is not preserved...
    const Scalar dw0dx = (v1->v.y - v2->v.y) * denom;
    const Scalar dw1dx = dyR * denom;
    const Scalar dw2dx = -dw0dx - dw1dx;
                 dyR   = 1.0f / dyR; // From here on dyR is the reciprocal height of the long edge

    // Top part (v0 to v1)

    if (v1y != v0y)
    {
        Scalar dyL = v1->v.y - v0->v.y;

        if (dyL > 0.001f)
        {
                         dyL = 1.0f / dyL;
            const Scalar dxL = v1->v.x - v0->v.x;
            const Scalar dxR = v2->v.x - v0->v.x;

            // When a bottom part follows, row v1y is left for it
            for (sint32 y = v0y; y <= ((v2y != v1y) ? v1y-1 : v1y); y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX)
            {
                if (y >= RasterizerSizeY)
                    break;

                // Span ends on this row: short edge (v0 to v1) and long edge (v0 to v2)

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v0->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v0->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Span in whole pixels, clamped to the raster width

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric weights at (xL + half, py)

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Shift the weights to the centre of pixel ixL, then step them by the x increments

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    // Pixel centre outside of the triangle

                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryTestZBuffer(&DepthPtr[x], pz))
                    {
                        #if RASTERIZER_DITHER_COLORS
                        ColorPtr[x] = FColor16::BarycentricInterpolationDither(v0->c, cur_w0, v1->c, cur_w1, v2->c, cur_w2, ((x ^ y) & 0x01), ColorPtr[x]);
                        #else
                        ColorPtr[x] = FColor16::BarycentricInterpolation(v0->c, cur_w0, v1->c, cur_w1, v2->c, cur_w2, ColorPtr[x]);
                        #endif
                    }
                }
            }
        }
        else
        {
            // Short edge is too flat to step along, so no row of the top half is drawn.
            // The row pointers still have to reach row v1y, otherwise the bottom half
            // would be written (v1y - v0y) rows too high.

            const sint32 Skip = (v1y - v0y) * RasterizerSizeX;

            ColorPtr += Skip;
            DepthPtr += Skip;
        }
    }

    // Bottom part (v1 to v2)

    if (v2y != v1y)
    {
        Scalar dyL = v2->v.y - v1->v.y;

        if (dyL > 0.001f)
        {
                         dyL = 1.0f / dyL;
            const Scalar dxL = v2->v.x - v1->v.x;
            const Scalar dxR = v2->v.x - v0->v.x;

            for (sint32 y = v1y; y <= v2y; y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX)
            {
                if (y >= RasterizerSizeY)
                    break;

                // Span ends on this row: short edge (v1 to v2) and long edge (v0 to v2)

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v1->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v1->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Span in whole pixels, clamped to the raster width

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric weights at (xL + half, py)

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Shift the weights to the centre of pixel ixL, then step them by the x increments

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    // Pixel centre outside of the triangle

                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryTestZBuffer(&DepthPtr[x], pz))
                    {
                        #if RASTERIZER_DITHER_COLORS
                        ColorPtr[x] = FColor16::BarycentricInterpolationDither(v0->c, cur_w0, v1->c, cur_w1, v2->c, cur_w2, ((x ^ y) & 0x01), ColorPtr[x]);
                        #else
                        ColorPtr[x] = FColor16::BarycentricInterpolation(v0->c, cur_w0, v1->c, cur_w1, v2->c, cur_w2, ColorPtr[x]);
                        #endif
                    }
                }
            }
        }
    }

    return;
}

//
// Rasterize textured triangle (with ZTest, perspective corrected)
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
void RasterizeTriangleInternalTextured(const FVertexXYXWUV* __restrict v0, const FVertexXYXWUV* __restrict v1, const FVertexXYXWUV* __restrict v2, const uint16* __restrict Texture)
{
    Scalar Area = (v2->v.x - v0->v.x) * (v1->v.y - v0->v.y) - (v2->v.y - v0->v.y) * (v1->v.x - v0->v.x);

    if (Area <= 0) // Backface culling (if you want two-sided, return only if Area == 0 (degenerated triangle))
        return;

#ifdef DEMO_EDITOR
    DebugTrissRass++;
#endif

    __restrict uint16 *ColorPtr = RasterizerFramebuffer;
    __restrict uint16 *DepthPtr = RasterizerDepthbuffer;

    #if RASTERIZER_USE_HYBRIDS
    if (Area < RASTERIZER_HYBRIDS_MARGIN)
    {
        //
        // For small triangles minimize setup as much as possible
        //
        Area = 1.0f / Area;

        const FVector4D &v0v = v0->v;
        const FVector4D &v1v = v1->v;
        const FVector4D &v2v = v2->v;

        const sint32 min_x = sint32(FScalar::Min(v0v.x, FScalar::Min(v1v.x, v2v.x)));
        const sint32 min_y = sint32(FScalar::Min(v0v.y, FScalar::Min(v1v.y, v2v.y)));
        const sint32 max_x = sint32(FScalar::Max(v0v.x, FScalar::Max(v1v.x, v2v.x)));
        const sint32 max_y = sint32(FScalar::Max(v0v.y, FScalar::Max(v1v.y, v2v.y)));

        const sint32 Offpt = min_y * RasterizerSizeX;

        *ColorPtr += Offpt;
        *DepthPtr += Offpt;

        const Scalar dp0 = (v2v.x - v1v.x);
        const Scalar dp1 = (v0v.x - v2v.x);
        const Scalar dp2 = (v1v.x - v0v.x);

        Scalar p_y = Scalar(min_y) + FixedHalf;
        Scalar p_x = Scalar(min_x) + FixedHalf;

        Scalar p0  = (p_y - v1v.y) * dp0;
        Scalar p1  = (p_y - v2v.y) * dp1;
        Scalar p2  = (p_y - v0v.y) * dp2;

        const Scalar dw0  = (v2v.y - v1v.y);
        const Scalar dw1  = (v0v.y - v2v.y);
        const Scalar dw2  = (v1v.y - v0v.y);

        const Scalar qw0  = (p_x - v1v.x) * dw0;
        const Scalar qw1  = (p_x - v2v.x) * dw1;
        const Scalar qw2  = (p_x - v0v.x) * dw2;

        for (sint32 y = min_y; y <= max_y; y++, ColorPtr += RasterizerSizeX, DepthPtr += RasterizerSizeX, p0 += dp0, p1 += dp1, p2 += dp2)
        {
            Scalar w0 = qw0 - p0, w1 = qw1 - p1, w2 = qw2 - p2;

            for (sint32 x = min_x; x <= max_x; x++, w0 += dw0, w1 += dw1, w2 += dw2)
            {
                if (FScalar::Min(w0, FScalar::Min(w1, w2)) >= 0.0f)
                {
                    // Interpolate z and w, then do perspective division every pixel (not a great thing for performance)

                    sint32 pz = sint32(((w0 * v0v.z + w1 * v1v.z + w2 * v2v.z) / (w0 * v0v.w + w1 * v1v.w + w2 * v2v.w)));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                    {
                        FVector2D uv = (v0->uv * w0 + v1->uv * w1 + v2->uv * w2) * Area;
                        uint32    tu = uint32(uv.x) & (FSoftwareRasterizer::TextureCacheSize-1);
                        uint32    tv = uint32(uv.y) & (FSoftwareRasterizer::TextureCacheSize-1);
                        ColorPtr[x]  = Texture[tu + tv * FSoftwareRasterizer::TextureCacheSize];
                    }
                }
            }
        }

        return;
    }
    #endif

    if (v0->v.y > v1->v.y) SWAP(v0, v1);
    if (v0->v.y > v2->v.y) SWAP(v0, v2);
    if (v1->v.y > v2->v.y) SWAP(v1, v2);

    Scalar dyR = v2->v.y - v0->v.y;
    if (dyR < 0.001f)
        return;

    // For each scanline

    const sint32 v0y = sint32(v0->v.y);
    const sint32 v1y = sint32(v1->v.y);
    const sint32 v2y = sint32(v2->v.y);
    const sint32 Off = v0y * RasterizerSizeX;

    ColorPtr += Off;
    DepthPtr += Off;

    // Precompute barycentric setup

    const Scalar denom = 1.0f / ((v1->v.y - v2->v.y) * (v0->v.x - v2->v.x) + (v2->v.x - v1->v.x) * (v0->v.y - v2->v.y)); // NOTE::It's +/-Area, but after sorting, so sign is not preserved...
    const Scalar dw0dx = (v1->v.y - v2->v.y) * denom;
    const Scalar dw1dx = dyR * denom;
    const Scalar dw2dx = -dw0dx - dw1dx;
                 dyR   = 1.0f / dyR;

    // Top part (v0 to v1)

    if (v1y != v0y)
    {
        Scalar dyL = v1->v.y - v0->v.y;
        
        if (dyL > 0.001f)
        {
                         dyL    = 1.0f / dyL;
            const Scalar dxL    = v1->v.x - v0->v.x;
            const Scalar dxR    = v2->v.x - v0->v.x;
            const Scalar slopeL = dxL * dyL;
            const Scalar slopeR = dxR * dyR;

            for (sint32 y = v0y; y <= ((v2y != v1y) ? v1y-1 : v1y); y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX)
            {
                if (y >= RasterizerSizeY)
                    break;

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v0->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v0->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Compute barycentric at (xL, y)

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Precompute barycentric increments in x direction

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                    {
                        FVector2D uv = v0->uv * cur_w0 + v1->uv * cur_w1 + v2->uv * cur_w2;
                        uint32    tu = uint32(uv.x) & (FSoftwareRasterizer::TextureCacheSize-1);
                        uint32    tv = uint32(uv.y) & (FSoftwareRasterizer::TextureCacheSize-1);
                        ColorPtr[x]  = Texture[tu + tv * FSoftwareRasterizer::TextureCacheSize];
                    }
                }
            }
        }
    }

    // Bottom part (v1 to v2)

    if (v2y != v1y)
    {
        Scalar dyL = v2->v.y - v1->v.y;

        if (dyL > 0.001f)
        {
                         dyL    = 1.0f / dyL;
            const Scalar dxL    = v2->v.x - v1->v.x;
            const Scalar dxR    = v2->v.x - v0->v.x;
            const Scalar slopeL = dxL * dyL;
            const Scalar slopeR = dxR * dyR;

            for (sint32 y = v1y; y <= v2y; y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX) 
            {
                if (y >= RasterizerSizeY)
                    break;

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v1->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v1->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Compute barycentric at (xL, y)

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Precompute barycentric increments in x direction

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                    {
                        FVector2D uv = v0->uv * cur_w0 + v1->uv * cur_w1 + v2->uv * cur_w2;
                        uint32    tu = uint32(uv.x) & (FSoftwareRasterizer::TextureCacheSize-1);
                        uint32    tv = uint32(uv.y) & (FSoftwareRasterizer::TextureCacheSize-1);
                        ColorPtr[x]  = Texture[tu + tv * FSoftwareRasterizer::TextureCacheSize];
                    }
                }
            }
        }
    }

    return;
}

//
// Rasterize textured, shaded triangle (with ZTest, perspective corrected)
//
// v0, v1, v2 - triangle vertices. x/y are used directly as pixel coordinates, z and w are
//              interpolated separately and divided per pixel to obtain the depth value,
//              uv is interpolated with the same weights and used as a texel position,
//              f is the per-vertex shading factor.
// Texture    - texel array that is FSoftwareRasterizer::TextureCacheSize texels wide; u and v are
//              wrapped with (TextureCacheSize - 1) as a bit mask.
//              NOTE(review): the mask only wraps correctly for a power-of-two TextureCacheSize - confirm.
//
// Pixels are written only where TryUpdateZBuffer accepts the computed depth. The fetched texel
// is passed through FColor16::ScaledValueToWhite with the interpolated f scaled by 65536.
// NOTE(review): the exact meaning / valid range of f is defined by ScaledValueToWhite - confirm there.
//
// Two code paths exist:
//  - RASTERIZER_USE_HYBRIDS and Area < RASTERIZER_HYBRIDS_MARGIN: walk the bounding box of the
//    triangle and evaluate three edge functions per pixel (cheap setup).
//  - otherwise: sort the vertices by y and walk scanlines, top half (v0..v1) then bottom half
//    (v1..v2), stepping normalized barycentric weights along x.
//
// NOTE(review): the bounding-box path does no clamping at all and the scanline path does not
// guard against y < 0, so the vertices are assumed to be clipped to the raster area already.
//
#ifdef PI_PICO_TARGET
__attribute__((noinline))
#endif
static void __not_in_flash_func(RasterizeTriangleInternalTexturedLit)(const FVertexXYXWUVF* __restrict v0, const FVertexXYXWUVF* __restrict v1, const FVertexXYXWUVF* __restrict v2, const uint16* __restrict Texture)
{
    Scalar Area = (v2->v.x - v0->v.x) * (v1->v.y - v0->v.y) - (v2->v.y - v0->v.y) * (v1->v.x - v0->v.x);

    if (Area <= 0) // Backface culling (if you want two-sided, return only if Area == 0 (degenerated triangle))
        return;

#ifdef DEMO_EDITOR
    DebugTrissRass++;
#endif

    __restrict uint16 *ColorPtr = RasterizerFramebuffer;
    __restrict uint16 *DepthPtr = RasterizerDepthbuffer;

    #if RASTERIZER_USE_HYBRIDS
    if (Area < RASTERIZER_HYBRIDS_MARGIN)
    {
        //
        // For small triangles minimize setup as much as possible
        //

        // From here on Area holds 1/Area and is used to normalize the edge functions for uv and f

        Area = 1.0f / Area;

        const FVector4D &v0v = v0->v;
        const FVector4D &v1v = v1->v;
        const FVector4D &v2v = v2->v;

        // Bounding box of the triangle in whole pixels

        const sint32 min_x = sint32(FScalar::Min(v0v.x, FScalar::Min(v1v.x, v2v.x)));
        const sint32 min_y = sint32(FScalar::Min(v0v.y, FScalar::Min(v1v.y, v2v.y)));
        const sint32 max_x = sint32(FScalar::Max(v0v.x, FScalar::Max(v1v.x, v2v.x)));
        const sint32 max_y = sint32(FScalar::Max(v0v.y, FScalar::Max(v1v.y, v2v.y)));

        // Move both row pointers to the first row of the bounding box

        const sint32 Offpt = min_y * RasterizerSizeX;

        ColorPtr += Offpt;
        DepthPtr += Offpt;

        // Per-row change of the three edge functions (subtracted, see w0..w2 below)

        const Scalar dp0 = (v2v.x - v1v.x);
        const Scalar dp1 = (v0v.x - v2v.x);
        const Scalar dp2 = (v1v.x - v0v.x);

        // Sample position is the centre of the first pixel of the box

        Scalar p_y = Scalar(min_y) + FixedHalf;
        Scalar p_x = Scalar(min_x) + FixedHalf;

        Scalar p0  = (p_y - v1v.y) * dp0;
        Scalar p1  = (p_y - v2v.y) * dp1;
        Scalar p2  = (p_y - v0v.y) * dp2;

        // Per-pixel (x) change of the three edge functions

        const Scalar dw0  = (v2v.y - v1v.y);
        const Scalar dw1  = (v0v.y - v2v.y);
        const Scalar dw2  = (v1v.y - v0v.y);

        // x dependent part of the edge functions at the first column of the box

        const Scalar qw0  = (p_x - v1v.x) * dw0;
        const Scalar qw1  = (p_x - v2v.x) * dw1;
        const Scalar qw2  = (p_x - v0v.x) * dw2;

        for (sint32 y = min_y; y <= max_y; y++, ColorPtr += RasterizerSizeX, DepthPtr += RasterizerSizeX, p0 += dp0, p1 += dp1, p2 += dp2)
        {
            Scalar w0 = qw0 - p0, w1 = qw1 - p1, w2 = qw2 - p2;

            for (sint32 x = min_x; x <= max_x; x++, w0 += dw0, w1 += dw1, w2 += dw2)
            {
                // Pixel centre is inside the triangle only when no edge function is negative

                if (FScalar::Min(w0, FScalar::Min(w1, w2)) >= 0.0f)
                {
                    // Interpolate z and w, then do perspective division every pixel (not a great thing for performance)
                    // (w0..w2 are not normalized here, the common scale cancels out in the division)

                    sint32 pz = sint32(((w0 * v0v.z + w1 * v1v.z + w2 * v2v.z) / (w0 * v0v.w + w1 * v1v.w + w2 * v2v.w)));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                    {
                        // Normalized uv and shading factor, texel fetched with wrapped uv and then shaded

                        FVector2D uv = (v0->uv * w0 + v1->uv * w1 + v2->uv * w2) * Area;
                        Scalar    f  = (v0->f  * w0 + v1->f  * w1 + v2->f  * w2) * Area;
                        uint32    tu = uint32(uv.x) & (FSoftwareRasterizer::TextureCacheSize-1);
                        uint32    tv = uint32(uv.y) & (FSoftwareRasterizer::TextureCacheSize-1);
                        ColorPtr[x]  = FColor16(Texture[tu + tv * FSoftwareRasterizer::TextureCacheSize]).ScaledValueToWhite(sint32(f * 65536.0f)).toColorNoClip();
                    }
                }
            }
        }

        return;
    }
    #endif

    // Sort vertices so that v0.y <= v1.y <= v2.y

    if (v0->v.y > v1->v.y) SWAP(v0, v1);
    if (v0->v.y > v2->v.y) SWAP(v0, v2);
    if (v1->v.y > v2->v.y) SWAP(v1, v2);

    // Height of the long edge (v0 to v2), nothing to draw when it is (almost) zero

    Scalar dyR = v2->v.y - v0->v.y;
    if (dyR < 0.001f)
        return;

    // For each scanline

    const sint32 v0y = sint32(v0->v.y);
    const sint32 v1y = sint32(v1->v.y);
    const sint32 v2y = sint32(v2->v.y);
    const sint32 Off = v0y * RasterizerSizeX;

    ColorPtr += Off;
    DepthPtr += Off;

    // Precompute barycentric setup

    const Scalar denom = 1.0f / ((v1->v.y - v2->v.y) * (v0->v.x - v2->v.x) + (v2->v.x - v1->v.x) * (v0->v.y - v2->v.y)); // NOTE::It's +/-Area, but after sorting, so sign is not preserved...
    const Scalar dw0dx = (v1->v.y - v2->v.y) * denom;
    const Scalar dw1dx = dyR * denom;
    const Scalar dw2dx = -dw0dx - dw1dx;
                 dyR   = 1.0f / dyR; // From here on dyR is the reciprocal height of the long edge

    // Top part (v0 to v1)

    if (v1y != v0y)
    {
        Scalar dyL = v1->v.y - v0->v.y;

        if (dyL > 0.001f)
        {
                         dyL = 1.0f / dyL;
            const Scalar dxL = v1->v.x - v0->v.x;
            const Scalar dxR = v2->v.x - v0->v.x;

            // When a bottom part follows, row v1y is left for it
            for (sint32 y = v0y; y <= ((v2y != v1y) ? v1y-1 : v1y); y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX)
            {
                if (y >= RasterizerSizeY)
                    break;

                // Span ends on this row: short edge (v0 to v1) and long edge (v0 to v2)

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v0->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v0->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Span in whole pixels, clamped to the raster width

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric weights at (xL + half, py)

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Shift the weights to the centre of pixel ixL, then step them by the x increments

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    // Pixel centre outside of the triangle

                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                    {
                        // Interpolated uv and shading factor, texel fetched with wrapped uv and then shaded

                        FVector2D uv = v0->uv * cur_w0 + v1->uv * cur_w1 + v2->uv * cur_w2;
                        Scalar    f  = v0->f  * cur_w0 + v1->f  * cur_w1 + v2->f  * cur_w2;
                        uint32    tu = uint32(uv.x) & (FSoftwareRasterizer::TextureCacheSize-1);
                        uint32    tv = uint32(uv.y) & (FSoftwareRasterizer::TextureCacheSize-1);
                        ColorPtr[x]  = FColor16(Texture[tu + tv * FSoftwareRasterizer::TextureCacheSize]).ScaledValueToWhite(sint32(f * 65536.0f)).toColorNoClip();
                    }
                }
            }
        }
        else
        {
            // Short edge is too flat to step along, so no row of the top half is drawn.
            // The row pointers still have to reach row v1y, otherwise the bottom half
            // would be written (v1y - v0y) rows too high.

            const sint32 Skip = (v1y - v0y) * RasterizerSizeX;

            ColorPtr += Skip;
            DepthPtr += Skip;
        }
    }

    // Bottom part (v1 to v2)

    if (v2y != v1y)
    {
        Scalar dyL = v2->v.y - v1->v.y;

        if (dyL > 0.001f)
        {
                         dyL = 1.0f / dyL;
            const Scalar dxL = v2->v.x - v1->v.x;
            const Scalar dxR = v2->v.x - v0->v.x;

            for (sint32 y = v1y; y <= v2y; y++, ColorPtr+=RasterizerSizeX, DepthPtr+=RasterizerSizeX)
            {
                if (y >= RasterizerSizeY)
                    break;

                // Span ends on this row: short edge (v1 to v2) and long edge (v0 to v2)

                const Scalar py = Scalar(y) + FixedHalf;
                const Scalar tL = (py - v1->v.y) * dyL;
                const Scalar tR = (py - v0->v.y) * dyR;
                      Scalar xL = v1->v.x + tL * dxL;
                      Scalar xR = v0->v.x + tR * dxR;

                if (xL > xR) SWAP(xL, xR);

                // Span in whole pixels, clamped to the raster width

                const sint32 ixL = MAX(0, sint32(xL));
                const sint32 ixR = MIN(sint32(xR), RasterizerSizeX - 1);

                // Barycentric weights at (xL + half, py)

                const Scalar p_x = xL + FixedHalf;
                const Scalar w0 = ((v1->v.y - v2->v.y) * (p_x - v2->v.x) + (v2->v.x - v1->v.x) * (py - v2->v.y)) * denom;
                const Scalar w1 = ((v2->v.y - v0->v.y) * (p_x - v2->v.x) + (v0->v.x - v2->v.x) * (py - v2->v.y)) * denom;
                const Scalar w2 = 1.0f - w0 - w1;

                // Shift the weights to the centre of pixel ixL, then step them by the x increments

                Scalar cur_dd = Scalar(ixL) - xL;
                Scalar cur_w0 = w0 + dw0dx * cur_dd;
                Scalar cur_w1 = w1 + dw1dx * cur_dd;
                Scalar cur_w2 = w2 + dw2dx * cur_dd;

                for (sint32 x = ixL; x <= ixR; x++, cur_w0 += dw0dx, cur_w1 += dw1dx, cur_w2 += dw2dx)
                {
                    // Pixel centre outside of the triangle

                    if (FScalar::Min(cur_w0, FScalar::Min(cur_w1, cur_w2)) < 0.0f)
                        continue;

                    sint32 pz = sint32((cur_w0 * v0->v.z + cur_w1 * v1->v.z + cur_w2 * v2->v.z) / (cur_w0 * v0->v.w + cur_w1 * v1->v.w + cur_w2 * v2->v.w));

                    if (TryUpdateZBuffer(&DepthPtr[x], pz))
                    {
                        // Interpolated uv and shading factor, texel fetched with wrapped uv and then shaded

                        FVector2D uv = v0->uv * cur_w0 + v1->uv * cur_w1 + v2->uv * cur_w2;
                        Scalar    f  = v0->f  * cur_w0 + v1->f  * cur_w1 + v2->f  * cur_w2;
                        uint32    tu = uint32(uv.x) & (FSoftwareRasterizer::TextureCacheSize-1);
                        uint32    tv = uint32(uv.y) & (FSoftwareRasterizer::TextureCacheSize-1);
                        ColorPtr[x]  = FColor16(Texture[tu + tv * FSoftwareRasterizer::TextureCacheSize]).ScaledValueToWhite(sint32(f * 65536.0f)).toColorNoClip();
                    }
                }
            }
        }
    }

    return;
}
