#include "Algorithm/GGXMultiScatter.h"

#include "Algorithm/MicrofacetGGX.h"

#define GGX_MS_LUT_NOV_SIZE 32
#define GGX_MS_LUT_ROUGH_SIZE 32
#define GGX_MS_LUT_SAMPLES 256

static float g_ggx_E_lut[GGX_MS_LUT_ROUGH_SIZE][GGX_MS_LUT_NOV_SIZE];
static float g_ggx_Eavg_lut[GGX_MS_LUT_ROUGH_SIZE];
static bool g_ggx_ms_lut_initialized = false;

static inline float saturatef(float x)
{
    return fminf(fmaxf(x, 0.0f), 1.0f);
}

static inline float radical_inverse_vdc(uint32_t bits)
{
    bits = (bits << 16u) | (bits >> 16u);
    bits = ((bits & 0x55555555u) << 1u) | ((bits & 0xAAAAAAAAu) >> 1u);
    bits = ((bits & 0x33333333u) << 2u) | ((bits & 0xCCCCCCCCu) >> 2u);
    bits = ((bits & 0x0F0F0F0Fu) << 4u) | ((bits & 0xF0F0F0F0u) >> 4u);
    bits = ((bits & 0x00FF00FFu) << 8u) | ((bits & 0xFF00FF00u) >> 8u);
    return (float)bits * 2.3283064365386963e-10f; // / 2^32
}

static inline vec3s vec3_div_safe(vec3s a, vec3s b, float eps)
{
    return (vec3s){a.x / fmaxf(b.x, eps), a.y / fmaxf(b.y, eps), a.z / fmaxf(b.z, eps)};
}

static inline vec3s fresnel_schlick_avg(vec3s f0)
{
    // Cosine-weighted hemispherical average for Schlick Fresnel.
    // Approximation: <(1 - cos)^5>_cos = 1/21.
    const float t = 1.0f / 21.0f;
    return glms_vec3_add(f0, glms_vec3_scale(glms_vec3_sub(glms_vec3_one(), f0), t));
}

static inline float ggx_ms_Eavg(float roughness)
{
    ggx_ms_init_lut_once();

    roughness = saturatef(roughness);
    float y = roughness * (float)(GGX_MS_LUT_ROUGH_SIZE - 1);
    uint32_t y0 = (uint32_t)floorf(y);
    uint32_t y1 = (y0 + 1u < GGX_MS_LUT_ROUGH_SIZE) ? (y0 + 1u) : y0;
    float ty = y - (float)y0;
    return saturatef(glm_lerp(g_ggx_Eavg_lut[y0], g_ggx_Eavg_lut[y1], ty));
}

float ggx_ms_E(float NoV, float roughness)
{
    ggx_ms_init_lut_once();

    NoV = saturatef(NoV);
    roughness = saturatef(roughness);

    float x = NoV * (float)(GGX_MS_LUT_NOV_SIZE - 1);
    float y = roughness * (float)(GGX_MS_LUT_ROUGH_SIZE - 1);

    uint32_t x0 = (uint32_t)floorf(x);
    uint32_t y0 = (uint32_t)floorf(y);
    uint32_t x1 = (x0 + 1u < GGX_MS_LUT_NOV_SIZE) ? (x0 + 1u) : x0;
    uint32_t y1 = (y0 + 1u < GGX_MS_LUT_ROUGH_SIZE) ? (y0 + 1u) : y0;

    float tx = x - (float)x0;
    float ty = y - (float)y0;

    float e00 = g_ggx_E_lut[y0][x0];
    float e10 = g_ggx_E_lut[y0][x1];
    float e01 = g_ggx_E_lut[y1][x0];
    float e11 = g_ggx_E_lut[y1][x1];

    float e0 = glm_lerp(e00, e10, tx);
    float e1 = glm_lerp(e01, e11, tx);
    return saturatef(glm_lerp(e0, e1, ty));
}

void ggx_ms_init_lut_once(void)
{
    if (g_ggx_ms_lut_initialized)
    {
        return;
    }

#ifdef _OPENMP
#pragma omp critical(ggx_ms_lut_init)
#endif
    {
        if (!g_ggx_ms_lut_initialized)
        {
            vec3s n = (vec3s){0.0f, 0.0f, 1.0f};

            for (uint32_t ry = 0; ry < GGX_MS_LUT_ROUGH_SIZE; ry++)
            {
                float roughness = ((float)ry + 0.5f) / (float)GGX_MS_LUT_ROUGH_SIZE;
                roughness = fmaxf(roughness, 0.001f);

                float Eavg = 0.0f;

                for (uint32_t ix = 0; ix < GGX_MS_LUT_NOV_SIZE; ix++)
                {
                    float NoV = ((float)ix + 0.5f) / (float)GGX_MS_LUT_NOV_SIZE;
                    NoV = fmaxf(NoV, 1e-4f);

                    float sin_theta = sqrtf(fmaxf(0.0f, 1.0f - NoV * NoV));
                    vec3s v = glms_vec3_normalize((vec3s){sin_theta, 0.0f, NoV});

                    float sum = 0.0f;
                    uint32_t valid = 0;
                    uint32_t scramble = hash_uint32((ry + 1u) * 0x9E3779B9u ^ (ix + 1u) * 0x7F4A7C15u);

                    for (uint32_t s = 0; s < GGX_MS_LUT_SAMPLES; s++)
                    {
                        float u1 = ((float)s + 0.5f) / (float)GGX_MS_LUT_SAMPLES;
                        float u2 = radical_inverse_vdc(s ^ scramble);

                        vec3s h = ggx_sample_vndf(n, v, roughness, u1, u2);
                        vec3s l = glms_vec3_reflect(glms_vec3_negate(v), h);
                        float NoL = l.z;
                        if (NoL <= 0.0f)
                        {
                            continue;
                        }

                        // For F=1, importance sampling with VNDF yields a simple estimator:
                        // rho_ss(v) = E[ G1(NoL) ].
                        float alpha_lut = roughness * roughness;
                        float alpha2_lut = alpha_lut * alpha_lut;
                        float lambda_v = sqrtf(alpha2_lut + (1.0f - alpha2_lut) * NoV * NoV);
                        float lambda_l = sqrtf(alpha2_lut + (1.0f - alpha2_lut) * NoL * NoL);
                        sum += NoL * (NoV + lambda_v) / fmaxf(NoL * lambda_v + NoV * lambda_l, 1e-7f);

                        valid++;
                    }

                    // float E = (valid > 0u) ? (sum / (float)valid) : 0.0f;
                    float E = sum / (float)GGX_MS_LUT_SAMPLES;
                    E = saturatef(E);

                    g_ggx_E_lut[ry][ix] = E;

                    // Accumulate hemispherical average with cosine weighting:
                    // Eavg = 2 * \int_0^1 E(mu) * mu dmu
                    float mu = NoV;
                    Eavg += E * mu;
                }

                Eavg = 2.0f * Eavg * (1.0f / (float)GGX_MS_LUT_NOV_SIZE);
                g_ggx_Eavg_lut[ry] = saturatef(Eavg);
            }

            g_ggx_ms_lut_initialized = true;
        }
    }
}

vec3s ggx_multi_scatter_lambert(vec3s f0, float NoV, float NoL, float roughness)
{
    float Eo = ggx_ms_E(NoV, roughness);
    float Ei = ggx_ms_E(NoL, roughness);
    float Eavg = ggx_ms_Eavg(roughness);

    vec3s Favg = fresnel_schlick_avg(f0);

    // Series factor for multiple bounces inside the microfacet layer.
    // C = Favg^2 / (1 - Favg * (1 - Eavg))
    vec3s Favg2 = glms_vec3_mul(Favg, Favg);
    vec3s denom = glms_vec3_sub(glms_vec3_one(), glms_vec3_scale(Favg, (1.0f - Eavg)));
    vec3s C = vec3_div_safe(Favg2, denom, 1e-6f);

    float scale = (1.0f - Eo) * (1.0f - Ei);
    scale /= (PI * fmaxf(1.0f - Eavg, 1e-6f));

    return glms_vec3_scale(C, scale);
}