#include "Rendering/Texture.h"
#include "Common/String.h"

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

// Reads channel `channel` of `pixel` and divides it by `max_value`.
// When the pixel has no such channel (channel >= channel_count) the result is
// `fallback / max_value` instead.
// Every argument and the whole expansion are parenthesised so the macro composes
// correctly with surrounding operators and with compound argument expressions.
// `channel` is evaluated twice: do not pass an expression with side effects.
#define GET_CHANNEL_DATA(pixel, channel, channel_count, fallback, max_value) \
    ((((channel) < (channel_count)) ? (pixel)[(channel)] : (fallback)) / (max_value))

// Allocates storage for `size` texture assets and stores the resulting empty
// collection (count == 0) in `*textures`.
//
// Returns true on success. Returns false, leaving `*textures` untouched, when
// `textures` is NULL or the allocation fails. Note that a `size` of 0 has an
// implementation-defined allocation result and may therefore report failure.
bool texture_collection_init(uint16_t size, texture_collection_t* textures)
{
    if (textures == NULL)
    {
        return false;
    }

    texture_collection_t temp = {0};

    // Zero-filled so that slots which have not been populated yet never hold garbage.
    temp.buffer = (texture_asset_t*)calloc(size, sizeof(texture_asset_t));
    if (temp.buffer == NULL)
    {
        return false;
    }

    temp.size = size;
    temp.count = 0;
    *textures = temp;

    return true;
}

// Changes the capacity of `textures` to `size` slots.
//
// The request is silently ignored (the collection stays valid and unchanged) when:
//   - `textures` is NULL,
//   - the capacity already equals `size`,
//   - `size` is 0 or smaller than the number of stored textures,
//   - the reallocation fails.
// A `size` equal to INVALID_TEXTURE_ID is reduced by one before use.
// Callers that need the new capacity must re-check `textures->size` afterwards.
void texture_collection_resize(texture_collection_t* textures, uint16_t size)
{
    if (textures == NULL)
    {
        return;
    }

    if (size == INVALID_TEXTURE_ID)
    {
        size = INVALID_TEXTURE_ID - 1;
    }

    if (size == textures->size)
    {
        return;
    }

    // realloc(ptr, 0) may release the block and return NULL, which would leave
    // textures->buffer dangling. Shrinking below `count` would drop live assets
    // without freeing what they own. Refuse both.
    if (size == 0 || size < textures->count)
    {
        return;
    }

    texture_asset_t* temp = (texture_asset_t*)realloc(textures->buffer, size * sizeof(texture_asset_t));
    if (temp != NULL)
    {
        textures->buffer = temp;
        textures->size = size;
    }
}

// Releases every stored texture (via texture_free), every asset name and the
// slot buffer itself, then resets the collection to an empty state.
// Safe to call with NULL and safe to call more than once on the same collection.
void texture_collection_free(texture_collection_t* textures)
{
    if (textures == NULL)
    {
        return;
    }

    if (textures->buffer != NULL)
    {
        for (uint16_t i = 0; i < textures->count; i++)
        {
            texture_asset_t* asset = &textures->buffer[i];

            texture_free(&asset->texture);

            free(asset->full_name); // free(NULL) is a no-op
            asset->full_name = NULL;
        }

        free(textures->buffer);
        textures->buffer = NULL;
    }

    // Without this a second call would walk `count` entries of a NULL buffer.
    textures->count = 0;
    textures->size = 0;
}


// Copies the raw bytes of the pixel at (x, y) from a row-major image buffer
// into `out_pixel_data`.
// One pixel occupies `channel_count * stride` bytes, so `out_pixel_data` must
// be at least that large. No bounds checking is performed on x or y.
static inline void read_pixel_raw(const char* data, uint32_t x, uint32_t y, uint32_t width, uint8_t channel_count, stride_t stride, char* out_pixel_data)
{
    const size_t pixel_byte_size = (size_t)channel_count * (size_t)stride;

    // Widen before multiplying: y * width evaluated in 32 bits wraps on very large images.
    const size_t pixel_index = (size_t)y * width + x;

    memcpy(out_pixel_data, data + pixel_index * pixel_byte_size, pixel_byte_size);
}

// Copies the raw bytes of one pixel from `in_pixel_data` into a row-major
// image buffer at position (x, y).
// One pixel occupies `channel_count * stride` bytes, so `in_pixel_data` must
// provide at least that many. No bounds checking is performed on x or y.
static inline void write_pixel_raw(char* data, uint32_t x, uint32_t y, uint32_t width, uint8_t channel_count, stride_t stride, const char* in_pixel_data)
{
    const size_t pixel_byte_size = (size_t)channel_count * (size_t)stride;

    // Widen before multiplying: y * width evaluated in 32 bits wraps on very large images.
    const size_t pixel_index = (size_t)y * width + x;

    memcpy(data + pixel_index * pixel_byte_size, in_pixel_data, pixel_byte_size);
}

// Box-filters the 2x2 block whose top-left corner is (src_x, src_y) in the
// source level and writes the averaged pixel to `out_averaged_pixel`.
//
// Block texels that fall outside current_width x current_height are skipped and
// the average is taken over the texels that remain. Channels are accumulated as
// floats and converted back to the storage type selected by `stride`
// (integer types are clamped to their range and truncated).
// `out_averaged_pixel` must hold `channel_count * stride` bytes. For a stride
// that matches none of the handled cases the output is left untouched.
//
// The function needs no dynamic memory and cannot fail. Channel values are
// moved with memcpy rather than through casted pointers, so it makes no
// alignment or effective-type assumptions about the byte buffers.
static void average_pixels_box(const char* current_data, uint32_t current_width, uint32_t current_height,
                               uint32_t src_x, uint32_t src_y, uint8_t channel_count, stride_t stride, char* out_averaged_pixel)
{
    const size_t pixel_byte_size = (size_t)channel_count * (size_t)stride;

    // channel_count is a uint8_t, so a fixed accumulator is always large enough.
    // Only the entries that are actually used get cleared.
    float channel_sums[UINT8_MAX];
    memset(channel_sums, 0, sizeof(float) * channel_count);

    float pixel_count = 0.0f;

    // Loop through the 2x2 block in the current level
    for (uint32_t dy = 0; dy < 2; ++dy)
    {
        for (uint32_t dx = 0; dx < 2; ++dx)
        {
            const uint32_t current_x = src_x + dx;
            const uint32_t current_y = src_y + dy;

            // Odd-sized levels have a partial block on their last row/column.
            if (current_x >= current_width || current_y >= current_height)
            {
                continue;
            }

            const char* texel = current_data + ((size_t)current_y * current_width + current_x) * pixel_byte_size;

            for (uint8_t c = 0; c < channel_count; c++)
            {
                switch (stride)
                {
                    case UINT_8:
                        channel_sums[c] += (float)((const uint8_t*)texel)[c];
                        break;
                    case UINT_16:
                    {
                        uint16_t raw;
                        memcpy(&raw, texel + (size_t)c * sizeof(raw), sizeof(raw));
                        channel_sums[c] += (float)raw;
                        break;
                    }
                    case FLOAT_32:
                    {
                        float raw;
                        memcpy(&raw, texel + (size_t)c * sizeof(raw), sizeof(raw));
                        channel_sums[c] += raw;
                        break;
                    }
                    default:
                        break;
                }
            }

            pixel_count += 1.0f;
        }
    }

    if (pixel_count <= 0.0f)
    {
        // Not expected when (src_x, src_y) lies inside the level; zero the output as a safeguard.
        memset(out_averaged_pixel, 0, pixel_byte_size);
        return;
    }

    // Convert the averaged float values back to the storage type
    for (uint8_t c = 0; c < channel_count; c++)
    {
        const float average_value = channel_sums[c] / pixel_count;

        switch (stride)
        {
            case UINT_8:
                ((uint8_t*)out_averaged_pixel)[c] = (uint8_t)glm_clamp(average_value, 0.0f, 255.0f);
                break;
            case UINT_16:
            {
                const uint16_t packed = (uint16_t)glm_clamp(average_value, 0.0f, 65535.0f);
                memcpy(out_averaged_pixel + (size_t)c * sizeof(packed), &packed, sizeof(packed));
                break;
            }
            case FLOAT_32:
                memcpy(out_averaged_pixel + (size_t)c * sizeof(average_value), &average_value, sizeof(average_value));
                break;
            default:
                break;
        }
    }
}

// Fills `texture_data` with a mip chain derived from `raw_data`.
//
// Entry 0 points directly at `raw_data` (no copy is made). Every further entry
// is a freshly malloc'ed image with both dimensions halved (never below 1),
// produced by box-filtering the previous level. Generation ends once the level
// is 1x1, once `max_level` has been written, or when an allocation fails; in the
// latter case the remaining entries of `texture_data` are not written.
static void generate_mipmap(char* raw_data, mipmap_t* texture_data, uint32_t width, uint32_t height, uint8_t channel_count, uint8_t max_level, stride_t stride)
{
    // Level 0 is the decoded image itself
    texture_data[0] = (mipmap_t){.width = width, .height = height, .data = raw_data};

    // Scratch space for a single averaged texel
    uint32_t texel_bytes = channel_count * stride;
#if defined(__clang__) || defined(__GNUC__)
    char texel_scratch[texel_bytes];
#else
    char* texel_scratch = (char*)malloc(texel_bytes);
    if (texel_scratch == NULL)
    {
        return;
    }
#endif

    char* src_data = raw_data;
    uint32_t src_width = width;
    uint32_t src_height = height;

    // Keep halving while at least one dimension can still shrink
    for (int level = 1; (src_width > 1 || src_height > 1) && level <= max_level; level++)
    {
        uint32_t dst_width = max(1u, src_width / 2);
        uint32_t dst_height = max(1u, src_height / 2);

        size_t dst_bytes = (size_t)dst_width * dst_height * channel_count * stride;
        char* dst_data = (char*)malloc(dst_bytes);
        if (dst_data == NULL)
        {
            break;
        }

        for (uint32_t row = 0; row < dst_height; ++row)
        {
            for (uint32_t col = 0; col < dst_width; ++col)
            {
                // Destination texel (col, row) covers the source block starting at (2*col, 2*row)
                average_pixels_box(src_data, src_width, src_height,
                                   col * 2, row * 2, channel_count, stride, texel_scratch);

                write_pixel_raw(dst_data, col, row, dst_width, channel_count, stride, texel_scratch);
            }
        }

        texture_data[level] = (mipmap_t){.width = dst_width, .height = dst_height, .data = dst_data};

        // The level just produced becomes the source of the next one
        src_data = dst_data;
        src_width = dst_width;
        src_height = dst_height;
    }

#if !defined(__clang__) && !defined(__GNUC__)
    free(texel_scratch);
#endif
}

// Decodes the image at `filename` with stb_image, optionally builds its mip
// chain and appends the result to `textures`.
//
//   filename  path handed to stb_image; a copy is stored as the asset name
//   srgb      currently not used by this function
//   mipmap    when true, levels down to 1x1 are generated
//   stride    selects 8-bit, 16-bit or float decoding
//   textures  destination collection; grown when it is full
//
// Returns the handle of the new entry (its index in the collection), or
// invalid_texture_handle() when an argument is NULL, the collection cannot
// provide a free slot, the file cannot be decoded or memory runs out. Nothing
// is added to the collection on failure.
texture_handle_t texture_load(const char* filename, bool srgb, bool mipmap, stride_t stride, texture_collection_t* textures)
{
    // TODO: de-duplicate by filename. A linear strcmp scan over the collection was
    // disabled because it hurt performance; consider a hash map or similar structure.

    (void)srgb;

    if (filename == NULL || textures == NULL)
    {
        return invalid_texture_handle();
    }

    // Secure a free slot before decoding anything, so a failed grow costs nothing.
    if (textures->count >= textures->size)
    {
        // Doubling in 32 bits avoids the uint16_t wrap to 0, and a capacity of 0
        // must grow to 1 rather than stay at 0.
        uint32_t wanted = (textures->size == 0) ? 1u : (uint32_t)textures->size * 2u;
        if (wanted > UINT16_MAX)
        {
            wanted = UINT16_MAX;
        }

        texture_collection_resize(textures, (uint16_t)wanted);
    }

    // The resize reports nothing, so verify that there really is room now.
    if (textures->buffer == NULL || textures->count >= textures->size)
    {
        return invalid_texture_handle();
    }

    int width = 0;
    int height = 0;
    int channels = 0;
    char* raw_data = NULL;

    switch (stride)
    {
        case UINT_8:
            raw_data = (char*)stbi_load(filename, &width, &height, &channels, 0);
            break;
        case UINT_16:
            raw_data = (char*)stbi_load_16(filename, &width, &height, &channels, 0);
            break;
        case FLOAT_32:
            raw_data = (char*)stbi_loadf(filename, &width, &height, &channels, 0);
            break;
        default:
            break;
    }

    if (raw_data == NULL)
    {
        return invalid_texture_handle();
    }

    uint8_t max_mip_level = mipmap ? (uint8_t)log2f(fmaxf((float)width, (float)height)) : 0;
    mipmap_t* temp_texture_data = (mipmap_t*)calloc((size_t)max_mip_level + 1, sizeof(mipmap_t));
    if (temp_texture_data == NULL)
    {
        stbi_image_free(raw_data);
        return invalid_texture_handle();
    }

    // Levels that could not be allocated stay zeroed (NULL data) thanks to calloc.
    generate_mipmap(raw_data, temp_texture_data, (uint32_t)width, (uint32_t)height, (uint8_t)channels, max_mip_level, stride);

    texture_t texture = {0};

    texture.texel_size = (vec2s){1.0f / (float)width, 1.0f / (float)height};
    texture.width = (uint32_t)width;
    texture.height = (uint32_t)height;

    texture.channel_count = (uint8_t)channels;
    texture.max_mip = max_mip_level;
    texture.stride = stride;
    texture.data = temp_texture_data;

    texture.wrap_mode = WM_REPEAT;
    texture.filter_mode = FM_LINEAR;

    texture_handle_t entity = {.id = textures->count};

    textures->buffer[textures->count] = (texture_asset_t){.full_name = string_copy(filename), .texture = texture};
    textures->count++;

    return entity;
}

// Applies the texture wrap mode to `uv` in place.
//   WM_REPEAT  keeps only the fractional part, so the texture tiles in both
//              directions (-0.25 maps to 0.75, 1.25 maps to 0.25).
//   WM_CLAMP   clamps each component to [0, 1].
// Any other mode leaves `uv` unchanged.
static inline void warp_uv(wrap_mode_t mode, vec2s* uv)
{
    switch (mode)
    {
        case WM_REPEAT:
            // u - floor(u) tiles correctly for negative coordinates as well;
            // fmodf(fabsf(u), 1) would mirror them around the origin instead.
            uv->x -= floorf(uv->x);
            uv->y -= floorf(uv->y);
            break;
        case WM_CLAMP:
            *uv = glms_vec2_clamp(*uv, 0.0f, 1.0f);
            break;
        default:
            break;
    }
}

// Fetches the pixel at (x, y) from a row-major image buffer and converts up to
// four channels to floats.
//
// 8-bit and 16-bit channels are divided by 255 and 65535 respectively; float
// channels are returned as stored. Channels the image does not have keep the
// defaults (0, 0, 0, 1). The defaults are also returned when `x` lies outside
// the row. `y` is not checked.
static vec4s get_pixel_data_from_buffer(const char* data, uint32_t x, uint32_t y, uint32_t width, uint8_t channel_count, stride_t stride)
{
    vec4s out = {0.0f, 0.0f, 0.0f, 1.0f};

    // A column beyond the row would read into the next row or past the buffer.
    if (x >= width)
    {
        return out;
    }

    const size_t pixel_byte_size = (size_t)channel_count * (size_t)stride;

    // Widen before multiplying: y * width evaluated in 32 bits wraps on very large images.
    const char* texel = data + ((size_t)y * width + x) * pixel_byte_size;

    for (int c = 0; c < channel_count && c < 4; c++)
    {
        const char* channel = texel + (size_t)c * (size_t)stride;
        float value = 0.0f;

        // Multi-byte channels are loaded with memcpy so no alignment or
        // effective-type assumption is made about the byte buffer.
        switch (stride)
        {
            case UINT_8:
                value = (float)(*(const uint8_t*)channel) / 255.0f;
                break;
            case UINT_16:
            {
                uint16_t raw;
                memcpy(&raw, channel, sizeof(raw));
                value = (float)raw / 65535.0f;
                break;
            }
            case FLOAT_32:
                memcpy(&value, channel, sizeof(value));
                break;
            default:
                value = (c == 3) ? 1.0f : 0.0f;
                break;
        }

        out.raw[c] = value;
    }

    return out;
}

// Returns the texel of mip level `lod` that `uv` maps to (no interpolation).
//
// `lod` is limited to the texture's highest mip level and `uv` is limited to
// [0, 1] (NaN is treated as 0) before being scaled to texel coordinates.
// Returns opaque black (0, 0, 0, 1) when `texture` is NULL, has no mip data, or
// the selected level was never generated.
vec4s texture_get_pixel(const texture_t* texture, vec2s uv, uint8_t lod)
{
    if (texture == NULL || texture->data == NULL)
    {
        return (vec4s){0.0f, 0.0f, 0.0f, 1.0f};
    }

    const uint8_t mip_level = lod < texture->max_mip ? lod : texture->max_mip;
    const mipmap_t* mipmap = &texture->data[mip_level];
    if (mipmap->data == NULL)
    {
        return (vec4s){0.0f, 0.0f, 0.0f, 1.0f};
    }

    // Converting a negative, NaN or huge float to uint32_t is undefined, so
    // bring the coordinates into range first. fmaxf/fminf discard a NaN operand.
    const float u = fminf(fmaxf(uv.x, 0.0f), 1.0f);
    const float v = fminf(fmaxf(uv.y, 0.0f), 1.0f);

    uint32_t x = (uint32_t)floorf(u * (float)(mipmap->width - 1));
    uint32_t y = (uint32_t)floorf(v * (float)(mipmap->height - 1));

    x = x < mipmap->width ? x : mipmap->width - 1;
    y = y < mipmap->height ? y : mipmap->height - 1;

    return get_pixel_data_from_buffer(mipmap->data, x, y, mipmap->width, texture->channel_count, texture->stride);
}

// Calculate LOD based on Ray Cones
//
// Estimates the mip level for a ray hit from the width of the ray at the hit
// point, the angle between surface normal and view direction, and the ratio of
// the triangle's UV-space area to its geometric area.
//
// The result is not clamped and is negative when fewer than one texel is
// covered. 0 (the base level) is returned for degenerate input: a triangle
// with zero geometric area, or any combination that does not produce a finite
// LOD.
float texture_get_sample_lod(const texture_t* texture, const texture_sample_context_t* sample_context)
{
    // 1. Calculate the ray footprint on the surface
    // If we hit the surface at an angle, the footprint elongates. The cosine is
    // floored at 0.001 so grazing hits do not divide by (almost) zero.
    float cos_theta = fabsf(glms_vec3_dot(sample_context->normal, sample_context->view_direction));
    float surface_width = sample_context->ray_width / fmaxf(cos_theta, 0.001f);

    // 2. Estimate UV density (how much UV changes per unit of surface)
    // This is an approximation; ray differentials would be more accurate.
    float uv_area = fabsf((sample_context->uv1.x * sample_context->uv2.y) - (sample_context->uv1.y * sample_context->uv2.x));
    float geo_area = glms_vec3_norm(glms_vec3_cross(sample_context->edge1, sample_context->edge2));

    // Degenerate triangle: the density below would divide by zero.
    if (!(geo_area > 0.0f))
    {
        return 0.0f;
    }

    // Square root of the area ratio gives a linear scale factor
    float uv_density = sqrtf(uv_area / geo_area);

    // 3. Calculate texture footprint: how many texels does the ray cover?
    float texels_covered = surface_width * uv_density * fmaxf((float)texture->width, (float)texture->height);

    // 4. Convert to LOD
    // NOTE(review): texels_covered is already a linear measure, yet its log2 is
    // halved here. Confirm whether the 0.5 factor is an intentional bias.
    float lod = log2f(texels_covered) * 0.5f;

    // log2f(0) is -inf and NaN can propagate from the inputs; callers turn the
    // LOD into an integer mip index, so never hand them a non-finite value.
    return isfinite(lod) ? lod : 0.0f;
}

// Nearest-neighbour filtering: the single texel selected by texture_get_pixel
// is returned unchanged.
static vec4s nearest_filter(const texture_t* texture, vec2s uv, uint8_t lod)
{
    const vec4s texel = texture_get_pixel(texture, uv, lod);
    return texel;
}

// Bilinear filtering on mip level `lod` (limited to the texture's highest level).
//
// `uv` is scaled to texel space, the four texels surrounding that position are
// fetched (indices clamped to the level's bounds) and blended by the fractional
// part of the position. Returns opaque black (0, 0, 0, 1) when the selected
// level has no data.
static vec4s linear_filter(const texture_t* texture, vec2s uv, uint8_t lod)
{
    const uint8_t level = (uint8_t)glm_clamp((float)lod, 0.0f, (float)texture->max_mip);
    const mipmap_t* mip = &texture->data[level];

    if (mip->data == NULL)
    {
        return (vec4s){0.0f, 0.0f, 0.0f, 1.0f};
    }

    // Continuous position in texel space
    const float texel_x = uv.x * (float)(mip->width - 1);
    const float texel_y = uv.y * (float)(mip->height - 1);

    // Integer corners of the enclosing cell
    uint32_t left = (uint32_t)floorf(texel_x);
    uint32_t top = (uint32_t)floorf(texel_y);
    uint32_t right = left + 1;
    uint32_t bottom = top + 1;

    // Blend weights are taken before the corners get clamped
    const float weight_x = texel_x - (float)left;
    const float weight_y = texel_y - (float)top;

    const float last_column = (float)mip->width - 1.0f;
    const float last_row = (float)mip->height - 1.0f;

    left = (uint32_t)glm_clamp((float)left, 0.0f, last_column);
    right = (uint32_t)glm_clamp((float)right, 0.0f, last_column);
    top = (uint32_t)glm_clamp((float)top, 0.0f, last_row);
    bottom = (uint32_t)glm_clamp((float)bottom, 0.0f, last_row);

    const vec4s top_left = get_pixel_data_from_buffer(mip->data, left, top, mip->width, texture->channel_count, texture->stride);
    const vec4s top_right = get_pixel_data_from_buffer(mip->data, right, top, mip->width, texture->channel_count, texture->stride);
    const vec4s bottom_left = get_pixel_data_from_buffer(mip->data, left, bottom, mip->width, texture->channel_count, texture->stride);
    const vec4s bottom_right = get_pixel_data_from_buffer(mip->data, right, bottom, mip->width, texture->channel_count, texture->stride);

    // Horizontal blends first, then the vertical blend between the two rows
    const vec4s top_row = glms_vec4_lerp(top_left, top_right, weight_x);
    const vec4s bottom_row = glms_vec4_lerp(bottom_left, bottom_right, weight_x);

    return glms_vec4_lerp(top_row, bottom_row, weight_y);
}

// Samples `texture` at `uv` using the texture's own filter mode.
//
// `lod` is truncated to an integer mip level after being limited to
// [0, max_mip]; NaN selects level 0. Unknown filter modes yield opaque black
// (0, 0, 0, 1).
static inline vec4s filter_texture(const texture_t* texture, vec2s uv, float lod)
{
    // Converting a float below -1, above 255 or NaN to uint8_t is undefined.
    // Negative LODs are routine (magnified textures) and would typically wrap
    // to a high level number, i.e. the coarsest mip. Pick the level with
    // comparisons instead, which also makes NaN fall through to level 0.
    uint8_t mip_level = 0;
    if (lod >= (float)texture->max_mip)
    {
        mip_level = texture->max_mip;
    }
    else if (lod > 0.0f)
    {
        mip_level = (uint8_t)lod;
    }

    switch (texture->filter_mode)
    {
        case FM_NEAREST:
            return nearest_filter(texture, uv, mip_level);
        case FM_LINEAR:
            return linear_filter(texture, uv, mip_level);
        default:
            return (vec4s){0.0f, 0.0f, 0.0f, 1.0f};
    }
}

// Samples `texture` at `uv`, choosing the mip level automatically from
// `sample_context` (see texture_get_sample_lod).
// The wrap mode is applied to `uv` and the computed LOD is limited to
// [0, max_mip] before filtering, matching what texture_sample_lod does with an
// explicit LOD.
vec4s texture_sample(const texture_t* texture, const texture_sample_context_t* sample_context, vec2s uv)
{
    warp_uv(texture->wrap_mode, &uv);

    float lod = texture_get_sample_lod(texture, sample_context);

    // The estimate is negative under magnification and may be non-finite for
    // degenerate geometry. fmaxf/fminf discard a NaN operand, so the value
    // handed to the filter is always a valid level.
    lod = fminf(fmaxf(lod, 0.0f), (float)texture->max_mip);

    return filter_texture(texture, uv, lod);
}

// Samples `texture` at `uv` on an explicitly requested mip level.
// The requested LOD is limited to [0, max_mip] and the texture's wrap mode is
// applied to the coordinates before the filter runs.
vec4s texture_sample_lod(const texture_t* texture, vec2s uv, float lod)
{
    const float clamped_lod = glm_clamp(lod, 0.0f, texture->max_mip);

    vec2s wrapped_uv = uv;
    warp_uv(texture->wrap_mode, &wrapped_uv);

    return filter_texture(texture, wrapped_uv, clamped_lod);
}

// Releases all memory owned by `texture`: the decoded base image (through
// stbi_image_free), every generated mip level, and the array of level
// descriptors itself. `texture->data` is set to NULL afterwards, so calling the
// function again on the same texture is harmless. A NULL `texture` is ignored.
void texture_free(texture_t* texture)
{
    if (texture == NULL || texture->data == NULL)
    {
        return;
    }

    // Level 0 comes from stb_image, so it goes back through stb's deallocator
    stbi_image_free(texture->data[0].data);

    // A wider counter than uint8_t so that `<=` terminates even for max_mip == 255
    for (unsigned int level = 1; level <= texture->max_mip; level++)
    {
        free(texture->data[level].data); // levels that were never generated are NULL
    }

    // The descriptor array is heap-allocated too and was previously leaked
    free(texture->data);
    texture->data = NULL;
}