using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Diagnosers; using BenchmarkDotNet.Engines; using Misaki.HighPerformance.Image; using Misaki.HighPerformance.Jobs; using Misaki.HighPerformance.Mathematics; using Misaki.HighPerformance.Mathematics.SPMD; using SkiaSharp; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using static Misaki.HighPerformance.Mathematics.math; //using TFloat = Misaki.HighPerformance.Mathematics.SPMD.WideLane; //using TInt = Misaki.HighPerformance.Mathematics.SPMD.WideLane; namespace Misaki.HighPerformance.Test.Benchmark; internal unsafe struct MipLevel { public float* data; public uint width; public uint height; public int offset; public float roughness; } //internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor where TFloat : unmanaged, ISPMDLane where TInt : unmanaged, ISPMDLane { public const uint SAMPLE_COUNT = 1024u; public ImageResultFloat image; public MipLevel* pMipLevels; public float* radicalInverse_VdCLut; public int numMipLevels; [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] public static float RadicalInverse_VdC(uint bits) { bits = (bits << 16) | (bits >> 16); bits = ((bits & 0x55555555u) << 1) | ((bits & 0xAAAAAAAAu) >> 1); bits = ((bits & 0x33333333u) << 2) | ((bits & 0xCCCCCCCCu) >> 2); bits = ((bits & 0x0F0F0F0Fu) << 4) | ((bits & 0xF0F0F0F0u) >> 4); bits = ((bits & 0x00FF00FFu) << 8) | ((bits & 0xFF00FF00u) >> 8); return bits * 2.3283064365386963e-10f; // bits / 0x100000000 } [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] private static Vector2 Hammersley(TFloat i, uint N, float* lut) { var x = i / N; var y = TFloat.Load(lut + (int)i[0]); return MathV.Create(x, y); } // --- GGX Importance Sampling --- [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] private static Vector3 ImportanceSampleGGX(Vector2 Xi, Vector3 N, float roughness) { var a = roughness * roughness; // Disney/Epic remap roughness for better visual linearity var phi = 2.0f * PI * Xi.x; var cosTheta = TFloat.Sqrt((1.0f - Xi.y) / (1.0f + (a * a - 1.0f) * Xi.y)); var sinTheta = TFloat.Sqrt(1.0f - cosTheta * cosTheta); // Spherical to Cartesian coordinates (Halfway vector) TFloat.SinCos(phi, out var sinPhi, out var cosPhi); var H = MathV.Create(cosPhi * sinTheta, sinPhi * sinTheta, cosTheta); // Tangent space to World space var mask = TFloat.Abs(N.z) < 0.999f; var up = MathV.Select(mask, MathV.Create(0.0f, 0.0f, 1.0f), MathV.Create(1.0f, 0.0f, 0.0f)); var tangent = MathV.Normalize(MathV.Cross(up, N)); var bitangent = MathV.Cross(N, tangent); var sampleVec = (tangent * H.x) + (bitangent * H.y) + (N * H.z); return MathV.Normalize(sampleVec); } // --- Image Sampling Helpers --- // Maps a 3D direction vector to 2D equirectangular UVs [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] private static Vector2 DirToEquirectangularUV(Vector3 dir) { var u = TFloat.Atan2(dir.z, dir.x); var v = TFloat.Asin(dir.y); u = u / (2.0f * PI) + 0.5f; v = v / PI + 0.5f; return MathV.Create(u, v); } // Samples the source HDR image using bilinear interpolation (simplified to nearest neighbor for brevity here) // Do not inline this function to avoid register pressure and code bloat in the main sampling loop, as this is a relatively heavy operation and we want to keep it separate for better instruction cache usage. [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)] private static Vector3 SampleEquirectangularMap(float* img, int w, int h, Vector3 dir) { var uv = DirToEquirectangularUV(dir); // Nearest neighbor pixel coordinates var px = (uv.x * (w - 1.0f)).Cast(); var py = (uv.y * (h - 1.0f)).Cast(); // Clamp px = TInt.Clamp(px, TInt.Zero, w - 1); py = TInt.Clamp(py, TInt.Zero, h - 1); // Assuming float RGB array format var idx = (py * w + px) * 3; return MathV.GatherVector3(img, idx.GetUnsafePtr(), 1); } [MethodImpl(MethodImplOptions.AggressiveOptimization)] public void Execute(int loopIndex, ref readonly JobExecutionContext ctx) { var m = 0; while (m < numMipLevels - 1 && loopIndex >= pMipLevels[m + 1].offset) { m++; } var span = new ReadOnlySpan(pMipLevels, numMipLevels); var pLevel = &pMipLevels[m]; var w = (int)pLevel->width; var h = (int)pLevel->height; var pData = pLevel->data; var local_i = loopIndex - pLevel->offset; var x = local_i % w; var y = local_i / w; var u = (float)x / (w - 1); var v = (float)y / (h - 1); var phi = (u - 0.5f) * 2.0f * PI; var theta = (v - 0.5f) * PI; sincos(theta, out var sinTheta, out var cosTheta); sincos(phi, out var sinPhi, out var cosPhi); var N = float3(cosTheta * cosPhi, sinTheta, cosTheta * sinPhi); N = normalize(N); // For split-sum, we assume View and Reflection directions equal the Normal var V = N; var R = N; var vN = MathV.Create( TFloat.Create(N.x), TFloat.Create(N.y), TFloat.Create(N.z) ); var vV = MathV.Create( TFloat.Create(V.x), TFloat.Create(V.y), TFloat.Create(V.z) ); var vPrefilteredColor = Vector3.Zero; var vTotalWeight = TFloat.Zero; // 3. Monte Carlo Integration Loop var dynamicSampleCount = (uint)max(1.0f, SAMPLE_COUNT * pLevel->roughness); var vDynamicSampleCount = TFloat.Create(dynamicSampleCount); var vLumaVector = MathV.Create(0.2126f, 0.7152f, 0.0722f); for (var i = 0u; i < dynamicSampleCount; i += (uint)TFloat.LaneWidth) { var laneIndices = TFloat.Sequence(i, 1.0f); var validLaneMask = laneIndices < vDynamicSampleCount; // Generate a Hammersley random sequence point var Xi = Hammersley(laneIndices, dynamicSampleCount, radicalInverse_VdCLut); // Get the halfway vector based on GGX NDF var H = ImportanceSampleGGX(Xi, vN, pLevel->roughness); // Calculate Light direction var L = MathV.Reflect(-vV, H); L = MathV.Normalize(L); var NdotL = TFloat.Max(MathV.Dot(vN, L), TFloat.Zero); var sampleColor = SampleEquirectangularMap(image.Data, (int)image.Width, (int)image.Height, L); NdotL &= validLaneMask; // The Karis Average Weight: 1 / (1 + luma) // A normal sky pixel (luma 1.0) gets a weight of 0.5. // A sun pixel (luma 1000.0) gets a tiny weight of ~0.001, naturally suppressing it. // This introduce bias, but significantly reduces fireflies without needing solid angle sampling or cdf inversion. // And since this is a mip generation step, a little bias is acceptable for much better performance and stability. var luma = MathV.Dot(sampleColor, vLumaVector); var fireflyWeight = TFloat.One / (TFloat.One + luma); var finalWeight = NdotL * fireflyWeight; vPrefilteredColor += sampleColor * finalWeight; vTotalWeight += finalWeight; } var totalWeight = 0.0f; var prefilteredColor = float3(0, 0, 0); for (var i = 0; i < TFloat.LaneWidth; i++) { prefilteredColor.x += vPrefilteredColor.x[i]; prefilteredColor.y += vPrefilteredColor.y[i]; prefilteredColor.z += vPrefilteredColor.z[i]; totalWeight += vTotalWeight[i]; } // 4. Average the result if (totalWeight > 0.0f) { prefilteredColor *= 1.0f / totalWeight; } // Write to output mip array var out_idx = (y * w + x) * 3; pData[out_idx] = prefilteredColor.x; pData[out_idx + 1] = prefilteredColor.y; pData[out_idx + 2] = prefilteredColor.z; } } internal unsafe struct GGXMipGenerationJob : IJobParallelFor { public const uint SAMPLE_COUNT = 1024u; public ImageResultFloat image; public MipLevel* pMipLevels; public float* radicalInverse_VdCLut; public int numMipLevels; [MethodImpl(MethodImplOptions.AggressiveInlining)] public static float RadicalInverse_VdC(uint bits) { bits = (bits << 16) | (bits >> 16); bits = ((bits & 0x55555555u) << 1) | ((bits & 0xAAAAAAAAu) >> 1); bits = ((bits & 0x33333333u) << 2) | ((bits & 0xCCCCCCCCu) >> 2); bits = ((bits & 0x0F0F0F0Fu) << 4) | ((bits & 0xF0F0F0F0u) >> 4); bits = ((bits & 0x00FF00FFu) << 8) | ((bits & 0xFF00FF00u) >> 8); return bits * 2.3283064365386963e-10f; // bits / 0x100000000 } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static float2 Hammersley(uint i, uint N, float* lut) { return float2((float)i / N, lut[i]); } // --- GGX Importance Sampling --- [MethodImpl(MethodImplOptions.AggressiveInlining)] private static float3 ImportanceSampleGGX(float2 Xi, float3 N, float roughness) { var a = roughness * roughness; // Disney/Epic remap roughness for better visual linearity var phi = 2.0f * PI * Xi.x; var cosTheta = sqrt((1.0f - Xi.y) / (1.0f + (a * a - 1.0f) * Xi.y)); var sinTheta = sqrt(1.0f - cosTheta * cosTheta); // Spherical to Cartesian coordinates (Halfway vector) sincos(phi, out var sinPhi, out var cosPhi); var H = float3(cosPhi * sinTheta, sinPhi * sinTheta, cosTheta); // Tangent space to World space var up = abs(N.z) < 0.999f ? float3(0.0f, 0.0f, 1.0f) : float3(1.0f, 0.0f, 0.0f); var tangent = normalize(cross(up, N)); var bitangent = cross(N, tangent); var sampleVec = (tangent * H.x) + (bitangent * H.y) + (N * H.z); return normalize(sampleVec); } // --- Image Sampling Helpers --- // Maps a 3D direction vector to 2D equirectangular UVs [MethodImpl(MethodImplOptions.AggressiveInlining)] private static float2 DirToEquirectangularUV(float3 dir) { var uv = float2(atan2(dir.z, dir.x), asin(dir.y)); uv.x = uv.x / (2.0f * PI) + 0.5f; uv.y = uv.y / PI + 0.5f; return uv; } // Samples the source HDR image using bilinear interpolation (simplified to nearest neighbor for brevity here) [MethodImpl(MethodImplOptions.AggressiveInlining)] private static float3 SampleEquirectangularMap(float* img, int w, int h, float3 dir) { var uv = DirToEquirectangularUV(dir); // Nearest neighbor pixel coordinates var px = (int)(uv.x * (w - 1)); var py = (int)(uv.y * (h - 1)); // Clamp px = clamp(px, 0, w - 1); py = clamp(py, 0, h - 1); // Assuming float RGB array format var idx = (py * w + px) * 3; return float3(img[idx], img[idx + 1], img[idx + 2]); } public void Execute(int loopIndex, ref readonly JobExecutionContext ctx) { var m = 0; while (m < numMipLevels - 1 && loopIndex >= pMipLevels[m + 1].offset) { m++; } var pLevel = &pMipLevels[m]; var w = (int)pLevel->width; var h = (int)pLevel->height; var pData = pLevel->data; var local_i = loopIndex - pLevel->offset; var x = local_i % w; var y = local_i / w; var u = (float)x / (w - 1); var v = (float)y / (h - 1); var phi = (u - 0.5f) * 2.0f * PI; var theta = (v - 0.5f) * PI; sincos(theta, out var sinTheta, out var cosTheta); sincos(phi, out var sinPhi, out var cosPhi); var N = float3(cosTheta * cosPhi, sinTheta, cosTheta * sinPhi); N = normalize(N); // For split-sum, we assume View and Reflection directions equal the Normal var V = N; var R = N; var prefilteredColor = float3(0, 0, 0); var totalWeight = 0.0f; // 3. Monte Carlo Integration Loop var dynamicSampleCount = (uint)max(1.0f, SAMPLE_COUNT * pLevel->roughness); for (var i = 0u; i < dynamicSampleCount; i++) { // Generate a Hammersley random sequence point var Xi = Hammersley(i, dynamicSampleCount, radicalInverse_VdCLut); // Get the halfway vector based on GGX NDF var H = ImportanceSampleGGX(Xi, N, pLevel->roughness); // Calculate Light direction var L = reflect(-V, H); L = normalize(L); var NdotL = max(dot(N, L), 0.0f); // If light is above the horizon if (NdotL > 0.0f) { var sampleColor = SampleEquirectangularMap(image.Data, (int)image.Width, (int)image.Height, L); prefilteredColor += sampleColor * NdotL; totalWeight += NdotL; } } // 4. Average the result if (totalWeight > 0.0f) { prefilteredColor *= 1.0f / totalWeight; } // Write to output mip array var out_idx = (y * w + x) * 3; pData[out_idx] = prefilteredColor.x; pData[out_idx + 1] = prefilteredColor.y; pData[out_idx + 2] = prefilteredColor.z; } } [SimpleJob(RunStrategy.ColdStart, launchCount: 1, warmupCount: 0, iterationCount: 1, invocationCount: 1, id: "QuickRun")] public unsafe class GGXMipGenerationBenchmark { private ImageResultFloat _image; private int _mipLevels; private int _totalPixel; private float** _pResult; private MipLevel* _pMipLevels; private float* _radicalInverse_VdCLut; private JobScheduler _jobScheduler = null!; [GlobalSetup] public void Setup() { //const string imagePath = "F:\\c\\SimpleRayTracer\\native\\assets\\hdri\\golden_gate_hills_1k.hdr"; const string imagePath = "C:\\Users\\Misaki\\Downloads\\grasslands_sunset_4k.hdr"; using var stream = new FileStream(imagePath, FileMode.Open, FileAccess.Read); _image = ImageResultFloat.FromStream(stream, ColorComponents.RGB); _mipLevels = (int)Math.Floor(Math.Log2(Math.Max(_image.Width, _image.Height))) + 1; _pResult = (float**)NativeMemory.Alloc((nuint)(_mipLevels * sizeof(float*))); _pMipLevels = (MipLevel*)NativeMemory.Alloc((nuint)(_mipLevels * sizeof(MipLevel))); uint w, h; for (var i = 0; i < _mipLevels; i++) { w = Math.Max(1, _image.Width >> i); h = Math.Max(1, _image.Height >> i); var sizeInBytes = (nuint)(w * h * 3 * sizeof(float)); _pResult[i] = (float*)NativeMemory.Alloc(sizeInBytes); _pMipLevels[i] = new MipLevel { width = w, height = h, offset = _totalPixel, data = _pResult[i], roughness = (float)i / (_mipLevels - 1) // Linear roughness from 0 to 1 across mip levels }; _totalPixel += (int)(w * h); } var desc = new JobSchedulerDesc { DependencyChainCapacity = 16, ThreadCount = Environment.ProcessorCount - 1, ThreadPriority = ThreadPriority.Normal, }; _radicalInverse_VdCLut = (float*)NativeMemory.Alloc(GGXMipGenerationJob.SAMPLE_COUNT * sizeof(float)); for (var i = 0u; i < GGXMipGenerationJob.SAMPLE_COUNT; i++) { _radicalInverse_VdCLut[i] = GGXMipGenerationJob.RadicalInverse_VdC(i); } _jobScheduler = new JobScheduler(in desc); } public void DumpMipLevelToPng(float* pData, int width, int height, string filePath) { // Create a standard 32-bit RGBA bitmap using var bitmap = new SKBitmap(width, height, SKColorType.Rgba8888, SKAlphaType.Opaque); // Get a pointer to the SkiaSharp pixel buffer var pPixels = (byte*)bitmap.GetPixels(); for (var y = 0; y < height; y++) { for (var x = 0; x < width; x++) { // Your data is tightly packed floats: R, G, B var inIdx = (y * width + x) * 3; var r = pData[inIdx]; var g = pData[inIdx + 1]; var b = pData[inIdx + 2]; // Basic Tone Mapping (Exposure + Gamma Correction) so we can see HDR values on a normal screen // Gamma 2.2 = roughly pow(color, 1.0/2.2) r = MathF.Pow(MathF.Max(0, r), 1.0f / 2.2f); g = MathF.Pow(MathF.Max(0, g), 1.0f / 2.2f); b = MathF.Pow(MathF.Max(0, b), 1.0f / 2.2f); // Convert 0.0-1.0 to 0-255 byte var rByte = (byte)Math.Clamp(r * 255.0f, 0, 255); var gByte = (byte)Math.Clamp(g * 255.0f, 0, 255); var bByte = (byte)Math.Clamp(b * 255.0f, 0, 255); // Write to Skia's buffer (RGBA) var outIdx = (y * width + x) * 4; pPixels[outIdx] = rByte; pPixels[outIdx + 1] = gByte; pPixels[outIdx + 2] = bByte; pPixels[outIdx + 3] = 255; // Alpha } } // Save out the preview using var data = bitmap.Encode(SKEncodedImageFormat.Png, 100); using var stream = File.OpenWrite(filePath); data.SaveTo(stream); } [GlobalCleanup] public void Cleanup() { #if true for (var i = 0; i < _mipLevels; i++) { DumpMipLevelToPng(_pResult[i], (int)_pMipLevels[i].width, (int)_pMipLevels[i].height, $"C:\\Users\\Misaki\\Downloads\\Im\\mip_level_{i}.png"); } #endif _image.Dispose(); for (var i = 0; i < _mipLevels; i++) { NativeMemory.Free(_pResult[i]); } NativeMemory.Free(_pResult); NativeMemory.Free(_pMipLevels); NativeMemory.Free(_radicalInverse_VdCLut); _jobScheduler.Dispose(); } [Benchmark(Baseline = true)] public void JobGGX() { JobHandle handle; if (WideLane.IsSupported) { var job = new GGXMipGenerationJobSPMD, WideLane> { image = _image, pMipLevels = _pMipLevels, numMipLevels = _mipLevels, radicalInverse_VdCLut = _radicalInverse_VdCLut }; handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64); } else { var job = new GGXMipGenerationJobSPMD, ScalarLane> { image = _image, pMipLevels = _pMipLevels, numMipLevels = _mipLevels, radicalInverse_VdCLut = _radicalInverse_VdCLut }; handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64); } _jobScheduler.Wait(handle); } //[Benchmark] public void ParallelGGX() { var job = new GGXMipGenerationJob { image = _image, pMipLevels = _pMipLevels, numMipLevels = _mipLevels, radicalInverse_VdCLut = _radicalInverse_VdCLut }; Parallel.For(0, _totalPixel, new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount - 1 }, i => { var localJob = job; var ctx = new JobExecutionContext(); localJob.Execute(i, in ctx); }); } [Benchmark] public void SingleThreadGGX() { var job = new GGXMipGenerationJob { image = _image, pMipLevels = _pMipLevels, numMipLevels = _mipLevels, radicalInverse_VdCLut = _radicalInverse_VdCLut }; var ctx = new JobExecutionContext(); job.Run(_totalPixel, in ctx); } }