Files
Misaki.HighPerformance/Misaki.HighPerformance.Test/Benchmark/GGXMipGenerationBenchmark.cs
Misaki 90461cd0ca Add SPMD lane reductions, gather, and SinCos API changes
- Added MaskLoad, Gather, and reduction methods (ReduceAdd, ReduceMax, ReduceMin) to ISPMDLane<TSelf, TNumber> with XML docs
- Changed SinCos to use out parameters instead of tuple return
- Implemented reductions in ScalarLane and WideLane (loop-based, TODO: SIMD)
- Added GetUnsafePtr to ISPMDLane
- Extended MathV to support Sin, Cos, SinCos, Tan, Asin, Acos, Atan, Atan2 for Vector2/3/4
- Improved WideLane.Sequence to use best vector type
- Updated GGX mip generation for new SinCos signature
- Bumped version to 1.3.2
- Enabled PNG dumping in GGX benchmark
2026-04-29 13:26:02 +09:00

580 lines
20 KiB
C#

using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Engines;
using Misaki.HighPerformance.Image;
using Misaki.HighPerformance.Jobs;
using Misaki.HighPerformance.Mathematics;
using Misaki.HighPerformance.Mathematics.SPMD;
using SkiaSharp;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using static Misaki.HighPerformance.Mathematics.math;
//using TFloat = Misaki.HighPerformance.Mathematics.SPMD.WideLane<float>;
//using TInt = Misaki.HighPerformance.Mathematics.SPMD.WideLane<int>;
namespace Misaki.HighPerformance.Test.Benchmark;
internal unsafe struct MipLevel
{
public float* data;
public uint width;
public uint height;
public int offset;
public float roughness;
}
//internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor
internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
where TFloat : unmanaged, ISPMDLane<TFloat, float>
where TInt : unmanaged, ISPMDLane<TInt, int>
{
public const uint SAMPLE_COUNT = 1024u;
public ImageResultFloat image;
public MipLevel* pMipLevels;
public float* radicalInverse_VdCLut;
public int numMipLevels;
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
public static float RadicalInverse_VdC(uint bits)
{
bits = (bits << 16) | (bits >> 16);
bits = ((bits & 0x55555555u) << 1) | ((bits & 0xAAAAAAAAu) >> 1);
bits = ((bits & 0x33333333u) << 2) | ((bits & 0xCCCCCCCCu) >> 2);
bits = ((bits & 0x0F0F0F0Fu) << 4) | ((bits & 0xF0F0F0F0u) >> 4);
bits = ((bits & 0x00FF00FFu) << 8) | ((bits & 0xFF00FF00u) >> 8);
return bits * 2.3283064365386963e-10f; // bits / 0x100000000
}
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector2<TFloat, float> Hammersley(TFloat i, uint N, float* lut)
{
var x = i / N;
var y = TFloat.Load(lut + (int)i[0]);
return MathV.Create<TFloat, float>(x, y);
}
// --- GGX Importance Sampling ---
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector3<TFloat, float> ImportanceSampleGGX(Vector2<TFloat, float> Xi, Vector3<TFloat, float> N, float roughness)
{
var a = roughness * roughness; // Disney/Epic remap roughness for better visual linearity
var phi = 2.0f * PI * Xi.x;
var cosTheta = TFloat.Sqrt((1.0f - Xi.y) / (1.0f + (a * a - 1.0f) * Xi.y));
var sinTheta = TFloat.Sqrt(1.0f - cosTheta * cosTheta);
// Spherical to Cartesian coordinates (Halfway vector)
TFloat.SinCos(phi, out var sinPhi, out var cosPhi);
var H = MathV.Create<TFloat, float>(cosPhi * sinTheta, sinPhi * sinTheta, cosTheta);
// Tangent space to World space
var mask = TFloat.Abs(N.z) < 0.999f;
var up = MathV.Select(mask, MathV.Create<TFloat, float>(0.0f, 0.0f, 1.0f), MathV.Create<TFloat, float>(1.0f, 0.0f, 0.0f));
var tangent = MathV.Normalize(MathV.Cross(up, N));
var bitangent = MathV.Cross(N, tangent);
var sampleVec = (tangent * H.x) + (bitangent * H.y) + (N * H.z);
return MathV.Normalize(sampleVec);
}
// --- Image Sampling Helpers ---
// Maps a 3D direction vector to 2D equirectangular UVs
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector2<TFloat, float> DirToEquirectangularUV(Vector3<TFloat, float> dir)
{
var u = TFloat.Atan2(dir.z, dir.x);
var v = TFloat.Asin(dir.y);
u = u / (2.0f * PI) + 0.5f;
v = v / PI + 0.5f;
return MathV.Create<TFloat, float>(u, v);
}
// Samples the source HDR image using bilinear interpolation (simplified to nearest neighbor for brevity here)
// Do not inline this function to avoid register pressure and code bloat in the main sampling loop, as this is a relatively heavy operation and we want to keep it separate for better instruction cache usage.
[MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector3<TFloat, float> SampleEquirectangularMap(float* img, int w, int h, Vector3<TFloat, float> dir)
{
var uv = DirToEquirectangularUV(dir);
// Nearest neighbor pixel coordinates
var px = (uv.x * (w - 1.0f)).Cast<TInt, int>();
var py = (uv.y * (h - 1.0f)).Cast<TInt, int>();
// Clamp
px = TInt.Clamp(px, TInt.Zero, w - 1);
py = TInt.Clamp(py, TInt.Zero, h - 1);
// Assuming float RGB array format
var idx = (py * w + px) * 3;
return MathV.GatherVector3<TFloat, float>(img, idx.GetUnsafePtr(), 1);
}
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
public void Execute(int loopIndex, ref readonly JobExecutionContext ctx)
{
var m = 0;
while (m < numMipLevels - 1 && loopIndex >= pMipLevels[m + 1].offset)
{
m++;
}
var span = new ReadOnlySpan<MipLevel>(pMipLevels, numMipLevels);
var pLevel = &pMipLevels[m];
var w = (int)pLevel->width;
var h = (int)pLevel->height;
var pData = pLevel->data;
var local_i = loopIndex - pLevel->offset;
var x = local_i % w;
var y = local_i / w;
var u = (float)x / (w - 1);
var v = (float)y / (h - 1);
var phi = (u - 0.5f) * 2.0f * PI;
var theta = (v - 0.5f) * PI;
sincos(theta, out var sinTheta, out var cosTheta);
sincos(phi, out var sinPhi, out var cosPhi);
var N = float3(cosTheta * cosPhi, sinTheta, cosTheta * sinPhi);
N = normalize(N);
// For split-sum, we assume View and Reflection directions equal the Normal
var V = N;
var R = N;
var vN = MathV.Create<TFloat, float>(
TFloat.Create(N.x),
TFloat.Create(N.y),
TFloat.Create(N.z)
);
var vV = MathV.Create<TFloat, float>(
TFloat.Create(V.x),
TFloat.Create(V.y),
TFloat.Create(V.z)
);
var vPrefilteredColor = Vector3<TFloat, float>.Zero;
var vTotalWeight = TFloat.Zero;
// 3. Monte Carlo Integration Loop
var dynamicSampleCount = (uint)max(1.0f, SAMPLE_COUNT * pLevel->roughness);
var vDynamicSampleCount = TFloat.Create(dynamicSampleCount);
var vLumaVector = MathV.Create<TFloat, float>(0.2126f, 0.7152f, 0.0722f);
for (var i = 0u; i < dynamicSampleCount; i += (uint)TFloat.LaneWidth)
{
var laneIndices = TFloat.Sequence(i, 1.0f);
var validLaneMask = laneIndices < vDynamicSampleCount;
// Generate a Hammersley random sequence point
var Xi = Hammersley(laneIndices, dynamicSampleCount, radicalInverse_VdCLut);
// Get the halfway vector based on GGX NDF
var H = ImportanceSampleGGX(Xi, vN, pLevel->roughness);
// Calculate Light direction
var L = MathV.Reflect(-vV, H);
L = MathV.Normalize(L);
var NdotL = TFloat.Max(MathV.Dot(vN, L), TFloat.Zero);
var sampleColor = SampleEquirectangularMap(image.Data, (int)image.Width, (int)image.Height, L);
NdotL &= validLaneMask;
// The Karis Average Weight: 1 / (1 + luma)
// A normal sky pixel (luma 1.0) gets a weight of 0.5.
// A sun pixel (luma 1000.0) gets a tiny weight of ~0.001, naturally suppressing it.
// This introduce bias, but significantly reduces fireflies without needing solid angle sampling or cdf inversion.
// And since this is a mip generation step, a little bias is acceptable for much better performance and stability.
var luma = MathV.Dot(sampleColor, vLumaVector);
var fireflyWeight = TFloat.One / (TFloat.One + luma);
var finalWeight = NdotL * fireflyWeight;
vPrefilteredColor += sampleColor * finalWeight;
vTotalWeight += finalWeight;
}
var totalWeight = 0.0f;
var prefilteredColor = float3(0, 0, 0);
for (var i = 0; i < TFloat.LaneWidth; i++)
{
prefilteredColor.x += vPrefilteredColor.x[i];
prefilteredColor.y += vPrefilteredColor.y[i];
prefilteredColor.z += vPrefilteredColor.z[i];
totalWeight += vTotalWeight[i];
}
// 4. Average the result
if (totalWeight > 0.0f)
{
prefilteredColor *= 1.0f / totalWeight;
}
// Write to output mip array
var out_idx = (y * w + x) * 3;
pData[out_idx] = prefilteredColor.x;
pData[out_idx + 1] = prefilteredColor.y;
pData[out_idx + 2] = prefilteredColor.z;
}
}
internal unsafe struct GGXMipGenerationJob : IJobParallelFor
{
public const uint SAMPLE_COUNT = 1024u;
public ImageResultFloat image;
public MipLevel* pMipLevels;
public float* radicalInverse_VdCLut;
public int numMipLevels;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static float RadicalInverse_VdC(uint bits)
{
bits = (bits << 16) | (bits >> 16);
bits = ((bits & 0x55555555u) << 1) | ((bits & 0xAAAAAAAAu) >> 1);
bits = ((bits & 0x33333333u) << 2) | ((bits & 0xCCCCCCCCu) >> 2);
bits = ((bits & 0x0F0F0F0Fu) << 4) | ((bits & 0xF0F0F0F0u) >> 4);
bits = ((bits & 0x00FF00FFu) << 8) | ((bits & 0xFF00FF00u) >> 8);
return bits * 2.3283064365386963e-10f; // bits / 0x100000000
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static float2 Hammersley(uint i, uint N, float* lut)
{
return float2((float)i / N, lut[i]);
}
// --- GGX Importance Sampling ---
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static float3 ImportanceSampleGGX(float2 Xi, float3 N, float roughness)
{
var a = roughness * roughness; // Disney/Epic remap roughness for better visual linearity
var phi = 2.0f * PI * Xi.x;
var cosTheta = sqrt((1.0f - Xi.y) / (1.0f + (a * a - 1.0f) * Xi.y));
var sinTheta = sqrt(1.0f - cosTheta * cosTheta);
// Spherical to Cartesian coordinates (Halfway vector)
sincos(phi, out var sinPhi, out var cosPhi);
var H = float3(cosPhi * sinTheta, sinPhi * sinTheta, cosTheta);
// Tangent space to World space
var up = abs(N.z) < 0.999f ? float3(0.0f, 0.0f, 1.0f) : float3(1.0f, 0.0f, 0.0f);
var tangent = normalize(cross(up, N));
var bitangent = cross(N, tangent);
var sampleVec = (tangent * H.x) + (bitangent * H.y) + (N * H.z);
return normalize(sampleVec);
}
// --- Image Sampling Helpers ---
// Maps a 3D direction vector to 2D equirectangular UVs
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static float2 DirToEquirectangularUV(float3 dir)
{
var uv = float2(atan2(dir.z, dir.x), asin(dir.y));
uv.x = uv.x / (2.0f * PI) + 0.5f;
uv.y = uv.y / PI + 0.5f;
return uv;
}
// Samples the source HDR image using bilinear interpolation (simplified to nearest neighbor for brevity here)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static float3 SampleEquirectangularMap(float* img, int w, int h, float3 dir)
{
var uv = DirToEquirectangularUV(dir);
// Nearest neighbor pixel coordinates
var px = (int)(uv.x * (w - 1));
var py = (int)(uv.y * (h - 1));
// Clamp
px = clamp(px, 0, w - 1);
py = clamp(py, 0, h - 1);
// Assuming float RGB array format
var idx = (py * w + px) * 3;
return float3(img[idx], img[idx + 1], img[idx + 2]);
}
public void Execute(int loopIndex, ref readonly JobExecutionContext ctx)
{
var m = 0;
while (m < numMipLevels - 1 && loopIndex >= pMipLevels[m + 1].offset)
{
m++;
}
var pLevel = &pMipLevels[m];
var w = (int)pLevel->width;
var h = (int)pLevel->height;
var pData = pLevel->data;
var local_i = loopIndex - pLevel->offset;
var x = local_i % w;
var y = local_i / w;
var u = (float)x / (w - 1);
var v = (float)y / (h - 1);
var phi = (u - 0.5f) * 2.0f * PI;
var theta = (v - 0.5f) * PI;
sincos(theta, out var sinTheta, out var cosTheta);
sincos(phi, out var sinPhi, out var cosPhi);
var N = float3(cosTheta * cosPhi, sinTheta, cosTheta * sinPhi);
N = normalize(N);
// For split-sum, we assume View and Reflection directions equal the Normal
var V = N;
var R = N;
var prefilteredColor = float3(0, 0, 0);
var totalWeight = 0.0f;
// 3. Monte Carlo Integration Loop
var dynamicSampleCount = (uint)max(1.0f, SAMPLE_COUNT * pLevel->roughness);
for (var i = 0u; i < dynamicSampleCount; i++)
{
// Generate a Hammersley random sequence point
var Xi = Hammersley(i, dynamicSampleCount, radicalInverse_VdCLut);
// Get the halfway vector based on GGX NDF
var H = ImportanceSampleGGX(Xi, N, pLevel->roughness);
// Calculate Light direction
var L = reflect(-V, H);
L = normalize(L);
var NdotL = max(dot(N, L), 0.0f);
// If light is above the horizon
if (NdotL > 0.0f)
{
var sampleColor = SampleEquirectangularMap(image.Data, (int)image.Width, (int)image.Height, L);
prefilteredColor += sampleColor * NdotL;
totalWeight += NdotL;
}
}
// 4. Average the result
if (totalWeight > 0.0f)
{
prefilteredColor *= 1.0f / totalWeight;
}
// Write to output mip array
var out_idx = (y * w + x) * 3;
pData[out_idx] = prefilteredColor.x;
pData[out_idx + 1] = prefilteredColor.y;
pData[out_idx + 2] = prefilteredColor.z;
}
}
[SimpleJob(RunStrategy.ColdStart, launchCount: 1, warmupCount: 0, iterationCount: 1, invocationCount: 1, id: "QuickRun")]
public unsafe class GGXMipGenerationBenchmark
{
private ImageResultFloat _image;
private int _mipLevels;
private int _totalPixel;
private float** _pResult;
private MipLevel* _pMipLevels;
private float* _radicalInverse_VdCLut;
private JobScheduler _jobScheduler = null!;
[GlobalSetup]
public void Setup()
{
//const string imagePath = "F:\\c\\SimpleRayTracer\\native\\assets\\hdri\\golden_gate_hills_1k.hdr";
const string imagePath = "C:\\Users\\Misaki\\Downloads\\grasslands_sunset_4k.hdr";
using var stream = new FileStream(imagePath, FileMode.Open, FileAccess.Read);
_image = ImageResultFloat.FromStream(stream, ColorComponents.RGB);
_mipLevels = (int)Math.Floor(Math.Log2(Math.Max(_image.Width, _image.Height))) + 1;
_pResult = (float**)NativeMemory.Alloc((nuint)(_mipLevels * sizeof(float*)));
_pMipLevels = (MipLevel*)NativeMemory.Alloc((nuint)(_mipLevels * sizeof(MipLevel)));
uint w, h;
for (var i = 0; i < _mipLevels; i++)
{
w = Math.Max(1, _image.Width >> i);
h = Math.Max(1, _image.Height >> i);
var sizeInBytes = (nuint)(w * h * 3 * sizeof(float));
_pResult[i] = (float*)NativeMemory.Alloc(sizeInBytes);
_pMipLevels[i] = new MipLevel
{
width = w,
height = h,
offset = _totalPixel,
data = _pResult[i],
roughness = (float)i / (_mipLevels - 1) // Linear roughness from 0 to 1 across mip levels
};
_totalPixel += (int)(w * h);
}
var desc = new JobSchedulerDesc
{
DependencyChainCapacity = 16,
ThreadCount = Environment.ProcessorCount - 1,
ThreadPriority = ThreadPriority.Normal,
};
_radicalInverse_VdCLut = (float*)NativeMemory.Alloc(GGXMipGenerationJob.SAMPLE_COUNT * sizeof(float));
for (var i = 0u; i < GGXMipGenerationJob.SAMPLE_COUNT; i++)
{
_radicalInverse_VdCLut[i] = GGXMipGenerationJob.RadicalInverse_VdC(i);
}
_jobScheduler = new JobScheduler(in desc);
}
public void DumpMipLevelToPng(float* pData, int width, int height, string filePath)
{
// Create a standard 32-bit RGBA bitmap
using var bitmap = new SKBitmap(width, height, SKColorType.Rgba8888, SKAlphaType.Opaque);
// Get a pointer to the SkiaSharp pixel buffer
var pPixels = (byte*)bitmap.GetPixels();
for (var y = 0; y < height; y++)
{
for (var x = 0; x < width; x++)
{
// Your data is tightly packed floats: R, G, B
var inIdx = (y * width + x) * 3;
var r = pData[inIdx];
var g = pData[inIdx + 1];
var b = pData[inIdx + 2];
// Basic Tone Mapping (Exposure + Gamma Correction) so we can see HDR values on a normal screen
// Gamma 2.2 = roughly pow(color, 1.0/2.2)
r = MathF.Pow(MathF.Max(0, r), 1.0f / 2.2f);
g = MathF.Pow(MathF.Max(0, g), 1.0f / 2.2f);
b = MathF.Pow(MathF.Max(0, b), 1.0f / 2.2f);
// Convert 0.0-1.0 to 0-255 byte
var rByte = (byte)Math.Clamp(r * 255.0f, 0, 255);
var gByte = (byte)Math.Clamp(g * 255.0f, 0, 255);
var bByte = (byte)Math.Clamp(b * 255.0f, 0, 255);
// Write to Skia's buffer (RGBA)
var outIdx = (y * width + x) * 4;
pPixels[outIdx] = rByte;
pPixels[outIdx + 1] = gByte;
pPixels[outIdx + 2] = bByte;
pPixels[outIdx + 3] = 255; // Alpha
}
}
// Save out the preview
using var data = bitmap.Encode(SKEncodedImageFormat.Png, 100);
using var stream = File.OpenWrite(filePath);
data.SaveTo(stream);
}
[GlobalCleanup]
public void Cleanup()
{
#if true
for (var i = 0; i < _mipLevels; i++)
{
DumpMipLevelToPng(_pResult[i], (int)_pMipLevels[i].width, (int)_pMipLevels[i].height, $"C:\\Users\\Misaki\\Downloads\\Im\\mip_level_{i}.png");
}
#endif
_image.Dispose();
for (var i = 0; i < _mipLevels; i++)
{
NativeMemory.Free(_pResult[i]);
}
NativeMemory.Free(_pResult);
NativeMemory.Free(_pMipLevels);
NativeMemory.Free(_radicalInverse_VdCLut);
_jobScheduler.Dispose();
}
[Benchmark(Baseline = true)]
public void JobGGX()
{
JobHandle handle;
if (WideLane.IsSupported)
{
var job = new GGXMipGenerationJobSPMD<WideLane<float>, WideLane<int>>
{
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
radicalInverse_VdCLut = _radicalInverse_VdCLut
};
handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64);
}
else
{
var job = new GGXMipGenerationJobSPMD<ScalarLane<float>, ScalarLane<int>>
{
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
radicalInverse_VdCLut = _radicalInverse_VdCLut
};
handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64);
}
_jobScheduler.Wait(handle);
}
//[Benchmark]
public void ParallelGGX()
{
var job = new GGXMipGenerationJob
{
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
radicalInverse_VdCLut = _radicalInverse_VdCLut
};
Parallel.For(0, _totalPixel, new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount - 1 }, i =>
{
var localJob = job;
var ctx = new JobExecutionContext();
localJob.Execute(i, in ctx);
});
}
[Benchmark]
public void SingleThreadGGX()
{
var job = new GGXMipGenerationJob
{
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
radicalInverse_VdCLut = _radicalInverse_VdCLut
};
var ctx = new JobExecutionContext();
job.Run(_totalPixel, in ctx);
}
}