Refactor trigonometric funcs, optimize GGX benchmark

- Replaced SIMD-based Sin/Cos/SinCos in WideLane with generic polynomial approximations for hardware independence.
- Updated ScalarLane Cast to use CreateTruncating.
- Applied AggressiveOptimization to key GGX methods; improved luma calculation and radical inverse LUT handling.
- Enhanced GGX benchmark setup, cleanup, and timing logic.
- Bumped project version to 1.3.1.
This commit is contained in:
2026-04-28 22:17:59 +09:00
parent 1074f9836e
commit 0acaf00767
5 changed files with 145 additions and 67 deletions

View File

@@ -1,4 +1,6 @@
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnosers;
using BenchmarkDotNet.Engines;
using Misaki.HighPerformance.Image;
using Misaki.HighPerformance.Jobs;
using Misaki.HighPerformance.Mathematics;
@@ -41,7 +43,7 @@ internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
return bits * 2.3283064365386963e-10f; // bits / 0x100000000
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector2<TFloat, float> Hammersley(TFloat i, uint N, float* lut)
{
var x = i / N;
@@ -50,7 +52,7 @@ internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
}
// --- GGX Importance Sampling ---
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector3<TFloat, float> ImportanceSampleGGX(Vector2<TFloat, float> Xi, Vector3<TFloat, float> N, float roughness)
{
var a = roughness * roughness; // Disney/Epic remap roughness for better visual linearity
@@ -82,7 +84,7 @@ internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
// --- Image Sampling Helpers ---
// Maps a 3D direction vector to 2D equirectangular UVs
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector2<TFloat, float> DirToEquirectangularUV(Vector3<TFloat, float> dir)
{
var u = TFloat.Atan2(dir.z, dir.x);
@@ -94,7 +96,7 @@ internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
}
// Samples the source HDR image using bilinear interpolation (simplified to nearest neighbor for brevity here)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector3<TFloat, float> SampleEquirectangularMap(float* img, int w, int h, Vector3<TFloat, float> dir)
{
var uv = DirToEquirectangularUV(dir);
@@ -112,6 +114,7 @@ internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
return MathV.GatherVector3<TFloat, float>(img, idx.GetUnsafePtr(), 1);
}
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
public void Execute(int loopIndex, ref readonly JobExecutionContext ctx)
{
var m = 0;
@@ -157,17 +160,14 @@ internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
TFloat.Create(V.z)
);
//var vPrefilteredColorX = TFloat.Zero;
//var vPrefilteredColorY = TFloat.Zero;
//var vPrefilteredColorZ = TFloat.Zero;
var vPrefilteredColor = Vector3<TFloat, float>.Zero;
var vTotalWeight = TFloat.Zero;
// 3. Monte Carlo Integration Loop
// We assume WideLane is supported in the test.
var dynamicSampleCount = (uint)max(1.0f, SAMPLE_COUNT * pLevel->roughness);
var vDynamicSampleCount = TFloat.Create(dynamicSampleCount);
var vLumaVector = MathV.Create<TFloat, float>(0.2126f, 0.7152f, 0.0722f);
for (var i = 0u; i < dynamicSampleCount; i += (uint)TFloat.LaneWidth)
{
@@ -194,7 +194,7 @@ internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
// A sun pixel (luma 1000.0) gets a tiny weight of ~0.001, naturally suppressing it.
// This introduces bias, but significantly reduces fireflies without needing solid-angle sampling or CDF inversion.
// Since this is a mip generation step, a little bias is acceptable in exchange for much better performance and stability.
var luma = MathV.Dot(sampleColor, MathV.Create<TFloat, float>(0.2126f, 0.7152f, 0.0722f));
var luma = MathV.Dot(sampleColor, vLumaVector);
var fireflyWeight = TFloat.One / (TFloat.One + luma);
var finalWeight = NdotL * fireflyWeight;
@@ -381,6 +381,7 @@ internal unsafe struct GGXMipGenerationJob : IJobParallelFor
}
}
[SimpleJob(RunStrategy.ColdStart, launchCount: 1, warmupCount: 0, iterationCount: 1, invocationCount: 1, id: "QuickRun")]
public unsafe class GGXMipGenerationBenchmark
{
private ImageResultFloat _image;
@@ -388,7 +389,7 @@ public unsafe class GGXMipGenerationBenchmark
private int _totalPixel;
private float** _pResult;
private MipLevel* _pMipLevels;
private float* radicalInverse_VdCLut;
private float* _radicalInverse_VdCLut;
private JobScheduler _jobScheduler = null!;
@@ -434,10 +435,10 @@ public unsafe class GGXMipGenerationBenchmark
ThreadPriority = ThreadPriority.Normal,
};
radicalInverse_VdCLut = (float*)NativeMemory.Alloc(GGXMipGenerationJob.SAMPLE_COUNT * sizeof(float));
_radicalInverse_VdCLut = (float*)NativeMemory.Alloc(GGXMipGenerationJob.SAMPLE_COUNT * sizeof(float));
for (var i = 0u; i < GGXMipGenerationJob.SAMPLE_COUNT; i++)
{
radicalInverse_VdCLut[i] = GGXMipGenerationJob.RadicalInverse_VdC(i);
_radicalInverse_VdCLut[i] = GGXMipGenerationJob.RadicalInverse_VdC(i);
}
_jobScheduler = new JobScheduler(in desc);
@@ -490,10 +491,12 @@ public unsafe class GGXMipGenerationBenchmark
[GlobalCleanup]
public void Cleanup()
{
#if false
for (var i = 0; i < _mipLevels; i++)
{
DumpMipLevelToPng(_pResult[i], (int)_pMipLevels[i].width, (int)_pMipLevels[i].height, $"C:\\Users\\Misaki\\Downloads\\Im\\mip_level_{i}.png");
}
#endif
_image.Dispose();
for (var i = 0; i < _mipLevels; i++)
@@ -503,12 +506,12 @@ public unsafe class GGXMipGenerationBenchmark
NativeMemory.Free(_pResult);
NativeMemory.Free(_pMipLevels);
NativeMemory.Free(radicalInverse_VdCLut);
NativeMemory.Free(_radicalInverse_VdCLut);
_jobScheduler.Dispose();
}
[Benchmark]
[Benchmark(Baseline = true)]
public void JobGGX()
{
JobHandle handle;
@@ -519,7 +522,7 @@ public unsafe class GGXMipGenerationBenchmark
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
radicalInverse_VdCLut = radicalInverse_VdCLut
radicalInverse_VdCLut = _radicalInverse_VdCLut
};
handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64);
@@ -531,7 +534,7 @@ public unsafe class GGXMipGenerationBenchmark
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
radicalInverse_VdCLut = radicalInverse_VdCLut
radicalInverse_VdCLut = _radicalInverse_VdCLut
};
handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64);
@@ -548,7 +551,7 @@ public unsafe class GGXMipGenerationBenchmark
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
radicalInverse_VdCLut = radicalInverse_VdCLut
radicalInverse_VdCLut = _radicalInverse_VdCLut
};
Parallel.For(0, _totalPixel, new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount - 1 }, i =>
@@ -567,12 +570,10 @@ public unsafe class GGXMipGenerationBenchmark
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
radicalInverse_VdCLut = radicalInverse_VdCLut
radicalInverse_VdCLut = _radicalInverse_VdCLut
};
//var handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64);
//_jobScheduler.Wait(handle);
var ctx = new JobExecutionContext();
job.Run(_totalPixel, in ctx);
}
}
}