From 0acaf007673940b4b8b908797f5848d139f380e3 Mon Sep 17 00:00:00 2001 From: Misaki Date: Tue, 28 Apr 2026 22:17:59 +0900 Subject: [PATCH] Refactor trigonometric funcs, optimize GGX benchmark - Replaced SIMD-based Sin/Cos/SinCos in WideLane with generic polynomial approximations for hardware independence. - Updated ScalarLane Cast to use CreateTruncating. - Applied AggressiveOptimization to key GGX methods; improved luma calculation and radical inverse LUT handling. - Enhanced GGX benchmark setup, cleanup, and timing logic. - Bumped project version to 1.3.1. --- ...ki.HighPerformance.Mathematics.SPMD.csproj | 2 +- .../ScalerLane.cs | 2 +- .../WideLane.cs | 149 +++++++++++++----- .../Benchmark/GGXMipGenerationBenchmark.cs | 43 ++--- Misaki.HighPerformance.Test/Program.cs | 16 +- 5 files changed, 145 insertions(+), 67 deletions(-) diff --git a/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj b/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj index 0fbdfed..8bcc1e4 100644 --- a/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj +++ b/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj @@ -7,7 +7,7 @@ true true Misaki - 1.3.0 + 1.3.1 $(AssemblyVersion) https://git.personalnas.com/Misaki/Misaki.HighPerformance.git https://git.personalnas.com/Misaki/Misaki.HighPerformance.git diff --git a/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs b/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs index 9667066..cfa0889 100644 --- a/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs +++ b/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs @@ -171,7 +171,7 @@ public readonly unsafe struct ScalarLane : ISPMDLane where TOtherNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators { - return TOther.Create(TOtherNumber.CreateChecked(value)); + return TOther.Create(TOtherNumber.CreateTruncating(value)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git a/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs b/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs index e4ef3e7..5e9795c 100644 --- a/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs +++ b/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs @@ -578,58 +578,129 @@ public readonly unsafe partial struct WideLane : ISPMDLane Sin(WideLane value) { - if (typeof(TNumber) == typeof(float)) - { - ref var v = ref Unsafe.As, Vector>(ref value); - var result = Vector.Sin(v); - return new WideLane(Unsafe.As, Vector>(ref result)); - } - else if (typeof(TNumber) == typeof(double)) - { - ref var v = ref Unsafe.As, Vector>(ref value); - var result = Vector.Sin(v); - return new WideLane(Unsafe.As, Vector>(ref result)); - } + var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI - return value; + var x_sin = value; + var y_sin = x_sin * invPi; + var k_sin = Round(y_sin); + var z_sin = y_sin - k_sin; + + var half = Create(TNumber.CreateTruncating(0.5f)); + var two = Create(TNumber.CreateTruncating(2.0f)); + + var k_even_sin = Round(k_sin * half) * two; + var sign_sin = One - two * Abs(k_sin - k_even_sin); + + var c1 = Create(TNumber.CreateTruncating(3.14159265f)); // PI + var c3 = Create(TNumber.CreateTruncating(-5.16771278f)); // -PI^3 / 6 + var c5 = Create(TNumber.CreateTruncating(2.55016404f)); // PI^5 / 120 + var c7 = Create(TNumber.CreateTruncating(-0.59926453f)); // -PI^7 / 5040 + var c9 = Create(TNumber.CreateTruncating(0.08214589f)); // PI^9 / 362880 + + var z2_sin = z_sin * z_sin; + var poly_sin = MultipleAdd(z2_sin, c9, c7); // c7 + c9*z^2 + poly_sin = MultipleAdd(z2_sin, poly_sin, c5); // c5 + z^2*(...) + poly_sin = MultipleAdd(z2_sin, poly_sin, c3); // c3 + z^2*(...) + poly_sin = MultipleAdd(z2_sin, poly_sin, c1); // c1 + z^2*(...) + poly_sin = z_sin * poly_sin; // z * (...) + + return poly_sin * sign_sin; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Cos(WideLane value) { - if (typeof(TNumber) == typeof(float)) - { - ref var v = ref Unsafe.As, Vector>(ref value); - var result = Vector.Cos(v); - return new WideLane(Unsafe.As, Vector>(ref result)); - } - else if (typeof(TNumber) == typeof(double)) - { - ref var v = ref Unsafe.As, Vector>(ref value); - var result = Vector.Cos(v); - return new WideLane(Unsafe.As, Vector>(ref result)); - } + var halfPi = Create(TNumber.CreateTruncating(1.570796327f)); + var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI - return value; + var x_cos = value + halfPi; + var y_cos = x_cos * invPi; + var k_cos = Round(y_cos); + var z_cos = y_cos - k_cos; + + var half = Create(TNumber.CreateTruncating(0.5f)); + var two = Create(TNumber.CreateTruncating(2.0f)); + + var k_even_cos = Round(k_cos * half) * two; + var sign_cos = One - two * Abs(k_cos - k_even_cos); + + var c1 = Create(TNumber.CreateTruncating(3.14159265f)); // PI + var c3 = Create(TNumber.CreateTruncating(-5.16771278f)); // -PI^3 / 6 + var c5 = Create(TNumber.CreateTruncating(2.55016404f)); // PI^5 / 120 + var c7 = Create(TNumber.CreateTruncating(-0.59926453f)); // -PI^7 / 5040 + var c9 = Create(TNumber.CreateTruncating(0.08214589f)); // PI^9 / 362880 + + var z2_cos = z_cos * z_cos; + var poly_cos = MultipleAdd(z2_cos, c9, c7); + poly_cos = MultipleAdd(z2_cos, poly_cos, c5); + poly_cos = MultipleAdd(z2_cos, poly_cos, c3); + poly_cos = MultipleAdd(z2_cos, poly_cos, c1); + poly_cos = z_cos * poly_cos; + + return poly_cos * sign_cos; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static (WideLane sin, WideLane cos) SinCos(WideLane value) { - if (typeof(TNumber) == typeof(float)) - { - ref var v = ref Unsafe.As, Vector>(ref value); - var (sin, cos) = Vector.SinCos(v); - return (new WideLane(Unsafe.As, Vector>(ref sin)), new WideLane(Unsafe.As, Vector>(ref cos))); - } - else if (typeof(TNumber) == typeof(double)) - { - ref var v = ref Unsafe.As, Vector>(ref value); - var (sin, cos) = Vector.SinCos(v); - return (new WideLane(Unsafe.As, Vector>(ref sin)), new WideLane(Unsafe.As, Vector>(ref cos))); - } + var halfPi = Create(TNumber.CreateTruncating(1.570796327f)); + var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI - return (value, value); + var x_sin = value; + var x_cos = value + halfPi; + + // Range Reduction + // We map any angle to the interval [-0.5, 0.5] (corresponding to the actual angle range [-PI/2, PI/2]) + // y = x * (1 / PI) + var y_sin = x_sin * invPi; + var y_cos = x_cos * invPi; + + // k = Round(y) + var k_sin = Round(y_sin); + var k_cos = Round(y_cos); + + // z = y - k (Now, the range of z is perfectly reduced to [-0.5, 0.5]) + var z_sin = y_sin - k_sin; + var z_cos = y_cos - k_cos; + + // 2. Branchless Sign Flip + // Mathematical principle: Sin(x + k*PI) = Sin(x) * (-1)^k + // We need to compute (-1)^k. To avoid inefficient bit operations or branches, we compute it with floating-point math: + // sign = 1.0 - 2.0 * Abs(k - 2.0 * Round(k * 0.5)) + var half = Create(TNumber.CreateTruncating(0.5f)); + var two = Create(TNumber.CreateTruncating(2.0f)); + var one = One; + + var k_even_sin = Round(k_sin * half) * two; + var sign_sin = one - two * Abs(k_sin - k_even_sin); + + var k_even_cos = Round(k_cos * half) * two; + var sign_cos = one - two * Abs(k_cos - k_even_cos); + + // 3. Taylor/Remez Polynomial for Sin(PI * z) + // For z in [-0.5, 0.5],Calculate sin(PI * z) + // z * (C1 + z^2 * (C3 + z^2 * (C5 + z^2 * (C7 + z^2 * C9)))) + var c1 = Create(TNumber.CreateTruncating(3.14159265f)); // PI + var c3 = Create(TNumber.CreateTruncating(-5.16771278f)); // -PI^3 / 6 + var c5 = Create(TNumber.CreateTruncating(2.55016404f)); // PI^5 / 120 + var c7 = Create(TNumber.CreateTruncating(-0.59926453f)); // -PI^7 / 5040 + var c9 = Create(TNumber.CreateTruncating(0.08214589f)); // PI^9 / 362880 + + var z2_sin = z_sin * z_sin; + var poly_sin = MultipleAdd(z2_sin, c9, c7); // c7 + c9*z^2 + poly_sin = MultipleAdd(z2_sin, poly_sin, c5); // c5 + z^2*(...) + poly_sin = MultipleAdd(z2_sin, poly_sin, c3); // c3 + z^2*(...) + poly_sin = MultipleAdd(z2_sin, poly_sin, c1); // c1 + z^2*(...) + poly_sin = z_sin * poly_sin; // z * (...) + + var z2_cos = z_cos * z_cos; + var poly_cos = MultipleAdd(z2_cos, c9, c7); + poly_cos = MultipleAdd(z2_cos, poly_cos, c5); + poly_cos = MultipleAdd(z2_cos, poly_cos, c3); + poly_cos = MultipleAdd(z2_cos, poly_cos, c1); + poly_cos = z_cos * poly_cos; + + return (poly_sin * sign_sin, poly_cos * sign_cos); } [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git a/Misaki.HighPerformance.Test/Benchmark/GGXMipGenerationBenchmark.cs b/Misaki.HighPerformance.Test/Benchmark/GGXMipGenerationBenchmark.cs index 9704bb1..bb64ab6 100644 --- a/Misaki.HighPerformance.Test/Benchmark/GGXMipGenerationBenchmark.cs +++ b/Misaki.HighPerformance.Test/Benchmark/GGXMipGenerationBenchmark.cs @@ -1,4 +1,6 @@ using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Diagnosers; +using BenchmarkDotNet.Engines; using Misaki.HighPerformance.Image; using Misaki.HighPerformance.Jobs; using Misaki.HighPerformance.Mathematics; @@ -41,7 +43,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor return bits * 2.3283064365386963e-10f; // bits / 0x100000000 } - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] private static Vector2 Hammersley(TFloat i, uint N, float* lut) { var x = i / N; @@ -50,7 +52,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor } // --- GGX Importance Sampling --- - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] private static Vector3 ImportanceSampleGGX(Vector2 Xi, Vector3 N, float roughness) { var a = roughness * roughness; // Disney/Epic remap roughness for better visual linearity @@ -82,7 +84,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor // --- Image Sampling Helpers --- // Maps a 3D direction vector to 2D equirectangular UVs - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] private static Vector2 DirToEquirectangularUV(Vector3 dir) { var u = TFloat.Atan2(dir.z, dir.x); @@ -94,7 +96,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor } // Samples the source HDR image using bilinear interpolation (simplified to nearest neighbor for brevity here) - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)] private static Vector3 SampleEquirectangularMap(float* img, int w, int h, Vector3 dir) { var uv = DirToEquirectangularUV(dir); @@ -112,6 +114,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor return MathV.GatherVector3(img, idx.GetUnsafePtr(), 1); } + [MethodImpl(MethodImplOptions.AggressiveOptimization)] public void Execute(int loopIndex, ref readonly JobExecutionContext ctx) { var m = 0; @@ -157,17 +160,14 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor TFloat.Create(V.z) ); - //var vPrefilteredColorX = TFloat.Zero; - //var vPrefilteredColorY = TFloat.Zero; - //var vPrefilteredColorZ = TFloat.Zero; var vPrefilteredColor = Vector3.Zero; var vTotalWeight = TFloat.Zero; // 3. Monte Carlo Integration Loop - // We assume WideLane is supported in the test. var dynamicSampleCount = (uint)max(1.0f, SAMPLE_COUNT * pLevel->roughness); var vDynamicSampleCount = TFloat.Create(dynamicSampleCount); + var vLumaVector = MathV.Create(0.2126f, 0.7152f, 0.0722f); for (var i = 0u; i < dynamicSampleCount; i += (uint)TFloat.LaneWidth) { @@ -194,7 +194,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor // A sun pixel (luma 1000.0) gets a tiny weight of ~0.001, naturally suppressing it. // This introduce bias, but significantly reduces fireflies without needing solid angle sampling or cdf inversion. // And since this is a mip generation step, a little bias is acceptable for much better performance and stability. - var luma = MathV.Dot(sampleColor, MathV.Create(0.2126f, 0.7152f, 0.0722f)); + var luma = MathV.Dot(sampleColor, vLumaVector); var fireflyWeight = TFloat.One / (TFloat.One + luma); var finalWeight = NdotL * fireflyWeight; @@ -381,6 +381,7 @@ internal unsafe struct GGXMipGenerationJob : IJobParallelFor } } +[SimpleJob(RunStrategy.ColdStart, launchCount: 1, warmupCount: 0, iterationCount: 1, invocationCount: 1, id: "QuickRun")] public unsafe class GGXMipGenerationBenchmark { private ImageResultFloat _image; @@ -388,7 +389,7 @@ public unsafe class GGXMipGenerationBenchmark private int _totalPixel; private float** _pResult; private MipLevel* _pMipLevels; - private float* radicalInverse_VdCLut; + private float* _radicalInverse_VdCLut; private JobScheduler _jobScheduler = null!; @@ -434,10 +435,10 @@ public unsafe class GGXMipGenerationBenchmark ThreadPriority = ThreadPriority.Normal, }; - radicalInverse_VdCLut = (float*)NativeMemory.Alloc(GGXMipGenerationJob.SAMPLE_COUNT * sizeof(float)); + _radicalInverse_VdCLut = (float*)NativeMemory.Alloc(GGXMipGenerationJob.SAMPLE_COUNT * sizeof(float)); for (var i = 0u; i < GGXMipGenerationJob.SAMPLE_COUNT; i++) { - radicalInverse_VdCLut[i] = GGXMipGenerationJob.RadicalInverse_VdC(i); + _radicalInverse_VdCLut[i] = GGXMipGenerationJob.RadicalInverse_VdC(i); } _jobScheduler = new JobScheduler(in desc); @@ -490,10 +491,12 @@ public unsafe class GGXMipGenerationBenchmark [GlobalCleanup] public void Cleanup() { +#if false for (var i = 0; i < _mipLevels; i++) { DumpMipLevelToPng(_pResult[i], (int)_pMipLevels[i].width, (int)_pMipLevels[i].height, $"C:\\Users\\Misaki\\Downloads\\Im\\mip_level_{i}.png"); } +#endif _image.Dispose(); for (var i = 0; i < _mipLevels; i++) @@ -503,12 +506,12 @@ public unsafe class GGXMipGenerationBenchmark NativeMemory.Free(_pResult); NativeMemory.Free(_pMipLevels); - NativeMemory.Free(radicalInverse_VdCLut); + NativeMemory.Free(_radicalInverse_VdCLut); _jobScheduler.Dispose(); } - [Benchmark] + [Benchmark(Baseline = true)] public void JobGGX() { JobHandle handle; @@ -519,7 +522,7 @@ public unsafe class GGXMipGenerationBenchmark image = _image, pMipLevels = _pMipLevels, numMipLevels = _mipLevels, - radicalInverse_VdCLut = radicalInverse_VdCLut + radicalInverse_VdCLut = _radicalInverse_VdCLut }; handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64); @@ -531,7 +534,7 @@ public unsafe class GGXMipGenerationBenchmark image = _image, pMipLevels = _pMipLevels, numMipLevels = _mipLevels, - radicalInverse_VdCLut = radicalInverse_VdCLut + radicalInverse_VdCLut = _radicalInverse_VdCLut }; handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64); @@ -548,7 +551,7 @@ public unsafe class GGXMipGenerationBenchmark image = _image, pMipLevels = _pMipLevels, numMipLevels = _mipLevels, - radicalInverse_VdCLut = radicalInverse_VdCLut + radicalInverse_VdCLut = _radicalInverse_VdCLut }; Parallel.For(0, _totalPixel, new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount - 1 }, i => @@ -567,12 +570,10 @@ public unsafe class GGXMipGenerationBenchmark image = _image, pMipLevels = _pMipLevels, numMipLevels = _mipLevels, - radicalInverse_VdCLut = radicalInverse_VdCLut + radicalInverse_VdCLut = _radicalInverse_VdCLut }; - //var handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64); - //_jobScheduler.Wait(handle); var ctx = new JobExecutionContext(); job.Run(_totalPixel, in ctx); } -} \ No newline at end of file +} diff --git a/Misaki.HighPerformance.Test/Program.cs b/Misaki.HighPerformance.Test/Program.cs index 4543f56..2c438e1 100644 --- a/Misaki.HighPerformance.Test/Program.cs +++ b/Misaki.HighPerformance.Test/Program.cs @@ -8,16 +8,22 @@ using System.Runtime.InteropServices; //BenchmarkRunner.Run(); +const int count = 1; + var bench = new GGXMipGenerationBenchmark(); bench.Setup(); var sw = System.Diagnostics.Stopwatch.StartNew(); -bench.JobGGX(); -sw.Stop(); -Console.WriteLine($"GGX Mip Generation: {sw.Elapsed.TotalMilliseconds} ms"); -bench.Cleanup(); -//Console.WriteLine(sw.Elapsed.TotalMilliseconds); +for (int i = 0; i < count; i++) +{ + bench.JobGGX(); +} + +sw.Stop(); +var avgTime = sw.Elapsed.TotalMilliseconds / count; +Console.WriteLine($"GGX Mip Generation: {avgTime} ms"); +bench.Cleanup(); //AllocationManager.Initialize(AllocationManagerInitOpts.Default); //var set = new UnsafeBitSet(100, AllocationHandle.Persistent, AllocationOption.Clear);