diff --git a/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj b/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj
index 0fbdfed..8bcc1e4 100644
--- a/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj
+++ b/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj
@@ -7,7 +7,7 @@
true
true
Misaki
- 1.3.0
+ 1.3.1
$(AssemblyVersion)
https://git.personalnas.com/Misaki/Misaki.HighPerformance.git
https://git.personalnas.com/Misaki/Misaki.HighPerformance.git
diff --git a/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs b/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs
index 9667066..cfa0889 100644
--- a/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs
+++ b/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs
@@ -171,7 +171,7 @@ public readonly unsafe struct ScalarLane : ISPMDLane
where TOtherNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators
{
- return TOther.Create(TOtherNumber.CreateChecked(value));
+ return TOther.Create(TOtherNumber.CreateTruncating(value));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
diff --git a/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs b/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs
index e4ef3e7..5e9795c 100644
--- a/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs
+++ b/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs
@@ -578,58 +578,129 @@ public readonly unsafe partial struct WideLane : ISPMDLane Sin(WideLane value)
{
- if (typeof(TNumber) == typeof(float))
- {
- ref var v = ref Unsafe.As, Vector>(ref value);
- var result = Vector.Sin(v);
- return new WideLane(Unsafe.As, Vector>(ref result));
- }
- else if (typeof(TNumber) == typeof(double))
- {
- ref var v = ref Unsafe.As, Vector>(ref value);
- var result = Vector.Sin(v);
- return new WideLane(Unsafe.As, Vector>(ref result));
- }
+ var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
- return value;
+ var x_sin = value;
+ var y_sin = x_sin * invPi;
+ var k_sin = Round(y_sin);
+ var z_sin = y_sin - k_sin;
+
+ var half = Create(TNumber.CreateTruncating(0.5f));
+ var two = Create(TNumber.CreateTruncating(2.0f));
+
+ var k_even_sin = Round(k_sin * half) * two;
+ var sign_sin = One - two * Abs(k_sin - k_even_sin);
+
+ var c1 = Create(TNumber.CreateTruncating(3.14159265f)); // PI
+ var c3 = Create(TNumber.CreateTruncating(-5.16771278f)); // -PI^3 / 6
+ var c5 = Create(TNumber.CreateTruncating(2.55016404f)); // PI^5 / 120
+ var c7 = Create(TNumber.CreateTruncating(-0.59926453f)); // -PI^7 / 5040
+ var c9 = Create(TNumber.CreateTruncating(0.08214589f)); // PI^9 / 362880
+
+ var z2_sin = z_sin * z_sin;
+ var poly_sin = MultipleAdd(z2_sin, c9, c7); // c7 + c9*z^2
+ poly_sin = MultipleAdd(z2_sin, poly_sin, c5); // c5 + z^2*(...)
+ poly_sin = MultipleAdd(z2_sin, poly_sin, c3); // c3 + z^2*(...)
+ poly_sin = MultipleAdd(z2_sin, poly_sin, c1); // c1 + z^2*(...)
+ poly_sin = z_sin * poly_sin; // z * (...)
+
+ return poly_sin * sign_sin;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Cos(WideLane value)
{
- if (typeof(TNumber) == typeof(float))
- {
- ref var v = ref Unsafe.As, Vector>(ref value);
- var result = Vector.Cos(v);
- return new WideLane(Unsafe.As, Vector>(ref result));
- }
- else if (typeof(TNumber) == typeof(double))
- {
- ref var v = ref Unsafe.As, Vector>(ref value);
- var result = Vector.Cos(v);
- return new WideLane(Unsafe.As, Vector>(ref result));
- }
+ var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
+ var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
- return value;
+ var x_cos = value + halfPi;
+ var y_cos = x_cos * invPi;
+ var k_cos = Round(y_cos);
+ var z_cos = y_cos - k_cos;
+
+ var half = Create(TNumber.CreateTruncating(0.5f));
+ var two = Create(TNumber.CreateTruncating(2.0f));
+
+ var k_even_cos = Round(k_cos * half) * two;
+ var sign_cos = One - two * Abs(k_cos - k_even_cos);
+
+ var c1 = Create(TNumber.CreateTruncating(3.14159265f)); // PI
+ var c3 = Create(TNumber.CreateTruncating(-5.16771278f)); // -PI^3 / 6
+ var c5 = Create(TNumber.CreateTruncating(2.55016404f)); // PI^5 / 120
+ var c7 = Create(TNumber.CreateTruncating(-0.59926453f)); // -PI^7 / 5040
+ var c9 = Create(TNumber.CreateTruncating(0.08214589f)); // PI^9 / 362880
+
+ var z2_cos = z_cos * z_cos;
+ var poly_cos = MultipleAdd(z2_cos, c9, c7);
+ poly_cos = MultipleAdd(z2_cos, poly_cos, c5);
+ poly_cos = MultipleAdd(z2_cos, poly_cos, c3);
+ poly_cos = MultipleAdd(z2_cos, poly_cos, c1);
+ poly_cos = z_cos * poly_cos;
+
+ return poly_cos * sign_cos;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static (WideLane sin, WideLane cos) SinCos(WideLane value)
{
- if (typeof(TNumber) == typeof(float))
- {
- ref var v = ref Unsafe.As, Vector>(ref value);
- var (sin, cos) = Vector.SinCos(v);
- return (new WideLane(Unsafe.As, Vector>(ref sin)), new WideLane(Unsafe.As, Vector>(ref cos)));
- }
- else if (typeof(TNumber) == typeof(double))
- {
- ref var v = ref Unsafe.As, Vector>(ref value);
- var (sin, cos) = Vector.SinCos(v);
- return (new WideLane(Unsafe.As, Vector>(ref sin)), new WideLane(Unsafe.As, Vector>(ref cos)));
- }
+ var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
+ var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
- return (value, value);
+ var x_sin = value;
+ var x_cos = value + halfPi;
+
+ // Range Reduction
+ // We map any angle to the interval [-0.5, 0.5] (corresponding to the actual angle range [-PI/2, PI/2])
+ // y = x * (1 / PI)
+ var y_sin = x_sin * invPi;
+ var y_cos = x_cos * invPi;
+
+ // k = Round(y)
+ var k_sin = Round(y_sin);
+ var k_cos = Round(y_cos);
+
+ // z = y - k (Now, the range of z is perfectly reduced to [-0.5, 0.5])
+ var z_sin = y_sin - k_sin;
+ var z_cos = y_cos - k_cos;
+
+ // 2. Branchless Sign Flip
+ // Mathematical principle: Sin(x + k*PI) = Sin(x) * (-1)^k
+ // We need to compute (-1)^k. To avoid inefficient bit operations or branches, we compute it with floating-point math:
+ // sign = 1.0 - 2.0 * Abs(k - 2.0 * Round(k * 0.5))
+ var half = Create(TNumber.CreateTruncating(0.5f));
+ var two = Create(TNumber.CreateTruncating(2.0f));
+ var one = One;
+
+ var k_even_sin = Round(k_sin * half) * two;
+ var sign_sin = one - two * Abs(k_sin - k_even_sin);
+
+ var k_even_cos = Round(k_cos * half) * two;
+ var sign_cos = one - two * Abs(k_cos - k_even_cos);
+
+ // 3. Taylor/Remez Polynomial for Sin(PI * z)
+ // For z in [-0.5, 0.5],Calculate sin(PI * z)
+ // z * (C1 + z^2 * (C3 + z^2 * (C5 + z^2 * (C7 + z^2 * C9))))
+ var c1 = Create(TNumber.CreateTruncating(3.14159265f)); // PI
+ var c3 = Create(TNumber.CreateTruncating(-5.16771278f)); // -PI^3 / 6
+ var c5 = Create(TNumber.CreateTruncating(2.55016404f)); // PI^5 / 120
+ var c7 = Create(TNumber.CreateTruncating(-0.59926453f)); // -PI^7 / 5040
+ var c9 = Create(TNumber.CreateTruncating(0.08214589f)); // PI^9 / 362880
+
+ var z2_sin = z_sin * z_sin;
+ var poly_sin = MultipleAdd(z2_sin, c9, c7); // c7 + c9*z^2
+ poly_sin = MultipleAdd(z2_sin, poly_sin, c5); // c5 + z^2*(...)
+ poly_sin = MultipleAdd(z2_sin, poly_sin, c3); // c3 + z^2*(...)
+ poly_sin = MultipleAdd(z2_sin, poly_sin, c1); // c1 + z^2*(...)
+ poly_sin = z_sin * poly_sin; // z * (...)
+
+ var z2_cos = z_cos * z_cos;
+ var poly_cos = MultipleAdd(z2_cos, c9, c7);
+ poly_cos = MultipleAdd(z2_cos, poly_cos, c5);
+ poly_cos = MultipleAdd(z2_cos, poly_cos, c3);
+ poly_cos = MultipleAdd(z2_cos, poly_cos, c1);
+ poly_cos = z_cos * poly_cos;
+
+ return (poly_sin * sign_sin, poly_cos * sign_cos);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
diff --git a/Misaki.HighPerformance.Test/Benchmark/GGXMipGenerationBenchmark.cs b/Misaki.HighPerformance.Test/Benchmark/GGXMipGenerationBenchmark.cs
index 9704bb1..bb64ab6 100644
--- a/Misaki.HighPerformance.Test/Benchmark/GGXMipGenerationBenchmark.cs
+++ b/Misaki.HighPerformance.Test/Benchmark/GGXMipGenerationBenchmark.cs
@@ -1,4 +1,6 @@
using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Diagnosers;
+using BenchmarkDotNet.Engines;
using Misaki.HighPerformance.Image;
using Misaki.HighPerformance.Jobs;
using Misaki.HighPerformance.Mathematics;
@@ -41,7 +43,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor
return bits * 2.3283064365386963e-10f; // bits / 0x100000000
}
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector2 Hammersley(TFloat i, uint N, float* lut)
{
var x = i / N;
@@ -50,7 +52,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor
}
// --- GGX Importance Sampling ---
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector3 ImportanceSampleGGX(Vector2 Xi, Vector3 N, float roughness)
{
var a = roughness * roughness; // Disney/Epic remap roughness for better visual linearity
@@ -82,7 +84,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor
// --- Image Sampling Helpers ---
// Maps a 3D direction vector to 2D equirectangular UVs
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector2 DirToEquirectangularUV(Vector3 dir)
{
var u = TFloat.Atan2(dir.z, dir.x);
@@ -94,7 +96,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor
}
// Samples the source HDR image using bilinear interpolation (simplified to nearest neighbor for brevity here)
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector3 SampleEquirectangularMap(float* img, int w, int h, Vector3 dir)
{
var uv = DirToEquirectangularUV(dir);
@@ -112,6 +114,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor
return MathV.GatherVector3(img, idx.GetUnsafePtr(), 1);
}
+ [MethodImpl(MethodImplOptions.AggressiveOptimization)]
public void Execute(int loopIndex, ref readonly JobExecutionContext ctx)
{
var m = 0;
@@ -157,17 +160,14 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor
TFloat.Create(V.z)
);
- //var vPrefilteredColorX = TFloat.Zero;
- //var vPrefilteredColorY = TFloat.Zero;
- //var vPrefilteredColorZ = TFloat.Zero;
var vPrefilteredColor = Vector3.Zero;
var vTotalWeight = TFloat.Zero;
// 3. Monte Carlo Integration Loop
- // We assume WideLane is supported in the test.
var dynamicSampleCount = (uint)max(1.0f, SAMPLE_COUNT * pLevel->roughness);
var vDynamicSampleCount = TFloat.Create(dynamicSampleCount);
+ var vLumaVector = MathV.Create(0.2126f, 0.7152f, 0.0722f);
for (var i = 0u; i < dynamicSampleCount; i += (uint)TFloat.LaneWidth)
{
@@ -194,7 +194,7 @@ internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor
// A sun pixel (luma 1000.0) gets a tiny weight of ~0.001, naturally suppressing it.
// This introduce bias, but significantly reduces fireflies without needing solid angle sampling or cdf inversion.
// And since this is a mip generation step, a little bias is acceptable for much better performance and stability.
- var luma = MathV.Dot(sampleColor, MathV.Create(0.2126f, 0.7152f, 0.0722f));
+ var luma = MathV.Dot(sampleColor, vLumaVector);
var fireflyWeight = TFloat.One / (TFloat.One + luma);
var finalWeight = NdotL * fireflyWeight;
@@ -381,6 +381,7 @@ internal unsafe struct GGXMipGenerationJob : IJobParallelFor
}
}
+[SimpleJob(RunStrategy.ColdStart, launchCount: 1, warmupCount: 0, iterationCount: 1, invocationCount: 1, id: "QuickRun")]
public unsafe class GGXMipGenerationBenchmark
{
private ImageResultFloat _image;
@@ -388,7 +389,7 @@ public unsafe class GGXMipGenerationBenchmark
private int _totalPixel;
private float** _pResult;
private MipLevel* _pMipLevels;
- private float* radicalInverse_VdCLut;
+ private float* _radicalInverse_VdCLut;
private JobScheduler _jobScheduler = null!;
@@ -434,10 +435,10 @@ public unsafe class GGXMipGenerationBenchmark
ThreadPriority = ThreadPriority.Normal,
};
- radicalInverse_VdCLut = (float*)NativeMemory.Alloc(GGXMipGenerationJob.SAMPLE_COUNT * sizeof(float));
+ _radicalInverse_VdCLut = (float*)NativeMemory.Alloc(GGXMipGenerationJob.SAMPLE_COUNT * sizeof(float));
for (var i = 0u; i < GGXMipGenerationJob.SAMPLE_COUNT; i++)
{
- radicalInverse_VdCLut[i] = GGXMipGenerationJob.RadicalInverse_VdC(i);
+ _radicalInverse_VdCLut[i] = GGXMipGenerationJob.RadicalInverse_VdC(i);
}
_jobScheduler = new JobScheduler(in desc);
@@ -490,10 +491,12 @@ public unsafe class GGXMipGenerationBenchmark
[GlobalCleanup]
public void Cleanup()
{
+#if false
for (var i = 0; i < _mipLevels; i++)
{
DumpMipLevelToPng(_pResult[i], (int)_pMipLevels[i].width, (int)_pMipLevels[i].height, $"C:\\Users\\Misaki\\Downloads\\Im\\mip_level_{i}.png");
}
+#endif
_image.Dispose();
for (var i = 0; i < _mipLevels; i++)
@@ -503,12 +506,12 @@ public unsafe class GGXMipGenerationBenchmark
NativeMemory.Free(_pResult);
NativeMemory.Free(_pMipLevels);
- NativeMemory.Free(radicalInverse_VdCLut);
+ NativeMemory.Free(_radicalInverse_VdCLut);
_jobScheduler.Dispose();
}
- [Benchmark]
+ [Benchmark(Baseline = true)]
public void JobGGX()
{
JobHandle handle;
@@ -519,7 +522,7 @@ public unsafe class GGXMipGenerationBenchmark
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
- radicalInverse_VdCLut = radicalInverse_VdCLut
+ radicalInverse_VdCLut = _radicalInverse_VdCLut
};
handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64);
@@ -531,7 +534,7 @@ public unsafe class GGXMipGenerationBenchmark
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
- radicalInverse_VdCLut = radicalInverse_VdCLut
+ radicalInverse_VdCLut = _radicalInverse_VdCLut
};
handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64);
@@ -548,7 +551,7 @@ public unsafe class GGXMipGenerationBenchmark
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
- radicalInverse_VdCLut = radicalInverse_VdCLut
+ radicalInverse_VdCLut = _radicalInverse_VdCLut
};
Parallel.For(0, _totalPixel, new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount - 1 }, i =>
@@ -567,12 +570,10 @@ public unsafe class GGXMipGenerationBenchmark
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
- radicalInverse_VdCLut = radicalInverse_VdCLut
+ radicalInverse_VdCLut = _radicalInverse_VdCLut
};
- //var handle = _jobScheduler.ScheduleParallelFor(in job, _totalPixel, 64);
- //_jobScheduler.Wait(handle);
var ctx = new JobExecutionContext();
job.Run(_totalPixel, in ctx);
}
-}
\ No newline at end of file
+}
diff --git a/Misaki.HighPerformance.Test/Program.cs b/Misaki.HighPerformance.Test/Program.cs
index 4543f56..2c438e1 100644
--- a/Misaki.HighPerformance.Test/Program.cs
+++ b/Misaki.HighPerformance.Test/Program.cs
@@ -8,16 +8,22 @@ using System.Runtime.InteropServices;
//BenchmarkRunner.Run();
+const int count = 1;
+
var bench = new GGXMipGenerationBenchmark();
bench.Setup();
var sw = System.Diagnostics.Stopwatch.StartNew();
-bench.JobGGX();
-sw.Stop();
-Console.WriteLine($"GGX Mip Generation: {sw.Elapsed.TotalMilliseconds} ms");
-bench.Cleanup();
-//Console.WriteLine(sw.Elapsed.TotalMilliseconds);
+for (int i = 0; i < count; i++)
+{
+ bench.JobGGX();
+}
+
+sw.Stop();
+var avgTime = sw.Elapsed.TotalMilliseconds / count;
+Console.WriteLine($"GGX Mip Generation: {avgTime} ms");
+bench.Cleanup();
//AllocationManager.Initialize(AllocationManagerInitOpts.Default);
//var set = new UnsafeBitSet(100, AllocationHandle.Persistent, AllocationOption.Clear);