SPMD SIMD math library & lock-free job system integration

- Add new SPMD SIMD math project with scalar/vector lanes
- Integrate SPMD jobs and scheduling into job system
- Implement lock-free job dependency management
- Update math functions for .NET 10 and SIMD performance
- Add SPMD benchmarks, compress-store tests, and race tests
- Introduce generic Result<T> error handling utilities
- Solution/project file updates and code cleanup
This commit is contained in:
2026-02-11 22:44:30 +09:00
parent c36405645b
commit a9c143c2a2
22 changed files with 3433 additions and 221 deletions

View File

@@ -1,7 +1,8 @@
#define NOISE_BENCHMARK
#define ADD_BENCHMARK
using BenchmarkDotNet.Attributes;
using Misaki.HighPerformance.Mathematics;
using Misaki.HighPerformance.Mathematics.SPMD;
using System.Numerics;
using System.Runtime.Intrinsics;
@@ -9,7 +10,7 @@ namespace Misaki.HighPerformance.Test.Benchmark;
public class MathematicsBenchmark
{
#if VECTOR_BENCHMARK
#if ADD_BENCHMARK
private Vector4 _va = new Vector4(1, 2, 1, 2);
private Vector4 _vb = new Vector4(3, 4, 3, 4);
@@ -39,61 +40,52 @@ public class MathematicsBenchmark
}
#endif
#if NOISE_BENCHMARK
#if FMA_BENCHMARK
private Vector4 _va = new Vector4(1, 2, 1, 2);
private Vector4 _vb = new Vector4(3, 4, 3, 4);
private Vector4 _vc = new Vector4(5, 6, 5, 6);
private const int _SIZE = 32;
private Vector128<float> _va128 = Vector128.Create(1f, 2f, 1f, 2f);
private Vector128<float> _vb128 = Vector128.Create(3f, 4f, 3f, 4f);
private Vector128<float> _vc128 = Vector128.Create(5f, 6f, 5f, 6f);
private float4 _fa = new float4(1, 2, 1, 2);
private float4 _fb = new float4(3, 4, 3, 4);
private float4 _fc = new float4(5, 6, 5, 6);
[Benchmark]
public unsafe void VectorNoise()
public Vector4 Vector4()
{
var buf = stackalloc float[_SIZE * _SIZE];
var job = new Misaki.HighPerformance.Test.Jobs.NoiseJobVector
for (var i = 0; i < 10; i++)
{
buffers = buf,
width = _SIZE,
height = _SIZE,
};
for (var i = 0; i < _SIZE * _SIZE; i++)
{
job.Execute(i, 0);
_va = _vb * _vc + _va;
}
return _va;
}
[Benchmark]
public unsafe void MathNoise()
public Vector128<float> VectorFMA()
{
var buf = stackalloc float[_SIZE * _SIZE];
var job = new Misaki.HighPerformance.Test.Jobs.NoiseJobMath
for (var i = 0; i < 10; i++)
{
buffers = buf,
width = _SIZE,
height = _SIZE,
};
for (var i = 0; i < _SIZE * _SIZE; i++)
{
job.Execute(i, 0);
_va128 = System.Runtime.Intrinsics.X86.Fma.MultiplyAdd(_vb128, _vc128, _va128);
}
return _va128;
}
[Benchmark]
// This is 10x faster than VectorNoise and MathNoise, but writing a burst like compiler to compile MathNoise into this is incredibly hard.
public unsafe void MathVNoise()
public float4 floatFMA()
{
var buf = stackalloc float[_SIZE * _SIZE];
var job = new Misaki.HighPerformance.Test.Jobs.NoiseJobMathV
for (var i = 0; i < 10; i++)
{
buffers = buf,
width = _SIZE,
height = _SIZE,
};
for (var i = 0; i < _SIZE * _SIZE / 8; i++)
{
job.Execute(i, 0);
_fa = _fb * _fc + _fa;
}
return _fa;
}
#endif
#if MATRIX_BENCHMARK

View File

@@ -0,0 +1,98 @@
using BenchmarkDotNet.Attributes;
using Misaki.HighPerformance.Jobs;
using Misaki.HighPerformance.Mathematics.SPMD;
using System.Runtime.InteropServices;
namespace Misaki.HighPerformance.Test.Benchmark;
public unsafe class SPMDBenchmark
{
private const int _SIZE = 512;
private JobScheduler _scheduler = null!;
private float* _buf;
[GlobalSetup]
public void Setup()
{
_scheduler = new JobScheduler(Environment.ProcessorCount);
_buf = (float*)NativeMemory.Alloc(sizeof(float) * _SIZE * _SIZE);
}
[GlobalCleanup]
public void Cleanup()
{
_scheduler.Dispose();
NativeMemory.Free(_buf);
}
[Benchmark]
public void VectorNoiseSingleThread()
{
var job = new Jobs.NoiseJobVector
{
buffers = _buf,
width = _SIZE,
height = _SIZE,
};
job.Run(_SIZE * _SIZE, 0);
}
//[Benchmark]
public void VectorNoise()
{
var job = new Jobs.NoiseJobVector
{
buffers = _buf,
width = _SIZE,
height = _SIZE,
};
var handle = _scheduler.ScheduleParallel(ref job, _SIZE * _SIZE, 64);
_scheduler.WaitComplete(handle);
}
//[Benchmark]
public void MathNoise()
{
var job = new Jobs.NoiseJobMath
{
buffers = _buf,
width = _SIZE,
height = _SIZE,
};
var handle = _scheduler.ScheduleParallel(ref job, _SIZE * _SIZE, 64);
_scheduler.WaitComplete(handle);
}
//[Benchmark(Baseline = true)]
public void ManualSPMDNoise()
{
var job = new Jobs.NoiseJobMathV
{
buffers = _buf,
width = _SIZE,
height = _SIZE,
};
var iterations = (_SIZE * _SIZE + 8 - 1) / 8;
var handle = _scheduler.ScheduleParallel(ref job, iterations, 64);
_scheduler.WaitComplete(handle);
}
[Benchmark(Baseline = true)]
public void SPMDNoise()
{
var job = new Jobs.NoiseJobMathSPMD
{
buffers = _buf,
width = _SIZE,
height = _SIZE,
};
var handle = _scheduler.ScheduleParallelSPDM<Jobs.NoiseJobMathSPMD, float>(ref job, _SIZE * _SIZE, 64, -1, JobHandle.Invalid);
_scheduler.WaitComplete(handle);
}
}

View File

@@ -1,9 +1,9 @@
using Misaki.HighPerformance.Jobs;
using Misaki.HighPerformance.Mathematics;
using Misaki.HighPerformance.Mathematics.SPMD;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace Misaki.HighPerformance.Test.Jobs;
@@ -16,7 +16,7 @@ internal unsafe struct NoiseJobVector : IJobParallelFor
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static float Frac(float x)
{
return x - MathF.Truncate(x);
return x - MathF.Floor(x);
}
private static Vector2 GradientNoiseDirect(Vector2 uv)
@@ -101,7 +101,7 @@ internal unsafe struct NoiseJobMathV : IJobParallelFor
private static Vector256<float> Mod289(Vector256<float> x)
{
var div = x / Vector256.Create(289.0f);
var flr = Vector256.Floor(div);
var flr = Vector256.Truncate(div);
return x - (flr * Vector256.Create(289.0f));
}
@@ -119,12 +119,13 @@ internal unsafe struct NoiseJobMathV : IJobParallelFor
var hy = Mod289(iy);
var p = hx * Vector256.Create(34.0f) + Vector256.Create(1.0f);
p = Mod289(p * hx + hy);
p = Mod289(p * hx) + hy;
var pPrev = p;
p = p * Vector256.Create(34.0f) + Vector256.Create(1.0f);
p = Mod289(p * hx);
p = Mod289(p * pPrev);
var r = (p / 41.0f);
r = (r - Vector256.Floor(r)) * 2.0f - Vector256<float>.One;
r = (r - Vector256.Truncate(r)) * 2.0f - Vector256<float>.One;
var gx = r - Vector256.Floor(r + Vector256.Create(0.5f));
var gy = Vector256.Abs(r) - Vector256.Create(0.5f);
@@ -153,10 +154,10 @@ internal unsafe struct NoiseJobMathV : IJobParallelFor
var d10 = GradDot(ipX + Vector256<float>.One, ipY, fpX - Vector256<float>.One, fpY);
var d11 = GradDot(ipX + Vector256<float>.One, ipY + Vector256<float>.One, fpX - Vector256<float>.One, fpY - Vector256<float>.One);
var lerpX1 = d00 + (d10 - d00) * uX;
var lerpX2 = d01 + (d11 - d01) * uX;
var lerpY1 = d00 + (d10 - d00) * uY;
var lerpY2 = d01 + (d11 - d01) * uY;
return lerpX1 + (lerpX2 - lerpX1) * uY;
return lerpY1 + (lerpY2 - lerpY1) * uX;
}
public void Execute(int loopIndex, int threadIndex)
@@ -164,15 +165,17 @@ internal unsafe struct NoiseJobMathV : IJobParallelFor
// ---------------------------------------------------------
// IMPORTANT: Loop Stride is now 8!
// ---------------------------------------------------------
int baseIndex = loopIndex * 8;
var baseIndex = loopIndex * 8;
// Safety check
if (baseIndex + 7 >= width * height)
{
return;
}
// Calculate Coords
int y = baseIndex / width;
int x = baseIndex % width;
var y = baseIndex / width;
var x = baseIndex % width;
// Sequence: 0, 1, 2, 3, 4, 5, 6, 7
var vSeqX = Vector256.Create(0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f);
@@ -185,6 +188,81 @@ internal unsafe struct NoiseJobMathV : IJobParallelFor
var result = GradientNoiseAVX(vBaseX / vWidth, vBaseY / vHeight);
// Store 8 floats (32 bytes)
Avx.Store(buffers + baseIndex, result);
result.Store(buffers + baseIndex);
}
}
internal unsafe struct NoiseJobMathSPMD : IJobSPMD<float>
{
public float* buffers;
public int width;
public int height;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static T GradDot<T>(T ix, T iy, T fx, T fy)
where T : ISPMD<T, float>
{
var c289 = T.Create(289f);
var c34 = T.Create(34f);
var c1 = T.Create(1f);
var c41 = T.Create(41f);
var c2 = T.Create(2f);
var half = T.Create(0.5f);
ix %= c289;
iy %= c289;
var x = (c34 * ix + c1) * ix % c289 + iy;
x = (c34 * x + c1) * x % c289;
x = T.Frac(x / c41) * c2 - c1;
var gx = x - T.Floor(x + half);
var gy = T.Abs(x) - half;
// normalize
var len = T.Sqrt(gx * gx + gy * gy);
gx /= len;
gy /= len;
return gx * fx + gy * fy;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static T Noise<T>(T uvX, T uvY)
where T : ISPMD<T, float>
{
var c1 = T.Create(1f);
var c6 = T.Create(6f);
var c10 = T.Create(10f);
var c15 = T.Create(15f);
var ipX = T.Floor(uvX);
var ipY = T.Floor(uvY);
var fpX = uvX - ipX;
var fpY = uvY - ipY;
var d00 = GradDot(ipX, ipY, fpX, fpY);
var d01 = GradDot(ipX, ipY + c1, fpX, fpY - c1);
var d10 = GradDot(ipX + c1, ipY, fpX - c1, fpY);
var d11 = GradDot(ipX + c1, ipY + c1, fpX - c1, fpY - c1);
// fade
var uX = fpX * fpX * fpX * (fpX * (fpX * c6 - c15) + c10);
var uY = fpY * fpY * fpY * (fpY * (fpY * c6 - c15) + c10);
return T.Lerp(T.Lerp(d00, d10, uY), T.Lerp(d01, d11, uY), uX);
}
public readonly void Execute<TLane>(int baseIndex, int threadIndex)
where TLane : ISPMD<TLane, float>
{
var indices = TLane.Sequence(baseIndex, 1f);
var w = TLane.Create(width);
var h = TLane.Create(height);
var uvX = (indices % w) / w;
var uvY = TLane.Floor(indices / w) / h;
var result = Noise(uvX, uvY);
result.Store(buffers + baseIndex);
}
}

View File

@@ -24,6 +24,7 @@
<ProjectReference Include="..\Misaki.HighPerformance.Image\Misaki.HighPerformance.Image.csproj" />
<ProjectReference Include="..\Misaki.HighPerformance.Jobs\Misaki.HighPerformance.Jobs.csproj" />
<ProjectReference Include="..\Misaki.HighPerformance.LowLevel\Misaki.HighPerformance.LowLevel.csproj" />
<ProjectReference Include="..\Misaki.HighPerformance.Mathematics.SPMD\Misaki.HighPerformance.Mathematics.SPMD.csproj" />
<ProjectReference Include="..\Misaki.HighPerformance.Mathematics\Misaki.HighPerformance.Mathematics.csproj" />
<ProjectReference Include="..\Misaki.HighPerformance\Misaki.HighPerformance.csproj" />
<ProjectReference Include="..\Misaki.HighPerformance.Analyzer\Misaki.HighPerformance.Analyzer\Misaki.HighPerformance.Analyzer.csproj" OutputItemType="Analyzer" ReferenceOutputAssembly="false" />

View File

@@ -1,9 +1,15 @@
using Misaki.HighPerformance;
using Misaki.HighPerformance.Jobs;
using Misaki.HighPerformance.LowLevel;
using Misaki.HighPerformance.LowLevel.Utilities;
using Misaki.HighPerformance.Mathematics.SPMD;
using Misaki.HighPerformance.Test.UnitTest.Jobs;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Text;
BenchmarkDotNet.Running.BenchmarkRunner.Run<Misaki.HighPerformance.Test.Benchmark.MathematicsBenchmark>();
BenchmarkDotNet.Running.BenchmarkRunner.Run<Misaki.HighPerformance.Test.Benchmark.SPMDBenchmark>();
//return;
//using Misaki.HighPerformance.Collections;
//using Misaki.HighPerformance.LowLevel.Buffer;
//using Misaki.HighPerformance.LowLevel.Collections;

View File

@@ -0,0 +1,114 @@
using Misaki.HighPerformance.Mathematics.SPMD;
using System.Numerics;
namespace Misaki.HighPerformance.Test.UnitTest.Jobs;
public static class CompressStoreTest
{
public static void Run()
{
Console.WriteLine("--- Testing CompressStore (Double) ---");
// Test 1: Simple Pattern (True, False, True, False...)
TestPattern_Double(
input: new double[] { 1, 2, 3, 4, 5, 6, 7, 8 },
// Mask: Keep only even numbers (values > 0)
// We simulate a mask by comparing against 0 or -1
keepPattern: new bool[] { true, false, true, false, true, false, true, false }
);
// Test 2: All True
TestPattern_Double(
input: new double[] { 10, 20, 30, 40, 50, 60, 70, 80 },
keepPattern: new bool[] { true, true, true, true, true, true, true, true }
);
// Test 3: All False
TestPattern_Double(
input: new double[] { 10, 20, 30, 40, 50, 60, 70, 80 },
keepPattern: new bool[] { false, false, false, false, false, false, false, false }
);
// Test 4: Sparse (First and Last only)
TestPattern_Double(
input: new double[] { 1, 2, 3, 4, 5, 6, 7, 8 },
keepPattern: new bool[] { true, false, false, false, false, false, false, true }
);
}
private unsafe static void TestPattern_Double(double[] input, bool[] keepPattern)
{
// 1. Setup Input Vector
// Handle case where Vector<T> is smaller than 8 (e.g. 2 or 4)
var vecSize = Vector<double>.Count;
var safeInput = new double[vecSize];
var safeMaskVal = new double[vecSize];
// Expected Output Calculation
var expected = new double[vecSize];
var expectedCount = 0;
for (var i = 0; i < vecSize; i++)
{
safeInput[i] = input[i];
// If we want to keep it, make mask "GreaterThan" true
// We'll compare X > 0.
// If keep=true, val=1. If keep=false, val=-1.
safeMaskVal[i] = keepPattern[i] ? 1 : -1;
if (keepPattern[i])
{
expected[expectedCount++] = input[i];
}
}
// 2. Create WideLanes
var vInput = WideLane<double>.Load(ref safeInput.AsSpan().GetPinnableReference());
// Create Mask: greater than 0
var vMaskVal = WideLane<double>.Load(ref safeMaskVal.AsSpan().GetPinnableReference());
var vZero = WideLane<double>.Create(0);
var vMask = WideLane<double>.GreaterThan(vMaskVal, vZero);
// 3. Run CompressStore
var outputBuffer = new double[vecSize];
var actualCount = 0;
fixed (double* ptr = outputBuffer)
{
actualCount = vInput.CompressStore(vMask, ptr);
}
// 4. Verify
var pass = actualCount == expectedCount;
for (var i = 0; i < expectedCount; i++)
{
if (outputBuffer[i] != expected[i])
pass = false;
}
// 5. Report
var hardware = (vecSize == 4) ? "AVX2 (256-bit)" : (vecSize == 2) ? "SSE/NEON (128-bit)" : "Scalar";
Console.Write($"[{hardware}] Pattern: ");
for (var i = 0; i < vecSize; i++)
Console.Write(keepPattern[i] ? "1" : "0");
if (pass)
{
Console.WriteLine($" -> PASS (Count: {actualCount})");
}
else
{
Console.WriteLine($" -> FAIL!");
Console.WriteLine($" Expected Count: {expectedCount}, Actual: {actualCount}");
Console.Write(" Expected Data: ");
foreach (var d in expected)
Console.Write($"{d} ");
Console.WriteLine();
Console.Write(" Actual Data: ");
foreach (var d in outputBuffer)
Console.Write($"{d} ");
Console.WriteLine();
}
}
}

View File

@@ -1,19 +1,28 @@
using Misaki.HighPerformance.Jobs;
using Misaki.HighPerformance.Jobs;
using Misaki.HighPerformance.LowLevel.Buffer;
using Misaki.HighPerformance.LowLevel.Collections;
using Misaki.HighPerformance.LowLevel.Utilities;
using Misaki.HighPerformance.Mathematics.SPMD;
using System.Runtime.InteropServices;
namespace Misaki.HighPerformance.Test.UnitTest.Jobs;
[TestClass]
[DoNotParallelize]
public unsafe class TestJobSystem
{
private JobScheduler _jobScheduler = null!;
public TestContext TestContext
{
get;
set;
}
[TestInitialize]
public void Initialize()
{
_jobScheduler = new JobScheduler(Environment.ProcessorCount);
_jobScheduler = new JobScheduler(3);
}
[TestCleanup]
@@ -251,4 +260,102 @@ public unsafe class TestJobSystem
Assert.AreEqual(JobState.Completed, _jobScheduler.GetJobStatus(completedHandle));
}
[TestMethod]
public void RaceConditionTest()
{
const int jobCount = 20000;
var pExecutedCount = (int*)NativeMemory.Alloc(sizeof(int));
*pExecutedCount = 0;
var startSignal = false;
// 1. Create a "Gatekeeper" vectorJob that spins/blocks a worker thread until signaled.
// This allows us to control exactly when the dependency completes.
var rootJob = new WaitJob { pSignal = &startSignal };
var rootHandle = _jobScheduler.Schedule(ref rootJob);
// 2. Start a background task to flood the scheduler with dependencies on the Gatekeeper.
using var barrier = new Barrier(2);
var scheduleTask = Task.Run(() =>
{
var depJob = new IncrementJob { pCounter = pExecutedCount };
barrier.SignalAndWait(TestContext.CancellationTokenSource.Token); // Synchronize start with main thread
for (var i = 0; i < jobCount; i++)
{
// CONTENTION POINT:
// Trying to add a dependency to 'rootHandle'.
// Eventually, this will happen exactly while 'rootHandle' is transitioning to Completed.
_jobScheduler.Schedule(ref depJob, rootHandle);
}
}, TestContext.CancellationTokenSource.Token);
barrier.SignalAndWait(TestContext.CancellationTokenSource.Token); // Wait for scheduler task to be ready
// Allow the scheduling loop to get a head start and queue some readers
Thread.Sleep(5);
// 3. Open the gate.
// This triggers the Gatekeeper to complete. It will change its State and iterate its dependency list.
// This happens CONCURRENTLY with the loop above adding more items to that same list.
startSignal = true;
scheduleTask.Wait(TestContext.CancellationTokenSource.Token);
// 4. Validate results
// If the lock-free logic works, every single dependent vectorJob must eventually execute.
// If there is a race (e.g., missed notification), pExecutedCount will stick below jobCount.
var spin = new SpinWait();
var timeout = DateTime.Now.AddSeconds(10);
while (Volatile.Read(ref *pExecutedCount) < jobCount)
{
if (DateTime.Now > timeout)
{
break;
}
spin.SpinOnce();
}
// Ensure the root vectorJob is officially cleaned up
_jobScheduler.WaitComplete(rootHandle);
Assert.AreEqual(jobCount, *pExecutedCount, "Race condition detected: Some dependent jobs failed to execute (Wait timeout).");
NativeMemory.Free(pExecutedCount);
}
[TestMethod]
public void SPMDCorrectness()
{
const int size = 8;
var vectorBuf = stackalloc float[size * size];
var vs = new Span<float>(vectorBuf, size * size);
var vectorJob = new Misaki.HighPerformance.Test.Jobs.NoiseJobVector
{
buffers = vectorBuf,
width = size,
height = size,
};
vectorJob.Run(size * size, -1);
var spmdBuf = stackalloc float[size * size];
var ss = new Span<float>(spmdBuf, size * size);
var spmdJob = new Misaki.HighPerformance.Test.Jobs.NoiseJobMath
{
buffers = spmdBuf,
width = size,
height = size,
};
spmdJob.Run(size * size, -1);
var eq = vs.SequenceCompareTo(ss);
Assert.AreEqual(0, eq);
}
}

View File

@@ -1,4 +1,4 @@
using Misaki.HighPerformance.Jobs;
using Misaki.HighPerformance.Jobs;
namespace Misaki.HighPerformance.Test.UnitTest.Jobs;
@@ -70,4 +70,28 @@ internal unsafe struct ParallelMultiplyJob : IJobParallelFor
{
inout[loopIndex] *= multiplier;
}
}
public unsafe struct WaitJob : IJob
{
public bool* pSignal;
public void Execute(int loopIndex)
{
var spin = new SpinWait();
while (!Volatile.Read(ref *pSignal))
{
spin.SpinOnce();
}
}
}
public unsafe struct IncrementJob : IJob
{
public int* pCounter;
public void Execute(int loopIndex)
{
Interlocked.Increment(ref *pCounter);
}
}