Refactor SPMD jobs for true vectorized/masked execution

- Change IJobSPMD.Execute to (indices, mask, ctx) signature for all arities, enabling proper vectorized/masked SPMD execution.
- Update all SPMD job wrappers, extension methods, and test jobs to use new interface.
- Add AVX2 gather/masked gather support to MathV.GatherVector2/3/4 and related methods; use [ConstantExpected] byte scale.
- Improve gather/select logic, pointer arithmetic, and overloads for ref/int* index access.
- Refactor GGXMipGenerationBenchmark and jobs for SPMD, with per-mip-level vectorized jobs and improved memory access.
- Clean up code, fix naming, update comments, and bump version to 1.3.6.
This commit is contained in:
2026-05-03 23:32:04 +09:00
parent 4ffb41e210
commit 99fcbec753
14 changed files with 1965 additions and 605 deletions

View File

@@ -2,7 +2,6 @@ using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Engines;
using Misaki.HighPerformance.Image;
using Misaki.HighPerformance.Jobs;
using Misaki.HighPerformance.Mathematics;
using Misaki.HighPerformance.Mathematics.SPMD;
using SkiaSharp;
using System.Runtime.CompilerServices;
@@ -23,29 +22,180 @@ internal unsafe struct MipLevel
public float roughness;
}
//internal unsafe struct GGXMipGenerationJobSPMD : IJobParallelFor
internal unsafe struct GGXMipGenerationJobSPMD : IJobSPMD<float, int>
{
public ImageResultFloat image;
public MipLevel mipLevel;
public float* radicalInverse_VdCLut;
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector2<TFloat, float> Hammersley<TFloat>(int i, uint N, float* lut)
where TFloat : unmanaged, ISPMDLane<TFloat, float>
{
var x = i / N;
var y = TFloat.Load(lut + i);
return MathV.Create<TFloat, float>(x, y);
}
// --- GGX Importance Sampling ---
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector3<TFloat, float> ImportanceSampleGGX<TFloat>(Vector2<TFloat, float> Xi, Vector3<TFloat, float> N, float roughness)
where TFloat : unmanaged, ISPMDLane<TFloat, float>
{
var a = roughness * roughness; // Disney/Epic remap roughness for better visual linearity
var phi = 2.0f * PI * Xi.x;
var cosTheta = TFloat.Sqrt((1.0f - Xi.y) / TFloat.MultipleAdd(a * a - 1.0f, Xi.y, 1.0f));
var sinTheta = TFloat.Sqrt(1.0f - cosTheta * cosTheta);
// Spherical to Cartesian coordinates (Halfway vector)
TFloat.SinCos(phi, out var sinPhi, out var cosPhi);
var H = MathV.Create<TFloat, float>(cosPhi * sinTheta, sinPhi * sinTheta, cosTheta);
// Tangent space to World space
var mask = TFloat.Abs(N.z) < 0.999f;
var up = MathV.Select(mask, MathV.Create<TFloat, float>(0.0f, 0.0f, 1.0f), MathV.Create<TFloat, float>(1.0f, 0.0f, 0.0f));
var tangent = MathV.Normalize(MathV.Cross(up, N));
var bitangent = MathV.Cross(N, tangent);
var sampleVec = (tangent * H.x) + (bitangent * H.y) + (N * H.z);
return MathV.Normalize(sampleVec);
}
// --- Image Sampling Helpers ---
// Maps a 3D direction vector to 2D equirectangular UVs
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector2<TFloat, float> DirToEquirectangularUV<TFloat>(Vector3<TFloat, float> dir)
where TFloat : unmanaged, ISPMDLane<TFloat, float>
{
var u = TFloat.Atan2(dir.z, dir.x);
var v = TFloat.Asin(dir.y);
u = u / (2.0f * PI) + 0.5f;
v = v / PI + 0.5f;
return MathV.Create<TFloat, float>(u, v);
}
// Samples the source HDR image using bilinear interpolation (simplified to nearest neighbor for brevity here)
// Do not inline this function to avoid register pressure and code bloat in the main sampling loop, as this is a relatively heavy operation and we want to keep it separate for better instruction cache usage.
[MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector3<TFloat, float> SampleEquirectangularMap<TFloat, TInt>(float* img, int w, int h, Vector3<TFloat, float> dir, TFloat mask)
where TFloat : unmanaged, ISPMDLane<TFloat, float>
where TInt : unmanaged, ISPMDLane<TInt, int>
{
var uv = DirToEquirectangularUV(dir);
// Nearest neighbor pixel coordinates
var px = (uv.x * (w - 1.0f)).Cast<TInt, int>();
var py = (uv.y * (h - 1.0f)).Cast<TInt, int>();
// Clamp
px = TInt.Clamp(px, TInt.Zero, w - 1);
py = TInt.Clamp(py, TInt.Zero, h - 1);
// Assuming float RGB array format
var idx = (py * w + px) * 3;
return MathV.MaskGatherVector3<TFloat, float>(img, idx.GetUnsafePtr(), mask, 4);
}
public readonly void Execute<TFloat, TInt>(TFloat indices, TFloat mask, ref readonly JobExecutionContext ctx)
where TFloat : unmanaged, ISPMDLane<TFloat, float>
where TInt : unmanaged, ISPMDLane<TInt, int>
{
var w = (int)mipLevel.width;
var h = (int)mipLevel.height;
var pData = mipLevel.data;
var indexSequence = indices.Cast<TInt, int>();
var x = indexSequence % w;
var y = indexSequence / w;
var u = x.Cast<TFloat, float>() / (w - 1);
var v = y.Cast<TFloat, float>() / (h - 1);
var phi = (u - 0.5f) * 2.0f * PI;
var theta = (v - 0.5f) * PI;
TFloat.SinCos(theta, out var sinTheta, out var cosTheta);
TFloat.SinCos(phi, out var sinPhi, out var cosPhi);
var N = MathV.Create<TFloat, float>(cosTheta * cosPhi, sinTheta, cosTheta * sinPhi);
N = MathV.Normalize(N);
// For split-sum, we assume View and Reflection directions equal the Normal
var V = N;
var R = N;
var prefilteredColor = Vector3<TFloat, float>.Zero;
var totalWeight = TFloat.Zero;
// 3. Monte Carlo Integration Loop
var dynamicSampleCount = (uint)max(1.0f, GGXMipGenerationBenchmark.SAMPLE_COUNT * mipLevel.roughness);
var vDynamicSampleCount = TFloat.Create(dynamicSampleCount);
var lumaVector = MathV.Create<TFloat, float>(0.2126f, 0.7152f, 0.0722f);
for (var i = 0; i < dynamicSampleCount; i++)
{
// Generate a Hammersley random sequence point
var Xi = Hammersley<TFloat>(i, dynamicSampleCount, radicalInverse_VdCLut);
// Get the halfway vector based on GGX NDF
var H = ImportanceSampleGGX(Xi, N, mipLevel.roughness);
// Calculate Light direction
var L = MathV.Reflect(-V, H);
L = MathV.Normalize(L);
var NdotL = TFloat.Max(MathV.Dot(N, L), TFloat.Zero);
var sampleColor = SampleEquirectangularMap<TFloat, TInt>(image.Data, (int)image.Width, (int)image.Height, L, mask);
// The Karis Average Weight: 1 / (1 + luma)
// A normal sky pixel (luma 1.0) gets a weight of 0.5.
// A sun pixel (luma 1000.0) gets a tiny weight of ~0.001, naturally suppressing it.
// This introduce bias, but significantly reduces fireflies without needing solid angle sampling or cdf inversion.
// And since this is a mip generation step, a little bias is acceptable for much better performance and stability.
var luma = MathV.Dot(sampleColor, lumaVector);
var fireflyWeight = TFloat.One / (TFloat.One + luma);
var finalWeight = NdotL * fireflyWeight;
prefilteredColor += sampleColor * finalWeight;
totalWeight += finalWeight;
}
prefilteredColor = MathV.Select(totalWeight > 0.0f, prefilteredColor * TFloat.Rcp(totalWeight), prefilteredColor);
// Write to output mip array
var out_idx = (y * w + x) * 3;
// TODO: Optimize this
for (var i = 0; i < TFloat.LaneWidth; i++)
{
if (mask[i] == 0.0f)
{
continue;
}
var idx = out_idx[i];
pData[idx] = prefilteredColor.x[i];
pData[idx + 1] = prefilteredColor.y[i];
pData[idx + 2] = prefilteredColor.z[i];
}
}
}
internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
where TFloat : unmanaged, ISPMDLane<TFloat, float>
where TInt : unmanaged, ISPMDLane<TInt, int>
{
public const uint SAMPLE_COUNT = 1024u;
public ImageResultFloat image;
public MipLevel* pMipLevels;
public float* radicalInverse_VdCLut;
public int numMipLevels;
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
public static float RadicalInverse_VdC(uint bits)
{
bits = (bits << 16) | (bits >> 16);
bits = ((bits & 0x55555555u) << 1) | ((bits & 0xAAAAAAAAu) >> 1);
bits = ((bits & 0x33333333u) << 2) | ((bits & 0xCCCCCCCCu) >> 2);
bits = ((bits & 0x0F0F0F0Fu) << 4) | ((bits & 0xF0F0F0F0u) >> 4);
bits = ((bits & 0x00FF00FFu) << 8) | ((bits & 0xFF00FF00u) >> 8);
return bits * 2.3283064365386963e-10f; // bits / 0x100000000
}
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
private static Vector2<TFloat, float> Hammersley(TFloat i, uint N, float* lut)
{
@@ -110,7 +260,7 @@ internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
// Assuming float RGB array format
var idx = (py * w + px) * 3;
return MathV.GatherVector3<TFloat, float>(img, idx.GetUnsafePtr(), 1);
return MathV.GatherVector3<TFloat, float>(img, idx.GetUnsafePtr(), 4);
}
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
@@ -122,7 +272,6 @@ internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
m++;
}
var span = new ReadOnlySpan<MipLevel>(pMipLevels, numMipLevels);
var pLevel = &pMipLevels[m];
var w = (int)pLevel->width;
@@ -164,7 +313,7 @@ internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
// 3. Monte Carlo Integration Loop
var dynamicSampleCount = (uint)max(1.0f, SAMPLE_COUNT * pLevel->roughness);
var dynamicSampleCount = (uint)max(1.0f, GGXMipGenerationBenchmark.SAMPLE_COUNT * pLevel->roughness);
var vDynamicSampleCount = TFloat.Create(dynamicSampleCount);
var vLumaVector = MathV.Create<TFloat, float>(0.2126f, 0.7152f, 0.0722f);
@@ -226,15 +375,11 @@ internal unsafe struct GGXMipGenerationJobSPMD<TFloat, TInt> : IJobParallelFor
}
}
internal unsafe struct GGXMipGenerationJob : IJobParallelFor
[SimpleJob(RunStrategy.ColdStart, launchCount: 1, warmupCount: 0, iterationCount: 1, invocationCount: 1, id: "QuickRun")]
public unsafe class GGXMipGenerationBenchmark
{
public const uint SAMPLE_COUNT = 1024u;
public ImageResultFloat image;
public MipLevel* pMipLevels;
public float* radicalInverse_VdCLut;
public int numMipLevels;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static float RadicalInverse_VdC(uint bits)
{
@@ -246,143 +391,6 @@ internal unsafe struct GGXMipGenerationJob : IJobParallelFor
return bits * 2.3283064365386963e-10f; // bits / 0x100000000
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static float2 Hammersley(uint i, uint N, float* lut)
{
return float2((float)i / N, lut[i]);
}
// --- GGX Importance Sampling ---
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static float3 ImportanceSampleGGX(float2 Xi, float3 N, float roughness)
{
var a = roughness * roughness; // Disney/Epic remap roughness for better visual linearity
var phi = 2.0f * PI * Xi.x;
var cosTheta = sqrt((1.0f - Xi.y) / (1.0f + (a * a - 1.0f) * Xi.y));
var sinTheta = sqrt(1.0f - cosTheta * cosTheta);
// Spherical to Cartesian coordinates (Halfway vector)
sincos(phi, out var sinPhi, out var cosPhi);
var H = float3(cosPhi * sinTheta, sinPhi * sinTheta, cosTheta);
// Tangent space to World space
var up = abs(N.z) < 0.999f ? float3(0.0f, 0.0f, 1.0f) : float3(1.0f, 0.0f, 0.0f);
var tangent = normalize(cross(up, N));
var bitangent = cross(N, tangent);
var sampleVec = (tangent * H.x) + (bitangent * H.y) + (N * H.z);
return normalize(sampleVec);
}
// --- Image Sampling Helpers ---
// Maps a 3D direction vector to 2D equirectangular UVs
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static float2 DirToEquirectangularUV(float3 dir)
{
var uv = float2(atan2(dir.z, dir.x), asin(dir.y));
uv.x = uv.x / (2.0f * PI) + 0.5f;
uv.y = uv.y / PI + 0.5f;
return uv;
}
// Samples the source HDR image using bilinear interpolation (simplified to nearest neighbor for brevity here)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static float3 SampleEquirectangularMap(float* img, int w, int h, float3 dir)
{
var uv = DirToEquirectangularUV(dir);
// Nearest neighbor pixel coordinates
var px = (int)(uv.x * (w - 1));
var py = (int)(uv.y * (h - 1));
// Clamp
px = clamp(px, 0, w - 1);
py = clamp(py, 0, h - 1);
// Assuming float RGB array format
var idx = (py * w + px) * 3;
return float3(img[idx], img[idx + 1], img[idx + 2]);
}
public void Execute(int loopIndex, ref readonly JobExecutionContext ctx)
{
var m = 0;
while (m < numMipLevels - 1 && loopIndex >= pMipLevels[m + 1].offset)
{
m++;
}
var pLevel = &pMipLevels[m];
var w = (int)pLevel->width;
var h = (int)pLevel->height;
var pData = pLevel->data;
var local_i = loopIndex - pLevel->offset;
var x = local_i % w;
var y = local_i / w;
var u = (float)x / (w - 1);
var v = (float)y / (h - 1);
var phi = (u - 0.5f) * 2.0f * PI;
var theta = (v - 0.5f) * PI;
sincos(theta, out var sinTheta, out var cosTheta);
sincos(phi, out var sinPhi, out var cosPhi);
var N = float3(cosTheta * cosPhi, sinTheta, cosTheta * sinPhi);
N = normalize(N);
// For split-sum, we assume View and Reflection directions equal the Normal
var V = N;
var R = N;
var prefilteredColor = float3(0, 0, 0);
var totalWeight = 0.0f;
// 3. Monte Carlo Integration Loop
var dynamicSampleCount = (uint)max(1.0f, SAMPLE_COUNT * pLevel->roughness);
for (var i = 0u; i < dynamicSampleCount; i++)
{
// Generate a Hammersley random sequence point
var Xi = Hammersley(i, dynamicSampleCount, radicalInverse_VdCLut);
// Get the halfway vector based on GGX NDF
var H = ImportanceSampleGGX(Xi, N, pLevel->roughness);
// Calculate Light direction
var L = reflect(-V, H);
L = normalize(L);
var NdotL = max(dot(N, L), 0.0f);
// If light is above the horizon
if (NdotL > 0.0f)
{
var sampleColor = SampleEquirectangularMap(image.Data, (int)image.Width, (int)image.Height, L);
prefilteredColor += sampleColor * NdotL;
totalWeight += NdotL;
}
}
// 4. Average the result
if (totalWeight > 0.0f)
{
prefilteredColor *= 1.0f / totalWeight;
}
// Write to output mip array
var out_idx = (y * w + x) * 3;
pData[out_idx] = prefilteredColor.x;
pData[out_idx + 1] = prefilteredColor.y;
pData[out_idx + 2] = prefilteredColor.z;
}
}
[SimpleJob(RunStrategy.ColdStart, launchCount: 1, warmupCount: 0, iterationCount: 1, invocationCount: 1, id: "QuickRun")]
public unsafe class GGXMipGenerationBenchmark
{
private ImageResultFloat _image;
private int _mipLevels;
private int _totalPixel;
@@ -395,12 +403,12 @@ public unsafe class GGXMipGenerationBenchmark
[GlobalSetup]
public void Setup()
{
const string imagePath = "F:\\c\\SimpleRayTracer\\native\\assets\\hdri\\golden_gate_hills_1k.hdr";
//const string imagePath = "C:\\Users\\Misaki\\Downloads\\grasslands_sunset_4k.hdr";
//const string imagePath = "F:\\c\\SimpleRayTracer\\native\\assets\\hdri\\golden_gate_hills_1k.hdr";
const string imagePath = "C:\\Users\\Misaki\\Downloads\\grasslands_sunset_4k.hdr";
using var stream = new FileStream(imagePath, FileMode.Open, FileAccess.Read);
_image = ImageResultFloat.FromStream(stream, ColorComponents.RGB);
_mipLevels = (int)Math.Floor(Math.Log2(Math.Max(_image.Width, _image.Height))) + 1;
_mipLevels = (int)MathF.Floor(MathF.Log2(Math.Max(_image.Width, _image.Height))) + 1;
_pResult = (float**)NativeMemory.Alloc((nuint)(_mipLevels * sizeof(float*)));
_pMipLevels = (MipLevel*)NativeMemory.Alloc((nuint)(_mipLevels * sizeof(MipLevel)));
@@ -434,10 +442,10 @@ public unsafe class GGXMipGenerationBenchmark
ThreadPriority = ThreadPriority.Normal,
};
_radicalInverse_VdCLut = (float*)NativeMemory.Alloc(GGXMipGenerationJob.SAMPLE_COUNT * sizeof(float));
for (var i = 0u; i < GGXMipGenerationJob.SAMPLE_COUNT; i++)
_radicalInverse_VdCLut = (float*)NativeMemory.Alloc(SAMPLE_COUNT * sizeof(float));
for (var i = 0u; i < SAMPLE_COUNT; i++)
{
_radicalInverse_VdCLut[i] = GGXMipGenerationJob.RadicalInverse_VdC(i);
_radicalInverse_VdCLut[i] = RadicalInverse_VdC(i);
}
_jobScheduler = new JobScheduler(in desc);
@@ -490,7 +498,7 @@ public unsafe class GGXMipGenerationBenchmark
[GlobalCleanup]
public void Cleanup()
{
#if true
#if false
for (var i = 0; i < _mipLevels; i++)
{
DumpMipLevelToPng(_pResult[i], (int)_pMipLevels[i].width, (int)_pMipLevels[i].height, $"C:\\Users\\Misaki\\Downloads\\Im\\mip_level_{i}.png");
@@ -542,37 +550,43 @@ public unsafe class GGXMipGenerationBenchmark
_jobScheduler.Wait(handle);
}
//[Benchmark]
public void ParallelGGX()
{
var job = new GGXMipGenerationJob
{
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
radicalInverse_VdCLut = _radicalInverse_VdCLut
};
Parallel.For(0, _totalPixel, new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount - 1 }, i =>
{
var localJob = job;
var ctx = new JobExecutionContext();
localJob.Execute(i, in ctx);
});
}
[Benchmark]
public void SingleThreadGGX()
public void MipLevelJobGGX()
{
var job = new GGXMipGenerationJob
var pJobHandles = (Span<JobHandle>)stackalloc JobHandle[_mipLevels];
for (var m = 0; m < _mipLevels; m++)
{
image = _image,
pMipLevels = _pMipLevels,
numMipLevels = _mipLevels,
radicalInverse_VdCLut = _radicalInverse_VdCLut
};
var job = new GGXMipGenerationJobSPMD
{
image = _image,
mipLevel = _pMipLevels[m],
radicalInverse_VdCLut = _radicalInverse_VdCLut
};
var ctx = new JobExecutionContext();
job.Run(_totalPixel, in ctx);
var totalPixel = (int)(_pMipLevels[m].width * _pMipLevels[m].height);
var handle = _jobScheduler.ScheduleParallelSPDM<GGXMipGenerationJobSPMD, float, int>(ref job, totalPixel, 64, false, JobPriority.Normal);
pJobHandles[m] = handle;
}
_jobScheduler.WaitAll(pJobHandles);
}
//[Benchmark]
//public void ParallelGGX()
//{
// var job = new GGXMipGenerationJob
// {
// image = _image,
// pMipLevels = _pMipLevels,
// numMipLevels = _mipLevels,
// radicalInverse_VdCLut = _radicalInverse_VdCLut
// };
// Parallel.For(0, _totalPixel, new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount - 1 }, i =>
// {
// var localJob = job;
// var ctx = new JobExecutionContext();
// localJob.Execute(i, in ctx);
// });
//}
}