Refactor SIMD gather, tighten constraints, doc & test opts
- Require TLane : unmanaged, ISPMDLane for stricter type safety and direct memory operations.
- Refactor GatherVectorN and WideLane<T>.Gather to use Unsafe.SkipInit and direct assignment, removing stackalloc and TLane.Load for better SIMD performance.
- Use Vector.Sum in WideLane<T>.ReduceAdd.
- Add/improve XML docs for ReduceAdd/ReduceMax/ReduceMin.
- Update the test project for AOT, AVX2, and speed optimization, and disable reflection.
- Tweak GGXMipGenerationBenchmark and Program.cs for improved benchmarking and output.
This commit is contained in:
@@ -597,13 +597,22 @@ public unsafe interface ISPMDLane<TSelf, TNumber> : ISPMDLane, IEquatable<TSelf>
|
||||
static abstract TSelf Rsqrt(TSelf value);
|
||||
|
||||
/// <summary>
|
||||
/// Horizontally reduces the lane value by adding all lanes together, returning a single-lane result.
|
||||
/// Reduces the lane value to a single scalar by adding all lanes together.
|
||||
/// </summary>
|
||||
/// <param name="a"></param>
|
||||
/// <param name="b"></param>
|
||||
/// <returns></returns>
|
||||
/// <param name="value">The lane value to reduce.</param>
|
||||
/// <returns>The reduced scalar value.</returns>
|
||||
static abstract TNumber ReduceAdd(TSelf value);
|
||||
/// <summary>
|
||||
/// Reduces the lane value to a single scalar by finding the maximum element.
|
||||
/// </summary>
|
||||
/// <param name="value">The lane value to reduce.</param>
|
||||
/// <returns>The reduced scalar value.</returns>
|
||||
static abstract TNumber ReduceMax(TSelf value);
|
||||
/// <summary>
|
||||
/// Reduces the lane value to a single scalar by finding the minimum element.
|
||||
/// </summary>
|
||||
/// <param name="value">The lane value to reduce.</param>
|
||||
/// <returns>The reduced scalar value.</returns>
|
||||
static abstract TNumber ReduceMin(TSelf value);
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
|
||||
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
|
||||
<Authors>Misaki</Authors>
|
||||
<AssemblyVersion>1.3.2</AssemblyVersion>
|
||||
<AssemblyVersion>1.3.3</AssemblyVersion>
|
||||
<Version>$(AssemblyVersion)</Version>
|
||||
<PackageProjectUrl>https://git.personalnas.com/Misaki/Misaki.HighPerformance.git</PackageProjectUrl>
|
||||
<RepositoryUrl>https://git.personalnas.com/Misaki/Misaki.HighPerformance.git</RepositoryUrl>
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -21,7 +21,7 @@ const string GenericParameters = $"{TLane}, {TNumber}";
|
||||
|
||||
var dimensions = new int[] { 2, 3, 4 };
|
||||
var components = new char[] { 'x', 'y', 'z', 'w' };
|
||||
var TLaneRestrictions = $@"where {TLane} : ISPMDLane<{TLane}, {TNumber}>";
|
||||
var TLaneRestrictions = $@"where {TLane} : unmanaged, ISPMDLane<{TLane}, {TNumber}>";
|
||||
var TNumberRestrictions = $@"where {TNumber} : unmanaged, INumber<{TNumber}>, IBinaryNumber<{TNumber}>, IMinMaxValue<{TNumber}>, IBitwiseOperators<{TNumber}, {TNumber}, {TNumber}>";
|
||||
#>
|
||||
|
||||
@@ -126,19 +126,24 @@ public static unsafe partial class MathV
|
||||
<#= TLaneRestrictions #>
|
||||
<#= TNumberRestrictions #>
|
||||
{
|
||||
var buffer = stackalloc <#= TNumber #>[TLane.LaneWidth * <#= dimension #>];
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
Unsafe.SkipInit(out TLane <#= components[i] #>);
|
||||
var p<#= components[i] #> = (<#= TNumber #>*)&<#= components[i] #>;
|
||||
<# } #>
|
||||
|
||||
for (var i = 0; i < TLane.LaneWidth; i++)
|
||||
{
|
||||
var scalarIdx = int.CreateTruncating(indices[i]);
|
||||
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
buffer[<#= i #> * TLane.LaneWidth + i] = pData[scalarIdx + <#= i #> * scale];
|
||||
p<#= components[i] #>[i] = pData[scalarIdx + <#= i #> * scale];
|
||||
<# } #>
|
||||
}
|
||||
|
||||
return new <#= vectorType #>
|
||||
return new Vector<#= dimension #><TLane, TNumber>
|
||||
{
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
<#= components[i] #> = TLane.Load(buffer + <#= i #> * TLane.LaneWidth),
|
||||
<#= components[i] #> = <#= components[i] #>,
|
||||
<# } #>
|
||||
};
|
||||
}
|
||||
@@ -148,19 +153,24 @@ public static unsafe partial class MathV
|
||||
<#= TLaneRestrictions #>
|
||||
<#= TNumberRestrictions #>
|
||||
{
|
||||
var buffer = stackalloc <#= TNumber #>[TLane.LaneWidth * <#= dimension #>];
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
Unsafe.SkipInit(out TLane <#= components[i] #>);
|
||||
var p<#= components[i] #> = (<#= TNumber #>*)&<#= components[i] #>;
|
||||
<# } #>
|
||||
|
||||
for (var i = 0; i < TLane.LaneWidth; i++)
|
||||
{
|
||||
var scalarIdx = pIndices[i];
|
||||
var scalerIdx = pIndices[i];
|
||||
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
buffer[<#= i #> * TLane.LaneWidth + i] = pData[scalarIdx + <#= i #> * scale];
|
||||
p<#= components[i] #>[i] = pData[scalerIdx + <#= i #> * scale];
|
||||
<# } #>
|
||||
}
|
||||
|
||||
return new <#= vectorType #>
|
||||
return new Vector<#= dimension #><TLane, TNumber>
|
||||
{
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
<#= components[i] #> = TLane.Load(buffer + <#= i #> * TLane.LaneWidth),
|
||||
<#= components[i] #> = <#= components[i] #>,
|
||||
<# } #>
|
||||
};
|
||||
}
|
||||
@@ -170,19 +180,24 @@ public static unsafe partial class MathV
|
||||
<#= TLaneRestrictions #>
|
||||
<#= TNumberRestrictions #>
|
||||
{
|
||||
var buffer = stackalloc <#= TNumber #>[TLane.LaneWidth * <#= dimension #>];
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
Unsafe.SkipInit(out TLane <#= components[i] #>);
|
||||
var p<#= components[i] #> = (<#= TNumber #>*)&<#= components[i] #>;
|
||||
<# } #>
|
||||
|
||||
for (var i = 0; i < TLane.LaneWidth; i++)
|
||||
{
|
||||
var scalarIdx = int.CreateTruncating(indices[i]);
|
||||
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
buffer[<#= i #> * TLane.LaneWidth + i] = Unsafe.Add(ref baseAddress, scalarIdx + <#= i #> * scale);
|
||||
p<#= components[i] #>[i] = Unsafe.Add(ref baseAddress, scalarIdx + <#= i #> * scale);
|
||||
<# } #>
|
||||
}
|
||||
|
||||
return new <#= vectorType #>
|
||||
return new Vector<#= dimension #><TLane, TNumber>
|
||||
{
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
<#= components[i] #> = TLane.Load(buffer + <#= i #> * TLane.LaneWidth),
|
||||
<#= components[i] #> = <#= components[i] #>,
|
||||
<# } #>
|
||||
};
|
||||
}
|
||||
@@ -192,19 +207,24 @@ public static unsafe partial class MathV
|
||||
<#= TLaneRestrictions #>
|
||||
<#= TNumberRestrictions #>
|
||||
{
|
||||
var buffer = stackalloc <#= TNumber #>[TLane.LaneWidth * <#= dimension #>];
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
Unsafe.SkipInit(out TLane <#= components[i] #>);
|
||||
var p<#= components[i] #> = (<#= TNumber #>*)&<#= components[i] #>;
|
||||
<# } #>
|
||||
|
||||
for (var i = 0; i < TLane.LaneWidth; i++)
|
||||
{
|
||||
var scalarIdx = Unsafe.Add(ref baseIndex, i);
|
||||
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
buffer[<#= i #> * TLane.LaneWidth + i] = Unsafe.Add(ref baseAddress, scalarIdx + <#= i #> * scale);
|
||||
p<#= components[i] #>[i] = Unsafe.Add(ref baseAddress, scalarIdx + <#= i #> * scale);
|
||||
<# } #>
|
||||
}
|
||||
|
||||
return new <#= vectorType #>
|
||||
return new Vector<#= dimension #><TLane, TNumber>
|
||||
{
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
<#= components[i] #> = TLane.Load(buffer + <#= i #> * TLane.LaneWidth),
|
||||
<#= components[i] #> = <#= components[i] #>,
|
||||
<# } #>
|
||||
};
|
||||
}
|
||||
|
||||
@@ -207,49 +207,69 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static WideLane<TNumber> Gather(TNumber* pData, WideLane<TNumber> indices, int scale)
|
||||
{
|
||||
var buffer = stackalloc TNumber[LaneWidth];
|
||||
for (var i = 0; i < LaneWidth; i++)
|
||||
Unsafe.SkipInit(out Vector<TNumber> result);
|
||||
|
||||
var pResult = (TNumber*)&result;
|
||||
var pIndices = (TNumber*)&indices;
|
||||
|
||||
var count = Vector<TNumber>.Count;
|
||||
for (var i = 0; i < count; i++)
|
||||
{
|
||||
buffer[i] = pData[int.CreateTruncating(indices[i]) * scale / sizeof(TNumber)];
|
||||
var idx = int.CreateTruncating(pIndices[i]);
|
||||
pResult[i] = pData[idx * scale / sizeof(TNumber)];
|
||||
}
|
||||
|
||||
return new WideLane<TNumber>(Vector.Load(buffer));
|
||||
return new WideLane<TNumber>(result);
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static WideLane<TNumber> Gather(TNumber* pData, int* pIndices, int scale)
|
||||
{
|
||||
var buffer = stackalloc TNumber[LaneWidth];
|
||||
for (var i = 0; i < LaneWidth; i++)
|
||||
Unsafe.SkipInit(out Vector<TNumber> result);
|
||||
|
||||
var pResult = (TNumber*)&result;
|
||||
|
||||
var count = Vector<TNumber>.Count;
|
||||
for (var i = 0; i < count; i++)
|
||||
{
|
||||
buffer[i] = pData[pIndices[i] * scale / sizeof(TNumber)];
|
||||
pResult[i] = pData[pIndices[i] * scale / sizeof(TNumber)];
|
||||
}
|
||||
|
||||
return new WideLane<TNumber>(Vector.Load(buffer));
|
||||
return new WideLane<TNumber>(result);
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static WideLane<TNumber> Gather(ref TNumber baseAddress, WideLane<TNumber> indices, int scale)
|
||||
{
|
||||
var buffer = stackalloc TNumber[LaneWidth];
|
||||
for (var i = 0; i < LaneWidth; i++)
|
||||
Unsafe.SkipInit(out Vector<TNumber> result);
|
||||
|
||||
var pResult = (TNumber*)&result;
|
||||
var pIndices = (TNumber*)&indices;
|
||||
|
||||
var count = Vector<TNumber>.Count;
|
||||
for (var i = 0; i < count; i++)
|
||||
{
|
||||
buffer[i] = Unsafe.Add(ref baseAddress, int.CreateTruncating(indices[i]) * scale / sizeof(TNumber));
|
||||
var idx = int.CreateTruncating(pIndices[i]);
|
||||
pResult[i] = Unsafe.Add(ref baseAddress, idx * scale / sizeof(TNumber));
|
||||
}
|
||||
|
||||
return new WideLane<TNumber>(Vector.Load(buffer));
|
||||
return new WideLane<TNumber>(result);
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static WideLane<TNumber> Gather(ref TNumber baseAddress, ref int baseIndex, int scale)
|
||||
{
|
||||
var buffer = stackalloc TNumber[LaneWidth];
|
||||
for (var i = 0; i < LaneWidth; i++)
|
||||
Unsafe.SkipInit(out Vector<TNumber> result);
|
||||
|
||||
var pResult = (TNumber*)&result;
|
||||
|
||||
var count = Vector<TNumber>.Count;
|
||||
for (var i = 0; i < count; i++)
|
||||
{
|
||||
buffer[i] = Unsafe.Add(ref baseAddress, Unsafe.Add(ref baseIndex, i) * scale / sizeof(TNumber));
|
||||
pResult[i] = Unsafe.Add(ref baseAddress, Unsafe.Add(ref baseIndex, i) * scale / sizeof(TNumber));
|
||||
}
|
||||
|
||||
return new WideLane<TNumber>(Vector.Load(buffer));
|
||||
return new WideLane<TNumber>(result);
|
||||
}
|
||||
|
||||
|
||||
@@ -274,11 +294,9 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public int CompressStore(WideLane<TNumber> mask, TNumber* pDestination)
|
||||
{
|
||||
var size = sizeof(TNumber);
|
||||
|
||||
if (LaneWidth == Vector512<TNumber>.Count && Vector512.IsHardwareAccelerated)
|
||||
{
|
||||
if (size == 4)
|
||||
if (sizeof(TNumber) == 4)
|
||||
{
|
||||
ref var vec = ref Unsafe.As<WideLane<TNumber>, Vector512<uint>>(ref Unsafe.AsRef(in this));
|
||||
var m = Unsafe.As<WideLane<TNumber>, Vector512<uint>>(ref mask);
|
||||
@@ -292,7 +310,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
return BitOperations.PopCount(moveMask);
|
||||
}
|
||||
|
||||
if (size == 8)
|
||||
if (sizeof(TNumber) == 8)
|
||||
{
|
||||
ref var vec = ref Unsafe.As<WideLane<TNumber>, Vector512<ulong>>(ref Unsafe.AsRef(in this));
|
||||
var m = Unsafe.As<WideLane<TNumber>, Vector512<ulong>>(ref mask);
|
||||
@@ -308,7 +326,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
}
|
||||
else if (LaneWidth == Vector256<TNumber>.Count && Vector256.IsHardwareAccelerated)
|
||||
{
|
||||
if (size == 4)
|
||||
if (sizeof(TNumber) == 4)
|
||||
{
|
||||
ref var vec = ref Unsafe.As<WideLane<TNumber>, Vector256<uint>>(ref Unsafe.AsRef(in this));
|
||||
var m = Unsafe.As<WideLane<TNumber>, Vector256<uint>>(ref mask);
|
||||
@@ -322,7 +340,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
return BitOperations.PopCount(moveMask);
|
||||
}
|
||||
|
||||
if (size == 8)
|
||||
if (sizeof(TNumber) == 8)
|
||||
{
|
||||
ref var vec = ref Unsafe.As<WideLane<TNumber>, Vector256<ulong>>(ref Unsafe.AsRef(in this));
|
||||
var m = Unsafe.As<WideLane<TNumber>, Vector256<ulong>>(ref mask);
|
||||
@@ -340,7 +358,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
}
|
||||
else if (LaneWidth == Vector128<TNumber>.Count && Vector128.IsHardwareAccelerated)
|
||||
{
|
||||
if (size == 4)
|
||||
if (sizeof(TNumber) == 4)
|
||||
{
|
||||
ref var vec = ref Unsafe.As<WideLane<TNumber>, Vector128<uint>>(ref Unsafe.AsRef(in this));
|
||||
var m = Unsafe.As<WideLane<TNumber>, Vector128<uint>>(ref mask);
|
||||
@@ -354,7 +372,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
return BitOperations.PopCount(moveMask);
|
||||
}
|
||||
|
||||
if (size == 8)
|
||||
if (sizeof(TNumber) == 8)
|
||||
{
|
||||
ref var vec = ref Unsafe.As<WideLane<TNumber>, Vector128<ulong>>(ref Unsafe.AsRef(in this));
|
||||
var m = Unsafe.As<WideLane<TNumber>, Vector128<ulong>>(ref mask);
|
||||
@@ -568,10 +586,8 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
var result = Vector.FusedMultiplyAdd(va, vb, vc);
|
||||
return new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref result));
|
||||
}
|
||||
else
|
||||
{
|
||||
return new WideLane<TNumber>((a.value * b.value) + c.value);
|
||||
}
|
||||
|
||||
return new WideLane<TNumber>((a.value * b.value) + c.value);
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
@@ -1041,15 +1057,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static TNumber ReduceAdd(WideLane<TNumber> value)
|
||||
{
|
||||
// TODO: Use shuffle and add.
|
||||
|
||||
var result = TNumber.Zero;
|
||||
for (var i = 0; i < LaneWidth; i++)
|
||||
{
|
||||
result += value[i];
|
||||
}
|
||||
|
||||
return result;
|
||||
return Vector.Sum(value.value);
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
using BenchmarkDotNet.Attributes;
|
||||
using BenchmarkDotNet.Diagnosers;
|
||||
using BenchmarkDotNet.Engines;
|
||||
using Misaki.HighPerformance.Image;
|
||||
using Misaki.HighPerformance.Jobs;
|
||||
@@ -396,8 +395,8 @@ public unsafe class GGXMipGenerationBenchmark
|
||||
[GlobalSetup]
|
||||
public void Setup()
|
||||
{
|
||||
//const string imagePath = "F:\\c\\SimpleRayTracer\\native\\assets\\hdri\\golden_gate_hills_1k.hdr";
|
||||
const string imagePath = "C:\\Users\\Misaki\\Downloads\\grasslands_sunset_4k.hdr";
|
||||
const string imagePath = "F:\\c\\SimpleRayTracer\\native\\assets\\hdri\\golden_gate_hills_1k.hdr";
|
||||
//const string imagePath = "C:\\Users\\Misaki\\Downloads\\grasslands_sunset_4k.hdr";
|
||||
using var stream = new FileStream(imagePath, FileMode.Open, FileAccess.Read);
|
||||
_image = ImageResultFloat.FromStream(stream, ColorComponents.RGB);
|
||||
|
||||
|
||||
@@ -6,6 +6,10 @@
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<AllowUnsafeBlocks>True</AllowUnsafeBlocks>
|
||||
<PublishAot>True</PublishAot>
|
||||
<OptimizationPreference>Speed</OptimizationPreference>
|
||||
<IlcInstructionSet>avx2</IlcInstructionSet>
|
||||
<IlcDisableReflection>true</IlcDisableReflection>
|
||||
</PropertyGroup>
|
||||
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
|
||||
|
||||
@@ -8,11 +8,16 @@ using System.Runtime.InteropServices;
|
||||
|
||||
//BenchmarkRunner.Run<GGXMipGenerationBenchmark>();
|
||||
|
||||
const int count = 1;
|
||||
const int count = 16;
|
||||
|
||||
var bench = new GGXMipGenerationBenchmark();
|
||||
bench.Setup();
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
bench.JobGGX();
|
||||
}
|
||||
|
||||
var sw = System.Diagnostics.Stopwatch.StartNew();
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
@@ -22,7 +27,7 @@ for (int i = 0; i < count; i++)
|
||||
|
||||
sw.Stop();
|
||||
var avgTime = sw.Elapsed.TotalMilliseconds / count;
|
||||
Console.WriteLine($"GGX Mip Generation: {avgTime} ms");
|
||||
Console.WriteLine($"GGX Mip Generation (Inline): {avgTime} ms");
|
||||
bench.Cleanup();
|
||||
|
||||
//AllocationManager.Initialize(AllocationManagerInitOpts.Default);
|
||||
|
||||
Reference in New Issue
Block a user