diff --git a/Misaki.HighPerformance.Mathematics.SPMD/ISPMDLane.cs b/Misaki.HighPerformance.Mathematics.SPMD/ISPMDLane.cs index e8c2ecf..184ad43 100644 --- a/Misaki.HighPerformance.Mathematics.SPMD/ISPMDLane.cs +++ b/Misaki.HighPerformance.Mathematics.SPMD/ISPMDLane.cs @@ -16,14 +16,6 @@ public interface ISPMDLane } } -// TODO: -// - ReduceAdd -// - ReduceMin -// - ReduceMax -// - LeadingZeroCount -// - TrailingZeroCount -// - PopCount - /// /// Represents a single-lane or multi-lane (vectorized) SPMD value and the operations supported on it. /// @@ -65,6 +57,14 @@ public unsafe interface ISPMDLane : ISPMDLane, IEquatable get; } + /// + /// Gets a lane value where all bits are set to 1 for each lane. + /// + static abstract TSelf AllBitsSet + { + get; + } + /// /// Gets the element value for the specified lane index. /// diff --git a/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj b/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj index 3431fed..cbbe3df 100644 --- a/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj +++ b/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj @@ -7,23 +7,31 @@ true true Misaki - 1.3.3 + 1.3.4 $(AssemblyVersion) https://git.personalnas.com/Misaki/Misaki.HighPerformance.git https://git.personalnas.com/Misaki/Misaki.HighPerformance.git + false + contentFiles True + $(DefineConstants) True + $(DefineConstants) - + - - + + true + contentFiles\cs\any\Misaki.HighPerformance.LowLevel\ + false + Compile + diff --git a/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs b/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs index f663a7f..a3951fb 100644 --- a/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs +++ b/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs @@ -40,6 +40,12 @@ public readonly unsafe struct ScalarLane : ISPMDLane new ScalarLane(TNumber.MaxValue); } + public static ScalarLane AllBitsSet + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => new ScalarLane(TNumber.AllBitsSet); + } + public readonly TNumber this[int index] { [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git a/Misaki.HighPerformance.Mathematics.SPMD/Templates/IJobSPMD.tt b/Misaki.HighPerformance.Mathematics.SPMD/Templates/IJobSPMD.tt index a7e7a5c..41a6286 100644 --- a/Misaki.HighPerformance.Mathematics.SPMD/Templates/IJobSPMD.tt +++ b/Misaki.HighPerformance.Mathematics.SPMD/Templates/IJobSPMD.tt @@ -10,7 +10,6 @@ using System.Numerics; namespace Misaki.HighPerformance.Mathematics.SPMD; <# - const string TLane = "TLane"; const string TNumber = "TNumber"; const string GenericParameters = $"{TLane}, {TNumber}"; diff --git a/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.gen.cs b/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.gen.cs index 9c031d1..fb4a196 100644 --- a/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.gen.cs +++ b/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.gen.cs @@ -44,21 +44,21 @@ public static unsafe partial class MathV where TLane : unmanaged, ISPMDLane where TNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators { - var width = TLane.LaneWidth; + Unsafe.SkipInit(out TLane x); + var px = (TNumber*)&x; + Unsafe.SkipInit(out TLane y); + var py = (TNumber*)&y; - var x = stackalloc TNumber[width]; - var y = stackalloc TNumber[width]; - - for (var i = 0; i < width; i++) + for (var i = 0; i < TLane.LaneWidth; i++) { - x[i] = pSrc[i * 2 + 0]; - y[i] = pSrc[i * 2 + 1]; + px[i] = pSrc[i * 2 + 0]; + py[i] = pSrc[i * 2 + 1]; } return new Vector2 { - x = TLane.Load(x), - y = TLane.Load(y), + x = x, + y = y, }; } @@ -515,24 +515,25 @@ public static unsafe partial class MathV where TLane : unmanaged, ISPMDLane where TNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators { - var width = TLane.LaneWidth; + Unsafe.SkipInit(out TLane x); + var px = (TNumber*)&x; + Unsafe.SkipInit(out TLane y); + var py = (TNumber*)&y; + Unsafe.SkipInit(out TLane z); + var pz = (TNumber*)&z; - var x = stackalloc TNumber[width]; - var y = stackalloc TNumber[width]; - var z = stackalloc TNumber[width]; - - for (var i = 0; i < width; i++) + for (var i = 0; i < TLane.LaneWidth; i++) { - x[i] = pSrc[i * 3 + 0]; - y[i] = pSrc[i * 3 + 1]; - z[i] = pSrc[i * 3 + 2]; + px[i] = pSrc[i * 3 + 0]; + py[i] = pSrc[i * 3 + 1]; + pz[i] = pSrc[i * 3 + 2]; } return new Vector3 { - x = TLane.Load(x), - y = TLane.Load(y), - z = TLane.Load(z), + x = x, + y = y, + z = z, }; } @@ -1024,27 +1025,29 @@ public static unsafe partial class MathV where TLane : unmanaged, ISPMDLane where TNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators { - var width = TLane.LaneWidth; + Unsafe.SkipInit(out TLane x); + var px = (TNumber*)&x; + Unsafe.SkipInit(out TLane y); + var py = (TNumber*)&y; + Unsafe.SkipInit(out TLane z); + var pz = (TNumber*)&z; + Unsafe.SkipInit(out TLane w); + var pw = (TNumber*)&w; - var x = stackalloc TNumber[width]; - var y = stackalloc TNumber[width]; - var z = stackalloc TNumber[width]; - var w = stackalloc TNumber[width]; - - for (var i = 0; i < width; i++) + for (var i = 0; i < TLane.LaneWidth; i++) { - x[i] = pSrc[i * 4 + 0]; - y[i] = pSrc[i * 4 + 1]; - z[i] = pSrc[i * 4 + 2]; - w[i] = pSrc[i * 4 + 3]; + px[i] = pSrc[i * 4 + 0]; + py[i] = pSrc[i * 4 + 1]; + pz[i] = pSrc[i * 4 + 2]; + pw[i] = pSrc[i * 4 + 3]; } return new Vector4 { - x = TLane.Load(x), - y = TLane.Load(y), - z = TLane.Load(z), - w = TLane.Load(w), + x = x, + y = y, + z = z, + w = w, }; } diff --git a/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.tt b/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.tt index 0a09272..424f615 100644 --- a/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.tt +++ b/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.tt @@ -65,23 +65,22 @@ public static unsafe partial class MathV <#= TLaneRestrictions #> <#= TNumberRestrictions #> { - var width = TLane.LaneWidth; - <# for (int i = 0; i < dimension; i++) { #> - var <#= components[i] #> = stackalloc <#= TNumber #>[width]; + Unsafe.SkipInit(out TLane <#= components[i] #>); + var p<#= components[i] #> = (<#= TNumber #>*)&<#= components[i] #>; <# } #> - for (var i = 0; i < width; i++) + for (var i = 0; i < TLane.LaneWidth; i++) { <# for (int i = 0; i < dimension; i++) { #> - <#= components[i] #>[i] = pSrc[i * <#= dimension #> + <#= i #>]; + p<#= components[i] #>[i] = pSrc[i * <#= dimension #> + <#= i #>]; <# } #> } return new <#= vectorType #> { <# for (int i = 0; i < dimension; i++) { #> - <#= components[i] #> = <#= TLane #>.Load(<#= components[i] #>), + <#= components[i] #> = <#= components[i] #>, <# } #> }; } diff --git a/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs b/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs index bdaa112..6e83737 100644 --- a/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs +++ b/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs @@ -8,12 +8,12 @@ namespace Misaki.HighPerformance.Mathematics.SPMD; public static unsafe class WideLane { - internal static readonly uint* s_shuffleTable512_32bit; - internal static readonly ulong* s_shuffleTable512_64bit; - internal static readonly uint* s_shuffleTable256_32bit; - internal static readonly ulong* s_shuffleTable256_64bit; - internal static readonly uint* s_shuffleTable128_32bit; - internal static readonly ulong* s_shuffleTable128_64bit; + internal static readonly uint* s_pShuffleTable512_32bit; + internal static readonly ulong* s_pShuffleTable512_64bit; + internal static readonly uint* s_pShuffleTable256_32bit; + internal static readonly ulong* s_pShuffleTable256_64bit; + internal static readonly uint* s_pShuffleTable128_32bit; + internal static readonly ulong* s_pShuffleTable128_64bit; /// /// Gets whether WideLane is supported on the current hardware. @@ -22,12 +22,12 @@ public static unsafe class WideLane static WideLane() { - s_shuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit(); - s_shuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit(); - s_shuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit(); - s_shuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit(); - s_shuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit(); - s_shuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit(); + s_pShuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit(); + s_pShuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit(); + s_pShuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit(); + s_pShuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit(); + s_pShuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit(); + s_pShuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit(); } } @@ -69,6 +69,12 @@ public readonly unsafe partial struct WideLane : ISPMDLane Create(TNumber.MaxValue); } + public static WideLane AllBitsSet + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Create(TNumber.AllBitsSet); + } + public readonly TNumber this[int index] { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -194,48 +200,26 @@ public readonly unsafe partial struct WideLane : ISPMDLane MaskLoad(WideLane mask, ref TNumber value) { - return MaskLoad(mask, (TNumber*)Unsafe.AsPointer(ref value)); + var vector = Vector.LoadUnsafe(ref value); + return new WideLane(Vector.ConditionalSelect(mask.value, vector, Vector.Zero)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane MaskLoad(WideLane mask, TNumber* pValue) { - var vector = Vector.Load(pValue); - return new WideLane(Vector.ConditionalSelect(mask.value, vector, Vector.Zero)); + return MaskLoad(mask, ref Unsafe.AsRef(pValue)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Gather(TNumber* pData, WideLane indices, int scale) { - Unsafe.SkipInit(out Vector result); - - var pResult = (TNumber*)&result; - var pIndices = (TNumber*)&indices; - - var count = Vector.Count; - for (var i = 0; i < count; i++) - { - var idx = int.CreateTruncating(pIndices[i]); - pResult[i] = pData[idx * scale / sizeof(TNumber)]; - } - - return new WideLane(result); + return Gather(ref Unsafe.AsRef(pData), indices, scale); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Gather(TNumber* pData, int* pIndices, int scale) { - Unsafe.SkipInit(out Vector result); - - var pResult = (TNumber*)&result; - - var count = Vector.Count; - for (var i = 0; i < count; i++) - { - pResult[i] = pData[pIndices[i] * scale / sizeof(TNumber)]; - } - - return new WideLane(result); + return Gather(ref Unsafe.AsRef(pData), ref Unsafe.AsRef(pIndices), scale); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -287,12 +271,6 @@ public readonly unsafe partial struct WideLane : ISPMDLane mask, ref TNumber destination) - { - return CompressStore(mask, (TNumber*)Unsafe.AsPointer(in destination)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public int CompressStore(WideLane mask, TNumber* pDestination) { if (LaneWidth == Vector512.Count && Vector512.IsHardwareAccelerated) { @@ -303,10 +281,10 @@ public readonly unsafe partial struct WideLane : ISPMDLane(ref destination)); return BitOperations.PopCount(moveMask); } @@ -317,10 +295,10 @@ public readonly unsafe partial struct WideLane : ISPMDLane(ref destination)); return BitOperations.PopCount(moveMask); } } @@ -333,10 +311,10 @@ public readonly unsafe partial struct WideLane : ISPMDLane(ref destination)); return BitOperations.PopCount(moveMask); } @@ -349,10 +327,10 @@ public readonly unsafe partial struct WideLane : ISPMDLane(ref destination)); return BitOperations.PopCount(moveMask); } } @@ -365,10 +343,10 @@ public readonly unsafe partial struct WideLane : ISPMDLane(ref destination)); return BitOperations.PopCount(moveMask); } @@ -378,9 +356,9 @@ public readonly unsafe partial struct WideLane : ISPMDLane, Vector128>(ref mask); var moveMask = m.ExtractMostSignificantBits(); // Offset is (moveMask * 2) because each control vector has 2 elements - var shuffle = Vector128.Load(WideLane.s_shuffleTable128_64bit + (moveMask * 2)); + var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_64bit + (moveMask * 2)); var compressed = Vector128.Shuffle(vec, shuffle); - compressed.Store((ulong*)pDestination); + compressed.StoreUnsafe(ref Unsafe.As(ref destination)); return BitOperations.PopCount(moveMask); } } @@ -390,15 +368,21 @@ public readonly unsafe partial struct WideLane : ISPMDLane mask, TNumber* pDestination) + { + return CompressStore(mask, ref Unsafe.AsRef(pDestination)); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public readonly Vector AsVector() { @@ -617,6 +601,7 @@ public readonly unsafe partial struct WideLane : ISPMDLane Sin(WideLane value) { +#if MHP_FASTMATH var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI var x_sin = value; @@ -644,11 +629,28 @@ public readonly unsafe partial struct WideLane : ISPMDLane, Vector>(ref value); + var result = Vector.Sin(v); + return new WideLane(Unsafe.As, Vector>(ref result)); + } + else if (typeof(TNumber) == typeof(double)) + { + ref var v = ref Unsafe.As, Vector>(ref value); + var result = Vector.Sin(v); + return new WideLane(Unsafe.As, Vector>(ref result)); + } + + return value; +#endif } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Cos(WideLane value) { +#if MHP_FASTMATH var halfPi = Create(TNumber.CreateTruncating(1.570796327f)); var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI @@ -677,11 +679,30 @@ public readonly unsafe partial struct WideLane : ISPMDLane, Vector>(ref value); + var result = Vector.Cos(v); + return new WideLane(Unsafe.As, Vector>(ref result)); + } + else if (typeof(TNumber) == typeof(double)) + { + ref var v = ref Unsafe.As, Vector>(ref value); + var result = Vector.Cos(v); + return new WideLane(Unsafe.As, Vector>(ref result)); + } + + return value; +#endif } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void SinCos(WideLane value, out WideLane sin, out WideLane cos) { +#if MHP_FASTMATH + // We use Taylor/Remez polynomial approximation for Sin(PI * z) and Cos(PI * z) on the reduced range of z in [-0.5, 0.5]. + var halfPi = Create(TNumber.CreateTruncating(1.570796327f)); var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI @@ -741,6 +762,27 @@ public readonly unsafe partial struct WideLane : ISPMDLane, Vector>(ref value); + var (sinResult, cosResult) = Vector.SinCos(v); + sin = new WideLane(Unsafe.As, Vector>(ref sinResult)); + cos = new WideLane(Unsafe.As, Vector>(ref cosResult)); + } + else if (typeof(TNumber) == typeof(double)) + { + ref var v = ref Unsafe.As, Vector>(ref value); + var (sinResult, cosResult) = Vector.SinCos(v); + sin = new WideLane(Unsafe.As, Vector>(ref sinResult)); + cos = new WideLane(Unsafe.As, Vector>(ref cosResult)); + } + else + { + sin = value; + cos = value; + } +#endif } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -799,7 +841,7 @@ public readonly unsafe partial struct WideLane : ISPMDLane : ISPMDLane : ISPMDLane.Count) { ref var vf = ref Unsafe.As, Vector128>(ref value); - var result = Sse.Reciprocal(vf); - return Unsafe.As, WideLane>(ref result); + var x0 = Sse.Reciprocal(vf); +#if MHP_FASTMATH + return Unsafe.As, WideLane>(ref x0); +#else + // SSE and AVX provide fast approximate reciprocal instructions but the precision is very low (11 bits). + // In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases. + var x1 = x0 * (Vector128.Create(2.0f) - x0 * vf); + return Unsafe.As, WideLane>(ref x1); +#endif } else if (Avx.IsSupported && LaneWidth == Vector256.Count) { ref var vf = ref Unsafe.As, Vector256>(ref value); - var result = Avx.Reciprocal(vf); - return Unsafe.As, WideLane>(ref result); + var x0 = Avx.Reciprocal(vf); +#if MHP_FASTMATH + return Unsafe.As, WideLane>(ref x0); +#else + // SSE and AVX provide fast approximate reciprocal instructions but the precision is very low (11 bits). + // In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases. + var x1 = x0 * (Vector256.Create(2.0f) - x0 * vf); + return Unsafe.As, WideLane>(ref x1); +#endif } } @@ -1039,14 +1095,28 @@ public readonly unsafe partial struct WideLane : ISPMDLane.Count) { ref var vf = ref Unsafe.As, Vector128>(ref value); - var result = Sse.ReciprocalSqrt(vf); - return Unsafe.As, WideLane>(ref result); + var x0 = Sse.ReciprocalSqrt(vf); +#if MHP_FASTMATH + return Unsafe.As, WideLane>(ref x0); +#else + // SSE and AVX provide fast approximate reciprocal sqrt instructions but the precision is very low (11 bits). + // In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases. + var x1 = x0 * Vector128.Create(0.5f) * (Vector128.Create(3.0f) - (vf * x0 * x0)); + return Unsafe.As, WideLane>(ref x1); +#endif } else if (Avx.IsSupported && LaneWidth == Vector256.Count) { ref var vf = ref Unsafe.As, Vector256>(ref value); - var result = Avx.ReciprocalSqrt(vf); - return Unsafe.As, WideLane>(ref result); + var x0 = Avx.ReciprocalSqrt(vf); +#if MHP_FASTMATH + return Unsafe.As, WideLane>(ref x0); +#else + // SSE and AVX provide fast approximate reciprocal sqrt instructions but the precision is very low (11 bits). + // In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases. + var x1 = x0 * Vector256.Create(0.5f) * (Vector256.Create(3.0f) - (vf * x0 * x0)); + return Unsafe.As, WideLane>(ref x1); +#endif } }