diff --git a/Misaki.HighPerformance.Mathematics.SPMD/ISPMDLane.cs b/Misaki.HighPerformance.Mathematics.SPMD/ISPMDLane.cs
index e8c2ecf..184ad43 100644
--- a/Misaki.HighPerformance.Mathematics.SPMD/ISPMDLane.cs
+++ b/Misaki.HighPerformance.Mathematics.SPMD/ISPMDLane.cs
@@ -16,14 +16,6 @@ public interface ISPMDLane
}
}
-// TODO:
-// - ReduceAdd
-// - ReduceMin
-// - ReduceMax
-// - LeadingZeroCount
-// - TrailingZeroCount
-// - PopCount
-
///
/// Represents a single-lane or multi-lane (vectorized) SPMD value and the operations supported on it.
///
@@ -65,6 +57,14 @@ public unsafe interface ISPMDLane : ISPMDLane, IEquatable
get;
}
+ ///
+ /// Gets a lane value where all bits are set to 1 for each lane.
+ ///
+ static abstract TSelf AllBitsSet
+ {
+ get;
+ }
+
///
/// Gets the element value for the specified lane index.
///
diff --git a/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj b/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj
index 3431fed..cbbe3df 100644
--- a/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj
+++ b/Misaki.HighPerformance.Mathematics.SPMD/Misaki.HighPerformance.Mathematics.SPMD.csproj
@@ -7,23 +7,31 @@
true
true
Misaki
- 1.3.3
+ 1.3.4
$(AssemblyVersion)
https://git.personalnas.com/Misaki/Misaki.HighPerformance.git
https://git.personalnas.com/Misaki/Misaki.HighPerformance.git
+ false
+ contentFiles
True
+ $(DefineConstants)
True
+ $(DefineConstants)
-
+
-
-
+
+ true
+ contentFiles\cs\any\Misaki.HighPerformance.LowLevel\
+ false
+ Compile
+
diff --git a/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs b/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs
index f663a7f..a3951fb 100644
--- a/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs
+++ b/Misaki.HighPerformance.Mathematics.SPMD/ScalerLane.cs
@@ -40,6 +40,12 @@ public readonly unsafe struct ScalarLane : ISPMDLane new ScalarLane(TNumber.MaxValue);
}
+ public static ScalarLane AllBitsSet
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ get => new ScalarLane(TNumber.AllBitsSet);
+ }
+
public readonly TNumber this[int index]
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
diff --git a/Misaki.HighPerformance.Mathematics.SPMD/Templates/IJobSPMD.tt b/Misaki.HighPerformance.Mathematics.SPMD/Templates/IJobSPMD.tt
index a7e7a5c..41a6286 100644
--- a/Misaki.HighPerformance.Mathematics.SPMD/Templates/IJobSPMD.tt
+++ b/Misaki.HighPerformance.Mathematics.SPMD/Templates/IJobSPMD.tt
@@ -10,7 +10,6 @@ using System.Numerics;
namespace Misaki.HighPerformance.Mathematics.SPMD;
<#
-
const string TLane = "TLane";
const string TNumber = "TNumber";
const string GenericParameters = $"{TLane}, {TNumber}";
diff --git a/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.gen.cs b/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.gen.cs
index 9c031d1..fb4a196 100644
--- a/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.gen.cs
+++ b/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.gen.cs
@@ -44,21 +44,21 @@ public static unsafe partial class MathV
where TLane : unmanaged, ISPMDLane
where TNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators
{
- var width = TLane.LaneWidth;
+ Unsafe.SkipInit(out TLane x);
+ var px = (TNumber*)&x;
+ Unsafe.SkipInit(out TLane y);
+ var py = (TNumber*)&y;
- var x = stackalloc TNumber[width];
- var y = stackalloc TNumber[width];
-
- for (var i = 0; i < width; i++)
+ for (var i = 0; i < TLane.LaneWidth; i++)
{
- x[i] = pSrc[i * 2 + 0];
- y[i] = pSrc[i * 2 + 1];
+ px[i] = pSrc[i * 2 + 0];
+ py[i] = pSrc[i * 2 + 1];
}
return new Vector2
{
- x = TLane.Load(x),
- y = TLane.Load(y),
+ x = x,
+ y = y,
};
}
@@ -515,24 +515,25 @@ public static unsafe partial class MathV
where TLane : unmanaged, ISPMDLane
where TNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators
{
- var width = TLane.LaneWidth;
+ Unsafe.SkipInit(out TLane x);
+ var px = (TNumber*)&x;
+ Unsafe.SkipInit(out TLane y);
+ var py = (TNumber*)&y;
+ Unsafe.SkipInit(out TLane z);
+ var pz = (TNumber*)&z;
- var x = stackalloc TNumber[width];
- var y = stackalloc TNumber[width];
- var z = stackalloc TNumber[width];
-
- for (var i = 0; i < width; i++)
+ for (var i = 0; i < TLane.LaneWidth; i++)
{
- x[i] = pSrc[i * 3 + 0];
- y[i] = pSrc[i * 3 + 1];
- z[i] = pSrc[i * 3 + 2];
+ px[i] = pSrc[i * 3 + 0];
+ py[i] = pSrc[i * 3 + 1];
+ pz[i] = pSrc[i * 3 + 2];
}
return new Vector3
{
- x = TLane.Load(x),
- y = TLane.Load(y),
- z = TLane.Load(z),
+ x = x,
+ y = y,
+ z = z,
};
}
@@ -1024,27 +1025,29 @@ public static unsafe partial class MathV
where TLane : unmanaged, ISPMDLane
where TNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators
{
- var width = TLane.LaneWidth;
+ Unsafe.SkipInit(out TLane x);
+ var px = (TNumber*)&x;
+ Unsafe.SkipInit(out TLane y);
+ var py = (TNumber*)&y;
+ Unsafe.SkipInit(out TLane z);
+ var pz = (TNumber*)&z;
+ Unsafe.SkipInit(out TLane w);
+ var pw = (TNumber*)&w;
- var x = stackalloc TNumber[width];
- var y = stackalloc TNumber[width];
- var z = stackalloc TNumber[width];
- var w = stackalloc TNumber[width];
-
- for (var i = 0; i < width; i++)
+ for (var i = 0; i < TLane.LaneWidth; i++)
{
- x[i] = pSrc[i * 4 + 0];
- y[i] = pSrc[i * 4 + 1];
- z[i] = pSrc[i * 4 + 2];
- w[i] = pSrc[i * 4 + 3];
+ px[i] = pSrc[i * 4 + 0];
+ py[i] = pSrc[i * 4 + 1];
+ pz[i] = pSrc[i * 4 + 2];
+ pw[i] = pSrc[i * 4 + 3];
}
return new Vector4
{
- x = TLane.Load(x),
- y = TLane.Load(y),
- z = TLane.Load(z),
- w = TLane.Load(w),
+ x = x,
+ y = y,
+ z = z,
+ w = w,
};
}
diff --git a/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.tt b/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.tt
index 0a09272..424f615 100644
--- a/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.tt
+++ b/Misaki.HighPerformance.Mathematics.SPMD/Templates/MathV.Vector.tt
@@ -65,23 +65,22 @@ public static unsafe partial class MathV
<#= TLaneRestrictions #>
<#= TNumberRestrictions #>
{
- var width = TLane.LaneWidth;
-
<# for (int i = 0; i < dimension; i++) { #>
- var <#= components[i] #> = stackalloc <#= TNumber #>[width];
+ Unsafe.SkipInit(out TLane <#= components[i] #>);
+ var p<#= components[i] #> = (<#= TNumber #>*)&<#= components[i] #>;
<# } #>
- for (var i = 0; i < width; i++)
+ for (var i = 0; i < TLane.LaneWidth; i++)
{
<# for (int i = 0; i < dimension; i++) { #>
- <#= components[i] #>[i] = pSrc[i * <#= dimension #> + <#= i #>];
+ p<#= components[i] #>[i] = pSrc[i * <#= dimension #> + <#= i #>];
<# } #>
}
return new <#= vectorType #>
{
<# for (int i = 0; i < dimension; i++) { #>
- <#= components[i] #> = <#= TLane #>.Load(<#= components[i] #>),
+ <#= components[i] #> = <#= components[i] #>,
<# } #>
};
}
diff --git a/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs b/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs
index bdaa112..6e83737 100644
--- a/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs
+++ b/Misaki.HighPerformance.Mathematics.SPMD/WideLane.cs
@@ -8,12 +8,12 @@ namespace Misaki.HighPerformance.Mathematics.SPMD;
public static unsafe class WideLane
{
- internal static readonly uint* s_shuffleTable512_32bit;
- internal static readonly ulong* s_shuffleTable512_64bit;
- internal static readonly uint* s_shuffleTable256_32bit;
- internal static readonly ulong* s_shuffleTable256_64bit;
- internal static readonly uint* s_shuffleTable128_32bit;
- internal static readonly ulong* s_shuffleTable128_64bit;
+ internal static readonly uint* s_pShuffleTable512_32bit;
+ internal static readonly ulong* s_pShuffleTable512_64bit;
+ internal static readonly uint* s_pShuffleTable256_32bit;
+ internal static readonly ulong* s_pShuffleTable256_64bit;
+ internal static readonly uint* s_pShuffleTable128_32bit;
+ internal static readonly ulong* s_pShuffleTable128_64bit;
///
/// Gets whether WideLane is supported on the current hardware.
@@ -22,12 +22,12 @@ public static unsafe class WideLane
static WideLane()
{
- s_shuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit();
- s_shuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit();
- s_shuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit();
- s_shuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit();
- s_shuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit();
- s_shuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit();
+ s_pShuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit();
+ s_pShuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit();
+ s_pShuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit();
+ s_pShuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit();
+ s_pShuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit();
+ s_pShuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit();
}
}
@@ -69,6 +69,12 @@ public readonly unsafe partial struct WideLane : ISPMDLane Create(TNumber.MaxValue);
}
+ public static WideLane AllBitsSet
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ get => Create(TNumber.AllBitsSet);
+ }
+
public readonly TNumber this[int index]
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -194,48 +200,26 @@ public readonly unsafe partial struct WideLane : ISPMDLane MaskLoad(WideLane mask, ref TNumber value)
{
- return MaskLoad(mask, (TNumber*)Unsafe.AsPointer(ref value));
+ var vector = Vector.LoadUnsafe(ref value);
+ return new WideLane(Vector.ConditionalSelect(mask.value, vector, Vector.Zero));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane MaskLoad(WideLane mask, TNumber* pValue)
{
- var vector = Vector.Load(pValue);
- return new WideLane(Vector.ConditionalSelect(mask.value, vector, Vector.Zero));
+ return MaskLoad(mask, ref Unsafe.AsRef(pValue));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Gather(TNumber* pData, WideLane indices, int scale)
{
- Unsafe.SkipInit(out Vector result);
-
- var pResult = (TNumber*)&result;
- var pIndices = (TNumber*)&indices;
-
- var count = Vector.Count;
- for (var i = 0; i < count; i++)
- {
- var idx = int.CreateTruncating(pIndices[i]);
- pResult[i] = pData[idx * scale / sizeof(TNumber)];
- }
-
- return new WideLane(result);
+ return Gather(ref Unsafe.AsRef(pData), indices, scale);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Gather(TNumber* pData, int* pIndices, int scale)
{
- Unsafe.SkipInit(out Vector result);
-
- var pResult = (TNumber*)&result;
-
- var count = Vector.Count;
- for (var i = 0; i < count; i++)
- {
- pResult[i] = pData[pIndices[i] * scale / sizeof(TNumber)];
- }
-
- return new WideLane(result);
+ return Gather(ref Unsafe.AsRef(pData), ref Unsafe.AsRef(pIndices), scale);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -287,12 +271,6 @@ public readonly unsafe partial struct WideLane : ISPMDLane mask, ref TNumber destination)
- {
- return CompressStore(mask, (TNumber*)Unsafe.AsPointer(in destination));
- }
-
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- public int CompressStore(WideLane mask, TNumber* pDestination)
{
if (LaneWidth == Vector512.Count && Vector512.IsHardwareAccelerated)
{
@@ -303,10 +281,10 @@ public readonly unsafe partial struct WideLane : ISPMDLane(ref destination));
return BitOperations.PopCount(moveMask);
}
@@ -317,10 +295,10 @@ public readonly unsafe partial struct WideLane : ISPMDLane(ref destination));
return BitOperations.PopCount(moveMask);
}
}
@@ -333,10 +311,10 @@ public readonly unsafe partial struct WideLane : ISPMDLane(ref destination));
return BitOperations.PopCount(moveMask);
}
@@ -349,10 +327,10 @@ public readonly unsafe partial struct WideLane : ISPMDLane(ref destination));
return BitOperations.PopCount(moveMask);
}
}
@@ -365,10 +343,10 @@ public readonly unsafe partial struct WideLane : ISPMDLane(ref destination));
return BitOperations.PopCount(moveMask);
}
@@ -378,9 +356,9 @@ public readonly unsafe partial struct WideLane : ISPMDLane, Vector128>(ref mask);
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 2) because each control vector has 2 elements
- var shuffle = Vector128.Load(WideLane.s_shuffleTable128_64bit + (moveMask * 2));
+ var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_64bit + (moveMask * 2));
var compressed = Vector128.Shuffle(vec, shuffle);
- compressed.Store((ulong*)pDestination);
+ compressed.StoreUnsafe(ref Unsafe.As(ref destination));
return BitOperations.PopCount(moveMask);
}
}
@@ -390,15 +368,21 @@ public readonly unsafe partial struct WideLane : ISPMDLane mask, TNumber* pDestination)
+ {
+ return CompressStore(mask, ref Unsafe.AsRef(pDestination));
+ }
+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly Vector AsVector()
{
@@ -617,6 +601,7 @@ public readonly unsafe partial struct WideLane : ISPMDLane Sin(WideLane value)
{
+#if MHP_FASTMATH
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
var x_sin = value;
@@ -644,11 +629,28 @@ public readonly unsafe partial struct WideLane : ISPMDLane, Vector>(ref value);
+ var result = Vector.Sin(v);
+ return new WideLane(Unsafe.As, Vector>(ref result));
+ }
+ else if (typeof(TNumber) == typeof(double))
+ {
+ ref var v = ref Unsafe.As, Vector>(ref value);
+ var result = Vector.Sin(v);
+ return new WideLane(Unsafe.As, Vector>(ref result));
+ }
+
+ return value;
+#endif
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Cos(WideLane value)
{
+#if MHP_FASTMATH
var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
@@ -677,11 +679,30 @@ public readonly unsafe partial struct WideLane : ISPMDLane, Vector>(ref value);
+ var result = Vector.Cos(v);
+ return new WideLane(Unsafe.As, Vector>(ref result));
+ }
+ else if (typeof(TNumber) == typeof(double))
+ {
+ ref var v = ref Unsafe.As, Vector>(ref value);
+ var result = Vector.Cos(v);
+ return new WideLane(Unsafe.As, Vector>(ref result));
+ }
+
+ return value;
+#endif
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void SinCos(WideLane value, out WideLane sin, out WideLane cos)
{
+#if MHP_FASTMATH
+ // We use Taylor/Remez polynomial approximation for Sin(PI * z) and Cos(PI * z) on the reduced range of z in [-0.5, 0.5].
+
var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
@@ -741,6 +762,27 @@ public readonly unsafe partial struct WideLane : ISPMDLane, Vector>(ref value);
+ var (sinResult, cosResult) = Vector.SinCos(v);
+ sin = new WideLane(Unsafe.As, Vector>(ref sinResult));
+ cos = new WideLane(Unsafe.As, Vector>(ref cosResult));
+ }
+ else if (typeof(TNumber) == typeof(double))
+ {
+ ref var v = ref Unsafe.As, Vector>(ref value);
+ var (sinResult, cosResult) = Vector.SinCos(v);
+ sin = new WideLane(Unsafe.As, Vector>(ref sinResult));
+ cos = new WideLane(Unsafe.As, Vector>(ref cosResult));
+ }
+ else
+ {
+ sin = value;
+ cos = value;
+ }
+#endif
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -799,7 +841,7 @@ public readonly unsafe partial struct WideLane : ISPMDLane : ISPMDLane : ISPMDLane.Count)
{
ref var vf = ref Unsafe.As, Vector128>(ref value);
- var result = Sse.Reciprocal(vf);
- return Unsafe.As, WideLane>(ref result);
+ var x0 = Sse.Reciprocal(vf);
+#if MHP_FASTMATH
+ return Unsafe.As, WideLane>(ref x0);
+#else
+ // SSE and AVX provide fast approximate reciprocal instructions but the precision is very low (11 bits).
+ // In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
+ var x1 = x0 * (Vector128.Create(2.0f) - x0 * vf);
+ return Unsafe.As, WideLane>(ref x1);
+#endif
}
else if (Avx.IsSupported && LaneWidth == Vector256.Count)
{
ref var vf = ref Unsafe.As, Vector256>(ref value);
- var result = Avx.Reciprocal(vf);
- return Unsafe.As, WideLane>(ref result);
+ var x0 = Avx.Reciprocal(vf);
+#if MHP_FASTMATH
+ return Unsafe.As, WideLane>(ref x0);
+#else
+ // SSE and AVX provide fast approximate reciprocal instructions but the precision is very low (11 bits).
+ // In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
+ var x1 = x0 * (Vector256.Create(2.0f) - x0 * vf);
+ return Unsafe.As, WideLane>(ref x1);
+#endif
}
}
@@ -1039,14 +1095,28 @@ public readonly unsafe partial struct WideLane : ISPMDLane.Count)
{
ref var vf = ref Unsafe.As, Vector128>(ref value);
- var result = Sse.ReciprocalSqrt(vf);
- return Unsafe.As, WideLane>(ref result);
+ var x0 = Sse.ReciprocalSqrt(vf);
+#if MHP_FASTMATH
+ return Unsafe.As, WideLane>(ref x0);
+#else
+ // SSE and AVX provide fast approximate reciprocal sqrt instructions but the precision is very low (11 bits).
+ // In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
+ var x1 = x0 * Vector128.Create(0.5f) * (Vector128.Create(3.0f) - (vf * x0 * x0));
+ return Unsafe.As, WideLane>(ref x1);
+#endif
}
else if (Avx.IsSupported && LaneWidth == Vector256.Count)
{
ref var vf = ref Unsafe.As, Vector256>(ref value);
- var result = Avx.ReciprocalSqrt(vf);
- return Unsafe.As, WideLane>(ref result);
+ var x0 = Avx.ReciprocalSqrt(vf);
+#if MHP_FASTMATH
+ return Unsafe.As, WideLane>(ref x0);
+#else
+ // SSE and AVX provide fast approximate reciprocal sqrt instructions but the precision is very low (11 bits).
+ // In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
+ var x1 = x0 * Vector256.Create(0.5f) * (Vector256.Create(3.0f) - (vf * x0 * x0));
+ return Unsafe.As, WideLane>(ref x1);
+#endif
}
}