Add AllBitsSet, refactor WideLane, improve math paths

- Add static AllBitsSet property to ISPMDLane and implement in ScalarLane and WideLane
- Refactor WideLane shuffle table pointers and update usages
- Improve pointer safety and mask handling in CompressStore, Gather, and MaskLoad
- Enhance Sin, Cos, SinCos with fast-math and hardware fallback
- Add Newton-Raphson refinement for reciprocal/sqrt when not fast-math
- Optimize MathV.Vector vector loading (struct init, pointer ops)
- Update project file: version 1.3.4, content packaging, AOT settings
- Minor code cleanup and naming consistency fixes
This commit is contained in:
2026-05-01 12:19:58 +09:00
parent 5b4832a886
commit 18a181f57a
7 changed files with 209 additions and 124 deletions

View File

@@ -8,12 +8,12 @@ namespace Misaki.HighPerformance.Mathematics.SPMD;
public static unsafe class WideLane
{
internal static readonly uint* s_shuffleTable512_32bit;
internal static readonly ulong* s_shuffleTable512_64bit;
internal static readonly uint* s_shuffleTable256_32bit;
internal static readonly ulong* s_shuffleTable256_64bit;
internal static readonly uint* s_shuffleTable128_32bit;
internal static readonly ulong* s_shuffleTable128_64bit;
internal static readonly uint* s_pShuffleTable512_32bit;
internal static readonly ulong* s_pShuffleTable512_64bit;
internal static readonly uint* s_pShuffleTable256_32bit;
internal static readonly ulong* s_pShuffleTable256_64bit;
internal static readonly uint* s_pShuffleTable128_32bit;
internal static readonly ulong* s_pShuffleTable128_64bit;
/// <summary>
/// Gets whether WideLane is supported on the current hardware.
@@ -22,12 +22,12 @@ public static unsafe class WideLane
static WideLane()
{
s_shuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit();
s_shuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit();
s_shuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit();
s_shuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit();
s_shuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit();
s_shuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit();
s_pShuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit();
s_pShuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit();
s_pShuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit();
s_pShuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit();
s_pShuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit();
s_pShuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit();
}
}
@@ -69,6 +69,12 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
get => Create(TNumber.MaxValue);
}
/// <summary>
/// Gets the lane produced by passing <c>TNumber.AllBitsSet</c> to <c>Create</c>.
/// </summary>
/// <remarks>
/// NOTE(review): <c>Create</c> is defined outside this view; presumably it broadcasts
/// the scalar to every element, giving a lane with all bits set - confirm.
/// </remarks>
public static WideLane<TNumber> AllBitsSet
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        return Create(TNumber.AllBitsSet);
    }
}
public readonly TNumber this[int index]
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -194,48 +200,26 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> MaskLoad(WideLane<TNumber> mask, ref TNumber value)
{
return MaskLoad(mask, (TNumber*)Unsafe.AsPointer(ref value));
var vector = Vector.LoadUnsafe(ref value);
return new WideLane<TNumber>(Vector.ConditionalSelect(mask.value, vector, Vector<TNumber>.Zero));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> MaskLoad(WideLane<TNumber> mask, TNumber* pValue)
{
var vector = Vector.Load(pValue);
return new WideLane<TNumber>(Vector.ConditionalSelect(mask.value, vector, Vector<TNumber>.Zero));
return MaskLoad(mask, ref Unsafe.AsRef<TNumber>(pValue));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> Gather(TNumber* pData, WideLane<TNumber> indices, int scale)
{
Unsafe.SkipInit(out Vector<TNumber> result);
var pResult = (TNumber*)&result;
var pIndices = (TNumber*)&indices;
var count = Vector<TNumber>.Count;
for (var i = 0; i < count; i++)
{
var idx = int.CreateTruncating(pIndices[i]);
pResult[i] = pData[idx * scale / sizeof(TNumber)];
}
return new WideLane<TNumber>(result);
return Gather(ref Unsafe.AsRef<TNumber>(pData), indices, scale);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> Gather(TNumber* pData, int* pIndices, int scale)
{
Unsafe.SkipInit(out Vector<TNumber> result);
var pResult = (TNumber*)&result;
var count = Vector<TNumber>.Count;
for (var i = 0; i < count; i++)
{
pResult[i] = pData[pIndices[i] * scale / sizeof(TNumber)];
}
return new WideLane<TNumber>(result);
return Gather(ref Unsafe.AsRef<TNumber>(pData), ref Unsafe.AsRef<int>(pIndices), scale);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -287,12 +271,6 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int CompressStore(WideLane<TNumber> mask, ref TNumber destination)
{
return CompressStore(mask, (TNumber*)Unsafe.AsPointer(in destination));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int CompressStore(WideLane<TNumber> mask, TNumber* pDestination)
{
if (LaneWidth == Vector512<TNumber>.Count && Vector512.IsHardwareAccelerated)
{
@@ -303,10 +281,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 16) because each control vector has 16 elements
var shuffle = Vector512.Load(WideLane.s_shuffleTable512_32bit + (moveMask * 16));
var shuffle = Vector512.Load(WideLane.s_pShuffleTable512_32bit + (moveMask * 16));
var compressed = Vector512.Shuffle(vec, shuffle);
compressed.Store((uint*)pDestination);
compressed.StoreUnsafe(ref Unsafe.As<TNumber, uint>(ref destination));
return BitOperations.PopCount(moveMask);
}
@@ -317,10 +295,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 8) because each control vector has 8 elements
var shuffle = Vector512.Load(WideLane.s_shuffleTable512_64bit + (moveMask * 8));
var shuffle = Vector512.Load(WideLane.s_pShuffleTable512_64bit + (moveMask * 8));
var compressed = Vector512.Shuffle(vec, shuffle);
compressed.Store((ulong*)pDestination);
compressed.StoreUnsafe(ref Unsafe.As<TNumber, ulong>(ref destination));
return BitOperations.PopCount(moveMask);
}
}
@@ -333,10 +311,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 8) because each control vector has 8 elements
var shuffle = Vector256.Load(WideLane.s_shuffleTable256_32bit + (moveMask * 8));
var shuffle = Vector256.Load(WideLane.s_pShuffleTable256_32bit + (moveMask * 8));
var compressed = Vector256.Shuffle(vec, shuffle);
compressed.Store((uint*)pDestination);
compressed.StoreUnsafe(ref Unsafe.As<TNumber, uint>(ref destination));
return BitOperations.PopCount(moveMask);
}
@@ -349,10 +327,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 4) because each control vector has 4 elements
var shuffle = Vector256.Load(WideLane.s_shuffleTable256_64bit + (moveMask * 4));
var shuffle = Vector256.Load(WideLane.s_pShuffleTable256_64bit + (moveMask * 4));
var compressed = Vector256.Shuffle(vec, shuffle);
compressed.Store((ulong*)pDestination);
compressed.StoreUnsafe(ref Unsafe.As<TNumber, ulong>(ref destination));
return BitOperations.PopCount(moveMask);
}
}
@@ -365,10 +343,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 4) because each control vector has 4 elements
var shuffle = Vector128.Load(WideLane.s_shuffleTable128_32bit + (moveMask * 4));
var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_32bit + (moveMask * 4));
var compressed = Vector128.Shuffle(vec, shuffle);
compressed.Store((uint*)pDestination);
compressed.StoreUnsafe(ref Unsafe.As<TNumber, uint>(ref destination));
return BitOperations.PopCount(moveMask);
}
@@ -378,9 +356,9 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var m = Unsafe.As<WideLane<TNumber>, Vector128<ulong>>(ref mask);
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 2) because each control vector has 2 elements
var shuffle = Vector128.Load(WideLane.s_shuffleTable128_64bit + (moveMask * 2));
var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_64bit + (moveMask * 2));
var compressed = Vector128.Shuffle(vec, shuffle);
compressed.Store((ulong*)pDestination);
compressed.StoreUnsafe(ref Unsafe.As<TNumber, ulong>(ref destination));
return BitOperations.PopCount(moveMask);
}
}
@@ -390,15 +368,21 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var count = 0;
for (var i = 0; i < LaneWidth; i++)
{
if (mask.value[i] == ~TNumber.Zero)
if (mask.value[i] == TNumber.AllBitsSet)
{
pDestination[count++] = value[i];
Unsafe.Add(ref destination, count++) = value[i];
}
}
return count;
}
/// <summary>
/// Pointer overload of <c>CompressStore</c>: turns <paramref name="pDestination"/> into a
/// reference with <c>Unsafe.AsRef</c> and forwards to the <c>ref TNumber</c> overload.
/// </summary>
/// <param name="mask">Lane mask, passed through unchanged to the ref-based overload.</param>
/// <param name="pDestination">Address of the first destination element.</param>
/// <returns>Whatever the ref-based overload returns for the same arguments.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int CompressStore(WideLane<TNumber> mask, TNumber* pDestination)
{
    // All of the actual work lives in the ref-based overload; this one only adapts the pointer.
    ref TNumber destination = ref Unsafe.AsRef<TNumber>(pDestination);
    return CompressStore(mask, ref destination);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly Vector<TNumber> AsVector()
{
@@ -617,6 +601,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> Sin(WideLane<TNumber> value)
{
#if MHP_FASTMATH
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
var x_sin = value;
@@ -644,11 +629,28 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
poly_sin = z_sin * poly_sin; // z * (...)
return poly_sin * sign_sin;
#else
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<float>>(ref value);
var result = Vector.Sin(v);
return new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref result));
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<double>>(ref value);
var result = Vector.Sin(v);
return new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref result));
}
return value;
#endif
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> Cos(WideLane<TNumber> value)
{
#if MHP_FASTMATH
var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
@@ -677,11 +679,30 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
poly_cos = z_cos * poly_cos;
return poly_cos * sign_cos;
#else
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<float>>(ref value);
var result = Vector.Cos(v);
return new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref result));
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<double>>(ref value);
var result = Vector.Cos(v);
return new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref result));
}
return value;
#endif
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void SinCos(WideLane<TNumber> value, out WideLane<TNumber> sin, out WideLane<TNumber> cos)
{
#if MHP_FASTMATH
// We use Taylor/Remez polynomial approximation for Sin(PI * z) and Cos(PI * z) on the reduced range of z in [-0.5, 0.5].
var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
@@ -741,6 +762,27 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
sin = poly_sin * sign_sin;
cos = poly_cos * sign_cos;
#else
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<float>>(ref value);
var (sinResult, cosResult) = Vector.SinCos(v);
sin = new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref sinResult));
cos = new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref cosResult));
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<double>>(ref value);
var (sinResult, cosResult) = Vector.SinCos(v);
sin = new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref sinResult));
cos = new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref cosResult));
}
else
{
sin = value;
cos = value;
}
#endif
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -799,7 +841,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var pi = Create(TNumber.CreateTruncating(Math.PI));
var isNegative = LessThan(value, Zero);
return Select(isNegative, pi - result, result);
}
@@ -999,7 +1041,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
One,
Select(
LessThan(value, Zero),
~Zero,
AllBitsSet,
Zero));
}
@@ -1017,14 +1059,28 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
if (Sse.IsSupported && LaneWidth == Vector128<float>.Count)
{
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector128<float>>(ref value);
var result = Sse.Reciprocal(vf);
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref result);
var x0 = Sse.Reciprocal(vf);
#if MHP_FASTMATH
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x0);
#else
// SSE and AVX provide fast approximate reciprocal instructions but the precision is very low (11 bits).
// In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
var x1 = x0 * (Vector128.Create(2.0f) - x0 * vf);
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x1);
#endif
}
else if (Avx.IsSupported && LaneWidth == Vector256<float>.Count)
{
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector256<float>>(ref value);
var result = Avx.Reciprocal(vf);
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref result);
var x0 = Avx.Reciprocal(vf);
#if MHP_FASTMATH
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x0);
#else
// SSE and AVX provide fast approximate reciprocal instructions but the precision is very low (11 bits).
// In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
var x1 = x0 * (Vector256.Create(2.0f) - x0 * vf);
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x1);
#endif
}
}
@@ -1039,14 +1095,28 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
if (Sse.IsSupported && LaneWidth == Vector128<float>.Count)
{
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector128<float>>(ref value);
var result = Sse.ReciprocalSqrt(vf);
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref result);
var x0 = Sse.ReciprocalSqrt(vf);
#if MHP_FASTMATH
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x0);
#else
// SSE and AVX provide fast approximate reciprocal sqrt instructions but the precision is very low (11 bits).
// In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
var x1 = x0 * Vector128.Create(0.5f) * (Vector128.Create(3.0f) - (vf * x0 * x0));
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x1);
#endif
}
else if (Avx.IsSupported && LaneWidth == Vector256<float>.Count)
{
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector256<float>>(ref value);
var result = Avx.ReciprocalSqrt(vf);
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref result);
var x0 = Avx.ReciprocalSqrt(vf);
#if MHP_FASTMATH
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x0);
#else
// SSE and AVX provide fast approximate reciprocal sqrt instructions but the precision is very low (11 bits).
// In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
var x1 = x0 * Vector256.Create(0.5f) * (Vector256.Create(3.0f) - (vf * x0 * x0));
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x1);
#endif
}
}