using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace Misaki.HighPerformance.Mathematics.SPMD;
/// <summary>
/// Non-generic companion of the WideLane struct. Holds the shuffle lookup tables that
/// CompressStore reads and reports whether hardware vector acceleration is available.
/// </summary>
public static unsafe class WideLane
{
// Shuffle control tables, one per (vector width, element size) pair. CompressStore reads them as
// table + (moveMask * lanesPerVector), i.e. one control vector per possible lane mask.
// NOTE(review): how these buffers are allocated and how long they live is decided by
// ShuffleTableGenerator, which is not visible here - presumably process-lifetime memory; confirm.
internal static readonly uint* s_shuffleTable512_32bit;
internal static readonly ulong* s_shuffleTable512_64bit;
internal static readonly uint* s_shuffleTable256_32bit;
internal static readonly ulong* s_shuffleTable256_64bit;
internal static readonly uint* s_shuffleTable128_32bit;
internal static readonly ulong* s_shuffleTable128_64bit;
/// <summary>
/// Gets whether WideLane is supported on the current hardware.
/// </summary>
/// <remarks>Forwards Vector.IsHardwareAccelerated.</remarks>
public static bool IsSupported => Vector.IsHardwareAccelerated;
// Fills all six shuffle tables when the type is initialized.
static WideLane()
{
s_shuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit();
s_shuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit();
s_shuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit();
s_shuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit();
s_shuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit();
s_shuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit();
}
}
[StructLayout(LayoutKind.Sequential)]
public readonly unsafe struct WideLane : ISPMD, TNumber>
where TNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators
{
// Lane-index vector (0, 1, 2, ...), filled in by the static constructor and multiplied by the step in Sequence.
private static readonly Vector s_indices;
/// <summary>The underlying vector that holds one TNumber per lane.</summary>
public readonly Vector value;
/// <summary>Number of lanes in one WideLane, taken from the element count of the underlying vector type.</summary>
public static int LaneWidth
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        return Vector.Count;
    }
}
/// <summary>A WideLane wrapping the zero vector.</summary>
public static WideLane Zero
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        return new(Vector.Zero);
    }
}
/// <summary>A WideLane wrapping the all-ones-valued vector (Vector.One).</summary>
public static WideLane One
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        return new(Vector.One);
    }
}
/// <summary>A WideLane created from TNumber.MinValue.</summary>
public static WideLane MinValue
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        return Create(TNumber.MinValue);
    }
}
/// <summary>A WideLane created from TNumber.MaxValue.</summary>
public static WideLane MaxValue
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        return Create(TNumber.MaxValue);
    }
}
/// <summary>Reads a single lane of the underlying vector.</summary>
/// <param name="index">Lane position, forwarded unchanged to the vector indexer.</param>
public readonly TNumber this[int index]
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        return value[index];
    }
}
// Builds the lane-index vector (0, 1, 2, ...) that Sequence scales by the step.
static WideLane()
{
    var laneCount = LaneWidth;
    var pLaneIndices = stackalloc TNumber[laneCount];

    var lane = 0;
    while (lane < laneCount)
    {
        pLaneIndices[lane] = TNumber.CreateTruncating(lane);
        lane++;
    }

    s_indices = Vector.Load(pLaneIndices);
}
/// <summary>Wraps the given vector as a WideLane.</summary>
/// <param name="value">Vector stored in the value field.</param>
public WideLane(Vector value) => this.value = value;
/// <summary>
/// Floors every lane when TNumber is float or double; for any other TNumber the input vector is
/// returned unchanged.
/// </summary>
/// <param name="vector">Vector to floor.</param>
/// <returns>The floored vector, or the input itself for non-floating-point lane types.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector VectorFloor(Vector vector)
{
// The branch is picked by comparing typeof(TNumber); the vector is reinterpreted in place through
// Unsafe.As (no lane conversion) so that Vector.Floor can be called on it.
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As, Vector>(ref vector);
var floored = Vector.Floor(v);
return Unsafe.As, Vector>(ref floored);
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As, Vector>(ref vector);
var floored = Vector.Floor(v);
return Unsafe.As, Vector>(ref floored);
}
// Neither float nor double: nothing to floor.
return vector;
}
/// <summary>
/// Truncates every lane (via Vector.Truncate) when TNumber is float or double; for any other
/// TNumber the input vector is returned unchanged.
/// </summary>
/// <param name="vector">Vector to truncate.</param>
/// <returns>The truncated vector, or the input itself for non-floating-point lane types.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector VectorTruncate(Vector vector)
{
// Same shape as VectorFloor: branch on typeof(TNumber), reinterpret through Unsafe.As, call the
// vector routine, reinterpret back.
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As, Vector>(ref vector);
var truncated = Vector.Truncate(v);
return Unsafe.As, Vector>(ref truncated);
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As, Vector>(ref vector);
var truncated = Vector.Truncate(v);
return Unsafe.As, Vector>(ref truncated);
}
// Neither float nor double: nothing to truncate.
return vector;
}
/// <summary>Creates a WideLane from a single scalar via Vector.Create.</summary>
/// <param name="value">Scalar passed to Vector.Create.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Create(TNumber value) => new(Vector.Create(value));
/// <summary>Creates a WideLane from a span of lane values via Vector.Create.</summary>
/// <param name="values">Lane values, forwarded unchanged; any length requirement is the one imposed by Vector.Create.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Create(params ReadOnlySpan values) => new(Vector.Create(values));
/// <summary>Wraps an existing vector; equivalent to calling the constructor.</summary>
/// <param name="value">Vector to wrap.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Create(Vector value) => new(value);
/// <summary>
/// Builds an arithmetic sequence across the lanes: lane i holds start + step * i.
/// </summary>
/// <param name="start">Value of lane 0.</param>
/// <param name="step">Difference between consecutive lanes.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Sequence(TNumber start, TNumber step)
{
    var origin = Vector.Create(start);
    var stride = Vector.Create(step);
    return new(origin + (stride * s_indices));
}
/// <summary>Loads a WideLane starting at the referenced element using Vector.LoadUnsafe.</summary>
/// <param name="value">Reference to the first element to read.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Load(ref TNumber value) => new(Vector.LoadUnsafe(ref value));
/// <summary>Loads a WideLane from unmanaged memory using Vector.Load.</summary>
/// <param name="pValue">Pointer to the first element to read.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Load(TNumber* pValue) => new(Vector.Load(pValue));
/// <summary>Writes all lanes starting at the referenced element using StoreUnsafe.</summary>
/// <param name="destination">Reference to the first element to overwrite.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly void Store(ref TNumber destination) => value.StoreUnsafe(ref destination);
/// <summary>Writes all lanes to unmanaged memory using Store.</summary>
/// <param name="pDestination">Pointer to the first element to overwrite.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly void Store(TNumber* pDestination) => value.Store(pDestination);
/// <summary>
/// Compress-stores the lanes selected by mask starting at the referenced element.
/// Forwards to the pointer overload.
/// </summary>
/// <param name="mask">Lane mask passed through to the pointer overload.</param>
/// <param name="destination">Reference to the first element to overwrite.</param>
/// <returns>The count returned by the pointer overload.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int CompressStore(WideLane mask, ref TNumber destination)
{
    // Pin the target for the duration of the store. Unsafe.AsPointer does not pin, so a destination
    // living inside a managed array could be relocated by the GC while the raw pointer is in use.
    fixed (TNumber* pDestination = &destination)
    {
        return CompressStore(mask, pDestination);
    }
}
/// <summary>
/// Packs the lanes selected by mask to the front and writes them to pDestination.
/// </summary>
/// <param name="mask">Lane mask; a lane is selected when its most significant bit is set.</param>
/// <param name="pDestination">
/// Destination buffer. The accelerated paths store a whole vector regardless of how many lanes are
/// selected, so the buffer must have room for LaneWidth elements.
/// </param>
/// <returns>The number of selected lanes.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int CompressStore(WideLane mask, TNumber* pDestination)
{
    var size = sizeof(TNumber);

    // Accelerated paths exist only for 4- and 8-byte lanes: the lane mask is turned into an integer
    // bit mask, which selects a precomputed shuffle control vector from the matching table.
    if (LaneWidth == Vector512.Count && Vector512.IsHardwareAccelerated)
    {
        if (size == 4)
        {
            ref var vec = ref Unsafe.As, Vector512>(ref Unsafe.AsRef(in this));
            var m = Unsafe.As, Vector512>(ref mask);
            var moveMask = m.ExtractMostSignificantBits();
            // Offset is (moveMask * 16) because each control vector has 16 elements
            var shuffle = Vector512.Load(WideLane.s_shuffleTable512_32bit + (moveMask * 16));
            var compressed = Vector512.Shuffle(vec, shuffle);
            compressed.Store((uint*)pDestination);
            return BitOperations.PopCount(moveMask);
        }
        if (size == 8)
        {
            ref var vec = ref Unsafe.As, Vector512>(ref Unsafe.AsRef(in this));
            var m = Unsafe.As, Vector512>(ref mask);
            var moveMask = m.ExtractMostSignificantBits();
            // Offset is (moveMask * 8) because each control vector has 8 elements
            var shuffle = Vector512.Load(WideLane.s_shuffleTable512_64bit + (moveMask * 8));
            var compressed = Vector512.Shuffle(vec, shuffle);
            compressed.Store((ulong*)pDestination);
            return BitOperations.PopCount(moveMask);
        }
    }
    else if (LaneWidth == Vector256.Count && Vector256.IsHardwareAccelerated)
    {
        if (size == 4)
        {
            ref var vec = ref Unsafe.As, Vector256>(ref Unsafe.AsRef(in this));
            var m = Unsafe.As, Vector256>(ref mask);
            var moveMask = m.ExtractMostSignificantBits();
            // Offset is (moveMask * 8) because each control vector has 8 elements
            var shuffle = Vector256.Load(WideLane.s_shuffleTable256_32bit + (moveMask * 8));
            var compressed = Vector256.Shuffle(vec, shuffle);
            compressed.Store((uint*)pDestination);
            return BitOperations.PopCount(moveMask);
        }
        if (size == 8)
        {
            ref var vec = ref Unsafe.As, Vector256>(ref Unsafe.AsRef(in this));
            var m = Unsafe.As, Vector256>(ref mask);
            // For 64-bit, ExtractMostSignificantBits only populates 4 bits (0-15)
            var moveMask = m.ExtractMostSignificantBits();
            // Offset is (moveMask * 4) because each control vector has 4 elements
            var shuffle = Vector256.Load(WideLane.s_shuffleTable256_64bit + (moveMask * 4));
            var compressed = Vector256.Shuffle(vec, shuffle);
            compressed.Store((ulong*)pDestination);
            return BitOperations.PopCount(moveMask);
        }
    }
    else if (LaneWidth == Vector128.Count && Vector128.IsHardwareAccelerated)
    {
        if (size == 4)
        {
            ref var vec = ref Unsafe.As, Vector128>(ref Unsafe.AsRef(in this));
            var m = Unsafe.As, Vector128>(ref mask);
            var moveMask = m.ExtractMostSignificantBits();
            // Offset is (moveMask * 4) because each control vector has 4 elements
            var shuffle = Vector128.Load(WideLane.s_shuffleTable128_32bit + (moveMask * 4));
            var compressed = Vector128.Shuffle(vec, shuffle);
            compressed.Store((uint*)pDestination);
            return BitOperations.PopCount(moveMask);
        }
        if (size == 8)
        {
            ref var vec = ref Unsafe.As, Vector128>(ref Unsafe.AsRef(in this));
            var m = Unsafe.As, Vector128>(ref mask);
            var moveMask = m.ExtractMostSignificantBits();
            // Offset is (moveMask * 2) because each control vector has 2 elements
            var shuffle = Vector128.Load(WideLane.s_shuffleTable128_64bit + (moveMask * 2));
            var compressed = Vector128.Shuffle(vec, shuffle);
            compressed.Store((ulong*)pDestination);
            return BitOperations.PopCount(moveMask);
        }
    }

    // Scalar fallback (also reached for 1- and 2-byte lanes): slow, but correct on any hardware.
    // A lane is selected when its most significant bit is set, which is the same test the
    // accelerated paths perform through ExtractMostSignificantBits. Comparing the lane against
    // ~TNumber.Zero instead would never match for float/double, because the all-ones bit pattern
    // is NaN and NaN does not compare equal to itself.
    var count = 0;
    for (var i = 0; i < LaneWidth; i++)
    {
        var selected = size switch
        {
            1 => Vector.AsVectorSByte(mask.value)[i] < 0,
            2 => Vector.AsVectorInt16(mask.value)[i] < 0,
            4 => Vector.AsVectorInt32(mask.value)[i] < 0,
            8 => Vector.AsVectorInt64(mask.value)[i] < 0,
            _ => mask.value[i] == ~TNumber.Zero
        };

        if (selected)
        {
            pDestination[count++] = value[i];
        }
    }
    return count;
}
/// <summary>Returns the underlying vector.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly Vector AsVector() => value;
/// <summary>Lane-wise addition.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator +(WideLane a, WideLane b) => new(a.value + b.value);
/// <summary>Lane-wise subtraction.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator -(WideLane a, WideLane b) => new(a.value - b.value);
/// <summary>Lane-wise multiplication.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator *(WideLane a, WideLane b) => new(a.value * b.value);
/// <summary>Lane-wise division.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator /(WideLane a, WideLane b) => new(a.value / b.value);
/// <summary>
/// Lane-wise remainder computed as a - floor(a / b) * b.
/// </summary>
/// <remarks>
/// For float and double lanes the quotient is floored, so the result takes the sign of b
/// (floored modulo), unlike the scalar C# % operator whose result takes the sign of a.
/// For every other lane type VectorFloor returns its input, so this reduces to a - (a / b) * b.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator %(WideLane a, WideLane b)
{
    var quotient = VectorFloor(a.value / b.value);
    return new(a.value - quotient * b.value);
}
/// <summary>Lane-wise negation.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator -(WideLane a) => new(-a.value);
/// <summary>Bitwise AND of the two underlying vectors.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator &(WideLane a, WideLane b) => new(a.value & b.value);
/// <summary>Bitwise OR of the two underlying vectors.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator |(WideLane a, WideLane b) => new(a.value | b.value);
/// <summary>Bitwise XOR of the two underlying vectors.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator ^(WideLane a, WideLane b) => new(a.value ^ b.value);
/// <summary>Bitwise complement of the underlying vector.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator ~(WideLane a) => new(~a.value);
/// <summary>Lane-wise equality; returns the mask produced by Equal rather than a bool.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator ==(WideLane a, WideLane b) => Equal(a, b);
/// <summary>Lane-wise inequality; the bitwise complement of the mask produced by Equal.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator !=(WideLane a, WideLane b)
{
    var equalMask = Equal(a, b);
    return ~equalMask;
}
/// <summary>Lane-wise a &gt; b; returns the mask produced by GreaterThan.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator >(WideLane a, WideLane b) => GreaterThan(a, b);
/// <summary>Lane-wise a &gt;= b; returns the mask produced by GreaterThanOrEqual.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator >=(WideLane a, WideLane b) => GreaterThanOrEqual(a, b);
/// <summary>Lane-wise a &lt; b; returns the mask produced by LessThan.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator <(WideLane a, WideLane b) => LessThan(a, b);
/// <summary>Lane-wise a &lt;= b; returns the mask produced by LessThanOrEqual.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator <=(WideLane a, WideLane b) => LessThanOrEqual(a, b);
/// <summary>Implicitly converts a scalar to a WideLane through Create.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static implicit operator WideLane(TNumber value) => Create(value);
/// <summary>Lane-wise absolute value via Vector.Abs.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Abs(WideLane value) => new(Vector.Abs(value.value));
/// <summary>
/// Floors every lane when TNumber is float or double; other lane types are returned unchanged.
/// </summary>
/// <param name="value">Lanes to floor.</param>
/// <returns>The floored lanes.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Floor(WideLane value)
{
    // Shares the type dispatch with Frac and operator % instead of duplicating it here.
    return new(VectorFloor(value.value));
}
/// <summary>
/// Lane-wise fractional part, value - floor(value). For lane types other than float and double
/// VectorFloor returns its input, so the result is value - value.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Frac(WideLane value)
{
    var lanes = value.value;
    return new(lanes - VectorFloor(lanes));
}
/// <summary>Lane-wise square root via Vector.SquareRoot.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Sqrt(WideLane value) => new(Vector.SquareRoot(value.value));
/// <summary>Lane-wise linear interpolation, a + (b - a) * t.</summary>
/// <param name="a">Result where t is 0.</param>
/// <param name="b">Result where t is 1.</param>
/// <param name="t">Interpolation factor; not clamped here.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Lerp(WideLane a, WideLane b, WideLane t)
{
    var span = b.value - a.value;
    return new(a.value + span * t.value);
}
/// <summary>
/// Computes a * b + c per lane. float and double lanes go through Vector.FusedMultiplyAdd;
/// every other lane type uses a separate multiply followed by an add.
/// </summary>
/// <param name="a">First factor.</param>
/// <param name="b">Second factor.</param>
/// <param name="c">Addend.</param>
/// <returns>The lane-wise result of a * b + c.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane MultipleAdd(WideLane a, WideLane b, WideLane c)
{
// The operands are reinterpreted in place through Unsafe.As so the typed vector routine can be called.
if (typeof(TNumber) == typeof(float))
{
ref var va = ref Unsafe.As, Vector>(ref a);
ref var vb = ref Unsafe.As, Vector>(ref b);
ref var vc = ref Unsafe.As, Vector>(ref c);
var result = Vector.FusedMultiplyAdd(va, vb, vc);
return new WideLane(Unsafe.As, Vector>(ref result));
}
else if (typeof(TNumber) == typeof(double))
{
ref var va = ref Unsafe.As, Vector>(ref a);
ref var vb = ref Unsafe.As, Vector>(ref b);
ref var vc = ref Unsafe.As, Vector>(ref c);
var result = Vector.FusedMultiplyAdd(va, vb, vc);
return new WideLane(Unsafe.As, Vector>(ref result));
}
else
{
// Non-floating-point lanes: plain multiply, then add.
return new((a.value * b.value) + c.value);
}
}
/// <summary>Lane-wise minimum via Vector.Min.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Min(WideLane a, WideLane b) => new(Vector.Min(a.value, b.value));
/// <summary>Lane-wise maximum via Vector.Max.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Max(WideLane a, WideLane b) => new(Vector.Max(a.value, b.value));
/// <summary>Lane-wise clamp of value between min and max via Vector.Clamp.</summary>
/// <param name="value">Lanes to restrict.</param>
/// <param name="min">Lower bound per lane.</param>
/// <param name="max">Upper bound per lane.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Clamp(WideLane value, WideLane min, WideLane max)
    => new(Vector.Clamp(value.value, min.value, max.value));
/// <summary>Clamps every lane between TNumber.Zero and TNumber.One.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Saturate(WideLane value)
{
    var lower = Create(TNumber.Zero);
    var upper = Create(TNumber.One);
    return Clamp(value, lower, upper);
}
/// <summary>
/// Lane-wise sine via Vector.Sin for float and double lanes.
/// </summary>
/// <param name="value">Input lanes.</param>
/// <returns>The sine of each lane; for any other lane type the input is returned unchanged.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Sin(WideLane value)
{
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As, Vector>(ref value);
var result = Vector.Sin(v);
return new WideLane(Unsafe.As, Vector>(ref result));
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As, Vector>(ref value);
var result = Vector.Sin(v);
return new WideLane(Unsafe.As, Vector>(ref result));
}
// NOTE(review): non-floating-point lanes get their input back, not a sine - confirm this is intended.
return value;
}
/// <summary>
/// Lane-wise cosine via Vector.Cos for float and double lanes.
/// </summary>
/// <param name="value">Input lanes.</param>
/// <returns>The cosine of each lane; for any other lane type the input is returned unchanged.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Cos(WideLane value)
{
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As, Vector>(ref value);
var result = Vector.Cos(v);
return new WideLane(Unsafe.As, Vector>(ref result));
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As, Vector>(ref value);
var result = Vector.Cos(v);
return new WideLane(Unsafe.As, Vector>(ref result));
}
// NOTE(review): non-floating-point lanes get their input back, not a cosine - confirm this is intended.
return value;
}
/// <summary>
/// Computes sine and cosine together via Vector.SinCos for float and double lanes.
/// </summary>
/// <param name="value">Input lanes.</param>
/// <returns>
/// The (sin, cos) pair per lane; for any other lane type both tuple elements are the unchanged input.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static (WideLane sin, WideLane cos) SinCos(WideLane value)
{
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As, Vector>(ref value);
var (sin, cos) = Vector.SinCos(v);
return (new WideLane(Unsafe.As, Vector>(ref sin)), new WideLane(Unsafe.As, Vector>(ref cos)));
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As, Vector>(ref value);
var (sin, cos) = Vector.SinCos(v);
return (new WideLane(Unsafe.As, Vector>(ref sin)), new WideLane(Unsafe.As, Vector>(ref cos)));
}
// NOTE(review): non-floating-point lanes get (value, value) back - confirm this is intended.
return (value, value);
}
/// <summary>
/// Approximates the tangent of each lane with a fifth-order odd polynomial.
/// </summary>
/// <remarks>
/// The input is first reduced with value - PI * Round(value / PI), which maps it into
/// [-PI/2, PI/2]. The polynomial x + c1*x^3 + c2*x^5 is a low-order fit, so accuracy
/// degrades as the reduced argument approaches +/- PI/2.
/// </remarks>
/// <param name="value">Angle per lane.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Tan(WideLane value)
{
    // Range reduction by whole multiples of PI.
    var pi = Create(TNumber.CreateTruncating(Math.PI));
    var reduced = value - pi * Round(value / pi);
    var reducedSquared = reduced * reduced;

    // Polynomial coefficients, close to the Taylor terms 1/3 and 2/15.
    var cubicCoefficient = Create(TNumber.CreateTruncating(0.3333314036));
    var quinticCoefficient = Create(TNumber.CreateTruncating(0.1333923995));

    // Horner form: reduced * (1 + r2 * (c1 + c2 * r2)).
    var inner = MultipleAdd(reducedSquared, quinticCoefficient, cubicCoefficient);
    var outer = MultipleAdd(reducedSquared, inner, One);
    return MultipleAdd(reduced, outer, Zero);
}
/// <summary>
/// Approximates the arcsine of each lane through the identity asin(x) = PI/2 - acos(x).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Asin(WideLane value)
{
    var halfPi = Create(TNumber.CreateTruncating(Math.PI / 2));
    var arcCosine = Acos(value);
    return halfPi - arcCosine;
}
/// <summary>
/// Approximates the arccosine of each lane.
/// </summary>
/// <remarks>
/// For |value| in [0, 1] it evaluates sqrt(1 - |value|) * (c0 + c1*|value| + c2*|value|^2 + c3*|value|^3);
/// lanes whose input is negative are mirrored with acos(value) = PI - acos(-value).
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Acos(WideLane value)
{
    var magnitude = Abs(value);

    // Cubic coefficients, constant term first (c0 is close to PI/2).
    var k0 = Create(TNumber.CreateTruncating(1.5707288f));
    var k1 = Create(TNumber.CreateTruncating(-0.2121144f));
    var k2 = Create(TNumber.CreateTruncating(0.0742610f));
    var k3 = Create(TNumber.CreateTruncating(-0.0187293f));

    // Horner evaluation from the highest-order term down.
    var cubic = MultipleAdd(magnitude, k3, k2);
    cubic = MultipleAdd(magnitude, cubic, k1);
    cubic = MultipleAdd(magnitude, cubic, k0);

    var positiveBranch = cubic * Sqrt(One - magnitude);

    var pi = Create(TNumber.CreateTruncating(Math.PI));
    var inputIsNegative = LessThan(value, Zero);
    return Select(inputIsNegative, pi - positiveBranch, positiveBranch);
}
/// <summary>
/// Approximates the arctangent of each lane.
/// </summary>
/// <remarks>
/// The polynomial t * (c1 + c2*t^2) is only a usable fit for t in [0, 1]. Lanes with |value| &gt; 1
/// are therefore reduced with atan(x) = PI/2 - atan(1/x) before the polynomial is evaluated, and
/// the sign of the input is re-applied at the end (atan is odd).
/// </remarks>
/// <param name="value">Input lanes.</param>
/// <returns>The approximate angle per lane, in radians.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Atan(WideLane value)
{
    var c1 = Create(TNumber.CreateTruncating(0.97239411f));
    var c2 = Create(TNumber.CreateTruncating(-0.19194795f));
    var halfPi = Create(TNumber.CreateTruncating(Math.PI / 2));

    var magnitude = Abs(value);
    var isSteep = GreaterThan(magnitude, One);

    // Both sides of a select are evaluated, so lanes that are not steep divide by 1 instead of by
    // their own magnitude; this keeps a zero input from ever reaching the division.
    var safeDivisor = Select(isSteep, magnitude, One);
    var t = Select(isSteep, One / safeDivisor, magnitude);

    // atan(t) ~= t * (c1 + c2 * t^2) for t in [0, 1]
    var t2 = t * t;
    var angle = t * MultipleAdd(t2, c2, c1);

    // Undo the reciprocal reduction, then restore the sign of the input.
    angle = Select(isSteep, halfPi - angle, angle);
    return Select(LessThan(value, Zero), -angle, angle);
}
/// <summary>
/// Approximates atan2(y, x) per lane.
/// </summary>
/// <remarks>
/// Works on |x| and |y| so the polynomial input is always in [0, 1], then reconstructs the
/// quadrant from the signs of x and y. A lane where both inputs are zero yields 0.
/// </remarks>
/// <param name="y">Ordinate per lane.</param>
/// <param name="x">Abscissa per lane.</param>
/// <returns>The approximate angle per lane, in radians.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Atan2(WideLane y, WideLane x)
{
    var absX = Abs(x);
    var absY = Abs(y);

    // 1. Determine the ratio (input to the polynomial).
    // If |x| >= |y| we are in the "shallow" region and use |y| / |x|.
    // If |y| > |x| we are in the "steep" region and use |x| / |y| (the result is transformed below).
    var yGtX = GreaterThan(absY, absX);

    // Select numerator and denominator so the ratio is always in [0, 1].
    var num = Select(yGtX, absX, absY);
    var den = Select(yGtX, absY, absX);

    // den is the larger of |x| and |y|, so it is zero only when both inputs are zero. Dividing by 1
    // there gives t = 0 instead of 0/0 (NaN for floating-point lanes, an exception for integer lanes).
    den = Select(Equal(den, Zero), One, den);

    var t = num / den;
    var t2 = t * t;

    // 2. Polynomial approximation (odd function): t * (c1 + c2 * t^2).
    var c1 = Create(TNumber.CreateTruncating(0.97239411f));
    var c2 = Create(TNumber.CreateTruncating(-0.19194795f));
    var poly = MultipleAdd(c2, t2, c1);
    var result = t * poly;

    // 3. Reconstruct the first-quadrant angle.
    // If numerator and denominator were swapped (yGtX), use atan(|y|/|x|) = PI/2 - atan(|x|/|y|).
    var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
    result = Select(yGtX, halfPi - result, result);

    // 4. Adjust for quadrants.
    // x < 0 means quadrant 2 or 3: mirror the angle around PI/2.
    var pi = Create(TNumber.CreateTruncating(3.141592654f));
    var xLtZero = LessThan(x, Zero);
    result = Select(xLtZero, pi - result, result);

    // y < 0 means the angle is below the x axis. Everything above was computed from |y| and |x|,
    // so negating the result is enough.
    var yLtZero = LessThan(y, Zero);
    var negativeResult = -result;
    return Select(yLtZero, negativeResult, result);
}
/// <summary>
/// Raises x to the power y per lane, computed as Exp(y * Log(x)).
/// </summary>
/// <param name="x">Base per lane.</param>
/// <param name="y">Exponent per lane.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Pow(WideLane x, WideLane y)
{
    var logOfBase = Log(x);
    var scaledExponent = y * logOfBase;
    return Exp(scaledExponent);
}
/// <summary>
/// Lane-wise natural exponential via Vector.Exp for float and double lanes.
/// </summary>
/// <param name="value">Exponent per lane.</param>
/// <returns>e raised to each lane; for any other lane type the input is returned unchanged.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Exp(WideLane value)
{
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As, Vector>(ref value);
var result = Vector.Exp(v);
return new WideLane(Unsafe.As, Vector>(ref result));
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As, Vector>(ref value);
var result = Vector.Exp(v);
return new WideLane(Unsafe.As, Vector>(ref result));
}
// NOTE(review): non-floating-point lanes get their input back, not an exponential - confirm this is intended.
return value;
}
/// <summary>Raises 2 to the power of each lane by calling Pow with a base of 2.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Exp2(WideLane value)
{
    var two = Create(TNumber.CreateTruncating(2));
    return Pow(two, value);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Log(WideLane value)
{
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As, Vector>(ref value);
var result = Vector.Log(v);
return new WideLane(Unsafe.As, Vector