using System.Diagnostics.CodeAnalysis;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace Misaki.HighPerformance.HPC;
/// <summary>
/// Non-generic companion type that holds the shuffle lookup tables used by the lane type's
/// <c>CompressStore</c> and exposes the hardware-support flag.
/// </summary>
public static unsafe class WideLane
{
// Shuffle control tables, one per vector width (512/256/128 bits) and element size (32/64 bits).
// CompressStore indexes them as: table + (moveMask * elementsPerControlVector).
internal static readonly uint* s_pShuffleTable512_32bit;
internal static readonly ulong* s_pShuffleTable512_64bit;
internal static readonly uint* s_pShuffleTable256_32bit;
internal static readonly ulong* s_pShuffleTable256_64bit;
internal static readonly uint* s_pShuffleTable128_32bit;
internal static readonly ulong* s_pShuffleTable128_64bit;
/// <summary>
/// Gets whether WideLane is supported on the current hardware.
/// </summary>
public static bool IsSupported => Vector.IsHardwareAccelerated;
// Fills every table pointer from ShuffleTableGenerator when the type is first initialized.
// NOTE(review): ownership/lifetime of the returned buffers is decided by ShuffleTableGenerator — confirm there.
static WideLane()
{
s_pShuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit();
s_pShuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit();
s_pShuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit();
s_pShuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit();
s_pShuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit();
s_pShuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit();
}
}
// TODO: We could use a source generator to emit optimized code for different hardware (e.g., SSE, AVX, AVX2) and select the best version at runtime.
// Right now, we rely on the Vector API to auto-vectorize the code.
// This works fine under the JIT, but requires users to build multiple binaries with different target architectures to get the best performance in NativeAOT via IlcInstructionSet.
[StructLayout(LayoutKind.Sequential)]
public readonly unsafe partial struct WideLane : ISPMDLane, TNumber>
where TNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators
{
/// <summary>The underlying vector that stores this lane's elements.</summary>
public readonly Vector value;
/// <summary>Gets the number of elements in one lane, taken from the underlying vector type's <c>Count</c>.</summary>
public static int LaneWidth
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Vector.Count;
}
/// <summary>Gets a lane obtained by bit-casting the underlying vector type's <c>Zero</c> value.</summary>
public static WideLane Zero
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Unsafe.BitCast, WideLane>(Vector.Zero);
}
/// <summary>Gets a lane obtained by bit-casting the underlying vector type's <c>One</c> value.</summary>
public static WideLane One
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Unsafe.BitCast, WideLane>(Vector.One);
}
/// <summary>Gets a lane in which every element is <c>TNumber.MinValue</c>.</summary>
public static WideLane MinValue
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        var smallest = TNumber.MinValue;
        return Create(smallest);
    }
}
/// <summary>Gets a lane in which every element is <c>TNumber.MaxValue</c>.</summary>
public static WideLane MaxValue
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        var largest = TNumber.MaxValue;
        return Create(largest);
    }
}
/// <summary>Gets a lane in which every element is <c>TNumber.AllBitsSet</c>.</summary>
public static WideLane AllBitsSet
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        var allOnes = TNumber.AllBitsSet;
        return Create(allOnes);
    }
}
/// <summary>Gets the element at position <paramref name="index"/> of the underlying vector.</summary>
/// <param name="index">Zero-based element position; forwarded unchanged to the vector's indexer.</param>
public readonly TNumber this[int index]
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        return value[index];
    }
}
/// <summary>Wraps an existing vector as a lane.</summary>
/// <param name="value">The vector to store in the <c>value</c> field.</param>
public WideLane(Vector value) => this.value = value;
/// <summary>
/// Applies <c>Vector.Floor</c> element-wise when <c>TNumber</c> is <c>float</c> or <c>double</c>;
/// for any other element type the input is returned unchanged.
/// </summary>
/// <param name="vector">The vector to round.</param>
/// <returns>The floored vector, or <paramref name="vector"/> itself for non-floating-point element types.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector VectorFloor(Vector vector)
{
// The typeof comparisons select the floating-point overload; the bit-casts only change the static type.
if (typeof(TNumber) == typeof(float))
{
var v = Unsafe.BitCast, Vector>(vector);
var floored = Vector.Floor(v);
return Unsafe.BitCast, Vector>(floored);
}
else if (typeof(TNumber) == typeof(double))
{
var v = Unsafe.BitCast, Vector>(vector);
var floored = Vector.Floor(v);
return Unsafe.BitCast, Vector>(floored);
}
// Not float or double: nothing to round.
return vector;
}
/// <summary>
/// Applies <c>Vector.Truncate</c> element-wise when <c>TNumber</c> is <c>float</c> or <c>double</c>;
/// for any other element type the input is returned unchanged.
/// </summary>
/// <param name="vector">The vector to truncate.</param>
/// <returns>The truncated vector, or <paramref name="vector"/> itself for non-floating-point element types.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector VectorTruncate(Vector vector)
{
// The typeof comparisons select the floating-point overload; the bit-casts only change the static type.
if (typeof(TNumber) == typeof(float))
{
var v = Unsafe.BitCast, Vector>(vector);
var truncated = Vector.Truncate(v);
return Unsafe.BitCast, Vector>(truncated);
}
else if (typeof(TNumber) == typeof(double))
{
var v = Unsafe.BitCast, Vector>(vector);
var truncated = Vector.Truncate(v);
return Unsafe.BitCast, Vector>(truncated);
}
// Not float or double: nothing to truncate.
return vector;
}
/// <summary>Creates a lane from <c>Vector.Create(value)</c>, i.e. from a single scalar.</summary>
/// <param name="value">The scalar passed to <c>Vector.Create</c>.</param>
/// <returns>The resulting vector bit-cast to a lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Create(TNumber value)
{
return Unsafe.BitCast, WideLane>(Vector.Create(value));
}
/// <summary>Creates a lane from <c>Vector.Create(values)</c>, i.e. from a span of elements.</summary>
/// <param name="values">The elements passed to <c>Vector.Create</c>; any length requirement is enforced by that call.</param>
/// <returns>The resulting vector bit-cast to a lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Create(params ReadOnlySpan values)
{
return Unsafe.BitCast, WideLane>(Vector.Create(values));
}
/// <summary>Reinterprets an existing vector as a lane without changing its bits.</summary>
/// <param name="value">The vector to wrap.</param>
/// <returns><paramref name="value"/> bit-cast to a lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Create(Vector value)
{
return Unsafe.BitCast, WideLane>(value);
}
/// <summary>
/// Creates a lane whose elements form an arithmetic sequence beginning at <paramref name="start"/>
/// and advancing by <paramref name="step"/>.
/// </summary>
/// <param name="start">Value of the first element.</param>
/// <param name="step">Difference between consecutive elements.</param>
/// <returns>The sequence as a lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Sequence(TNumber start, TNumber step)
{
// Pick the fixed-width vector type whose element count equals LaneWidth and use its CreateSequence.
if (LaneWidth == Vector512.Count)
{
var v = Vector512.CreateSequence(start, step);
return Unsafe.BitCast, WideLane>(v);
}
else if (LaneWidth == Vector256.Count)
{
var v = Vector256.CreateSequence(start, step);
return Unsafe.BitCast, WideLane>(v);
}
else if (LaneWidth == Vector128.Count)
{
var v = Vector128.CreateSequence(start, step);
return Unsafe.BitCast, WideLane>(v);
}
else if (LaneWidth == Vector64.Count)
{
var v = Vector64.CreateSequence(start, step);
return Unsafe.BitCast, WideLane>(v);
}
else
{
// No matching fixed-width type: compute start + step * [0, 1, 2, ...] directly.
return Unsafe.BitCast, WideLane>(Vector.Create(start) + (Vector.Create(step) * Vector.Indices));
}
}
/// <summary>Loads a full lane starting at the referenced element via <c>Vector.LoadUnsafe</c>.</summary>
/// <param name="value">Reference to the first element to load.</param>
/// <returns>The loaded vector bit-cast to a lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Load(ref TNumber value)
{
return Unsafe.BitCast, WideLane>(Vector.LoadUnsafe(ref value));
}
/// <summary>Loads a full lane from the given pointer via <c>Vector.Load</c>.</summary>
/// <param name="pValue">Pointer to the first element to load.</param>
/// <returns>The loaded vector bit-cast to a lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Load(TNumber* pValue)
{
return Unsafe.BitCast, WideLane>(Vector.Load(pValue));
}
/// <summary>
/// Loads elements from <paramref name="pValue"/> under control of <paramref name="mask"/>.
/// </summary>
/// <remarks>
/// When AVX is available and the element size is 4 or 8 bytes at 128- or 256-bit lane width,
/// the load is done with <c>Avx.MaskLoad</c>. Otherwise each element is handled individually:
/// <c>pValue[i]</c> is read only where the mask element is non-zero, and zero is written elsewhere.
/// </remarks>
/// <param name="pValue">Pointer to the first candidate element.</param>
/// <param name="mask">Per-element selection mask.</param>
/// <returns>The lane holding the loaded elements.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane MaskLoad(TNumber* pValue, WideLane mask)
{
if (Avx.IsSupported)
{
// Only element size matters here: any 4-byte type goes through the float overload,
// any 8-byte type through the double overload.
if (LaneWidth == Vector128.Count)
{
if (sizeof(TNumber) == sizeof(float))
{
var result = Avx.MaskLoad((float*)pValue, Unsafe.BitCast, Vector128>(mask));
return Unsafe.BitCast, WideLane>(result);
}
else if (sizeof(TNumber) == sizeof(double))
{
var result = Avx.MaskLoad((double*)pValue, Unsafe.BitCast, Vector128>(mask));
return Unsafe.BitCast, WideLane>(result);
}
}
else if (LaneWidth == Vector256.Count)
{
if (sizeof(TNumber) == sizeof(float))
{
var result = Avx.MaskLoad((float*)pValue, Unsafe.BitCast, Vector256>(mask));
return Unsafe.BitCast, WideLane>(result);
}
else if (sizeof(TNumber) == sizeof(double))
{
var result = Avx.MaskLoad((double*)pValue, Unsafe.BitCast, Vector256>(mask));
return Unsafe.BitCast, WideLane>(result);
}
}
}
// Scalar fallback: every element is assigned below, so the uninitialized local is fully written.
Unsafe.SkipInit(out WideLane v);
var pv = (TNumber*)&v;
for (var i = 0; i < LaneWidth; i++)
{
// TODO: Can we assume pData is padded with TLane.LaneWidth?
// In that case we can use Load + Select instead of checking mask for each element, which should be faster.
pv[i] = (mask.value[i] != TNumber.Zero) ? pValue[i] : TNumber.Zero;
}
return v;
}
/// <summary>
/// Masked load starting at a managed reference; converts the reference to a pointer and
/// forwards to the pointer-based overload.
/// </summary>
/// <param name="value">Reference to the first candidate element.</param>
/// <param name="mask">Per-element selection mask.</param>
/// <returns>The lane produced by the pointer-based overload.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane MaskLoad(ref TNumber value, WideLane mask)
{
    var pValue = (TNumber*)Unsafe.AsPointer(ref value);
    return MaskLoad(pValue, mask);
}
/// <summary>
/// Gathers one element per lane position from <paramref name="pData"/>, using the elements of
/// <paramref name="indices"/> as offsets.
/// </summary>
/// <remarks>
/// With AVX2 and 4- or 8-byte elements at 128- or 256-bit lane width, the hardware gather is used.
/// Otherwise each element is read from the byte address <c>pData + index * scale</c>, where the
/// index is converted with <c>int.CreateTruncating</c>.
/// </remarks>
/// <param name="pData">Base address of the source data.</param>
/// <param name="indices">Per-element indices.</param>
/// <param name="scale">Byte multiplier applied to each index (constant, 1 to 8 per the attribute).</param>
/// <returns>The gathered lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Gather(TNumber* pData, WideLane indices, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
{
if (Avx2.IsSupported)
{
// Element size (not type) selects the uint or ulong gather overload;
// SPMDUtility converts the index lane to the integer vector the intrinsic expects.
if (LaneWidth == Vector128.Count)
{
if (sizeof(TNumber) == sizeof(uint))
{
var vidx = SPMDUtility.GetIndicesAs128Int32(indices.value);
var vx = Avx2.GatherVector128((uint*)pData, vidx, scale);
return Unsafe.BitCast, WideLane>(vx);
}
else if (sizeof(TNumber) == sizeof(ulong))
{
var vidx = SPMDUtility.GetIndicesAs128Int64(indices.value);
var vx = Avx2.GatherVector128((ulong*)pData, vidx, scale);
return Unsafe.BitCast, WideLane>(vx);
}
}
else if (LaneWidth == Vector256.Count)
{
if (sizeof(TNumber) == sizeof(uint))
{
var vidx = SPMDUtility.GetIndicesAs256Int32(indices.value);
var vx = Avx2.GatherVector256((uint*)pData, vidx, scale);
return Unsafe.BitCast, WideLane>(vx);
}
else if (sizeof(TNumber) == sizeof(ulong))
{
var vidx = SPMDUtility.GetIndicesAs256Int64(indices.value);
var vx = Avx2.GatherVector256((ulong*)pData, vidx, scale);
return Unsafe.BitCast, WideLane>(vx);
}
}
}
// Scalar fallback: every element is assigned below, so the uninitialized local is fully written.
Unsafe.SkipInit(out Vector result);
var pResult = (TNumber*)&result;
var count = Vector.Count;
for (var i = 0; i < count; i++)
{
var idx = int.CreateTruncating(indices[i]);
pResult[i] = *(TNumber*)((byte*)pData + (idx * scale));
}
return Unsafe.BitCast, WideLane>(result);
}
/// <summary>
/// Gathers one element per lane position from <paramref name="pData"/>, reading the offsets from
/// an array of 32-bit indices.
/// </summary>
/// <remarks>
/// Exactly <c>LaneWidth</c> indices are read from <paramref name="pIndices"/> on every path.
/// With AVX2 and 4- or 8-byte elements at 128- or 256-bit lane width, the hardware gather is used.
/// Otherwise each element is read from the byte address <c>pData + pIndices[i] * scale</c>.
/// </remarks>
/// <param name="pData">Base address of the source data.</param>
/// <param name="pIndices">Pointer to <c>LaneWidth</c> 32-bit indices.</param>
/// <param name="scale">Byte multiplier applied to each index (constant, 1 to 8 per the attribute).</param>
/// <returns>The gathered lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Gather(TNumber* pData, int* pIndices, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
{
    if (Avx2.IsSupported)
    {
        if (LaneWidth == Vector128.Count)
        {
            if (sizeof(TNumber) == sizeof(uint))
            {
                var vidx = Vector128.Load(pIndices);
                var vx = Avx2.GatherVector128((uint*)pData, vidx, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
            else if (sizeof(TNumber) == sizeof(ulong))
            {
                // A 128-bit lane of 8-byte elements has only two positions, so only two indices
                // exist. Read exactly those 8 bytes; a full 128-bit load would run past the buffer.
                var vidx = Vector128.CreateScalar(Unsafe.ReadUnaligned<long>(pIndices)).AsInt32();
                var vx = Avx2.GatherVector128((ulong*)pData, vidx, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
        }
        else if (LaneWidth == Vector256.Count)
        {
            if (sizeof(TNumber) == sizeof(uint))
            {
                var vidx = Vector256.Load(pIndices);
                var vx = Avx2.GatherVector256((uint*)pData, vidx, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
            else if (sizeof(TNumber) == sizeof(ulong))
            {
                // Four 8-byte elements need four 32-bit indices, i.e. exactly one 128-bit load.
                var vidx = Vector128.Load(pIndices);
                var vx = Avx2.GatherVector256((ulong*)pData, vidx, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
        }
    }

    // Scalar fallback: every element is assigned below, so the uninitialized local is fully written.
    Unsafe.SkipInit(out Vector result);
    var pResult = (TNumber*)&result;
    var count = Vector.Count;
    for (var i = 0; i < count; i++)
    {
        pResult[i] = *(TNumber*)((byte*)pData + (pIndices[i] * scale));
    }
    return Unsafe.BitCast, WideLane>(result);
}
/// <summary>
/// Gather relative to a managed reference; converts the reference to a pointer and forwards to
/// the pointer-based overload that takes an index lane.
/// </summary>
/// <param name="baseAddress">Reference to the base element of the source data.</param>
/// <param name="indices">Per-element indices.</param>
/// <param name="scale">Byte multiplier applied to each index (constant, 1 to 8 per the attribute).</param>
/// <returns>The lane produced by the pointer-based overload.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Gather(ref TNumber baseAddress, WideLane indices, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
{
    var pData = (TNumber*)Unsafe.AsPointer(ref baseAddress);
    return Gather(pData, indices, scale);
}
/// <summary>
/// Gather relative to managed references for both data and indices; converts both to pointers
/// and forwards to the pointer-based overload.
/// </summary>
/// <param name="baseAddress">Reference to the base element of the source data.</param>
/// <param name="baseIndex">Reference to the first 32-bit index.</param>
/// <param name="scale">Byte multiplier applied to each index (constant, 1 to 8 per the attribute).</param>
/// <returns>The lane produced by the pointer-based overload.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane Gather(ref TNumber baseAddress, ref int baseIndex, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
{
    var pData = (TNumber*)Unsafe.AsPointer(ref baseAddress);
    var pIndices = (int*)Unsafe.AsPointer(ref baseIndex);
    return Gather(pData, pIndices, scale);
}
/// <summary>
/// Gathers elements from <paramref name="pData"/> at the offsets given by <paramref name="indices"/>,
/// but only for positions selected by <paramref name="mask"/>; unselected positions are zero.
/// </summary>
/// <remarks>
/// With AVX2 and 4- or 8-byte elements at 128- or 256-bit lane width, the hardware masked gather
/// is used with a zero source vector. The scalar fallback reads <c>pData + index * scale</c> only
/// where the mask element is non-zero and writes zero elsewhere, so addresses of unselected
/// positions are never dereferenced on either path.
/// NOTE(review): the fallback treats "non-zero" as selected (as the MaskLoad fallback does) while
/// the intrinsic tests the most significant bit; the two agree for all-ones/all-zeros masks.
/// </remarks>
/// <param name="pData">Base address of the source data.</param>
/// <param name="indices">Per-element indices.</param>
/// <param name="mask">Per-element selection mask.</param>
/// <param name="scale">Byte multiplier applied to each index (constant, 1 to 8 per the attribute).</param>
/// <returns>The gathered lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane MaskGather(TNumber* pData, WideLane indices, WideLane mask, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
{
    if (Avx2.IsSupported)
    {
        if (LaneWidth == Vector128.Count)
        {
            if (sizeof(TNumber) == sizeof(uint))
            {
                var vidx = SPMDUtility.GetIndicesAs128Int32(indices.value);
                var vmask = Unsafe.BitCast, Vector128>(mask);
                var vx = Avx2.GatherMaskVector128(Vector128.Zero, (uint*)pData, vidx, vmask, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
            else if (sizeof(TNumber) == sizeof(ulong))
            {
                var vidx = SPMDUtility.GetIndicesAs128Int64(indices.value);
                var vmask = Unsafe.BitCast, Vector128>(mask);
                var vx = Avx2.GatherMaskVector128(Vector128.Zero, (ulong*)pData, vidx, vmask, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
        }
        else if (LaneWidth == Vector256.Count)
        {
            if (sizeof(TNumber) == sizeof(uint))
            {
                var vidx = SPMDUtility.GetIndicesAs256Int32(indices.value);
                var vmask = Unsafe.BitCast, Vector256>(mask);
                var vx = Avx2.GatherMaskVector256(Vector256.Zero, (uint*)pData, vidx, vmask, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
            else if (sizeof(TNumber) == sizeof(ulong))
            {
                var vidx = SPMDUtility.GetIndicesAs256Int64(indices.value);
                var vmask = Unsafe.BitCast, Vector256>(mask);
                var vx = Avx2.GatherMaskVector256(Vector256.Zero, (ulong*)pData, vidx, vmask, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
        }
    }

    // Scalar fallback: every element is assigned below, so the uninitialized local is fully written.
    Unsafe.SkipInit(out Vector result);
    var pResult = (TNumber*)&result;
    var count = Vector.Count;
    for (var i = 0; i < count; i++)
    {
        if (mask.value[i] != TNumber.Zero)
        {
            var idx = int.CreateTruncating(indices[i]);
            pResult[i] = *(TNumber*)((byte*)pData + (idx * scale));
        }
        else
        {
            // Masked-off position: do not touch memory, mirror the zero source of the intrinsic path.
            pResult[i] = TNumber.Zero;
        }
    }
    return Unsafe.BitCast, WideLane>(result);
}
/// <summary>
/// Gathers elements from <paramref name="pData"/> at offsets read from an array of 32-bit indices,
/// but only for positions selected by <paramref name="mask"/>; unselected positions are zero.
/// </summary>
/// <remarks>
/// Exactly <c>LaneWidth</c> indices are read from <paramref name="pIndices"/> on every path.
/// With AVX2 and 4- or 8-byte elements at 128- or 256-bit lane width, the hardware masked gather
/// is used with a zero source vector. The scalar fallback reads <c>pData + pIndices[i] * scale</c>
/// only where the mask element is non-zero and writes zero elsewhere, so data addresses of
/// unselected positions are never dereferenced on either path.
/// NOTE(review): the fallback treats "non-zero" as selected (as the MaskLoad fallback does) while
/// the intrinsic tests the most significant bit; the two agree for all-ones/all-zeros masks.
/// </remarks>
/// <param name="pData">Base address of the source data.</param>
/// <param name="pIndices">Pointer to <c>LaneWidth</c> 32-bit indices.</param>
/// <param name="mask">Per-element selection mask.</param>
/// <param name="scale">Byte multiplier applied to each index (constant, 1 to 8 per the attribute).</param>
/// <returns>The gathered lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane MaskGather(TNumber* pData, int* pIndices, WideLane mask, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
{
    if (Avx2.IsSupported)
    {
        if (LaneWidth == Vector128.Count)
        {
            if (sizeof(TNumber) == sizeof(uint))
            {
                var vidx = Vector128.Load(pIndices);
                var vmask = Unsafe.BitCast, Vector128>(mask);
                var vx = Avx2.GatherMaskVector128(Vector128.Zero, (uint*)pData, vidx, vmask, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
            else if (sizeof(TNumber) == sizeof(ulong))
            {
                // A 128-bit lane of 8-byte elements has only two positions, so only two indices
                // exist. Read exactly those 8 bytes; a full 128-bit load would run past the buffer.
                var vidx = Vector128.CreateScalar(Unsafe.ReadUnaligned<long>(pIndices)).AsInt32();
                var vmask = Unsafe.BitCast, Vector128>(mask);
                var vx = Avx2.GatherMaskVector128(Vector128.Zero, (ulong*)pData, vidx, vmask, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
        }
        else if (LaneWidth == Vector256.Count)
        {
            if (sizeof(TNumber) == sizeof(uint))
            {
                var vidx = Vector256.Load(pIndices);
                var vmask = Unsafe.BitCast, Vector256>(mask);
                var vx = Avx2.GatherMaskVector256(Vector256.Zero, (uint*)pData, vidx, vmask, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
            else if (sizeof(TNumber) == sizeof(ulong))
            {
                // Four 8-byte elements need four 32-bit indices, i.e. exactly one 128-bit load.
                var vidx = Vector128.Load(pIndices);
                var vmask = Unsafe.BitCast, Vector256>(mask);
                var vx = Avx2.GatherMaskVector256(Vector256.Zero, (ulong*)pData, vidx, vmask, scale);
                return Unsafe.BitCast, WideLane>(vx);
            }
        }
    }

    // Scalar fallback: every element is assigned below, so the uninitialized local is fully written.
    Unsafe.SkipInit(out Vector result);
    var pResult = (TNumber*)&result;
    var count = Vector.Count;
    for (var i = 0; i < count; i++)
    {
        if (mask.value[i] != TNumber.Zero)
        {
            pResult[i] = *(TNumber*)((byte*)pData + (pIndices[i] * scale));
        }
        else
        {
            // Masked-off position: do not touch memory, mirror the zero source of the intrinsic path.
            pResult[i] = TNumber.Zero;
        }
    }
    return Unsafe.BitCast, WideLane>(result);
}
/// <summary>Writes the whole lane to memory starting at the referenced element.</summary>
/// <param name="destination">Reference to the first element to overwrite.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly void Store(ref TNumber destination) => value.StoreUnsafe(ref destination);
/// <summary>Writes the whole lane to memory starting at the given pointer.</summary>
/// <param name="pDestination">Pointer to the first element to overwrite.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly void Store(TNumber* pDestination) => value.Store(pDestination);
/// <summary>
/// Compress-store relative to a managed reference; converts the reference to a pointer and
/// forwards to the pointer-based overload.
/// </summary>
/// <param name="destination">Reference to the first output element.</param>
/// <param name="mask">Per-element selection mask.</param>
/// <returns>The count reported by the pointer-based overload.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int CompressStore(ref TNumber destination, WideLane mask)
{
    var pDestination = (TNumber*)Unsafe.AsPointer(ref destination);
    return CompressStore(pDestination, mask);
}
/// <summary>
/// Packs the elements selected by <paramref name="mask"/> to the front and writes them to
/// <paramref name="pDestination"/>, returning how many were selected.
/// </summary>
/// <remarks>
/// Vector paths (512/256/128-bit, 4- or 8-byte elements): the most significant bit of each mask
/// element is extracted into a bit mask, which indexes a precomputed shuffle-control table; the
/// shuffled vector is then stored in full, so these paths write a whole vector to
/// <paramref name="pDestination"/> regardless of the returned count.
/// Scalar fallback: only the selected elements are written, and "selected" means the mask
/// element is non-zero.
/// </remarks>
/// <param name="pDestination">Pointer to the first output element.</param>
/// <param name="mask">Per-element selection mask.</param>
/// <returns>The number of selected elements.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int CompressStore(TNumber* pDestination, WideLane mask)
{
if (LaneWidth == Vector512.Count && Vector512.IsHardwareAccelerated)
{
if (sizeof(TNumber) == 4)
{
var vec = Unsafe.BitCast, Vector512>(Unsafe.AsRef(in this));
var m = Unsafe.BitCast, Vector512>(mask);
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 16) because each control vector has 16 elements
var shuffle = Vector512.Load(WideLane.s_pShuffleTable512_32bit + (moveMask * 16));
var compressed = Vector512.Shuffle(vec, shuffle);
compressed.Store((uint*)pDestination);
return BitOperations.PopCount(moveMask);
}
else if (sizeof(TNumber) == 8)
{
var vec = Unsafe.BitCast, Vector512>(Unsafe.AsRef(in this));
var m = Unsafe.BitCast, Vector512>(mask);
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 8) because each control vector has 8 elements
var shuffle = Vector512.Load(WideLane.s_pShuffleTable512_64bit + (moveMask * 8));
var compressed = Vector512.Shuffle(vec, shuffle);
compressed.Store((ulong*)pDestination);
return BitOperations.PopCount(moveMask);
}
}
else if (LaneWidth == Vector256.Count && Vector256.IsHardwareAccelerated)
{
if (sizeof(TNumber) == 4)
{
var vec = Unsafe.BitCast, Vector256>(Unsafe.AsRef(in this));
var m = Unsafe.BitCast, Vector256>(mask);
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 8) because each control vector has 8 elements
var shuffle = Vector256.Load(WideLane.s_pShuffleTable256_32bit + (moveMask * 8));
var compressed = Vector256.Shuffle(vec, shuffle);
compressed.Store((uint*)pDestination);
return BitOperations.PopCount(moveMask);
}
else if (sizeof(TNumber) == 8)
{
var vec = Unsafe.BitCast, Vector256>(Unsafe.AsRef(in this));
var m = Unsafe.BitCast, Vector256>(mask);
// For 64-bit, ExtractMostSignificantBits only populates 4 bits (0-15)
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 4) because each control vector has 4 elements
var shuffle = Vector256.Load(WideLane.s_pShuffleTable256_64bit + (moveMask * 4));
var compressed = Vector256.Shuffle(vec, shuffle);
compressed.Store((ulong*)pDestination);
return BitOperations.PopCount(moveMask);
}
}
else if (LaneWidth == Vector128.Count && Vector128.IsHardwareAccelerated)
{
if (sizeof(TNumber) == 4)
{
var vec = Unsafe.BitCast, Vector128>(Unsafe.AsRef(in this));
var m = Unsafe.BitCast, Vector128>(mask);
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 4) because each control vector has 4 elements
var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_32bit + (moveMask * 4));
var compressed = Vector128.Shuffle(vec, shuffle);
compressed.Store((uint*)pDestination);
return BitOperations.PopCount(moveMask);
}
else if (sizeof(TNumber) == 8)
{
var vec = Unsafe.BitCast, Vector128>(Unsafe.AsRef(in this));
var m = Unsafe.BitCast, Vector128>(mask);
var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 2) because each control vector has 2 elements
var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_64bit + (moveMask * 2));
var compressed = Vector128.Shuffle(vec, shuffle);
compressed.Store((ulong*)pDestination);
return BitOperations.PopCount(moveMask);
}
}
// This is slow, but correct on ANY hardware.
// A position is selected when its mask element is non-zero
// (the vector paths above test the most significant bit instead).
var count = 0;
for (var i = 0; i < LaneWidth; i++)
{
// TODO: Can we assume pData is padded with TLane.LaneWidth?
// In that case we can use Load + Select instead of checking mask for each element, which should be faster.
if (mask.value[i] != TNumber.Zero)
{
pDestination[count++] = value[i];
}
}
return count;
}
/// <summary>
/// Writes the elements selected by <paramref name="mask"/> to the matching positions of
/// <paramref name="pDst"/>; unselected positions are not written.
/// </summary>
/// <remarks>
/// When AVX is available and the element size is 4 or 8 bytes at 128- or 256-bit lane width,
/// the store is done with <c>Avx.MaskStore</c>. Otherwise each element is written individually
/// where the mask element is non-zero.
/// </remarks>
/// <param name="pDst">Pointer to the first destination element.</param>
/// <param name="mask">Per-element selection mask.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void MaskStore(TNumber* pDst, WideLane mask)
{
if (Avx.IsSupported)
{
// Only element size matters here: any 4-byte type goes through the float overload,
// any 8-byte type through the double overload.
if (LaneWidth == Vector128.Count)
{
if (sizeof(TNumber) == sizeof(float))
{
var v = Unsafe.BitCast, Vector128>(this);
var m = Unsafe.BitCast, Vector128>(mask);
Avx.MaskStore((float*)pDst, m, v);
return;
}
else if (sizeof(TNumber) == sizeof(double))
{
var v = Unsafe.BitCast, Vector128>(this);
var m = Unsafe.BitCast, Vector128>(mask);
Avx.MaskStore((double*)pDst, m, v);
return;
}
}
else if (LaneWidth == Vector256.Count)
{
if (sizeof(TNumber) == sizeof(float))
{
var v = Unsafe.BitCast, Vector256>(this);
var m = Unsafe.BitCast, Vector256>(mask);
Avx.MaskStore((float*)pDst, m, v);
return;
}
else if (sizeof(TNumber) == sizeof(double))
{
var v = Unsafe.BitCast, Vector256>(this);
var m = Unsafe.BitCast, Vector256>(mask);
Avx.MaskStore((double*)pDst, m, v);
return;
}
}
}
// Scalar fallback: write only the positions whose mask element is non-zero.
for (var i = 0; i < LaneWidth; i++)
{
if (mask.value[i] != TNumber.Zero)
{
pDst[i] = value[i];
}
}
}
/// <summary>
/// Masked store relative to a managed reference; converts the reference to a pointer and
/// forwards to the pointer-based overload.
/// </summary>
/// <param name="destination">Reference to the first destination element.</param>
/// <param name="mask">Per-element selection mask.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void MaskStore(ref TNumber destination, WideLane mask)
{
    var pDst = (TNumber*)Unsafe.AsPointer(ref destination);
    MaskStore(pDst, mask);
}
/// <summary>
/// Writes each element of this lane to <c>pDst[index]</c>, where the index comes from the
/// matching element of <paramref name="indices"/> converted with <c>int.CreateTruncating</c>.
/// </summary>
/// <param name="pDst">Base pointer of the destination.</param>
/// <param name="indices">Per-element destination indices (element units, not bytes).</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Scatter(TNumber* pDst, WideLane indices)
{
    var lane = 0;
    while (lane < LaneWidth)
    {
        var target = int.CreateTruncating(indices[lane]);
        pDst[target] = value[lane];
        lane++;
    }
}
/// <summary>
/// Writes each element of this lane to the element at offset <c>index</c> from
/// <paramref name="destination"/>, where the index comes from the matching element of
/// <paramref name="indices"/> converted with <c>int.CreateTruncating</c>.
/// </summary>
/// <param name="destination">Reference to the base element of the destination.</param>
/// <param name="indices">Per-element destination indices (element units, not bytes).</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Scatter(ref TNumber destination, WideLane indices)
{
    var lane = 0;
    while (lane < LaneWidth)
    {
        var target = int.CreateTruncating(indices[lane]);
        ref var slot = ref Unsafe.Add(ref destination, target);
        slot = value[lane];
        lane++;
    }
}
/// <summary>
/// Writes each element of this lane to <c>pDst[pIndices[i]]</c>.
/// </summary>
/// <param name="pDst">Base pointer of the destination.</param>
/// <param name="pIndices">Pointer to <c>LaneWidth</c> destination indices (element units, not bytes).</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Scatter(TNumber* pDst, int* pIndices)
{
    var lane = 0;
    while (lane < LaneWidth)
    {
        var target = pIndices[lane];
        pDst[target] = value[lane];
        lane++;
    }
}
/// <summary>
/// Writes each element of this lane to the element at offset <c>pIndices[i]</c> from
/// <paramref name="destination"/>.
/// </summary>
/// <param name="destination">Reference to the base element of the destination.</param>
/// <param name="pIndices">Pointer to <c>LaneWidth</c> destination indices (element units, not bytes).</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Scatter(ref TNumber destination, int* pIndices)
{
    var lane = 0;
    while (lane < LaneWidth)
    {
        var target = pIndices[lane];
        ref var slot = ref Unsafe.Add(ref destination, target);
        slot = value[lane];
        lane++;
    }
}
/// <summary>
/// Like the unmasked scatter, but writes only the positions whose <paramref name="mask"/>
/// element is non-zero; the index of an unselected position is never evaluated.
/// </summary>
/// <param name="pDst">Base pointer of the destination.</param>
/// <param name="indices">Per-element destination indices (element units, not bytes).</param>
/// <param name="mask">Per-element selection mask.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void MaskScatter(TNumber* pDst, WideLane indices, WideLane mask)
{
    var lane = 0;
    while (lane < LaneWidth)
    {
        var selected = mask.value[lane] != TNumber.Zero;
        if (selected)
        {
            var target = int.CreateTruncating(indices[lane]);
            pDst[target] = value[lane];
        }
        lane++;
    }
}
/// <summary>
/// Like the unmasked reference-based scatter, but writes only the positions whose
/// <paramref name="mask"/> element is non-zero; the index of an unselected position is never evaluated.
/// </summary>
/// <param name="destination">Reference to the base element of the destination.</param>
/// <param name="indices">Per-element destination indices (element units, not bytes).</param>
/// <param name="mask">Per-element selection mask.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void MaskScatter(ref TNumber destination, WideLane indices, WideLane mask)
{
    var lane = 0;
    while (lane < LaneWidth)
    {
        var selected = mask.value[lane] != TNumber.Zero;
        if (selected)
        {
            var target = int.CreateTruncating(indices[lane]);
            ref var slot = ref Unsafe.Add(ref destination, target);
            slot = value[lane];
        }
        lane++;
    }
}
/// <summary>
/// Writes each element whose <paramref name="mask"/> element is non-zero to
/// <c>pDst[pIndices[i]]</c>; <c>pIndices[i]</c> is not read for unselected positions.
/// </summary>
/// <param name="pDst">Base pointer of the destination.</param>
/// <param name="pIndices">Pointer to the destination indices (element units, not bytes).</param>
/// <param name="mask">Per-element selection mask.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void MaskScatter(TNumber* pDst, int* pIndices, WideLane mask)
{
    var lane = 0;
    while (lane < LaneWidth)
    {
        var selected = mask.value[lane] != TNumber.Zero;
        if (selected)
        {
            var target = pIndices[lane];
            pDst[target] = value[lane];
        }
        lane++;
    }
}
/// <summary>
/// Writes each element whose <paramref name="mask"/> element is non-zero to the element at
/// offset <c>pIndices[i]</c> from <paramref name="destination"/>; <c>pIndices[i]</c> is not read
/// for unselected positions.
/// </summary>
/// <param name="destination">Reference to the base element of the destination.</param>
/// <param name="pIndices">Pointer to the destination indices (element units, not bytes).</param>
/// <param name="mask">Per-element selection mask.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void MaskScatter(ref TNumber destination, int* pIndices, WideLane mask)
{
    var lane = 0;
    while (lane < LaneWidth)
    {
        var selected = mask.value[lane] != TNumber.Zero;
        if (selected)
        {
            var target = pIndices[lane];
            ref var slot = ref Unsafe.Add(ref destination, target);
            slot = value[lane];
        }
        lane++;
    }
}
/// <summary>Returns the underlying vector held in the <c>value</c> field.</summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly Vector AsVector() => value;
/// <summary>
/// Returns a raw element pointer to this instance's <c>value</c> field.
/// </summary>
/// <remarks>
/// NOTE(review): the pointer aliases the storage of this struct instance; nothing here pins it,
/// so the caller presumably must guarantee the instance stays at a fixed location — confirm with callers.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly TNumber* GetUnsafePtr()
{
    ref var storage = ref Unsafe.AsRef(in value);
    return (TNumber*)Unsafe.AsPointer(ref storage);
}
/// <summary>
/// Reinterprets this lane as another lane type via <c>Unsafe.BitCast</c>, without changing its bits.
/// </summary>
/// <returns>This lane's bits viewed as <c>TOther</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public TOther BitCast()
where TOther : ISPMDLane
where TOtherNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators
{
return Unsafe.BitCast, TOther>(this);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane operator +(WideLane a, WideLane