SPMD SIMD math library & lock-free job system integration
- Add new SPMD SIMD math project with scalar/vector lanes - Integrate SPMD jobs and scheduling into job system - Implement lock-free job dependency management - Update math functions for .NET 10 and SIMD performance - Add SPMD benchmarks, compress-store tests, and race tests - Introduce generic Result<T> error handling utilities - Solution/project file updates and code cleanup
This commit is contained in:
206
Misaki.HighPerformance.Mathematics.SPMD/ShuffleTableGenerator.cs
Normal file
206
Misaki.HighPerformance.Mathematics.SPMD/ShuffleTableGenerator.cs
Normal file
@@ -0,0 +1,206 @@
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
namespace Misaki.HighPerformance.Mathematics.SPMD;
|
||||
|
||||
/// <summary>
/// Builds natively allocated lookup tables that map a lane bit mask to the list of
/// selected lane indices, packed to the front of the row.
/// </summary>
/// <remarks>
/// <para>
/// Row <c>m</c> of a table holds, in ascending order, the index of every bit that is
/// set in <c>m</c>; the rest of the row is padded with zeros. Each row has one element
/// per vector lane, so row <c>m</c> starts at <c>table + m * laneCount</c>.
/// NOTE(review): the rows are presumably consumed as the index operand of a
/// permute/shuffle for compress-style stores - confirm against the callers.
/// </para>
/// <para>
/// Every table is allocated with <see cref="NativeMemory.AlignedAlloc"/> and is owned
/// by the caller, who must release it with <see cref="NativeMemory.AlignedFree"/>.
/// Each call allocates a new table; nothing is cached here.
/// </para>
/// </remarks>
public static unsafe class ShuffleTableGenerator
{
    /// <summary>
    /// Table for 16 lanes of 32-bit elements (512-bit vectors), 64-byte aligned.
    /// </summary>
    /// <returns>Pointer to 65,536 rows of 16 <see cref="uint"/> indices (4 MiB).</returns>
    /// <remarks>
    /// A 16-lane mask has 2^16 possible values. The table used to contain only 512 rows,
    /// so a lookup with any mask of 512 or more read past the end of the allocation.
    /// Rows 0..511 are unchanged, so existing in-range lookups see the same data.
    /// </remarks>
    public static uint* ComputeShuffleTable512_32Bit()
    {
        return BuildTable32(entryCount: 1u << 16, laneCount: 16, alignment: 64);
    }

    /// <summary>
    /// Table for 8 lanes of 64-bit elements (512-bit vectors), 64-byte aligned.
    /// </summary>
    /// <returns>Pointer to 256 rows of 8 <see cref="ulong"/> indices.</returns>
    public static ulong* ComputeShuffleTable512_64Bit()
    {
        return BuildTable64(entryCount: 256, laneCount: 8, alignment: 64);
    }

    /// <summary>
    /// Table for 8 lanes of 32-bit elements (256-bit vectors), 32-byte aligned.
    /// </summary>
    /// <returns>Pointer to 256 rows of 8 <see cref="uint"/> indices.</returns>
    public static uint* ComputeShuffleTable256_32Bit()
    {
        return BuildTable32(entryCount: 256, laneCount: 8, alignment: 32);
    }

    /// <summary>
    /// Table for 4 lanes of 64-bit elements (256-bit vectors), 32-byte aligned.
    /// </summary>
    /// <returns>Pointer to 16 rows of 4 <see cref="ulong"/> indices.</returns>
    public static ulong* ComputeShuffleTable256_64Bit()
    {
        return BuildTable64(entryCount: 16, laneCount: 4, alignment: 32);
    }

    /// <summary>
    /// Table for 4 lanes of 32-bit elements (128-bit vectors), 16-byte aligned.
    /// </summary>
    /// <returns>Pointer to 16 rows of 4 <see cref="uint"/> indices.</returns>
    public static uint* ComputeShuffleTable128_32Bit()
    {
        return BuildTable32(entryCount: 16, laneCount: 4, alignment: 16);
    }

    /// <summary>
    /// Table for 2 lanes of 64-bit elements (128-bit vectors), 16-byte aligned.
    /// </summary>
    /// <returns>Pointer to 8 rows of 2 <see cref="ulong"/> indices.</returns>
    /// <remarks>
    /// NOTE(review): a 2-lane mask only has 4 distinct values. Only bits 0 and 1 are
    /// examined, so rows 4..7 are copies of rows 0..3. The 8-row size is kept so that
    /// existing callers observe the same allocation; shrink it to 4 once callers are
    /// confirmed not to depend on it.
    /// </remarks>
    public static ulong* ComputeShuffleTable128_64Bit()
    {
        return BuildTable64(entryCount: 8, laneCount: 2, alignment: 16);
    }

    // Allocates and fills a table of entryCount rows, each laneCount uint indices wide.
    private static uint* BuildTable32(nuint entryCount, int laneCount, nuint alignment)
    {
        var byteCount = entryCount * (nuint)laneCount * (nuint)sizeof(uint);
        var table = (uint*)NativeMemory.AlignedAlloc(byteCount, alignment);

        var row = table;
        for (nuint mask = 0; mask < entryCount; mask++)
        {
            var written = 0;

            // Pack the indices of the set bits to the front of the row.
            for (var lane = 0; lane < laneCount; lane++)
            {
                if (((mask >> lane) & 1u) != 0)
                {
                    row[written++] = (uint)lane;
                }
            }

            // AlignedAlloc does not clear memory, so the tail is zeroed explicitly.
            while (written < laneCount)
            {
                row[written++] = 0;
            }

            row += laneCount;
        }

        return table;
    }

    // Allocates and fills a table of entryCount rows, each laneCount ulong indices wide.
    private static ulong* BuildTable64(nuint entryCount, int laneCount, nuint alignment)
    {
        var byteCount = entryCount * (nuint)laneCount * (nuint)sizeof(ulong);
        var table = (ulong*)NativeMemory.AlignedAlloc(byteCount, alignment);

        var row = table;
        for (nuint mask = 0; mask < entryCount; mask++)
        {
            var written = 0;

            // Pack the indices of the set bits to the front of the row.
            for (var lane = 0; lane < laneCount; lane++)
            {
                if (((mask >> lane) & 1u) != 0)
                {
                    row[written++] = (ulong)lane;
                }
            }

            // AlignedAlloc does not clear memory, so the tail is zeroed explicitly.
            while (written < laneCount)
            {
                row[written++] = 0;
            }

            row += laneCount;
        }

        return table;
    }
}
|
||||
Reference in New Issue
Block a user