Major namespace migration from SPMD to HPC across all code, templates, and projects. Introduced Misaki.HighPerformance.HPC.Generator with Roslyn-based source generators for SIMD code (e.g., AVX2), including attribute and method generators. Renamed MultipleAdd to MultiplyAdd in all lanes and updated usages. Added AVX2 utility methods via codegen. Updated tests, benchmarks, and project references to use the new framework. Improved SIMD memory utilities and modernized project files. Removed legacy SPMD project from the solution.
207 lines
5.8 KiB
C#
207 lines
5.8 KiB
C#
using System.Runtime.InteropServices;
|
|
|
|
namespace Misaki.HighPerformance.HPC;
|
|
|
|
/// <summary>
/// Builds "left-pack" lookup tables: row <c>m</c> of a table lists, in ascending order,
/// the indices of the bits that are set in the lane mask <c>m</c>, packed to the front of the row.
/// </summary>
/// <remarks>
/// <para>
/// Each row has one element per vector lane. Slots after the last packed index are filled with 0.
/// </para>
/// <para>
/// Every table is allocated with <c>NativeMemory.AlignedAlloc</c> and is never released by this class.
/// The caller owns the returned pointer and must free it with <c>NativeMemory.AlignedFree</c>.
/// </para>
/// </remarks>
internal static unsafe class ShuffleTableGenerator
{
    /// <summary>
    /// Builds the table for 16 lanes of 32-bit indices (one 512-bit vector per row), aligned to 64 bytes.
    /// </summary>
    /// <returns>Pointer to 65536 rows of 16 <see cref="uint"/> values each (4 MiB).</returns>
    public static uint* ComputeShuffleTable512_32Bit()
    {
        // A 16-lane mask can take 2^16 distinct values, so the table needs one row per value.
        // (Sizing it to 512 rows would leave every mask with a bit above bit 8 without a row.)
        const nuint entryCount = 65536;

        return BuildTable32(16, entryCount, 64);
    }

    /// <summary>
    /// Builds the table for 8 lanes of 64-bit indices (one 512-bit vector per row), aligned to 64 bytes.
    /// </summary>
    /// <returns>Pointer to 256 rows of 8 <see cref="ulong"/> values each.</returns>
    public static ulong* ComputeShuffleTable512_64Bit()
    {
        const nuint entryCount = 256;

        return BuildTable64(8, entryCount, 64);
    }

    /// <summary>
    /// Builds the table for 8 lanes of 32-bit indices (one 256-bit vector per row), aligned to 32 bytes.
    /// </summary>
    /// <returns>Pointer to 256 rows of 8 <see cref="uint"/> values each.</returns>
    public static uint* ComputeShuffleTable256_32Bit()
    {
        const nuint entryCount = 256;

        return BuildTable32(8, entryCount, 32);
    }

    /// <summary>
    /// Builds the table for 4 lanes of 64-bit indices (one 256-bit vector per row), aligned to 32 bytes.
    /// </summary>
    /// <returns>Pointer to 16 rows of 4 <see cref="ulong"/> values each.</returns>
    public static ulong* ComputeShuffleTable256_64Bit()
    {
        const nuint entryCount = 16;

        return BuildTable64(4, entryCount, 32);
    }

    /// <summary>
    /// Builds the table for 4 lanes of 32-bit indices (one 128-bit vector per row), aligned to 16 bytes.
    /// </summary>
    /// <returns>Pointer to 16 rows of 4 <see cref="uint"/> values each.</returns>
    public static uint* ComputeShuffleTable128_32Bit()
    {
        const nuint entryCount = 16;

        return BuildTable32(4, entryCount, 16);
    }

    /// <summary>
    /// Builds the table for 2 lanes of 64-bit indices (one 128-bit vector per row), aligned to 16 bytes.
    /// </summary>
    /// <returns>Pointer to 8 rows of 2 <see cref="ulong"/> values each.</returns>
    public static ulong* ComputeShuffleTable128_64Bit()
    {
        // Two lanes only need 4 rows. 8 rows are allocated so the buffer keeps the size this
        // method has always returned; rows 4..7 ignore bit 2 and therefore repeat rows 0..3.
        const nuint entryCount = 8;

        return BuildTable64(2, entryCount, 16);
    }

    /// <summary>
    /// Allocates and fills a table of <paramref name="entryCount"/> rows with
    /// <paramref name="laneCount"/> 32-bit elements per row.
    /// </summary>
    /// <param name="laneCount">Elements per row, and the number of low mask bits that are inspected.</param>
    /// <param name="entryCount">Number of rows; row <c>m</c> describes mask value <c>m</c>.</param>
    /// <param name="alignment">Byte alignment passed to <c>NativeMemory.AlignedAlloc</c>.</param>
    /// <returns>Pointer to the first element of the table.</returns>
    private static uint* BuildTable32(int laneCount, nuint entryCount, nuint alignment)
    {
        var rowLength = (nuint)laneCount;
        var byteCount = entryCount * rowLength * (nuint)sizeof(uint);
        var table = (uint*)NativeMemory.AlignedAlloc(byteCount, alignment);

        for (nuint mask = 0; mask < entryCount; mask++)
        {
            var row = table + (mask * rowLength);
            var written = 0;

            // Pack the indices of the set bits to the front of the row.
            for (var lane = 0; lane < laneCount; lane++)
            {
                if (((mask >> lane) & 1u) != 0u)
                {
                    row[written] = (uint)lane;
                    written++;
                }
            }

            // Zero the unused tail so the whole allocation is initialized.
            while (written < laneCount)
            {
                row[written] = 0;
                written++;
            }
        }

        return table;
    }

    /// <summary>
    /// Allocates and fills a table of <paramref name="entryCount"/> rows with
    /// <paramref name="laneCount"/> 64-bit elements per row.
    /// </summary>
    /// <param name="laneCount">Elements per row, and the number of low mask bits that are inspected.</param>
    /// <param name="entryCount">Number of rows; row <c>m</c> describes mask value <c>m</c>.</param>
    /// <param name="alignment">Byte alignment passed to <c>NativeMemory.AlignedAlloc</c>.</param>
    /// <returns>Pointer to the first element of the table.</returns>
    private static ulong* BuildTable64(int laneCount, nuint entryCount, nuint alignment)
    {
        var rowLength = (nuint)laneCount;
        var byteCount = entryCount * rowLength * (nuint)sizeof(ulong);
        var table = (ulong*)NativeMemory.AlignedAlloc(byteCount, alignment);

        for (nuint mask = 0; mask < entryCount; mask++)
        {
            var row = table + (mask * rowLength);
            var written = 0;

            // Pack the indices of the set bits to the front of the row.
            for (var lane = 0; lane < laneCount; lane++)
            {
                if (((mask >> lane) & 1u) != 0u)
                {
                    row[written] = (ulong)lane;
                    written++;
                }
            }

            // Zero the unused tail so the whole allocation is initialized.
            while (written < laneCount)
            {
                row[written] = 0;
                written++;
            }
        }

        return table;
    }
}
|