head -n 50 /System/Library/Frameworks/MetalPerformancePrimitives.framework/Versions/A/Headers/MPPTensorOpsMatMul2d.h
// -*- Metal -*-
//===-- MetalTensorOpsMatMul2d ----------------------------------------===//
// Copyright (c) 2025 Apple Inc. All rights reserved
//===----------------------------------------------------------------------===//
// This API performs generalized matrix multiplication operation
// C = A*B + C;
// A and B can be tensor_handle, tensor_offset, and tensor_inline.
// C can be tensor_handle, tensor_offset, tensor_inline or cooperative_tensor.
// Data type combinations supported by this operation are as follows:
//
// A B C
// ---------------------------
// half half half
// half int8_t half
// int8_t half half
// half half float
// half float float
// half int8_t float
// float half float
// float float float
// float int8_t float
// int8_t half float
// int8_t float float
// int8_t int8_t int32_t
// bfloat bfloat bfloat
// bfloat bfloat float
// bfloat float float
// bfloat int8_t bfloat
// bfloat int8_t float
// float bfloat float
// int8_t bfloat bfloat
// int8_t bfloat float
// bfloat half bfloat
// bfloat half half
// bfloat half float
// half bfloat bfloat
// half bfloat half
// half bfloat float
//
// Basic usage is in the following example which takes M x K matrix A of type
// half, K x N matrix B of type half, both in device memory and produces M x N
// matrix C of type float in device memory. It tiles this matrix multiplication
// in thread groups, where each thread group computes a 64 x 32 tile of output
// by multiplying a 64 x K tile of A with a K x 32 tile of B. This compute kernel
// will be launched with dispatch grid of
//
// MTLSize threadgroups = MTLSizeMake((M + 63)/64, (N + 31)/32, 1);
//