Hi,
After failing to run an OpenCL kernel on the i.MX8, I tried querying the device info with the code below and got some rather interesting output.
// Getting platform and device information
cl_platform_id platformId = NULL;
cl_device_id deviceID = NULL;
cl_uint retNumDevices;
cl_uint retNumPlatforms;
cl_int ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms);
ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_DEFAULT, 1, &deviceID, &retNumDevices);
// Get GPU device information
char cBuffer[1024];
cl_uint maxComputeUnits;
cl_uint maxClockFreq;
cl_ulong globalMemSize;
cl_ulong localMemSize;
cl_ulong maxMemAllocSize;
size_t maxWorkGroupSize;
size_t maxWorkItemSizes[3];
cl_uint maxWorkItemDims;
cl_device_type deviceType;
printf("\n=== OpenCL Device Information ===\n");
// Device name
clGetDeviceInfo(deviceID, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);
printf("Device Name: %s\n", cBuffer);
// Device vendor
clGetDeviceInfo(deviceID, CL_DEVICE_VENDOR, sizeof(cBuffer), &cBuffer, NULL);
printf("Device Vendor: %s\n", cBuffer);
// Device type
clGetDeviceInfo(deviceID, CL_DEVICE_TYPE, sizeof(deviceType), &deviceType, NULL);
printf("Device Type: ");
if (deviceType & CL_DEVICE_TYPE_GPU) printf("GPU\n");
else if (deviceType & CL_DEVICE_TYPE_CPU) printf("CPU\n");
else if (deviceType & CL_DEVICE_TYPE_ACCELERATOR) printf("Accelerator\n");
else printf("Unknown\n");
// OpenCL version
clGetDeviceInfo(deviceID, CL_DEVICE_OPENCL_C_VERSION, sizeof(cBuffer), &cBuffer, NULL);
printf("OpenCL C Version: %s\n", cBuffer);
// Driver version
clGetDeviceInfo(deviceID, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL);
printf("Driver Version: %s\n", cBuffer);
// Compute units
clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits), &maxComputeUnits, NULL);
printf("Max Compute Units: %u\n", maxComputeUnits);
// Clock frequency
clGetDeviceInfo(deviceID, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(maxClockFreq), &maxClockFreq, NULL);
printf("Max Clock Frequency: %u MHz\n", maxClockFreq);
// Global memory
clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(globalMemSize), &globalMemSize, NULL);
printf("Global Memory Size: %lu MB\n", globalMemSize / (1024 * 1024));
// Max memory allocation
clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
printf("Max Memory Allocation: %lu MB\n", maxMemAllocSize / (1024 * 1024));
// Local memory
clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(localMemSize), &localMemSize, NULL);
printf("Local Memory Size: %lu KB\n", localMemSize / 1024);
// Max work group size
clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
printf("Max Work Group Size: %zu\n", maxWorkGroupSize);
// Max work item dimensions
clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(maxWorkItemDims), &maxWorkItemDims, NULL);
printf("Max Work Item Dimensions: %u\n", maxWorkItemDims);
// Max work item sizes
clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL);
printf("Max Work Item Sizes: %zu x %zu x %zu\n", maxWorkItemSizes[0], maxWorkItemSizes[1], maxWorkItemSizes[2]);
printf("=================================\n\n");
ids-1 | === OpenCL Device Information ===
ids-1 | Device Name:
ids-1 | Device Vendor:
ids-1 | Device Type: CPU
ids-1 | OpenCL C Version:
ids-1 | Driver Version:
ids-1 | Max Compute Units: 0
ids-1 | Max Clock Frequency: 1 MHz
ids-1 | Global Memory Size: 268434198 MB
ids-1 | Max Memory Allocation: 268435394 MB
ids-1 | Local Memory Size: 0 KB
ids-1 | Max Work Group Size: 281473657739148
ids-1 | Max Work Item Dimensions: 65535
ids-1 | Max Work Item Sizes: 281473657909640 x 281473657671680 x 281474911768320
ids-1 | =================================
On my PC, the same code produces the following output:
=== OpenCL Device Information ===
Device Name: NVIDIA GeForce RTX 4060
Device Vendor: NVIDIA Corporation
Device Type: GPU
OpenCL C Version: OpenCL C 1.2
Driver Version: 580.65.06
Max Compute Units: 24
Max Clock Frequency: 2460 MHz
Global Memory Size: 7782 MB
Max Memory Allocation: 1945 MB
Local Memory Size: 48 KB
Max Work Group Size: 1024
Max Work Item Dimensions: 3
Max Work Item Sizes: 1024 x 1024 x 64
=================================
The OpenCL kernel (and application) mentioned above has been tested on both AMD and NVIDIA GPUs and works perfectly well.
My Verdin module has been flashed with a standard Torizon OS image; I have not made any customisations (such as drivers, device tree, or U-Boot changes).
torizon@verdin-imx8mm-15400884:~$ uname -a
Linux verdin-imx8mm-15400884 5.15.177-6.8.2+git.5a6602cc6c32 #1-TorizonCore SMP PREEMPT Fri Mar 21 16:22:58 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
Is there any documentation or guide on how to properly prepare the device for OpenCL?