diff --git a/Cargo.toml b/Cargo.toml index 1ccdab5..9d4b127 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "blaze-rs" description = "A Rustified OpenCL Experience" -version = "0.1.0" +version = "0.1.1" edition = "2021" authors = ["Alex Andreba "] license = "MIT" @@ -9,7 +9,13 @@ homepage = "https://blaze-rs.com" repository = "https://github.com/Aandreba/blaze" readme = "docs/src/intro.md" keywords = ["opencl", "rust", "blaze", "gpgpu", "gpu"] -categories = ["api-bindings", "concurrency", "hardware-support", "science", "rust-patterns"] +categories = [ + "api-bindings", + "concurrency", + "hardware-support", + "science", + "rust-patterns", +] exclude = [".github", "Makefile", "TODO.md", "CHANGELOG.md", "blase"] build = "build.rs" @@ -34,7 +40,7 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [dependencies] -blaze-proc = { path = "blaze-proc", version = "0.1.0" } +blaze-proc = { path = "blaze-proc", version = "0.1.1" } opencl-sys = { version = "0.2.1", default-features = false } futures = { version = "0.3.21", optional = true } # ffmpeg-sys-next = { version = "5.0.1", optional = true } @@ -44,9 +50,9 @@ bytemuck = "1.10.0" bytemuck_derive = "1.1.1" crossbeam = "0.8.2" once_cell = "1.13.0" -utils-atomics = "0.4.5" +utils-atomics = "0.5.0" num-traits = "0.2.15" -num_enum = "0.5.7" +num_enum = "0.6.0" bitflags = "1" cfg-if = "1" bitvec = "1" @@ -65,4 +71,3 @@ camino = "1.1.2" [workspace] members = ["blaze-proc", "blase"] - diff --git a/blaze-proc/Cargo.toml b/blaze-proc/Cargo.toml index d4200bf..1a15ff2 100644 --- a/blaze-proc/Cargo.toml +++ b/blaze-proc/Cargo.toml @@ -1,16 +1,16 @@ -[package] -name = "blaze-proc" -version = "0.1.0" -edition = "2021" -description = "Blaze internal/external proc macros" -license = "MIT" - -[lib] -proc-macro = true - -[dependencies] -syn = { version = "1", features = ["full", "extra-traits"] } -proc-macro2 = "1" -quote = "1" -derive-syn-parse = "0.1.5" -elor = "1" \ No newline at end of file +[package] +name = "blaze-proc" +version = "0.1.1" +edition = "2021" +description = "Blaze internal/external proc macros" +license = "MIT" + +[lib] +proc-macro = true + +[dependencies] +syn = { version = "1", features = ["full", "extra-traits"] } +proc-macro2 = "1" +quote = "1" +derive-syn-parse = "0.1.5" +elor = "1" diff --git a/blaze-proc/src/cl/mod.rs b/blaze-proc/src/cl/mod.rs index dc54df4..6baa7f1 100644 --- a/blaze-proc/src/cl/mod.rs +++ b/blaze-proc/src/cl/mod.rs @@ -1,7 +1,7 @@ use derive_syn_parse::Parse; -use proc_macro2::{TokenStream, Ident}; -use quote::{quote, format_ident, ToTokens}; -use syn::{Visibility, Token, Generics, parse_quote, Abi, punctuated::Punctuated, Attribute, Expr}; +use proc_macro2::{Ident, TokenStream}; +use quote::{format_ident, quote, ToTokens}; +use syn::{parse_quote, punctuated::Punctuated, Abi, Attribute, Expr, Generics, Token, Visibility}; use crate::utils::to_pascal_case; @@ -13,15 +13,21 @@ macro_rules! peek_and_parse { } v - }} + }}; } flat_mod!(ty, kern, arg); -pub fn blaze_c (prog_vis: Visibility, ident: Ident, generics: Generics, blaze: Blaze, content: Expr) -> TokenStream { +pub fn blaze_c( + prog_vis: Visibility, + ident: Ident, + generics: Generics, + blaze: Blaze, + content: Expr, +) -> TokenStream { let Blaze { vis, kernels, .. } = blaze; let kernel_vis = vis; - let vis = prog_vis; + let vis = prog_vis; let phantom_generics = match generics.params.is_empty() { true => None, @@ -30,28 +36,41 @@ pub fn blaze_c (prog_vis: Visibility, ident: Ident, generics: Generics, blaze: B let lt = generics.lifetimes().map(|p| quote! { &#p () }); let iter = ty.chain(lt); - Some(quote! { #[doc(hidden)] __blaze_phtm__: ::core::marker::PhantomData::<(#(#iter),*)>,}) + Some( + quote! { #[doc(hidden)] __blaze_phtm__: ::core::marker::PhantomData::<(#(#iter),*)>,}, + ) } }; - let phantom_fill = phantom_generics.as_ref().map(|_| quote! { __blaze_phtm__: ::core::marker::PhantomData, }); + let phantom_fill = phantom_generics + .as_ref() + .map(|_| quote! { __blaze_phtm__: ::core::marker::PhantomData, }); let mut program_generics = generics.clone(); - program_generics.params.push(parse_quote!(C: ::blaze_rs::context::Context = ::blaze_rs::context::Global)); + program_generics.params.push(parse_quote!( + C: ::blaze_rs::context::Context = ::blaze_rs::context::Global + )); let (prog_imp, prog_ty, prog_wher) = program_generics.split_for_impl(); let (glob_imp, glob_ty, glob_wher) = generics.split_for_impl(); let kernel_names = kernels.iter().map(|x| &x.ident).collect::>(); - let kernel_attrs = kernels.iter().map(|x| x.attrs.attrs.as_slice()).collect::>(); - let kernel_extern_names = kernels.iter().map(|x| { - if let Some(ref name) = x.attrs.link_name { - return name.to_token_stream() - } + let kernel_attrs = kernels + .iter() + .map(|x| x.attrs.attrs.as_slice()) + .collect::>(); + let kernel_extern_names = kernels + .iter() + .map(|x| { + if let Some(ref name) = x.attrs.link_name { + return name.to_token_stream(); + } - let ident = &x.ident; - quote! { stringify!(#ident) } - }).collect::>(); + let ident = &x.ident; + quote! { stringify!(#ident) } + }) + .collect::>(); - let kernel_structs = kernels.iter() + let kernel_structs = kernels + .iter() .map(|x| create_kernel(&kernel_vis, &ident, &generics, &program_generics, x)); let kernel_defs = kernels.iter().map(|x| { @@ -82,19 +101,21 @@ pub fn blaze_c (prog_vis: Visibility, ident: Ident, generics: Generics, blaze: B let (__blaze_inner__, __blaze_kernels__) = ::blaze_rs::core::RawProgram::from_source_in(&__blaze_ctx__, #content, options)?; #( + #[allow(unused_doc_comments)] #(#kernel_attrs)* let mut #kernel_names = None; )* for __blaze_kernel__ in __blaze_kernels__.into_iter() { match __blaze_kernel__.name()?.as_str() { - #(#(#kernel_attrs)* #kernel_extern_names => #kernel_names = unsafe { Some(__blaze_kernel__.clone()) }),*, + #(#[allow(unused_doc_comments)] #(#kernel_attrs)* #kernel_extern_names => #kernel_names = unsafe { Some(__blaze_kernel__.clone()) }),*, _ => {} //__other => return Err(::blaze_rs::core::Error::new(::blaze_rs::core::ErrorKind::InvalidKernel, format!("unknown kernel '{}'", __other))) } } #( + #[allow(unused_doc_comments)] #(#kernel_attrs)* let #kernel_names = match #kernel_names { Some(__x) => ::std::sync::Mutex::new(__x), @@ -106,9 +127,15 @@ pub fn blaze_c (prog_vis: Visibility, ident: Ident, generics: Generics, blaze: B __blaze_inner__, __blaze_ctx__, #phantom_fill - #(#(#kernel_attrs)* #kernel_names),* + #(#[allow(unused_doc_comments)] #(#kernel_attrs)* #kernel_names),* }) } + + /// Returns the context of the program + #[inline] + #vis fn context (&self) -> &C { + &self.__blaze_ctx__ + } } impl #prog_imp ::std::ops::Deref for #ident #prog_ty #prog_wher { @@ -124,8 +151,20 @@ pub fn blaze_c (prog_vis: Visibility, ident: Ident, generics: Generics, blaze: B } } -fn create_kernel (default_vis: &Visibility, parent: &Ident, impl_generics: &Generics, parent_generics: &Generics, kernel: &Kernel) -> TokenStream { - let Kernel { attrs, vis, ident, args, .. } = kernel; +fn create_kernel( + default_vis: &Visibility, + parent: &Ident, + impl_generics: &Generics, + parent_generics: &Generics, + kernel: &Kernel, +) -> TokenStream { + let Kernel { + attrs, + vis, + ident, + args, + .. + } = kernel; let mut generics = parse_quote! { <'__scope__, '__env__: '__scope__> }; let (parent_imp, parent_ty, parent_wher) = parent_generics.split_for_impl(); let attrs = match attrs.attrs.is_empty() { @@ -138,25 +177,46 @@ fn create_kernel (default_vis: &Visibility, parent: &Ident, impl_generics: &Gene let vis = match vis { Visibility::Inherited => default_vis, - other => other + other => other, }; let name = args.iter().map(|x| x.name.clone()).collect::>(); - let new = args.iter().map(|x| x.ty(&mut generics, true)).collect::>(); + let new = args + .iter() + .map(|x| x.ty(&mut generics, true)) + .collect::>(); assert_eq!(name.len(), new.len()); //panic!("{name:?}: {new:?}"); - let pointer_names = args.iter().filter_map(|x| if x.ty.is_pointer() { Some(&x.name) } else { None }).collect::>(); - let set = args.iter().enumerate().map(|(i, x)| set_arg(x, u32::try_from(i).unwrap())).collect::>(); + let pointer_names = args + .iter() + .filter_map(|x| { + if x.ty.is_pointer() { + Some(&x.name) + } else { + None + } + }) + .collect::>(); + let set = args + .iter() + .enumerate() + .map(|(i, x)| set_arg(x, u32::try_from(i).unwrap())) + .collect::>(); //generics.params.extend(impl_generics.params.iter().cloned()); let blocking_ident = format_ident!("{ident}_blocking"); - let mut blocking_generics : Generics = parse_quote! { }; - let blocking_new = args.iter().map(|x| x.ty(&mut blocking_generics, false)).collect::>();(); + let mut blocking_generics: Generics = parse_quote! { }; + let blocking_new = args + .iter() + .map(|x| x.ty(&mut blocking_generics, false)) + .collect::>(); + (); let (blocking_impl, _, blocking_where) = blocking_generics.split_for_impl(); // Remove `'scope` lifetime - let event_params = generics.params + let event_params = generics + .params .iter() .take(1) .chain(generics.params.iter().skip(2)) @@ -166,22 +226,25 @@ fn create_kernel (default_vis: &Visibility, parent: &Ident, impl_generics: &Gene event_generics.params = event_params; event_generics.where_clause = generics.where_clause.clone(); - event_generics.params.extend(impl_generics.params.iter().cloned()); - let event_new = new.iter() + event_generics + .params + .extend(impl_generics.params.iter().cloned()); + let event_new = new + .iter() .map(|x| { let mut x = x.clone(); if let syn::Type::Reference(ref mut rf) = x { if rf.lifetime == Some(parse_quote! { '__env__ }) { - rf.lifetime = Some(parse_quote! { '__scope__ }); - } + rf.lifetime = Some(parse_quote! { '__scope__ }); + } } - return x + return x; }) .chain(impl_generics.type_params().map(|x| { let mut x = x.clone(); x.colon_token = None; x.bounds.clear(); - return parse_quote! { #x } + return parse_quote! { #x }; })) .collect::>(); let (_, event_type, _) = event_generics.split_for_impl(); @@ -253,7 +316,7 @@ fn create_kernel (default_vis: &Visibility, parent: &Ident, impl_generics: &Gene } } -fn set_arg (arg: &Argument, idx: u32) -> TokenStream { +fn set_arg(arg: &Argument, idx: u32) -> TokenStream { let Argument { name, .. } = arg; match arg.ty { @@ -261,8 +324,10 @@ fn set_arg (arg: &Argument, idx: u32) -> TokenStream { ::blaze_rs::buffer::KernelPointer::set_arg(#name, &mut __blaze_kernel__, &mut wait, #idx)? }, - Type::Image2d => quote! { __blaze_kernel__.set_argument(#idx, ::blaze_rs::image::DynImage2D::id_ref(#name))? }, - _ => quote! { __blaze_kernel__.set_argument(#idx, #name)? } + Type::Image2d => { + quote! { __blaze_kernel__.set_argument(#idx, ::blaze_rs::image::DynImage2D::id_ref(#name))? } + } + _ => quote! { __blaze_kernel__.set_argument(#idx, #name)? }, } } @@ -276,11 +341,11 @@ pub struct Blaze { pub brace_token: syn::token::Brace, #[inside(brace_token)] #[call(Punctuated::parse_terminated)] - pub kernels: Punctuated + pub kernels: Punctuated, } #[derive(Parse)] pub struct Link { pub eq_token: Token![=], - pub meta: Expr -} \ No newline at end of file + pub meta: Expr, +} diff --git a/blaze-proc/src/lib.rs b/blaze-proc/src/lib.rs index 13fe268..d5bff86 100644 --- a/blaze-proc/src/lib.rs +++ b/blaze-proc/src/lib.rs @@ -9,54 +9,63 @@ macro_rules! flat_mod { } } -use cl::{Link}; +use cl::Link; use derive_syn_parse::Parse; use error::Error; -use proc_macro2::{TokenStream, Ident}; -use quote::{ToTokens, quote, format_ident}; -use syn::{parse_macro_input, ItemStatic, Meta, DeriveInput, Generics, punctuated::Punctuated, Visibility, ItemType, WherePredicate, WhereClause, parse_quote}; +use proc_macro2::{Ident, TokenStream}; +use quote::{format_ident, quote, ToTokens}; +use syn::{ + parse_macro_input, parse_quote, punctuated::Punctuated, DeriveInput, Generics, ItemStatic, + ItemType, Meta, Visibility, WhereClause, WherePredicate, +}; use crate::cl::Blaze; +mod cl; mod context; mod error; -mod utils; -mod cl; mod num; +mod utils; #[proc_macro_derive(NumOps, attributes(uninit))] -pub fn derive_num_ops (items: proc_macro::TokenStream) -> proc_macro::TokenStream { +pub fn derive_num_ops(items: proc_macro::TokenStream) -> proc_macro::TokenStream { let items = parse_macro_input!(items as DeriveInput); num::derive_ops(items).into() } #[proc_macro_derive(NumOpsAssign, attributes(uninit))] -pub fn derive_num_ops_assign (items: proc_macro::TokenStream) -> proc_macro::TokenStream { +pub fn derive_num_ops_assign(items: proc_macro::TokenStream) -> proc_macro::TokenStream { let items = parse_macro_input!(items as DeriveInput); num::derive_ops_assign(items).into() } #[proc_macro_attribute] -pub fn global_context (_attrs: proc_macro::TokenStream, items: proc_macro::TokenStream) -> proc_macro::TokenStream { +pub fn global_context( + _attrs: proc_macro::TokenStream, + items: proc_macro::TokenStream, +) -> proc_macro::TokenStream { let items = parse_macro_input!(items as ItemStatic); context::global_context(items).into() } #[proc_macro] -pub fn error (items: proc_macro::TokenStream) -> proc_macro::TokenStream { +pub fn error(items: proc_macro::TokenStream) -> proc_macro::TokenStream { let input = parse_macro_input!(items as Error); input.to_token_stream().into() } #[proc_macro_attribute] -pub fn newtype (attrs: proc_macro::TokenStream, items: proc_macro::TokenStream) -> proc_macro::TokenStream { - fn extra_where (where_generics: Option<&WhereClause>, extra: WherePredicate) -> WhereClause { +pub fn newtype( + attrs: proc_macro::TokenStream, + items: proc_macro::TokenStream, +) -> proc_macro::TokenStream { + fn extra_where(where_generics: Option<&WhereClause>, extra: WherePredicate) -> WhereClause { match where_generics { Some(x) => { let mut x = x.clone(); x.predicates.push(extra); - return x - }, + return x; + } None => { let mut predicates = Punctuated::new(); @@ -64,17 +73,28 @@ pub fn newtype (attrs: proc_macro::TokenStream, items: proc_macro::TokenStream) WhereClause { where_token: Default::default(), - predicates + predicates, } } } } let inner_vis = parse_macro_input!(attrs as Visibility); - let ItemType { attrs, vis, ident, generics, ty, semi_token, .. } = parse_macro_input!(items as ItemType); + let ItemType { + attrs, + vis, + ident, + generics, + ty, + semi_token, + .. + } = parse_macro_input!(items as ItemType); let (impl_generics, ty_generics, where_generics) = generics.split_for_impl(); - let consumer_generics = extra_where(r#where_generics, parse_quote! { #ty: blaze_rs::event::Consumer }); + let consumer_generics = extra_where( + r#where_generics, + parse_quote! { #ty: blaze_rs::event::Consumer }, + ); let debug_generics = extra_where(r#where_generics, parse_quote! { #ty: ::core::fmt::Debug }); let clone_generics = extra_where(r#where_generics, parse_quote! { #ty: ::core::clone::Clone }); let copy_generics = extra_where(r#where_generics, parse_quote! { #ty: ::core::marker::Copy }); @@ -91,7 +111,7 @@ pub fn newtype (attrs: proc_macro::TokenStream, items: proc_macro::TokenStream) <#ty as blaze_rs::event::Consumer>::consume(self.0) } } - + impl #impl_generics ::core::fmt::Debug for #ident #ty_generics #debug_generics { #[inline(always)] fn fmt (&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result { @@ -107,14 +127,15 @@ pub fn newtype (attrs: proc_macro::TokenStream, items: proc_macro::TokenStream) } impl #impl_generics ::core::marker::Copy for #ident #ty_generics #copy_generics {} - }.into() + } + .into() } #[proc_macro] -pub fn join_various_blocking (items: proc_macro::TokenStream) -> proc_macro::TokenStream { +pub fn join_various_blocking(items: proc_macro::TokenStream) -> proc_macro::TokenStream { #[derive(Parse)] - struct Input (#[call(Punctuated::parse_terminated)] Punctuated); - + struct Input(#[call(Punctuated::parse_terminated)] Punctuated); + let item = parse_macro_input!(items as Input).0.into_iter(); let idx = (0..item.len()).map(syn::Index::from).collect::>(); @@ -128,11 +149,15 @@ pub fn join_various_blocking (items: proc_macro::TokenStream) -> proc_macro::Tok ),* )) }) - }}.into() + }} + .into() } #[proc_macro_attribute] -pub fn blaze (attrs: proc_macro::TokenStream, items: proc_macro::TokenStream) -> proc_macro::TokenStream { +pub fn blaze( + attrs: proc_macro::TokenStream, + items: proc_macro::TokenStream, +) -> proc_macro::TokenStream { let ident = parse_macro_input!(attrs as BlazeIdent); let items = parse_macro_input!(items as Blaze); @@ -142,19 +167,22 @@ pub fn blaze (attrs: proc_macro::TokenStream, items: proc_macro::TokenStream) -> let tokens = attr.tokens.clone().into(); let link = parse_macro_input!(tokens as Link); inner = Some(link.meta); - break + break; } } if let Some(inner) = inner { - return cl::blaze_c(ident.vis, ident.ident, ident.generics, items, inner).into() + return cl::blaze_c(ident.vis, ident.ident, ident.generics, items, inner).into(); } panic!("No source code specified"); } #[proc_macro_attribute] -pub fn docfg (attrs: proc_macro::TokenStream, items: proc_macro::TokenStream) -> proc_macro::TokenStream { +pub fn docfg( + attrs: proc_macro::TokenStream, + items: proc_macro::TokenStream, +) -> proc_macro::TokenStream { let attrs = parse_macro_input!(attrs as Meta); let items = parse_macro_input!(items as TokenStream); @@ -162,12 +190,13 @@ pub fn docfg (attrs: proc_macro::TokenStream, items: proc_macro::TokenStream) -> #[cfg_attr(docsrs, doc(cfg(#attrs)))] #[cfg(#attrs)] #items - }.into() + } + .into() } #[derive(Parse)] struct BlazeIdent { vis: Visibility, ident: Ident, - generics: Generics -} \ No newline at end of file + generics: Generics, +} diff --git a/build.rs b/build.rs index 7894ecb..778d98a 100644 --- a/build.rs +++ b/build.rs @@ -1,20 +1,20 @@ -use camino::Utf8Path; - -pub fn main() { - #[cfg(windows)] - include_opencl(); -} - -#[cfg(windows)] -fn include_opencl () { - if let Some(path) = option_env!("CUDA_PATH") { - let lib = Utf8Path::new(path).join("lib"); - #[cfg(target_pointer_width = "32")] - let path = lib.join("Win32"); - #[cfg(target_pointer_width = "64")] - let path = lib.join("x64"); - println!("cargo:rustc-link-search={path}"); - } else { - eprintln!("OpenCL library path not found. This may result in an error in Windoes systems.") - } -} \ No newline at end of file +use camino::Utf8Path; + +pub fn main() { + #[cfg(windows)] + include_opencl(); +} + +#[cfg(windows)] +fn include_opencl() { + if let Some(path) = option_env!("CUDA_PATH") { + let lib = Utf8Path::new(path).join("lib"); + #[cfg(target_pointer_width = "32")] + let path = lib.join("Win32"); + #[cfg(target_pointer_width = "64")] + let path = lib.join("x64"); + println!("cargo:rustc-link-search={path}"); + } else { + eprintln!("OpenCL library path not found. This may result in an error in Windows systems.") + } +} diff --git a/src/core/device.rs b/src/core/device.rs index 9620e91..a219f2a 100644 --- a/src/core/device.rs +++ b/src/core/device.rs @@ -1,1347 +1,1509 @@ -use core::{mem::MaybeUninit, num::{NonZeroUsize, NonZeroU32, NonZeroU64, IntErrorKind}, fmt::{Debug, Display}, str::FromStr}; -use std::{ptr::{NonNull}, ffi::c_void}; -use opencl_sys::*; -use blaze_proc::docfg; -use crate::buffer::flags::MemAccess; -use super::*; - -lazy_static! { - static ref DEVICES : Vec = unsafe { - let mut result = Vec::::new(); - - for platform in RawPlatform::all() { - let mut cnt = 0; - tri_panic!(clGetDeviceIDs(platform.id(), CL_DEVICE_TYPE_ALL, 0, core::ptr::null_mut(), &mut cnt)); - let cnt_size = usize::try_from(cnt).unwrap(); - - result.reserve(cnt_size); - tri_panic!(clGetDeviceIDs(platform.id(), CL_DEVICE_TYPE_ALL, cnt, result.as_mut_ptr().add(result.len()).cast(), core::ptr::null_mut())); - result.set_len(result.len() + cnt_size); - } - - result - }; -} - -/// OpenCL device -#[derive(PartialEq, Eq, Hash)] -#[repr(transparent)] -pub struct RawDevice (NonNull); - -impl RawDevice { - #[inline(always)] - pub const fn id (&self) -> cl_device_id { - self.0.as_ptr() - } - - #[inline(always)] - pub const unsafe fn from_id (id: cl_device_id) -> Option { - NonNull::new(id).map(Self) - } - - #[inline(always)] - pub const unsafe fn from_id_unchecked (id: cl_device_id) -> Self { - Self(NonNull::new_unchecked(id)) - } - - /// The default compute device address space size specified as an unsigned integer value in bits. Currently supported values are 32 or 64 bits. - #[inline(always)] - pub fn address_bits (&self) -> Result { - self.get_info_bits(CL_DEVICE_ADDRESS_BITS) - } - - /// Describes the various memory orders and scopes that the device supports for atomic memory operations. - #[docfg(feature = "cl3")] - #[inline(always)] - pub fn atomic_memory_capabilities (&self) -> Result> { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES)?; - Ok(AtomicCapabilities::from_bits(v)) - } - - /// Describes the various memory orders and scopes that the device supports for atomic fence operations. - #[docfg(feature = "cl3")] - #[inline(always)] - pub fn atomic_fence_capabilities (&self) -> Result> { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_ATOMIC_FENCE_CAPABILITIES)?; - Ok(AtomicCapabilities::from_bits(v)) - } - - /// Is ```true``` if the device is available and ```false``` if the device is not available. - #[inline(always)] - pub fn available (&self) -> Result { - let v = self.get_info_bits::(CL_DEVICE_AVAILABLE)?; - Ok(v != 0) - } - - /// A list of built-in kernels supported by the device. An empty list is returned if no built-in kernels are supported by the device. - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn built_in_kernels (&self) -> Result> { - Ok(self.built_in_kernels_string()? - .split(';') - .map(str::trim) - .map(str::to_string) - .collect::>()) - } - - /// A semi-colon separated list of built-in kernels supported by the device. An empty string is returned if no built-in kernels are supported by the device. - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn built_in_kernels_string (&self) -> Result { - self.get_info_string(opencl_sys::CL_DEVICE_BUILT_IN_KERNELS) - } - - /// Is ```false``` if the implementation does not have a compiler available to compile the program source. Is ```true``` if the compiler is available. This can be CL_FALSE for the embedded platform profile only. - #[inline(always)] - pub fn compiler_available (&self) -> Result { - let v = self.get_info_bits::(CL_DEVICE_COMPILER_AVAILABLE)?; - Ok(v != 0) - } - - /// Describes device-side enqueue capabilities of the device. - #[docfg(feature = "cl3")] - #[inline(always)] - pub fn device_enqueue_capabilities (&self) -> Result> { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES)?; - Ok(DeviceEnqueueCapabilities::from_bits(v)) - } - - /// Describes the OPTIONAL double precision floating-point capability of the OpenCL device - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn double_fp_config (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_DOUBLE_FP_CONFIG) - } - - /// Is ```true``` if the OpenCL device is a little endian device and ```false``` otherwise. - #[inline(always)] - pub fn endian_little (&self) -> Result { - let v = self.get_info_bits::(CL_DEVICE_ENDIAN_LITTLE)?; - Ok(v != 0) - } - - /// Is ```true``` if the device implements error correction for the memories, caches, registers etc. in the device. Is ```false``` if the device does not implement error correction. This can be a requirement for certain clients of OpenCL. - #[inline(always)] - pub fn error_connection_support (&self) -> Result { - let v = self.get_info_bits::(CL_DEVICE_ERROR_CORRECTION_SUPPORT)?; - Ok(v != 0) - } - - /// Describes the execution capabilities of the device - #[inline(always)] - pub fn execution_capabilities (&self) -> Result { - self.get_info_bits(CL_DEVICE_EXECUTION_CAPABILITIES) - } - - /// Returns a list of extension names - #[inline(always)] - pub fn extensions (&self) -> Result> { - Ok ( - self.get_info_string(CL_DEVICE_EXTENSIONS)? - .split_whitespace() - .map(String::from) - .collect::>() - ) - } - - /// Returns a space-separated list of extension names (the extension names themselves do not contain any spaces) - #[inline(always)] - pub fn extensions_string (&self) -> Result { - self.get_info_string(CL_DEVICE_EXTENSIONS) - } - - /// Is ```true``` if the device supports the generic address space and its associated built-in functions, and ```false``` otherwise. - #[docfg(feature = "cl3")] - #[inline(always)] - pub fn generic_address_space_support (&self) -> Result { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT)?; - Ok(v != 0) - } - - /// Size of global memory cache in bytes. - #[inline(always)] - pub fn global_mem_cache_size (&self) -> Result { - self.get_info_bits(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) - } - - /// Type of global memory cache supported. - #[inline(always)] - pub fn global_mem_cache_type (&self) -> Result { - match self.get_info_bits::(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE)? { - CL_NONE => Ok(MemAccess::NONE), - CL_READ_ONLY_CACHE => Ok(MemAccess::READ_ONLY), - CL_READ_WRITE_CACHE => Ok(MemAccess::READ_WRITE), - _ => unreachable!() - } - } - - /// Size of global memory cache line in bytes. - #[inline(always)] - pub fn global_mem_cahceline_size (&self) -> Result { - self.get_info_bits(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) - } - - /// Size of global memory in bytes. - #[inline(always)] - pub fn global_mem_size (&self) -> Result { - self.get_info_bits(CL_DEVICE_GLOBAL_MEM_SIZE) - } - - /// Maximum preferred total size, in bytes, of all program variables in the global address space. This is a performance hint. An implementation may place such variables in storage with optimized device access. This query returns the capacity of such storage. The minimum value is 0. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn global_variable_preferred_total_size (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE) - } - - /// Describes the OPTIONAL half precision floating-point capability of the OpenCL device - #[inline(always)] - pub fn half_fp_config (&self) -> Result { - self.get_info_bits(CL_DEVICE_HALF_FP_CONFIG) - } - - /// Is ```true``` if the device and the host have a unified memory subsystem and is ```false``` otherwise. - #[docfg(feature = "cl1_1")] - #[cfg_attr(feature = "cl2", deprecated)] - #[inline(always)] - pub fn host_unified_memory (&self) -> Result { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_HOST_UNIFIED_MEMORY)?; - Ok(v != 0) - } - - /// The intermediate languages that can be supported by clCreateProgramWithIL for this device. - #[docfg(feature = "cl2_1")] - #[inline(always)] - pub fn il_version (&self) -> Result { - self.get_info_string(opencl_sys::CL_DEVICE_IL_VERSION) - } - - /// Is ```true``` if images are supported by the OpenCL device and ```false``` otherwise. - #[inline(always)] - pub fn image_support (&self) -> Result { - let v = self.get_info_bits::(CL_DEVICE_IMAGE_SUPPORT)?; - Ok(v != 0) - } - - /// Max number of images in a 1D or 2D image array. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE, the value is 0 otherwise. - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn image_max_array_size (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_IMAGE_MAX_ARRAY_SIZE).map(NonZeroUsize::new) - } - - /// Max number of pixels for a 1D image created from a buffer object. The minimum value is 65536 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE, the value is 0 otherwise. - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn image_max_buffer_size (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_IMAGE_MAX_BUFFER_SIZE).map(NonZeroUsize::new) - } - - /// The row pitch alignment size in pixels for 2D images created from a buffer. The value returned must be a power of 2. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn image_pitch_alignment (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_IMAGE_PITCH_ALIGNMENT).map(NonZeroU32::new) - } - - /// This query specifies the minimum alignment in pixels of the host_ptr specified to clCreateBuffer or clCreateBufferWithProperties when a 2D image is created from a buffer which was created using CL_MEM_USE_HOST_PTR. The value returned must be a power of 2. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn image_base_address_alignment (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_IMAGE_PITCH_ALIGNMENT).map(NonZeroU32::new) - } - - /// Max height of 2D image in pixels. The minimum value is 8192 if [`image_support`](RawDevice::image_support) is ```true```. - #[inline(always)] - pub fn image2d_max_height (&self) -> Result> { - self.get_info_bits::(CL_DEVICE_IMAGE2D_MAX_HEIGHT).map(NonZeroUsize::new) - } - - /// Max width of 2D image in pixels. The minimum value is 8192 if [`image_support`](RawDevice::image_support) is ```true```. - #[inline(always)] - pub fn image2d_max_width (&self) -> Result> { - self.get_info_bits::(CL_DEVICE_IMAGE2D_MAX_WIDTH).map(NonZeroUsize::new) - } - - /// Max depth of 3D image in pixels. The minimum value is 2048 if [`image_support`](RawDevice::image_support) is ```true```. - #[inline(always)] - pub fn image3d_max_depth (&self) -> Result> { - self.get_info_bits::(CL_DEVICE_IMAGE3D_MAX_DEPTH).map(NonZeroUsize::new) - } - - /// Max height of 3D image in pixels. The minimum value is 2048 if [`image_support`](RawDevice::image_support) is ```true```. - #[inline(always)] - pub fn image3d_max_height (&self) -> Result> { - self.get_info_bits::(CL_DEVICE_IMAGE3D_MAX_HEIGHT).map(NonZeroUsize::new) - } - - /// Max width of 3D image in pixels. The minimum value is 2048 if [`image_support`](RawDevice::image_support) is ```true```. - #[inline(always)] - pub fn image3d_max_width (&self) -> Result> { - self.get_info_bits::(CL_DEVICE_IMAGE3D_MAX_WIDTH).map(NonZeroUsize::new) - } - - /// Returns the latest version of the conformance test suite that this device has fully passed in accordance with the official conformance process. - #[docfg(feature = "cl3")] - #[inline(always)] - pub fn latest_conformance_version_passed (&self) -> Result { - self.get_info_string(opencl_sys::CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED) - } - - /// Size of local memory arena in bytes. The minimum value is 16 KB. - #[inline(always)] - pub fn local_mem_size (&self) -> Result { - unsafe { - Ok(NonZeroU64::new_unchecked(self.get_info_bits::(CL_DEVICE_LOCAL_MEM_SIZE)?)) - } - } - - /// Type of local memory supported. - #[inline(always)] - pub fn local_mem_type (&self) -> Result { - self.get_info_bits(CL_DEVICE_LOCAL_MEM_TYPE) - } - - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn linker_available (&self) -> Result { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_LINKER_AVAILABLE)?; - Ok(v != 0) - } - - /// Maximum configured clock frequency of the device in MHz. - #[docfg(feature = "cl2_2")] - #[inline(always)] - pub fn max_clock_frequency (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_MAX_CLOCK_FREQUENCY) - } - - /// The number of parallel compute cores on the OpenCL device. The minimum value is 1. - #[inline(always)] - pub fn max_compute_units (&self) -> Result { - unsafe { - Ok(NonZeroU32::new_unchecked(self.get_info_bits::(CL_DEVICE_MAX_COMPUTE_UNITS)?)) - } - } - - /// Max number of arguments declared with the ```__constant``` qualifier in a kernel. The minimum value is 8. - #[inline(always)] - pub fn max_constant_args (&self) -> Result { - unsafe { - Ok(NonZeroU32::new_unchecked(self.get_info_bits::(CL_DEVICE_MAX_CONSTANT_ARGS)?)) - } - } - - /// Max size in bytes of a constant buffer allocation. The minimum value is 64 KB. - #[inline(always)] - pub fn max_constant_buffer_size (&self) -> Result { - unsafe { - Ok(NonZeroU64::new_unchecked(self.get_info_bits::(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE)?)) - } - } - - /// The maximum number of bytes of storage that may be allocated for any single variable in program scope or inside a function in an OpenCL kernel language declared in the global address space. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn max_global_variable_size (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE).map(NonZeroUsize::new) - } - - /// Max size of memory object allocation in bytes. The minimum value is max (1/4th of [```global_mem_size```](), 128*1024*1024) - #[inline(always)] - pub fn max_mem_alloc_size (&self) -> Result { - unsafe { - Ok(NonZeroU64::new_unchecked(self.get_info_bits::(CL_DEVICE_MAX_MEM_ALLOC_SIZE)?)) - } - } - - /// Maximum number of sub-groups in a work-group that a device is capable of executing on a single compute unit, for any given kernel-instance running on the device. The minimum value is 1 if the device supports subgroups, and must be 0 for devices that do not support subgroups. Support for subgroups is required for an OpenCL 2.1 or 2.2 device. - #[docfg(feature = "cl2_1")] - #[inline(always)] - pub fn max_num_sub_groups (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_MAX_NUM_SUB_GROUPS).map(NonZeroU32::new) - } - - /// The maximum number of events in use by a device queue. These refer to events returned by the enqueue_ built-in functions to a device queue or user events returned by the create_user_event built-in function that have not been released. The minimum value is 1024 for devices supporting on-device queues, and must be 0 for devices that do not support on-device queues. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn max_on_device_events (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_MAX_ON_DEVICE_EVENTS).map(NonZeroU32::new) - } - - /// The maximum number of device queues that can be created for this device in a single context. The minimum value is 1 for devices supporting on-device queues, and must be 0 for devices that do not support on-device queues. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn max_on_device_queues (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_MAX_ON_DEVICE_QUEUES).map(NonZeroU32::new) - } - - /// Max size in bytes of the arguments that can be passed to a kernel. The minimum value is 256. - #[inline(always)] - pub fn max_parameter_size (&self) -> Result { - unsafe { - Ok(NonZeroUsize::new_unchecked(self.get_info_bits::(CL_DEVICE_MAX_PARAMETER_SIZE)?)) - } - } - - /// The maximum number of pipe objects that can be passed as arguments to a kernel. The minimum value is 16 for devices supporting pipes, and must be 0 for devices that do not support pipes. - #[docfg(featurew = "cl2")] - #[inline(always)] - pub fn max_pipe_args (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_MAX_PIPE_ARGS).map(NonZeroU32::new) - } - - /// Max number of simultaneous image objects that can be read by a kernel. The minimum value is 128 if [`image_support`](RawDevice::image_support) is ```true```. - #[inline(always)] - pub fn max_read_image_args (&self) -> Result> { - self.get_info_bits::(CL_DEVICE_MAX_READ_IMAGE_ARGS).map(NonZeroU32::new) - } - - /// Max number of image objects arguments of a kernel declared with the write_only or read_write qualifier. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn max_read_write_image_args (&self) -> Result> { - self.get_info_bits::(opencl_sys::CL_DEVICE_MAX_READ_IMAGE_ARGS).map(NonZeroU32::new) - } - - /// Maximum number of samplers that can be used in a kernel. The minimum value is 16 if [`image_support`](RawDevice::image_support) is ```true```. - #[inline(always)] - pub fn max_samplers (&self) -> Result> { - self.get_info_bits::(CL_DEVICE_MAX_SAMPLERS).map(NonZeroU32::new) - } - - /// Maximum number of work-items in a work-group executing a kernel using the data parallel execution model. The minimum value is 1. - #[inline(always)] - pub fn max_work_group_size (&self) -> Result { - unsafe { - Ok(NonZeroUsize::new_unchecked(self.get_info_bits::(CL_DEVICE_MAX_WORK_GROUP_SIZE)?)) - } - } - - /// Maximum dimensions that specify the global and local work-item IDs used by the data parallel execution model. The minimum value is 3. - #[inline(always)] - pub fn max_work_item_dimensions (&self) -> Result { - unsafe { - Ok(NonZeroU32::new_unchecked(self.get_info_bits::(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)?)) - } - } - - /// Maximum number of work-items that can be specified in each dimension of the work-group to clEnqueueNDRangeKernel. Returns n ```usize``` entries, where n is the value returned by the query for [`max_work_item_dimensions`](RawDevice::max_work_item_dimensions). The minimum value is (1, 1, 1). - #[inline(always)] - pub fn max_work_item_sizes (&self) -> Result> { - let n = usize::try_from(self.max_work_item_dimensions()?.get()).unwrap(); - // FIXME: maybe using nonzero ints messes up the alignment? - let mut max_work_item_sizes = Vec::::with_capacity(n); - - let len = n.checked_mul(core::mem::size_of::()).expect("Integer multiplication oveflow. Too many work items to fit in a vector"); - unsafe { - clGetDeviceInfo(self.id(), CL_DEVICE_MAX_WORK_ITEM_SIZES, len, max_work_item_sizes.as_mut_ptr().cast(), core::ptr::null_mut()); - max_work_item_sizes.set_len(n); - } - - Ok(max_work_item_sizes) - } - - /// Max number of simultaneous image objects that can be written to by a kernel. The minimum value is 8 if [`image_support`](RawDevice::image_support) is ```true```. - #[inline(always)] - pub fn max_write_image_args (&self) -> Result> { - self.get_info_bits::(CL_DEVICE_MAX_WRITE_IMAGE_ARGS).map(NonZeroU32::new) - } - - /// Describes the alignment in bits of the base address of any allocated memory object. - #[inline(always)] - pub fn mem_base_addr_align (&self) -> Result { - self.get_info_bits(CL_DEVICE_MEM_BASE_ADDR_ALIGN) - } - - /// The smallest alignment in bytes which can be used for any data type. - #[cfg_attr(feature = "cl1_2", deprecated)] - #[inline(always)] - pub fn min_data_type_align_size (&self) -> Result { - self.get_info_bits(CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE) - } - - /// Device name string. - #[inline(always)] - pub fn name (&self) -> Result { - self.get_info_string(CL_DEVICE_NAME) - } - - /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn native_vector_width_char (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) - } - - /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn native_vector_width_short (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) - } - - /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn native_vector_width_int (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) - } - - /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn native_vector_width_long (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) - } - - /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(all(feature = "cl1_1", feature = "half"))] - #[inline(always)] - pub fn native_vector_width_half (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) - } - - /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn native_vector_width_float (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) - } - - /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn native_vector_width_double (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) - } - - /// Is ```true``` if the device supports non-uniform work-groups, and ```false``` otherwise. - #[docfg(feature = "cl3")] - #[inline(always)] - pub fn non_uniform_work_group_support (&self) -> Result { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT)?; - Ok(v != 0) - } - - /// Returns the highest fully backwards compatible OpenCL C version supported by the compiler for the device. - #[docfg(feature = "cl1_1")] - #[cfg_attr(feature = "cl3", deprecated)] - #[inline(always)] - pub fn opencl_c_version (&self) -> Result { - self.get_info_string(opencl_sys::CL_DEVICE_OPENCL_C_VERSION) - } - - /// Returns the parent device to which this sub-device belongs. If device is a root-level device, a ```None``` value is returned. - #[docfg(feature = "cl1_2")] - #[inline] - pub fn parent (&self) -> Result> { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_PARENT_DEVICE)?; - if let Some(v) = NonNull::new(v) { - return Ok(Some(Self(v))) - } - - Ok(None) - } - - /// Returns the list of supported affinity domains for partitioning the device. - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn partition_affinity_domain (&self) -> Result> { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_PARTITION_PROPERTIES)?; - - Ok(match v { - 0 => None, - _ => unsafe { Some(core::mem::transmute(v)) } - }) - } - - /// Returns the properties argument specified in clCreateSubDevices if device is a sub-device. - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn partition_type (&self) -> Result> { - let v = self.get_info_array::(opencl_sys::CL_DEVICE_PARTITION_TYPE)?; - Ok(PartitionProperty::from_slice(&v)) - } - - /// Returns the maximum number of sub-devices that can be created when a device is partitioned. The value returned cannot exceed [max_compute_units](RawDevice::max_compute_units). - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn partition_max_sub_devices (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PARTITION_MAX_SUB_DEVICES) - } - - /// Returns the list of partition types supported by device. - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn partition_properties (&self) -> Result> { - let v = self.get_info_array::(opencl_sys::CL_DEVICE_PARTITION_PROPERTIES)?; - Ok(PartitionProperty::from_slice(&v)) - } - - /// Is ```true``` if the device supports pipes, and ```false``` otherwise. Devices that return ```true``` must also return ```true``` for [`generic_address_space_support`](RawDevice::generic_address_space_support). - #[docfg(feature = "cl3")] - #[inline(always)] - pub fn pipe_support (&self) -> Result { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_PIPE_SUPPORT)?; - Ok(v != 0) - } - - /// The maximum number of reservations that can be active for a pipe per work-item in a kernel. A work-group reservation is counted as one reservation per work-item. The minimum value is 1 for devices supporting pipes, and must be 0 for devices that do not support pipes. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn pipe_max_active_reservations (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS).map(NonZeroU32::new) - } - - /// The maximum size of pipe packet in bytes. Support for pipes is required for an OpenCL 2.0, 2.1, or 2.2 device. The minimum value is 1024 bytes if the device supports pipes, and must be 0 for devices that do not support pipes. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn pipe_max_packet_size (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_PIPE_MAX_PACKET_SIZE).map(NonZeroU32::new) - } - - /// The platform associated with this device. - #[inline(always)] - pub fn platform (&self) -> Result { - let id = self.get_info_bits::(CL_DEVICE_PLATFORM)?; - unsafe { - return RawPlatform::from_id(id).ok_or_else(|| ErrorKind::InvalidPlatform.into()) - } - } - - /// Is ```true``` if the devices preference is for the user to be responsible for synchronization, when sharing memory objects between OpenCL and other APIs such as DirectX, ```false``` if the device / implementation has a performant path for performing synchronization of memory object shared between OpenCL and other APIs such as DirectX. - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn preferred_interop_user_sync (&self) -> Result { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_PREFERRED_INTEROP_USER_SYNC)?; - Ok(v != 0) - } - - /// Returns the value representing the preferred alignment in bytes for OpenCL 2.0 fine-grained SVM atomic types. This query can return 0 which indicates that the preferred alignment is aligned to the natural size of the type. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn preferred_platform_atomic_alignment (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT) - } - - /// Returns the value representing the preferred alignment in bytes for OpenCL 2.0 atomic types to global memory. This query can return 0 which indicates that the preferred alignment is aligned to the natural size of the type. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn preferred_global_atomic_alignment (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT) - } - - /// Returns the value representing the preferred alignment in bytes for OpenCL 2.0 atomic types to local memory. This query can return 0 which indicates that the preferred alignment is aligned to the natural size of the type. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn preferred_local_atomic_alignment (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT) - } - - /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn preferred_vector_width_char (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) - } - - /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn preferred_vector_width_short (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) - } - - /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn preferred_vector_width_int (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) - } - - /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn preferred_vector_width_long (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) - } - - /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn preferred_vector_width_half (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) - } - - /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn preferred_vector_width_float (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) - } - - /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. if the ```cl_khr_fp64``` extension is not supported, it must return 0. - #[docfg(feature = "cl1_1")] - #[inline(always)] - pub fn preferred_vector_width_double (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) - } - - /// Returns the preferred multiple of work-group size for the given device. This is a performance hint intended as a guide when specifying the local work size argument to clEnqueueNDRangeKernel. - #[docfg(feature = "cl3")] - #[inline(always)] - pub fn preferred_work_group_size_multiple (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE) - } - - /// Maximum size in bytes of the internal buffer that holds the output of printf calls from a kernel. The minimum value for the FULL profile is 1 MB. - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn printf_buffer_size (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_PRINTF_BUFFER_SIZE) - .map(NonZeroUsize::new) - .map(Option::unwrap) - } - - /// OpenCL profile string. Returns the profile name supported by the device (see note) - #[inline(always)] - pub fn profile (&self) -> String { - self.get_info_string(CL_DEVICE_PROFILE).unwrap() - } - - /// Describes the resolution of device timer. This is measured in nanoseconds. - #[inline(always)] - pub fn profiling_timer_resolution (&self) -> Result { - self.get_info_bits(CL_DEVICE_PROFILING_TIMER_RESOLUTION) - } - - /// Describes the command-queue properties supported by the device. - #[cfg_attr(feature = "cl2", deprecated(note = "see `queue_on_host_properties`"))] - #[inline(always)] - pub fn queue_properties (&self) -> Result { - let v = self.get_info_bits::(CL_DEVICE_QUEUE_PROPERTIES)?; - Ok(CommandQueueProperties::from_bits(v)) - } - - /// Describes the on device command-queue properties supported by the device. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn queue_on_device_properties (&self) -> Result { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES)?; - Ok(CommandQueueProperties::from_bits(v)) - } - - /// The maximum size of the device queue in bytes. The minimum value is 256 KB for the full profile and 64 KB for the embedded profile for devices supporting on-device queues, and must be 0 for devices that do not support on-device queues. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn queue_on_device_max_size (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE).map(NonZeroU32::new) - } - - /// The preferred size of the device queue, in bytes. Applications should use this size for the device queue to ensure good performance. The minimum value is 16 KB for devices supporting on-device queues, and must be 0 for devices that do not support on-device queues. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn queue_on_device_preferred_size (&self) -> Result> { - self.get_info_bits(opencl_sys::CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE).map(NonZeroU32::new) - } - - /// Describes the on host command-queue properties supported by the device. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn queue_on_host_properties (&self) -> Result { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_QUEUE_ON_HOST_PROPERTIES)?; - Ok(CommandQueueProperties::from_bits(v)) - } - - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub fn reference_count (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_REFERENCE_COUNT) - } - - /// Describes single precision floating-point capability of the device. - #[inline(always)] - pub fn single_fp_config (&self) -> Result { - self.get_info_bits(CL_DEVICE_SINGLE_FP_CONFIG) - } - - #[docfg(feature = "cl2_1")] - #[inline(always)] - pub fn sub_group_independent_forward_progress (&self) -> Result { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS)?; - Ok(v != 0) - } - - /// Describes the various shared virtual memory (SVM) memory allocation types the device supports. - #[docfg(feature = "cl2")] - #[inline(always)] - pub fn svm_capabilities (&self) -> Result { - self.get_info_bits(opencl_sys::CL_DEVICE_SVM_CAPABILITIES) - } - - /// The OpenCL device type. - #[inline(always)] - pub fn ty (&self) -> Result { - self.get_info_bits(CL_DEVICE_TYPE) - } - - /// Vendor name string. - #[inline(always)] - pub fn vendor (&self) -> Result { - self.get_info_string(CL_DEVICE_VENDOR) - } - - /// A unique device vendor identifier. An example of a unique device identifier could be the PCIe ID. - #[inline(always)] - pub fn vendor_id (&self) -> Result { - self.get_info_bits(CL_DEVICE_VENDOR_ID) - } - - /// OpenCL version string. - #[inline(always)] - pub fn version_string (&self) -> Result { - self.get_info_string(CL_DEVICE_VERSION) - } - - /// OpenCL version - #[inline] - pub fn version (&self) -> Result { - let version = self.version_string()?; - let section = version.split(' ').nth(1).ok_or(ErrorKind::InvalidValue)?; - Version::from_str(section).map_err(|_| ErrorKind::InvalidValue.into()) - } - - /// Is ```true``` if the device supports work-group collective functions (e.g. work_group_broadcast, work_group_reduce and work_group_scan), and ```false``` otherwise. - #[docfg(feature = "cl3")] - #[inline(always)] - pub fn work_group_collective_functions_support (&self) -> Result { - let v = self.get_info_bits::(opencl_sys::CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT)?; - Ok(v != 0) - } - - /// OpenCL software driver version string in the form _major_number_._minor_number_. - #[inline(always)] - pub fn driver_version_string (&self) -> Result { - self.get_info_string(CL_DRIVER_VERSION) - } - - /// OpenCL software driver version - #[inline(always)] - pub fn driver_version (&self) -> Result { - let driver = self.driver_version_string()?; - Version::from_str(&driver).map_err(|_| ErrorKind::InvalidValue.into()) - } - - /// Creates an array of sub-devices that each reference a non-intersecting set of compute units within in_device, according to the partition scheme given by properties. - /// The output sub-devices may be used in every way that the root (or parent) device can be used, including creating contexts, building programs, further calls to [`create_sub_devices`](RawDevice::create_sub_devices) and creating command-queues. - /// When a command-queue is created against a sub-device, the commands enqueued on the queue are executed only on the sub-device. - #[docfg(feature = "cl1_2")] - #[inline] - pub fn create_sub_devices (&self, prop: PartitionProperty) -> Result> { - let prop = prop.to_bits(); - - let mut len = 0; - unsafe { - tri!(opencl_sys::clCreateSubDevices(self.id(), prop.as_ptr(), 0, core::ptr::null_mut(), std::ptr::addr_of_mut!(len))) - } - - let mut devices = Vec::with_capacity(len as usize); - unsafe { - tri!(opencl_sys::clCreateSubDevices(self.id(), prop.as_ptr(), len, devices.as_mut_ptr() as *mut _, core::ptr::null_mut())); - devices.set_len(devices.capacity()) - } - - Ok(devices) - } - - /// Replaces the default command queue on the device. - #[docfg(feature = "cl2_1")] - #[inline(always)] - pub fn set_default_command_queue (&self, ctx: crate::context::RawContext, queue: RawCommandQueue) -> Result<()> { - unsafe { - tri!(opencl_sys::clSetDefaultDeviceCommandQueue(ctx.id(), self.id(), queue.id())); - } - - Ok(()) - } - - /// Query synchronized host and device timestamps. - #[docfg(feature = "cl2_1")] - #[inline] - pub fn device_and_host_timer_nanos (&self) -> Result<[u64; 2]> { - let mut device = 0; - let mut host = 0; - - unsafe { - tri!(clGetDeviceAndHostTimer(self.id(), std::ptr::addr_of_mut!(device), std::ptr::addr_of_mut!(host))) - } - - Ok([device, host]) - } - - /// Query synchronized host and device timestamps. - #[docfg(feature = "cl2_1")] - #[inline(always)] - pub fn device_and_host_timer (&self) -> Result<(std::time::SystemTime, std::time::SystemTime)> { - let [device, host] = self.device_and_host_timer_nanos()?; - let device = std::time::UNIX_EPOCH.checked_add(std::time::Duration::from_nanos(device)).unwrap(); - let host = std::time::UNIX_EPOCH.checked_add(std::time::Duration::from_nanos(host)).unwrap(); - Ok((device, host)) - } - - /// Query the host clock. - #[docfg(feature = "cl2_1")] - #[inline(always)] - pub fn host_clock_nanos (&self) -> Result { - let mut host = 0; - unsafe { - tri!(clGetHostTimer(self.id(), std::ptr::addr_of_mut!(host))) - } - - Ok(host) - } - - /// Query the host clock. - #[docfg(feature = "cl2_1")] - #[inline(always)] - pub fn host_clock (&self) -> Result { - let host = self.host_clock_nanos()?; - Ok(std::time::UNIX_EPOCH + std::time::Duration::from_nanos(host)) - } - - #[inline(always)] - pub fn has_f16 (&self) -> Result { - let ext = self.extensions_string()?; - Ok(ext.split_whitespace().any(|x| x == "cl_khr_fp16")) - } - - #[inline(always)] - pub fn has_f64 (&self) -> Result { - let ext = self.extensions_string()?; - Ok(ext.split_whitespace().any(|x| x == "cl_khr_fp64")) - } - - #[inline(always)] - pub fn all () -> &'static [RawDevice] { - &once_cell::sync::Lazy::force(&DEVICES) - } - - #[inline(always)] - pub fn first () -> Option<&'static RawDevice> { - DEVICES.first() - } - - #[docfg(feature = "cl1_2")] - #[inline(always)] - pub unsafe fn retain (&self) -> Result<()> { - tri!(clRetainDevice(self.id())); - Ok(()) - } - - #[inline] - fn get_info_string (&self, ty: cl_device_info) -> Result { - unsafe { - let mut len = 0; - tri!(clGetDeviceInfo(self.id(), ty, 0, core::ptr::null_mut(), &mut len)); - - let mut result = Vec::::with_capacity(len); - tri!(clGetDeviceInfo(self.id(), ty, len, result.as_mut_ptr().cast(), core::ptr::null_mut())); - - result.set_len(len - 1); - Ok(String::from_utf8(result).unwrap()) - } - } - - #[allow(dead_code)] - #[inline] - fn get_info_array (&self, ty: cl_device_info) -> Result> { - unsafe { - let mut len = 0; - tri!(clGetDeviceInfo(self.id(), ty, 0, core::ptr::null_mut(), &mut len)); - - if len == 0 { - return Ok(Box::new([])) - } - - let mut result = Box::<[T]>::new_uninit_slice(len / core::mem::size_of::()); - tri!(clGetDeviceInfo(self.id(), ty, len, result.as_mut_ptr().cast(), core::ptr::null_mut())); - Ok(result.assume_init()) - } - } - - #[inline] - fn get_info_bits (&self, ty: cl_device_info) -> Result { - let mut value = MaybeUninit::::uninit(); - - unsafe { - tri!(clGetDeviceInfo(self.id(), ty, core::mem::size_of::(), value.as_mut_ptr().cast(), core::ptr::null_mut())); - Ok(value.assume_init()) - } - } -} - -impl Debug for RawDevice { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - f.debug_struct("Device") - .field("id", &self.0) - .field("name", &self.name()) - .field("vendor", &self.vendor()) - .field("type", &self.ty()) - .field("version", &self.version()) - .finish() - } -} - -impl Clone for RawDevice { - #[inline(always)] - fn clone(&self) -> Self { - #[cfg(feature = "cl1_2")] - unsafe { - tri_panic!(opencl_sys::clRetainDevice(self.id())) - } - - Self(self.0) - } -} - -#[docfg(feature = "cl1_2")] -impl Drop for RawDevice { - #[inline(always)] - fn drop (&mut self) { - unsafe { - tri_panic!(opencl_sys::clReleaseDevice(self.id())); - } - } -} - -unsafe impl Send for RawDevice {} -unsafe impl Sync for RawDevice {} - -#[docfg(feature = "cl3")] -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[non_exhaustive] -pub struct AtomicCapabilities { - pub order: core::sync::atomic::Ordering, - /// Support for memory ordering constraints that apply to a single work-item. - pub work_item_scope: bool, - pub scope: AtomicScope -} - -#[docfg(feature = "cl3")] -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[non_exhaustive] -#[repr(u64)] -pub enum AtomicScope { - /// Support for memory ordering constraints that apply to all work-items in a work-group. - WorkGroup = opencl_sys::CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP, - /// Support for memory ordering constraints that apply to all work-items executing on the device. - Device = opencl_sys::CL_DEVICE_ATOMIC_SCOPE_DEVICE, - /// Support for memory ordering constraints that apply to all work-items executing across all devices that can share SVM memory with each other and the host process. - AllDevices = opencl_sys::CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES -} - -#[cfg(feature = "cl3")] -impl AtomicCapabilities { - pub fn from_bits (bits: opencl_sys::cl_device_atomic_capabilities) -> Option { - let order; - let scope; - let work_item_scope = bits & opencl_sys::CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM != 0; - - // ORDER - if bits & opencl_sys::CL_DEVICE_ATOMIC_ORDER_SEQ_CST != 0 { - order = core::sync::atomic::Ordering::SeqCst; - } - - else if bits & opencl_sys::CL_DEVICE_ATOMIC_ORDER_ACQ_REL != 0 { - order = core::sync::atomic::Ordering::AcqRel - } - - else if bits & opencl_sys::CL_DEVICE_ATOMIC_ORDER_RELAXED != 0 { - order = core::sync::atomic::Ordering::Relaxed - } - - else { - return None - } - - // SCOPE - if bits & opencl_sys::CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES != 0 { - scope = AtomicScope::AllDevices - } - - else if bits & opencl_sys::CL_DEVICE_ATOMIC_SCOPE_DEVICE != 0 { - scope = AtomicScope::Device - } - - else if bits & opencl_sys::CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP != 0 { - scope = AtomicScope::WorkGroup - } - - else { - return None; - } - - Some(Self { order, work_item_scope, scope }) - } -} - -bitflags::bitflags! { - /// The OpenCL device type. - #[repr(transparent)] - pub struct DeviceType : cl_device_type { - const CPU = CL_DEVICE_TYPE_CPU; - const GPU = CL_DEVICE_TYPE_GPU; - const ACCELERATOR = CL_DEVICE_TYPE_ACCELERATOR; - const DEFAULT = CL_DEVICE_TYPE_CUSTOM; - } - - /// Describes the floating-point capability of the OpenCL device. - #[repr(transparent)] - pub struct FpConfig : cl_device_fp_config { - /// Denorms are supported - const DENORM = CL_FP_DENORM; - /// INF and quiet NaNs are supported - const INF_NAN = CL_FP_INF_NAN; - /// Round to nearest even rounding mode supported - const ROUND_TO_NEAREST = CL_FP_ROUND_TO_NEAREST; - /// Round to zero rounding mode supported - const ROUND_TO_ZERO = CL_FP_ROUND_TO_ZERO; - /// Round to positive and negative infinity rounding modes supported - const ROUND_TO_INF = CL_FP_ROUND_TO_INF; - /// IEEE754-2008 fused multiply-add is supported - const FMA = CL_FP_FMA; - /// Divide and sqrt are correctly rounded as defined by the IEEE754 specification - const CORRECTLY_ROUNDED_DIVIDE_SQRT = CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; - /// Basic floating-point operations (such as addition, subtraction, multiplication) are implemented in software - const SOFT_FLOAT = CL_FP_SOFT_FLOAT; - } - - /// Describes the execution capabilities of the device - #[repr(transparent)] - pub struct ExecCapabilities : cl_device_exec_capabilities { - const KERNEL = CL_EXEC_KERNEL; - const NATIVE_KERNEL = CL_EXEC_NATIVE_KERNEL; - } -} - -/// Type of local memory supported. This can be set to [```Self::Local```] implying dedicated local memory storage such as SRAM, or [```Self::Global```]. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[repr(u32)] -pub enum LocalMemType { - Local = CL_LOCAL, - Global = CL_GLOBAL -} - -#[docfg(feature = "cl1_2")] -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[non_exhaustive] -pub enum PartitionProperty { - /// Split the aggregate device into as many smaller aggregate devices as can be created, each containing n compute units. The value n is passed as the value accompanying this property. If n does not divide evenly into [`max_compute_units`](RawDevice::max_compute_units), then the remaining compute units are not used. - Equally (u32), - /// This property is followed by a list of compute unit. For each non-zero count m in the list, a sub-device is created with m compute units in it. The number of non-zero count entries in the list may not exceed [`partition_max_sub_devices`](RawDevice::partition_max_sub_devices). The total number of compute units specified may not exceed [max_compute_units](RawDevice::max_compute_units). - Counts (Vec), - /// Split the device into smaller aggregate devices containing one or more compute units that all share part of a cache hierarchy. - AffinityDomain (AffinityDomain) -} - -#[cfg(feature = "cl1_2")] -impl PartitionProperty { - pub fn from_slice (bits: &[opencl_sys::cl_device_partition_property]) -> Option { - if bits.len() == 0 { - return None; - } - - match unsafe { *bits.get_unchecked(0) } { - 0 => None, - opencl_sys::CL_DEVICE_PARTITION_EQUALLY => Some(Self::Equally(bits[1] as u32)), - opencl_sys::CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN => Some(Self::AffinityDomain(unsafe { core::mem::transmute(bits[1] as u64) })), - opencl_sys::CL_DEVICE_PARTITION_BY_COUNTS => { - let mut result = Vec::with_capacity(bits.len()); - - for i in 1..bits.len() { - const MAX_COUNT : isize = u32::MAX as isize; - - match bits[i] { - #[allow(unreachable_patterns)] - 0 | opencl_sys::CL_DEVICE_PARTITION_BY_COUNTS_LIST_END => break, - v @ 1..=MAX_COUNT => unsafe { result.push(NonZeroU32::new_unchecked(v as u32)) } - _ => return None - } - } - - Some(Self::Counts(result)) - }, - - other => panic!("Unknow partition property '{other}'") - } - } - - pub fn to_bits (&self) -> Box<[opencl_sys::cl_device_partition_property]> { - match self { - Self::Equally(n) => Box::new([opencl_sys::CL_DEVICE_PARTITION_EQUALLY, opencl_sys::cl_device_partition_property::try_from(*n).unwrap(), 0]) as Box<_>, - Self::AffinityDomain(x) => Box::new([opencl_sys::CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, opencl_sys::cl_device_partition_property::try_from(*x as u64).unwrap(), 0]) as Box<_>, - Self::Counts(x) => { - let mut result = Box::new_uninit_slice(2 + x.len()); - - unsafe { - result[0].write(opencl_sys::CL_DEVICE_PARTITION_BY_COUNTS); - - for i in 0..x.len() { - result[1 + i].write(opencl_sys::cl_device_partition_property::try_from(x[i].get()).unwrap()); - } - - result.last_mut().unwrap_unchecked().write(opencl_sys::CL_DEVICE_PARTITION_BY_COUNTS_LIST_END); - result.assume_init() - } - } - } - } -} - -#[docfg(feature = "cl1_2")] -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[repr(u64)] -#[non_exhaustive] -pub enum AffinityDomain { - /// Split the device into sub-devices comprised of compute units that share a NUMA node. - Numa = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_NUMA, - /// Split the device into sub-devices comprised of compute units that share a level 4 data cache. - L4Cache = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE, - /// Split the device into sub-devices comprised of compute units that share a level 3 data cache. - L3Cache = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE, - /// Split the device into sub-devices comprised of compute units that share a level 2 data cache. - L2Cache = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE, - /// Split the device into sub-devices comprised of compute units that share a level 1 data cache. - L1Cache = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE, - /// Split the device along the next partitionable affinity domain. The implementation shall find the first level along which the device or sub-device may be further subdivided in the order NUMA, L4, L3, L2, L1, and partition the device into sub-devices comprised of compute units that share memory subsystems at this level. - NextPartitionable = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE -} - -#[docfg(feature = "cl3")] -#[repr(u64)] -#[non_exhaustive] -pub enum DeviceEnqueueCapabilities { - /// Device supports device-side enqueue and on-device queues. - Supported = opencl_sys::CL_DEVICE_QUEUE_SUPPORTED, - /// Device supports a replaceable default on-device queue. - ReplaceableDefault = opencl_sys::CL_DEVICE_QUEUE_REPLACEABLE_DEFAULT -} - -#[cfg(feature = "cl3")] -impl DeviceEnqueueCapabilities { - pub fn from_bits (bits: opencl_sys::cl_device_device_enqueue_capabilities) -> Option { - if bits & opencl_sys::CL_DEVICE_QUEUE_REPLACEABLE_DEFAULT != 0 { - return Some(Self::ReplaceableDefault); - } - - if bits & opencl_sys::CL_DEVICE_QUEUE_SUPPORTED != 0 { - return Some(Self::Supported); - } - - None - } -} - -bitflags::bitflags! { - #[repr(transparent)] - pub struct SvmCapability: cl_device_svm_capabilities { - /// Support for coarse-grain buffer sharing using clSVMAlloc. Memory consistency is guaranteed at synchronization points and the host must use calls to clEnqueueMapBuffer and clEnqueueUnmapMemObject. - const COARSE_GRAIN_BUFFER = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER; - /// Support for fine-grain buffer sharing using clSVMAlloc. Memory consistency is guaranteed atsynchronization points without need for clEnqueueMapBuffer and clEnqueueUnmapMemObject. - const FINE_GRAIN_BUFFER = CL_DEVICE_SVM_FINE_GRAIN_BUFFER; - /// Support for sharing the host’s entire virtual memory including memory allocated using malloc. Memory consistency is guaranteed at synchronization points. - const FINE_GRAIN_SYSTEM = CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; - /// Support for the OpenCL 2.0 atomic operations that provide memory consistency across the host and all OpenCL devices supporting fine-grain SVM allocations. - const ATOMICS = CL_DEVICE_SVM_ATOMICS; - } -} - -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -#[repr(transparent)] -pub struct Version (cl_version); - -impl Version { - pub const CL1 : Self = Self::from_inner_parts(1, 0, 0); - pub const CL1_1 : Self = Self::from_inner_parts(1, 1, 0); - pub const CL1_2 : Self = Self::from_inner_parts(1, 2, 0); - pub const CL2 : Self = Self::from_inner_parts(2, 0, 0); - pub const CL2_1 : Self = Self::from_inner_parts(2, 1, 0); - pub const CL2_2 : Self = Self::from_inner_parts(2, 2, 0); - pub const CL3 : Self = Self::from_inner_parts(3, 0, 0); - - const MAJOR : u32 = CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS; - - #[inline(always)] - pub const fn from_bits (bits : u32) -> Self { - Self(bits) - } - - #[inline(always)] - pub const fn from_inner_parts (major: u32, minor: u32, patch: u32) -> Self { - Self ( - ((major & CL_VERSION_MAJOR_MASK) << Self::MAJOR) | - ((minor & CL_VERSION_MINOR_MASK) << CL_VERSION_PATCH_BITS) | - (patch & CL_VERSION_PATCH_MASK) - ) - } - - #[inline(always)] - pub const fn into_inner_parts (self) -> (u32, u32, u32) { - (self.major(), self.minor(), self.patch()) - } - - #[inline(always)] - pub const fn major(&self) -> u32 { - self.0 >> Self::MAJOR - } - - #[inline(always)] - pub const fn minor (&self) -> u32 { - (self.0 >> CL_VERSION_PATCH_BITS) & CL_VERSION_MINOR_MASK - } - - #[inline(always)] - pub const fn patch (&self) -> u32 { - self.0 & CL_VERSION_PATCH_MASK - } -} - -impl FromStr for Version { - type Err = IntErrorKind; - - fn from_str(s: &str) -> core::result::Result { - let mut parts = s.split('.'); - - let major = parts.next().ok_or(IntErrorKind::Empty)?.parse::().map_err(|e| e.kind().clone())?; - let minor = parts.next().ok_or(IntErrorKind::Empty)?.parse::().map_err(|e| e.kind().clone())?; - let patch_str = parts.next(); - - let patch; - if let Some(inner) = patch_str { - patch = Some(inner.parse::().map_err(|e| e.kind().clone())?) - } else { - patch = None; - } - - if parts.next().is_some() { - return Err(IntErrorKind::InvalidDigit); - } - - Ok(Self::from_inner_parts(major, minor, patch.unwrap_or_default())) - } -} - -impl Debug for Version { - #[inline(always)] - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - Display::fmt(&self, f) - } -} - -impl Display for Version { - #[inline(always)] - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - write!(f, "{}.{}.{}", self.major(), self.minor(), self.patch()) - } -} \ No newline at end of file +use super::*; +use crate::buffer::flags::MemAccess; +use blaze_proc::docfg; +use core::{ + fmt::{Debug, Display}, + mem::MaybeUninit, + num::{IntErrorKind, NonZeroU32, NonZeroU64, NonZeroUsize}, + str::FromStr, +}; +use opencl_sys::*; +use std::{ffi::c_void, ptr::NonNull}; + +lazy_static! { + static ref DEVICES: Vec = unsafe { + let mut result = Vec::::new(); + + for platform in RawPlatform::all() { + let mut cnt = 0; + tri_panic!(clGetDeviceIDs( + platform.id(), + CL_DEVICE_TYPE_ALL, + 0, + core::ptr::null_mut(), + &mut cnt + )); + let cnt_size = usize::try_from(cnt).unwrap(); + + result.reserve(cnt_size); + tri_panic!(clGetDeviceIDs( + platform.id(), + CL_DEVICE_TYPE_ALL, + cnt, + result.as_mut_ptr().add(result.len()).cast(), + core::ptr::null_mut() + )); + result.set_len(result.len() + cnt_size); + } + + result + }; +} + +/// OpenCL device +#[derive(PartialEq, Eq, Hash)] +#[repr(transparent)] +pub struct RawDevice(NonNull); + +impl RawDevice { + #[inline(always)] + pub const fn id(&self) -> cl_device_id { + self.0.as_ptr() + } + + #[inline(always)] + pub const unsafe fn from_id(id: cl_device_id) -> Option { + NonNull::new(id).map(Self) + } + + #[inline(always)] + pub const unsafe fn from_id_unchecked(id: cl_device_id) -> Self { + Self(NonNull::new_unchecked(id)) + } + + /// The default compute device address space size specified as an unsigned integer value in bits. Currently supported values are 32 or 64 bits. + #[inline(always)] + pub fn address_bits(&self) -> Result { + self.get_info_bits(CL_DEVICE_ADDRESS_BITS) + } + + /// Describes the various memory orders and scopes that the device supports for atomic memory operations. + #[docfg(feature = "cl3")] + #[inline(always)] + pub fn atomic_memory_capabilities(&self) -> Result> { + let v = self.get_info_bits::( + opencl_sys::CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES, + )?; + Ok(AtomicCapabilities::from_bits(v)) + } + + /// Describes the various memory orders and scopes that the device supports for atomic fence operations. + #[docfg(feature = "cl3")] + #[inline(always)] + pub fn atomic_fence_capabilities(&self) -> Result> { + let v = self.get_info_bits::( + opencl_sys::CL_DEVICE_ATOMIC_FENCE_CAPABILITIES, + )?; + Ok(AtomicCapabilities::from_bits(v)) + } + + /// Is ```true``` if the device is available and ```false``` if the device is not available. + #[inline(always)] + pub fn available(&self) -> Result { + let v = self.get_info_bits::(CL_DEVICE_AVAILABLE)?; + Ok(v != 0) + } + + /// A list of built-in kernels supported by the device. An empty list is returned if no built-in kernels are supported by the device. + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn built_in_kernels(&self) -> Result> { + Ok(self + .built_in_kernels_string()? + .split(';') + .map(str::trim) + .map(str::to_string) + .collect::>()) + } + + /// A semi-colon separated list of built-in kernels supported by the device. An empty string is returned if no built-in kernels are supported by the device. + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn built_in_kernels_string(&self) -> Result { + self.get_info_string(opencl_sys::CL_DEVICE_BUILT_IN_KERNELS) + } + + /// Is ```false``` if the implementation does not have a compiler available to compile the program source. Is ```true``` if the compiler is available. This can be CL_FALSE for the embedded platform profile only. + #[inline(always)] + pub fn compiler_available(&self) -> Result { + let v = self.get_info_bits::(CL_DEVICE_COMPILER_AVAILABLE)?; + Ok(v != 0) + } + + /// Describes device-side enqueue capabilities of the device. + #[docfg(feature = "cl3")] + #[inline(always)] + pub fn device_enqueue_capabilities(&self) -> Result> { + let v = self.get_info_bits::( + opencl_sys::CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES, + )?; + Ok(DeviceEnqueueCapabilities::from_bits(v)) + } + + /// Describes the OPTIONAL double precision floating-point capability of the OpenCL device + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn double_fp_config(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_DOUBLE_FP_CONFIG) + } + + /// Is ```true``` if the OpenCL device is a little endian device and ```false``` otherwise. + #[inline(always)] + pub fn endian_little(&self) -> Result { + let v = self.get_info_bits::(CL_DEVICE_ENDIAN_LITTLE)?; + Ok(v != 0) + } + + /// Is ```true``` if the device implements error correction for the memories, caches, registers etc. in the device. Is ```false``` if the device does not implement error correction. This can be a requirement for certain clients of OpenCL. + #[inline(always)] + pub fn error_connection_support(&self) -> Result { + let v = self.get_info_bits::(CL_DEVICE_ERROR_CORRECTION_SUPPORT)?; + Ok(v != 0) + } + + /// Describes the execution capabilities of the device + #[inline(always)] + pub fn execution_capabilities(&self) -> Result { + self.get_info_bits(CL_DEVICE_EXECUTION_CAPABILITIES) + } + + /// Returns a list of extension names + #[inline(always)] + pub fn extensions(&self) -> Result> { + Ok(self + .get_info_string(CL_DEVICE_EXTENSIONS)? + .split_whitespace() + .map(String::from) + .collect::>()) + } + + /// Returns a space-separated list of extension names (the extension names themselves do not contain any spaces) + #[inline(always)] + pub fn extensions_string(&self) -> Result { + self.get_info_string(CL_DEVICE_EXTENSIONS) + } + + /// Is ```true``` if the device supports the generic address space and its associated built-in functions, and ```false``` otherwise. + #[docfg(feature = "cl3")] + #[inline(always)] + pub fn generic_address_space_support(&self) -> Result { + let v = + self.get_info_bits::(opencl_sys::CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT)?; + Ok(v != 0) + } + + /// Size of global memory cache in bytes. + #[inline(always)] + pub fn global_mem_cache_size(&self) -> Result { + self.get_info_bits(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) + } + + /// Type of global memory cache supported. + #[inline(always)] + pub fn global_mem_cache_type(&self) -> Result { + match self.get_info_bits::(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE)? { + CL_NONE => Ok(MemAccess::NONE), + CL_READ_ONLY_CACHE => Ok(MemAccess::READ_ONLY), + CL_READ_WRITE_CACHE => Ok(MemAccess::READ_WRITE), + _ => unreachable!(), + } + } + + /// Size of global memory cache line in bytes. + #[inline(always)] + pub fn global_mem_cahceline_size(&self) -> Result { + self.get_info_bits(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) + } + + /// Size of global memory in bytes. + #[inline(always)] + pub fn global_mem_size(&self) -> Result { + self.get_info_bits(CL_DEVICE_GLOBAL_MEM_SIZE) + } + + /// Maximum preferred total size, in bytes, of all program variables in the global address space. This is a performance hint. An implementation may place such variables in storage with optimized device access. This query returns the capacity of such storage. The minimum value is 0. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn global_variable_preferred_total_size(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE) + } + + /// Describes the OPTIONAL half precision floating-point capability of the OpenCL device + #[inline(always)] + pub fn half_fp_config(&self) -> Result { + self.get_info_bits(CL_DEVICE_HALF_FP_CONFIG) + } + + /// Is ```true``` if the device and the host have a unified memory subsystem and is ```false``` otherwise. + #[docfg(feature = "cl1_1")] + #[cfg_attr(feature = "cl2", deprecated)] + #[inline(always)] + pub fn host_unified_memory(&self) -> Result { + let v = self.get_info_bits::(opencl_sys::CL_DEVICE_HOST_UNIFIED_MEMORY)?; + Ok(v != 0) + } + + /// The intermediate languages that can be supported by clCreateProgramWithIL for this device. + #[docfg(feature = "cl2_1")] + #[inline(always)] + pub fn il_version(&self) -> Result { + self.get_info_string(opencl_sys::CL_DEVICE_IL_VERSION) + } + + /// Is ```true``` if images are supported by the OpenCL device and ```false``` otherwise. + #[inline(always)] + pub fn image_support(&self) -> Result { + let v = self.get_info_bits::(CL_DEVICE_IMAGE_SUPPORT)?; + Ok(v != 0) + } + + /// Max number of images in a 1D or 2D image array. The minimum value is 2048 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE, the value is 0 otherwise. + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn image_max_array_size(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) + .map(NonZeroUsize::new) + } + + /// Max number of pixels for a 1D image created from a buffer object. The minimum value is 65536 if CL_DEVICE_IMAGE_SUPPORT is CL_TRUE, the value is 0 otherwise. + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn image_max_buffer_size(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) + .map(NonZeroUsize::new) + } + + /// The row pitch alignment size in pixels for 2D images created from a buffer. The value returned must be a power of 2. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn image_pitch_alignment(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_IMAGE_PITCH_ALIGNMENT) + .map(NonZeroU32::new) + } + + /// This query specifies the minimum alignment in pixels of the host_ptr specified to clCreateBuffer or clCreateBufferWithProperties when a 2D image is created from a buffer which was created using CL_MEM_USE_HOST_PTR. The value returned must be a power of 2. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn image_base_address_alignment(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_IMAGE_PITCH_ALIGNMENT) + .map(NonZeroU32::new) + } + + /// Max height of 2D image in pixels. The minimum value is 8192 if [`image_support`](RawDevice::image_support) is ```true```. + #[inline(always)] + pub fn image2d_max_height(&self) -> Result> { + self.get_info_bits::(CL_DEVICE_IMAGE2D_MAX_HEIGHT) + .map(NonZeroUsize::new) + } + + /// Max width of 2D image in pixels. The minimum value is 8192 if [`image_support`](RawDevice::image_support) is ```true```. + #[inline(always)] + pub fn image2d_max_width(&self) -> Result> { + self.get_info_bits::(CL_DEVICE_IMAGE2D_MAX_WIDTH) + .map(NonZeroUsize::new) + } + + /// Max depth of 3D image in pixels. The minimum value is 2048 if [`image_support`](RawDevice::image_support) is ```true```. + #[inline(always)] + pub fn image3d_max_depth(&self) -> Result> { + self.get_info_bits::(CL_DEVICE_IMAGE3D_MAX_DEPTH) + .map(NonZeroUsize::new) + } + + /// Max height of 3D image in pixels. The minimum value is 2048 if [`image_support`](RawDevice::image_support) is ```true```. + #[inline(always)] + pub fn image3d_max_height(&self) -> Result> { + self.get_info_bits::(CL_DEVICE_IMAGE3D_MAX_HEIGHT) + .map(NonZeroUsize::new) + } + + /// Max width of 3D image in pixels. The minimum value is 2048 if [`image_support`](RawDevice::image_support) is ```true```. + #[inline(always)] + pub fn image3d_max_width(&self) -> Result> { + self.get_info_bits::(CL_DEVICE_IMAGE3D_MAX_WIDTH) + .map(NonZeroUsize::new) + } + + /// Returns the latest version of the conformance test suite that this device has fully passed in accordance with the official conformance process. + #[docfg(feature = "cl3")] + #[inline(always)] + pub fn latest_conformance_version_passed(&self) -> Result { + self.get_info_string(opencl_sys::CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED) + } + + /// Size of local memory arena in bytes. The minimum value is 16 KB. + #[inline(always)] + pub fn local_mem_size(&self) -> Result { + unsafe { + Ok(NonZeroU64::new_unchecked( + self.get_info_bits::(CL_DEVICE_LOCAL_MEM_SIZE)?, + )) + } + } + + /// Type of local memory supported. + #[inline(always)] + pub fn local_mem_type(&self) -> Result { + self.get_info_bits(CL_DEVICE_LOCAL_MEM_TYPE) + } + + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn linker_available(&self) -> Result { + let v = self.get_info_bits::(opencl_sys::CL_DEVICE_LINKER_AVAILABLE)?; + Ok(v != 0) + } + + /// Maximum configured clock frequency of the device in MHz. + #[docfg(feature = "cl2_2")] + #[inline(always)] + pub fn max_clock_frequency(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_MAX_CLOCK_FREQUENCY) + } + + /// The number of parallel compute cores on the OpenCL device. The minimum value is 1. + #[inline(always)] + pub fn max_compute_units(&self) -> Result { + unsafe { + Ok(NonZeroU32::new_unchecked( + self.get_info_bits::(CL_DEVICE_MAX_COMPUTE_UNITS)?, + )) + } + } + + /// Max number of arguments declared with the ```__constant``` qualifier in a kernel. The minimum value is 8. + #[inline(always)] + pub fn max_constant_args(&self) -> Result { + unsafe { + Ok(NonZeroU32::new_unchecked( + self.get_info_bits::(CL_DEVICE_MAX_CONSTANT_ARGS)?, + )) + } + } + + /// Max size in bytes of a constant buffer allocation. The minimum value is 64 KB. + #[inline(always)] + pub fn max_constant_buffer_size(&self) -> Result { + unsafe { + Ok(NonZeroU64::new_unchecked(self.get_info_bits::( + CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, + )?)) + } + } + + /// The maximum number of bytes of storage that may be allocated for any single variable in program scope or inside a function in an OpenCL kernel language declared in the global address space. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn max_global_variable_size(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE) + .map(NonZeroUsize::new) + } + + /// Max size of memory object allocation in bytes. The minimum value is max (1/4th of [```global_mem_size```](), 128*1024*1024) + #[inline(always)] + pub fn max_mem_alloc_size(&self) -> Result { + unsafe { + Ok(NonZeroU64::new_unchecked( + self.get_info_bits::(CL_DEVICE_MAX_MEM_ALLOC_SIZE)?, + )) + } + } + + /// Maximum number of sub-groups in a work-group that a device is capable of executing on a single compute unit, for any given kernel-instance running on the device. The minimum value is 1 if the device supports subgroups, and must be 0 for devices that do not support subgroups. Support for subgroups is required for an OpenCL 2.1 or 2.2 device. + #[docfg(feature = "cl2_1")] + #[inline(always)] + pub fn max_num_sub_groups(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_MAX_NUM_SUB_GROUPS) + .map(NonZeroU32::new) + } + + /// The maximum number of events in use by a device queue. These refer to events returned by the enqueue_ built-in functions to a device queue or user events returned by the create_user_event built-in function that have not been released. The minimum value is 1024 for devices supporting on-device queues, and must be 0 for devices that do not support on-device queues. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn max_on_device_events(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_MAX_ON_DEVICE_EVENTS) + .map(NonZeroU32::new) + } + + /// The maximum number of device queues that can be created for this device in a single context. The minimum value is 1 for devices supporting on-device queues, and must be 0 for devices that do not support on-device queues. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn max_on_device_queues(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_MAX_ON_DEVICE_QUEUES) + .map(NonZeroU32::new) + } + + /// Max size in bytes of the arguments that can be passed to a kernel. The minimum value is 256. + #[inline(always)] + pub fn max_parameter_size(&self) -> Result { + unsafe { + Ok(NonZeroUsize::new_unchecked( + self.get_info_bits::(CL_DEVICE_MAX_PARAMETER_SIZE)?, + )) + } + } + + /// The maximum number of pipe objects that can be passed as arguments to a kernel. The minimum value is 16 for devices supporting pipes, and must be 0 for devices that do not support pipes. + #[docfg(featurew = "cl2")] + #[inline(always)] + pub fn max_pipe_args(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_MAX_PIPE_ARGS) + .map(NonZeroU32::new) + } + + /// Max number of simultaneous image objects that can be read by a kernel. The minimum value is 128 if [`image_support`](RawDevice::image_support) is ```true```. + #[inline(always)] + pub fn max_read_image_args(&self) -> Result> { + self.get_info_bits::(CL_DEVICE_MAX_READ_IMAGE_ARGS) + .map(NonZeroU32::new) + } + + /// Max number of image objects arguments of a kernel declared with the write_only or read_write qualifier. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn max_read_write_image_args(&self) -> Result> { + self.get_info_bits::(opencl_sys::CL_DEVICE_MAX_READ_IMAGE_ARGS) + .map(NonZeroU32::new) + } + + /// Maximum number of samplers that can be used in a kernel. The minimum value is 16 if [`image_support`](RawDevice::image_support) is ```true```. + #[inline(always)] + pub fn max_samplers(&self) -> Result> { + self.get_info_bits::(CL_DEVICE_MAX_SAMPLERS) + .map(NonZeroU32::new) + } + + /// Maximum number of work-items in a work-group executing a kernel using the data parallel execution model. The minimum value is 1. + #[inline(always)] + pub fn max_work_group_size(&self) -> Result { + unsafe { + Ok(NonZeroUsize::new_unchecked( + self.get_info_bits::(CL_DEVICE_MAX_WORK_GROUP_SIZE)?, + )) + } + } + + /// Maximum dimensions that specify the global and local work-item IDs used by the data parallel execution model. The minimum value is 3. + #[inline(always)] + pub fn max_work_item_dimensions(&self) -> Result { + unsafe { + Ok(NonZeroU32::new_unchecked(self.get_info_bits::( + CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, + )?)) + } + } + + /// Maximum number of work-items that can be specified in each dimension of the work-group to clEnqueueNDRangeKernel. Returns n ```usize``` entries, where n is the value returned by the query for [`max_work_item_dimensions`](RawDevice::max_work_item_dimensions). The minimum value is (1, 1, 1). + #[inline(always)] + pub fn max_work_item_sizes(&self) -> Result> { + let n = usize::try_from(self.max_work_item_dimensions()?.get()).unwrap(); + // FIXME: maybe using nonzero ints messes up the alignment? + let mut max_work_item_sizes = Vec::::with_capacity(n); + + let len = n + .checked_mul(core::mem::size_of::()) + .expect("Integer multiplication oveflow. Too many work items to fit in a vector"); + unsafe { + clGetDeviceInfo( + self.id(), + CL_DEVICE_MAX_WORK_ITEM_SIZES, + len, + max_work_item_sizes.as_mut_ptr().cast(), + core::ptr::null_mut(), + ); + max_work_item_sizes.set_len(n); + } + + Ok(max_work_item_sizes) + } + + /// Max number of simultaneous image objects that can be written to by a kernel. The minimum value is 8 if [`image_support`](RawDevice::image_support) is ```true```. + #[inline(always)] + pub fn max_write_image_args(&self) -> Result> { + self.get_info_bits::(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) + .map(NonZeroU32::new) + } + + /// Describes the alignment in bits of the base address of any allocated memory object. + #[inline(always)] + pub fn mem_base_addr_align(&self) -> Result { + self.get_info_bits(CL_DEVICE_MEM_BASE_ADDR_ALIGN) + } + + /// The smallest alignment in bytes which can be used for any data type. + #[cfg_attr(feature = "cl1_2", deprecated)] + #[inline(always)] + pub fn min_data_type_align_size(&self) -> Result { + self.get_info_bits(CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE) + } + + /// Device name string. + #[inline(always)] + pub fn name(&self) -> Result { + self.get_info_string(CL_DEVICE_NAME) + } + + /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn native_vector_width_char(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) + } + + /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn native_vector_width_short(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) + } + + /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn native_vector_width_int(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) + } + + /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn native_vector_width_long(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) + } + + /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(all(feature = "cl1_1", feature = "half"))] + #[inline(always)] + pub fn native_vector_width_half(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) + } + + /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn native_vector_width_float(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) + } + + /// Returns the native ISA vector width. The vector width is defined as the number of scalar elements that can be stored in the vector + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn native_vector_width_double(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) + } + + /// Is ```true``` if the device supports non-uniform work-groups, and ```false``` otherwise. + #[docfg(feature = "cl3")] + #[inline(always)] + pub fn non_uniform_work_group_support(&self) -> Result { + let v = + self.get_info_bits::(opencl_sys::CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT)?; + Ok(v != 0) + } + + /// Returns the highest fully backwards compatible OpenCL C version supported by the compiler for the device. + #[docfg(feature = "cl1_1")] + #[cfg_attr(feature = "cl3", deprecated)] + #[inline(always)] + pub fn opencl_c_version(&self) -> Result { + self.get_info_string(opencl_sys::CL_DEVICE_OPENCL_C_VERSION) + } + + /// Returns the parent device to which this sub-device belongs. If device is a root-level device, a ```None``` value is returned. + #[docfg(feature = "cl1_2")] + #[inline] + pub fn parent(&self) -> Result> { + let v = self.get_info_bits::(opencl_sys::CL_DEVICE_PARENT_DEVICE)?; + if let Some(v) = NonNull::new(v) { + return Ok(Some(Self(v))); + } + + Ok(None) + } + + /// Returns the list of supported affinity domains for partitioning the device. + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn partition_affinity_domain(&self) -> Result> { + let v = self.get_info_bits::( + opencl_sys::CL_DEVICE_PARTITION_PROPERTIES, + )?; + + Ok(match v { + 0 => None, + _ => unsafe { Some(core::mem::transmute(v)) }, + }) + } + + /// Returns the properties argument specified in clCreateSubDevices if device is a sub-device. + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn partition_type(&self) -> Result> { + let v = self.get_info_array::( + opencl_sys::CL_DEVICE_PARTITION_TYPE, + )?; + Ok(PartitionProperty::from_slice(&v)) + } + + /// Returns the maximum number of sub-devices that can be created when a device is partitioned. The value returned cannot exceed [max_compute_units](RawDevice::max_compute_units). + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn partition_max_sub_devices(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PARTITION_MAX_SUB_DEVICES) + } + + /// Returns the list of partition types supported by device. + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn partition_properties(&self) -> Result> { + let v = self.get_info_array::( + opencl_sys::CL_DEVICE_PARTITION_PROPERTIES, + )?; + Ok(PartitionProperty::from_slice(&v)) + } + + /// Is ```true``` if the device supports pipes, and ```false``` otherwise. Devices that return ```true``` must also return ```true``` for [`generic_address_space_support`](RawDevice::generic_address_space_support). + #[docfg(feature = "cl3")] + #[inline(always)] + pub fn pipe_support(&self) -> Result { + let v = self.get_info_bits::(opencl_sys::CL_DEVICE_PIPE_SUPPORT)?; + Ok(v != 0) + } + + /// The maximum number of reservations that can be active for a pipe per work-item in a kernel. A work-group reservation is counted as one reservation per work-item. The minimum value is 1 for devices supporting pipes, and must be 0 for devices that do not support pipes. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn pipe_max_active_reservations(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS) + .map(NonZeroU32::new) + } + + /// The maximum size of pipe packet in bytes. Support for pipes is required for an OpenCL 2.0, 2.1, or 2.2 device. The minimum value is 1024 bytes if the device supports pipes, and must be 0 for devices that do not support pipes. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn pipe_max_packet_size(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_PIPE_MAX_PACKET_SIZE) + .map(NonZeroU32::new) + } + + /// The platform associated with this device. + #[inline(always)] + pub fn platform(&self) -> Result { + let id = self.get_info_bits::(CL_DEVICE_PLATFORM)?; + unsafe { return RawPlatform::from_id(id).ok_or_else(|| ErrorKind::InvalidPlatform.into()) } + } + + /// Is ```true``` if the devices preference is for the user to be responsible for synchronization, when sharing memory objects between OpenCL and other APIs such as DirectX, ```false``` if the device / implementation has a performant path for performing synchronization of memory object shared between OpenCL and other APIs such as DirectX. + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn preferred_interop_user_sync(&self) -> Result { + let v = self.get_info_bits::(opencl_sys::CL_DEVICE_PREFERRED_INTEROP_USER_SYNC)?; + Ok(v != 0) + } + + /// Returns the value representing the preferred alignment in bytes for OpenCL 2.0 fine-grained SVM atomic types. This query can return 0 which indicates that the preferred alignment is aligned to the natural size of the type. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn preferred_platform_atomic_alignment(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT) + } + + /// Returns the value representing the preferred alignment in bytes for OpenCL 2.0 atomic types to global memory. This query can return 0 which indicates that the preferred alignment is aligned to the natural size of the type. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn preferred_global_atomic_alignment(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT) + } + + /// Returns the value representing the preferred alignment in bytes for OpenCL 2.0 atomic types to local memory. This query can return 0 which indicates that the preferred alignment is aligned to the natural size of the type. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn preferred_local_atomic_alignment(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT) + } + + /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn preferred_vector_width_char(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) + } + + /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn preferred_vector_width_short(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) + } + + /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn preferred_vector_width_int(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) + } + + /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn preferred_vector_width_long(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) + } + + /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn preferred_vector_width_half(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) + } + + /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn preferred_vector_width_float(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) + } + + /// Preferred native vector width size for built-in scalar types that can be put into vectors. The vector width is defined as the number of scalar elements that can be stored in the vector. if the ```cl_khr_fp64``` extension is not supported, it must return 0. + #[docfg(feature = "cl1_1")] + #[inline(always)] + pub fn preferred_vector_width_double(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) + } + + /// Returns the preferred multiple of work-group size for the given device. This is a performance hint intended as a guide when specifying the local work size argument to clEnqueueNDRangeKernel. + #[docfg(feature = "cl3")] + #[inline(always)] + pub fn preferred_work_group_size_multiple(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE) + } + + /// Maximum size in bytes of the internal buffer that holds the output of printf calls from a kernel. The minimum value for the FULL profile is 1 MB. + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn printf_buffer_size(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_PRINTF_BUFFER_SIZE) + .map(NonZeroUsize::new) + .map(Option::unwrap) + } + + /// OpenCL profile string. Returns the profile name supported by the device (see note) + #[inline(always)] + pub fn profile(&self) -> String { + self.get_info_string(CL_DEVICE_PROFILE).unwrap() + } + + /// Describes the resolution of device timer. This is measured in nanoseconds. + #[inline(always)] + pub fn profiling_timer_resolution(&self) -> Result { + self.get_info_bits(CL_DEVICE_PROFILING_TIMER_RESOLUTION) + } + + /// Describes the command-queue properties supported by the device. + #[cfg_attr(feature = "cl2", deprecated(note = "see `queue_on_host_properties`"))] + #[inline(always)] + pub fn queue_properties(&self) -> Result { + let v = self.get_info_bits::(CL_DEVICE_QUEUE_PROPERTIES)?; + Ok(CommandQueueProperties::from_bits(v)) + } + + /// Describes the on device command-queue properties supported by the device. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn queue_on_device_properties(&self) -> Result { + let v = self.get_info_bits::( + opencl_sys::CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES, + )?; + Ok(CommandQueueProperties::from_bits(v)) + } + + /// The maximum size of the device queue in bytes. The minimum value is 256 KB for the full profile and 64 KB for the embedded profile for devices supporting on-device queues, and must be 0 for devices that do not support on-device queues. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn queue_on_device_max_size(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE) + .map(NonZeroU32::new) + } + + /// The preferred size of the device queue, in bytes. Applications should use this size for the device queue to ensure good performance. The minimum value is 16 KB for devices supporting on-device queues, and must be 0 for devices that do not support on-device queues. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn queue_on_device_preferred_size(&self) -> Result> { + self.get_info_bits(opencl_sys::CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE) + .map(NonZeroU32::new) + } + + /// Describes the on host command-queue properties supported by the device. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn queue_on_host_properties(&self) -> Result { + let v = self.get_info_bits::( + opencl_sys::CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, + )?; + Ok(CommandQueueProperties::from_bits(v)) + } + + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub fn reference_count(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_REFERENCE_COUNT) + } + + /// Describes single precision floating-point capability of the device. + #[inline(always)] + pub fn single_fp_config(&self) -> Result { + self.get_info_bits(CL_DEVICE_SINGLE_FP_CONFIG) + } + + #[docfg(feature = "cl2_1")] + #[inline(always)] + pub fn sub_group_independent_forward_progress(&self) -> Result { + let v = self.get_info_bits::( + opencl_sys::CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, + )?; + Ok(v != 0) + } + + /// Describes the various shared virtual memory (SVM) memory allocation types the device supports. + #[docfg(feature = "cl2")] + #[inline(always)] + pub fn svm_capabilities(&self) -> Result { + self.get_info_bits(opencl_sys::CL_DEVICE_SVM_CAPABILITIES) + } + + /// The OpenCL device type. + #[inline(always)] + pub fn ty(&self) -> Result { + self.get_info_bits(CL_DEVICE_TYPE) + } + + /// Vendor name string. + #[inline(always)] + pub fn vendor(&self) -> Result { + self.get_info_string(CL_DEVICE_VENDOR) + } + + /// A unique device vendor identifier. An example of a unique device identifier could be the PCIe ID. + #[inline(always)] + pub fn vendor_id(&self) -> Result { + self.get_info_bits(CL_DEVICE_VENDOR_ID) + } + + /// OpenCL version string. + #[inline(always)] + pub fn version_string(&self) -> Result { + self.get_info_string(CL_DEVICE_VERSION) + } + + /// OpenCL version + #[inline] + pub fn version(&self) -> Result { + let version = self.version_string()?; + let section = version.split(' ').nth(1).ok_or(ErrorKind::InvalidValue)?; + Version::from_str(section).map_err(|_| ErrorKind::InvalidValue.into()) + } + + /// Is ```true``` if the device supports work-group collective functions (e.g. work_group_broadcast, work_group_reduce and work_group_scan), and ```false``` otherwise. + #[docfg(feature = "cl3")] + #[inline(always)] + pub fn work_group_collective_functions_support(&self) -> Result { + let v = self.get_info_bits::( + opencl_sys::CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT, + )?; + Ok(v != 0) + } + + /// OpenCL software driver version string in the form _major_number_._minor_number_. + #[inline(always)] + pub fn driver_version_string(&self) -> Result { + self.get_info_string(CL_DRIVER_VERSION) + } + + /// OpenCL software driver version + #[inline(always)] + pub fn driver_version(&self) -> Result { + let driver = self.driver_version_string()?; + Version::from_str(&driver).map_err(|_| ErrorKind::InvalidValue.into()) + } + + /// Creates an array of sub-devices that each reference a non-intersecting set of compute units within in_device, according to the partition scheme given by properties. + /// The output sub-devices may be used in every way that the root (or parent) device can be used, including creating contexts, building programs, further calls to [`create_sub_devices`](RawDevice::create_sub_devices) and creating command-queues. + /// When a command-queue is created against a sub-device, the commands enqueued on the queue are executed only on the sub-device. + #[docfg(feature = "cl1_2")] + #[inline] + pub fn create_sub_devices(&self, prop: PartitionProperty) -> Result> { + let prop = prop.to_bits(); + + let mut len = 0; + unsafe { + tri!(opencl_sys::clCreateSubDevices( + self.id(), + prop.as_ptr(), + 0, + core::ptr::null_mut(), + std::ptr::addr_of_mut!(len) + )) + } + + let mut devices = Vec::with_capacity(len as usize); + unsafe { + tri!(opencl_sys::clCreateSubDevices( + self.id(), + prop.as_ptr(), + len, + devices.as_mut_ptr() as *mut _, + core::ptr::null_mut() + )); + devices.set_len(devices.capacity()) + } + + Ok(devices) + } + + /// Replaces the default command queue on the device. + #[docfg(feature = "cl2_1")] + #[inline(always)] + pub fn set_default_command_queue( + &self, + ctx: crate::context::RawContext, + queue: RawCommandQueue, + ) -> Result<()> { + unsafe { + tri!(opencl_sys::clSetDefaultDeviceCommandQueue( + ctx.id(), + self.id(), + queue.id() + )); + } + + Ok(()) + } + + /// Query synchronized host and device timestamps. + #[docfg(feature = "cl2_1")] + #[inline] + pub fn device_and_host_timer_nanos(&self) -> Result<[u64; 2]> { + let mut device = 0; + let mut host = 0; + + unsafe { + tri!(clGetDeviceAndHostTimer( + self.id(), + std::ptr::addr_of_mut!(device), + std::ptr::addr_of_mut!(host) + )) + } + + Ok([device, host]) + } + + /// Query synchronized host and device timestamps. + #[docfg(feature = "cl2_1")] + #[inline(always)] + pub fn device_and_host_timer(&self) -> Result<(std::time::SystemTime, std::time::SystemTime)> { + let [device, host] = self.device_and_host_timer_nanos()?; + let device = std::time::UNIX_EPOCH + .checked_add(std::time::Duration::from_nanos(device)) + .unwrap(); + let host = std::time::UNIX_EPOCH + .checked_add(std::time::Duration::from_nanos(host)) + .unwrap(); + Ok((device, host)) + } + + /// Query the host clock. + #[docfg(feature = "cl2_1")] + #[inline(always)] + pub fn host_clock_nanos(&self) -> Result { + let mut host = 0; + unsafe { tri!(clGetHostTimer(self.id(), std::ptr::addr_of_mut!(host))) } + + Ok(host) + } + + /// Query the host clock. + #[docfg(feature = "cl2_1")] + #[inline(always)] + pub fn host_clock(&self) -> Result { + let host = self.host_clock_nanos()?; + Ok(std::time::UNIX_EPOCH + std::time::Duration::from_nanos(host)) + } + + #[inline(always)] + pub fn has_f16(&self) -> Result { + let ext = self.extensions_string()?; + Ok(ext.split_whitespace().any(|x| x == "cl_khr_fp16")) + } + + #[inline(always)] + pub fn has_f64(&self) -> Result { + let ext = self.extensions_string()?; + Ok(ext.split_whitespace().any(|x| x == "cl_khr_fp64")) + } + + #[inline(always)] + pub fn all() -> &'static [RawDevice] { + &once_cell::sync::Lazy::force(&DEVICES) + } + + #[inline(always)] + pub fn first() -> Option<&'static RawDevice> { + DEVICES.first() + } + + #[docfg(feature = "cl1_2")] + #[inline(always)] + pub unsafe fn retain(&self) -> Result<()> { + tri!(clRetainDevice(self.id())); + Ok(()) + } + + #[inline] + fn get_info_string(&self, ty: cl_device_info) -> Result { + unsafe { + let mut len = 0; + tri!(clGetDeviceInfo( + self.id(), + ty, + 0, + core::ptr::null_mut(), + &mut len + )); + + let mut result = Vec::::with_capacity(len); + tri!(clGetDeviceInfo( + self.id(), + ty, + len, + result.as_mut_ptr().cast(), + core::ptr::null_mut() + )); + + result.set_len(len - 1); + Ok(String::from_utf8(result).unwrap()) + } + } + + #[allow(dead_code)] + #[inline] + fn get_info_array(&self, ty: cl_device_info) -> Result> { + unsafe { + let mut len = 0; + tri!(clGetDeviceInfo( + self.id(), + ty, + 0, + core::ptr::null_mut(), + &mut len + )); + + if len == 0 { + return Ok(Box::new([])); + } + + let mut result = Box::<[T]>::new_uninit_slice(len / core::mem::size_of::()); + tri!(clGetDeviceInfo( + self.id(), + ty, + len, + result.as_mut_ptr().cast(), + core::ptr::null_mut() + )); + Ok(result.assume_init()) + } + } + + #[inline] + fn get_info_bits(&self, ty: cl_device_info) -> Result { + let mut value = MaybeUninit::::uninit(); + + unsafe { + tri!(clGetDeviceInfo( + self.id(), + ty, + core::mem::size_of::(), + value.as_mut_ptr().cast(), + core::ptr::null_mut() + )); + Ok(value.assume_init()) + } + } +} + +impl Debug for RawDevice { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("Device") + .field("id", &self.0) + .field("name", &self.name()) + .field("vendor", &self.vendor()) + .field("type", &self.ty()) + .field("version", &self.version()) + .finish() + } +} + +impl Clone for RawDevice { + #[inline(always)] + fn clone(&self) -> Self { + #[cfg(feature = "cl1_2")] + unsafe { + tri_panic!(opencl_sys::clRetainDevice(self.id())) + } + + Self(self.0) + } +} + +#[docfg(feature = "cl1_2")] +impl Drop for RawDevice { + #[inline(always)] + fn drop(&mut self) { + unsafe { + tri_panic!(opencl_sys::clReleaseDevice(self.id())); + } + } +} + +unsafe impl Send for RawDevice {} +unsafe impl Sync for RawDevice {} + +#[docfg(feature = "cl3")] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub struct AtomicCapabilities { + pub order: core::sync::atomic::Ordering, + /// Support for memory ordering constraints that apply to a single work-item. + pub work_item_scope: bool, + pub scope: AtomicScope, +} + +#[docfg(feature = "cl3")] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[non_exhaustive] +#[repr(u64)] +pub enum AtomicScope { + /// Support for memory ordering constraints that apply to all work-items in a work-group. + WorkGroup = opencl_sys::CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP, + /// Support for memory ordering constraints that apply to all work-items executing on the device. + Device = opencl_sys::CL_DEVICE_ATOMIC_SCOPE_DEVICE, + /// Support for memory ordering constraints that apply to all work-items executing across all devices that can share SVM memory with each other and the host process. + AllDevices = opencl_sys::CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES, +} + +#[cfg(feature = "cl3")] +impl AtomicCapabilities { + pub fn from_bits(bits: opencl_sys::cl_device_atomic_capabilities) -> Option { + let order; + let scope; + let work_item_scope = bits & opencl_sys::CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM != 0; + + // ORDER + if bits & opencl_sys::CL_DEVICE_ATOMIC_ORDER_SEQ_CST != 0 { + order = core::sync::atomic::Ordering::SeqCst; + } else if bits & opencl_sys::CL_DEVICE_ATOMIC_ORDER_ACQ_REL != 0 { + order = core::sync::atomic::Ordering::AcqRel + } else if bits & opencl_sys::CL_DEVICE_ATOMIC_ORDER_RELAXED != 0 { + order = core::sync::atomic::Ordering::Relaxed + } else { + return None; + } + + // SCOPE + if bits & opencl_sys::CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES != 0 { + scope = AtomicScope::AllDevices + } else if bits & opencl_sys::CL_DEVICE_ATOMIC_SCOPE_DEVICE != 0 { + scope = AtomicScope::Device + } else if bits & opencl_sys::CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP != 0 { + scope = AtomicScope::WorkGroup + } else { + return None; + } + + Some(Self { + order, + work_item_scope, + scope, + }) + } +} + +bitflags::bitflags! { + /// The OpenCL device type. + #[repr(transparent)] + pub struct DeviceType : cl_device_type { + const CPU = CL_DEVICE_TYPE_CPU; + const GPU = CL_DEVICE_TYPE_GPU; + const ACCELERATOR = CL_DEVICE_TYPE_ACCELERATOR; + const DEFAULT = CL_DEVICE_TYPE_CUSTOM; + } + + /// Describes the floating-point capability of the OpenCL device. + #[repr(transparent)] + pub struct FpConfig : cl_device_fp_config { + /// Denorms are supported + const DENORM = CL_FP_DENORM; + /// INF and quiet NaNs are supported + const INF_NAN = CL_FP_INF_NAN; + /// Round to nearest even rounding mode supported + const ROUND_TO_NEAREST = CL_FP_ROUND_TO_NEAREST; + /// Round to zero rounding mode supported + const ROUND_TO_ZERO = CL_FP_ROUND_TO_ZERO; + /// Round to positive and negative infinity rounding modes supported + const ROUND_TO_INF = CL_FP_ROUND_TO_INF; + /// IEEE754-2008 fused multiply-add is supported + const FMA = CL_FP_FMA; + /// Divide and sqrt are correctly rounded as defined by the IEEE754 specification + const CORRECTLY_ROUNDED_DIVIDE_SQRT = CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; + /// Basic floating-point operations (such as addition, subtraction, multiplication) are implemented in software + const SOFT_FLOAT = CL_FP_SOFT_FLOAT; + } + + /// Describes the execution capabilities of the device + #[repr(transparent)] + pub struct ExecCapabilities : cl_device_exec_capabilities { + const KERNEL = CL_EXEC_KERNEL; + const NATIVE_KERNEL = CL_EXEC_NATIVE_KERNEL; + } +} + +/// Type of local memory supported. This can be set to [```Self::Local```] implying dedicated local memory storage such as SRAM, or [```Self::Global```]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[repr(u32)] +pub enum LocalMemType { + Local = CL_LOCAL, + Global = CL_GLOBAL, +} + +#[docfg(feature = "cl1_2")] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum PartitionProperty { + /// Split the aggregate device into as many smaller aggregate devices as can be created, each containing n compute units. The value n is passed as the value accompanying this property. If n does not divide evenly into [`max_compute_units`](RawDevice::max_compute_units), then the remaining compute units are not used. + Equally(u32), + /// This property is followed by a list of compute unit. For each non-zero count m in the list, a sub-device is created with m compute units in it. The number of non-zero count entries in the list may not exceed [`partition_max_sub_devices`](RawDevice::partition_max_sub_devices). The total number of compute units specified may not exceed [max_compute_units](RawDevice::max_compute_units). + Counts(Vec), + /// Split the device into smaller aggregate devices containing one or more compute units that all share part of a cache hierarchy. + AffinityDomain(AffinityDomain), +} + +#[cfg(feature = "cl1_2")] +impl PartitionProperty { + pub fn from_slice(bits: &[opencl_sys::cl_device_partition_property]) -> Option { + if bits.len() == 0 { + return None; + } + + match unsafe { *bits.get_unchecked(0) } { + 0 => None, + opencl_sys::CL_DEVICE_PARTITION_EQUALLY => Some(Self::Equally(bits[1] as u32)), + opencl_sys::CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN => { + Some(Self::AffinityDomain(unsafe { + core::mem::transmute(bits[1] as u64) + })) + } + opencl_sys::CL_DEVICE_PARTITION_BY_COUNTS => { + let mut result = Vec::with_capacity(bits.len()); + + for i in 1..bits.len() { + const MAX_COUNT: isize = u32::MAX as isize; + + match bits[i] { + #[allow(unreachable_patterns)] + 0 | opencl_sys::CL_DEVICE_PARTITION_BY_COUNTS_LIST_END => break, + v @ 1..=MAX_COUNT => unsafe { + result.push(NonZeroU32::new_unchecked(v as u32)) + }, + _ => return None, + } + } + + Some(Self::Counts(result)) + } + + other => panic!("Unknow partition property '{other}'"), + } + } + + pub fn to_bits(&self) -> Box<[opencl_sys::cl_device_partition_property]> { + match self { + Self::Equally(n) => Box::new([ + opencl_sys::CL_DEVICE_PARTITION_EQUALLY, + opencl_sys::cl_device_partition_property::try_from(*n).unwrap(), + 0, + ]) as Box<_>, + Self::AffinityDomain(x) => Box::new([ + opencl_sys::CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + opencl_sys::cl_device_partition_property::try_from(*x as u64).unwrap(), + 0, + ]) as Box<_>, + Self::Counts(x) => { + let mut result = Box::new_uninit_slice(2 + x.len()); + + unsafe { + result[0].write(opencl_sys::CL_DEVICE_PARTITION_BY_COUNTS); + + for i in 0..x.len() { + result[1 + i].write( + opencl_sys::cl_device_partition_property::try_from(x[i].get()).unwrap(), + ); + } + + result + .last_mut() + .unwrap_unchecked() + .write(opencl_sys::CL_DEVICE_PARTITION_BY_COUNTS_LIST_END); + result.assume_init() + } + } + } + } +} + +#[docfg(feature = "cl1_2")] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[repr(u64)] +#[non_exhaustive] +pub enum AffinityDomain { + /// Split the device into sub-devices comprised of compute units that share a NUMA node. + Numa = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_NUMA, + /// Split the device into sub-devices comprised of compute units that share a level 4 data cache. + L4Cache = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE, + /// Split the device into sub-devices comprised of compute units that share a level 3 data cache. + L3Cache = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE, + /// Split the device into sub-devices comprised of compute units that share a level 2 data cache. + L2Cache = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE, + /// Split the device into sub-devices comprised of compute units that share a level 1 data cache. + L1Cache = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE, + /// Split the device along the next partitionable affinity domain. The implementation shall find the first level along which the device or sub-device may be further subdivided in the order NUMA, L4, L3, L2, L1, and partition the device into sub-devices comprised of compute units that share memory subsystems at this level. + NextPartitionable = opencl_sys::CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, +} + +#[docfg(feature = "cl3")] +#[repr(u64)] +#[non_exhaustive] +pub enum DeviceEnqueueCapabilities { + /// Device supports device-side enqueue and on-device queues. + Supported = opencl_sys::CL_DEVICE_QUEUE_SUPPORTED, + /// Device supports a replaceable default on-device queue. + ReplaceableDefault = opencl_sys::CL_DEVICE_QUEUE_REPLACEABLE_DEFAULT, +} + +#[cfg(feature = "cl3")] +impl DeviceEnqueueCapabilities { + pub fn from_bits(bits: opencl_sys::cl_device_device_enqueue_capabilities) -> Option { + if bits & opencl_sys::CL_DEVICE_QUEUE_REPLACEABLE_DEFAULT != 0 { + return Some(Self::ReplaceableDefault); + } + + if bits & opencl_sys::CL_DEVICE_QUEUE_SUPPORTED != 0 { + return Some(Self::Supported); + } + + None + } +} + +bitflags::bitflags! { + #[repr(transparent)] + pub struct SvmCapability: cl_device_svm_capabilities { + /// Support for coarse-grain buffer sharing using clSVMAlloc. Memory consistency is guaranteed at synchronization points and the host must use calls to clEnqueueMapBuffer and clEnqueueUnmapMemObject. + const COARSE_GRAIN_BUFFER = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER; + /// Support for fine-grain buffer sharing using clSVMAlloc. Memory consistency is guaranteed atsynchronization points without need for clEnqueueMapBuffer and clEnqueueUnmapMemObject. + const FINE_GRAIN_BUFFER = CL_DEVICE_SVM_FINE_GRAIN_BUFFER; + /// Support for sharing the host’s entire virtual memory including memory allocated using malloc. Memory consistency is guaranteed at synchronization points. + const FINE_GRAIN_SYSTEM = CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; + /// Support for the OpenCL 2.0 atomic operations that provide memory consistency across the host and all OpenCL devices supporting fine-grain SVM allocations. + const ATOMICS = CL_DEVICE_SVM_ATOMICS; + } +} + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(transparent)] +pub struct Version(cl_version); + +impl Version { + pub const CL1: Self = Self::from_inner_parts(1, 0, 0); + pub const CL1_1: Self = Self::from_inner_parts(1, 1, 0); + pub const CL1_2: Self = Self::from_inner_parts(1, 2, 0); + pub const CL2: Self = Self::from_inner_parts(2, 0, 0); + pub const CL2_1: Self = Self::from_inner_parts(2, 1, 0); + pub const CL2_2: Self = Self::from_inner_parts(2, 2, 0); + pub const CL3: Self = Self::from_inner_parts(3, 0, 0); + + const MAJOR: u32 = CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS; + + #[inline(always)] + pub const fn from_bits(bits: u32) -> Self { + Self(bits) + } + + #[inline(always)] + pub const fn from_inner_parts(major: u32, minor: u32, patch: u32) -> Self { + Self( + ((major & CL_VERSION_MAJOR_MASK) << Self::MAJOR) + | ((minor & CL_VERSION_MINOR_MASK) << CL_VERSION_PATCH_BITS) + | (patch & CL_VERSION_PATCH_MASK), + ) + } + + #[inline(always)] + pub const fn into_inner_parts(self) -> (u32, u32, u32) { + (self.major(), self.minor(), self.patch()) + } + + #[inline(always)] + pub const fn major(&self) -> u32 { + self.0 >> Self::MAJOR + } + + #[inline(always)] + pub const fn minor(&self) -> u32 { + (self.0 >> CL_VERSION_PATCH_BITS) & CL_VERSION_MINOR_MASK + } + + #[inline(always)] + pub const fn patch(&self) -> u32 { + self.0 & CL_VERSION_PATCH_MASK + } +} + +impl FromStr for Version { + type Err = IntErrorKind; + + fn from_str(s: &str) -> core::result::Result { + let mut parts = s.split('.'); + + let major = parts + .next() + .ok_or(IntErrorKind::Empty)? + .parse::() + .map_err(|e| e.kind().clone())?; + let minor = parts + .next() + .ok_or(IntErrorKind::Empty)? + .parse::() + .map_err(|e| e.kind().clone())?; + let patch_str = parts.next(); + + let patch; + if let Some(inner) = patch_str { + patch = Some(inner.parse::().map_err(|e| e.kind().clone())?) + } else { + patch = None; + } + + if parts.next().is_some() { + return Err(IntErrorKind::InvalidDigit); + } + + Ok(Self::from_inner_parts( + major, + minor, + patch.unwrap_or_default(), + )) + } +} + +impl Debug for Version { + #[inline(always)] + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + Display::fmt(&self, f) + } +} + +impl Display for Version { + #[inline(always)] + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{}.{}.{}", self.major(), self.minor(), self.patch()) + } +} diff --git a/src/lib.rs b/src/lib.rs index e4a4a3d..404daef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,35 @@ #![allow(macro_expanded_macro_exports_accessed_by_absolute_paths)] -#![feature(mem_copy_fn, box_into_inner, nonzero_min_max, new_uninit, unsize, iterator_try_collect, is_some_and, result_flattening, alloc_layout_extra, array_try_map, extend_one, const_nonnull_new, int_roundings, const_maybe_uninit_zeroed, unboxed_closures, const_ptr_as_ref, layout_for_ptr, const_maybe_uninit_array_assume_init, maybe_uninit_array_assume_init, const_option_ext, maybe_uninit_uninit_array, const_option, nonzero_ops, associated_type_bounds, ptr_metadata, fn_traits, vec_into_raw_parts, const_trait_impl, drain_filter, allocator_api)] +#![feature( + mem_copy_fn, + box_into_inner, + new_uninit, + unsize, + iterator_try_collect, + is_some_and, + result_flattening, + alloc_layout_extra, + array_try_map, + extend_one, + const_nonnull_new, + int_roundings, + const_maybe_uninit_zeroed, + unboxed_closures, + const_ptr_as_ref, + layout_for_ptr, + const_maybe_uninit_array_assume_init, + maybe_uninit_array_assume_init, + const_option_ext, + maybe_uninit_uninit_array, + const_option, + nonzero_ops, + associated_type_bounds, + ptr_metadata, + fn_traits, + vec_into_raw_parts, + const_trait_impl, + drain_filter, + allocator_api +)] #![cfg_attr(feature = "svm", feature(strict_provenance))] #![cfg_attr(docsrs, feature(doc_cfg, proc_macro_hygiene))] #![cfg_attr(debug_assertions, feature(backtrace_frames))] @@ -62,16 +92,18 @@ macro_rules! tri_panic { }}; } -mod blaze_rs { pub use crate::*; } +mod blaze_rs { + pub use crate::*; +} pub mod prelude { + pub use crate::buffer::rect::{RectBox2D, RectBuffer2D}; + pub use crate::buffer::{flags::*, Buffer, RawBuffer}; + pub use crate::context::{scope, Context, Global, RawContext, Scope, SimpleContext}; pub use crate::core::*; + pub use crate::event::{Event, RawEvent}; pub use crate::macros::*; - pub use crate::buffer::{RawBuffer, Buffer, flags::*}; - pub use crate::context::{Context, Global, RawContext, SimpleContext, Scope, scope}; - pub use crate::event::{RawEvent, Event}; pub use crate::memobj::RawMemObject; - pub use crate::buffer::rect::{RectBuffer2D, RectBox2D}; pub use crate::WaitList; } @@ -93,26 +125,26 @@ pub extern crate blaze_proc; pub mod macros { #[doc = include_str!("../docs/src/program/README.md")] pub use blaze_proc::blaze; - pub use blaze_proc::{global_context}; + pub use blaze_proc::global_context; /// Similar to [`Event::join_all_blocking`](crate::event::Event::join_all_blocking), but it can also join events with different [`Consumer`](crate::event::Consumer)s /// ```rust /// use blaze_rs::{prelude::*, macros::*}; /// use std::ops::Deref; - /// + /// /// #[global_context] /// static CONTEXT : SimpleContext = SimpleContext::default(); - /// + /// /// # fn main () -> Result<()> { /// /// let buffer = Buffer::new(&[1, 2, 3, 4, 5], MemAccess::default(), false)?; - /// + /// /// let (left, right) = scope(|s| { /// let left = buffer.read(s, 2.., None)?; /// let right = buffer.map(s, 2.., None)?; /// return join_various_blocking!(left, right) /// })?; - /// + /// /// assert_eq!(left.as_slice(), right.deref()); /// # return Ok(()) /// # } @@ -120,16 +152,16 @@ pub mod macros { pub use blaze_proc::join_various_blocking; } -#[doc = include_str!("../docs/src/raw.md")] -pub mod core; -#[doc = include_str!("../docs/src/context/README.md")] -pub mod context; -/// Generic memory object -pub mod memobj; /// Blaze buffers pub mod buffer; +#[doc = include_str!("../docs/src/context/README.md")] +pub mod context; +#[doc = include_str!("../docs/src/raw.md")] +pub mod core; #[doc = include_str!("../docs/src/events/README.md")] pub mod event; +/// Generic memory object +pub mod memobj; #[cfg_attr(docsrs, doc(cfg(feature = "image")))] #[cfg(feature = "image")] @@ -144,7 +176,7 @@ pub mod svm; /// # Error /// This method returns [`ErrorKind::InvalidEventWaitList`](core::ErrorKind::InvalidEventWaitList) if the list's size cannot fit inside a `u32`. #[inline] -pub fn wait_list (v: WaitList) -> core::Result<(u32, *const opencl_sys::cl_event)> { +pub fn wait_list(v: WaitList) -> core::Result<(u32, *const opencl_sys::cl_event)> { return match v { Some(v) => match v.len() { 0 => Ok((0, ::core::ptr::null())), @@ -152,18 +184,18 @@ pub fn wait_list (v: WaitList) -> core::Result<(u32, *const opencl_sys::cl_event let len = >::try_from(len) .map_err(|e| core::Error::new(core::ErrorKind::InvalidEventWaitList, e))?; - return Ok((len, v.as_ptr().cast())) - }, + return Ok((len, v.as_ptr().cast())); + } }, - None => Ok((0, ::core::ptr::null())) - } + None => Ok((0, ::core::ptr::null())), + }; } /// Creates a [`WaitList`] from a reference to a single [`RawEvent`] #[inline(always)] -pub const fn wait_list_from_ref (evt: &RawEvent) -> WaitList { - return Some(::core::slice::from_ref(evt)) +pub const fn wait_list_from_ref(evt: &RawEvent) -> WaitList { + return Some(::core::slice::from_ref(evt)); } /// A list of events to be awaited. -pub type WaitList<'a> = Option<&'a [prelude::RawEvent]>; \ No newline at end of file +pub type WaitList<'a> = Option<&'a [prelude::RawEvent]>; diff --git a/tests/mmul.rs b/tests/mmul.rs index 586b0db..90e308f 100644 --- a/tests/mmul.rs +++ b/tests/mmul.rs @@ -1,7 +1,7 @@ #[global_context] -static CONTEXT : SimpleContext = SimpleContext::default(); +static CONTEXT: SimpleContext = SimpleContext::default(); +use blaze_rs::prelude::*; use std::mem::MaybeUninit; -use blaze_rs::{prelude::*}; #[allow(dead_code)] static CODE : &str = " @@ -24,31 +24,32 @@ static CODE : &str = " #[blaze(MatrixOps)] #[link = CODE] extern "C" { + /// Hi! #[link_name = "mul"] - fn matrix_mul (k: u32, lhs: *const f32, rhs: *const f32, out: *mut MaybeUninit); + fn matrix_mul(k: u32, lhs: *const f32, rhs: *const f32, out: *mut MaybeUninit); } #[test] -fn buffer_mul () -> Result<()> { +fn buffer_mul() -> Result<()> { let ops = MatrixOps::new(None)?; - let lhs = RectBuffer2D::::new(&[1.,2.,4.,5.,7.,8.], 2, MemAccess::READ_ONLY, false)?; // 3 x 2 - let rhs = RectBuffer2D::::new(&[1.,2.,3.,4.,5.,6.], 3, MemAccess::READ_ONLY, false)?; // 2 x 3 + let lhs = RectBuffer2D::::new(&[1., 2., 4., 5., 7., 8.], 2, MemAccess::READ_ONLY, false)?; // 3 x 2 + let rhs = RectBuffer2D::::new(&[1., 2., 3., 4., 5., 6.], 3, MemAccess::READ_ONLY, false)?; // 2 x 3 let mut result = RectBuffer2D::::new_uninit(3, 3, MemAccess::WRITE_ONLY, false)?; // 3 x 3 unsafe { ops.matrix_mul_blocking(2, &lhs, &rhs, &mut result, [3, 3], None, None)?; } - + let result = unsafe { result.assume_init() }; println!("{:?}", result); - + Ok(()) } #[cfg(feature = "svm")] #[test] -fn svm_mul () -> Result<()> { +fn svm_mul() -> Result<()> { /*use blaze_rs::{buffer::rect::SvmRect2D, svm::Svm}; let ops = MatrixOps::new(None)?; @@ -61,7 +62,7 @@ fn svm_mul () -> Result<()> { let result = unsafe { result.assume_init() }; println!("{:?}", result); - + Ok(())*/ Ok(()) -} \ No newline at end of file +}