ocl_convolution/lib.rs
1//! OpenCL-accelerated 2D convolutions.
2//!
//! [Convolution] is a fundamental building block in signal processing. This crate is focused
//! on 2D convolutions (i.e., the signal is a still image) in the context of [deep learning]
//! (more precisely, [convolutional neural networks][cnn]).
//! The deep learning context means that a convolution may involve many (on the order of hundreds)
//! filters, and that the input may contain many channels (on the order of hundreds or thousands),
//! rather than the traditional 3 or 4. Computing such convolutions is computationally heavy
//! and can be effectively accelerated with the help of [OpenCL].
10//!
11//! # Features
12//!
13//! The crate implements convolutions on two numerical formats:
14//!
15//! - Single-precision floats (`f32`)
//! - Signed 8-bit integers with a 32-bit multiply-add accumulator (this format is frequently
//!   denoted `int8/32` in deep learning literature). Quantization parameters are applied
//!   uniformly to the entire layer.
19//!
20//! For both cases, dilated or grouped convolutions are supported.
21//!
22//! # Implementation details
23//!
//! The implementation uses an output-stationary workflow (see, e.g., [this paper] for
//! the definition); that is, each element of the output tensor is computed in a single run
//! of the OpenCL kernel. This minimizes memory overhead, but may not be the fastest algorithm.
27//!
28//! [Convolution]: https://en.wikipedia.org/wiki/Convolution
29//! [deep learning]: https://en.wikipedia.org/wiki/Deep_learning
30//! [cnn]: https://en.wikipedia.org/wiki/Convolutional_neural_network
31//! [OpenCL]: https://www.khronos.org/opencl/
32//! [this paper]: https://dl.acm.org/citation.cfm?id=3001177
33//!
34//! # Examples
35//!
36//! ## Floating-point convolution
37//!
38//! ```
39//! use ndarray::Array4;
40//! use rand::RngExt;
41//! use ocl_convolution::{Convolution, FeatureMap, Params};
42//!
43//! # fn main() -> Result<(), ocl::Error> {
44//! let convolution = Convolution::f32(3)?.build(Params {
45//! strides: [1, 1],
46//! pads: [0; 4],
47//! dilation: [1, 1],
48//! groups: 1,
49//! })?;
50//!
51//! // Generate random signal with 6x6 spatial dims and 3 channels.
52//! let mut rng = rand::rng();
53//! let signal = Array4::from_shape_fn([1, 6, 6, 3], |_| rng.random_range(-1.0..=1.0));
54//! // Construct two 3x3 spatial filters.
55//! let filters = Array4::from_shape_fn([2, 3, 3, 3], |_| rng.random_range(-1.0..=1.0));
56//! // Perform the convolution. The output must have 4x4 spatial dims
57//! // and contain 2 channels (1 per each filter). The output layout will
58//! // be the same as in the signal.
59//! let output = convolution.compute(
60//! // `FeatureMap` wraps `ArrayView4` with information about
61//! // memory layout (which is "channels-last" / NHWC in this case).
62//! FeatureMap::nhwc(&signal),
63//! &filters,
64//! )?;
65//! assert_eq!(output.shape(), [1, 4, 4, 2]);
66//!
67//! // For increased efficiency, we may pin filter memory.
68//! // This is especially useful when the same filters are convolved
69//! // with multiple signals.
70//! let convolution = convolution.with_filters(&filters)?;
71//! let new_output = convolution.compute(FeatureMap::nhwc(&signal))?;
72//! assert_eq!(output, new_output);
73//! # Ok(())
74//! # }
75//! ```
76//!
77//! ## Quantized convolution
78//!
79//! ```
80//! use ndarray::Array4;
81//! use rand::RngExt;
82//! use ocl_convolution::{Convolution, I8Params, FeatureMap, Params};
83//!
84//! # fn main() -> Result<(), ocl::Error> {
85//! const BIT_SHIFT: u8 = 16;
86//! let params = I8Params {
87//! common: Params::default(),
88//! // These params are found by profiling; here, they are
89//! // chosen randomly.
90//! bit_shift: BIT_SHIFT,
91//! scale: I8Params::convert_scale(BIT_SHIFT, 0.1),
92//! output_bias: -10,
93//! signal_bias: 20,
94//! filter_bias: -5,
95//! };
96//! let convolution = Convolution::i8(3)?.build(params)?;
97//!
98//! // Generate random signal with 6x6 spatial dims and 3 channels.
99//! let mut rng = rand::rng();
100//! let signal = Array4::from_shape_fn([1, 6, 6, 3], |_| rng.random_range(-127..=127));
101//! // Construct two 3x3 spatial filters.
102//! let filters = Array4::from_shape_fn([2, 3, 3, 3], |_| rng.random_range(-127..=127));
103//! // Perform the convolution. The output must have 4x4 spatial dims
104//! // and contain 2 channels (1 per each filter).
105//! let output = convolution.compute(
106//! FeatureMap::nhwc(&signal),
107//! &filters,
108//! )?;
109//! assert_eq!(output.shape(), [1, 4, 4, 2]);
110//! # Ok(())
111//! # }
112//! ```
113
114#![doc(html_root_url = "https://docs.rs/ocl-convolution/0.4.0")]
115
116use std::{fmt, marker::PhantomData};
117
118use ndarray::{Array4, ArrayView4};
119use ocl::OclPrm;
120
121use crate::{
122 base::Base,
123 buffers::{Filters, Pinned},
124};
125pub use crate::{
126 base::ConvolutionBuilder,
127 buffers::{FeatureMap, FeatureMapShape, Layout},
128 params::{I8Params, Params},
129};
130
131mod base;
132mod buffers;
133mod params;
134
/// Source code of the OpenCL program, embedded at compile time from the `conv.cl` file
/// located in the build output directory (`OUT_DIR`). It is passed to
/// `ConvolutionBuilder::new()` when creating convolution builders.
const SOURCE: &str = include_str!(concat!(env!("OUT_DIR"), "/conv.cl"));
136
/// Supported element types for convolutions.
///
/// The trait is implemented for `f32` (floating-point convolutions) and `i8`
/// (quantized convolutions).
pub trait ConvElement: OclPrm + Copy + 'static {
    /// Type of the multiply-add accumulator. Per-filter biases are expressed in this type.
    type Acc: OclPrm + Copy + 'static;
    /// Parameters of the convolution. They are convertible both into the general [`Params`]
    /// and into the OpenCL-friendly [`Self::ClParams`].
    type Params: Copy + Into<Params> + Into<Self::ClParams>;
    /// OpenCL-friendly version of parameters. This is considered an implementation detail.
    type ClParams: OclPrm;
}
146
// Floating-point convolutions: `f32` elements are accumulated in `f32` and are configured
// with the general `Params`.
impl ConvElement for f32 {
    type Acc = f32;
    type Params = Params;
    type ClParams = params::ClParams;
}
152
// Quantized convolutions: `i8` elements are accumulated in `i32` and are configured
// with `I8Params`.
impl ConvElement for i8 {
    type Acc = i32;
    type Params = I8Params;
    type ClParams = params::ClI8Params;
}
158
159impl ConvolutionBuilder<f32> {
160 /// Creates a new floating-point convolution.
161 ///
162 /// # Errors
163 ///
164 /// Proxies OpenCL initialization errors.
165 pub fn build(&self, params: Params) -> ocl::Result<Convolution<f32>> {
166 Base::new(self, params).map(Convolution)
167 }
168}
169
170impl ConvolutionBuilder<i8> {
171 /// Creates a new quantized convolution.
172 ///
173 /// # Errors
174 ///
175 /// Proxies OpenCL initialization errors.
176 pub fn build(&self, params: I8Params) -> ocl::Result<Convolution<i8>> {
177 Base::new(self, params).map(Convolution)
178 }
179}
180
/// Convolution without pinned memory.
///
/// A builder for the convolution is obtained with [`Convolution::f32()`] or
/// [`Convolution::i8()`], and the convolution itself is then created by calling `build()`
/// on the builder. Filters are supplied on each [`compute()`](Self::compute()) call;
/// alternatively, they can be pinned using [`with_filters()`](Self::with_filters())
/// or [`with_biased_filters()`](Self::with_biased_filters()).
pub struct Convolution<T: ConvElement>(Base<PhantomData<T>>);
183
184impl<T> fmt::Debug for Convolution<T>
185where
186 T: ConvElement,
187 T::Params: fmt::Debug,
188{
189 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
190 formatter.debug_tuple("Convolution").field(&self.0).finish()
191 }
192}
193
194impl Convolution<f32> {
195 /// Creates a new floating-point convolution builder. `size` determines the filter size
196 /// and must be odd (1, 3, 5, ...).
197 ///
198 /// # Panics
199 ///
200 /// Panics if the filter `size` is even.
201 ///
202 /// # Errors
203 ///
204 /// Proxies OpenCL initialization errors.
205 pub fn f32(size: u32) -> ocl::Result<ConvolutionBuilder<f32>> {
206 ConvolutionBuilder::new(size, &[("KERNEL_TYPE", 32)], SOURCE)
207 }
208}
209
210/// Quantized convolution over signed 8-bit integers.
211///
212/// Due to use of `i8` inputs, computations are performed much faster than on `f32` inputs
213/// (the difference manifests most on the specialized hardware, but it is seen in this
214/// OpenCL-powered implementation as well).
215///
216/// ## Connection to real-value convolution
217///
218/// Quantized convolution mirrors real-valued convolution in which `i8` elements
219/// of the signal, filter and output tensors represent real-valued numbers with the
220/// following mapping:
221///
222/// ```
223/// let scale: f32 = // ...
224/// # 1.0;
225/// let bias: i32 = // ...
226/// # 0; drop(
227/// |x: i8| -> f32 { scale * (i32::from(x) - bias) as f32 }
228/// # )
229/// ```
230///
231/// `scale` and `bias` may differ for different tensors; these params are usually determined
232/// by *profiling* the corresponding convolutional neural network (see e.g. [this paper]).
233///
234/// Denote these quantization params for tensor `T` as `T.scale` and `T.bias`. Denote `S`
235/// the signal, `F` the filter, `O` the output. Convolution parameters must be set as follows:
236///
237/// | `I8Params` field | Value |
238/// |------------------|-----------|
239/// | `signal_bias` | `-S.bias` |
240/// | `filter_bias` | `-F.bias` |
241/// | `output_bias` | `O.bias` |
242/// | `scale` | `S.scale * F.scale / O.scale` |
243///
244/// `scale` is represented as a fixed-point number with [`bit_shift`] binary digits after
245/// the point. Note that filter biases `B` are not transformed during the computation.
246///
247/// # Computing convolution
248///
249/// Suppose `S` is the signal and `F` is the filter tensor; both contain `i8` values.
250/// The computation is performed as follows:
251///
252/// 1. Unbias the signal: `S := S + params.signal_bias`.
253/// 2. Unbias the filters: `F := F + params.filter_bias`.
254/// 3. Compute "standard" convolution output `O := S (*) F` using `i32` precision.
255/// 4. Upscale each number in the output: `O := O * params.scale`.
256/// 5. If there is filter bias `B` provided, apply bias to the output per each output channel:
257/// `O[f, ..] := O[f, ..] + B[f]`.
258/// 6. Downscale the output: `O := round(O / 2**self.bit_shift)`,
259/// where `round()` works as floating-point rounding with the default mode
260/// (round to nearest, ties to even).
261/// 7. Apply output bias: `O := O + params.output_bias`.
262/// 8. Saturate output to `i8` range.
263///
264/// [`bit_shift`]: I8Params::bit_shift
265/// [this paper]: https://arxiv.org/abs/1805.00907
266impl Convolution<i8> {
267 /// Creates a new `i8` convolution builder. `size` determines the filter size
268 /// and must be odd (1, 3, 5, ...).
269 ///
270 /// # Panics
271 ///
272 /// Panics if the filter `size` is even.
273 ///
274 /// # Errors
275 ///
276 /// Proxies OpenCL initialization errors.
277 pub fn i8(size: u32) -> ocl::Result<ConvolutionBuilder<i8>> {
278 ConvolutionBuilder::new(size, &[("KERNEL_TYPE", 8)], SOURCE)
279 }
280}
281
282impl<T: ConvElement> Convolution<T> {
283 /// Spatial size of the convolution.
284 pub fn size(&self) -> u32 {
285 self.0.size()
286 }
287
288 /// Returns general parameters of the convolution.
289 pub fn params(&self) -> T::Params {
290 self.0.params()
291 }
292
293 /// Sets convolution parameters.
294 ///
295 /// # Errors
296 ///
297 /// Proxies OpenCL initialization errors.
298 pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
299 self.0.set_params(params)
300 }
301
302 /// Returns the convolution with pinned filter memory.
303 ///
304 /// # Parameters
305 ///
306 /// - `filters` must have `MxK_HxK_WxC` layout, where `M` is the number of filters,
307 /// `K_H` and `K_W` are spatial dimensions of a filter, `C` is the number of input channels.
308 ///
309 /// # Errors
310 ///
311 /// Proxies OpenCL initialization errors.
312 pub fn with_filters<'a>(
313 self,
314 filters: impl Into<ArrayView4<'a, T>>,
315 ) -> ocl::Result<FiltersConvolution<T>> {
316 self.0
317 .with_filters(&filters.into(), None)
318 .map(FiltersConvolution)
319 }
320
321 /// Returns the convolution with pinned filter / filter bias memory.
322 ///
323 /// # Errors
324 ///
325 /// Proxies OpenCL initialization errors.
326 pub fn with_biased_filters<'a>(
327 self,
328 filters: impl Into<ArrayView4<'a, T>>,
329 filter_biases: &[T::Acc],
330 ) -> ocl::Result<FiltersConvolution<T>> {
331 self.0
332 .with_filters(&filters.into(), Some(filter_biases))
333 .map(FiltersConvolution)
334 }
335
336 /// Performs convolution on the provided `signal` and `filters`.
337 ///
338 /// # Parameters
339 ///
340 /// - `filters` must have `MxK_HxK_WxC` layout, where `M` is the number of filters,
341 /// `K_H` and `K_W` are spatial dimensions of a filter, `C` is the number of input channels.
342 ///
343 /// # Return value
344 ///
345 /// The output will have the same layout as `signal`. An error means something wrong
346 /// with OpenCL.
347 ///
348 /// # Panics
349 ///
350 /// - Panics if `filters` do not have expected spatial dimensions, i.e.,
351 /// `self.size() x self.size()`.
352 /// - Panics if the number of input channels differs from number of channels in `filters`.
353 ///
354 /// # Errors
355 ///
356 /// Proxies OpenCL initialization errors.
357 pub fn compute<'a>(
358 &self,
359 signal: FeatureMap<'_, T>,
360 filters: impl Into<ArrayView4<'a, T>>,
361 ) -> ocl::Result<Array4<T>> {
362 self.0.compute(signal, &filters.into(), None)
363 }
364
365 /// Performs convolution on the provided `signal` and `filters`, with the output offset
366 /// by the provided per-filter biases.
367 ///
368 /// Parameters, return value and panics are the same as for [`Self::compute()`].
369 ///
370 /// # Errors
371 ///
372 /// Proxies OpenCL initialization errors.
373 pub fn compute_with_biases<'a>(
374 &self,
375 signal: FeatureMap<'_, T>,
376 filters: impl Into<ArrayView4<'a, T>>,
377 filter_biases: &[T::Acc],
378 ) -> ocl::Result<Array4<T>> {
379 self.0.compute(signal, &filters.into(), Some(filter_biases))
380 }
381}
382
/// Convolution with pinned filters memory. Pinning memory increases efficiency at the cost
/// of making the convolution less flexible.
///
/// `FiltersConvolution` can be created by calling [`with_filters()`](Convolution::with_filters())
/// or [`with_biased_filters()`](Convolution::with_biased_filters()) methods in `Convolution`.
/// Signal and output memory can be pinned as well by calling [`pin()`](Self::pin()).
pub struct FiltersConvolution<T: ConvElement>(Base<Filters<T>>);
389
390impl<T> fmt::Debug for FiltersConvolution<T>
391where
392 T: ConvElement,
393 T::Params: fmt::Debug,
394{
395 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
396 formatter
397 .debug_tuple("FiltersConvolution")
398 .field(&self.0)
399 .finish()
400 }
401}
402
403impl<T: ConvElement> FiltersConvolution<T> {
404 /// Spatial size of the convolution.
405 pub fn size(&self) -> u32 {
406 self.0.size()
407 }
408
409 /// Returns general parameters of the convolution.
410 pub fn params(&self) -> T::Params {
411 self.0.params()
412 }
413
414 /// Sets convolution parameters.
415 ///
416 /// # Errors
417 ///
418 /// Proxies OpenCL initialization errors.
419 pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
420 self.0.set_params(params)
421 }
422
423 /// Pins signal and output memory for this convolution.
424 ///
425 /// # Errors
426 ///
427 /// Proxies OpenCL initialization errors.
428 pub fn pin(self, signal_shape: FeatureMapShape) -> ocl::Result<PinnedConvolution<T>> {
429 self.0.pinned(signal_shape).map(PinnedConvolution)
430 }
431
432 /// Computes the convolution on the provided signal.
433 ///
434 /// # Errors
435 ///
436 /// Proxies OpenCL initialization errors.
437 pub fn compute(&self, signal: FeatureMap<'_, T>) -> ocl::Result<Array4<T>> {
438 self.0.compute(signal)
439 }
440}
441
/// Convolution with pinned memory for filters, signal and output. Pinning memory increases
/// efficiency at the cost of making the convolution less flexible.
///
/// `PinnedConvolution` can be created from a [`FiltersConvolution`] by calling
/// [`pin()`](FiltersConvolution::pin()). The signal shape is fixed at that point:
/// [`compute()`](Self::compute()) panics if the signal dimensions do not agree
/// with the ones provided to `pin()`.
pub struct PinnedConvolution<T: ConvElement>(Base<Pinned<T>>);
448
449impl<T> fmt::Debug for PinnedConvolution<T>
450where
451 T: ConvElement,
452 T::Params: fmt::Debug,
453{
454 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
455 formatter
456 .debug_tuple("PinnedConvolution")
457 .field(&self.0)
458 .finish()
459 }
460}
461
462impl<T: ConvElement> PinnedConvolution<T> {
463 /// Spatial size of the convolution.
464 pub fn size(&self) -> u32 {
465 self.0.size()
466 }
467
468 /// Returns general parameters of the convolution.
469 pub fn params(&self) -> T::Params {
470 self.0.params()
471 }
472
473 /// Sets convolution parameters.
474 ///
475 /// # Errors
476 ///
477 /// Proxies OpenCL initialization errors.
478 pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
479 self.0.set_params(params)
480 }
481
482 /// Computes the convolution on the provided signal.
483 ///
484 /// # Panics
485 ///
486 /// - Panics if signal dimensions do not agree with the ones provided
487 /// to the [`pin()` method](FiltersConvolution::pin()).
488 ///
489 /// # Errors
490 ///
491 /// Proxies OpenCL initialization errors.
492 pub fn compute(&self, signal: FeatureMap<'_, T>) -> ocl::Result<Array4<T>> {
493 self.0.compute(signal)
494 }
495}
496
// Includes code samples from the readme file as doc tests. The `cfg(doctest)` gate
// makes the macro call compile only when doc tests are being built.
#[cfg(doctest)]
doc_comment::doctest!("../README.md");