ocl_convolution/
lib.rs

1//! OpenCL-accelerated 2D convolutions.
2//!
3//! [Convolution] is a fundamental building block in signal processing. This crate is focused
4//! on 2D convolutions (i.e., the signal is a still image) in the context of [deep learning]
5//! (more precisely, [convolutional neural networks][cnn]).
//! The deep-learning context means that a convolution may apply many (order of hundreds)
//! filters, and that the input may contain many channels (order of hundreds or thousands)
//! rather than the traditional 3 or 4. Such convolutions are computationally heavy and can be
//! effectively accelerated with the help of [OpenCL].
10//!
11//! # Features
12//!
13//! The crate implements convolutions on two numerical formats:
14//!
15//! - Single-precision floats (`f32`)
16//! - Signed 8-bit integers with 32-bit multiply-add accumulator (this format is frequently denoted
17//!   `int8/32` in deep learning literature). Quantization parameters are applied uniformly
18//!   to the entire layer.
19//!
20//! For both cases, dilated or grouped convolutions are supported.
21//!
22//! # Implementation details
23//!
//! The implementation uses an output-stationary workflow (see, e.g., [this paper] for
//! the definition); that is, each element of the output tensor is computed in a single run
//! of the OpenCL kernel. This minimizes memory overhead, but may not be the fastest algorithm.
27//!
28//! [Convolution]: https://en.wikipedia.org/wiki/Convolution
29//! [deep learning]: https://en.wikipedia.org/wiki/Deep_learning
30//! [cnn]: https://en.wikipedia.org/wiki/Convolutional_neural_network
31//! [OpenCL]: https://www.khronos.org/opencl/
32//! [this paper]: https://dl.acm.org/citation.cfm?id=3001177
33//!
34//! # Examples
35//!
36//! ## Floating-point convolution
37//!
38//! ```
39//! use ndarray::Array4;
40//! use rand::RngExt;
41//! use ocl_convolution::{Convolution, FeatureMap, Params};
42//!
43//! # fn main() -> Result<(), ocl::Error> {
44//! let convolution = Convolution::f32(3)?.build(Params {
45//!     strides: [1, 1],
46//!     pads: [0; 4],
47//!     dilation: [1, 1],
48//!     groups: 1,
49//! })?;
50//!
51//! // Generate random signal with 6x6 spatial dims and 3 channels.
52//! let mut rng = rand::rng();
53//! let signal = Array4::from_shape_fn([1, 6, 6, 3], |_| rng.random_range(-1.0..=1.0));
54//! // Construct two 3x3 spatial filters.
55//! let filters = Array4::from_shape_fn([2, 3, 3, 3], |_| rng.random_range(-1.0..=1.0));
56//! // Perform the convolution. The output must have 4x4 spatial dims
57//! // and contain 2 channels (1 per each filter). The output layout will
58//! // be the same as in the signal.
59//! let output = convolution.compute(
60//!     // `FeatureMap` wraps `ArrayView4` with information about
61//!     // memory layout (which is "channels-last" / NHWC in this case).
62//!     FeatureMap::nhwc(&signal),
63//!     &filters,
64//! )?;
65//! assert_eq!(output.shape(), [1, 4, 4, 2]);
66//!
67//! // For increased efficiency, we may pin filter memory.
68//! // This is especially useful when the same filters are convolved
69//! // with multiple signals.
70//! let convolution = convolution.with_filters(&filters)?;
71//! let new_output = convolution.compute(FeatureMap::nhwc(&signal))?;
72//! assert_eq!(output, new_output);
73//! # Ok(())
74//! # }
75//! ```
76//!
77//! ## Quantized convolution
78//!
79//! ```
80//! use ndarray::Array4;
81//! use rand::RngExt;
82//! use ocl_convolution::{Convolution, I8Params, FeatureMap, Params};
83//!
84//! # fn main() -> Result<(), ocl::Error> {
85//! const BIT_SHIFT: u8 = 16;
86//! let params = I8Params {
87//!     common: Params::default(),
88//!     // These params are found by profiling; here, they are
89//!     // chosen randomly.
90//!     bit_shift: BIT_SHIFT,
91//!     scale: I8Params::convert_scale(BIT_SHIFT, 0.1),
92//!     output_bias: -10,
93//!     signal_bias: 20,
94//!     filter_bias: -5,
95//! };
96//! let convolution = Convolution::i8(3)?.build(params)?;
97//!
98//! // Generate random signal with 6x6 spatial dims and 3 channels.
99//! let mut rng = rand::rng();
100//! let signal = Array4::from_shape_fn([1, 6, 6, 3], |_| rng.random_range(-127..=127));
101//! // Construct two 3x3 spatial filters.
102//! let filters = Array4::from_shape_fn([2, 3, 3, 3], |_| rng.random_range(-127..=127));
103//! // Perform the convolution. The output must have 4x4 spatial dims
104//! // and contain 2 channels (1 per each filter).
105//! let output = convolution.compute(
106//!     FeatureMap::nhwc(&signal),
107//!     &filters,
108//! )?;
109//! assert_eq!(output.shape(), [1, 4, 4, 2]);
110//! # Ok(())
111//! # }
112//! ```
113
114#![doc(html_root_url = "https://docs.rs/ocl-convolution/0.4.0")]
115
116use std::{fmt, marker::PhantomData};
117
118use ndarray::{Array4, ArrayView4};
119use ocl::OclPrm;
120
121use crate::{
122    base::Base,
123    buffers::{Filters, Pinned},
124};
125pub use crate::{
126    base::ConvolutionBuilder,
127    buffers::{FeatureMap, FeatureMapShape, Layout},
128    params::{I8Params, Params},
129};
130
131mod base;
132mod buffers;
133mod params;
134
135const SOURCE: &str = include_str!(concat!(env!("OUT_DIR"), "/conv.cl"));
136
137/// Supported element types for convolutions.
138pub trait ConvElement: OclPrm + Copy + 'static {
139    /// Type of the multiply-add accumulator.
140    type Acc: OclPrm + Copy + 'static;
141    /// Parameters of the convolution.
142    type Params: Copy + Into<Params> + Into<Self::ClParams>;
143    /// OpenCL-friendly version of parameters. This is considered an implementation detail.
144    type ClParams: OclPrm;
145}
146
147impl ConvElement for f32 {
148    type Acc = f32;
149    type Params = Params;
150    type ClParams = params::ClParams;
151}
152
153impl ConvElement for i8 {
154    type Acc = i32;
155    type Params = I8Params;
156    type ClParams = params::ClI8Params;
157}
158
159impl ConvolutionBuilder<f32> {
160    /// Creates a new floating-point convolution.
161    ///
162    /// # Errors
163    ///
164    /// Proxies OpenCL initialization errors.
165    pub fn build(&self, params: Params) -> ocl::Result<Convolution<f32>> {
166        Base::new(self, params).map(Convolution)
167    }
168}
169
170impl ConvolutionBuilder<i8> {
171    /// Creates a new quantized convolution.
172    ///
173    /// # Errors
174    ///
175    /// Proxies OpenCL initialization errors.
176    pub fn build(&self, params: I8Params) -> ocl::Result<Convolution<i8>> {
177        Base::new(self, params).map(Convolution)
178    }
179}
180
181/// Convolution without pinned memory.
182pub struct Convolution<T: ConvElement>(Base<PhantomData<T>>);
183
184impl<T> fmt::Debug for Convolution<T>
185where
186    T: ConvElement,
187    T::Params: fmt::Debug,
188{
189    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
190        formatter.debug_tuple("Convolution").field(&self.0).finish()
191    }
192}
193
194impl Convolution<f32> {
195    /// Creates a new floating-point convolution builder. `size` determines the filter size
196    /// and must be odd (1, 3, 5, ...).
197    ///
198    /// # Panics
199    ///
200    /// Panics if the filter `size` is even.
201    ///
202    /// # Errors
203    ///
204    /// Proxies OpenCL initialization errors.
205    pub fn f32(size: u32) -> ocl::Result<ConvolutionBuilder<f32>> {
206        ConvolutionBuilder::new(size, &[("KERNEL_TYPE", 32)], SOURCE)
207    }
208}
209
210/// Quantized convolution over signed 8-bit integers.
211///
212/// Due to use of `i8` inputs, computations are performed much faster than on `f32` inputs
213/// (the difference manifests most on the specialized hardware, but it is seen in this
214/// OpenCL-powered implementation as well).
215///
216/// ## Connection to real-value convolution
217///
218/// Quantized convolution mirrors real-valued convolution in which `i8` elements
219/// of the signal, filter and output tensors represent real-valued numbers with the
220/// following mapping:
221///
222/// ```
223/// let scale: f32 = // ...
224/// # 1.0;
225/// let bias: i32 = // ...
226/// # 0; drop(
227/// |x: i8| -> f32 { scale * (i32::from(x) - bias) as f32 }
228/// # )
229/// ```
230///
231/// `scale` and `bias` may differ for different tensors; these params are usually determined
232/// by *profiling* the corresponding convolutional neural network (see e.g. [this paper]).
233///
234/// Denote these quantization params for tensor `T` as `T.scale` and `T.bias`. Denote `S`
235/// the signal, `F` the filter, `O` the output. Convolution parameters must be set as follows:
236///
237/// | `I8Params` field | Value     |
238/// |------------------|-----------|
239/// | `signal_bias`    | `-S.bias` |
240/// | `filter_bias`    | `-F.bias` |
241/// | `output_bias`    | `O.bias`  |
242/// | `scale`          | `S.scale * F.scale / O.scale` |
243///
244/// `scale` is represented as a fixed-point number with [`bit_shift`] binary digits after
245/// the point. Note that filter biases `B` are not transformed during the computation.
246///
247/// # Computing convolution
248///
249/// Suppose `S` is the signal and `F` is the filter tensor; both contain `i8` values.
250/// The computation is performed as follows:
251///
252/// 1. Unbias the signal: `S := S + params.signal_bias`.
253/// 2. Unbias the filters: `F := F + params.filter_bias`.
254/// 3. Compute "standard" convolution output `O := S (*) F` using `i32` precision.
255/// 4. Upscale each number in the output: `O := O * params.scale`.
256/// 5. If there is filter bias `B` provided, apply bias to the output per each output channel:
257///    `O[f, ..] := O[f, ..] + B[f]`.
258/// 6. Downscale the output: `O := round(O / 2**self.bit_shift)`,
259///    where `round()` works as floating-point rounding with the default mode
260///    (round to nearest, ties to even).
261/// 7. Apply output bias: `O := O + params.output_bias`.
262/// 8. Saturate output to `i8` range.
263///
264/// [`bit_shift`]: I8Params::bit_shift
265/// [this paper]: https://arxiv.org/abs/1805.00907
266impl Convolution<i8> {
267    /// Creates a new `i8` convolution builder. `size` determines the filter size
268    /// and must be odd (1, 3, 5, ...).
269    ///
270    /// # Panics
271    ///
272    /// Panics if the filter `size` is even.
273    ///
274    /// # Errors
275    ///
276    /// Proxies OpenCL initialization errors.
277    pub fn i8(size: u32) -> ocl::Result<ConvolutionBuilder<i8>> {
278        ConvolutionBuilder::new(size, &[("KERNEL_TYPE", 8)], SOURCE)
279    }
280}
281
282impl<T: ConvElement> Convolution<T> {
283    /// Spatial size of the convolution.
284    pub fn size(&self) -> u32 {
285        self.0.size()
286    }
287
288    /// Returns general parameters of the convolution.
289    pub fn params(&self) -> T::Params {
290        self.0.params()
291    }
292
293    /// Sets convolution parameters.
294    ///
295    /// # Errors
296    ///
297    /// Proxies OpenCL initialization errors.
298    pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
299        self.0.set_params(params)
300    }
301
302    /// Returns the convolution with pinned filter memory.
303    ///
304    /// # Parameters
305    ///
306    /// - `filters` must have `MxK_HxK_WxC` layout, where `M` is the number of filters,
307    ///   `K_H` and `K_W` are spatial dimensions of a filter, `C` is the number of input channels.
308    ///
309    /// # Errors
310    ///
311    /// Proxies OpenCL initialization errors.
312    pub fn with_filters<'a>(
313        self,
314        filters: impl Into<ArrayView4<'a, T>>,
315    ) -> ocl::Result<FiltersConvolution<T>> {
316        self.0
317            .with_filters(&filters.into(), None)
318            .map(FiltersConvolution)
319    }
320
321    /// Returns the convolution with pinned filter / filter bias memory.
322    ///
323    /// # Errors
324    ///
325    /// Proxies OpenCL initialization errors.
326    pub fn with_biased_filters<'a>(
327        self,
328        filters: impl Into<ArrayView4<'a, T>>,
329        filter_biases: &[T::Acc],
330    ) -> ocl::Result<FiltersConvolution<T>> {
331        self.0
332            .with_filters(&filters.into(), Some(filter_biases))
333            .map(FiltersConvolution)
334    }
335
336    /// Performs convolution on the provided `signal` and `filters`.
337    ///
338    /// # Parameters
339    ///
340    /// - `filters` must have `MxK_HxK_WxC` layout, where `M` is the number of filters,
341    ///   `K_H` and `K_W` are spatial dimensions of a filter, `C` is the number of input channels.
342    ///
343    /// # Return value
344    ///
345    /// The output will have the same layout as `signal`. An error means something wrong
346    /// with OpenCL.
347    ///
348    /// # Panics
349    ///
350    /// - Panics if `filters` do not have expected spatial dimensions, i.e.,
351    ///   `self.size() x self.size()`.
352    /// - Panics if the number of input channels differs from number of channels in `filters`.
353    ///
354    /// # Errors
355    ///
356    /// Proxies OpenCL initialization errors.
357    pub fn compute<'a>(
358        &self,
359        signal: FeatureMap<'_, T>,
360        filters: impl Into<ArrayView4<'a, T>>,
361    ) -> ocl::Result<Array4<T>> {
362        self.0.compute(signal, &filters.into(), None)
363    }
364
365    /// Performs convolution on the provided `signal` and `filters`, with the output offset
366    /// by the provided per-filter biases.
367    ///
368    /// Parameters, return value and panics are the same as for [`Self::compute()`].
369    ///
370    /// # Errors
371    ///
372    /// Proxies OpenCL initialization errors.
373    pub fn compute_with_biases<'a>(
374        &self,
375        signal: FeatureMap<'_, T>,
376        filters: impl Into<ArrayView4<'a, T>>,
377        filter_biases: &[T::Acc],
378    ) -> ocl::Result<Array4<T>> {
379        self.0.compute(signal, &filters.into(), Some(filter_biases))
380    }
381}
382
383/// Convolution with pinned filters memory. Pinning memory increases efficiency at the cost
384/// of making the convolution less flexible.
385///
386/// `FiltersConvolution` can be created by calling [`with_filters()`](Convolution::with_filters())
387/// or [`with_biased_filters()`](Convolution::with_biased_filters()) methods in `Convolution`.
388pub struct FiltersConvolution<T: ConvElement>(Base<Filters<T>>);
389
390impl<T> fmt::Debug for FiltersConvolution<T>
391where
392    T: ConvElement,
393    T::Params: fmt::Debug,
394{
395    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
396        formatter
397            .debug_tuple("FiltersConvolution")
398            .field(&self.0)
399            .finish()
400    }
401}
402
403impl<T: ConvElement> FiltersConvolution<T> {
404    /// Spatial size of the convolution.
405    pub fn size(&self) -> u32 {
406        self.0.size()
407    }
408
409    /// Returns general parameters of the convolution.
410    pub fn params(&self) -> T::Params {
411        self.0.params()
412    }
413
414    /// Sets convolution parameters.
415    ///
416    /// # Errors
417    ///
418    /// Proxies OpenCL initialization errors.
419    pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
420        self.0.set_params(params)
421    }
422
423    /// Pins signal and output memory for this convolution.
424    ///
425    /// # Errors
426    ///
427    /// Proxies OpenCL initialization errors.
428    pub fn pin(self, signal_shape: FeatureMapShape) -> ocl::Result<PinnedConvolution<T>> {
429        self.0.pinned(signal_shape).map(PinnedConvolution)
430    }
431
432    /// Computes the convolution on the provided signal.
433    ///
434    /// # Errors
435    ///
436    /// Proxies OpenCL initialization errors.
437    pub fn compute(&self, signal: FeatureMap<'_, T>) -> ocl::Result<Array4<T>> {
438        self.0.compute(signal)
439    }
440}
441
442/// Convolution with pinned memory for filters, signal and output. Pinning memory increases
443/// efficiency at the cost of making the convolution less flexible.
444///
445/// `PinnedConvolution` can be created from a [`FiltersConvolution`] by calling
446/// [`pin()`](FiltersConvolution::pin()).
447pub struct PinnedConvolution<T: ConvElement>(Base<Pinned<T>>);
448
449impl<T> fmt::Debug for PinnedConvolution<T>
450where
451    T: ConvElement,
452    T::Params: fmt::Debug,
453{
454    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
455        formatter
456            .debug_tuple("PinnedConvolution")
457            .field(&self.0)
458            .finish()
459    }
460}
461
462impl<T: ConvElement> PinnedConvolution<T> {
463    /// Spatial size of the convolution.
464    pub fn size(&self) -> u32 {
465        self.0.size()
466    }
467
468    /// Returns general parameters of the convolution.
469    pub fn params(&self) -> T::Params {
470        self.0.params()
471    }
472
473    /// Sets convolution parameters.
474    ///
475    /// # Errors
476    ///
477    /// Proxies OpenCL initialization errors.
478    pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
479        self.0.set_params(params)
480    }
481
482    /// Computes the convolution on the provided signal.
483    ///
484    /// # Panics
485    ///
486    /// - Panics if signal dimensions do not agree with the ones provided
487    ///   to the [`pin()` method](FiltersConvolution::pin()).
488    ///
489    /// # Errors
490    ///
491    /// Proxies OpenCL initialization errors.
492    pub fn compute(&self, signal: FeatureMap<'_, T>) -> ocl::Result<Array4<T>> {
493        self.0.compute(signal)
494    }
495}
496
// Only when running doctests: treat the code samples in the README as doctests of this crate.
#[cfg(doctest)]
doc_comment::doctest!("../README.md");