1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
use ndarray::{Array4, ArrayView4};
use ocl::{
    builders::KernelBuilder, prm::Uint3, Buffer, Context, Device, Kernel, Platform, ProQue,
    Program, Queue,
};

use std::{convert::TryFrom, marker::PhantomData, sync::Mutex};

use crate::{
    buffers::{FeatureMap, FeatureMapShape, Filters, InputAndOutput, Pinned},
    params::{OutputParams, Params, WithParams},
    ConvElement,
};

/// Convolution builder. The same builder can be used to create multiple `Convolution`s
/// which share the same spatial size.
///
/// A builder can be created using [`Convolution::f32()`](crate::Convolution::f32()) or
/// [`Convolution::i8()`](crate::Convolution::i8()) methods.
#[derive(Debug)]
pub struct ConvolutionBuilder<T> {
    /// Compiled OpenCL program bundled with the context and command queue it was built for.
    program: ProQue,
    /// Filter size, passed to the OpenCL program as the `FILTER_SIZE` definition.
    /// The constructor rejects even values.
    filter_size: u32,
    /// Zero-sized marker tying the builder to the element type `T`.
    _element_type: PhantomData<T>,
}

impl<T: ConvElement> ConvolutionBuilder<T> {
    /// Initializes a builder with a specific filter size.
    ///
    /// The OpenCL `source` is compiled with a `FILTER_SIZE` definition set to `filter_size`,
    /// followed by every `(name, value)` pair from `defines`. The program is built for
    /// the first device of the first available OpenCL platform.
    ///
    /// # Panics
    ///
    /// Panics if `filter_size` is even, or if it does not fit into `i32`.
    ///
    /// # Errors
    ///
    /// Returns an error if the platform or device lookup, the context or queue creation,
    /// or the program compilation fails.
    pub(crate) fn new(
        filter_size: u32,
        defines: &[(&'static str, i32)],
        source: &str,
    ) -> ocl::Result<Self> {
        // For some reason, certain OpenCL implementations (e.g., POCL) do not work well
        // when the list of devices for a platform is queried from multiple threads.
        // Hence, we introduce a `Mutex` to serialize these calls.
        static MUTEX: Mutex<()> = Mutex::new(());

        assert_eq!(
            filter_size % 2,
            1,
            "Even convolution sizes are not supported"
        );

        let mut program_builder = Program::builder();
        program_builder.cmplr_def(
            "FILTER_SIZE",
            i32::try_from(filter_size).expect("Cannot convert filter size to i32"),
        );
        for &(name, value) in defines {
            program_builder.cmplr_def(name, value);
        }
        program_builder.source(source);

        let (platform, device) = {
            // The mutex guards no data (`()`), so a poisoned lock carries no broken state.
            // Recover the guard instead of discarding it; otherwise a single panic in
            // another thread would silently disable the serialization for good.
            let _lock = MUTEX
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            let platform = Platform::first()?;
            (platform, Device::first(platform)?)
        };

        let context = Context::builder()
            .platform(platform)
            .devices(device)
            .build()?;
        let program = ProQue::new(
            context.clone(),
            Queue::new(&context, device, None)?,
            program_builder.build(&context)?,
            None::<usize>,
        );

        Ok(Self {
            program,
            filter_size,
            _element_type: PhantomData,
        })
    }

    /// Starts building the `conv` kernel of the compiled program.
    fn kernel_builder(&self) -> KernelBuilder<'_> {
        self.program.kernel_builder("conv")
    }
}

/// Allocates input / output buffers for a signal of the given shape and binds them
/// to the kernel of `conv`.
///
/// # Panics
///
/// Panics if the number of signal channels differs from the filter channel count
/// multiplied by the number of groups in the convolution parameters.
///
/// # Errors
///
/// Propagates errors from buffer creation and from passing the buffers to the kernel.
fn create_io<T: ConvElement, U: WithParams>(
    signal_shape: FeatureMapShape,
    filters: &Filters<T>,
    conv: &Base<U>,
) -> ocl::Result<InputAndOutput<T>> {
    let groups = Into::<Params>::into(conv.params).groups;
    let expected_channels = filters.channel_count() * groups;
    assert_eq!(
        signal_shape.channels, expected_channels,
        "Channel dimensionality in signal and filters must agree"
    );

    let buffers = InputAndOutput::new(signal_shape, filters.filter_count(), conv)?;
    buffers.pass_as_arguments(&conv.kernel)?;
    Ok(buffers)
}

/// Shared state of a convolution: the OpenCL kernel together with its parameters and
/// whichever buffers are attached at the current stage.
///
/// The type parameter selects the stage. In this file it is instantiated with
/// `PhantomData<_>` (no buffers attached), `Filters<_>` (filters attached) and
/// `Pinned<_>` (I/O buffers allocated for a fixed signal shape).
#[derive(Debug)]
pub(crate) struct Base<T: WithParams> {
    /// Filter size, copied from the builder that created this instance.
    size: u32,
    /// Convolution parameters; also passed to the kernel as its `params` argument.
    params: T::Params,
    /// OpenCL kernel executing the convolution.
    kernel: Kernel,
    /// Stage-specific buffers (see the type-level docs).
    buffers: T,
    /// OpenCL context, cloned from the builder's program.
    context: Context,
}

impl<T: WithParams> Base<T> {
    /// Returns the OpenCL kernel used by this convolution.
    pub fn kernel(&self) -> &Kernel {
        &self.kernel
    }

    /// Returns the default command queue of the kernel.
    ///
    /// # Panics
    ///
    /// Panics if the kernel has no default queue.
    pub fn queue(&self) -> &Queue {
        self.kernel
            .default_queue()
            .expect("kernel must come with a pre-configured queue")
    }

    /// Returns the filter size.
    pub fn size(&self) -> u32 {
        self.size
    }

    /// Returns the current convolution parameters.
    pub fn params(&self) -> T::Params {
        self.params
    }

    /// Replaces the convolution parameters and passes them to the kernel
    /// as the `params` argument.
    ///
    /// # Errors
    ///
    /// Returns an error if the kernel argument cannot be set. In this case,
    /// the stored parameters are left unchanged.
    pub fn set_params(&mut self, params: T::Params) -> ocl::Result<()> {
        // Update the kernel first: if this fails, the host-side copy must keep
        // describing what the kernel actually holds.
        self.kernel
            .set_arg("params", Into::<T::ClParams>::into(params))?;
        self.params = params;
        Ok(())
    }
}

impl<T: ConvElement> Base<PhantomData<T>> {
    /// Creates a convolution with no buffers attached.
    ///
    /// The kernel is built with all of its buffer arguments (`output`, `signal`, `filters`,
    /// `filter_biases`) unset; they must be bound before the kernel is executed.
    ///
    /// # Errors
    ///
    /// Returns an error if the kernel cannot be built.
    pub fn new(builder: &ConvolutionBuilder<T>, params: T::Params) -> ocl::Result<Self> {
        let kernel = builder
            .kernel_builder()
            .arg_named("output", None::<&Buffer<T>>)
            .arg_named("out_params", OutputParams::default())
            .arg_named("signal", None::<&Buffer<T>>)
            .arg_named("signal_dims", Uint3::new(0, 0, 0))
            .arg_named("filters", None::<&Buffer<T>>)
            .arg_named("filter_biases", None::<&Buffer<T::Acc>>)
            .arg_named("params", Into::<T::ClParams>::into(params))
            .build()?;
        Ok(Base {
            size: builder.filter_size,
            params,
            kernel,
            buffers: PhantomData,
            context: builder.program.context().clone(),
        })
    }

    /// Attaches filters (and optional biases) to the convolution.
    ///
    /// # Errors
    ///
    /// Returns an error if the filter buffers cannot be created or bound to the kernel.
    pub fn with_filters(
        self,
        filters: ArrayView4<'_, T>,
        filter_biases: Option<&[T::Acc]>,
    ) -> ocl::Result<Base<Filters<T>>> {
        let filters = Filters::new(filters, filter_biases, &self)?;
        // Bind the filters to the kernel here: the later stages only (re)bind the
        // input / output buffers, so without this call the `filters` and
        // `filter_biases` kernel arguments would stay unset.
        filters.pass_as_arguments(&self.kernel)?;
        Ok(Base {
            buffers: filters,
            size: self.size,
            params: self.params,
            kernel: self.kernel,
            context: self.context,
        })
    }

    /// Computes the convolution of `signal` with one-off `filters`.
    ///
    /// The first axis of `filters` is taken as the filter count and the last axis
    /// as the channel count.
    ///
    /// # Panics
    ///
    /// Panics if the number of signal channels differs from the filter channel count
    /// multiplied by the number of groups, or if a filter dimension does not fit into `u32`.
    ///
    /// # Errors
    ///
    /// Propagates errors from buffer creation, argument binding and kernel execution.
    pub fn compute(
        &self,
        signal: FeatureMap<'_, T>,
        filters: ArrayView4<'_, T>,
        filter_biases: Option<&[T::Acc]>,
    ) -> ocl::Result<Array4<T>> {
        let filter_channels =
            u32::try_from(filters.shape()[3]).expect("Cannot convert filter dimension to `u32`");
        assert_eq!(
            signal.shape().channels,
            filter_channels * Into::<Params>::into(self.params).groups,
            "Channel dimensionality in signal and filters must agree"
        );

        let filter_count =
            u32::try_from(filters.shape()[0]).expect("Cannot convert filter count to `u32`");
        let filters = Filters::new(filters, filter_biases, self)?;
        filters.pass_as_arguments(&self.kernel)?;
        let io = InputAndOutput::new(signal.shape(), filter_count, self)?;
        io.write_signal(signal)?;
        io.pass_as_arguments(&self.kernel)?;
        io.execute(&self.kernel, signal.layout())
    }
}

impl<T: ConvElement> Base<Filters<T>> {
    /// Converts the convolution into one with input / output buffers allocated once
    /// for the given signal shape.
    ///
    /// # Panics
    ///
    /// Panics if the channel count of `signal_shape` does not agree with the filters.
    ///
    /// # Errors
    ///
    /// Propagates errors from creating or binding the input / output buffers.
    pub fn pinned(self, signal_shape: FeatureMapShape) -> ocl::Result<Base<Pinned<T>>> {
        let pinned = Pinned {
            io: create_io(signal_shape, &self.buffers, &self)?,
            signal_shape,
        };
        // NOTE(review): the filter buffers (`self.buffers`) are not carried over and are
        // dropped when this function returns -- confirm that the kernel keeps its buffer
        // arguments alive.
        let Base {
            size,
            params,
            kernel,
            context,
            ..
        } = self;
        Ok(Base {
            size,
            params,
            kernel,
            buffers: pinned,
            context,
        })
    }

    /// Computes the convolution of `signal` with the attached filters, allocating
    /// fresh input / output buffers for this call.
    ///
    /// # Panics
    ///
    /// Panics if the channel count of the signal does not agree with the filters.
    ///
    /// # Errors
    ///
    /// Propagates errors from buffer creation, signal upload and kernel execution.
    pub fn compute(&self, signal: FeatureMap<'_, T>) -> ocl::Result<Array4<T>> {
        let buffers = create_io(signal.shape(), &self.buffers, self)?;
        buffers.write_signal(signal)?;
        let layout = signal.layout();
        buffers.execute(&self.kernel, layout)
    }
}

impl<T: ConvElement> Base<Pinned<T>> {
    /// Computes the convolution of `signal`, reusing the pinned input / output buffers.
    ///
    /// # Panics
    ///
    /// Panics if the shape of `signal` differs from the shape supplied when pinning.
    ///
    /// # Errors
    ///
    /// Propagates errors from signal upload and kernel execution.
    pub fn compute(&self, signal: FeatureMap<'_, T>) -> ocl::Result<Array4<T>> {
        let pinned = &self.buffers;
        assert_eq!(
            signal.shape(),
            pinned.signal_shape,
            "Signal dimensions differ from the ones set when pinning signal memory"
        );

        pinned.io.write_signal(signal)?;
        let layout = signal.layout();
        pinned.io.execute(&self.kernel, layout)
    }
}