iOS 10和macOS中的卷积神经网络

  // Pad the kernel using the pad(sequence:other:) helper (defined outside this excerpt).
  // NOTE(review): presumably this zero-extends `kernel` so it matches the padded length of `x` — confirm against pad's definition.
  let paddedK: [Float] = pad(sequence: kernel, other: x)

  现在,我们可以建立paddedX和paddedK之间的一个卷积:

  最后,卷积的结果是:

  // y = [1, 4, 10, 16, 22, 22, 15]

  Accelerate的卷积

  如果你想加速卷积处理,你可以使用Accelerate框架提供的vDSP_conv函数。同样,我需要处理边界条件和核反转。这一次,我对输入数组和核换个零填充的方式。另外,我需要反转核(文档里有解释),否则我得到的是两个序列的相关性。

  以下是用Accelerate的实现:

  import Accelerate

  // Input signal and its length M.
  let x: [Float] = [1, 2, 3, 4, 5], M = x.count

  // Convolution kernel and its length N.
  let kernel: [Float] = [1, 2, 3], N = kernel.count

  // A full linear convolution of the two sequences yields N + M - 1 samples.
  let T = N+M-1

  // Output buffer that vDSP_conv fills in.
  var res = [Float](repeating: 0, count: T)

  // Zero-pad the input with N-1 zeros on both sides so that each of the T output
  // samples has a complete N-element window (vDSP_conv reads T + N - 1 inputs).
  let zeros = [Float](repeating: 0, count: N-1)
  let newXin = zeros + x + zeros

  // With a positive filter stride vDSP_conv computes a correlation, so the kernel
  // is reversed up front to obtain a true convolution. `reversed()` returns a new
  // sequence; the in-place `reverse()` cannot be called on a `let` array and
  // returns Void, so it cannot be passed as the filter argument.
  let reversedKernel = Array(kernel.reversed())

  vDSP_conv(newXin, 1, reversedKernel, 1, &res, 1, vDSP_Length(T), vDSP_Length(N))
  // res == [1, 4, 10, 16, 22, 22, 15]

  对于这么短的输入序列,你感受不到Accelerate框架带来的加速。但如果我创建一个包含100,000个元素的输入数组,并用与之前示例相同的核w对它进行卷积,那么在我的MacBook Pro上,Swift的实现需要318 ms,而Accelerate的vDSP_conv方法只要159 ns。

  Metal的卷积

  让我们看一下如何用Metal实现相同的例子。请参阅这篇文章,学习如何配置一个用于GPU计算的Metal项目。

  在这个特殊的例子中,我们需要创建3个Metal纹理(遵守MTLTexture协议的对象):第一个纹理存储输入序列,第二个纹理存储核,第三个纹理存储最终结果。

  以下是创建这些纹理的源代码:

  import Metal

  // Append zeros to both sequences: N-1 after the input, M-1 after the kernel.
  let paddedX: [Float] = input + [Float](repeatElement(0, count: N-1))
  let paddedK: [Float] = kernel + [Float](repeatElement(0, count: M-1))

  // Every texture here is a single row of 32-bit float texels; only the width varies.
  let makeRowDescriptor = { (width: Int) -> MTLTextureDescriptor in
    return MTLTextureDescriptor.texture2DDescriptor(with: .r32Float, width: width, height: 1, mipmapped: false)
  }
  // Size in bytes of one texel, used to compute bytesPerRow for the uploads below.
  let bytesPerTexel = sizeof(Float32.self)

  // Input texture: read by the shader, filled with the padded input sequence.
  let inputTextureDescriptor = makeRowDescriptor(paddedX.count)
  inputTextureDescriptor.usage = .shaderRead
  inTexture = metalContext.device.newTexture(with: inputTextureDescriptor)
  let region = MTLRegionMake2D(0, 0, paddedX.count, 1)
  inTexture?.replace(region, mipmapLevel: 0, withBytes: paddedX, bytesPerRow: paddedX.count * bytesPerTexel)

  // Kernel texture: filled with the padded kernel (descriptor usage left at its default).
  let kernelTextureDescriptor = makeRowDescriptor(paddedK.count)
  kernelTexture = metalContext.device.newTexture(with: kernelTextureDescriptor)
  let kernelRegion = MTLRegionMake2D(0, 0, paddedK.count, 1)
  kernelTexture?.replace(kernelRegion, mipmapLevel: 0, withBytes: paddedK, bytesPerRow: paddedK.count * bytesPerTexel)

  // Output texture: written by the shader, same width as the padded input.
  let outputTextureDescriptor = makeRowDescriptor(paddedX.count)
  outputTextureDescriptor.usage = .shaderWrite
  outTexture = metalContext.device.newTexture(with: outputTextureDescriptor)

  // All three textures exist; encode and submit the GPU work.
  executeConvolution()

  在前面的源代码里,metalContext是下面的类的一个实例:

  /// Helper that bundles the main objects of a Metal stack: the device,
  /// a command queue created from it, and the device's default library.
  final class MetalContext: NSObject {

    /// The system default Metal device.
    let device: MTLDevice

    /// Command queue created from `device`.
    let commandQueue: MTLCommandQueue

    /// Default library obtained from `device`.
    let library: MTLLibrary

    /// Creates the context.
    ///
    /// Traps with a descriptive message (instead of a bare force-unwrap crash)
    /// when no default Metal device or no default library is available.
    override init() {
      // Get the device
      guard let device = MTLCreateSystemDefaultDevice() else {
        fatalError("MetalContext: no default Metal device is available")
      }
      self.device = device

      // Create a command queue
      self.commandQueue = device.newCommandQueue()

      // Get the default library
      guard let library = device.newDefaultLibrary() else {
        fatalError("MetalContext: the default Metal library could not be loaded")
      }
      self.library = library

      super.init()
    }
  }

  这只是一个助手类,我通常用来配置一个Metal栈的主要对象。

  最后一个executeConvolution()方法用来编码GPU命令:

  func executeConvolution() {

  guard let outTexture = self.outTexture else { return }

  let commandBuffer = metalContext.commandQueue.commandBuffer()

  let computeCommandEncoder = commandBuffer.computeCommandEncoder()

  computeCommandEncoder.setComputePipelineState(computePipelineState!)

  computeCommandEncoder.setTexture(inTexture, at: 0)

  computeCommandEncoder.setTexture(kernelTexture, at: 1)

  computeCommandEncoder.setTexture(outTexture, at: 2)

  computeCommandEncoder.dispatchThreadgroups(MTLSizeMake(T, 1, 1), threadsPerThreadgroup: MTLSizeMake(1, 1, 1))

  computeCommandEncoder.endEncoding()

  commandBuffer.commit()

  let region = MTLRegionMake1D(0, T)

  var buffer = [Float32](repeatElement(0, count: T))