#pragma once #include #include namespace at::native::quantized { namespace { // MakeConvOutputShape used from both CPU and CUDA libraries // and exporting symbol from torch_cpu would probably take more storage // than duplicating implementation which likely be inlined away template at::SmallVector MakeConvOutputShape( int N, // mini-batch int M, // output channels const std::array& input_image_shape, const std::vector& kernel, const torch::List& stride, const torch::List& padding, const torch::List& dilation); #if defined(USE_CUDA) || defined(USE_PYTORCH_QNNPACK) template <> at::SmallVector MakeConvOutputShape<2>( int N, // mini-batch int M, // output channels const std::array& input_image_shape, const std::vector& kernel, const at::List& stride, const at::List& padding, const at::List& dilation) { const int H = input_image_shape[0]; const int W = input_image_shape[1]; const int64_t Y_H = (H + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; const int64_t Y_W = (W + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; return {N, M, Y_H, Y_W}; } template <> at::SmallVector MakeConvOutputShape<3>( int N, // mini-batch int M, // output channels const std::array& input_image_shape, const std::vector& kernel, const at::List& stride, const at::List& padding, const torch::List& dilation) { const int D = input_image_shape[0]; const int H = input_image_shape[1]; const int W = input_image_shape[2]; const int64_t Y_D = (D + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; const int64_t Y_H = (H + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; const int64_t Y_W = (W + 2 * padding[2] - dilation[2] * (kernel[2] - 1) - 1) / stride[2] + 1; return {N, M, Y_D, Y_H, Y_W}; } #endif } // anonymous namespace } // namespace at::native::quantized