/*
 * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
#if defined(_MSC_VER)
#pragma message("crt/device_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
#else
#warning "crt/device_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
#endif
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
#endif

#if !defined(__DEVICE_FUNCTIONS_HPP__)
#define __DEVICE_FUNCTIONS_HPP__

/*******************************************************************************
*                                                                              *
*                                                                              *
*                                                                              *
*******************************************************************************/

#if defined(__cplusplus) && defined(__CUDACC__)

#if defined(__CUDACC_RTC__)
#define __DEVICE_FUNCTIONS_DECL__ __device__
#define __DEVICE_FUNCTIONS_STATIC_DECL__ __device__
#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ __device__ __host__ __cudart_builtin__
#else
#define __DEVICE_FUNCTIONS_DECL__ __device__
#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__
#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __host__ __cudart_builtin__
#endif /* __CUDACC_RTC__ */

#include "builtin_types.h"
#include "device_types.h"
#include "host_defines.h"

#undef __DEVICE_FUNCTIONS_DECL__
#undef __DEVICE_FUNCTIONS_STATIC_DECL__

#endif /* __cplusplus && __CUDACC__ */

/*******************************************************************************
*                                                                              *
*                                                                              *
*                                                                              *
*******************************************************************************/

#ifdef __CUDACC__
# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
#define __CUDA_AND_AT_LEAST_SM_90__
#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) */
# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
#define __CUDA_AND_AT_LEAST_SM_70__
#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) */
# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
#define __CUDA_AND_AT_LEAST_SM_75__
#endif /* defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) */
#endif /* __CUDACC__ */

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax_s32_relu(const int a, const int b){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int res;
  asm("{max.s32.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
  return res;
#else
  // Host and older architecture code
  int ans = max(a, b);

  return (ans > 0) ? ans : 0;
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax_s16x2_relu(const unsigned int a, const unsigned int b){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm("{max.s16x2.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
#elif defined(__CUDA_ARCH__)
  res = __vmaxs2(__vmaxs2(a, b), 0U);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  // Get answer
  short ansS_lo = (short)max(aS_lo, bS_lo);
  short ansS_hi = (short)max(aS_hi, bS_hi);

  // relu
  if(ansS_lo < 0){ ansS_lo = 0; }
  if(ansS_hi < 0){ ansS_hi = 0; }

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin_s32_relu(const int a, const int b){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int res;
  asm("{min.s32.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
  return res;
#else
  // Host and older architecture code
    int ans = min(a, b);
    
    return (ans > 0) ? ans : 0;
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin_s16x2_relu(const unsigned int a, const unsigned int b){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm("{min.s16x2.relu %0, %1, %2;}" : "=r"(res) : "r"(a), "r"(b));
#elif defined(__CUDA_ARCH__)
  res = __vmaxs2(__vmins2(a, b), 0U);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  // Get answer
  short ansS_lo = (short)min(aS_lo, bS_lo);
  short ansS_hi = (short)min(aS_hi, bS_hi);

  // relu
  if(ansS_lo < 0){ ansS_lo = 0; }
  if(ansS_hi < 0){ ansS_hi = 0; }

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax3_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int res;
  asm ("{.reg .s32 t1; \n\t"
      "max.s32 t1, %1, %2; \n\t"
      "max.s32 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  return max(max(a, b), c);
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  // Future asm code (naming/syntax may change):
  asm ("{.reg .b32 t1; \n\t"
      "max.s16x2 t1, %1, %2; \n\t"
      "max.s16x2 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_70__)
  res = __vmaxs2(__vmaxs2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  short cS_lo = *(short*)& cU_lo;
  short cS_hi = *(short*)& cU_hi;

  // Get answer
  short ansS_lo = (short)max(max(aS_lo, bS_lo), cS_lo);
  short ansS_hi = (short)max(max(aS_hi, bS_hi), cS_hi);

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif
  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
int res;
  asm ("{.reg .u32 t1; \n\t"
      "max.u32 t1, %1, %2; \n\t"
      "max.u32 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  return max(max(a, b), c);
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm ("{.reg .b32 t1; \n\t"
      "max.u16x2 t1, %1, %2; \n\t"
      "max.u16x2 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
  res = __vmaxu2(__vmaxu2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  // Get answer
  unsigned short ansU_lo = (unsigned short)max(max(aU_lo, bU_lo), cU_lo);
  unsigned short ansU_hi = (unsigned short)max(max(aU_hi, bU_hi), cU_hi);

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin3_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int res;
  asm ("{.reg .s32 t1; \n\t"
      "min.s32 t1, %1, %2; \n\t"
      "min.s32 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  return min(min(a, b), c);
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm ("{.reg .b32 t1; \n\t"
      "min.s16x2 t1, %1, %2; \n\t"
      "min.s16x2 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_70__)
  res = __vmins2(__vmins2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  short cS_lo = *(short*)& cU_lo;
  short cS_hi = *(short*)& cU_hi;

  // Get answer
  short ansS_lo = (short)min(min(aS_lo, bS_lo), cS_lo);
  short ansS_hi = (short)min(min(aS_hi, bS_hi), cS_hi);

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int res;
  asm ("{.reg .u32 t1; \n\t"
      "min.u32 t1, %1, %2; \n\t"
      "min.u32 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  return min(min(a, b), c);
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm ("{.reg .b32 t1; \n\t"
      "min.u16x2 t1, %1, %2; \n\t"
      "min.u16x2 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
  res = __vminu2(__vminu2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  // Get answer
  unsigned short ansU_lo = (unsigned short)min(min(aU_lo, bU_lo), cU_lo);
  unsigned short ansU_hi = (unsigned short)min(min(aU_hi, bU_hi), cU_hi);

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax3_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int res;
  asm ("{.reg .s32 t1; \n\t"
      "max.s32.relu t1, %1, %2; \n\t"
      "max.s32.relu %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  int ans = max(max(a, b), c);

  return (ans > 0) ? ans : 0;
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimax3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm ("{.reg .b32 t1; \n\t"
      "max.s16x2.relu t1, %1, %2; \n\t"
      "max.s16x2.relu %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_75__)
  res = __vimax_s16x2_relu(__vmaxs2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  short cS_lo = *(short*)& cU_lo;
  short cS_hi = *(short*)& cU_hi;

  // Get answer
  short ansS_lo = (short)max(max(aS_lo, bS_lo), cS_lo);
  short ansS_hi = (short)max(max(aS_hi, bS_hi), cS_hi);

  // relu
  if(ansS_lo < 0){ansS_lo = 0;}
  if(ansS_hi < 0){ansS_hi = 0;}

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimin3_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int res;
  asm ("{.reg .s32 t1; \n\t"
      "min.s32.relu t1, %1, %2; \n\t"
      "min.s32.relu %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  int ans = min(min(a, b), c);

  return (ans > 0) ? ans : 0;
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vimin3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm ("{.reg .b32 t1; \n\t"
      "min.s16x2.relu t1, %1, %2; \n\t"
      "min.s16x2.relu %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_AND_AT_LEAST_SM_75__)
  res = __vimin_s16x2_relu(__vmins2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  short cS_lo = *(short*)& cU_lo;
  short cS_hi = *(short*)& cU_hi;

  // Get answer
  short ansS_lo = (short)min(min(aS_lo, bS_lo), cS_lo);
  short ansS_hi = (short)min(min(aS_hi, bS_hi), cS_hi);

  // relu
  if(ansS_lo < 0){ansS_lo = 0;}
  if(ansS_hi < 0){ansS_hi = 0;}

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmax_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int res;
  asm ("{.reg .s32 t1; \n\t"
      "add.s32 t1, %1, %2; \n\t"
      "max.s32 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  return max(a + b, c);
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm ("{.reg .b32 t1; \n\t"
      "add.s16x2 t1, %1, %2; \n\t"
      "max.s16x2 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
  res = __vmaxs2(__vadd2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  short cS_lo = *(short*)& cU_lo;
  short cS_hi = *(short*)& cU_hi;

  // Get answer
  short ansS_lo = (short)max((short)(aS_lo + bS_lo), cS_lo);
  short ansS_hi = (short)max((short)(aS_hi + bS_hi), cS_hi);

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  unsigned int res;
  asm ("{.reg .u32 t1; \n\t"
      "add.u32 t1, %1, %2; \n\t"
      "max.u32 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  return max(a + b, c);
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm ("{.reg .b32 t1; \n\t"
      "add.u16x2 t1, %1, %2; \n\t"
      "max.u16x2 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
  res = __vmaxu2(__vadd2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  // Get answer
  unsigned short ansU_lo = (unsigned short)max((unsigned short)(aU_lo + bU_lo), cU_lo);
  unsigned short ansU_hi = (unsigned short)max((unsigned short)(aU_hi + bU_hi), cU_hi);

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmin_s32(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int res;
  asm ("{.reg .s32 t1; \n\t"
      "add.s32 t1, %1, %2; \n\t"
      "min.s32 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  return min(a + b, c);
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_s16x2(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm ("{.reg .b32 t1; \n\t"
      "add.s16x2 t1, %1, %2; \n\t"
      "min.s16x2 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
  res = __vmins2(__vadd2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  short cS_lo = *(short*)& cU_lo;
  short cS_hi = *(short*)& cU_hi;

  // Get answer
  short ansS_lo = (short)min((short)(aS_lo + bS_lo), cS_lo);
  short ansS_hi = (short)min((short)(aS_hi + bS_hi), cS_hi);

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_u32(const unsigned int a, const unsigned int b, const unsigned int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  unsigned int res;
  asm ("{.reg .u32 t1; \n\t"
      "add.u32 t1, %1, %2; \n\t"
      "min.u32 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  return min(a + b, c);
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_u16x2(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm ("{.reg .b32 t1; \n\t"
      "add.u16x2 t1, %1, %2; \n\t"
      "min.u16x2 %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
  res = __vminu2(__vadd2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  // Get answer
  unsigned short ansU_lo = (unsigned short)min((unsigned short)(aU_lo + bU_lo), cU_lo);
  unsigned short ansU_hi = (unsigned short)min((unsigned short)(aU_hi + bU_hi), cU_hi);

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmax_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int res;
  asm ("{.reg .s32 t1; \n\t"
      "add.s32 t1, %1, %2; \n\t"
      "max.s32.relu %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  int ans = max(a + b, c);

  return (ans > 0) ? ans : 0;
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmax_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm ("{.reg .b32 t1; \n\t"
      "add.s16x2 t1, %1, %2; \n\t"
      "max.s16x2.relu %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
  res = __vimax_s16x2_relu(__vadd2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  short cS_lo = *(short*)& cU_lo;
  short cS_hi = *(short*)& cU_hi;

  // Get answer
  short ansS_lo = (short)max((short)(aS_lo + bS_lo), cS_lo);
  short ansS_hi = (short)max((short)(aS_hi + bS_hi), cS_hi);

  if(ansS_lo < 0){ansS_lo = 0;}
  if(ansS_hi < 0){ansS_hi = 0;}

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __viaddmin_s32_relu(const int a, const int b, const int c){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int res;
  asm ("{.reg .s32 t1; \n\t"
      "add.s32 t1, %1, %2; \n\t"
      "min.s32.relu %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
  return res;
#else
  // Host and older architecture code
  int ans = min(a + b, c);

  return (ans > 0) ? ans : 0;
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __viaddmin_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c){
  unsigned int res;
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  asm ("{.reg .b32 t1; \n\t"
      "add.s16x2 t1, %1, %2; \n\t"
      "min.s16x2.relu %0, t1, %3;}\n\t"
      : "=r"(res) : "r"(a), "r"(b), "r"(c));
#elif defined(__CUDA_ARCH__)
  res = __vimin_s16x2_relu(__vadd2(a, b), c);
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  unsigned short cU_lo = (unsigned short)(c & 0xFFFFU);
  unsigned short cU_hi = (unsigned short)(c >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  short cS_lo = *(short*)& cU_lo;
  short cS_hi = *(short*)& cU_hi;

  // Get answer
  short ansS_lo = (short)min((short)(aS_lo + bS_lo), cS_lo);
  short ansS_hi = (short)min((short)(aS_hi + bS_hi), cS_hi);

  if(ansS_lo < 0){ansS_lo = 0;}
  if(ansS_hi < 0){ansS_hi = 0;}

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  res = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);
#endif

  return res;
}

// vimax vimin with predicate
// *pred gets set to '(a >= b)'
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vibmax_s32(const int a, const int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int val;
  unsigned int predicate_local;
  asm ("{ .reg .pred __$temp1;\n\t"
      "  setp.ge.s32  __$temp1, %2, %3;\n\t"
      "  selp.s32 %0, %2, %3, __$temp1;\n\t"
      "  selp.s32 %1, 1, 0, __$temp1;}\n\t"
      : "=r"(val), "=r"(predicate_local) : "r"(a), "r"(b));

  *pred = (bool)predicate_local;
  return val;
#else
  // Host and older architecture code
  int ans = max(a, b);

  *pred = (a >= b);
  return ans;
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_u32(const unsigned int a, const unsigned int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  unsigned int val;
  unsigned int predicate_local;
  asm ("{ .reg .pred __$temp1;\n\t"
      "  setp.ge.u32  __$temp1, %2, %3;\n\t"
      "  selp.u32 %0, %2, %3, __$temp1;\n\t"
      "  selp.u32 %1, 1, 0, __$temp1;}\n\t"
      : "=r"(val), "=r"(predicate_local) : "r"(a), "r"(b));

  *pred = (bool)predicate_local;
  return val;
#else
  // Host and older architecture code
  unsigned int ans = max(a, b);

  *pred = (a >= b);
  return ans;
#endif
}

// *pred gets set to '(a <= b)'
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vibmin_s32(const int a, const int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  int val;
  unsigned int predicate_local;
  asm ("{ .reg .pred __$temp1;\n\t"
      "  setp.le.s32  __$temp1, %2, %3;\n\t"
      "  selp.s32 %0, %2, %3, __$temp1;\n\t"
      "  selp.s32 %1, 1, 0, __$temp1;}\n\t"
      : "=r"(val), "=r"(predicate_local) : "r"(a), "r"(b));

  *pred = (bool)predicate_local;
  return val;
#else
  // Host and older architecture code
  int ans = min(a, b);

  *pred = (a <= b);
  return ans;
#endif
}

// *pred gets set to '(a <= b)'
__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_u32(const unsigned int a, const unsigned int b, bool* const pred){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  unsigned int val;
  unsigned int predicate_local;
  asm ("{ .reg .pred __$temp1;\n\t"
      "  setp.le.u32  __$temp1, %2, %3;\n\t"
      "  selp.u32 %0, %2, %3, __$temp1;\n\t"
      "  selp.u32 %1, 1, 0, __$temp1;}\n\t"
      : "=r"(val), "=r"(predicate_local) : "r"(a), "r"(b));

  *pred = (bool)predicate_local;
  return val;
#else
  // Host and older architecture code
  unsigned int ans = min(a, b);

  *pred = (a <= b);
  return ans;
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  unsigned int val;
  unsigned int predicate_local_hi;
  unsigned int predicate_local_lo;
  asm ("{.reg .pred pu, pv; \n\t"
      ".reg .s16 rs0, rs1, rs2, rs3; \n\t"
      "max.s16x2 %0, %3, %4; \n\t"
      "mov.b32 {rs0, rs1}, %0; \n\t"
      "mov.b32 {rs2, rs3}, %3; \n\t"
      "setp.eq.s16 pv, rs0, rs2; \n\t"
      "setp.eq.s16 pu, rs1, rs3; \n\t"
      "selp.b32 %1, 1, 0, pu; \n\t"
      "selp.b32 %2, 1, 0, pv;} \n\t"
      : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));

  *pred_hi = (bool)predicate_local_hi;
  *pred_lo = (bool)predicate_local_lo;
  return val;
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  // Get answer
  short ansS_lo = (short)max(aS_lo, bS_lo);
  short ansS_hi = (short)max(aS_hi, bS_hi);

  *pred_hi = (aS_hi >= bS_hi);
  *pred_lo = (aS_lo >= bS_lo);

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);

  return ans;  
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmax_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  unsigned int val;
  unsigned int predicate_local_hi;
  unsigned int predicate_local_lo;
  asm ("{.reg .pred pu, pv; \n\t"
      ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
      "max.u16x2 %0, %3, %4; \n\t"
      "mov.b32 {rs0, rs1}, %0; \n\t"
      "mov.b32 {rs2, rs3}, %3; \n\t"
      "setp.eq.u16 pv, rs0, rs2; \n\t"
      "setp.eq.u16 pu, rs1, rs3; \n\t"
      "selp.b32 %1, 1, 0, pu; \n\t"
      "selp.b32 %2, 1, 0, pv;} \n\t"
      : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));

  *pred_hi = (bool)predicate_local_hi;
  *pred_lo = (bool)predicate_local_lo;
  return val;
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  // Get answer
  unsigned short ansU_lo = (unsigned short)max(aU_lo, bU_lo);
  unsigned short ansU_hi = (unsigned short)max(aU_hi, bU_hi);

  *pred_hi = (aU_hi >= bU_hi);
  *pred_lo = (aU_lo >= bU_lo);

  // Put answer back together:
  unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);

  return ans;  
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  unsigned int val;
  unsigned int predicate_local_hi;
  unsigned int predicate_local_lo;
  asm ("{.reg .pred pu, pv; \n\t"
      ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
      "min.s16x2 %0, %3, %4; \n\t"
      "mov.b32 {rs0, rs1}, %0; \n\t"
      "mov.b32 {rs2, rs3}, %3; \n\t"
      "setp.eq.s16 pv, rs0, rs2; \n\t"
      "setp.eq.s16 pu, rs1, rs3; \n\t"
      "selp.b32 %1, 1, 0, pu; \n\t"
      "selp.b32 %2, 1, 0, pv;} \n\t"
      : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));

  *pred_hi = (bool)predicate_local_hi;
  *pred_lo = (bool)predicate_local_lo;
  return val;
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  //cast to signed:
  short aS_lo = *(short*)& aU_lo;
  short aS_hi = *(short*)& aU_hi;

  short bS_lo = *(short*)& bU_lo;
  short bS_hi = *(short*)& bU_hi;

  // Get answer
  short ansS_lo = (short)min(aS_lo, bS_lo);
  short ansS_hi = (short)min(aS_hi, bS_hi);

  *pred_hi = (aS_hi <= bS_hi);
  *pred_lo = (aS_lo <= bS_lo);

  // Cast back to unsigned:
  unsigned short ansU_lo = *(unsigned short*)& ansS_lo;
  unsigned short ansU_hi = *(unsigned short*)& ansS_hi;

  // Put answer back together:
  unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);

  return ans;  
#endif
}

__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ unsigned int __vibmin_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo){
#ifdef __CUDA_AND_AT_LEAST_SM_90__
  unsigned int val;
  unsigned int predicate_local_hi;
  unsigned int predicate_local_lo;
  asm ("{.reg .pred pu, pv; \n\t"
      ".reg .u16 rs0, rs1, rs2, rs3; \n\t"
      "min.u16x2 %0, %3, %4; \n\t"
      "mov.b32 {rs0, rs1}, %0; \n\t"
      "mov.b32 {rs2, rs3}, %3; \n\t"
      "setp.eq.u16 pv, rs0, rs2; \n\t"
      "setp.eq.u16 pu, rs1, rs3; \n\t"
      "selp.b32 %1, 1, 0, pu; \n\t"
      "selp.b32 %2, 1, 0, pv;} \n\t"
      : "=r"(val), "=r"(predicate_local_hi),"=r"(predicate_local_lo) : "r"(a), "r"(b));

  *pred_hi = (bool)predicate_local_hi;
  *pred_lo = (bool)predicate_local_lo;
  return val;
#else
  // Host and older architecture code
  // Separate our high and low bit:
  unsigned short aU_lo = (unsigned short)(a & 0xFFFFU);
  unsigned short aU_hi = (unsigned short)(a >> 16);

  unsigned short bU_lo = (unsigned short)(b & 0xFFFFU);
  unsigned short bU_hi = (unsigned short)(b >> 16);

  // Get answer
  unsigned short ansU_lo = (unsigned short)min(aU_lo, bU_lo);
  unsigned short ansU_hi = (unsigned short)min(aU_hi, bU_hi);

  *pred_hi = (aU_hi <= bU_hi);
  *pred_lo = (aU_lo <= bU_lo);

  // Put answer back together:
  unsigned int ans = ((unsigned int) ansU_lo) | (((unsigned int) ansU_hi) << 16);

  return ans;  
#endif
}

#ifdef __CUDA_AND_AT_LEAST_SM_90__
#undef __CUDA_AND_AT_LEAST_SM_90__
#endif

#undef __DEVICE_HOST_FUNCTIONS_STATIC_DECL__

/*******************************************************************************
*                                                                              *
*                                                                              *
*                                                                              *
*******************************************************************************/

#endif /* !__DEVICE_FUNCTIONS_HPP__ */

#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__)
#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_HPP__
#endif