/*
 * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

/**
 * CUDA Occupancy Calculator
 *
 * NAME
 *
 *   cudaOccMaxActiveBlocksPerMultiprocessor,
 *   cudaOccMaxPotentialOccupancyBlockSize,
 *   cudaOccMaxPotentialOccupancyBlockSizeVariableSMem
 *   cudaOccAvailableDynamicSMemPerBlock
 *
 * DESCRIPTION
 *
 *   The CUDA occupancy calculator provides a standalone, programmatical
 *   interface to compute the occupancy of a function on a device. It can also
 *   provide occupancy-oriented launch configuration suggestions.
 *
 *   The function and device are defined by the user through
 *   cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState
 *   structures. All APIs require all 3 of them.
 *
 *   See the structure definition for more details about the device / function
 *   descriptors.
 *
 *   See each API's prototype for API usage.
 *
 * COMPATIBILITY
 *
 *   The occupancy calculator will be updated on each major CUDA toolkit
 *   release. It does not provide forward compatibility, i.e. new hardwares
 *   released after this implementation's release will not be supported.
 *
 * NOTE
 *
 *   If there is access to CUDA runtime, and the sole intent is to calculate
 *   occupancy related values on one of the accessible CUDA devices, using CUDA
 *   runtime's occupancy calculation APIs is recommended.
 *
 */

#ifndef __cuda_occupancy_h__
#define __cuda_occupancy_h__

#include <stddef.h>
#include <limits.h>
#include <string.h>


// __OCC_INLINE will be undefined at the end of this header
//
#ifdef __CUDACC__
#define __OCC_INLINE inline __host__ __device__
#elif defined _MSC_VER
#define __OCC_INLINE __inline
#else // GNUCC assumed
#define __OCC_INLINE inline
#endif

enum cudaOccError_enum {
    CUDA_OCC_SUCCESS              = 0,  // no error encountered
    CUDA_OCC_ERROR_INVALID_INPUT  = 1,  // input parameter is invalid
    CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2,  // requested device is not supported in
                                        // current implementation or device is
                                        // invalid
};
typedef enum cudaOccError_enum       cudaOccError;

typedef struct cudaOccResult         cudaOccResult;
typedef struct cudaOccDeviceProp     cudaOccDeviceProp;
typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
typedef struct cudaOccDeviceState    cudaOccDeviceState;

/**
 * The CUDA occupancy calculator computes the occupancy of the function
 * described by attributes with the given block size (blockSize), static device
 * properties (properties), dynamic device states (states) and per-block dynamic
 * shared memory allocation (dynamicSMemSize) in bytes, and output it through
 * result along with other useful information. The occupancy is computed in
 * terms of the maximum number of active blocks per multiprocessor. The user can
 * then convert it to other metrics, such as number of active warps.
 *
 * RETURN VALUE
 *
 * The occupancy and related information is returned through result.
 *
 * If result->activeBlocksPerMultiprocessor is 0, then the given parameter
 * combination cannot run on the device.
 *
 * ERRORS
 *
 *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
 *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
 *     current implementation or device is invalid
 */
static __OCC_INLINE
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccResult               *result,           // out
    const cudaOccDeviceProp     *properties,       // in
    const cudaOccFuncAttributes *attributes,       // in
    const cudaOccDeviceState    *state,            // in
    int                          blockSize,        // in
    size_t                       dynamicSmemSize); // in

/**
 * The CUDA launch configurator C API suggests a grid / block size pair (in
 * minGridSize and blockSize) that achieves the best potential occupancy
 * (i.e. maximum number of active warps with the smallest number of blocks) for
 * the given function described by attributes, on a device described by
 * properties with settings in state.
 *
 * If per-block dynamic shared memory allocation is not needed, the user should
 * leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0.
 *
 * If per-block dynamic shared memory allocation is needed, then if the dynamic
 * shared memory size is constant regardless of block size, the size should be
 * passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be
 * NULL.
 *
 * Otherwise, if the per-block dynamic shared memory size varies with different
 * block sizes, the user needs to provide a pointer to an unary function through
 * blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by
 * a block of the function for any given block size. dynamicSMemSize is
 * ignored. An example signature is:
 *
 *    // Take block size, returns dynamic shared memory needed
 *    size_t blockToSmem(int blockSize);
 *
 * RETURN VALUE
 *
 * The suggested block size and the minimum number of blocks needed to achieve
 * the maximum occupancy are returned through blockSize and minGridSize.
 *
 * If *blockSize is 0, then the given combination cannot run on the device.
 *
 * ERRORS
 *
 *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
 *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
 *     current implementation or device is invalid
 *
 */
static __OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int                         *minGridSize,      // out
    int                         *blockSize,        // out
    const cudaOccDeviceProp     *properties,       // in
    const cudaOccFuncAttributes *attributes,       // in
    const cudaOccDeviceState    *state,            // in
    size_t                     (*blockSizeToDynamicSMemSize)(int), // in
    size_t                       dynamicSMemSize); // in

/**
 * The CUDA launch configurator C++ API suggests a grid / block size pair (in
 * minGridSize and blockSize) that achieves the best potential occupancy
 * (i.e. the maximum number of active warps with the smallest number of blocks)
 * for the given function described by attributes, on a device described by
 * properties with settings in state.
 *
 * If per-block dynamic shared memory allocation is 0 or constant regardless of
 * block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to
 * configure the launch. A constant dynamic shared memory allocation size in
 * bytes can be passed through dynamicSMemSize.
 *
 * Otherwise, if the per-block dynamic shared memory size varies with different
 * block sizes, the user needs to use
 * cudaOccMaxPotentialOccupancyBlockSizeVariableSmem instead, and provide a
 * functor / pointer to an unary function (blockSizeToDynamicSMemSize) that
 * computes the dynamic shared memory needed by func for any given block
 * size. An example signature is:
 *
 *  // Take block size, returns per-block dynamic shared memory needed
 *  size_t blockToSmem(int blockSize);
 *
 * RETURN VALUE
 *
 * The suggested block size and the minimum number of blocks needed to achieve
 * the maximum occupancy are returned through blockSize and minGridSize.
 *
 * If *blockSize is 0, then the given combination cannot run on the device.
 *
 * ERRORS
 *
 *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
 *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
 *     current implementation or device is invalid
 *
 */

#if defined(__cplusplus)
namespace {

__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int                         *minGridSize,          // out
    int                         *blockSize,            // out
    const cudaOccDeviceProp     *properties,           // in
    const cudaOccFuncAttributes *attributes,           // in
    const cudaOccDeviceState    *state,                // in
    size_t                       dynamicSMemSize = 0); // in

template <typename UnaryFunction>
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
    int                         *minGridSize,          // out
    int                         *blockSize,            // out
    const cudaOccDeviceProp     *properties,           // in
    const cudaOccFuncAttributes *attributes,           // in
    const cudaOccDeviceState    *state,                // in
    UnaryFunction                blockSizeToDynamicSMemSize); // in

} // namespace anonymous
#endif // defined(__cplusplus)

/**
 *
 * The CUDA dynamic shared memory calculator computes the maximum size of 
 * per-block dynamic shared memory if we want to place numBlocks blocks
 * on an SM.
 *
 * RETURN VALUE
 *
 * Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow 
 * numBlocks blocks per SM.
 *
 * ERRORS
 *
 *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
 *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
 *     current implementation or device is invalid
 *
 */
static __OCC_INLINE
cudaOccError cudaOccAvailableDynamicSMemPerBlock(
    size_t                      *dynamicSmemSize,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state,
    int                         numBlocks,
    int                         blockSize);

/**
 * Data structures
 *
 * These structures are subject to change for future architecture and CUDA
 * releases. C users should initialize the structure as {0}.
 *
 */

/**
 * Device descriptor
 *
 * This structure describes a device.
 */
struct cudaOccDeviceProp {
    int    computeMajor;                // Compute capability major version
    int    computeMinor;                // Compute capability minor
                                        // version. None supported minor version
                                        // may cause error
    int    maxThreadsPerBlock;          // Maximum number of threads per block
    int    maxThreadsPerMultiprocessor; // Maximum number of threads per SM
                                        // i.e. (Max. number of warps) x (warp
                                        // size)
    int    regsPerBlock;                // Maximum number of registers per block
    int    regsPerMultiprocessor;       // Maximum number of registers per SM
    int    warpSize;                    // Warp size
    size_t sharedMemPerBlock;           // Maximum shared memory size per block
    size_t sharedMemPerMultiprocessor;  // Maximum shared memory size per SM
    int    numSms;                      // Number of SMs available
    size_t sharedMemPerBlockOptin;      // Maximum optin shared memory size per block
    size_t reservedSharedMemPerBlock;   // Shared memory per block reserved by driver

#ifdef __cplusplus
    // This structure can be converted from a cudaDeviceProp structure for users
    // that use this header in their CUDA applications.
    //
    // If the application have access to the CUDA Runtime API, the application
    // can obtain the device properties of a CUDA device through
    // cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the
    // cudaDeviceProp structure.
    //
    // Example:
    /*
     {
         cudaDeviceProp prop;

         cudaGetDeviceProperties(&prop, ...);

         cudaOccDeviceProp occProp = prop;

         ...

         cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...);
     }
     */
    //
    template<typename DeviceProp>
    __OCC_INLINE
    cudaOccDeviceProp(const DeviceProp &props)
    :   computeMajor                (props.major),
        computeMinor                (props.minor),
        maxThreadsPerBlock          (props.maxThreadsPerBlock),
        maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
        regsPerBlock                (props.regsPerBlock),
        regsPerMultiprocessor       (props.regsPerMultiprocessor),
        warpSize                    (props.warpSize),
        sharedMemPerBlock           (props.sharedMemPerBlock),
        sharedMemPerMultiprocessor  (props.sharedMemPerMultiprocessor),
        numSms                      (props.multiProcessorCount),
        sharedMemPerBlockOptin      (props.sharedMemPerBlockOptin),
        reservedSharedMemPerBlock   (props.reservedSharedMemPerBlock)
    {}

    __OCC_INLINE
    cudaOccDeviceProp()
    :   computeMajor                (0),
        computeMinor                (0),
        maxThreadsPerBlock          (0),
        maxThreadsPerMultiprocessor (0),
        regsPerBlock                (0),
        regsPerMultiprocessor       (0),
        warpSize                    (0),
        sharedMemPerBlock           (0),
        sharedMemPerMultiprocessor  (0),
        numSms                      (0),
        sharedMemPerBlockOptin      (0),
        reservedSharedMemPerBlock   (0)
    {}
#endif // __cplusplus
};

/**
 * Partitioned global caching option
 */
typedef enum cudaOccPartitionedGCConfig_enum {
    PARTITIONED_GC_OFF,        // Disable partitioned global caching
    PARTITIONED_GC_ON,         // Prefer partitioned global caching
    PARTITIONED_GC_ON_STRICT   // Force partitioned global caching
} cudaOccPartitionedGCConfig;

/**
 * Per function opt in maximum dynamic shared memory limit
 */
typedef enum cudaOccFuncShmemConfig_enum {
    FUNC_SHMEM_LIMIT_DEFAULT,   // Default shmem limit
    FUNC_SHMEM_LIMIT_OPTIN,     // Use the optin shmem limit
} cudaOccFuncShmemConfig;

/**
 * Function descriptor
 *
 * This structure describes a CUDA function.
 */
struct cudaOccFuncAttributes {
    int maxThreadsPerBlock; // Maximum block size the function can work with. If
                            // unlimited, use INT_MAX or any value greater than
                            // or equal to maxThreadsPerBlock of the device
    int numRegs;            // Number of registers used. When the function is
                            // launched on device, the register count may change
                            // due to internal tools requirements.
    size_t sharedSizeBytes; // Number of static shared memory used

    cudaOccPartitionedGCConfig partitionedGCConfig; 
                            // Partitioned global caching is required to enable
                            // caching on certain chips, such as sm_52
                            // devices. Partitioned global caching can be
                            // automatically disabled if the occupancy
                            // requirement of the launch cannot support caching.
                            //
                            // To override this behavior with caching on and
                            // calculate occupancy strictly according to the
                            // preference, set partitionedGCConfig to
                            // PARTITIONED_GC_ON_STRICT. This is especially
                            // useful for experimenting and finding launch
                            // configurations (MaxPotentialOccupancyBlockSize)
                            // that allow global caching to take effect.
                            //
                            // This flag only affects the occupancy calculation.

    cudaOccFuncShmemConfig shmemLimitConfig;
                            // Certain chips like sm_70 allow a user to opt into
                            // a higher per block limit of dynamic shared memory
                            // This optin is performed on a per function basis
                            // using the cuFuncSetAttribute function

    size_t maxDynamicSharedSizeBytes;
                            // User set limit on maximum dynamic shared memory
                            // usable by the kernel
                            // This limit is set using the cuFuncSetAttribute
                            // function.

    int numBlockBarriers;   // Number of block barriers used (default to 1)
#ifdef __cplusplus
    // This structure can be converted from a cudaFuncAttributes structure for
    // users that use this header in their CUDA applications.
    //
    // If the application have access to the CUDA Runtime API, the application
    // can obtain the function attributes of a CUDA kernel function through
    // cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the
    // cudaFuncAttributes structure.
    //
    // Example:
    /*
      __global__ void foo() {...}

      ...

      {
          cudaFuncAttributes attr;

          cudaFuncGetAttributes(&attr, foo);

          cudaOccFuncAttributes occAttr = attr;

          ...

          cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...);
      }
     */
    //
    template<typename FuncAttributes>
    __OCC_INLINE
    cudaOccFuncAttributes(const FuncAttributes &attr)
    :   maxThreadsPerBlock  (attr.maxThreadsPerBlock),
        numRegs             (attr.numRegs),
        sharedSizeBytes     (attr.sharedSizeBytes),
        partitionedGCConfig (PARTITIONED_GC_OFF),
        shmemLimitConfig    (FUNC_SHMEM_LIMIT_OPTIN),
        maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
        numBlockBarriers    (1)
    {}

    __OCC_INLINE
    cudaOccFuncAttributes()
    :   maxThreadsPerBlock  (0),
        numRegs             (0),
        sharedSizeBytes     (0),
        partitionedGCConfig (PARTITIONED_GC_OFF),
        shmemLimitConfig    (FUNC_SHMEM_LIMIT_DEFAULT),
        maxDynamicSharedSizeBytes (0),
        numBlockBarriers    (0)
    {}
#endif
};

typedef enum cudaOccCacheConfig_enum {
    CACHE_PREFER_NONE   = 0x00, // no preference for shared memory or L1 (default)
    CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache
    CACHE_PREFER_L1     = 0x02, // prefer larger L1 cache and smaller shared memory
    CACHE_PREFER_EQUAL  = 0x03  // prefer equal sized L1 cache and shared memory
} cudaOccCacheConfig;

typedef enum cudaOccCarveoutConfig_enum {
    SHAREDMEM_CARVEOUT_DEFAULT       = -1,  // no preference for shared memory or L1 (default)
    SHAREDMEM_CARVEOUT_MAX_SHARED    = 100, // prefer maximum available shared memory, minimum L1 cache
    SHAREDMEM_CARVEOUT_MAX_L1        = 0,    // prefer maximum available L1 cache, minimum shared memory
    SHAREDMEM_CARVEOUT_HALF          = 50   // prefer half of maximum available shared memory, with the rest as L1 cache
} cudaOccCarveoutConfig;

/**
 * Device state descriptor
 *
 * This structure describes device settings that affect occupancy calculation.
 */
struct cudaOccDeviceState
{
    // Cache / shared memory split preference. Deprecated on Volta 
    cudaOccCacheConfig cacheConfig; 
    // Shared memory / L1 split preference. Supported on only Volta
    int carveoutConfig;

#ifdef __cplusplus
    __OCC_INLINE
    cudaOccDeviceState()
    :   cacheConfig     (CACHE_PREFER_NONE),
        carveoutConfig  (SHAREDMEM_CARVEOUT_DEFAULT)
    {}
#endif
};

typedef enum cudaOccLimitingFactor_enum {
                                    // Occupancy limited due to:
    OCC_LIMIT_WARPS         = 0x01, // - warps available
    OCC_LIMIT_REGISTERS     = 0x02, // - registers available
    OCC_LIMIT_SHARED_MEMORY = 0x04, // - shared memory available
    OCC_LIMIT_BLOCKS        = 0x08, // - blocks available
    OCC_LIMIT_BARRIERS      = 0x10  // - barrier available
} cudaOccLimitingFactor;

/**
 * Occupancy output
 *
 * This structure contains occupancy calculator's output.
 */
struct cudaOccResult {
    int activeBlocksPerMultiprocessor; // Occupancy
    unsigned int limitingFactors;      // Factors that limited occupancy. A bit
                                       // field that counts the limiting
                                       // factors, see cudaOccLimitingFactor
    int blockLimitRegs;                // Occupancy due to register
                                       // usage, INT_MAX if the kernel does not
                                       // use any register.
    int blockLimitSharedMem;           // Occupancy due to shared memory
                                       // usage, INT_MAX if the kernel does not
                                       // use shared memory.
    int blockLimitWarps;               // Occupancy due to block size limit
    int blockLimitBlocks;              // Occupancy due to maximum number of blocks
                                       // managable per SM
    int blockLimitBarriers;            // Occupancy due to block barrier usage
    int allocatedRegistersPerBlock;    // Actual number of registers allocated per
                                       // block
    size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated
                                       // per block
    cudaOccPartitionedGCConfig partitionedGCConfig;
                                       // Report if partitioned global caching
                                       // is actually enabled.
};

/**
 * Partitioned global caching support
 *
 * See cudaOccPartitionedGlobalCachingModeSupport
 */
typedef enum cudaOccPartitionedGCSupport_enum {
    PARTITIONED_GC_NOT_SUPPORTED,  // Partitioned global caching is not supported
    PARTITIONED_GC_SUPPORTED,      // Partitioned global caching is supported
} cudaOccPartitionedGCSupport;

/**
 * Implementation
 */

/**
 * Max compute capability supported
 */
#define __CUDA_OCC_MAJOR__ 9
#define __CUDA_OCC_MINOR__ 0

//////////////////////////////////////////
//    Mathematical Helper Functions     //
//////////////////////////////////////////

static __OCC_INLINE int __occMin(int lhs, int rhs)
{
    return rhs < lhs ? rhs : lhs;
}

static __OCC_INLINE int __occDivideRoundUp(int x, int y)
{
    return (x + (y - 1)) / y;
}

static __OCC_INLINE int __occRoundUp(int x, int y)
{
    return y * __occDivideRoundUp(x, y);
}

//////////////////////////////////////////
//      Architectural Properties        //
//////////////////////////////////////////

/**
 * Granularity of shared memory allocation
 */
static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
{
    int value;

    switch(properties->computeMajor) {
        case 3:
        case 5:
        case 6:
        case 7:
            value = 256;
            break;
        case 8:
        case 9:
            value = 128;
            break;
        default:
            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    *limit = value;

    return CUDA_OCC_SUCCESS;
}

/**
 * Maximum number of registers per thread
 */
static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties)
{
    int value;

    switch(properties->computeMajor) {
        case 3:
        case 5:
        case 6:
            value = 255;
            break;
        case 7:
        case 8:
        case 9:
            value = 256;
            break;
        default:
            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    *limit = value;

    return CUDA_OCC_SUCCESS;
}

/**
 * Granularity of register allocation
 */
static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
{
    int value;

    switch(properties->computeMajor) {
        case 3:
        case 5:
        case 6:
        case 7:
        case 8:
        case 9:
            value = 256;
            break;
        default:
            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    *limit = value;

    return CUDA_OCC_SUCCESS;
}

/**
 * Number of sub-partitions
 */
static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties)
{
    int value;

    switch(properties->computeMajor) {
        case 3:
        case 5:
        case 7:
        case 8:
        case 9:
            value = 4;
            break;
        case 6:
            value = properties->computeMinor ? 4 : 2;
            break;
        default:
            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    *limit = value;

    return CUDA_OCC_SUCCESS;
}


/**
 * Maximum number of blocks that can run simultaneously on a multiprocessor
 */
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties)
{
    int value;

    switch(properties->computeMajor) {
        case 3:
            value = 16;
            break;
        case 5:
        case 6:
            value = 32;
            break;
        case 7: {
            int isTuring = properties->computeMinor == 5;
            value = (isTuring) ? 16 : 32;
            break;
        }
        case 8:
            if (properties->computeMinor == 0) {
                value = 32;
            }
            else if (properties->computeMinor == 9) {
                value = 24;
            }
            else {
                value = 16;
            }
            break;
        case 9:
            value = 32;
            break;
        default:
            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    *limit = value;

    return CUDA_OCC_SUCCESS;
}

/** 
 * Align up shared memory based on compute major configurations
 */
static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties)
{
    // Volta and Turing have shared L1 cache / shared memory, and support cache
    // configuration to trade one for the other. These values are needed to
    // map carveout config ratio to the next available architecture size
    size_t size = *shMemSize;

    switch (properties->computeMajor) {
    case 7: {
        // Turing supports 32KB and 64KB shared mem.
        int isTuring = properties->computeMinor == 5;
        if (isTuring) {
            if      (size <= 32 * 1024) {
                *shMemSize = 32 * 1024;
            }
            else if (size <= 64 * 1024) {
                *shMemSize = 64 * 1024;
            }
            else {
                return CUDA_OCC_ERROR_INVALID_INPUT;
            }
        }
        // Volta supports 0KB, 8KB, 16KB, 32KB, 64KB, and 96KB shared mem.
        else {
            if      (size == 0) {
                *shMemSize = 0;
            }
            else if (size <= 8 * 1024) {
                *shMemSize = 8 * 1024;
            }
            else if (size <= 16 * 1024) {
                *shMemSize = 16 * 1024;
            }
            else if (size <= 32 * 1024) {
                *shMemSize = 32 * 1024;
            }
            else if (size <= 64 * 1024) {
                *shMemSize = 64 * 1024;
            }
            else if (size <= 96 * 1024) {
                *shMemSize = 96 * 1024;
            }
            else {
                return CUDA_OCC_ERROR_INVALID_INPUT;
            }
        }
        break;
    }
    case 8:
        if (properties->computeMinor == 0 || properties->computeMinor == 7) {
            if      (size == 0) {
                *shMemSize = 0;
            }
            else if (size <= 8 * 1024) {
                *shMemSize = 8 * 1024;
            }
            else if (size <= 16 * 1024) {
                *shMemSize = 16 * 1024;
            }
            else if (size <= 32 * 1024) {
                *shMemSize = 32 * 1024;
            }
            else if (size <= 64 * 1024) {
                *shMemSize = 64 * 1024;
            }
            else if (size <= 100 * 1024) {
                *shMemSize = 100 * 1024;
            }
            else if (size <= 132 * 1024) {
                *shMemSize = 132 * 1024;
            }
            else if (size <= 164 * 1024) {
                *shMemSize = 164 * 1024;
            }
            else {
                return CUDA_OCC_ERROR_INVALID_INPUT;
            }
        }
        else {
            if      (size == 0) {
                *shMemSize = 0;
            }
            else if (size <= 8 * 1024) {
                *shMemSize = 8 * 1024;
            }
            else if (size <= 16 * 1024) {
                *shMemSize = 16 * 1024;
            }
            else if (size <= 32 * 1024) {
                *shMemSize = 32 * 1024;
            }
            else if (size <= 64 * 1024) {
                *shMemSize = 64 * 1024;
            }
            else if (size <= 100 * 1024) {
                *shMemSize = 100 * 1024;
            }
            else {
                return CUDA_OCC_ERROR_INVALID_INPUT;
            }
        }
        break;
    case 9: {
        if      (size == 0) {
            *shMemSize = 0;
        }
        else if (size <= 8 * 1024) {
            *shMemSize = 8 * 1024;
        }
        else if (size <= 16 * 1024) {
            *shMemSize = 16 * 1024;
        }
        else if (size <= 32 * 1024) {
            *shMemSize = 32 * 1024;
        }
        else if (size <= 64 * 1024) {
            *shMemSize = 64 * 1024;
        }
        else if (size <= 100 * 1024) {
            *shMemSize = 100 * 1024;
        }
        else if (size <= 132 * 1024) {
            *shMemSize = 132 * 1024;
        }
        else if (size <= 164 * 1024) {
            *shMemSize = 164 * 1024;
        }
        else if (size <= 196 * 1024) {
            *shMemSize = 196 * 1024;
        }
        else if (size <= 228 * 1024) {
            *shMemSize = 228 * 1024;
        }
        else {
            return CUDA_OCC_ERROR_INVALID_INPUT;
        }
        break;
    }
    default:
        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    return CUDA_OCC_SUCCESS;
}

/**
 * Shared memory based on the new carveoutConfig API introduced with Volta
 */
static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    size_t preferenceShmemSize;

    // CUDA 9.0 introduces a new API to set shared memory - L1 configuration on supported
    // devices. This preference will take precedence over the older cacheConfig setting.
    // Map cacheConfig to its effective preference value.
    int effectivePreference = state->carveoutConfig;
    if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }
    
    if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
        switch (state->cacheConfig)
        {
        case CACHE_PREFER_L1:
            effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1;
            break;
        case CACHE_PREFER_SHARED:
            effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED;
            break;
        case CACHE_PREFER_EQUAL:
            effectivePreference = SHAREDMEM_CARVEOUT_HALF;
            break;
        default:
            effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT;
            break;
        }
    }

    if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
        preferenceShmemSize = properties->sharedMemPerMultiprocessor;
    }
    else {
        preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100;
    }

    status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties);
    *limit = preferenceShmemSize;
    return status;
}

/**
 * Shared memory based on the cacheConfig
 */
static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
{
    size_t bytes                          = 0;
    size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor;
    cudaOccCacheConfig cacheConfig        = state->cacheConfig;

    // Kepler has shared L1 cache / shared memory, and support cache
    // configuration to trade one for the other. These values are needed to
    // calculate the correct shared memory size for user requested cache
    // configuration.
    //
    size_t minCacheSize                   = 16384;
    size_t maxCacheSize                   = 49152;
    size_t cacheAndSharedTotal            = sharedMemPerMultiprocessorHigh + minCacheSize;
    size_t sharedMemPerMultiprocessorLow  = cacheAndSharedTotal - maxCacheSize;

    switch (properties->computeMajor) {
        case 3:
            // Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest
            // is shared memory.
            //
            switch (cacheConfig) {
                default :
                case CACHE_PREFER_NONE:
                case CACHE_PREFER_SHARED:
                    bytes = sharedMemPerMultiprocessorHigh;
                    break;
                case CACHE_PREFER_L1:
                    bytes = sharedMemPerMultiprocessorLow;
                    break;
                case CACHE_PREFER_EQUAL:
                    // Equal is the mid-point between high and low. It should be
                    // equivalent to low + 16KB.
                    //
                    bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2;
                    break;
            }
            break;
        case 5:
        case 6:
            // Maxwell and Pascal have dedicated shared memory.
            //
            bytes = sharedMemPerMultiprocessorHigh;
            break;
        default:
            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    *limit = bytes;

    return CUDA_OCC_SUCCESS;
}

/**
 * Shared memory based on config requested by User
 */
static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
{
    // Volta introduces a new API that allows for shared memory carveout preference. Because it is a shared memory preference,
    // it is handled separately from the cache config preference.
    if (properties->computeMajor >= 7) {
        return cudaOccSMemPreferenceVoltaPlus(limit, properties, state);
    }
    return cudaOccSMemPreference(limit, properties, state);
}

/**
 * Return the per block shared memory limit based on function config
 */
static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta)
{
    switch (properties->computeMajor) {
        case 2:
        case 3:
        case 4:
        case 5:
        case 6:
            *limit = properties->sharedMemPerBlock;
            break;
        case 7:
        case 8:
        case 9:
            switch (shmemLimitConfig) {
                default:
                case FUNC_SHMEM_LIMIT_DEFAULT:
                    *limit = properties->sharedMemPerBlock;
                    break;
                case FUNC_SHMEM_LIMIT_OPTIN:
                    if (smemPerCta > properties->sharedMemPerBlock) {
                        *limit = properties->sharedMemPerBlockOptin;
                    }
                    else {
                        *limit = properties->sharedMemPerBlock;
                    }
                    break;
            }
            break;
        default:
            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
    }

    // Starting Ampere, CUDA driver reserves additional shared memory per block
    if (properties->computeMajor >= 8) {
        *limit += properties->reservedSharedMemPerBlock;
    }

    return CUDA_OCC_SUCCESS;
}

/**
 * Partitioned global caching mode support
 */
static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties)
{
    *limit = PARTITIONED_GC_NOT_SUPPORTED;

    if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) ||
        properties->computeMajor == 6) {
        *limit = PARTITIONED_GC_SUPPORTED;
    }

    if (properties->computeMajor == 6 && properties->computeMinor == 0) {
        *limit = PARTITIONED_GC_NOT_SUPPORTED;
    }

    return CUDA_OCC_SUCCESS;
}

///////////////////////////////////////////////
//            User Input Sanity              //
///////////////////////////////////////////////

static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties)
{
    // Verify device properties
    //
    // Each of these limits must be a positive number.
    //
    // Compute capacity is checked during the occupancy calculation
    //
    if (properties->maxThreadsPerBlock          <= 0 ||
        properties->maxThreadsPerMultiprocessor <= 0 ||
        properties->regsPerBlock                <= 0 ||
        properties->regsPerMultiprocessor       <= 0 ||
        properties->warpSize                    <= 0 ||
        properties->sharedMemPerBlock           <= 0 ||
        properties->sharedMemPerMultiprocessor  <= 0 ||
        properties->numSms                      <= 0) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    return CUDA_OCC_SUCCESS;
}

static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes)
{
    // Verify function attributes
    //
    if (attributes->maxThreadsPerBlock <= 0 ||
        attributes->numRegs < 0) {            // Compiler may choose not to use
                                              // any register (empty kernels,
                                              // etc.)
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    return CUDA_OCC_SUCCESS;
}

static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state)
{
    (void)state;   // silence unused-variable warning
    // Placeholder
    //

    return CUDA_OCC_SUCCESS;
}

static __OCC_INLINE cudaOccError cudaOccInputCheck(
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state)
{
    cudaOccError status = CUDA_OCC_SUCCESS;

    status = cudaOccDevicePropCheck(properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    status = cudaOccFuncAttributesCheck(attributes);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    status = cudaOccDeviceStateCheck(state);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    return status;
}

///////////////////////////////////////////////
//    Occupancy calculation Functions        //
///////////////////////////////////////////////

static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes)
{
    cudaOccPartitionedGCSupport gcSupport;
    cudaOccPartitionedGCConfig gcConfig;

    cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties);

    gcConfig = attributes->partitionedGCConfig;

    if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) {
        gcConfig = PARTITIONED_GC_OFF;
    }

    return gcConfig;
}

// Warp limit
//
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit(
    int                         *limit,
    cudaOccPartitionedGCConfig   gcConfig,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    int                          blockSize)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    int maxWarpsPerSm;
    int warpsAllocatedPerCTA;
    int maxBlocks;
    (void)attributes;   // silence unused-variable warning

    if (blockSize > properties->maxThreadsPerBlock) {
        maxBlocks = 0;
    }
    else {
        maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize;
        warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
        maxBlocks = 0;

        if (gcConfig != PARTITIONED_GC_OFF) {
            int maxBlocksPerSmPartition;
            int maxWarpsPerSmPartition;

            // If partitioned global caching is on, then a CTA can only use a SM
            // partition (a half SM), and thus a half of the warp slots
            // available per SM
            //
            maxWarpsPerSmPartition  = maxWarpsPerSm / 2;
            maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA;
            maxBlocks               = maxBlocksPerSmPartition * 2;
        }
        // On hardware that supports partitioned global caching, each half SM is
        // guaranteed to support at least 32 warps (maximum number of warps of a
        // CTA), so caching will not cause 0 occupancy due to insufficient warp
        // allocation slots.
        //
        else {
            maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA;
        }
    }

    *limit = maxBlocks;

    return status;
}

// Shared memory limit
//
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
    int                         *limit,
    cudaOccResult               *result,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state,
    int                          blockSize,
    size_t                       dynamicSmemSize)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    int allocationGranularity;
    size_t userSmemPreference = 0;
    size_t totalSmemUsagePerCTA;
    size_t maxSmemUsagePerCTA;
    size_t smemAllocatedPerCTA;
    size_t staticSmemSize;
    size_t sharedMemPerMultiprocessor;
    size_t smemLimitPerCTA;
    int maxBlocks;
    int dynamicSmemSizeExceeded = 0;
    int totalSmemSizeExceeded = 0;
    (void)blockSize;   // silence unused-variable warning

    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Obtain the user preferred shared memory size. This setting is ignored if
    // user requests more shared memory than preferred.
    //
    status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
    totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
    smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);

    maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;

    dynamicSmemSizeExceeded = 0;
    totalSmemSizeExceeded   = 0;

    // Obtain the user set maximum dynamic size if it exists
    // If so, the current launch dynamic shared memory must not
    // exceed the set limit
    if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
        dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
        dynamicSmemSizeExceeded = 1;
    }

    status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    if (smemAllocatedPerCTA > smemLimitPerCTA) {
        totalSmemSizeExceeded = 1;
    }

    if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
        maxBlocks = 0;
    }
    else {
        // User requested shared memory limit is used as long as it is greater
        // than the total shared memory used per CTA, i.e. as long as at least
        // one CTA can be launched.
        if (userSmemPreference >= smemAllocatedPerCTA) {
            sharedMemPerMultiprocessor = userSmemPreference;
        }
        else {
            // On Volta+, user requested shared memory will limit occupancy
            // if it's less than shared memory per CTA. Otherwise, the
            // maximum shared memory limit is used.
            if (properties->computeMajor >= 7) {
                sharedMemPerMultiprocessor = smemAllocatedPerCTA;
                status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
                if (status != CUDA_OCC_SUCCESS) {
                    return status;
                }
            }
            else {
                sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
            }
        }

        if (smemAllocatedPerCTA > 0) {
            maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
        }
        else {
            maxBlocks = INT_MAX;
        }
    }

    result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;

    *limit = maxBlocks;

    return status;
}

static __OCC_INLINE
cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
    int                         *limit,
    cudaOccPartitionedGCConfig  *gcConfig,
    cudaOccResult               *result,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    int                          blockSize)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    int allocationGranularity;
    int warpsAllocatedPerCTA;
    int regsAllocatedPerCTA;
    int regsAssumedPerCTA;
    int regsPerWarp;
    int regsAllocatedPerWarp;
    int numSubPartitions;
    int numRegsPerSubPartition;
    int numWarpsPerSubPartition;
    int numWarpsPerSM;
    int maxBlocks;
    int maxRegsPerThread;

    status = cudaOccRegAllocationGranularity(
        &allocationGranularity,
        properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    status = cudaOccRegAllocationMaxPerThread(
        &maxRegsPerThread,
        properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);

    // GPUs of compute capability 2.x and higher allocate registers to warps
    //
    // Number of regs per warp is regs per thread x warp size, rounded up to
    // register allocation granularity
    //
    regsPerWarp          = attributes->numRegs * properties->warpSize;
    regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
    regsAllocatedPerCTA  = regsAllocatedPerWarp * warpsAllocatedPerCTA;

    // Hardware verifies if a launch fits the per-CTA register limit. For
    // historical reasons, the verification logic assumes register
    // allocations are made to all partitions simultaneously. Therefore, to
    // simulate the hardware check, the warp allocation needs to be rounded
    // up to the number of partitions.
    //
    regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);

    if (properties->regsPerBlock < regsAssumedPerCTA ||   // Hardware check
        properties->regsPerBlock < regsAllocatedPerCTA || // Software check
        attributes->numRegs > maxRegsPerThread) {         // Per thread limit check
        maxBlocks = 0;
    }
    else {
        if (regsAllocatedPerWarp > 0) {
            // Registers are allocated in each sub-partition. The max number
            // of warps that can fit on an SM is equal to the max number of
            // warps per sub-partition x number of sub-partitions.
            //
            numRegsPerSubPartition  = properties->regsPerMultiprocessor / numSubPartitions;
            numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;

            maxBlocks = 0;

            if (*gcConfig != PARTITIONED_GC_OFF) {
                int numSubPartitionsPerSmPartition;
                int numWarpsPerSmPartition;
                int maxBlocksPerSmPartition;

                // If partitioned global caching is on, then a CTA can only
                // use a half SM, and thus a half of the registers available
                // per SM
                //
                numSubPartitionsPerSmPartition = numSubPartitions / 2;
                numWarpsPerSmPartition         = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
                maxBlocksPerSmPartition        = numWarpsPerSmPartition / warpsAllocatedPerCTA;
                maxBlocks                      = maxBlocksPerSmPartition * 2;
            }

            // Try again if partitioned global caching is not enabled, or if
            // the CTA cannot fit on the SM with caching on (maxBlocks == 0).  In the latter
            // case, the device will automatically turn off caching, except
            // if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate
            // occupancy and launch configuration.
            //
            if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
               // In case *gcConfig was PARTITIONED_GC_ON flip it OFF since
               // this is what it will be if we spread CTA across partitions.
               //
               *gcConfig = PARTITIONED_GC_OFF;
               numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
               maxBlocks     = numWarpsPerSM / warpsAllocatedPerCTA;
            }
        }
        else {
            maxBlocks = INT_MAX;
        }
    }


    result->allocatedRegistersPerBlock = regsAllocatedPerCTA;

    *limit = maxBlocks;

    return status;
}

// Barrier limit
//
static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit(
    int                         *limit,
    int                          ctaLimitBlocks,
    const cudaOccFuncAttributes *attributes)
{
    cudaOccError status = CUDA_OCC_SUCCESS;
    int numBarriersAvailable = ctaLimitBlocks * 2;
    int numBarriersUsed = attributes->numBlockBarriers;
    int maxBlocks = INT_MAX;

    if (numBarriersUsed) {
        maxBlocks = numBarriersAvailable / numBarriersUsed;
    }

    *limit = maxBlocks;

    return status;
}

///////////////////////////////////
//      API Implementations      //
///////////////////////////////////

static __OCC_INLINE
cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccResult               *result,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state,
    int                          blockSize,
    size_t                       dynamicSmemSize)
{
    cudaOccError status          = CUDA_OCC_SUCCESS;
    int          ctaLimitWarps   = 0;
    int          ctaLimitBlocks  = 0;
    int          ctaLimitSMem    = 0;
    int          ctaLimitRegs    = 0;
    int          ctaLimitBars    = 0;
    int          ctaLimit        = 0;
    unsigned int limitingFactors = 0;
    
    cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;

    if (!result || !properties || !attributes || !state || blockSize <= 0) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    ///////////////////////////
    // Check user input
    ///////////////////////////

    status = cudaOccInputCheck(properties, attributes, state);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    ///////////////////////////
    // Initialization
    ///////////////////////////

    gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

    ///////////////////////////
    // Compute occupancy
    ///////////////////////////

    // Limits due to registers/SM
    // Also compute if partitioned global caching has to be turned off
    //
    status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4.
    // As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x.
    // For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0),
    // we do not let it run on any Pascal processor, even though it may be able to run on GP100.
    // Therefore, we check the occupancy on GP10x when it can run on GP100
    //
    if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
        cudaOccDeviceProp propertiesGP10x;
        cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
        int ctaLimitRegsGP10x = 0;

        // Set up properties for GP10x
        memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
        propertiesGP10x.computeMinor = 1;

        status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
        if (status != CUDA_OCC_SUCCESS) {
            return status;
        }

        if (ctaLimitRegsGP10x == 0) {
            ctaLimitRegs = 0;
        }
    }

    // Limits due to warps/SM
    //
    status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Limits due to blocks/SM
    //
    status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Limits due to shared memory/SM
    //
    status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    ///////////////////////////
    // Overall occupancy
    ///////////////////////////

    // Overall limit is min() of limits due to above reasons
    //
    ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

    // Determine occupancy limiting factors
    //
    if (ctaLimit == ctaLimitWarps) {
        limitingFactors |= OCC_LIMIT_WARPS;
    }
    if (ctaLimit == ctaLimitRegs) {
        limitingFactors |= OCC_LIMIT_REGISTERS;
    }
    if (ctaLimit == ctaLimitSMem) {
        limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
    }
    if (ctaLimit == ctaLimitBlocks) {
        limitingFactors |= OCC_LIMIT_BLOCKS;
    }

    // For Hopper onwards compute the limits to occupancy based on block barrier count
    //
    if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
        // Limits due to barrier/SM
        //
        status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, attributes);
        if (status != CUDA_OCC_SUCCESS) {
            return status;
        }

        // Recompute overall limit based on barrier/SM
        //
        ctaLimit = __occMin(ctaLimitBars, ctaLimit);

        // Determine if this is occupancy limiting factor
        //
        if (ctaLimit == ctaLimitBars) {
            limitingFactors |= OCC_LIMIT_BARRIERS;
        }
    }
    else {
        ctaLimitBars = INT_MAX;
    }

    // Fill in the return values
    //
    result->limitingFactors = limitingFactors;

    result->blockLimitRegs      = ctaLimitRegs;
    result->blockLimitSharedMem = ctaLimitSMem;
    result->blockLimitWarps     = ctaLimitWarps;
    result->blockLimitBlocks    = ctaLimitBlocks;
    result->blockLimitBarriers  = ctaLimitBars;
    result->partitionedGCConfig = gcConfig;

    // Final occupancy
    result->activeBlocksPerMultiprocessor = ctaLimit;

    return CUDA_OCC_SUCCESS;
}

static __OCC_INLINE
cudaOccError cudaOccAvailableDynamicSMemPerBlock(
    size_t                      *bytesAvailable,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state,
    int                         numBlocks,
    int                         blockSize)
{
    int allocationGranularity;
    size_t smemLimitPerBlock;
    size_t smemAvailableForDynamic;
    size_t userSmemPreference = 0;
    size_t sharedMemPerMultiprocessor;
    cudaOccResult result;
    cudaOccError status = CUDA_OCC_SUCCESS;

    if (numBlocks <= 0)
        return CUDA_OCC_ERROR_INVALID_INPUT;

    // First compute occupancy of potential kernel launch.
    //
    status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }
    // Check if occupancy is achievable given user requested number of blocks. 
    //
    if (result.activeBlocksPerMultiprocessor < numBlocks) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // Return the per block shared memory limit based on function config.
    //
    status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    // If there is only a single block needed per SM, then the user preference can be ignored and the fully SW
    // limit is allowed to be used as shared memory otherwise if more than one block is needed, then the user
    // preference sets the total limit of available shared memory.
    //
    cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
    if (numBlocks == 1) {
        sharedMemPerMultiprocessor = smemLimitPerBlock;
    }
    else {
        if (!userSmemPreference) {
            userSmemPreference = 1 ;
            status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties);
            if (status != CUDA_OCC_SUCCESS) {
                return status;
            }
        }
        sharedMemPerMultiprocessor = userSmemPreference;
    }

    // Compute total shared memory available per SM
    //
    smemAvailableForDynamic =  sharedMemPerMultiprocessor / numBlocks;
    smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity;

    // Cap shared memory
    //
    if (smemAvailableForDynamic > smemLimitPerBlock) {
        smemAvailableForDynamic = smemLimitPerBlock;
    }

    // Now compute dynamic shared memory size
    smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes; 

    // Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute()
    //
    if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes)
        smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes;

    *bytesAvailable = smemAvailableForDynamic;
    return CUDA_OCC_SUCCESS;
}

static __OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int                         *minGridSize,
    int                         *blockSize,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state,
    size_t                     (*blockSizeToDynamicSMemSize)(int),
    size_t                       dynamicSMemSize)
{
    cudaOccError  status = CUDA_OCC_SUCCESS;
    cudaOccResult result;

    // Limits
    int occupancyLimit;
    int granularity;
    int blockSizeLimit;

    // Recorded maximum
    int maxBlockSize = 0;
    int numBlocks    = 0;
    int maxOccupancy = 0;

    // Temporary
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int blockSizeLimitAligned;
    int occupancyInBlocks;
    int occupancyInThreads;

    ///////////////////////////
    // Check user input
    ///////////////////////////

    if (!minGridSize || !blockSize || !properties || !attributes || !state) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    status = cudaOccInputCheck(properties, attributes, state);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    /////////////////////////////////////////////////////////////////////////////////
    // Try each block size, and pick the block size with maximum occupancy
    /////////////////////////////////////////////////////////////////////////////////

    occupancyLimit = properties->maxThreadsPerMultiprocessor;
    granularity    = properties->warpSize;

    blockSizeLimit        = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
    blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);

    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
        blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);

        // Ignore dynamicSMemSize if the user provides a mapping
        //
        if (blockSizeToDynamicSMemSize) {
            dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry);
        }

        status = cudaOccMaxActiveBlocksPerMultiprocessor(
            &result,
            properties,
            attributes,
            state,
            blockSizeToTry,
            dynamicSMemSize);

        if (status != CUDA_OCC_SUCCESS) {
            return status;
        }

        occupancyInBlocks = result.activeBlocksPerMultiprocessor;
        occupancyInThreads = blockSizeToTry * occupancyInBlocks;

        if (occupancyInThreads > maxOccupancy) {
            maxBlockSize = blockSizeToTry;
            numBlocks    = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // Early out if we have reached the maximum
        //
        if (occupancyLimit == maxOccupancy) {
            break;
        }
    }

    ///////////////////////////
    // Return best available
    ///////////////////////////

    // Suggested min grid size to achieve a full machine launch
    //
    *minGridSize = numBlocks * properties->numSms;
    *blockSize = maxBlockSize;

    return status;
}


#if defined(__cplusplus)

namespace {

__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
    int                         *minGridSize,
    int                         *blockSize,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state,
    size_t                       dynamicSMemSize)
{
    return cudaOccMaxPotentialOccupancyBlockSize(
        minGridSize,
        blockSize,
        properties,
        attributes,
        state,
        NULL,
        dynamicSMemSize);
}

template <typename UnaryFunction>
__OCC_INLINE
cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
    int                         *minGridSize,
    int                         *blockSize,
    const cudaOccDeviceProp     *properties,
    const cudaOccFuncAttributes *attributes,
    const cudaOccDeviceState    *state,
    UnaryFunction                blockSizeToDynamicSMemSize)
{
    cudaOccError  status = CUDA_OCC_SUCCESS;
    cudaOccResult result;

    // Limits
    int occupancyLimit;
    int granularity;
    int blockSizeLimit;

    // Recorded maximum
    int maxBlockSize = 0;
    int numBlocks    = 0;
    int maxOccupancy = 0;

    // Temporary
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int blockSizeLimitAligned;
    int occupancyInBlocks;
    int occupancyInThreads;
    size_t dynamicSMemSize;

    ///////////////////////////
    // Check user input
    ///////////////////////////

    if (!minGridSize || !blockSize || !properties || !attributes || !state) {
        return CUDA_OCC_ERROR_INVALID_INPUT;
    }

    status = cudaOccInputCheck(properties, attributes, state);
    if (status != CUDA_OCC_SUCCESS) {
        return status;
    }

    /////////////////////////////////////////////////////////////////////////////////
    // Try each block size, and pick the block size with maximum occupancy
    /////////////////////////////////////////////////////////////////////////////////

    occupancyLimit = properties->maxThreadsPerMultiprocessor;
    granularity    = properties->warpSize;
    blockSizeLimit        = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
    blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);

    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
        blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);

        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

        status = cudaOccMaxActiveBlocksPerMultiprocessor(
            &result,
            properties,
            attributes,
            state,
            blockSizeToTry,
            dynamicSMemSize);

        if (status != CUDA_OCC_SUCCESS) {
            return status;
        }

        occupancyInBlocks = result.activeBlocksPerMultiprocessor;

        occupancyInThreads = blockSizeToTry * occupancyInBlocks;

        if (occupancyInThreads > maxOccupancy) {
            maxBlockSize = blockSizeToTry;
            numBlocks    = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // Early out if we have reached the maximum
        //
        if (occupancyLimit == maxOccupancy) {
            break;
        }
    }

    ///////////////////////////
    // Return best available
    ///////////////////////////

    // Suggested min grid size to achieve a full machine launch
    //
    *minGridSize = numBlocks * properties->numSms;
    *blockSize = maxBlockSize;

    return status;
}

} // namespace anonymous

#endif /*__cplusplus */

#undef __OCC_INLINE

#endif /*__cuda_occupancy_h__*/