// Copyright © 2022 Apple Inc.

#pragma once

#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSEvent.h>
#include <ATen/mps/MPSStream.h>

#include <cstdio>
#include <mutex>
#include <set>
#include <unordered_set>
#include <mach/vm_page_size.h>
#include <c10/util/flat_hash_map.h>

// this implementation is based on CUDACachingAllocator.
// It utilizes Metal Heaps to improve the performance of buffer allocation.
// Do not include this header. Use MPSAllocatorInterface.h instead.
// TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
namespace at::mps::HeapAllocator {

static const size_t kMaxSmallAlloc  = MB(1);    // largest "small" allocation is 1 MiB
static const size_t kMinLargeAlloc  = MB(10);   // allocations between 1 and 10 MiB may use kLargeHeap
static const size_t kRoundLarge     = MB(2);    // round up large allocations to 2 MiB
static const size_t kSmallHeap      = MB(8);    // "small" allocations are packed in 8 MiB heaps
static const size_t kLargeHeap      = MB(32);   // "large" allocations may be packed in 32 MiB heaps
static const size_t kXLargeHeapD    = MB(128);  // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
static const size_t kXLargeHeapU    = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation

// buffer pools could be customized with a combination of usage flags
enum UsageFlags : uint32_t {
  PRIVATE = 0,
  SMALL   = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
  SHARED  = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
  MANAGED = (1 << 2), // managed storage mode
  HAZARD  = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
  SCALAR  = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
};
// debug verbosity flags
enum DebugVerbosity : uint32_t {
  SILENT      = 0,
  PROFILING   = (1 << 0), // print generic profiling data for total system memory usage
  ALLOCATIONS = (1 << 1), // print buffer allocations
  RECYCLES    = (1 << 2), // print buffer recycling
  RELEASES    = (1 << 3), // print buffer releases
  LARGE_ONLY  = (1 << 4), // only log large buffer pool transactions
};

struct HeapBlock;

struct BufferBlock {
  id<MTLBuffer> buffer;
  void* cpu_ptr = nullptr; // stores the pointer to CPU mapping of a Shared MTLBuffer
  size_t size;             // size after alignment
  size_t requested_size;   // requested size (before alignment)
  // buffer shape is used for retrieving base of views in cached graphs
  std::vector<int64_t> shape;
  bool in_use = false;
  HeapBlock* heap;
  id_t buf_id;
  // counter used to identify the least recently used buffers as candidates for garbage collection
  uint32_t gc_count = 0;
  uint32_t use_count = 0;
  // counter to assign unique ids to buffer blocks
  static uint64_t buffer_counter;
  // Metal events used to sync GPU/CPU operations on the shared-storage buffers
  MPSEventPtr event;

  BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr,
              HeapBlock* Heap = nullptr) :
              buffer(Buffer), size(Size), requested_size(RequestedSize),
              heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) { }

  static bool Comparator(const BufferBlock* a, const BufferBlock* b) {
    return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer;
  }
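  // Illustrative example of the rounding done below (not part of the original API docs):
  // alignUp(5000, 4096) == 8192, since adding (Alignment - 1) and masking with
  // ~(Alignment - 1) rounds Size up to the next multiple of the power-of-two Alignment.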
  static size_t alignUp(size_t Size, size_t Alignment) {
    assert(((Alignment - 1) & Alignment) == 0);
    return ((Size + Alignment - 1) & ~(Alignment - 1));
  }
  uint32_t retainCount() const { return [buffer retainCount]; }
};
typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*);

struct BufferPool;

struct AllocParams {
  AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) :
              search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) { }
  size_t size() const { return search_key.size; }

  BufferBlock search_key;
  BufferPool* pool;
  BufferBlock* buffer_block = nullptr;
  size_t requested_size;
  // true if we exceed the low watermark limit. In this case
  // we apply strategies to relieve the pressure before allocation.
  bool has_memory_pressure = false;
  // true if we're allocating on a unified memory device
  bool has_unified_memory = true;
};

struct HeapBlock {
  id<MTLHeap> heap;
  struct { size_t total, available; } size;
  BufferPool* pool;
  unsigned int n_buffers = 0;
  id_t heap_id;
  // indicates if we split this heap to sub-allocate 'several' buffers (otherwise single buffer)
  bool is_split;
  // counter to assign unique ids to heap blocks
  static uint64_t heap_counter;

  HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool *Pool = nullptr) :
            heap(Heap), size({.total = Size, .available = Size}), pool(Pool),
            heap_id(Heap ? ++heap_counter : 0), is_split(true) { }

  static MTLResourceOptions getOptions(uint32_t usage) {
    // TODO: check the caching performance of write-combined mode
    MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache;

    if (usage & UsageFlags::MANAGED)
      options |= MTLResourceStorageModeManaged;
    else if (usage & UsageFlags::SHARED)
      options |= MTLResourceStorageModeShared;
    else
      options |= MTLResourceStorageModePrivate;

    options |= (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;

    return options;
  }

  static HeapBlock* createHeapBlock(AllocParams& params, id<MTLDevice> device, uint32_t usage) {
    HeapBlock *heapBlock = nullptr;
    bool is_split = true;
    const size_t size = params.size();
    MTLHeapDescriptor *d = [MTLHeapDescriptor new];
    if (d) {
      const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD;
      if (size <= kMaxSmallAlloc) {
        d.size = kSmallHeap;
      } else if (size < kMinLargeAlloc) {
        d.size = kLargeHeap;
      } else if (size < kXLargeHeap / 2 && !params.has_memory_pressure) {
        d.size = kXLargeHeap;
      } else {
        d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
        is_split = false;
      }
      d.storageMode = (usage & UsageFlags::SHARED) ? MTLStorageModeShared : MTLStorageModePrivate;
      d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
      // this automatically handles Metal buffer access synchronizations at the
      // cost of slightly lower performance.
      d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
      d.resourceOptions = getOptions(usage);
      d.type = MTLHeapTypeAutomatic;
      id<MTLHeap> heap = [device newHeapWithDescriptor: d];
      if (heap) {
        [heap setPurgeableState:MTLPurgeableStateNonVolatile];
        const size_t heap_size = heapAvailableSize(heap);
        heapBlock = new HeapBlock(heap_size, heap, params.pool);
        if (heapBlock) {
          heapBlock->is_split = is_split;
        }
      }
      [d release];
    }
    return heapBlock;
  }
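  // Illustrative sizing for createHeapBlock() above, assuming a unified-memory device
  // that is not under memory pressure (the numbers follow directly from the constants
  // at the top of this file):
  //   512 KiB request -> packed into an 8 MiB kSmallHeap
  //   3 MiB request   -> packed into a 32 MiB kLargeHeap
  //   64 MiB request  -> packed into a 1 GiB kXLargeHeapU (heap remains splittable)
  //   768 MiB request -> dedicated non-split heap, rounded up to a kRoundLarge multiple (768 MiB)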
  static bool Comparator(const HeapBlock* a, const HeapBlock* b) {
    return (a->size.available != b->size.available) ? a->size.available < b->size.available :
                                                      (uintptr_t)a->heap < (uintptr_t)b->heap;
  }
  static NSUInteger heapAvailableSize(id<MTLHeap> heap, size_t Alignment = vm_page_size) {
    return [heap maxAvailableSizeWithAlignment:Alignment];
  }
  NSUInteger Size() {
    return [heap size];
  }
  id<MTLBuffer> newMTLBuffer(size_t length, uint32_t usage) {
    id<MTLBuffer> buf = [heap newBufferWithLength:length options:getOptions(usage)];
    if (buf) {
      updateAvailableSize();
      n_buffers++;
    }
    return buf;
  }
  // returns the retainCount before releasing the buffer
  uint32_t releaseMTLBuffer(id<MTLBuffer>& buffer) {
    const uint32_t retainCount = [buffer retainCount];
    [buffer release];
    buffer = nil;
    updateAvailableSize();
    n_buffers--;
    return retainCount;
  }
  // returns the retainCount before releasing the heap
  uint32_t releaseMTLHeap() {
    const uint32_t retainCount = [heap retainCount];
    TORCH_INTERNAL_ASSERT(!n_buffers); // assert if heap isn't empty
    [heap setPurgeableState:MTLPurgeableStateEmpty];
    [heap release];
    heap = nil;
    size.available = 0;
    return retainCount;
  }
  uint32_t retainCount() const { return [heap retainCount]; }
  void updateAvailableSize() { size.available = heapAvailableSize(heap); }
};
typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*);
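// Rough lifecycle sketch of a heap-backed buffer using the HeapBlock API above
// (illustrative only; the allocator drives these calls through its buffer pools
// rather than in this direct sequence):
//   HeapBlock* heap_block = HeapBlock::createHeapBlock(params, device, UsageFlags::SHARED | UsageFlags::HAZARD);
//   id<MTLBuffer> buf = heap_block->newMTLBuffer(length, UsageFlags::SHARED | UsageFlags::HAZARD);
//   ... use the buffer on the MPS stream ...
//   heap_block->releaseMTLBuffer(buf);  // frees the buffer and updates the heap's available size
//   heap_block->releaseMTLHeap();       // only valid once the heap holds no buffers (n_buffers == 0)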
struct BufferPool {
  enum class Kind {
    PRIVATE_SMALL,
    PRIVATE_LARGE,
    SHARED_SMALL,
    SHARED_LARGE,
    SCALAR,
  };

  BufferPool(const id<MTLDevice> Device, uint32_t Usage) :
             device(Device), usage(Usage),
             heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) { }

  const id<MTLDevice> device;
  // usage flags to customize the pool for various purposes (see UsageFlags enum)
  const uint32_t usage;
  // total number of buffers in the pool
  uint32_t n_buffers = 0;
  // total allocations size on this pool
  size_t allocated_size = 0;
  // total memory available in the pool
  size_t available_size = 0;
  // list of heaps ordered by their "available" (not total) memory size
  std::set<HeapBlock*, HeapComparison> heaps;
  // list of only "available" buffers in the pool (i.e., buffers not in-use)
  std::set<BufferBlock*, BufferComparison> available_buffers;
  // list of buffers that are in a state of "limbo" where they've already been freed
  // from PyTorch-side, but were not returned to pool due to still being
  // in-use by command buffers with retainCount > 1. In this state, the buffer is
  // neither ready to be recycled, nor able to be returned to pool as available.
  // These buffers will be returned to pool once the command buffer's
  // completionHandler callbacks are called.
  std::unordered_set<BufferBlock*> buffers_pending_free;
  // list of heaps pending size update
  std::unordered_set<HeapBlock*> heaps_pending_update;
};

class MPSHeapAllocatorImpl {
 public:
  explicit MPSHeapAllocatorImpl() :
    m_device(at::mps::MPSDevice::getInstance()->device()),
    m_max_buffer_size([m_device maxBufferLength]),
    m_stream(getDefaultMPSStream()),
    m_event_pool(getMPSEventPool()) {
    init_allocator();
  }
  ~MPSHeapAllocatorImpl() {
    emptyCache();
  }
  // interface exposed to at::Allocator
  id<MTLBuffer> malloc(size_t size, uint32_t usage);
  // frees a buffer and returns it into buffer pool
  void free(void* ptr);
  // releases all the cached buffers and their associated heaps
  void emptyCache();
  // free inactive buffers that are pending to be freed
  void freeInactiveBuffers();
  // returns true if buffer was allocated from the shared pool
  bool isSharedBuffer(const void* ptr);
  // get the requested unaligned size of an MTLBuffer
  ssize_t getUnalignedBufferSize(const void* ptr);
  // set the shape of a base tensor from a view tensor
  void setBufferShape(const void* ptr, const IntArrayRef& shape);
  // retrieve the shape of a base tensor from a view tensor
  IntArrayRef getBufferShape(const void* ptr);
  // get the unique ID of the buffer
  id_t getBufferId(const void* ptr);
  // allocate a buffer from a specialized pool to import CPU scalars into GPU
  id<MTLBuffer> allocScalarBufferWithValue(void* value, size_t size);
  // returns a CPU-mapping of the input buffer and its retainCount,
  // but only if it has Shared storage mode and was allocated by MPSAllocator
  std::pair<void*, uint32_t> getSharedBufferPtr(const void* buffer);
  // records events for a list of MTLBuffers (list is used to lock the mutex once)
  // returns true if it records any event (i.e., if the passed buffers exist and are shared-storage)
  bool recordEvents(c10::ArrayRef<const void*> buffers);
  // waits for the event to signal the completion of GPU execution
  // on the passed shared buffers (list is used to lock the mutex once)
  // returns true if it actually waited on any event
  bool waitForEvents(c10::ArrayRef<const void*> buffers);
  // indicates how far (in Megabytes) the current total allocations are from the
  // low watermark limit, which is used to detect if we're under memory pressure.
  // Returns zero if we've reached the low watermark limit.
  ssize_t getLowWatermarkValue();
  // (see m_low_watermark_ratio for description)
  void setLowWatermarkRatio(double ratio);
  // (see m_high_watermark_ratio for description)
  void setHighWatermarkRatio(double ratio);
  // (see m_low_watermark_limit for description)
  size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
  // (see m_max_total_allowed_size for description)
  size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
  // (see m_total_allocated_memory for description)
  size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; }
  // (see m_current_allocated_memory for description)
  size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; }
  // total GPU memory allocated in the process by Metal driver, including
  // implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
  size_t getDriverAllocatedMemory() const { return current_allocated_size(); }
  // recommended max memory for Metal
  size_t getRecommendedMaxMemory() const { return max_device_size(); }
  // (see enum DebugVerbosity for description)
  uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
  // returns the device that we allocate from
  inline id<MTLDevice> Device() const { return m_device; }
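  // Rough usage sketch of the public interface above (illustrative only; PyTorch
  // reaches this allocator through the at::Allocator interface exposed by
  // MPSAllocatorInterface.h rather than calling it directly; it is assumed here
  // that the device pointer handed to free() is the MTLBuffer pointer returned
  // by malloc()):
  //   MPSHeapAllocatorImpl& allocator = ...;
  //   id<MTLBuffer> buf = allocator.malloc(nbytes, UsageFlags::SHARED | UsageFlags::HAZARD);
  //   ... enqueue GPU work that reads/writes 'buf' ...
  //   allocator.free((void*)buf);  // returns the buffer to its pool for reuse
  //   allocator.emptyCache();      // optionally releases all cached buffers and heaps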
  // TODO: make a common function to do size unit conversions in PyTorch.
  inline std::string format_size(uint64_t size) const;

 private:
  // (see m_high_watermark_ratio for description)
  constexpr static double default_high_watermark_ratio = 1.7;
  // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
  constexpr static double default_high_watermark_upper_bound = 2.0;
  // (see m_low_watermark_ratio for description)
  // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
  constexpr static double default_low_watermark_ratio_unified  = 1.4;
  constexpr static double default_low_watermark_ratio_discrete = 1.0;

  const id<MTLDevice> m_device;
  std::recursive_mutex m_mutex;
  // allocated buffers by device pointer
  ska::flat_hash_map<const void*, BufferBlock*> m_allocated_buffers;
  // using a container for pools to simplify iterating them
  ska::flat_hash_map<BufferPool::Kind, std::unique_ptr<BufferPool>> m_pools;
  // total memory allocated by HeapAllocator (including blocks in pools)
  size_t m_total_allocated_memory = 0;
  // currently active memory allocations in use (i.e., blocks not in pools)
  size_t m_current_allocated_memory = 0;
  // max buffer size allowed by Metal
  size_t m_max_buffer_size = 0;
  // maximum total size allowed to be allocated
  size_t m_max_total_allowed_size = 0;
  // high watermark ratio is a hard limit for the total allowed allocations
  // 0. : disables high watermark limit (may cause system failure if system-wide OOM occurs)
  // 1. : recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize)
  // >1.: allows limits beyond the device.recommendedMaxWorkingSetSize
  // e.g., value 0.95 means we allocate up to 95% of recommended maximum
  // allocation size; beyond that, the allocations would fail with an OOM error.
  double m_high_watermark_ratio;
  // low watermark ratio is a soft limit that attempts to keep memory allocations below the low watermark
  // level by garbage collection or committing command buffers more frequently (a.k.a. adaptive commit).
  // Value between 0 and m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection)
  // e.g., value 0.9 means we 'attempt' to limit allocations up to 90% of recommended maximum
  // allocation size.
  double m_low_watermark_ratio;
  // low watermark size limit (in Bytes) at the time we initialize the allocator
  size_t m_low_watermark_limit;
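  // Illustrative arithmetic (hypothetical numbers, assuming the limits scale
  // device.recommendedMaxWorkingSetSize by the ratios above, as the comments suggest):
  // with a recommended working-set size of 48 GB, the default high watermark ratio of
  // 1.7 puts m_max_total_allowed_size at about 81.6 GB, and the default unified low
  // watermark ratio of 1.4 puts m_low_watermark_limit at about 67.2 GB.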
  // use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to set debug verbosity
  uint32_t m_debug_verbosity;
  // default MPS stream
  MPSStream* m_stream;
  // we hold a reference to MPSEventPool so it could get destroyed after MPSAllocator
  std::shared_ptr<MPSEventPool> m_event_pool;

  void init_allocator();
  void init_buffer_pools();
  HeapBlock* get_free_heap(AllocParams& params);
  bool get_free_buffer(AllocParams& params);
  BufferBlock* get_allocated_buffer_block(const void* ptr);
  BufferBlock* alloc_buffer_block(size_t size, uint32_t usage);
  bool alloc_buffer(AllocParams& params);
  void free_buffer(BufferBlock* buffer_block);
  // returns true if the container heap is also released
  bool release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true);
  void release_buffers(BufferPool& pool);
  bool release_available_cached_buffers(AllocParams& params);
  bool release_cached_buffers();
  // free unused cached blocks to reclaim GPU memory if memory pressure is high
  void garbage_collect_cached_buffers(AllocParams& params);
  // returns the suitable buffer pool type for the usage or
  // requested/allocated sizes
  BufferPool& get_pool(size_t requested_size, size_t aligned_size, uint32_t usage);
  // returns the aligned allocation size that is optimized
  // for the buffers to get reused frequently
  size_t get_allocation_size(size_t size, uint32_t usage) const;
  // maximum size of device memory available for allocation in current process
  // Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
  size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; }
  // there are implicit allocations from MPS backend, so we need to query the 'device' for
  // total allocated size instead of manually tracking in MPSAllocator
  size_t current_allocated_size() const { return [m_device currentAllocatedSize]; }

  bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
    for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
      MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(
          buffer_block ? buffer_block->buffer : nullptr, event);
    }
    return true;
  }
};

} // namespace at::mps::HeapAllocator
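// High-level allocation flow (a rough sketch inferred from the declarations above;
// the authoritative logic lives in the allocator's implementation file):
//   malloc(size, usage) -> get_allocation_size() rounds the request, get_pool() picks
//   a BufferPool for the usage/sizes, get_free_buffer() tries to recycle a cached
//   BufferBlock, and on a miss alloc_buffer() carves a new buffer out of a heap
//   obtained via get_free_heap()/HeapBlock::createHeapBlock().
//   free(ptr) returns the BufferBlock to its pool's available_buffers, or parks it in
//   buffers_pending_free while the GPU still holds a reference to the MTLBuffer.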