OpenSHMEM Backend
Status: Production-Ready | Since: DTL 0.1.0-alpha.1 | Last Updated: 2026-02-04
Overview
The OpenSHMEM backend enables DTL to leverage PGAS (Partitioned Global Address Space) programming with symmetric memory and one-sided communication operations.
Key Features
Symmetric Memory Allocation: Automatic symmetric heap management
One-Sided Communication: Put/get operations without target participation
Atomic Operations: Fetch-add, compare-swap, swap, fetch, set
Synchronization: Fence, quiet, barrier operations
DTL Integration: Full
memory_window_impl implementation
Build Configuration
Enable the SHMEM backend with CMake:
cmake -DDTL_ENABLE_SHMEM=ON ..
Requirements
OpenSHMEM 1.4+ implementation (e.g., OpenSHMEM Reference, Sandia OpenSHMEM, OSHMEM)
C++20 compiler
Runtime:
oshrun or equivalent launcher
Supported Implementations
Implementation |
Status |
Notes |
|---|---|---|
OpenSHMEM Reference |
✅ Tested |
Recommended for development |
Sandia OpenSHMEM (SOS) |
✅ Tested |
High-performance production |
Open MPI OSHMEM |
✅ Tested |
MPI + SHMEM hybrid |
Cray SHMEM |
✅ Tested |
HPC systems |
Quick Start
Basic Setup
#include <dtl/core/config.hpp>
#if DTL_ENABLE_SHMEM
#include <backends/shmem/shmem_communicator.hpp>
#include <backends/shmem/shmem_memory_window_impl.hpp>
int main() {
// Initialize SHMEM (RAII)
dtl::shmem::scoped_shmem_environment env;
// Get communicator
auto& comm = dtl::shmem::world_communicator();
printf("PE %d of %d\n", comm.rank(), comm.size());
// Create memory window with symmetric allocation
auto window_result = dtl::shmem::make_shmem_window(1024);
if (!window_result) return 1;
auto& window = *window_result.value();
// Use window for RMA operations...
return 0;
}
#endif
Running SHMEM Programs
# Compile
cmake -DDTL_ENABLE_SHMEM=ON ..
make my_shmem_program
# Run with 4 PEs
oshrun -np 4 ./my_shmem_program
API Reference
Initialization
namespace dtl::shmem {
// Manual init/finalize
result<void> init();
void finalize();
// RAII wrapper (recommended)
class scoped_shmem_environment {
scoped_shmem_environment(); // Calls init()
~scoped_shmem_environment(); // Calls finalize()
};
}
Domain and Context
// SHMEM domain (rank/size/barrier)
class shmem_domain {
rank_t rank() const noexcept;
rank_t size() const noexcept;
bool valid() const noexcept;
bool is_root() const noexcept;
void barrier();
};
// SHMEM context type alias
using shmem_context = context<shmem_domain, cpu_domain>;
Symmetric Memory Allocation
namespace dtl::shmem {
// Allocate symmetric memory
result<void*> symmetric_alloc(size_type size);
void symmetric_free(void* ptr);
// Memory space class
class shmem_symmetric_memory_space {
static void* allocate(size_type size);
static void* allocate(size_type size, size_type alignment);
static void deallocate(void* ptr, size_type size) noexcept;
static void* reallocate(void* ptr, size_type size);
static void* calloc(size_type count, size_type size);
};
}
Memory Window
namespace dtl::shmem {
// Create SHMEM-backed memory window
result<std::unique_ptr<shmem_memory_window_impl>>
make_shmem_window(size_type size);
result<std::unique_ptr<shmem_memory_window_impl>>
make_shmem_window(void* base, size_type size);
class shmem_memory_window_impl : public memory_window_impl {
// Properties
void* base() const noexcept override;
size_type size() const noexcept override;
bool valid() const noexcept override;
// Data transfer
result<void> put(const void* origin, size_type size,
rank_t target, size_type target_offset) override;
result<void> get(void* origin, size_type size,
rank_t target, size_type target_offset) override;
// Non-blocking transfer
result<void> async_put(const void* origin, size_type size,
rank_t target, size_type target_offset,
rma_request_handle& request) override;
result<void> async_get(void* origin, size_type size,
rank_t target, size_type target_offset,
rma_request_handle& request) override;
// Atomics
result<void> fetch_and_op(const void* origin, void* result_buf,
size_type size, rank_t target,
size_type target_offset, rma_reduce_op op) override;
result<void> compare_and_swap(const void* origin, const void* compare,
void* result_buf, size_type size,
rank_t target, size_type target_offset) override;
// Synchronization
result<void> fence(int assert_flags = 0) override;
result<void> flush(rank_t target) override;
result<void> flush_all() override;
// SHMEM-specific
void barrier();
rank_t rank() const noexcept;
rank_t num_pes() const noexcept;
};
}
RMA Adapter (Low-Level)
namespace dtl::shmem {
class shmem_rma_adapter {
// Communication
void put(rank_t target, void* dest, const void* source, size_type size);
void get(rank_t source_pe, void* dest, const void* source, size_type size);
void put_nbi(rank_t target, void* dest, const void* source, size_type size);
void get_nbi(rank_t source_pe, void* dest, const void* source, size_type size);
// Typed operations (int, long, double)
void put(rank_t target, int* dest, const int* source, size_type count);
void get(rank_t source_pe, int* dest, const int* source, size_type count);
// Atomics
int fetch_add(int* target, int value, rank_t pe);
int compare_swap(int* target, int cond, int value, rank_t pe);
int atomic_swap(int* target, int value, rank_t pe);
int atomic_fetch(const int* target, rank_t pe);
void atomic_set(int* target, int value, rank_t pe);
// Synchronization
void fence();
void quiet();
void barrier();
};
shmem_rma_adapter& global_rma_adapter();
}
Supported Operations
Data Types for Atomics
Type |
fetch_add |
compare_swap |
swap |
fetch |
set |
|---|---|---|---|---|---|
|
✅ |
✅ |
✅ |
✅ |
✅ |
|
✅ |
✅ |
✅ |
✅ |
✅ |
|
✅ |
✅ |
✅ |
✅ |
✅ |
|
✅ |
✅ |
✅ |
✅ |
✅ |
|
❌ |
❌ |
❌ |
❌ |
❌ |
|
❌ |
❌ |
❌ |
❌ |
❌ |
RMA Reduce Operations
Operation |
Description |
Supported Types |
|---|---|---|
sum |
Add to remote value |
int, long |
|
Swap/replace value |
int, long |
|
Just fetch |
int, long |
|
Maximum (collective only) |
int, long |
|
Minimum (collective only) |
int, long |
Synchronization Model
Ordering Guarantees
// fence() - Orders operations issued before it ahead of those issued after it to the same target
window.put(data1, size, target, offset1);
window.fence();
window.put(data2, size, target, offset2); // Guaranteed after data1
// flush_all() / quiet() - Ensure all operations complete
window.async_put(data, size, target, offset, req);
window.flush_all(); // All previous operations complete
// barrier() - Full synchronization across all PEs
window.barrier(); // All PEs reach this point
Comparison with MPI RMA
Feature |
MPI RMA |
SHMEM |
|---|---|---|
Window creation |
Explicit |
Implicit (symmetric memory) |
Epoch management |
Required |
Not required |
Default mode |
Active or passive |
Passive (always accessible) |
Ordering |
Via epochs |
Via fence/quiet |
Lock/unlock |
Required for passive |
No-op (always valid) |
Examples
Put/Get Communication
// Create window
auto window = dtl::shmem::make_shmem_window(1024).value();
// Initialize data
int* data = static_cast<int*>(window->base());
data[0] = rank;
window->barrier();
// PE 0 reads from PE 1
if (rank == 0) {
int remote_value;
window->get(&remote_value, sizeof(int), 1, 0);
printf("Read %d from PE 1\n", remote_value);
}
Atomic Counter
auto window = dtl::shmem::make_shmem_window(sizeof(int)).value();
int* counter = static_cast<int*>(window->base());
*counter = 0;
window->barrier();
// All PEs increment counter on PE 0
int one = 1;
int old;
window->fetch_and_op(&one, &old, sizeof(int), 0, 0, rma_reduce_op::sum);
window->flush_all();
window->barrier();
if (rank == 0) {
printf("Final count: %d\n", *counter); // Equals number of PEs
}
Compare-and-Swap Lock
int* lock = static_cast<int*>(window->base());
*lock = 0; // 0 = unlocked
window->barrier();
// Try to acquire lock
int unlocked = 0, my_id = rank + 1;
int prev;
window->compare_and_swap(&my_id, &unlocked, &prev, sizeof(int), 0, 0);
if (prev == 0) {
printf("PE %d acquired lock\n", rank);
// ... critical section ...
// Release lock
int zero = 0;
window->put(&zero, sizeof(int), 0, 0);
window->flush(0);
}
Testing
Running SHMEM Tests
# Build tests
cmake -DDTL_ENABLE_SHMEM=ON -DDTL_BUILD_INTEGRATION_TESTS=ON ..
make dtl_shmem_tests
# Run with oshrun
oshrun -np 2 ./dtl_shmem_tests
oshrun -np 4 ./dtl_shmem_tests
Test Labels
# Run all SHMEM tests via CTest (if oshrun is detected)
ctest -L shmem
# Specific test sets
ctest -R shmem_integration_2pes
ctest -R shmem_integration_4pes
Troubleshooting
Common Issues
“SHMEM not initialized”
Ensure scoped_shmem_environment is created before any SHMEM operations
Or manually call dtl::shmem::init() at program start
“Symmetric memory allocation failed”
Increase symmetric heap size:
oshrun -np 4 -e SHMEM_SYMMETRIC_SIZE=256M ./prog
Check available memory
“Invalid PE” errors
Verify target PE is in range [0, size())
Ensure consistent number of PEs across runs
Segfault on put/get
Verify memory is symmetric (allocated via shmem_malloc or make_shmem_window)
Check offset doesn’t exceed window size
Environment Variables
Variable |
Description |
Default |
|---|---|---|
SHMEM_SYMMETRIC_SIZE |
Symmetric heap size |
Implementation-specific |
|
Enable debug output |
Off |