diff --git a/m4/ax_check_hwloc.m4 b/m4/ax_check_hwloc.m4 index 8be2e8f1b26462e73f91b3edbdc7f667a7473ef0..637cac6be75545cc25392c2286ac24e39a72545b 100644 --- a/m4/ax_check_hwloc.m4 +++ b/m4/ax_check_hwloc.m4 @@ -55,16 +55,15 @@ AS_IF([test "x$with_hwloc" != xyes dnl -o "x$with_hwloc" != xno dnl -o "x$with_hwloc" != x dnl ],[ - hwlocinc="-I $with_hwloc/include" - hwloc_h="$with_hwloc/include/hwloc.h" - AS_IF([test -d $with_hwloc/lib64], - [hwloclib="-L$with_hwloc/lib64 -Wl,-rpath,$with_hwloc/lib64"], - [hwloclib="-L$with_hwloc/lib -Wl,-rpath,$with_hwloc/lib"])dnl -])dnl - -AS_IF([test "x$with_hwloc_include" != x],[ - hwlocinc="-isystem $with_hwloc_include" - hwloc_h="$with_hwloc_include/hwloc.h" + AS_IF([test -d $with_hwloc/include], [ + hwlocinc="-I $with_hwloc/include" + hwloc_h="$with_hwloc/include/hwloc.h" + ]) + AS_IF([test -d $with_hwloc/lib64], [ + hwloclib="-L$with_hwloc/lib64 -Wl,-rpath,$with_hwloc/lib64" + ], [test -d $with_hwloc/lib], [ + hwloclib="-L$with_hwloc/lib -Wl,-rpath,$with_hwloc/lib" + ])dnl ])dnl AS_IF([test "x$with_hwloc_lib" != x],[ @@ -73,7 +72,7 @@ AS_IF([test "x$with_hwloc_lib" != x],[ # This condition is satisfied even if $with_hwloc="yes" # This happens when user leaves --with-value alone -AS_IF([test "x$with_hwloc$with_hwloc_include$with_hwloc_lib" != x],[ +AS_IF([test "x$with_hwloc$with_hwloc_lib" != x],[ AC_LANG_PUSH([C++]) #tests if provided headers and libraries are usable and correct diff --git a/src/arch/smp/smpplugin.cpp b/src/arch/smp/smpplugin.cpp index a73864c7588383934a118516f0b046dcaceed5f0..9fe91392924e5a7c8ae8c55e0ed440390d41d6bf 100644 --- a/src/arch/smp/smpplugin.cpp +++ b/src/arch/smp/smpplugin.cpp @@ -59,6 +59,7 @@ nanos::PE * smpProcessorFactory ( int id, int uid ) , _smpHostCpus( 0 ) , _smpPrivateMemorySize( 256 * 1024 * 1024 ) // 256 Mb , _workersCreated( false ) + , _threadsPerCore( 0 ) , _cpuSystemMask() , _cpuProcessMask() , _cpuActiveMask() @@ -150,6 +151,10 @@ nanos::PE * smpProcessorFactory ( int id, int uid ) 
"SMP sync transfers." ); cfg.registerArgOption( "smp-sync-transfers", "smp-sync-transfers" ); cfg.registerEnvOption( "smp-sync-transfers", "NX_SMP_SYNC_TRANSFERS" ); + + cfg.registerConfigOption( "smp-threads-per-core", NEW Config::PositiveVar( _threadsPerCore ), + "Limit the number of threads per core on SMT processors." ); + cfg.registerArgOption( "smp-threads-per-core", "smp-threads-per-core" ); } void SMPPlugin::init() @@ -192,22 +197,68 @@ nanos::PE * smpProcessorFactory ( int id, int uid ) } verbose0("requested cpus: " << _requestedCPUs << " available: " << _availableCPUs << " to be used: " << _currentCPUs); - //! \note Fill _bindings vector with the active CPUs first, then the not active - _bindings.reserve( _availableCPUs ); - for ( int i = 0; i < _availableCPUs; i++ ) { - if ( _cpuProcessMask.isSet(i) ) { - _bindings.push_back(i); + std::map<int, CpuSet> bindings_list; + // If --smp-threads-per-core option is used, adjust CPUs used + if ( _threadsPerCore > 0 ) { + fatal_cond0( !sys._hwloc.isHwlocAvailable(), + "Option --smp-threads-per-core requires HWLOC" ); + + std::list<CpuSet> core_cpusets = sys._hwloc.getCoreCpusetsOf( _cpuProcessMask ); + fatal_cond0( core_cpusets.size() == 0, "Process mask [" << _cpuProcessMask << + "] does not contain a full core cpuset. 
Cannot run with --smp-threads-per-core" ); + { + // Append not owned CPUs to core list + CpuSet cpus_not_owned = _cpuSystemMask - _cpuProcessMask; + std::list<CpuSet> core_cpusets_not_owned = + sys._hwloc.getCoreCpusetsOf( cpus_not_owned ); + core_cpusets.splice( core_cpusets.end(), core_cpusets_not_owned ); } - } - for ( int i = 0; i < _availableCPUs; i++ ) { - if ( !_cpuProcessMask.isSet(i) ) { - _bindings.push_back(i); + for ( std::list<CpuSet>::iterator it = core_cpusets.begin(); + it != core_cpusets.end(); ++it) { + CpuSet& core = *it; + unsigned int nthreads = std::min<unsigned int>( _threadsPerCore, core.size() ); + unsigned int offset = core.size() / nthreads; + unsigned int steps_left = 0; + unsigned int inserted = 0; + int last_binding = -1; + // Iterate core cpuset up to nthreads and add them to the _bindings list + for ( CpuSet::const_iterator cit = core.begin(); cit != core.end(); ++cit ) { + int cpuid = *cit; + // We try to evenly distribute CPUs inside Core + if ( steps_left == 0 && inserted < nthreads) { + _bindings.push_back(cpuid); + last_binding = cpuid; + bindings_list.insert( std::pair<int, CpuSet>(last_binding, CpuSet(cpuid)) ); + steps_left += offset - 1; + ++inserted; + } else { + --steps_left; + bindings_list[last_binding].set(cpuid); + } + } + } + _availableCPUs = _bindings.size(); + } else { + + //! \note Fill _bindings vector with the active CPUs first, then the not active + _bindings.reserve( _availableCPUs ); + for ( int i = 0; i < _availableCPUs; i++ ) { + if ( _cpuProcessMask.isSet(i) ) { + _bindings.push_back(i); + bindings_list.insert( std::pair<int, CpuSet>(i, CpuSet(i)) ); + } + } + for ( int i = 0; i < _availableCPUs; i++ ) { + if ( !_cpuProcessMask.isSet(i) ) { + _bindings.push_back(i); + bindings_list.insert( std::pair<int, CpuSet>(i, CpuSet(i)) ); + } } } //! 
\note Load & check NUMA config (_cpus vectors must be created before) _cpus = NEW std::vector<SMPProcessor *>( _availableCPUs, (SMPProcessor *) NULL ); - _cpusByCpuId = NEW std::vector<SMPProcessor *>( _availableCPUs, (SMPProcessor *) NULL ); + _cpusByCpuId = NEW std::map<int, SMPProcessor *>(); loadNUMAInfo(); @@ -234,37 +285,42 @@ nanos::PE * smpProcessorFactory ( int id, int uid ) //! \note Create the SMPProcessors in _cpus array int count = 0; for ( std::vector<int>::iterator it = _bindings.begin(); it != _bindings.end(); it++ ) { + int cpuid = *it; SMPProcessor *cpu; - bool active = ( (count < _currentCPUs) && _cpuProcessMask.isSet(*it) ); + bool active = (count < _currentCPUs) && _cpuProcessMask.isSet(cpuid); unsigned numaNode; // If this PE can't be seen by hwloc (weird case in Altix 2, for instance) - if ( !sys._hwloc.isCpuAvailable( *it ) ) { /* There's a problem: we can't query it's numa node. Let's give it 0 (ticket #1090), consider throwing a warning */ numaNode = 0; } else - numaNode = getNodeOfPE( *it ); + if ( !sys._hwloc.isCpuAvailable( cpuid ) ) { /* There's a problem: we can't query it's numa node. Let's give it 0 (ticket #1090), consider throwing a warning */ numaNode = 0; } else + numaNode = getNodeOfPE( cpuid ); unsigned socket = numaNode; /* FIXME: socket */ + memory_space_id_t id; if ( _smpPrivateMemory && count >= _smpHostCpus && !_memkindSupport ) { OSAllocator a; - memory_space_id_t id = sys.addSeparateMemoryAddressSpace( ext::getSMPDevice(), _smpAllocWide, sys.getRegionCacheSlabSize() ); + id = sys.addSeparateMemoryAddressSpace( ext::getSMPDevice(), + _smpAllocWide, sys.getRegionCacheSlabSize() ); SeparateMemoryAddressSpace &numaMem = sys.getSeparateMemory( id ); numaMem.setSpecificData( NEW SimpleAllocator( ( uintptr_t ) a.allocate(_smpPrivateMemorySize), _smpPrivateMemorySize ) ); numaMem.setAcceleratorNumber( sys.getNewAcceleratorId() ); - cpu = NEW SMPProcessor( *it, id, active, numaNode, socket ); } else { - - cpu = NEW SMPProcessor( *it, mem_id, active, numaNode, socket ); + id = mem_id; } + + // Create SMPProcessor object + cpu = NEW SMPProcessor( cpuid, bindings_list[cpuid], id, active, numaNode, socket ); + if ( active ) 
{ _cpuActiveMask.set( cpu->getBindingId() ); } //cpu->setNUMANode( getNodeOfPE( cpu->getId() ) ); (*_cpus)[count] = cpu; - (*_cpusByCpuId)[ *it ] = cpu; + (*_cpusByCpuId)[cpuid] = cpu; count += 1; } @@ -785,7 +841,7 @@ nanos::PE * smpProcessorFactory ( int id, int uid ) void SMPPlugin::updateCpuStatus( int cpuid ) { - SMPProcessor *cpu = _cpusByCpuId->at(cpuid); + SMPProcessor *cpu = (*_cpusByCpuId)[cpuid]; if ( cpu->getRunningThreads() > 0 ) { _cpuActiveMask.set( cpuid ); } else { @@ -951,31 +1007,36 @@ nanos::PE * smpProcessorFactory ( int id, int uid ) // Initialize rank limits with the first cpu in the list int a0 = -1, aN = -1, i0 = -1, iN = -1; - SMPProcessor *first_cpu = _cpusByCpuId->front(); + SMPProcessor *first_cpu = _cpusByCpuId->begin()->second; first_cpu->isActive() ? a0 = first_cpu->getBindingId() : i0 = first_cpu->getBindingId(); // Iterate through begin+1..end - for ( std::vector<SMPProcessor *>::iterator curr = _cpusByCpuId->begin()+1; curr != _cpusByCpuId->end(); curr++ ) { + std::map<int, SMPProcessor *>::iterator prev_it = _cpusByCpuId->begin(); /* begin() */ + std::map<int, SMPProcessor *>::iterator curr_it = prev_it; ++curr_it; /* begin() + 1 */ + while ( curr_it != _cpusByCpuId->end() ) { /* Detect whether there is a state change (a->i/i->a). If so, * close the rank and start a new one. If it's the last iteration * close it anyway. */ - std::vector<SMPProcessor *>::iterator prev = curr-1; - if ( (*curr)->isActive() && !(*prev)->isActive() ) { + SMPProcessor *prev = prev_it->second; + SMPProcessor *curr = curr_it->second; + if ( curr->isActive() && !prev->isActive() ) { // change, i->a - iN = (*prev)->getBindingId(); - a0 = (*curr)->getBindingId(); + iN = prev->getBindingId(); + a0 = curr->getBindingId(); ( i0 != iN ) ? 
i << i0 << "-" << iN << ", " : i << i0 << ", "; - } else if ( !(*curr)->isActive() && (*prev)->isActive() ) { + } else if ( !curr->isActive() && prev->isActive() ) { // change, a->i - aN = (*prev)->getBindingId(); - i0 = (*curr)->getBindingId(); + aN = prev->getBindingId(); + i0 = curr->getBindingId(); ( a0 != aN ) ? a << a0 << "-" << aN << ", " : a << a0 << ", "; } + ++prev_it; + ++curr_it; } // close ranks and append strings according to the last cpu - SMPProcessor *last_cpu = _cpusByCpuId->back(); + SMPProcessor *last_cpu = prev_it->second; if ( last_cpu->isActive() ) { aN = last_cpu->getBindingId(); ( a0 != aN ) ? a << a0 << "-" << aN << ", " : a << a0 << ", "; diff --git a/src/arch/smp/smpplugin_decl.hpp b/src/arch/smp/smpplugin_decl.hpp index e879527a1f0a59cb13b3111ee5129cec4cbf9698..076c1d72135d08e95f5eb2726e5f45a8d86b3e81 100644 --- a/src/arch/smp/smpplugin_decl.hpp +++ b/src/arch/smp/smpplugin_decl.hpp @@ -54,7 +54,7 @@ class SMPPlugin : public SMPBasePlugin int _currentCPUs; int _requestedWorkers; std::vector<SMPProcessor *> *_cpus; - std::vector<SMPProcessor *> *_cpusByCpuId; + std::map<int, SMPProcessor *> *_cpusByCpuId; std::vector<SMPThread *> _workers; int _bindingStart; int _bindingStride; @@ -64,6 +64,7 @@ class SMPPlugin : public SMPBasePlugin int _smpHostCpus; std::size_t _smpPrivateMemorySize; bool _workersCreated; + int _threadsPerCore; // Nanos++ scheduling domain CpuSet _cpuSystemMask; /*!< \brief system's default cpu_set */ diff --git a/src/arch/smp/smpprocessor.cpp b/src/arch/smp/smpprocessor.cpp index 84f2163f12156728a71da5dbf5d23bd75ded8a73..c68a9bc0d34eaae6af4d58f0c9a1876057becb86 100644 --- a/src/arch/smp/smpprocessor.cpp +++ b/src/arch/smp/smpprocessor.cpp @@ -35,9 +35,11 @@ size_t SMPProcessor::_threadsStackSize = 0; System::CachePolicyType SMPProcessor::_cachePolicy = System::DEFAULT; size_t SMPProcessor::_cacheDefaultSize = 1048580; -SMPProcessor::SMPProcessor( int bindingId, memory_space_id_t memId, bool active, unsigned int numaNode, unsigned int socket ) : +SMPProcessor::SMPProcessor( int 
bindingId, const CpuSet& bindingList, + memory_space_id_t memId, bool active, unsigned int numaNode, unsigned int socket ) : PE( &getSMPDevice(), memId, 0 /* always local node */, numaNode, true, socket, true ), - _bindingId( bindingId ), _reserved( false ), _active( active ), _futureThreads( 0 ) {} + _bindingId( bindingId ), _bindingList( bindingList ), + _reserved( false ), _active( active ), _futureThreads( 0 ) {} void SMPProcessor::prepareConfig ( Config &config ) { @@ -134,4 +136,3 @@ void SMPProcessor::setNumFutureThreads( unsigned int nthreads ) { unsigned int SMPProcessor::getNumFutureThreads() const { return _futureThreads; } - diff --git a/src/arch/smp/smpprocessor.hpp b/src/arch/smp/smpprocessor.hpp index 212666ab3c08c4976369cffc671cbae966631652..91ab7628f429c8717175ed8891ae414742bbcb41 100644 --- a/src/arch/smp/smpprocessor.hpp +++ b/src/arch/smp/smpprocessor.hpp @@ -43,6 +43,7 @@ namespace ext { static size_t _cacheDefaultSize; static System::CachePolicyType _cachePolicy; unsigned int _bindingId; + CpuSet _bindingList; bool _reserved; bool _active; unsigned int _futureThreads; @@ -54,9 +55,11 @@ namespace ext { public: // constructors - SMPProcessor( int bindingId, memory_space_id_t numMemId, bool active, unsigned int numaNode, unsigned int socket ); + SMPProcessor( int bindingId, const CpuSet& bindingList, memory_space_id_t numMemId, + bool active, unsigned int numaNode, unsigned int socket ); unsigned int getBindingId() const { return _bindingId; } + const CpuSet& getBindingList() const { return _bindingList; } virtual ~SMPProcessor() {} diff --git a/src/arch/unix-os/cpuset.hpp b/src/arch/unix-os/cpuset.hpp index 3dd6748e5c3d45d634043d8db1521bb605d2bfca..81e044f5925cc09b42b32f1cbb6b37f99efd84ce 100644 --- a/src/arch/unix-os/cpuset.hpp +++ b/src/arch/unix-os/cpuset.hpp @@ -67,6 +67,8 @@ class CpuSet // Default constructor CpuSet(): _mask() {} + CpuSet( int cpuid ): _mask() { set(cpuid); } + // Destructor ~CpuSet() {} @@ -105,6 +107,7 @@ class CpuSet 
friend CpuSet operator&( const CpuSet& lhs, const CpuSet& rhs ); friend CpuSet operator+( const CpuSet& lhs, const CpuSet& rhs ); friend CpuSet operator*( const CpuSet& lhs, const CpuSet& rhs ); + friend CpuSet operator-( const CpuSet& lhs, const CpuSet& rhs ); friend bool operator==( const CpuSet& lhs, const CpuSet& rhs ); friend bool operator!=( const CpuSet& lhs, const CpuSet& rhs ); @@ -235,6 +238,14 @@ inline CpuSet operator*( const CpuSet& lhs, const CpuSet& rhs ) return lhs & rhs; } +inline CpuSet operator-( const CpuSet& lhs, const CpuSet& rhs ) +{ + CpuSet result; + CPU_XOR( &result._mask, &lhs._mask, &rhs._mask ); + CPU_AND( &result._mask, &lhs._mask, &result._mask ); + return result; +} + inline bool operator==( const CpuSet& lhs, const CpuSet& rhs ) { return CPU_EQUAL( &lhs._mask, &rhs._mask ); diff --git a/src/arch/unix-os/pthread.cpp b/src/arch/unix-os/pthread.cpp index 46d0ff6e0afe4f9e79f007fe255de83a4c42b874..af277ebc5c31e21f4333c202461bbfe973e458de 100644 --- a/src/arch/unix-os/pthread.cpp +++ b/src/arch/unix-os/pthread.cpp @@ -113,12 +113,9 @@ void PThread::join () void PThread::bind() { - int cpu_id = _core->getBindingId(); - cpu_set_t cpu_set; - CPU_ZERO( &cpu_set ); - CPU_SET( cpu_id, &cpu_set ); - verbose( "Binding thread " << getMyThreadSafe()->getId() << " to cpu " << cpu_id ); - pthread_setaffinity_np( _pth, sizeof(cpu_set_t), &cpu_set ); + const CpuSet& binding = _core->getBindingList(); + verbose( "Binding thread " << getMyThreadSafe()->getId() << " to cpus [" << binding << "]" ); + pthread_setaffinity_np( _pth, sizeof(cpu_set_t), binding.get_cpu_set_pointer() ); NANOS_INSTRUMENT ( static InstrumentationDictionary *ID = sys.getInstrumentation()->getInstrumentationDictionary(); ) NANOS_INSTRUMENT ( static nanos_event_key_t cpuid_key = ID->getEventKey("cpuid"); ) @@ -129,7 +126,7 @@ void PThread::bind() NANOS_INSTRUMENT ( keys[1] = numa_key ) NANOS_INSTRUMENT ( nanos_event_value_t values[2]; ) - NANOS_INSTRUMENT ( values[0] = 
(nanos_event_value_t) cpu_id + 1; ) + NANOS_INSTRUMENT ( values[0] = (nanos_event_value_t) _core->getBindingId() + 1; ) NANOS_INSTRUMENT ( values[1] = (nanos_event_value_t) _core->getNumaNode() + 1; ) NANOS_INSTRUMENT ( sys.getInstrumentation()->raisePointEvents(2, keys, values); ) } diff --git a/src/support/hwloc.cpp b/src/support/hwloc.cpp index 2ffe6d841ba366bf1e3060cdb9867a653aac2c20..82f9bdb368161968d634586280d1e162bb14c85a 100644 --- a/src/support/hwloc.cpp +++ b/src/support/hwloc.cpp @@ -21,6 +21,7 @@ #include "debug.hpp" #ifdef HWLOC + #include <hwloc/glibc-sched.h> #ifdef GPU_DEV #include <hwloc/cudart.h> #endif @@ -70,8 +71,10 @@ void Hwloc::loadHwloc () fatal_cond0( res != 0, "Could not load hwloc topology xml file." ); } +#if (HWLOC_API_VERSION >> 16) == 1 // Enable GPU detection hwloc_topology_set_flags( _hwlocTopology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES ); +#endif // Perform the topology detection. hwloc_topology_load( _hwlocTopology ); @@ -174,4 +177,42 @@ bool Hwloc::isCpuAvailable( unsigned int cpu ) const return hwloc_get_pu_obj_by_os_index( _hwlocTopology, cpu ) != NULL; #endif } + +CpuSet Hwloc::getCoreCpusetOf( unsigned int cpu ) + +{ + CpuSet core_cpuset; +#ifdef HWLOC + hwloc_obj_t pu = hwloc_get_pu_obj_by_os_index( _hwlocTopology, cpu ); + hwloc_obj_t core = hwloc_get_ancestor_obj_by_type( _hwlocTopology, HWLOC_OBJ_CORE, pu ); + hwloc_cpuset_to_glibc_sched_affinity( _hwlocTopology, core->cpuset, + core_cpuset.get_cpu_set_pointer(), sizeof(cpu_set_t)); +#endif + return core_cpuset; +} + +std::list<CpuSet> Hwloc::getCoreCpusetsOf( const CpuSet& parent ) +{ + std::list<CpuSet> core_cpusets; +#ifdef HWLOC + // Convert parent cpuset to hwlocset + hwloc_cpuset_t hwlocset = hwloc_bitmap_alloc(); + hwloc_cpuset_from_glibc_sched_affinity( _hwlocTopology, hwlocset, + parent.get_cpu_set_pointer(), sizeof(cpu_set_t)); + + // Iterate cores inside parent cpuset + hwloc_obj_t core = NULL; + while ( (core = hwloc_get_next_obj_inside_cpuset_by_type( + _hwlocTopology, hwlocset, HWLOC_OBJ_CORE, core)) != NULL ) { + 
CpuSet core_cpuset; + hwloc_cpuset_to_glibc_sched_affinity( _hwlocTopology, core->cpuset, + core_cpuset.get_cpu_set_pointer(), sizeof(cpu_set_t)); + // Append core cpuset to list + core_cpusets.push_back( core_cpuset ); + } + + hwloc_bitmap_free(hwlocset); +#endif + return core_cpusets; +} + } diff --git a/src/support/hwloc_decl.hpp b/src/support/hwloc_decl.hpp index c23c6fe51d973e14264ede97ecac8d9efba09a05..86adcdce03b1b273b976f8bf43949a9968ef6fb4 100644 --- a/src/support/hwloc_decl.hpp +++ b/src/support/hwloc_decl.hpp @@ -23,6 +23,7 @@ #include <string> #include <list> +#include "cpuset.hpp" #ifdef HWLOC #include <hwloc.h> @@ -63,6 +64,8 @@ class Hwloc { */ bool isCpuAvailable( unsigned int cpu ) const; + CpuSet getCoreCpusetOf( unsigned int cpu ); + std::list<CpuSet> getCoreCpusetsOf( const CpuSet& parent ); }; } // namespace nanos diff --git a/tests/test/00_arch/unix-os/cpuset_00.cpp b/tests/test/00_arch/unix-os/cpuset_00.cpp index 4fd0cb419755646829e877117f469d3c07b0aa5c..d76da51b25fbb286c4ee65406406fb794908a199 100644 --- a/tests/test/00_arch/unix-os/cpuset_00.cpp +++ b/tests/test/00_arch/unix-os/cpuset_00.cpp @@ -57,6 +57,7 @@ int main(int argc, char *argv[]) assert((set1 + set2).size()==4); /* 1111 */ assert((set1 & set2).size()==1); /* 0001 */ assert((set1 * set2).size()==1); /* 0001 */ + assert((set1 - set2).size()==2); /* 0110 */ // Compound assignment operators set2 = set1; /* 0111 */