
NCCL 使用 bootstrapInit 实现控制环形拓扑的协商和建立。调用链：ncclCommInitRank → ncclCommInitRankDev → ncclCommInitRankFunc → bootstrapInit(int nHandles, void* handles, struct ncclComm* comm)。在 NCCL 控制环初始化阶段：每个 MPI 进程创建一个 TCP 监听 socket1，这个 socket 只允许邻居节点的连接，并交换必要的连接信息；每个 MPI 进程创建第二个 TCP 监听 socket2，这个 socket 只允许 MPI 根进程的连接，用于传递 MPI 邻居信息；每个 MPI 进程分别向 root MPI 进程发送自身的连接信息，root 节点通过校验 UniqueId（其实就是 UniqueId 里面的 magic）来决定是否接受连接；通过 MPI root 进程的 bootstrapRoot 函数逻辑，每个 MPI 进程分别从 root MPI 进程获得自己的后继节点信息（MPI root 进程通过连接每个 MPI 进程的 socket2 发送）；每个 MPI 进程向自己的后继节点发起主动连接（指向后继节点的 socket1），并接受自己前驱节点的连接。到此，每个 rank 已经和下一个 rank 建立了直接连接，形成一个控制逻辑环；每个 MPI 进程分别再创建 3 个监听 socket，分别是 peerProxyAddresses、peerProxyAddressesUDS 和 peerP2pAddresses。peerProxyAddresses：NCCL 使用代理线程（proxy thread）来处理一些异步或需要 CPU 参与的任务，例如注册/注销 GPU 内存（尤其在 IB/RDMA 场景）、处理共享内存（shm）或 CUDA IPC、管理连接生命周期。每个 rank 启动时会创建一个本地 proxy 服务线程，并监听一个 TCP 端口；其他 rank 若需请求该 rank 的 proxy 服务（如 “请帮我注册这块 GPU 内存”），就连接到 socket peerProxyAddresses。peerProxyAddressesUDS：当多个 rank 运行在同一台机器时，可以通过 peerProxyAddressesUDS 来加速通信。peerP2pAddresses：用于 rank 之间建立直接的点对点（P2P）连接，实现任意两个 rank 能直接通信，例如 ncclSend / ncclRecv 点对点通信 API。每个 MPI 进程通过 AllGather 算法在建立的控制环形拓扑上广播交换 3 个 socket（peerProxyAddresses、peerProxyAddressesUDS 和 peerP2pAddresses）的地址信息，使得每个 rank 上都知道其它任意 rank 的 3 个 socket 通信地址。ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) {······} else {// create socket for ring neightbor to contact meeNCCLCHECK(createListenSocket(comm, comm->magic, STATE_LISTEN(state, socket), &info.connectInfo.addr, ncclSocketTypeBootstrap));}// Create socket for root to contact me using the root's magicNCCLCHECK(createListenSocket(comm, BOOTSTRAP_HANDLE(handles, curr_root)->magic, &listenSockRoot, &info.listenRootAddress, ncclSocketTypeBootstrap));······NCCLCHECK(sendToRoot(BOOTSTRAP_HANDLE(handles, curr_root), comm, &info));// get info on my "next" rank in the bootstrap ring from rootNCCLCHECK(ncclSocketInit(&sock));NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot));NCCLCHECK(socketRecv(&sock, &nextPeer, sizeof(nextPeer)));······// accept and connect the 
ring networkNCCLCHECK(socketRingConnect(nextPeer.addr, STATE_RING(state, socket.send), STATE_LISTEN(state, socket), STATE_RING(state, socket.recv), comm-magic, state-abortFlag));······// AllGather all listen handlers// in case of failure, those resources will be freed when calling bootstrapDestroy, so we can return immediatlyNCCLCHECK(ncclCalloc(state-peerProxyAddresses, nranks));NCCLCHECKGOTO(createListenSocket(comm, comm-magic, proxySocket, state-peerProxyAddresses rank, ncclSocketTypeProxy), result, fail);NCCLCHECKGOTO(ncclCalloc(state-peerProxyAddressesUDS, nranks), result, fail);NCCLCHECKGOTO(getUDS(state-peerProxyAddressesUDS rank), result, fail);// Create the service proxy and get the UDSNCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state-peerProxyAddresses, state-peerProxyAddressesUDS), result, fail);// create a socket for others to reach out (P2P)union ncclSocketAddress peerSocketAddress;NCCLCHECKGOTO(createListenSocket(comm, comm-magic, STATE_LISTEN(state, peerSocket), peerSocketAddress, ncclSocketTypeBootstrap), result, fail);NCCLCHECKGOTO(ncclCalloc(state-peerP2pAddresses, nranks), result, fail);memcpy(state-peerP2pAddresses rank, peerSocketAddress, sizeof(union ncclSocketAddress));NCCLCHECKGOTO(ringAllInfo(comm, state, state-peerP2pAddresses, state-peerProxyAddresses, state-peerProxyAddressesUDS, rasRanks), result, fail);······}上述逻辑在 rank0 中对应如下日志bootstrapInit:656 NCCL TRACE rank 0 nranks 4ncclSocketListen:421 NCCL TRACE Listening on socket 10.10.10.1160473ncclSocketListen:421 NCCL TRACE Listening on socket 10.10.10.1153543ncclSocketConnect:718 NCCL TRACE Connecting to socket 10.10.10.1157865ncclSocketConnect:718 NCCL TRACE Connecting to socket 10.10.10.1159361ncclSocketListen:421 NCCL TRACE Listening on socket 10.10.10.1135707ncclSocketListen:421 NCCL TRACE Listening on socket 10.10.10.1155235ncclIpcSocketInit:51 NCCL TRACE UDS: Creating socket /tmp/nccl-socket-0-c5b60c2e5e7a9a0cbootstrapAllGather:1074 NCCL TRACE rank 0 nranks 4 size 120 - 
AllGathersocketRingAllGather:1032 NCCL TRACE socketRingAllGather started: rank0 nranks4socketRingAllGather:1034 NCCL TRACE bidirectional bootstrap: totalSteps2socketRingAllGather:1064 NCCL TRACE socketRingAllGather first message in 0.000115 (1.039933 MB/sec), rest in 0.000017 (21.719457 MB/sec)bootstrapAllGather:1085 NCCL TRACE bootstrapAllGather for 120 B done in 0.000141 sec: 3.404569 MB/secbootstrapAllGather:1086 NCCL TRACE rank 0 nranks 4 size 120 - AllGather DONEbootstrapInit:776 NCCL TRACE rank 0 nranks 4 - DONE以上过程可总结为如下图所示NCCL传输层初始化NCCL 使用 initTransportsRank 实现传输层初始化。ncclCommInitRankncclCommInitRankDevncclCommInitRankFuncinitTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT])控制通道交换邻居信息交换信息有如下定义例如 NCCL 版本主机唯一标识hostHashGPU 计算能力cudaCompCap等等struct ncclPeerInfo {int rank;int cudaDev;int nvmlDev;int gdrSupport;uint64_t hostHash;uint64_t pidHash;dev_t shmDev;int64_t busId;struct ncclComm* comm;int cudaCompCap;size_t totalGlobalMem;// MNNVL supportnvmlGpuFabricInfoV_t fabricInfo;int cuMemSupport;int version;};通过算法AllGather在控制环广播。最终每个 rank 都将包含全局邻居信息NCCLCHECKGOTO(ncclCalloc(comm-peerInfo, nranks1), ret, fail); // Extra rank to represent CollNet rootNCCLCHECKGOTO(fillInfo(comm, comm-peerInforank, comm-commHash), ret, fail);NCCLCHECKGOTO(bootstrapAllGather(comm-bootstrap, comm-peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);版本一致性校验与全局属性推导做一些基础检查NCCL 版本一致性检查只有全局版本一致才允许继续协商根据每个 rank 的 hostHash 值推导物理机节点数GPU 绑定重复检测。同一物理机上不能有两个 rank 绑定到同一个 GPU只能是一个 rank 绑定一个 GPUfor (int i 0; i nranks; i) {if (comm-peerInfo[i].version ! comm-peerInfo[rank].version) {......goto fail;}if (comm-peerInfo[i].hostHash ! comm-peerInfo[rank].hostHash)nNodes;if ((i ! 
rank) (comm-peerInfo[i].hostHash comm-peerInfo[rank].hostHash) (comm-peerInfo[i].busId comm-peerInfo[rank].busId)) {......goto fail;}}同进程rank计算NCCL 支持多种运行模式例如一个进程绑定一个 GPUOne Rank Per ProcessORPP一个进程绑定多个 GPUSingle Process Multiple RanksSPMR一个进程通过多线程绑定多个 GPU 等模式。MPI / PyTorch DDP / DeepSpeed 等分布式深度学习框架通常使用一个进程绑定一个 GPU即 1 process - 1 rank - 1 GPU 的模式。但是如果不是 ORPP 模式则需要涉及如下代码逻辑推导多个 rank 与对应进程的关系。 非 ORPP 模式的一些优势共享虚拟地址空间可直接访问彼此的 CPU 内存同进程的 rank集合通信使用的 ring / tree 拓扑会优先使用 Shared Memory (SHM) 或 GPU directP2P进行高效通信只有跨节点时才会使用 IB / ROCE 通信do {// Compute intra-process ranksint intraProcRank0 -1, intraProcRank -1, intraProcRanks 0;comm-nvlsRegSupport 1;for (int i 0; i nranks; i) {if ((comm-peerInfo[i].hostHash comm-peerInfo[rank].hostHash) (comm-peerInfo[i].pidHash comm-peerInfo[rank].pidHash)) {// Rank is in same processif (intraProcRanks 0)intraProcRank0 i;if (i rank)intraProcRank intraProcRanks;intraProcRanks;if (intraProcRank0 rank rank ! i) {comm-peerInfo[i].comm-intraNext comm-intraNext;comm-intraNext comm-peerInfo[i].comm;}}}comm-intraComm0 comm0;comm-intraRank intraProcRank; // 当前 rank 在同进程中的序号0,1,2...comm-intraRanks intraProcRanks; // 同进程中的总 rank 数} while(0);NCCL推导PCIe设备链路拓扑目的是先构建一个有层次关系的全局 XML 拓扑里面记录了 NCCL 需要使用的硬件信息。ncclCommInitRankncclCommInitRankDevncclCommInitRankFuncinitTransportsRankncclTopoGetSystemncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode)NCCL 使用函数 ncclTopoGetSystem 构建一个描述整机关键硬件拓扑的 XML 树拓扑将用于后续 NCCL 通信路径的构造。XML 拓扑将包含如下信息所有 GPU所有 NICCPU 归属的 NUMA 节点体现 PCIe 设备之间的层级结构PCIe 基础介绍每个 CPU 都有自己直连的 Root Complex简称 RCRC 会帮助 CPU 和其他组件通信比如 CPU 和内存CPU 和 PCIe 系统PCIe Switch 的作用是扩展 PCIe 端口下游可以连接 PCIe 设备或者其它 PCIe Switch上游来的请求将被 PCIe Switch 转发PCIe 设备可以连在 RC也可以连在 PCIe Switch假设有如下 PCIe 信息$ lspci -tv--[0000:ff]--00.0 Intel Corporation Device 324c| -00.1 Intel Corporation Device 324c-[0000:f2]--00.0 Intel Corporation Ice Lake Memory Map/VT-d| -00.1 Intel Corporation Ice Lake Mesh 2 PCIe-[0000:d0]--00.0 Intel Corporation Ice Lake Memory Map/VT-d| 
-00.1 Intel Corporation Ice Lake Mesh 2 PCIe| -00.2 Intel Corporation Ice Lake RAS| -00.4 Intel Corporation Device 0b23| \-01.0-[d1-d9]----00.0-[d2-d9]---00.0-[d3]----00.0 NVIDIA Corporation AD102GL [L20]| -01.0-[d4]----00.0 NVIDIA Corporation AD102GL [L20]| -02.0-[d5]----00.0 NVIDIA Corporation AD102GL [L20]| -03.0-[d6]----00.0 NVIDIA Corporation AD102GL [L20]| -04.0-[d7]----00.0 Broadcom / LSI Virtual PCIe Placeholder Endpoint| -05.0-[d8]----00.0 Broadcom / LSI Virtual PCIe Placeholder Endpoint| \-1f.0-[d9]----00.0 Broadcom / LSI PCIe Switch management endpoint-[0000:95]--00.0 Intel Corporation Ice Lake Memory Map/VT-d| -00.1 Intel Corporation Ice Lake Mesh 2 PCIe| -00.2 Intel Corporation Ice Lake RAS| -00.4 Intel Corporation Device 0b23| \-01.0-[96]---00.0 Mellanox Technologies MT27800 Family [ConnectX-5]| \-00.1 Mellanox Technologies MT27800 Family [ConnectX-5]GPU 的完整链路信息如下0000:d3:00.0 - ../../../devices/pci0000:d0/0000:d0:01.0/0000:d1:00.0/0000:d2:00.0/0000:d3:00.00000:d4:00.0 - ../../../devices/pci0000:d0/0000:d0:01.0/0000:d1:00.0/0000:d2:01.0/0000:d4:00.00000:d5:00.0 - ../../../devices/pci0000:d0/0000:d0:01.0/0000:d1:00.0/0000:d2:02.0/0000:d5:00.00000:d6:00.0 - ../../../devices/pci0000:d0/0000:d0:01.0/0000:d1:00.0/0000:d2:03.0/0000:d6:00.0NIC 的完整链路信息如下0000:96:00.0 - ../../../devices/pci0000:95/0000:95:01.0/0000:96:00.00000:96:00.1 - ../../../devices/pci0000:95/0000:95:01.0/0000:96:00.1得到信息四个 GPU 都在相同 Root Complexpci0000:d0 下面0000:d0:01.0 为 Root Complex 的 PCIe Root Port0000:d1:00.0 / 0000:d2:01.0 为某个 PCIe Switch 的上游端口和下游端口0000:d3:00.00000:d4:00.00000:d5:00.00000:d6:00.0 四个 GPU 分别挂接在上游的 PCIe Switch 上拓扑结构可表示为CPU → Root Complex → Root Port → PCIe Switch Upstream → PCIe Switch Downstream → GPU。NCCL 让每个 rank 分别从自身绑定的 GPU 开始通过 PCIe 路径逐步向上逆向推导从而获得自身 GPU 在整个 PCIe 链路上的路径信息每个 rank 只探测自身的 PCIe 链路信息。ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) {......// 向 xml 中添加当前 GPU nodeNCCLCHECK(ncclTopoGetPciNode(xml, busId, 
&node));......// 当前 GPU PCIe 链路信息推导NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));......}其中，ncclTopoGetXmlFromSys 的实现可简略为：ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) {......NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "vendor", &index), NCCL_GRAPH);if (index == -1) {if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"), NCCL_GRAPH);}......struct ncclXmlNode* parent = pciNode->parent;if (parent == NULL) {if (path) {// Save that for later in case next step is a CPUNCCLCHECK(ncclTopoGetStrFromSys(path, "numa_node", numaIdStr));......// Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI// switch, or stop if we reach a CPU root complex.for (parentOffset = strlen(path)-1; parentOffset > 0; parentOffset--) {if (path[parentOffset] == '/') {slashCount++;path[parentOffset] = '\0';int start = parentOffset - 1;while (start > 0 && path[start] != '/') start--;// Check whether the parent path looks like "BBBB:BB:DD.F" or not.if (checkBDFFormat(path+start+1) == 0) {// This a CPU root complex. 
Create a CPU tag and stop there.struct ncclXmlNode* topNode;NCCLCHECK(xmlFindTag(xml, system, topNode));NCCLCHECK(xmlGetSubKv(topNode, cpu, parent, numaid, numaIdStr));......}} else if (slashCount 2) {// Continue on the upper PCI switchfor (int i strlen(path)-1; i0; i--) {if (path[i] /) {NCCLCHECK(xmlFindTagKv(xml, pci, parent, busid, pathi1));......}}}}......}} else {......}pciNode-parent parent;......if (strcmp(parent-name, pci) 0) {NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));} else if (strcmp(parent-name, cpu) 0) {NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml));}......}算法思想为逐步向上逆向推导从而获得 GPU 在整个 PCIe 链路上的硬件信息。算法总结如下先获取当前 rank 绑定的 GPU Bus ID在 xml 中添加一个 pci 节点调用ncclTopoGetXmlFromSys实现向上逐级推导。获取当前 GPU 的父节点即通过 GPU 的 PCIe Path 推导得到 GPU0000:d3:00.0 的父节点为 0000:d2:00.0PCIe Switch 有上游和下游端口。因此推导至 PCIe Switch 的上游端口 Bus ID0000:d1:00.0。判断该 Bus ID 不是 CPU root complex此时 slashCount 2因此将该 ID 作为一个 pci 节点并添加到 xml 中递归调用ncclTopoGetXmlFromSys从 PCIe Switch 上游端口 Bus ID0000:d1:00.0 再次向上推导重复上面的步骤推导至 Bus IDpci0000:d0发现是一个 CPU root complex向 xml 中添加该 CPU 节点并填充 CPU 相关信息例如 numaid arch等。最后填充 当前 GPU 相关信息例如 rank IDsmgdr 等信息ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) {......// 向 xml 中添加当前 GPU 相关信息NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));}NIC 推导也类似 GPU 的推导过程。但是 NIC 是系统级资源并不绑定某个 rank每个 rank 都会枚举自己可见的 NIC 资源。由于每个 rank 只推导自己绑定的 GPU全局视图需要汇总。因此需要通过 NCCL 控制通道在物理设备内部通过bootstrapIntraNodeAllGather实现 XML 信息融合和去重。经过汇总后前面介绍的 PCIe 链路信息可以构造出可视化的下列 xml 视图在我们的虚拟机实验环境上所有设备都挂在同一个 Root Complex 下面且没有 PCIe Switch 参与是最简单的拓扑。$ lspci -tv-[0000:00]--00.0 Intel Corporation 440FX - 82441FX PMC [Natoma]-01.0 Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II]-01.1 Intel Corporation 82371SB PIIX3 IDE [Natoma/Triton II]-01.2 Intel Corporation 82371SB PIIX3 USB [Natoma/Triton II]-01.3 Intel Corporation 82371AB/EB/MB PIIX4 ACPI-02.0 Cirrus Logic GD 5446-03.0 Red Hat, Inc. Virtio network device-04.0 Red Hat, Inc. Virtio block device-05.0 Red Hat, Inc. 
Virtio memory balloon-06.0 Red Hat, Inc. Virtio file system-07.0-[01]---08.0 NVIDIA Corporation AD102GL [L20]-09.0 NVIDIA Corporation AD102GL [L20]-0a.0 Mellanox Technologies MT27800 Family [ConnectX-5]\-0b.0 Mellanox Technologies MT27800 Family [ConnectX-5]$ ll /sys/bus/pci/devices/0000:00:00.0 - ../../../devices/pci0000:00/0000:00:00.0/0000:00:01.0 - ../../../devices/pci0000:00/0000:00:01.0/0000:00:01.1 - ../../../devices/pci0000:00/0000:00:01.1/0000:00:01.2 - ../../../devices/pci0000:00/0000:00:01.2/0000:00:01.3 - ../../../devices/pci0000:00/0000:00:01.3/0000:00:02.0 - ../../../devices/pci0000:00/0000:00:02.0/0000:00:03.0 - ../../../devices/pci0000:00/0000:00:03.0/0000:00:04.0 - ../../../devices/pci0000:00/0000:00:04.0/0000:00:05.0 - ../../../devices/pci0000:00/0000:00:05.0/0000:00:06.0 - ../../../devices/pci0000:00/0000:00:06.0/0000:00:07.0 - ../../../devices/pci0000:00/0000:00:07.0/0000:00:08.0 - ../../../devices/pci0000:00/0000:00:08.0/0000:00:09.0 - ../../../devices/pci0000:00/0000:00:09.0/0000:00:0a.0 - ../../../devices/pci0000:00/0000:00:0a.0/0000:00:0b.0 - ../../../devices/pci0000:00/0000:00:0b.0/最后得到的 xml 融合信息如下。实验环境中的网卡 mlx5_0 和 mlx5_1 被进行了合并。上述逻辑在 rank0 中对应如下日志// PCIe 链路推导3068.938120 ncclTopoSetAttrFromSys:472 NCCL TRACE Read from sys /sys/devices/pci0000:00/0000:00:08.0/class - class0x0302003068.949021 ncclTopoSetAttrFromSys:472 NCCL TRACE Read from sys /sys/devices/pci0000:00/0000:00:08.0/vendor - vendor0x10de3068.955660 ncclTopoSetAttrFromSys:472 NCCL TRACE Read from sys /sys/devices/pci0000:00/0000:00:08.0/device - device0x26ba......// 网卡合并3069.492280 ncclTopoMakeVNics:1314 NCCL TRACE Found physical ncclNet node 0 mlx5_03069.494257 ncclTopoMakeVNics:1314 NCCL TRACE Found physical ncclNet node 1 mlx5_1ncclIbMakeVDeviceInternal:649 NCCL INFO NET/IB : Made virtual device [2] namemlx5_0mlx5_1 speed50000 ndevs2ncclTopoMakeVnic:1029 NCCL INFO TOPO/NET : Made vNic 23069.508220 ncclTopoGetVNicParent:1270 NCCL TRACE Re-found physical ncclNet node 0 
mlx5_03069.510005 ncclTopoGetVNicParent:1270 NCCL TRACE Re-found physical ncclNet node 1 mlx5_13069.512078 ncclTopoGetVNicParent:1295 NCCL TRACE Selected parent pci with path 1ncclTopoPopulateNics:1357 NCCL INFO NET/IB : GPU Direct RDMA Disabled for HCA 2 mlx5_0mlx5_13069.519502 ncclTopoTrimXmlRec:982 NCCL TRACE Removing node net mlx5_0 (null)3069.521424 ncclTopoTrimXmlRec:982 NCCL TRACE Removing node net mlx5_1 (null)// 同主机内部 XML 信息融合3072.011796 bootstrapIntraNodeAllGather:1128 NCCL TRACE rank 0 nranks 2 size 2562056 - ENTER3072.029868 ncclSocketConnect:718 NCCL TRACE Connecting to socket 10.10.10.11462253073.529592 socketRingAllGather:1032 NCCL TRACE socketRingAllGather started: rank0 nranks23073.533829 socketRingAllGather:1034 NCCL TRACE bidirectional bootstrap: totalSteps13074.657969 socketRingAllGather:1064 NCCL TRACE socketRingAllGather first message in 0.001121 (2285.429856 MB/sec), rest in 0.000000 (50236392.156863 MB/sec)3074.684796 bootstrapIntraNodeAllGather:1142 NCCL TRACE rank 0 nranks 2 size 2562056 - DONENCCL对PCIe节点建图通过 PCIe 链路推导得到了 GPU 到 CPU NIC 到 CPU以及 GPU 到 NIC 之间的基于 xml 的拓扑信息但是还缺乏它们之间的链路连接关系。因此需要通过建图将 PCIe 设备通过 PCIe 路径连接起来。就比如有多个孤立岛现在按照路径规划实际的 PCIe 路径在它们之间建立桥梁实现互通从而为 NCCL 后续最优通信路径的搜索打基础。NCCL 使用 ncclTopoGetSystemFromXml 在 PCIe 设备之间建图。ncclCommInitRankncclCommInitRankDevncclCommInitRankFuncinitTransportsRankncclTopoGetSystemncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, const uint64_t localHostHash)NCCL 定义了六种设备类型分别是NCCL_TOPO_NODE_TYPES [GPUPCINVSCPUNICNET]每种设备类型支持最大 NCCL_TOPO_MAX_NODES 576 个节点每个节点支持最多 NCCL_TOPO_MAX_LINKS 576 个 PCIe 连接。其中struct ncclTopoNode 里面的 struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]用于记录当前 PCIe 设备节点的路径信息也就是记录从它可达其它 PCIe 设备的路径信息。struct ncclTopoNode {......// Type specific dataunion {struct {int dev; // NVML dev numberint rank;......}gpu;struct {int dev; // Plugin dev numberuint64_t pciId;......}net;struct {int arch;int vendor;......}cpu;struct {uint64_t device;}pci;};int nlinks;struct 
ncclTopoLink links[NCCL_TOPO_MAX_LINKS];// Pre-computed paths to GPUs and NICsstruct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES];......};struct ncclTopoNodeSet {int count;struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];};struct ncclTopoSystem {int systemId;uint64_t hostHashes[NCCL_TOPO_MAX_NODES];int nHosts;struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];float maxBw;float totalBw;int inter;};struct ncclTopoLink {int type;float bw;struct ncclTopoNode* remNode;};#define NCCL_TOPO_NODE_TYPES 6#define NCCL_TOPO_MAX_NODES 576#define NCCL_TOPO_MAX_LINKS 576#define GPU 0#define PCI 1#define NVS 2#define CPU 3 // Actually NUMA domains#define NIC 4#define NET 5假设有 XML 视图通过 ncclTopoAddCpu从 XML 拓扑的 cpu 开始建图。遍历 XML 的 cpu 节点如果是多 NUMA 环境XML 里将是多个 CPU - PCI 的视图。ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, const uint64_t localHostHash) {......for (int s0; s nSubs; s) {struct ncclXmlNode* node topNode-subs[s];if (strcmp(node-name, cpu) 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));}......}从 XML 里遍历当前 cpu 下的 pci 节点。通过 ncclTopoAddPci 去建立 CPU 和 PCIe Switch以及 PCIe Switch 与 GPU 或者 NIC 的连接。ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {......for (int s0; s nSubs; s) {struct ncclXmlNode* node xmlCpu-subs[s];if (strcmp(node-name, pci) 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId, numaId));if (strcmp(node-name, nic) 0) {......NCCLCHECK(ncclTopoAddNic(node, system, nic, systemId));}}......}在 ncclTopoAddPci 里连接建立的算法思想可以总结如下如果当前节点就是一个 PCIe 设备那么通过 ncclTopoCreateNode 将当前节点添加到 struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]并调用两次 ncclTopoConnectNodes 分别去建立当前 PCIe 设备 与 parent CPU 的连接以及建立 parent CPU 与 当前 PCIe 设备的连接如果当前节点是一个 PCIe Switch且 Switch 下面还有其它 PCIe 设备那么继续遍历 Switch 下面的 PCIe 设备并递归调用 ncclTopoAddPci。当前层次的 PCIe Switch 与 上层 parent CPU 的正反连接在最后递归回溯时再执行通过 ncclTopoAddPci 递归直至达到最底层 PCIe 设备比如 GPU / NIC那么将当前节点添加到 struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]并调用两次 ncclTopoConnectNodes 
分别去建立当前 GPU / NIC 与 parent PCIe Switch 的连接以及建立 parent PCIe Switch 与 当前 GPU / NIC 设备的连接。注意这里的连接类型是LINK_PCI表示通过 PCIe 链路连接。ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId, int numaId) {......} else if (type PCI) {NCCLCHECK(ncclTopoCreateNode(system, node, type, NCCL_TOPO_ID(systemId, busId)));NCCLCHECK(xmlGetAttr(xmlPci, vendor, str));if (str) node-pci.device strtol(str, NULL, 0) 48;NCCLCHECK(xmlGetAttr(xmlPci, device, str));if (str) node-pci.device strtol(str, NULL, 0) 32;......for (int s0; s nSubs; s) {struct ncclXmlNode* xmlSubPci xmlPci-subs[s];if (strcmp(xmlSubPci-name, pcilink) ! 0) { // PCI links will be added laterNCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId, numaId));}}}......if (node) {......NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));}return ncclSuccess;}向 node-links 数组里添加 remNode 节点实现两个节点的连接。调用两次就表示相互互联。ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw) {......struct ncclTopoLink* link;for (link node-links; link - node-links ! NCCL_TOPO_MAX_LINKS link-remNode; link) {if (link-remNode remNode link-type type) break;}if (link-remNode NULL) node-nlinks;link-type type;link-remNode remNode;link-bw bw;// Sort links in BW descending orderstruct ncclTopoLink linkSave;memcpy(linkSave, link, sizeof(struct ncclTopoLink));while (link ! 
node->links) {if ((link-1)->bw >= linkSave.bw) break;memcpy(link, link-1, sizeof(struct ncclTopoLink));link--;}memcpy(link, &linkSave, sizeof(struct ncclTopoLink));......}回到 ncclTopoGetSystemFromXml 最外层：通过 ncclTopoAddCpu，NCCL 实现了对当前实体节点上众多 GPU、PCI、NVS、CPU、NIC、NET 设备的建图。任意 PCIe 设备都记录了它自身周边通过 PCIe 链路的可达信息；每个 PCIe 设备都只知道自己周边有哪些下一跳节点，但是只有一层可达信息；每个 PCIe 设备的连接都是双向的，即 parent <-> child 模式，连接类型为 LINK_PCI；每条连接都被赋值了计算出来的带宽值；每个 PCIe 设备增加一个连接，就要按照连接带宽进行一次排序，将带宽最大的连接放在 links 数组的最前面。通过 ncclTopoAddNvLinks，构建 GPU 和 GPU 之间的 GPU0 ---- NVLink ---- GPU1 连接，或者基于 NVSwitch 的 GPU0 ----NVLink---- NVSwitch ----NVLink---- GPU1 连接。连接类型为 LINK_NVL。通过 ncclTopoAddPciLinks，构建 PCIe 设备之间的 pcilink 连接（如果存在），连接类型为 LINK_LOC。pcilink 不是 PCIe 链路，是一种本地直连技术。最终通过 ncclTopoGetSystemFromXml。通过 ncclTopoConnectCpus，构建 CPU 之间的两两连接，连接类型为 LINK_SYS。最后通过 ncclTopoSortSystem，将以上所有 PCIe 设备的连接数组 links 按照 NVLink -> PCI-down -> PCI-up -> SYS 的顺序排列。目的是在后续使用 DFS 进行路径搜索时避免造成路径回退，确保搜索路径是在往目的 PCIe 节点前进。ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, const uint64_t localHostHash) {......for (int s = 0; s < topNode->nSubs; s++) {struct ncclXmlNode* node = topNode->subs[s];if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));}......NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0));NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0));NCCLCHECK(ncclTopoAddPciLinks(topNode, *topoSystem, NULL, 0));NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));NCCLCHECK(ncclTopoConnectCpus(*topoSystem));NCCLCHECK(ncclTopoSortSystem(*topoSystem));}NCCL 定义的 PCIe 设备间的连接类型：#define LINK_LOC 0#define LINK_NVL 1// Skipping 2 for PATH_NVB#define LINK_C2C 3#define LINK_PCI 4// Skipping 5 for PATH_PXB// Skipping 6 for PATH_PXN// Skipping 7 for PATH_P2C// Skipping 8 for PATH_PHB#define LINK_SYS 9#define LINK_NET 10