// Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. package nvml // #include "nvml_dl.h" import "C" import ( "bytes" "errors" "fmt" "io/ioutil" "strconv" "strings" ) var ( ErrCPUAffinity = errors.New("failed to retrieve CPU affinity") ErrUnsupportedP2PLink = errors.New("unsupported P2P link type") ErrUnsupportedGPU = errors.New("unsupported GPU device") ) type P2PLinkType uint const ( P2PLinkUnknown P2PLinkType = iota P2PLinkCrossCPU P2PLinkSameCPU P2PLinkHostBridge P2PLinkMultiSwitch P2PLinkSingleSwitch P2PLinkSameBoard ) type P2PLink struct { BusID string Link P2PLinkType } func (t P2PLinkType) String() string { switch t { case P2PLinkCrossCPU: return "Cross CPU socket" case P2PLinkSameCPU: return "Same CPU socket" case P2PLinkHostBridge: return "Host PCI bridge" case P2PLinkMultiSwitch: return "Multiple PCI switches" case P2PLinkSingleSwitch: return "Single PCI switch" case P2PLinkSameBoard: return "Same board" case P2PLinkUnknown: } return "N/A" } type ClockInfo struct { Cores *uint Memory *uint } type PCIInfo struct { BusID string BAR1 *uint64 Bandwidth *uint } type Device struct { handle UUID string Path string Model *string Power *uint CPUAffinity *uint PCI PCIInfo Clocks ClockInfo Topology []P2PLink } type UtilizationInfo struct { GPU *uint Memory *uint Encoder *uint Decoder *uint } type PCIThroughputInfo struct { RX *uint TX *uint } type PCIStatusInfo struct { BAR1Used *uint64 Throughput PCIThroughputInfo } type ECCErrorsInfo struct { L1Cache *uint64 L2Cache *uint64 Global *uint64 } type MemoryInfo struct { GlobalUsed *uint64 ECCErrors ECCErrorsInfo } type ProcessInfo struct { PID uint Name string MemoryUsed uint64 } type DeviceStatus struct { Power *uint Temperature *uint Utilization UtilizationInfo Memory MemoryInfo Clocks ClockInfo PCI PCIStatusInfo Processes []ProcessInfo } func assert(err error) { if err != nil { panic(err) } } func Init() error { return init_() } func Shutdown() error { return shutdown() } func GetDeviceCount() (uint, error) { return deviceGetCount() } func GetDriverVersion() (string, error) { return systemGetDriverVersion() } func numaNode(busid string) (uint, error) { b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid))) if err != nil { // XXX report node 0 if NUMA support isn't enabled return 0, nil } node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8) if err != nil { return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err) } if node < 0 { node = 0 // XXX report node 0 instead of NUMA_NO_NODE } return uint(node), nil } func pciBandwidth(gen, width *uint) *uint { m := map[uint]uint{ 1: 250, // MB/s 2: 500, 3: 985, 4: 1969, } if gen == nil || width == nil { return nil } bw := m[*gen] * *width return &bw } func NewDevice(idx uint) (device *Device, err error) { defer func() { if r := recover(); r != nil { err = r.(error) } }() h, err := deviceGetHandleByIndex(idx) assert(err) model, err := h.deviceGetName() assert(err) uuid, err := h.deviceGetUUID() assert(err) minor, err := h.deviceGetMinorNumber() assert(err) power, err := h.deviceGetPowerManagementLimit() assert(err) busid, err := h.deviceGetPciInfo() assert(err) bar1, _, err := h.deviceGetBAR1MemoryInfo() assert(err) pcig, err := h.deviceGetMaxPcieLinkGeneration() assert(err) pciw, err := h.deviceGetMaxPcieLinkWidth() assert(err) ccore, cmem, err := h.deviceGetMaxClockInfo() assert(err) if minor == nil || busid == nil || uuid == nil { return nil, ErrUnsupportedGPU } path := fmt.Sprintf("/dev/nvidia%d", *minor) node, err := numaNode(*busid) assert(err) device = &Device{ handle: h, UUID: *uuid, Path: path, Model: model, Power: power, CPUAffinity: &node, PCI: PCIInfo{ BusID: *busid, BAR1: bar1, Bandwidth: pciBandwidth(pcig, pciw), // MB/s }, Clocks: ClockInfo{ Cores: ccore, // MHz Memory: cmem, // MHz }, } if power != nil { *device.Power /= 1000 // W } if bar1 != nil { *device.PCI.BAR1 /= 1024 * 1024 // MiB } return } func NewDeviceLite(idx uint) (device *Device, err error) { defer func() { if r := recover(); r != nil { err = r.(error) } }() h, err := deviceGetHandleByIndex(idx) assert(err) uuid, err := h.deviceGetUUID() assert(err) minor, err := h.deviceGetMinorNumber() assert(err) busid, err := h.deviceGetPciInfo() assert(err) if minor == nil || busid == nil || uuid == nil { return nil, ErrUnsupportedGPU } path := fmt.Sprintf("/dev/nvidia%d", *minor) device = &Device{ handle: h, UUID: *uuid, Path: path, PCI: PCIInfo{ BusID: *busid, }, } return } func (d *Device) Status() (status *DeviceStatus, err error) { defer func() { if r := recover(); r != nil { err = r.(error) } }() power, err := d.deviceGetPowerUsage() assert(err) temp, err := d.deviceGetTemperature() assert(err) ugpu, umem, err := d.deviceGetUtilizationRates() assert(err) uenc, err := d.deviceGetEncoderUtilization() assert(err) udec, err := d.deviceGetDecoderUtilization() assert(err) mem, err := d.deviceGetMemoryInfo() assert(err) ccore, cmem, err := d.deviceGetClockInfo() assert(err) _, bar1, err := d.deviceGetBAR1MemoryInfo() assert(err) pids, pmems, err := d.deviceGetComputeRunningProcesses() assert(err) el1, el2, emem, err := d.deviceGetMemoryErrorCounter() assert(err) pcirx, pcitx, err := d.deviceGetPcieThroughput() assert(err) status = &DeviceStatus{ Power: power, Temperature: temp, // °C Utilization: UtilizationInfo{ GPU: ugpu, // % Memory: umem, // % Encoder: uenc, // % Decoder: udec, // % }, Memory: MemoryInfo{ GlobalUsed: mem, ECCErrors: ECCErrorsInfo{ L1Cache: el1, L2Cache: el2, Global: emem, }, }, Clocks: ClockInfo{ Cores: ccore, // MHz Memory: cmem, // MHz }, PCI: PCIStatusInfo{ BAR1Used: bar1, Throughput: PCIThroughputInfo{ RX: pcirx, TX: pcitx, }, }, } if power != nil { *status.Power /= 1000 // W } if mem != nil { *status.Memory.GlobalUsed /= 1024 * 1024 // MiB } if bar1 != nil { *status.PCI.BAR1Used /= 1024 * 1024 // MiB } if pcirx != nil { *status.PCI.Throughput.RX /= 1000 // MB/s } if pcitx != nil { *status.PCI.Throughput.TX /= 1000 // MB/s } for i := range pids { name, err := systemGetProcessName(pids[i]) assert(err) status.Processes = append(status.Processes, ProcessInfo{ PID: pids[i], Name: name, MemoryUsed: pmems[i] / (1024 * 1024), // MiB }) } return } func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) { level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle) if err != nil || level == nil { return P2PLinkUnknown, err } switch *level { case C.NVML_TOPOLOGY_INTERNAL: link = P2PLinkSameBoard case C.NVML_TOPOLOGY_SINGLE: link = P2PLinkSingleSwitch case C.NVML_TOPOLOGY_MULTIPLE: link = P2PLinkMultiSwitch case C.NVML_TOPOLOGY_HOSTBRIDGE: link = P2PLinkHostBridge case C.NVML_TOPOLOGY_CPU: link = P2PLinkSameCPU case C.NVML_TOPOLOGY_SYSTEM: link = P2PLinkCrossCPU default: err = ErrUnsupportedP2PLink } return }