382 lines
7.3 KiB
Go
382 lines
7.3 KiB
Go
// Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
|
|
|
|
package nvml
|
|
|
|
// #include "nvml_dl.h"
|
|
import "C"
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// Sentinel errors exposed by this package.
var (
	// ErrCPUAffinity prefixes the error produced when the NUMA node of a
	// device cannot be parsed from sysfs.
	ErrCPUAffinity = errors.New("failed to retrieve CPU affinity")

	// ErrUnsupportedP2PLink is returned by GetP2PLink when the topology
	// level between two devices is not one of the recognized values.
	ErrUnsupportedP2PLink = errors.New("unsupported P2P link type")

	// ErrUnsupportedGPU is returned by the device constructors when the
	// minor number, bus ID or UUID of a device is not available.
	ErrUnsupportedGPU = errors.New("unsupported GPU device")
)
|
|
|
|
// P2PLinkType enumerates the kinds of path that can connect two devices.
type P2PLinkType uint

// Link types, numbered sequentially from P2PLinkUnknown (the zero value).
const (
	P2PLinkUnknown P2PLinkType = iota
	P2PLinkCrossCPU
	P2PLinkSameCPU
	P2PLinkHostBridge
	P2PLinkMultiSwitch
	P2PLinkSingleSwitch
	P2PLinkSameBoard
)

// P2PLink associates a bus identifier with a link type.
// NOTE(review): presumably BusID is the PCI bus ID of the peer device;
// nothing in this part of the file populates it, so confirm against callers.
type P2PLink struct {
	BusID string
	Link  P2PLinkType
}

// String returns the human-readable label of the link type. P2PLinkUnknown
// and any value outside the declared constants yield "N/A".
func (t P2PLinkType) String() string {
	labels := [...]string{
		P2PLinkCrossCPU:     "Cross CPU socket",
		P2PLinkSameCPU:      "Same CPU socket",
		P2PLinkHostBridge:   "Host PCI bridge",
		P2PLinkMultiSwitch:  "Multiple PCI switches",
		P2PLinkSingleSwitch: "Single PCI switch",
		P2PLinkSameBoard:    "Same board",
	}
	if t == P2PLinkUnknown || t >= P2PLinkType(len(labels)) {
		return "N/A"
	}
	return labels[t]
}
|
|
|
|
// ClockInfo holds core and memory clock values. The code that fills it in
// annotates both as MHz. Fields are pointers and may be nil.
type ClockInfo struct {
	Cores, Memory *uint // MHz
}
|
|
|
|
// PCIInfo describes the PCI attributes of a device as filled in by the
// device constructors. Pointer fields may be nil.
type PCIInfo struct {
	// BusID is the PCI bus identifier of the device.
	BusID string
	// BAR1 is the BAR1 memory figure, converted to MiB by NewDevice.
	BAR1 *uint64
	// Bandwidth is the value computed by pciBandwidth, in MB/s.
	Bandwidth *uint
}
|
|
|
|
type Device struct {
|
|
handle
|
|
|
|
UUID string
|
|
Path string
|
|
Model *string
|
|
Power *uint
|
|
CPUAffinity *uint
|
|
PCI PCIInfo
|
|
Clocks ClockInfo
|
|
Topology []P2PLink
|
|
}
|
|
|
|
// UtilizationInfo holds utilization figures for the GPU, its memory and the
// encoder/decoder engines. Status annotates all four as percentages. Fields
// are pointers and may be nil.
type UtilizationInfo struct {
	GPU, Memory, Encoder, Decoder *uint // %
}
|
|
|
|
// PCIThroughputInfo holds PCIe receive and transmit throughput. Status
// divides the raw readings by 1000 and annotates the result as MB/s. Fields
// are pointers and may be nil.
type PCIThroughputInfo struct {
	RX, TX *uint // MB/s
}
|
|
|
|
type PCIStatusInfo struct {
|
|
BAR1Used *uint64
|
|
Throughput PCIThroughputInfo
|
|
}
|
|
|
|
// ECCErrorsInfo holds the three memory error counters that Status obtains
// from deviceGetMemoryErrorCounter, one per memory location. Fields are
// pointers and may be nil.
type ECCErrorsInfo struct {
	L1Cache, L2Cache, Global *uint64
}
|
|
|
|
type MemoryInfo struct {
|
|
GlobalUsed *uint64
|
|
ECCErrors ECCErrorsInfo
|
|
}
|
|
|
|
// ProcessInfo describes one compute process reported for a device.
type ProcessInfo struct {
	// PID is the process identifier.
	PID uint
	// Name is the process name looked up from the PID.
	Name string
	// MemoryUsed is the memory used by the process, in MiB.
	MemoryUsed uint64
}
|
|
|
|
type DeviceStatus struct {
|
|
Power *uint
|
|
Temperature *uint
|
|
Utilization UtilizationInfo
|
|
Memory MemoryInfo
|
|
Clocks ClockInfo
|
|
PCI PCIStatusInfo
|
|
Processes []ProcessInfo
|
|
}
|
|
|
|
// assert panics with err when err is non-nil and is a no-op otherwise. The
// exported entry points pair it with a deferred recover to turn the panic
// back into a returned error.
func assert(err error) {
	if err == nil {
		return
	}
	panic(err)
}
|
|
|
|
func Init() error {
|
|
return init_()
|
|
}
|
|
|
|
func Shutdown() error {
|
|
return shutdown()
|
|
}
|
|
|
|
func GetDeviceCount() (uint, error) {
|
|
return deviceGetCount()
|
|
}
|
|
|
|
func GetDriverVersion() (string, error) {
|
|
return systemGetDriverVersion()
|
|
}
|
|
|
|
func numaNode(busid string) (uint, error) {
|
|
b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid)))
|
|
if err != nil {
|
|
// XXX report node 0 if NUMA support isn't enabled
|
|
return 0, nil
|
|
}
|
|
node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err)
|
|
}
|
|
if node < 0 {
|
|
node = 0 // XXX report node 0 instead of NUMA_NO_NODE
|
|
}
|
|
return uint(node), nil
|
|
}
|
|
|
|
// pciBandwidth computes the PCIe link bandwidth in MB/s from the link
// generation and the link width (number of lanes).
//
// It returns nil when either input is nil or when the generation is not one
// of the known ones. Previously an unknown generation produced a pointer to
// 0, which read as "0 MB/s" rather than "unknown".
func pciBandwidth(gen, width *uint) *uint {
	if gen == nil || width == nil {
		return nil
	}

	// Usable throughput of a single lane, in MB/s, per PCIe generation.
	var perLane uint
	switch *gen {
	case 1:
		perLane = 250
	case 2:
		perLane = 500
	case 3:
		perLane = 985
	case 4:
		perLane = 1969
	case 5:
		perLane = 3938
	default:
		return nil
	}

	bw := perLane * *width
	return &bw
}
|
|
|
|
func NewDevice(idx uint) (device *Device, err error) {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
err = r.(error)
|
|
}
|
|
}()
|
|
|
|
h, err := deviceGetHandleByIndex(idx)
|
|
assert(err)
|
|
model, err := h.deviceGetName()
|
|
assert(err)
|
|
uuid, err := h.deviceGetUUID()
|
|
assert(err)
|
|
minor, err := h.deviceGetMinorNumber()
|
|
assert(err)
|
|
power, err := h.deviceGetPowerManagementLimit()
|
|
assert(err)
|
|
busid, err := h.deviceGetPciInfo()
|
|
assert(err)
|
|
bar1, _, err := h.deviceGetBAR1MemoryInfo()
|
|
assert(err)
|
|
pcig, err := h.deviceGetMaxPcieLinkGeneration()
|
|
assert(err)
|
|
pciw, err := h.deviceGetMaxPcieLinkWidth()
|
|
assert(err)
|
|
ccore, cmem, err := h.deviceGetMaxClockInfo()
|
|
assert(err)
|
|
|
|
if minor == nil || busid == nil || uuid == nil {
|
|
return nil, ErrUnsupportedGPU
|
|
}
|
|
path := fmt.Sprintf("/dev/nvidia%d", *minor)
|
|
node, err := numaNode(*busid)
|
|
assert(err)
|
|
|
|
device = &Device{
|
|
handle: h,
|
|
UUID: *uuid,
|
|
Path: path,
|
|
Model: model,
|
|
Power: power,
|
|
CPUAffinity: &node,
|
|
PCI: PCIInfo{
|
|
BusID: *busid,
|
|
BAR1: bar1,
|
|
Bandwidth: pciBandwidth(pcig, pciw), // MB/s
|
|
},
|
|
Clocks: ClockInfo{
|
|
Cores: ccore, // MHz
|
|
Memory: cmem, // MHz
|
|
},
|
|
}
|
|
if power != nil {
|
|
*device.Power /= 1000 // W
|
|
}
|
|
if bar1 != nil {
|
|
*device.PCI.BAR1 /= 1024 * 1024 // MiB
|
|
}
|
|
return
|
|
}
|
|
|
|
func NewDeviceLite(idx uint) (device *Device, err error) {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
err = r.(error)
|
|
}
|
|
}()
|
|
|
|
h, err := deviceGetHandleByIndex(idx)
|
|
assert(err)
|
|
uuid, err := h.deviceGetUUID()
|
|
assert(err)
|
|
minor, err := h.deviceGetMinorNumber()
|
|
assert(err)
|
|
busid, err := h.deviceGetPciInfo()
|
|
assert(err)
|
|
|
|
if minor == nil || busid == nil || uuid == nil {
|
|
return nil, ErrUnsupportedGPU
|
|
}
|
|
path := fmt.Sprintf("/dev/nvidia%d", *minor)
|
|
|
|
device = &Device{
|
|
handle: h,
|
|
UUID: *uuid,
|
|
Path: path,
|
|
PCI: PCIInfo{
|
|
BusID: *busid,
|
|
},
|
|
}
|
|
return
|
|
}
|
|
|
|
func (d *Device) Status() (status *DeviceStatus, err error) {
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
err = r.(error)
|
|
}
|
|
}()
|
|
|
|
power, err := d.deviceGetPowerUsage()
|
|
assert(err)
|
|
temp, err := d.deviceGetTemperature()
|
|
assert(err)
|
|
ugpu, umem, err := d.deviceGetUtilizationRates()
|
|
assert(err)
|
|
uenc, err := d.deviceGetEncoderUtilization()
|
|
assert(err)
|
|
udec, err := d.deviceGetDecoderUtilization()
|
|
assert(err)
|
|
mem, err := d.deviceGetMemoryInfo()
|
|
assert(err)
|
|
ccore, cmem, err := d.deviceGetClockInfo()
|
|
assert(err)
|
|
_, bar1, err := d.deviceGetBAR1MemoryInfo()
|
|
assert(err)
|
|
pids, pmems, err := d.deviceGetComputeRunningProcesses()
|
|
assert(err)
|
|
el1, el2, emem, err := d.deviceGetMemoryErrorCounter()
|
|
assert(err)
|
|
pcirx, pcitx, err := d.deviceGetPcieThroughput()
|
|
assert(err)
|
|
|
|
status = &DeviceStatus{
|
|
Power: power,
|
|
Temperature: temp, // °C
|
|
Utilization: UtilizationInfo{
|
|
GPU: ugpu, // %
|
|
Memory: umem, // %
|
|
Encoder: uenc, // %
|
|
Decoder: udec, // %
|
|
},
|
|
Memory: MemoryInfo{
|
|
GlobalUsed: mem,
|
|
ECCErrors: ECCErrorsInfo{
|
|
L1Cache: el1,
|
|
L2Cache: el2,
|
|
Global: emem,
|
|
},
|
|
},
|
|
Clocks: ClockInfo{
|
|
Cores: ccore, // MHz
|
|
Memory: cmem, // MHz
|
|
},
|
|
PCI: PCIStatusInfo{
|
|
BAR1Used: bar1,
|
|
Throughput: PCIThroughputInfo{
|
|
RX: pcirx,
|
|
TX: pcitx,
|
|
},
|
|
},
|
|
}
|
|
if power != nil {
|
|
*status.Power /= 1000 // W
|
|
}
|
|
if mem != nil {
|
|
*status.Memory.GlobalUsed /= 1024 * 1024 // MiB
|
|
}
|
|
if bar1 != nil {
|
|
*status.PCI.BAR1Used /= 1024 * 1024 // MiB
|
|
}
|
|
if pcirx != nil {
|
|
*status.PCI.Throughput.RX /= 1000 // MB/s
|
|
}
|
|
if pcitx != nil {
|
|
*status.PCI.Throughput.TX /= 1000 // MB/s
|
|
}
|
|
for i := range pids {
|
|
name, err := systemGetProcessName(pids[i])
|
|
assert(err)
|
|
status.Processes = append(status.Processes, ProcessInfo{
|
|
PID: pids[i],
|
|
Name: name,
|
|
MemoryUsed: pmems[i] / (1024 * 1024), // MiB
|
|
})
|
|
}
|
|
return
|
|
}
|
|
|
|
func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) {
|
|
level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle)
|
|
if err != nil || level == nil {
|
|
return P2PLinkUnknown, err
|
|
}
|
|
|
|
switch *level {
|
|
case C.NVML_TOPOLOGY_INTERNAL:
|
|
link = P2PLinkSameBoard
|
|
case C.NVML_TOPOLOGY_SINGLE:
|
|
link = P2PLinkSingleSwitch
|
|
case C.NVML_TOPOLOGY_MULTIPLE:
|
|
link = P2PLinkMultiSwitch
|
|
case C.NVML_TOPOLOGY_HOSTBRIDGE:
|
|
link = P2PLinkHostBridge
|
|
case C.NVML_TOPOLOGY_CPU:
|
|
link = P2PLinkSameCPU
|
|
case C.NVML_TOPOLOGY_SYSTEM:
|
|
link = P2PLinkCrossCPU
|
|
default:
|
|
err = ErrUnsupportedP2PLink
|
|
}
|
|
return
|
|
}
|