Source code for h2o4gpu.util.gpu

# -*- encoding: utf-8 -*-
"""
:copyright: 2017-2018 H2O.ai, Inc.
:license:   Apache License Version 2.0 (see LICENSE for details)
"""
import os
import numpy as np


#############################
# Device utils



def device_count(n_gpus=0):
    """Tries to return the number of available GPUs on this machine.

    :param n_gpus: int, optional, default : 0
        If < 0 then return all available GPUs
        If >= 0 then return n_gpus or as many as possible
    :return:
        Adjusted n_gpus and the number of available devices
    """
    available_device_count = get_gpu_info_c()[0]

    if n_gpus < 0:
        if available_device_count >= 0:
            n_gpus = available_device_count
        else:
            print("Cannot set n_gpus to all GPUs %d %d, trying n_gpus=1" %
                  (n_gpus, available_device_count))
            n_gpus = 1

    if n_gpus > available_device_count:
        n_gpus = available_device_count

    return n_gpus, available_device_count
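

# Illustrative usage sketch, not part of the original module: how a caller
# might use device_count() to clamp a requested GPU count. The helper name
# below is hypothetical.
def _example_device_count_usage():
    # n_gpus=-1 asks for all GPUs; the result is clamped to what exists.
    n_gpus, available = device_count(n_gpus=-1)
    print("Using %d of %d available GPUs" % (n_gpus, available))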


def get_gpu_info(return_usage=False, trials=2, timeout=30, print_trials=False):
    """Gets the GPU info.

    Runs in a sub-process to avoid mixing parent-child CUDA contexts
    (https://stackoverflow.com/questions/22950047/cuda-initialization-error-after-fork).
    Tries "trials" times to get a result; if every trial fails to return
    within "timeout" seconds, returns as if no GPU is present.

    :return: Total number of GPUs and total available memory
    """
    total_gpus = 0
    total_mem = 0
    gpu_type = 0
    usage = []

    import concurrent.futures
    from concurrent.futures import ProcessPoolExecutor
    res = None

    # sometimes hit broken process pool in cpu mode,
    # so just return back no gpus.
    for trial in range(0, trials):
        try:
            with ProcessPoolExecutor(max_workers=1) as executor:
                future = executor.submit(get_gpu_info_subprocess, return_usage)
                # don't wait more than `timeout` seconds;
                # the py3nvml import can hang if 2 subprocesses
                # GIL-lock the import at the same time
                res = future.result(timeout=timeout)
            return res
        except concurrent.futures.process.BrokenProcessPool:
            pass
        except concurrent.futures.TimeoutError:
            pass
        if print_trials:
            print("Trial %d/%d" % (trial, trials - 1))

    if return_usage:
        return (total_gpus, total_mem, gpu_type, usage)
    return (total_gpus, total_mem, gpu_type)
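

# Illustrative usage sketch, not part of the original module: querying GPU
# count, memory, type, and per-device utilization via the subprocess-based
# get_gpu_info(). The helper name below is hypothetical.
def _example_get_gpu_info_usage():
    total_gpus, total_mem, gpu_type, usage = get_gpu_info(return_usage=True)
    if total_gpus == 0:
        print("No GPUs detected (or the query timed out)")
    else:
        print("%d x %s, %d bytes memory, utilization per GPU: %s" %
              (total_gpus, gpu_type, total_mem, usage))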


def cuda_vis_check(total_gpus):
    """Helper function to count GPUs by environment variable."""
    cudavis = os.getenv("CUDA_VISIBLE_DEVICES")
    which_gpus = []
    if cudavis is not None:
        # prune away white-space and non-numerics,
        # except commas, for simple checking
        cudavis = "".join(cudavis.split())
        import re
        cudavis = re.sub("[^0-9,]", "", cudavis)

        if len(cudavis) == 0:
            total_gpus = 0
        else:
            # use the sanitized string so stray characters in the
            # environment variable cannot break the int() conversion
            total_gpus = min(total_gpus, cudavis.count(",") + 1)
            which_gpus = [int(x) for x in cudavis.split(",")]
    else:
        which_gpus = list(range(0, total_gpus))

    return total_gpus, which_gpus
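

# Illustrative usage sketch, not part of the original module: how
# CUDA_VISIBLE_DEVICES restricts the count reported by cuda_vis_check().
# The helper name below is hypothetical and mutates the process environment
# purely for demonstration.
def _example_cuda_vis_check_usage():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"
    # With 4 physical GPUs but only two visible, this yields (2, [0, 2]).
    total_gpus, which_gpus = cuda_vis_check(4)
    print(total_gpus, which_gpus)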


def get_gpu_info_subprocess(return_usage=False):
    """Gets the GPU info in a subprocess.

    :return: Total number of GPUs and total available memory
      (and optionally GPU usage)
    """
    total_gpus = 0
    total_mem = 0
    gpu_type = 0
    usage = []
    try:
        import py3nvml.py3nvml
        py3nvml.py3nvml.nvmlInit()
        total_gpus_actual = py3nvml.py3nvml.nvmlDeviceGetCount()

        # the below restricts but doesn't select
        total_gpus, which_gpus = cuda_vis_check(total_gpus_actual)

        total_mem = \
            min([py3nvml.py3nvml.nvmlDeviceGetMemoryInfo(
                py3nvml.py3nvml.nvmlDeviceGetHandleByIndex(i)).total
                 for i in range(total_gpus)])

        gpu_type = py3nvml.py3nvml.nvmlDeviceGetName(
            py3nvml.py3nvml.nvmlDeviceGetHandleByIndex(0))

        if return_usage:
            for j in range(total_gpus_actual):
                if j in which_gpus:
                    handle = py3nvml.py3nvml.nvmlDeviceGetHandleByIndex(j)
                    util = py3nvml.py3nvml.nvmlDeviceGetUtilizationRates(
                        handle)
                    usage.append(util.gpu)
    # pylint: disable=bare-except
    except:
        pass

    if return_usage:
        return (total_gpus, total_mem, gpu_type, usage)
    return (total_gpus, total_mem, gpu_type)


def get_gpu_info_c(return_memory=False,
                   return_name=False,
                   return_usage=False,
                   return_free_memory=False,
                   return_capability=False,
                   return_memory_by_pid=False,
                   return_usage_by_pid=False,
                   return_all=False,
                   verbose=0):
    """Gets the GPU info from a C call.

    :return: Total number of GPUs and total available memory
      (and optionally GPU usage)
    """
    # For backwards compatibility.
    # Don't change to `if verbose:`; that would also catch int values > 0.
    if verbose is True:
        verbose = 600
    if verbose is False:
        verbose = 0

    max_gpus = 16
    total_gpus = 0
    total_gpus_actual = 0
    which_gpus = []
    usages_tmp = np.zeros(max_gpus, dtype=np.int32)
    total_mems_tmp = np.zeros(max_gpus, dtype=np.uint64)
    free_mems_tmp = np.zeros(max_gpus, dtype=np.uint64)
    # This 100 must match the GPU name buffer length used by the C backend
    gpu_types_tmp = [' ' * 100 for _ in range(max_gpus)]
    majors_tmp = np.zeros(max_gpus, dtype=np.int32)
    minors_tmp = np.zeros(max_gpus, dtype=np.int32)
    max_pids = 2000
    num_pids_tmp = np.zeros(max_pids, dtype=np.uint32)
    pids_tmp = np.zeros(max_pids * max_gpus, dtype=np.uint32)
    usedGpuMemorys_tmp = np.zeros(max_pids * max_gpus, dtype=np.uint64)
    num_pids_usage_tmp = np.zeros(max_pids, dtype=np.uint32)
    pids_usage_tmp = np.zeros(max_pids * max_gpus, dtype=np.uint32)
    usedGpuUsage_tmp = np.zeros(max_pids * max_gpus, dtype=np.uint64)

    try:
        from ..libs.lib_utils import GPUlib
        lib = GPUlib().get(verbose=verbose)

        status, total_gpus_actual = \
            lib.get_gpu_info_c(verbose,
                               1 if return_memory else 0,
                               1 if return_name else 0,
                               1 if return_usage else 0,
                               1 if return_free_memory else 0,
                               1 if return_capability else 0,
                               1 if return_memory_by_pid else 0,
                               1 if return_usage_by_pid else 0,
                               1 if return_all else 0,
                               usages_tmp, total_mems_tmp, free_mems_tmp,
                               gpu_types_tmp, majors_tmp, minors_tmp,
                               num_pids_tmp, pids_tmp, usedGpuMemorys_tmp,
                               num_pids_usage_tmp, pids_usage_tmp,
                               usedGpuUsage_tmp)

        if status != 0:
            return None

        # This restricts the GPU count via CUDA_VISIBLE_DEVICES; the *_tmp
        # arrays still hold data for all physical GPUs.
        total_gpus, which_gpus = cuda_vis_check(total_gpus_actual)

        # Strip the trailing NULLs and whitespace coming from the C backend
        gpu_types_tmp = [g_type.strip().replace("\x00", "")
                         for g_type in gpu_types_tmp]
    # pylint: disable=broad-except
    except Exception as e:
        if verbose > 0:
            import sys
            sys.stderr.write("Exception: %s" % str(e))
            print(e)
            sys.stdout.flush()

    if return_capability or return_all:
        if list(minors_tmp)[0] == -1:
            for j in which_gpus:
                majors_tmp[j], minors_tmp[j], _ = \
                    get_compute_capability_orig(j)

    total_mems_actual = np.resize(total_mems_tmp, total_gpus_actual)
    free_mems_actual = np.resize(free_mems_tmp, total_gpus_actual)
    gpu_types_actual = np.resize(gpu_types_tmp, total_gpus_actual)
    usages_actual = np.resize(usages_tmp, total_gpus_actual)
    majors_actual = np.resize(majors_tmp, total_gpus_actual)
    minors_actual = np.resize(minors_tmp, total_gpus_actual)
    num_pids_actual = np.resize(num_pids_tmp, total_gpus_actual)
    pids_actual = np.resize(pids_tmp, total_gpus_actual * max_pids)
    usedGpuMemorys_actual = np.resize(usedGpuMemorys_tmp,
                                      total_gpus_actual * max_pids)
    num_pids_usage_actual = np.resize(num_pids_usage_tmp, total_gpus_actual)
    pids_usage_actual = np.resize(pids_usage_tmp,
                                  total_gpus_actual * max_pids)
    usedGpuUsage_actual = np.resize(usedGpuUsage_tmp,
                                    total_gpus_actual * max_pids)

    total_mems = np.resize(np.copy(total_mems_actual), total_gpus)
    free_mems = np.resize(np.copy(free_mems_actual), total_gpus)
    gpu_types = np.resize(np.copy(gpu_types_actual), total_gpus)
    usages = np.resize(np.copy(usages_actual), total_gpus)
    majors = np.resize(np.copy(majors_actual), total_gpus)
    minors = np.resize(np.copy(minors_actual), total_gpus)
    num_pids = np.resize(np.copy(num_pids_actual), total_gpus)
    pids = np.resize(np.copy(pids_actual), total_gpus * max_pids)
    usedGpuMemorys = np.resize(np.copy(usedGpuMemorys_actual),
                               total_gpus * max_pids)
    num_pids_usage = np.resize(np.copy(num_pids_usage_actual), total_gpus)
    pids_usage = np.resize(np.copy(pids_usage_actual), total_gpus * max_pids)
    usedGpuUsage = np.resize(np.copy(usedGpuUsage_actual),
                             total_gpus * max_pids)

    gpu_i = 0
    for j in range(total_gpus_actual):
        if j in which_gpus:
            total_mems[gpu_i] = total_mems_actual[j]
            free_mems[gpu_i] = free_mems_actual[j]
            gpu_types[gpu_i] = gpu_types_actual[j]
            usages[gpu_i] = usages_actual[j]
            minors[gpu_i] = minors_actual[j]
            majors[gpu_i] = majors_actual[j]
            num_pids[gpu_i] = num_pids_actual[j]
            pids[gpu_i] = pids_actual[j]
            usedGpuMemorys[gpu_i] = usedGpuMemorys_actual[j]
            num_pids_usage[gpu_i] = num_pids_usage_actual[j]
            pids_usage[gpu_i] = pids_usage_actual[j]
            usedGpuUsage[gpu_i] = usedGpuUsage_actual[j]
            gpu_i += 1

    pids = np.reshape(pids, (total_gpus, max_pids))
    usedGpuMemorys = np.reshape(usedGpuMemorys, (total_gpus, max_pids))
    pids_usage = np.reshape(pids_usage, (total_gpus, max_pids))
    usedGpuUsage = np.reshape(usedGpuUsage, (total_gpus, max_pids))

    to_return = [total_gpus]
    if return_all or return_memory:
        to_return.append(total_mems)
    if return_all or return_name:
        to_return.append(gpu_types)
    if return_all or return_usage:
        to_return.append(usages)
    if return_all or return_free_memory:
        to_return.append(free_mems)
    if return_all or return_capability:
        to_return.extend([majors, minors])
    if return_all or return_memory_by_pid:
        to_return.extend([num_pids, pids, usedGpuMemorys])
    if return_all or return_usage_by_pid:
        to_return.extend([num_pids_usage, pids_usage, usedGpuUsage])
    return tuple(to_return)
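

# Illustrative usage sketch, not part of the original module: the tuple
# returned by get_gpu_info_c() grows with the requested flags, in the order
# the flags are checked above. The helper name below is hypothetical.
def _example_get_gpu_info_c_usage():
    info = get_gpu_info_c(return_memory=True, return_name=True)
    if info is None:
        print("C backend reported an error")
    else:
        total_gpus, total_mems, gpu_types = info
        print(total_gpus, total_mems, gpu_types)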


def cudaresetdevice(gpu_id, n_gpus):
    """Resets the CUDA device so that the next CUDA call will reinitialize
    the CUDA context.

    :param gpu_id: int
        device number of GPU (to start with if n_gpus > 1)
    :param n_gpus: int, optional, default : 0
        If < 0 then apply to all available GPUs
        If >= 0 then apply to that number of GPUs
    """
    (n_gpus, devices) = device_count(n_gpus)
    gpu_id = gpu_id % devices

    from ..libs.lib_utils import get_lib
    lib = get_lib(n_gpus, devices)
    if lib is None:
        n_gpus = 0

    if n_gpus > 0 and lib is not None:
        lib.cudaresetdevice(gpu_id, n_gpus)
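

# Illustrative usage sketch, not part of the original module: a hypothetical
# call that resets the CUDA context on device 0 across all visible GPUs.
# cudaresetdevice(gpu_id=0, n_gpus=-1)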


def cudaresetdevice_bare(n_gpus):
    """Resets the CUDA device so that the next CUDA call will reinitialize
    the CUDA context.
    """
    if n_gpus > 0:
        from ..libs.lib_utils import GPUlib
        GPUlib().get().cudaresetdevice_bare()


def get_compute_capability(gpu_id):
    """Gets the compute capability of the given GPU.

    :param gpu_id: int
        device number of GPU
    """
    try:
        total_gpus, majors, minors = \
            get_gpu_info_c(return_capability=True)
    # pylint: disable=bare-except
    except:
        total_gpus = 0

    if total_gpus > 0:
        gpu_id = gpu_id % total_gpus
        device_major = majors.tolist()[gpu_id]
        device_minor = minors.tolist()[gpu_id]
        device_ratioperf = 1
    else:
        device_major = -1
        device_minor = -1
        device_ratioperf = 1
    return (device_major, device_minor, device_ratioperf)
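

# Illustrative usage sketch, not part of the original module: checking whether
# GPU 0 meets a minimum compute capability. The helper name and the 3.5
# threshold below are hypothetical.
def _example_compute_capability_check():
    major, minor, _ = get_compute_capability(0)
    if (major, minor) >= (3, 5):
        print("GPU 0 has compute capability %d.%d" % (major, minor))
    else:
        print("GPU 0 missing or below compute capability 3.5")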


def get_compute_capability_orig(gpu_id):
    """Gets the major CUDA version, minor CUDA version, and the ratio of
    floating point single precision to double precision performance.

    :param gpu_id: int
        device number of GPU
    """
    device_major = -1
    device_minor = -1
    device_ratioperf = 1

    import concurrent.futures
    from concurrent.futures import ProcessPoolExecutor
    res = None

    # sometimes hit broken process pool in cpu mode,
    # so return dummy values in that case
    try:
        with ProcessPoolExecutor(max_workers=1) as executor:
            future = executor.submit(get_compute_capability_subprocess,
                                     gpu_id)
            res = future.result()
        return res
    except concurrent.futures.process.BrokenProcessPool:
        return (device_major, device_minor, device_ratioperf)


def get_compute_capability_subprocess(gpu_id):
    """Gets the major CUDA version, minor CUDA version, and the ratio of
    floating point single precision to double precision performance.

    :param gpu_id: int
        device number of GPU
    """
    n_gpus = -1
    (n_gpus, devices) = device_count(n_gpus)
    gpu_id = gpu_id % devices

    from ..libs.lib_utils import get_lib
    lib = get_lib(n_gpus, devices)
    if lib is None:
        n_gpus = 0

    device_major = 0
    device_minor = 0
    device_ratioperf = 0
    if n_gpus > 0 and lib is not None:
        error, device_major, device_minor, device_ratioperf = \
            lib.get_compute_capability(gpu_id)
        assert error == 0, "Error in get_compute_capability_subprocess"
    return device_major, device_minor, device_ratioperf