Add ability to request all available gpus

fbcotter · fbcotter · commit 5b5fd6f30354 · 2019-10-11T14:25:10.000+01:00
Also increase the PID field width in py3smi and add a --left option to show the left part of long process names
diff --git a/py3nvml/__init__.py b/py3nvml/__init__.py
@@ -5,4 +5,4 @@
 from py3nvml.utils import grab_gpus, get_free_gpus, get_num_procs
 
 __all__ = ['py3nvml', 'nvidia_smi', 'grab_gpus', 'get_free_gpus', 'get_num_procs']
-__version__ = "0.2.3"
+__version__ = "0.2.4"
diff --git a/py3nvml/utils.py b/py3nvml/utils.py
@@ -17,13 +17,13 @@ def grab_gpus(num_gpus=1, gpu_select=None, gpu_fraction=0.95, max_procs=-1):
     variable. Other programs can still come along and snatch your gpu. This
     function is more about preventing **you** from stealing someone else's GPU.
 
-    If more than 1 GPU is requested but the full amount are available, then it
+    If more than 1 GPU is requested but not all are available, then it
     will set the CUDA_VISIBLE_DEVICES variable to see all the available GPUs.
     A warning is generated in this case.
 
     If one or more GPUs were requested and none were available, a Warning
-    will be raised. Before raising it, the CUDA_VISIBLE_DEVICES will be set to a
-    blank string. This means the calling function can ignore this warning and
+    will be raised. Before raising it, the CUDA_VISIBLE_DEVICES will be set to
+    a blank string. This means the calling function can ignore this warning and
     proceed if it chooses to only use the CPU, and it should still be protected
     against putting processes on a busy GPU.
 
@@ -33,10 +33,11 @@ def grab_gpus(num_gpus=1, gpu_select=None, gpu_fraction=0.95, max_procs=-1):
     Parameters
     ----------
     num_gpus : int
-        How many gpus your job needs (optional)
+        How many gpus your job needs (optional). Set to -1 to take all
+        remaining available GPUs.
     gpu_select : iterable
         A single int or an iterable of ints indicating gpu numbers to
-        search through.  If left blank, will search through all gpus.
+        search through. If None, will search through all gpus.
     gpu_fraction : float
         The fractional of a gpu memory that must be free for the script to see
         the gpu as free. Defaults to 1. Useful if someone has grabbed a tiny
@@ -55,6 +56,8 @@ def grab_gpus(num_gpus=1, gpu_select=None, gpu_fraction=0.95, max_procs=-1):
     RuntimeWarning
         If couldn't connect with NVIDIA drivers.
         If 1 or more gpus were requested and none were available.
+        Will NOT raise a RuntimeWarning for mismatch in GPU availability if
+        `num_gpus` is -1.
     ValueError
         If the gpu_select option was not understood (can fix by leaving this
         field blank, providing an int or an iterable of ints).
@@ -70,15 +73,20 @@ def grab_gpus(num_gpus=1, gpu_select=None, gpu_fraction=0.95, max_procs=-1):
     try:
         py3nvml.nvmlInit()
     except:
-        str_ = """ Couldn't connect to nvml drivers. Check they are installed correctly.
-Proceeding on cpu only..."""
+        str_ = "Couldn't connect to nvml drivers. Check they are installed " \
+            "correctly.\nProceeding on cpu only..."
         warnings.warn(str_, RuntimeWarning)
         logger.warn(str_)
         return 0
 
     numDevices = py3nvml.nvmlDeviceGetCount()
     gpu_free = [False]*numDevices
 
+    warn_about_fewer_gpus = True
+    if num_gpus == -1:
+        num_gpus = numDevices
+        warn_about_fewer_gpus = False
+
     # Flag which gpus we can check
     if gpu_select is None:
         gpu_check = [True] * numDevices
@@ -91,8 +99,8 @@ def grab_gpus(num_gpus=1, gpu_select=None, gpu_fraction=0.95, max_procs=-1):
                 for i in gpu_select:
                     gpu_check[i] = True
             except:
-                raise ValueError('''Please provide an int or an iterable of ints
-                    for gpu_select''')
+                raise ValueError('Please set gpu_select to None, an int or an '
+                                 'iterable of ints.')
 
     # Print out GPU device info. Useful for debugging.
     for i in range(numDevices):
@@ -112,7 +120,7 @@ def grab_gpus(num_gpus=1, gpu_select=None, gpu_fraction=0.95, max_procs=-1):
     if max_procs >= 0:
         procs_ok = get_free_gpus(max_procs=max_procs)
     else:
-        procs_ok = [True,] * numDevices
+        procs_ok = [True, ] * numDevices
 
     # Now check if any devices are suitable
     for i in range(numDevices):
@@ -145,9 +153,9 @@ def grab_gpus(num_gpus=1, gpu_select=None, gpu_fraction=0.95, max_procs=-1):
             logger.info('Using {}'.format(use_gpus))
             os.environ['CUDA_VISIBLE_DEVICES'] = use_gpus
             return num_gpus
-        else:
+        elif warn_about_fewer_gpus:
             # use everything we can.
-            s = "Only {} GPUs found but {}".format(sum(gpu_free), num_gpus) + \
+            s = "Only {} GPUs found but {} ".format(sum(gpu_free), num_gpus) + \
                 "requested. Allocating these and continuing."
             warnings.warn(s, RuntimeWarning)
             logger.warn(s)
diff --git a/scripts/py3smi b/scripts/py3smi
@@ -3,7 +3,7 @@ from __future__ import print_function
 from __future__ import division
 from __future__ import absolute_import
 
-from py3nvml.py3nvml import *
+from py3nvml.py3nvml import *  
 from datetime import datetime
 import re
 import os
@@ -12,20 +12,20 @@ from subprocess import Popen, PIPE
 import argparse
 from time import sleep
 import sys
+from contextlib import contextmanager
 
 parser = argparse.ArgumentParser(description='Print GPU stats')
-parser.add_argument('-l', '--loop', action='store', type=int,
-                    default=0, help='Loop period')
-parser.add_argument('-f', '--full', action='store_true',
-                    help='Print extended version')
-parser.add_argument('-w', '--width', type=int, default=77,
-                    help='Print width')
+parser.add_argument('-l', '--loop', action='store', type=int, default=0, help='Loop period')
+parser.add_argument('-f', '--full', action='store_true', help='Print extended version')
+parser.add_argument('-w', '--width', type=int, default=77, help='Print width')
+parser.add_argument('--left', action='store_true', help='Prints left part of process name')
 
 COL1_WIDTH = 33
 COL2_WIDTH = 21
 COL3_WIDTH = 21
 WIDTH = 77
 LONG_FORMAT = False
+LEN_PROCESS_LESS_NAME = 51
 
 gpu_format_col1 = '| {:>3} {:3} {:>5} {:>4} {:>11}|'
 gpu_format_col2 = ' {:>19} |'
@@ -76,6 +76,7 @@ def print_proc_header():
     print('+' + '=' * args.width + '+')
     return 6
 
+
 def enabled_str(x):
     if x == 'Enabled':
         return 'On'
@@ -123,7 +124,6 @@ def print_gpu_info(index, long_format=False):
             print(gpu_format_col3.format('', ''))
             return 1
 
-
     min_number = try_get_info(nvmlDeviceGetMinorNumber, h)
     prod_name = try_get_info(nvmlDeviceGetName, h)
     pers_mode = try_get_info(nvmlDeviceGetPersistenceMode, h, 0)
@@ -199,10 +199,13 @@ def print_gpu_info(index, long_format=False):
     return n
 
 
-def cut_proc_name(name, maxlen):
+def cut_proc_name(name, maxlen, left=False):
     if len(name) > maxlen:
         #  return '...' + name[-maxlen+3:]
-        return name[:maxlen-3] + '...'
+        if left:
+            return name[:maxlen-2] + '..'
+        else:
+            return '..' + name[-maxlen+2:]
     else:
         return name
 
@@ -240,7 +243,7 @@ def get_uptime(pid):
     return time
 
 
-def main(full=False):
+def main(full=False, left=False):
     num_lines = 0
     driver_version = nvmlSystemGetDriverVersion()
     header_lines = print_header(driver_version, full)
@@ -276,7 +279,7 @@ def main(full=False):
             uptime = get_uptime(p.pid)
             print(proc_format.format(
                 min_number, uname, p.pid, uptime,
-                cut_proc_name(procname, args.width-50),
+                cut_proc_name(procname, args.width-LEN_PROCESS_LESS_NAME, left),
                 p.usedGpuMemory >> 20, 'MiB'))
             proc_lines += 1
     print('+' + '-' * args.width + '+')
@@ -290,22 +293,20 @@ def main(full=False):
 
 if __name__ == '__main__':
     args = parser.parse_args()
-    proc_format = '| {:>3}  {:>11}  {:>5}  {:>11}  {: <' + str(args.width-50) + '}  {:>5}{:3<} |'
+    proc_format = '| {:>3}  {:>11}  {:>7}  {:>10}  {: <' + str(args.width-LEN_PROCESS_LESS_NAME) + '}  {:>5}{:3<} |'
     nvmlInit()
-    print_lines = main(args.full)
+    print_lines = main(args.full, args.left)
 
     if args.loop > 0:
         try:
             while True:
                 sleep(args.loop)
                 sys.stdout.write("\033[F" * print_lines)
-                print_lines_new = main(args.full)
+                print_lines_new = main(args.full, args.left)
                 if print_lines_new < print_lines:
                     sys.stdout.write((' '*(args.width+2)+'\n')*(print_lines - print_lines_new))
                     sys.stdout.write("\033[F" * (print_lines - print_lines_new))
                 print_lines = print_lines_new
         except KeyboardInterrupt:
             pass
     nvmlShutdown()
-
-