Monitoring actual GPU memory usage during model training with a background thread and nvidia-smi

        torch.cuda.max_memory_allocated only returns the memory occupied by torch tensors, i.e. allocations that go through PyTorch's caching allocator. On top of that you still have to account for the model itself, and the loaded model's footprint is not simply the size of the model files on disk. In that situation the usual fallback is to watch the device by hand with watch -n 0.1 nvidia-smi.
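
        For reference, here is a minimal sketch of what the torch-side counter actually reports (the toy Linear model and tensor names below are only for illustration): it covers tensors managed by PyTorch's caching allocator, so the CUDA context and workspaces allocated outside of it do not show up.

import torch

device = torch.device("cuda:0")
model = torch.nn.Linear(1024, 1024).to(device)   # toy model, for illustration only
x = torch.randn(64, 1024, device=device)

torch.cuda.reset_peak_memory_stats(device)
y = model(x)
torch.cuda.synchronize(device)

# Only counts tensors from PyTorch's caching allocator; the CUDA context and
# memory allocated by other runtimes are not included in this number.
print("peak tensor memory: {:.3f} GiB".format(
    torch.cuda.max_memory_allocated(device) / 1024**3))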

        Besides, runtimes such as TensorRT (TRT) do not support the torch.cuda methods at all.

        So here I spawn a separate thread to monitor GPU memory.

import time
from threading import Thread

import nvidia_smi  # NVML Python bindings (e.g. from the nvidia-ml-py3 package)

class Monitor(Thread):
    """Background thread that polls NVML and records the peak memory used on one GPU."""

    def __init__(self, delay, index):
        super(Monitor, self).__init__()
        self.stopped = False
        self.index = index
        self.delay = delay  # time in seconds between NVML queries
        # Record the memory already in use on this GPU before the workload starts.
        nvidia_smi.nvmlInit()
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(index)
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        self.st_mem = info.used
        print("start used memory is {} GiB".format(info.used / 1024**3))
        nvidia_smi.nvmlShutdown()
        self.max_mem = 0

        self.start()

    def run(self):
        # Poll the GPU's total used memory every `delay` seconds and track the peak.
        while not self.stopped:
            nvidia_smi.nvmlInit()
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(self.index)
            info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
            self.max_mem = max(self.max_mem, info.used)
            nvidia_smi.nvmlShutdown()
            time.sleep(self.delay)

    def stop(self):
        self.stopped = True
        self.join()  # wait for the last poll so max_mem is final
        res = (self.max_mem - self.st_mem) / 1024**3
        print("total used memory is {} GiB".format(res))
        return res
        
if __name__ == "__main__":
    # Instantiate the monitor on GPU 0 with a 0.1-second delay between updates
    monitor = Monitor(0.1, 0)

    # Train, run inference, etc.
    time.sleep(1)

    # Stop the monitor and report the peak extra memory used
    monitor.stop()
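
        The time.sleep(1) above is just a stand-in for the real workload. Because the measurement goes through NVML rather than torch.cuda, the same pattern works for a PyTorch forward pass, a TensorRT execution context, or any other runtime. A hedged usage sketch with a toy PyTorch model (assuming the Monitor class above is in scope; the model and shapes are placeholders):

import torch

monitor = Monitor(0.1, 0)                     # poll GPU 0 every 0.1 s
model = torch.nn.Linear(4096, 4096).cuda()    # toy model, for illustration only
x = torch.randn(256, 4096, device="cuda")
y = model(x)
torch.cuda.synchronize()
peak_gib = monitor.stop()                     # extra GiB used on top of the baseline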