Using a background thread to monitor actual GPU memory usage during model training (nvidia-smi)
torch.cuda.max_memory_allocated only reports the memory occupied by torch tensors. On top of that you have to add the model's own footprint, and the size of the loaded model is not simply the size of the model folder on disk. In that situation the usual fallback is to watch the numbers by eye with watch -n 0.1 nvidia-smi.
Besides, runtimes such as TensorRT do not support the torch.cuda APIs at all.
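To make the gap concrete, here is a minimal sketch (the toy model and input are hypothetical, just for illustration) of what torch.cuda.max_memory_allocated actually reports; nvidia-smi will typically show a noticeably higher number for the same process, because the CUDA context and the allocator's cached blocks are not counted here.

import torch

# Hypothetical toy workload, only to illustrate what the counter covers.
model = torch.nn.Linear(4096, 4096).cuda()
x = torch.randn(64, 4096, device="cuda")

torch.cuda.reset_peak_memory_stats()
y = model(x)
torch.cuda.synchronize()

peak = torch.cuda.max_memory_allocated() / 1024**3
# Only tensor allocations made through PyTorch's caching allocator are counted;
# the CUDA context and any memory owned by other runtimes (e.g. TensorRT) are not.
print("peak tensor memory (allocator view): {:.3f} GiB".format(peak))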
So here I spin up a separate thread to monitor GPU memory instead.
import time
from threading import Thread

import nvidia_smi  # e.g. from the nvidia-ml-py3 package


class Monitor(Thread):
    def __init__(self, delay, index):
        super(Monitor, self).__init__()
        self.stopped = False
        self.index = index
        self.delay = delay  # time between calls to nvidia-smi, in seconds

        # Record the memory already in use on this GPU as the baseline.
        nvidia_smi.nvmlInit()
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(index)
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        self.st_mem = info.used
        print("start used memory is {} GiB".format(info.used * 1.0 / 1024**3))
        nvidia_smi.nvmlShutdown()

        self.max_mem = 0
        time.sleep(self.delay)
        self.start()

    def run(self):
        # Poll nvidia-smi until stop() is called, keeping track of the peak usage.
        while not self.stopped:
            nvidia_smi.nvmlInit()
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(self.index)
            info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
            self.max_mem = max(self.max_mem, info.used)
            nvidia_smi.nvmlShutdown()
            time.sleep(self.delay)

    def stop(self):
        self.stopped = True
        self.join()
        # Report the peak usage above the baseline recorded at construction.
        res = (self.max_mem - self.st_mem) * 1.0 / 1024**3
        print("total used memory is {} GiB".format(res))
        return res


if __name__ == "__main__":
    # Instantiate the monitor with a 0.1-second delay between updates.
    monitor = Monitor(0.1, 0)
    # Train, run inference, etc.
    time.sleep(1)
    # Stop the monitor and print the result.
    monitor.stop()
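For a real measurement, the placeholder sleep above is replaced with the actual workload. A minimal usage sketch, assuming a PyTorch model on GPU 0 (the model and input names are hypothetical):

import torch

monitor = Monitor(0.1, 0)
model = torch.nn.Linear(4096, 4096).cuda()  # hypothetical model under test
x = torch.randn(64, 4096, device="cuda")
y = model(x)                                # workload to be measured
torch.cuda.synchronize()                    # make sure the GPU work has finished
monitor.stop()                              # prints peak usage above the baseline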