File size: 2,965 Bytes
7ed0fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# import time

# # Number of seconds in a day: 24 hours * 60 minutes * 60 seconds
# seconds_in_a_day = 24 * 60 * 60

# # Sleep for 500 days
# time.sleep(seconds_in_a_day * 500)

import subprocess
import time
import threading
import torch
from collections import deque

def get_gpu_details(gpu_id):
    """Query nvidia-smi for one GPU's utilization and memory figures.

    Args:
        gpu_id: index of the GPU to query (passed to ``nvidia-smi --id``).

    Returns:
        A ``(utilization_percent, used_memory, total_memory)`` tuple of
        ints, parsed from nvidia-smi's unitless CSV output.
    """
    query = subprocess.run(
        ['nvidia-smi', f'--id={gpu_id}',
         '--query-gpu=utilization.gpu,memory.used,memory.total',
         '--format=csv,noheader,nounits'],
        stdout=subprocess.PIPE, text=True,
    )
    # One CSV row, e.g. "42, 1024, 16384" — unpack raises if malformed.
    util_str, used_str, total_str = query.stdout.strip().split(', ')
    return int(util_str), int(used_str), int(total_str)

def matrix_calculation_task(gpu_id, stop_event, task_running, size=55000):
    """Busy-loop large matrix multiplications on ``gpu_id`` until stopped.

    Args:
        gpu_id: index of the CUDA device to occupy (set as this thread's
            current device; PyTorch's current device is thread-local).
        stop_event: threading.Event — setting it ends the loop.
        task_running: shared per-GPU flag list; entry ``gpu_id`` is True
            while this task is alive.
        size: side length of the square operands. Default preserves the
            original 55000x55000 — NOTE(review): ~12 GiB per fp32 matrix;
            confirm the target GPUs have enough memory.
    """
    torch.cuda.set_device(gpu_id)
    task_running[gpu_id] = True
    try:
        while not stop_event.is_set():
            # Fresh operands each pass keep the device allocating and
            # computing; the matmul result is intentionally discarded.
            a = torch.rand(size, size, device='cuda')
            b = torch.rand(size, size, device='cuda')
            torch.matmul(a, b)
    finally:
        # Clear the flag even if a CUDA OOM/error escapes the loop;
        # otherwise the monitor would believe a task is still running
        # and never start a replacement.
        task_running[gpu_id] = False

def monitor_and_manage_gpu(gpu_id, stop_event, task_running):
    """Monitor one GPU and start an occupying task when it is underused.

    Samples utilization once per second; every time 30 samples have
    accumulated, makes one decision on the 30-second average:
      * avg < 90% and no task running  -> start matrix_calculation_task
      * avg >= 90% and task running    -> leave it running
      * otherwise                      -> just report and keep watching

    Args:
        gpu_id: index of the GPU to monitor.
        stop_event: Event shared with this GPU's occupying task; cleared
            before each task start. NOTE(review): nothing here ever sets
            it, so a started task runs until the process dies — confirm
            that is intended.
        task_running: shared per-GPU flag list updated by the task thread.

    Never returns.
    """
    utilization_data = deque(maxlen=30)  # rolling window of 1 Hz samples
    while True:
        utilization, _, _ = get_gpu_details(gpu_id)
        utilization_data.append(utilization)
        if len(utilization_data) == 30:
            avg_utilization = round(sum(utilization_data) / len(utilization_data), 1)
            if avg_utilization < 90 and not task_running[gpu_id]:
                print(f"Average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is underutilized, starting task.")
                stop_event.clear()
                threading.Thread(target=matrix_calculation_task, args=(gpu_id, stop_event, task_running)).start()
            elif avg_utilization >= 90 and task_running[gpu_id]:
                print(f"Average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is normal, keep running.")
            else:
                if task_running[gpu_id]:
                    print(f"Occupying task just started, and average GPU {gpu_id} ({avg_utilization}%) is increasing, keep monitoring.")
                else:
                    print(f"No occupying task running, but average GPU {gpu_id} ({avg_utilization}%) utilization over the last 30 seconds is normal.")
            # Restart the window so the next decision (and log line) comes
            # 30 seconds from now — with maxlen=30 alone, the branch above
            # would fire every second after the first 30 samples.
            utilization_data.clear()
        time.sleep(1)  # sample once per second; decide per full window

# Total number of GPUs on this host; indices 0..num_gpus-1.
num_gpus = 8
# One stop Event per GPU, shared between its monitor and its task thread.
stop_events = [threading.Event() for _ in range(num_gpus)]
# Per-GPU flag: True while an occupying task thread is alive on that GPU.
task_running = [False] * num_gpus

# Start monitoring and task management for each GPU
# NOTE(review): the range starts at 1, so GPU 0 is never monitored or
# occupied — confirm this is intentional (e.g. GPU 0 reserved for other
# workloads).
# NOTE(review): threads are non-daemon and never joined; the script runs
# until the process is killed externally.
for gpu_id in range(1, num_gpus):
    threading.Thread(target=monitor_and_manage_gpu, args=(gpu_id, stop_events[gpu_id], task_running)).start()