 from typing import Optional, Tuple, Dict, Any, List
 
 from ..logger import get_logger
-from ..config import MIN_FREE_MEMORY
 
 # Get logger
 logger = get_logger("locallab.utils.system")
 
+# System constants
+MIN_FREE_MEMORY = 2000  # Minimum required free memory in MB
+MIN_GPU_MEMORY = 4000  # Minimum required GPU memory in MB
 
 def get_system_memory() -> Tuple[int, int]:
     """Get system memory information in MB"""
@@ -28,84 +30,41 @@ def get_system_memory() -> Tuple[int, int]:
     free_memory = vm.available // (1024 * 1024)  # Convert to MB
     return total_memory, free_memory
 
-
 def get_gpu_memory() -> Optional[Tuple[int, int]]:
-    """Get GPU memory information in MB if available"""
+    """Get GPU memory information in MB"""
     if not TORCH_AVAILABLE or not torch.cuda.is_available():
         return None
-
-    # First try nvidia-ml-py3 (nvidia_smi)
-    try:
-        import nvidia_smi
-        nvidia_smi.nvmlInit()
-        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
-        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
-
-        total_memory = info.total // (1024 * 1024)  # Convert to MB
-        free_memory = info.free // (1024 * 1024)  # Convert to MB
-
-        nvidia_smi.nvmlShutdown()
+    try:
+        device = torch.cuda.current_device()
+        total_memory = torch.cuda.get_device_properties(device).total_memory // (1024 * 1024)  # Convert to MB
+        free_memory = total_memory - torch.cuda.memory_reserved() // (1024 * 1024)  # reserved already includes allocated
         return total_memory, free_memory
-    except ImportError:
-        # If nvidia_smi not available, log at debug level to avoid noise
-        logger.debug("nvidia-ml-py3 not installed, falling back to torch for GPU info")
-        # Fall back to torch for basic info
-        try:
-            # Get basic info from torch
-            device = torch.cuda.current_device()
-            total_memory = torch.cuda.get_device_properties(device).total_memory // (1024 * 1024)
-            # Note: torch doesn't provide free memory info easily, so we estimate
-            # by allocating a tensor and seeing what's available
-            torch.cuda.empty_cache()
-            free_memory = total_memory  # Optimistic starting point
-
-            # Rough estimate - we can't get exact free memory from torch easily
-            return total_memory, free_memory
-        except Exception as torch_error:
-            logger.debug(f"Torch GPU memory check also failed: {str(torch_error)}")
-            return None
     except Exception as e:
-        logger.debug(f"Failed to get detailed GPU memory info: {str(e)}")
-        # Fall back to torch for basic info (same as ImportError case)
-        try:
-            device = torch.cuda.current_device()
-            total_memory = torch.cuda.get_device_properties(device).total_memory // (1024 * 1024)
-            torch.cuda.empty_cache()
-            free_memory = total_memory  # Optimistic estimate
-            return total_memory, free_memory
-        except Exception:
-            return None
-
+        logger.warning(f"Failed to get GPU memory info: {e}")
+        return None
 
-def check_resource_availability(required_memory: int) -> bool:
-    """Check if system has enough resources for the requested operation"""
+def check_resource_availability(required_memory: int = MIN_FREE_MEMORY) -> bool:
+    """Check if system has enough resources"""
     _, free_memory = get_system_memory()
-
-    # Check system memory
-    if free_memory < MIN_FREE_MEMORY:
-        logger.warning(f"Low system memory: {free_memory}MB available")
+    if free_memory < required_memory:
         return False
-
-    # If GPU is available, check GPU memory
-    if TORCH_AVAILABLE and torch.cuda.is_available():
-        gpu_memory = get_gpu_memory()
-        if gpu_memory:
-            total_gpu, free_gpu = gpu_memory
-            if free_gpu < required_memory:
-                logger.warning(f"Insufficient GPU memory: {free_gpu}MB available, {required_memory}MB required")
-                return False
-
+
+    gpu_mem = get_gpu_memory()
+    if gpu_mem is not None:
+        _, free_gpu = gpu_mem
+        if free_gpu < MIN_GPU_MEMORY:
+            return False
+
     return True
 
-
 def get_device() -> str:
     """Get the device to use for computations."""
     if TORCH_AVAILABLE and torch.cuda.is_available():
         return "cuda"
     else:
         return "cpu"
 
-
 def format_model_size(size_in_bytes: int) -> str:
     """Format model size in human-readable format"""
     for unit in ['B', 'KB', 'MB', 'GB']:
@@ -114,7 +73,6 @@ def format_model_size(size_in_bytes: int) -> str:
         size_in_bytes /= 1024
     return f"{size_in_bytes:.2f} TB"
 
-
 def get_system_resources() -> Dict[str, Any]:
     """Get system resource information"""
     resources = {
@@ -146,7 +104,6 @@ def get_system_resources() -> Dict[str, Any]:
 
     return resources
 
-
 def get_cpu_info() -> Dict[str, Any]:
     """Get information about the CPU."""
     return {
@@ -155,7 +112,6 @@ def get_cpu_info() -> Dict[str, Any]:
         "usage": psutil.cpu_percent(interval=0.1)
     }
 
-
 def get_gpu_info() -> List[Dict[str, Any]]:
     """Get detailed information about all available GPUs.
 
@@ -231,7 +187,6 @@ def get_gpu_info() -> List[Dict[str, Any]]:
 
     return gpu_info
 
-
 def get_memory_info() -> Dict[str, Any]:
     """Get information about the system memory."""
     mem = psutil.virtual_memory()
@@ -242,8 +197,7 @@ def get_memory_info() -> Dict[str, Any]:
         "percent": mem.percent
     }
 
-
 # Add this function for backward compatibility
 def get_system_info() -> Dict[str, Any]:
     """Get system resource information (alias for get_system_resources)"""
-    return get_system_resources()
\ No newline at end of file
+    return get_system_resources()
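
For reference, a minimal usage sketch of the revised helpers. This is not part of the diff: it assumes the package imports as `locallab` and that `TORCH_AVAILABLE`, `torch`, and `psutil` are wired up by the surrounding module as in the file above.

# Usage sketch (hypothetical caller; not part of this change)
from locallab.utils.system import (
    check_resource_availability,
    get_device,
    get_gpu_memory,
    get_system_memory,
)

total_mb, free_mb = get_system_memory()
print(f"RAM: {free_mb} MB free of {total_mb} MB")

gpu = get_gpu_memory()  # None when torch is unavailable or no CUDA device exists
if gpu is not None:
    gpu_total_mb, gpu_free_mb = gpu
    print(f"GPU: {gpu_free_mb} MB free of {gpu_total_mb} MB")

# With no argument, the check uses MIN_FREE_MEMORY (2000 MB) for RAM and,
# when a GPU is visible, MIN_GPU_MEMORY (4000 MB) for VRAM.
if check_resource_availability():
    print(f"Enough headroom to load a model on {get_device()}")

One caveat on the new torch-based estimate in get_gpu_memory: it only sees this process's caching allocator, so VRAM held by other processes is invisible to it. On torch builds that provide it, torch.cuda.mem_get_info() returns device-wide (free, total) bytes and could be a more accurate source for the same numbers.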