|
|
""" |
|
|
System monitoring service for Video Model Studio. |
|
|
Tracks system resources like CPU, memory, and other metrics. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import time |
|
|
import logging |
|
|
import platform |
|
|
import threading |
|
|
import pandas as pd |
|
|
from datetime import datetime, timedelta |
|
|
from collections import deque |
|
|
from typing import Dict, List, Optional, Tuple, Any |
|
|
|
|
|
import psutil |
|
|
|
|
|
from vms.ui.monitoring.services.gpu import GPUMonitoringService |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
logger.setLevel(logging.INFO) |
|
|
|
|
|
class MonitoringService: |
|
|
"""Service for monitoring system resources and performance""" |
|
|
|
|
|
def __init__(self, history_minutes: int = 10, sample_interval: int = 5): |
|
|
"""Initialize the monitoring service |
|
|
|
|
|
Args: |
|
|
history_minutes: How many minutes of history to keep |
|
|
sample_interval: How many seconds between samples |
|
|
""" |
|
|
self.history_minutes = history_minutes |
|
|
self.sample_interval = sample_interval |
|
|
self.max_samples = (history_minutes * 60) // sample_interval |
|
|
|
|
|
|
|
|
self.timestamps = deque(maxlen=self.max_samples) |
|
|
self.cpu_percent = deque(maxlen=self.max_samples) |
|
|
self.memory_percent = deque(maxlen=self.max_samples) |
|
|
self.memory_used = deque(maxlen=self.max_samples) |
|
|
self.memory_available = deque(maxlen=self.max_samples) |
|
|
|
|
|
|
|
|
self.cpu_temp = deque(maxlen=self.max_samples) |
|
|
|
|
|
|
|
|
self.cpu_cores_percent = {} |
|
|
|
|
|
|
|
|
self.gpu = GPUMonitoringService(history_minutes=history_minutes, sample_interval=sample_interval) |
|
|
|
|
|
|
|
|
self.is_running = False |
|
|
self.thread = None |
|
|
|
|
|
|
|
|
self.collect_metrics() |
|
|
|
|
|
def collect_metrics(self) -> Dict[str, Any]: |
|
|
"""Collect current system metrics |
|
|
|
|
|
Returns: |
|
|
Dictionary of current metrics |
|
|
""" |
|
|
metrics = { |
|
|
'timestamp': datetime.now(), |
|
|
'cpu_percent': psutil.cpu_percent(interval=0.1), |
|
|
'memory_percent': psutil.virtual_memory().percent, |
|
|
'memory_used': psutil.virtual_memory().used / (1024**3), |
|
|
'memory_available': psutil.virtual_memory().available / (1024**3), |
|
|
'cpu_temp': None, |
|
|
'per_cpu_percent': psutil.cpu_percent(interval=0.1, percpu=True) |
|
|
} |
|
|
|
|
|
|
|
|
try: |
|
|
if platform.system() == 'Linux': |
|
|
|
|
|
temps = psutil.sensors_temperatures() |
|
|
for name, entries in temps.items(): |
|
|
if name.startswith(('coretemp', 'k10temp', 'cpu_thermal')): |
|
|
metrics['cpu_temp'] = entries[0].current |
|
|
break |
|
|
elif platform.system() == 'Darwin': |
|
|
|
|
|
|
|
|
pass |
|
|
elif platform.system() == 'Windows': |
|
|
|
|
|
pass |
|
|
except (AttributeError, KeyError, IndexError, NotImplementedError): |
|
|
|
|
|
pass |
|
|
|
|
|
return metrics |
|
|
|
|
|
def update_history(self, metrics: Dict[str, Any]) -> None: |
|
|
"""Update metric history with new values |
|
|
|
|
|
Args: |
|
|
metrics: New metrics to add to history |
|
|
""" |
|
|
self.timestamps.append(metrics['timestamp']) |
|
|
self.cpu_percent.append(metrics['cpu_percent']) |
|
|
self.memory_percent.append(metrics['memory_percent']) |
|
|
self.memory_used.append(metrics['memory_used']) |
|
|
self.memory_available.append(metrics['memory_available']) |
|
|
|
|
|
if metrics['cpu_temp'] is not None: |
|
|
self.cpu_temp.append(metrics['cpu_temp']) |
|
|
|
|
|
|
|
|
for i, percent in enumerate(metrics['per_cpu_percent']): |
|
|
if i not in self.cpu_cores_percent: |
|
|
self.cpu_cores_percent[i] = deque(maxlen=self.max_samples) |
|
|
self.cpu_cores_percent[i].append(percent) |
|
|
|
|
|
def start_monitoring(self) -> None: |
|
|
"""Start background thread for collecting metrics""" |
|
|
if self.is_running: |
|
|
logger.warning("Monitoring thread already running") |
|
|
return |
|
|
|
|
|
self.is_running = True |
|
|
|
|
|
|
|
|
self.gpu.start_monitoring() |
|
|
|
|
|
def _monitor_loop(): |
|
|
while self.is_running: |
|
|
try: |
|
|
metrics = self.collect_metrics() |
|
|
self.update_history(metrics) |
|
|
time.sleep(self.sample_interval) |
|
|
except Exception as e: |
|
|
logger.error(f"Error in monitoring thread: {str(e)}", exc_info=True) |
|
|
time.sleep(self.sample_interval) |
|
|
|
|
|
self.thread = threading.Thread(target=_monitor_loop, daemon=True) |
|
|
self.thread.start() |
|
|
logger.info("System monitoring thread started") |
|
|
|
|
|
def stop_monitoring(self) -> None: |
|
|
"""Stop the monitoring thread""" |
|
|
if not self.is_running: |
|
|
return |
|
|
|
|
|
self.is_running = False |
|
|
|
|
|
|
|
|
self.gpu.stop_monitoring() |
|
|
|
|
|
if self.thread: |
|
|
self.thread.join(timeout=1.0) |
|
|
logger.info("System monitoring thread stopped") |
|
|
|
|
|
def get_current_metrics(self) -> Dict[str, Any]: |
|
|
"""Get current system metrics |
|
|
|
|
|
Returns: |
|
|
Dictionary with current system metrics |
|
|
""" |
|
|
return self.collect_metrics() |
|
|
|
|
|
def get_system_info(self) -> Dict[str, Any]: |
|
|
"""Get general system information |
|
|
|
|
|
Returns: |
|
|
Dictionary with system details |
|
|
""" |
|
|
cpu_info = { |
|
|
'cores_physical': psutil.cpu_count(logical=False), |
|
|
'cores_logical': psutil.cpu_count(logical=True), |
|
|
'current_frequency': None, |
|
|
'architecture': platform.machine(), |
|
|
} |
|
|
|
|
|
|
|
|
try: |
|
|
cpu_freq = psutil.cpu_freq() |
|
|
if cpu_freq: |
|
|
cpu_info['current_frequency'] = cpu_freq.current |
|
|
except Exception: |
|
|
pass |
|
|
|
|
|
memory_info = { |
|
|
'total': psutil.virtual_memory().total / (1024**3), |
|
|
'available': psutil.virtual_memory().available / (1024**3), |
|
|
'used': psutil.virtual_memory().used / (1024**3), |
|
|
'percent': psutil.virtual_memory().percent |
|
|
} |
|
|
|
|
|
disk_info = {} |
|
|
for part in psutil.disk_partitions(all=False): |
|
|
if os.name == 'nt' and ('cdrom' in part.opts or part.fstype == ''): |
|
|
|
|
|
continue |
|
|
try: |
|
|
usage = psutil.disk_usage(part.mountpoint) |
|
|
disk_info[part.mountpoint] = { |
|
|
'total': usage.total / (1024**3), |
|
|
'used': usage.used / (1024**3), |
|
|
'free': usage.free / (1024**3), |
|
|
'percent': usage.percent |
|
|
} |
|
|
except PermissionError: |
|
|
continue |
|
|
|
|
|
sys_info = { |
|
|
'system': platform.system(), |
|
|
'version': platform.version(), |
|
|
'platform': platform.platform(), |
|
|
'processor': platform.processor(), |
|
|
'hostname': platform.node(), |
|
|
'python_version': platform.python_version(), |
|
|
'uptime': time.time() - psutil.boot_time() |
|
|
} |
|
|
|
|
|
return { |
|
|
'cpu': cpu_info, |
|
|
'memory': memory_info, |
|
|
'disk': disk_info, |
|
|
'system': sys_info, |
|
|
} |
|
|
|
|
|
def get_cpu_data(self) -> pd.DataFrame: |
|
|
"""Get CPU usage data as a DataFrame |
|
|
|
|
|
Returns: |
|
|
DataFrame with CPU usage data |
|
|
""" |
|
|
if not self.timestamps: |
|
|
return pd.DataFrame({ |
|
|
'time': list(), |
|
|
'CPU Usage (%)': list() |
|
|
}) |
|
|
|
|
|
data = { |
|
|
'time': list(self.timestamps), |
|
|
'CPU Usage (%)': list(self.cpu_percent) |
|
|
} |
|
|
|
|
|
|
|
|
if self.cpu_temp and len(self.cpu_temp) > 0: |
|
|
|
|
|
|
|
|
temp_data = list(self.cpu_temp) |
|
|
if len(temp_data) < len(self.timestamps): |
|
|
padding = [None] * (len(self.timestamps) - len(temp_data)) |
|
|
temp_data = padding + temp_data |
|
|
data['CPU Temperature (°C)'] = temp_data |
|
|
|
|
|
return pd.DataFrame(data) |
|
|
|
|
|
def get_memory_data(self) -> pd.DataFrame: |
|
|
"""Get memory usage data as a DataFrame |
|
|
|
|
|
Returns: |
|
|
DataFrame with memory usage data |
|
|
""" |
|
|
if not self.timestamps: |
|
|
return pd.DataFrame({ |
|
|
'time': list(), |
|
|
'Memory Usage (%)': list(), |
|
|
'Memory Used (GB)': list(), |
|
|
'Memory Available (GB)': list() |
|
|
}) |
|
|
|
|
|
return pd.DataFrame({ |
|
|
'time': list(self.timestamps), |
|
|
'Memory Usage (%)': list(self.memory_percent), |
|
|
'Memory Used (GB)': list(self.memory_used), |
|
|
'Memory Available (GB)': list(self.memory_available) |
|
|
}) |
|
|
|
|
|
def get_per_core_data(self) -> Dict[int, pd.DataFrame]: |
|
|
"""Get per-core CPU usage data as DataFrames |
|
|
|
|
|
Returns: |
|
|
Dictionary of DataFrames with per-core CPU usage data |
|
|
""" |
|
|
if not self.timestamps or not self.cpu_cores_percent: |
|
|
return {} |
|
|
|
|
|
core_data = {} |
|
|
for core_id, percentages in self.cpu_cores_percent.items(): |
|
|
|
|
|
data_length = min(len(percentages), len(self.timestamps)) |
|
|
core_data[core_id] = pd.DataFrame({ |
|
|
'time': list(self.timestamps)[-data_length:], |
|
|
f'Core {core_id} Usage (%)': list(percentages)[-data_length:] |
|
|
}) |
|
|
|
|
|
return core_data |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_cpu_plot(self) -> pd.DataFrame: |
|
|
"""Get CPU usage data for plotting |
|
|
|
|
|
Returns: |
|
|
DataFrame with CPU usage data |
|
|
""" |
|
|
return self.get_cpu_data() |
|
|
|
|
|
|
|
|
def generate_memory_plot(self) -> pd.DataFrame: |
|
|
"""Get memory usage data for plotting |
|
|
|
|
|
Returns: |
|
|
DataFrame with memory usage data |
|
|
""" |
|
|
return self.get_memory_data() |
|
|
|
|
|
|
|
|
def generate_per_core_plot(self) -> pd.DataFrame: |
|
|
"""Get per-core CPU usage data for plotting |
|
|
|
|
|
Returns: |
|
|
Combined DataFrame with all cores' usage data |
|
|
""" |
|
|
core_data = self.get_per_core_data() |
|
|
if not core_data: |
|
|
return pd.DataFrame() |
|
|
|
|
|
|
|
|
first_core_id = list(core_data.keys())[0] |
|
|
combined_df = core_data[first_core_id][['time']].copy() |
|
|
|
|
|
for core_id, df in core_data.items(): |
|
|
combined_df[f'Core {core_id} Usage (%)'] = df[f'Core {core_id} Usage (%)'] |
|
|
|
|
|
return combined_df |