/home/b/lab/denet/src/core/process_monitor.rs
Line | Count | Source |
1 | | #[cfg(any(not(target_os = "linux"), test))] |
2 | | #[allow(unused_imports)] |
3 | | use crate::core::constants::delays; |
4 | | use crate::core::constants::system; |
5 | | use crate::monitor::{ |
6 | | AggregatedMetrics, ChildProcessMetrics, Metrics, ProcessMetadata, ProcessTreeMetrics, Summary, |
7 | | }; |
8 | | use std::collections::HashMap; |
9 | | use std::fs::File; |
10 | | use std::io::{self, BufRead, BufReader}; |
11 | | use std::path::Path; |
12 | | use std::process::{Child, Command, Stdio}; |
13 | | use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; |
14 | | use sysinfo::{self, Pid, ProcessRefreshKind, ProcessesToUpdate, System}; |
15 | | |
16 | | // Constants for better maintainability |
17 | | const DEFAULT_THREAD_COUNT: usize = 1; |
18 | | const LINUX_PROC_DIR: &str = "/proc"; |
19 | | const LINUX_TASK_SUBDIR: &str = "/task"; |
20 | | |
21 | | /// Get the number of threads for a process |
22 | | /// In the long run, we will want this function to be more robust |
23 | | /// or use platform-specific APIs. For now, we'll keep it simple. |
24 | 41 | pub(crate) fn get_thread_count(pid: usize) -> usize { |
25 | 41 | get_linux_thread_count(pid).unwrap_or(DEFAULT_THREAD_COUNT) |
26 | 41 | } |
27 | | |
28 | | #[cfg(target_os = "linux")] |
29 | 41 | fn get_linux_thread_count(pid: usize) -> Option<usize> { |
30 | 41 | let task_dir = format!("{}/{}{}", LINUX_PROC_DIR, pid, LINUX_TASK_SUBDIR); |
31 | 41 | std::fs::read_dir(task_dir) |
32 | 41 | .ok() |
33 | 41 | .map(|entries| entries.count()) |
34 | 41 | } |
35 | | |
36 | | #[cfg(not(target_os = "linux"))] |
37 | | fn get_linux_thread_count(_pid: usize) -> Option<usize> { |
38 | | None |
39 | | } |
40 | | |
41 | | /// Read metrics from a JSON file and generate a summary |
42 | 0 | pub fn summary_from_json_file<P: AsRef<Path>>(path: P) -> io::Result<Summary> { |
43 | 0 | let file = File::open(path)?; |
44 | 0 | let reader = BufReader::new(file); |
45 | 0 | |
46 | 0 | let mut metrics_vec: Vec<AggregatedMetrics> = Vec::new(); |
47 | 0 | let mut regular_metrics: Vec<Metrics> = Vec::new(); |
48 | 0 | let mut first_timestamp: Option<u64> = None; |
49 | 0 | let mut last_timestamp: Option<u64> = None; |
50 | | |
51 | | // Process file line by line since each line is a separate JSON object |
52 | 0 | for line in reader.lines() { |
53 | 0 | let line = line?; |
54 | | |
55 | | // Skip empty lines |
56 | 0 | if line.trim().is_empty() { |
57 | 0 | continue; |
58 | 0 | } |
59 | | |
60 | | // Try to parse as different types of metrics |
61 | 0 | if let Ok(agg_metric) = serde_json::from_str::<AggregatedMetrics>(&line) { |
62 | | // Got aggregated metrics |
63 | 0 | if first_timestamp.is_none() { |
64 | 0 | first_timestamp = Some(agg_metric.ts_ms); |
65 | 0 | } |
66 | 0 | last_timestamp = Some(agg_metric.ts_ms); |
67 | 0 | metrics_vec.push(agg_metric); |
68 | 0 | } else if let Ok(tree_metrics) = serde_json::from_str::<ProcessTreeMetrics>(&line) { |
69 | | // Got tree metrics, extract aggregated metrics if available |
70 | 0 | if let Some(agg) = tree_metrics.aggregated { |
71 | 0 | if first_timestamp.is_none() { |
72 | 0 | first_timestamp = Some(agg.ts_ms); |
73 | 0 | } |
74 | 0 | last_timestamp = Some(agg.ts_ms); |
75 | 0 | metrics_vec.push(agg); |
76 | 0 | } |
77 | 0 | } else if let Ok(metric) = serde_json::from_str::<Metrics>(&line) { |
78 | | // Got regular metrics |
79 | 0 | if first_timestamp.is_none() { |
80 | 0 | first_timestamp = Some(metric.ts_ms); |
81 | 0 | } |
82 | 0 | last_timestamp = Some(metric.ts_ms); |
83 | 0 | regular_metrics.push(metric); |
84 | 0 | } |
85 | | // Ignore metadata and other lines we can't parse |
86 | | } |
87 | | |
88 | | // Calculate total time |
89 | 0 | let elapsed_time = match (first_timestamp, last_timestamp) { |
90 | 0 | (Some(first), Some(last)) => (last - first) as f64 / 1000.0, |
91 | 0 | _ => 0.0, |
92 | | }; |
93 | | |
94 | | // Generate summary based on the metrics we found |
95 | 0 | if !metrics_vec.is_empty() { |
96 | 0 | Ok(Summary::from_aggregated_metrics(&metrics_vec, elapsed_time)) |
97 | 0 | } else if !regular_metrics.is_empty() { |
98 | 0 | Ok(Summary::from_metrics(&regular_metrics, elapsed_time)) |
99 | | } else { |
100 | 0 | Ok(Summary::default()) // Return empty summary if no metrics found |
101 | | } |
102 | 0 | } |
103 | | |
104 | | #[derive(Debug, Clone)] |
105 | | pub struct IoBaseline { |
106 | | pub disk_read_bytes: u64, |
107 | | pub disk_write_bytes: u64, |
108 | | pub net_rx_bytes: u64, |
109 | | pub net_tx_bytes: u64, |
110 | | } |
111 | | |
112 | | #[derive(Debug, Clone)] |
113 | | pub struct ChildIoBaseline { |
114 | | pub pid: usize, |
115 | | pub disk_read_bytes: u64, |
116 | | pub disk_write_bytes: u64, |
117 | | pub net_rx_bytes: u64, |
118 | | pub net_tx_bytes: u64, |
119 | | } |
120 | | |
121 | | // Main process monitor implementation |
122 | | pub struct ProcessMonitor { |
123 | | child: Option<Child>, |
124 | | pid: usize, |
125 | | sys: System, |
126 | | base_interval: Duration, |
127 | | max_interval: Duration, |
128 | | start_time: Instant, |
129 | | t0_ms: u64, |
130 | | io_baseline: Option<IoBaseline>, |
131 | | child_io_baselines: std::collections::HashMap<usize, ChildIoBaseline>, |
132 | | since_process_start: bool, |
133 | | _include_children: bool, |
134 | | _max_duration: Option<Duration>, |
135 | | enable_ebpf: bool, |
136 | | debug_mode: bool, |
137 | | #[cfg(feature = "ebpf")] |
138 | | ebpf_tracker: Option<crate::ebpf::SyscallTracker>, |
139 | | last_refresh_time: Instant, |
140 | | cpu_sampler: crate::cpu_sampler::CpuSampler, |
141 | | } |
142 | | |
143 | | // We'll use a Result type directly instead of a custom ErrorType to avoid orphan rule issues |
144 | | pub type ProcessResult<T> = std::result::Result<T, std::io::Error>; |
145 | | |
146 | | impl ProcessMonitor { |
147 | 16 | pub fn new( |
148 | 16 | cmd: Vec<String>, |
149 | 16 | base_interval: Duration, |
150 | 16 | max_interval: Duration, |
151 | 16 | ) -> ProcessResult<Self> { |
152 | 16 | Self::new_with_options(cmd, base_interval, max_interval, false) |
153 | 16 | } |
154 | | |
155 | | // Create a new process monitor with I/O accounting options |
156 | 23 | pub fn new_with_options( |
157 | 23 | cmd: Vec<String>, |
158 | 23 | base_interval: Duration, |
159 | 23 | max_interval: Duration, |
160 | 23 | since_process_start: bool, |
161 | 23 | ) -> ProcessResult<Self> { |
162 | 23 | if cmd.is_empty() { |
163 | 2 | return Err(std::io::Error::new( |
164 | 2 | std::io::ErrorKind::InvalidInput, |
165 | 2 | "Command cannot be empty", |
166 | 2 | )); |
167 | 21 | } |
168 | | |
169 | 21 | let child = Command::new(&cmd[0]) |
170 | 21 | .args(&cmd[1..]) |
171 | 21 | .stdout(Stdio::null()) |
172 | 21 | .stderr(Stdio::null()) |
173 | 21 | .spawn()?; |
174 | 19 | let pid = child.id(); |
175 | 19 | |
176 | 19 | // Use minimal system initialization - avoid expensive system-wide scans |
177 | 19 | let mut sys = System::new(); |
178 | 19 | // Only refresh CPU info once at startup |
179 | 19 | sys.refresh_cpu_all(); |
180 | 19 | |
181 | 19 | let now = Instant::now(); |
182 | 19 | Ok(Self { |
183 | 19 | child: Some(child), |
184 | 19 | pid: pid.try_into().unwrap(), |
185 | 19 | sys, |
186 | 19 | base_interval, |
187 | 19 | max_interval, |
188 | 19 | start_time: now, |
189 | 19 | t0_ms: SystemTime::now() |
190 | 19 | .duration_since(UNIX_EPOCH) |
191 | 19 | .expect("Time went backwards") |
192 | 19 | .as_millis() as u64, |
193 | 19 | _include_children: true, |
194 | 19 | _max_duration: None, |
195 | 19 | debug_mode: false, |
196 | 19 | io_baseline: None, |
197 | 19 | child_io_baselines: std::collections::HashMap::new(), |
198 | 19 | since_process_start, |
199 | 19 | enable_ebpf: false, |
200 | 19 | #[cfg(feature = "ebpf")] |
201 | 19 | ebpf_tracker: None, |
202 | 19 | last_refresh_time: now, |
203 | 19 | #[cfg(target_os = "linux")] |
204 | 19 | cpu_sampler: crate::cpu_sampler::CpuSampler::new(), |
205 | 19 | }) |
206 | 23 | } |
207 | | |
208 | | // Create a process monitor for an existing process |
209 | 0 | pub fn from_pid( |
210 | 0 | pid: usize, |
211 | 0 | base_interval: Duration, |
212 | 0 | max_interval: Duration, |
213 | 0 | ) -> ProcessResult<Self> { |
214 | 0 | Self::from_pid_with_options(pid, base_interval, max_interval, false) |
215 | 0 | } |
216 | | |
217 | | // Create a process monitor for an existing process with I/O accounting options |
218 | 2 | pub fn from_pid_with_options( |
219 | 2 | pid: usize, |
220 | 2 | base_interval: Duration, |
221 | 2 | max_interval: Duration, |
222 | 2 | since_process_start: bool, |
223 | 2 | ) -> ProcessResult<Self> { |
224 | 2 | // Use minimal system initialization - avoid expensive system-wide scans |
225 | 2 | let mut sys = System::new(); |
226 | 2 | // Only refresh CPU info once at startup |
227 | 2 | sys.refresh_cpu_all(); |
228 | 2 | |
229 | 2 | // Check if the specific process exists - much faster than system-wide scan |
230 | 2 | let pid_sys = Pid::from_u32(pid as u32); |
231 | 2 | |
232 | 2 | // Try to refresh just this process instead of all processes |
233 | 2 | let mut retries = 3; |
234 | 2 | let mut process_found = false; |
235 | | |
236 | 6 | while retries > 0 && !process_found { |
237 | | // Only refresh the specific process we care about |
238 | 4 | sys.refresh_processes_specifics( |
239 | 4 | ProcessesToUpdate::Some(&[pid_sys]), |
240 | 4 | true, |
241 | 4 | ProcessRefreshKind::everything(), |
242 | 4 | ); |
243 | 4 | if sys.process(pid_sys).is_some() { |
244 | 1 | process_found = true; |
245 | 3 | } else { |
246 | 3 | retries -= 1; |
247 | 3 | // Shorter sleep since we're doing targeted refresh |
248 | 3 | std::thread::sleep(system::PROCESS_DETECTION); |
249 | 3 | } |
250 | | } |
251 | | |
252 | 2 | if !process_found { |
253 | 1 | return Err(std::io::Error::new( |
254 | 1 | std::io::ErrorKind::NotFound, |
255 | 1 | format!("Process with PID {pid} not found"), |
256 | 1 | )); |
257 | 1 | } |
258 | 1 | |
259 | 1 | let now = Instant::now(); |
260 | 1 | Ok(Self { |
261 | 1 | child: None, |
262 | 1 | pid, |
263 | 1 | sys, |
264 | 1 | base_interval, |
265 | 1 | max_interval, |
266 | 1 | start_time: now, |
267 | 1 | t0_ms: SystemTime::now() |
268 | 1 | .duration_since(UNIX_EPOCH) |
269 | 1 | .expect("Time went backwards") |
270 | 1 | .as_millis() as u64, |
271 | 1 | _include_children: true, |
272 | 1 | _max_duration: None, |
273 | 1 | debug_mode: false, |
274 | 1 | io_baseline: None, |
275 | 1 | child_io_baselines: std::collections::HashMap::new(), |
276 | 1 | since_process_start, |
277 | 1 | enable_ebpf: false, |
278 | 1 | #[cfg(feature = "ebpf")] |
279 | 1 | ebpf_tracker: None, |
280 | 1 | last_refresh_time: now, |
281 | 1 | #[cfg(target_os = "linux")] |
282 | 1 | cpu_sampler: crate::cpu_sampler::CpuSampler::new(), |
283 | 1 | }) |
284 | 2 | } |
285 | | |
286 | | /// Set debug mode for verbose output |
287 | 0 | pub fn set_debug_mode(&mut self, debug: bool) { |
288 | 0 | self.debug_mode = debug; |
289 | 0 | |
290 | 0 | #[cfg(feature = "ebpf")] |
291 | 0 | unsafe { |
292 | 0 | crate::ebpf::debug::set_debug_mode(debug); |
293 | 0 | } |
294 | 0 | |
295 | 0 | if debug { |
296 | 0 | log::info!("Debug mode enabled - verbose output will be shown"); |
297 | 0 | } |
298 | 0 | } |
299 | | |
300 | | /// Enable eBPF profiling for this monitor |
301 | | #[cfg(feature = "ebpf")] |
302 | | pub fn enable_ebpf(&mut self) -> crate::error::Result<()> { |
303 | | if !self.enable_ebpf { |
304 | | log::info!("Attempting to enable eBPF profiling"); |
305 | | if self.debug_mode { |
306 | | println!("DEBUG: Attempting to enable eBPF profiling"); |
307 | | |
308 | | // Print current process info |
309 | | println!( |
310 | | "DEBUG: Process monitor running with PID: {}", |
311 | | std::process::id() |
312 | | ); |
313 | | println!("DEBUG: Monitoring target PID: {}", self.pid); |
314 | | |
315 | | // Check for eBPF feature compilation |
316 | | println!("DEBUG: eBPF feature is enabled at compile time"); |
317 | | } |
318 | | |
319 | | // Collect all PIDs in the process tree |
320 | | let mut pids = vec![self.pid as u32]; |
321 | | |
322 | | // Add child PIDs |
323 | | self.sys.refresh_processes(ProcessesToUpdate::All, true); |
324 | | if let Some(_parent_proc) = self.sys.process(Pid::from_u32(self.pid as u32)) { |
325 | | for (child_pid, _) in self.sys.processes() { |
326 | | if let Some(child_proc) = self.sys.process(*child_pid) { |
327 | | if let Some(parent_pid) = child_proc.parent() { |
328 | | if parent_pid == Pid::from_u32(self.pid as u32) { |
329 | | pids.push(child_pid.as_u32()); |
330 | | } |
331 | | } |
332 | | } |
333 | | } |
334 | | } |
335 | | |
336 | | if self.debug_mode { |
337 | | println!( |
338 | | "DEBUG: Collected {} PIDs to monitor: {:?}", |
339 | | pids.len(), |
340 | | pids |
341 | | ); |
342 | | } |
343 | | log::info!("Collected {} PIDs to monitor", pids.len()); |
344 | | |
345 | | // Check system readiness for eBPF |
346 | | if self.debug_mode { |
347 | | let readiness_check = std::process::Command::new("sh") |
348 | | .arg("-c") |
349 | | .arg("echo 'Checking eBPF prerequisites from process_monitor:'; \ |
350 | | echo -n 'Kernel version: '; uname -r; \ |
351 | | echo -n 'Debugfs mounted: '; mount | grep -q debugfs && echo 'YES' || echo 'NO'; \ |
352 | | echo -n 'Tracefs accessible: '; [ -d /sys/kernel/debug/tracing ] && echo 'YES' || echo 'NO';") |
353 | | .output(); |
354 | | |
355 | | if let Ok(output) = readiness_check { |
356 | | let report = String::from_utf8_lossy(&output.stdout); |
357 | | println!("DEBUG: {}", report); |
358 | | log::info!("eBPF readiness: {}", report); |
359 | | } |
360 | | } |
361 | | |
362 | | // Initialize eBPF tracker |
363 | | match crate::ebpf::SyscallTracker::new(pids) { |
364 | | Ok(tracker) => { |
365 | | self.ebpf_tracker = Some(tracker); |
366 | | self.enable_ebpf = true; |
367 | | log::info!("✅ eBPF profiling successfully enabled"); |
368 | | if self.debug_mode { |
369 | | println!("DEBUG: eBPF profiling successfully enabled"); |
370 | | } |
371 | | Ok(()) |
372 | | } |
373 | | Err(e) => { |
374 | | log::warn!("Failed to enable eBPF: {}", e); |
375 | | if self.debug_mode { |
376 | | println!("DEBUG: Failed to enable eBPF: {}", e); |
377 | | |
378 | | // Additional diagnostics |
379 | | if let Ok(output) = std::process::Command::new("sh") |
380 | | .arg("-c") |
381 | | .arg("dmesg | grep -i bpf | tail -5") |
382 | | .output() |
383 | | { |
384 | | let kernel_logs = String::from_utf8_lossy(&output.stdout); |
385 | | if !kernel_logs.trim().is_empty() { |
386 | | println!("DEBUG: Recent kernel BPF logs:\n{}", kernel_logs); |
387 | | log::warn!("Recent kernel BPF logs:\n{}", kernel_logs); |
388 | | } |
389 | | } |
390 | | } |
391 | | |
392 | | Err(e) |
393 | | } |
394 | | } |
395 | | } else { |
396 | | // Already enabled, just return success |
397 | | Ok(()) |
398 | | } |
399 | | } |
400 | | |
401 | | /// Enable eBPF profiling for this monitor (no-op on non-eBPF builds) |
402 | | #[cfg(not(feature = "ebpf"))] |
403 | 0 | pub fn enable_ebpf(&mut self) -> crate::error::Result<()> { |
404 | 0 | log::warn!("eBPF feature not enabled at compile time"); |
405 | 0 | if self.debug_mode { |
406 | 0 | println!( |
407 | 0 | "DEBUG: eBPF feature not enabled at compile time. Cannot enable eBPF profiling." |
408 | 0 | ); |
409 | 0 | println!("DEBUG: To enable eBPF support, rebuild with: cargo build --features ebpf"); |
410 | 0 | } |
411 | | // Set the flag to false to ensure consistent behavior |
412 | 0 | self.enable_ebpf = false; |
413 | 0 | Err(crate::error::DenetError::EbpfNotSupported( |
414 | 0 | "eBPF feature not enabled. Build with --features ebpf".to_string(), |
415 | 0 | )) |
416 | 0 | } |
417 | | |
418 | 2 | pub fn adaptive_interval(&self) -> Duration { |
419 | 2 | // Adaptive sampling strategy: |
420 | 2 | // - First 1 second: use base_interval (fast sampling for short processes) |
421 | 2 | // - 1-10 seconds: gradually increase from base to max |
422 | 2 | // - After 10 seconds: use max_interval |
423 | 2 | let elapsed = self.start_time.elapsed().as_secs_f64(); |
424 | | |
425 | 2 | let interval_secs = if elapsed < 1.0 { |
426 | | // First second: sample at base rate |
427 | 1 | self.base_interval.as_secs_f64() |
428 | 1 | } else if elapsed < 10.0 { |
429 | | // 1-10 seconds: linear interpolation between base and max |
430 | 1 | let t = (elapsed - 1.0) / 9.0; // 0 to 1 over 9 seconds |
431 | 1 | let base = self.base_interval.as_secs_f64(); |
432 | 1 | let max = self.max_interval.as_secs_f64(); |
433 | 1 | base + (max - base) * t |
434 | | } else { |
435 | | // After 10 seconds: use max interval |
436 | 0 | self.max_interval.as_secs_f64() |
437 | | }; |
438 | | |
439 | 2 | Duration::from_secs_f64(interval_secs) |
440 | 2 | } |
441 | | |
442 | 35 | pub fn sample_metrics(&mut self) -> Option<Metrics> { |
443 | 35 | let now = Instant::now(); |
444 | 35 | self.last_refresh_time = now; |
445 | 35 | |
446 | 35 | // We still need to refresh the process for memory and other metrics |
447 | 35 | // But we don't need the CPU refresh delay for Linux anymore |
448 | 35 | let pid = Pid::from_u32(self.pid as u32); |
449 | 35 | self.sys.refresh_processes_specifics( |
450 | 35 | ProcessesToUpdate::Some(&[pid]), |
451 | 35 | false, |
452 | 35 | ProcessRefreshKind::everything(), |
453 | 35 | ); |
454 | | |
455 | 35 | if let Some(proc) = self.sys.process(pid) { |
456 | | // sysinfo returns memory in bytes, so we need to convert to KB |
457 | 35 | let mem_rss_kb = proc.memory() / 1024; |
458 | 35 | let mem_vms_kb = proc.virtual_memory() / 1024; |
459 | 35 | |
460 | 35 | // Use different CPU measurement methods based on platform |
461 | 35 | #[cfg(target_os = "linux")] |
462 | 35 | let cpu_usage = self.cpu_sampler.get_cpu_usage(self.pid).unwrap_or(0.0); |
463 | 35 | |
464 | 35 | #[cfg(not(target_os = "linux"))] |
465 | 35 | let cpu_usage = { |
466 | 35 | // For non-Linux: keep using sysinfo with the refresh strategy |
467 | 35 | let time_since_last_refresh = now.duration_since(self.last_refresh_time); |
468 | 35 | |
469 | 35 | // Refresh CPU for accurate measurement |
470 | 35 | self.sys.refresh_cpu_all(); |
471 | 35 | |
472 | 35 | // If not enough time has passed, add a delay for accuracy |
473 | 35 | if time_since_last_refresh < delays::CPU_MEASUREMENT { |
474 | 35 | std::thread::sleep(delays::CPU_MEASUREMENT); |
475 | 35 | self.sys.refresh_cpu_all(); |
476 | 35 | let pid = Pid::from_u32(self.pid as u32); |
477 | 35 | self.sys.refresh_processes_specifics( |
478 | 35 | ProcessesToUpdate::Some(&[pid]), |
479 | 35 | false, |
480 | 35 | ProcessRefreshKind::everything(), |
481 | 35 | ); |
482 | 35 | } |
483 | 35 | |
484 | 35 | proc.cpu_usage() |
485 | 35 | }; |
486 | 35 | |
487 | 35 | let current_disk_read = proc.disk_usage().total_read_bytes; |
488 | 35 | let current_disk_write = proc.disk_usage().total_written_bytes; |
489 | 35 | |
490 | 35 | // Get network I/O - for now, we'll use 0 as sysinfo doesn't provide per-process network stats |
491 | 35 | // TODO: Implement platform-specific network I/O collection |
492 | 35 | let current_net_rx = self.get_process_net_rx_bytes(); |
493 | 35 | let current_net_tx = self.get_process_net_tx_bytes(); |
494 | | |
495 | | // Handle I/O baseline for delta calculation |
496 | 35 | let (disk_read_bytes, disk_write_bytes, net_rx_bytes, net_tx_bytes) = |
497 | 35 | if self.since_process_start { |
498 | | // Show cumulative I/O since process start |
499 | 1 | ( |
500 | 1 | current_disk_read, |
501 | 1 | current_disk_write, |
502 | 1 | current_net_rx, |
503 | 1 | current_net_tx, |
504 | 1 | ) |
505 | | } else { |
506 | | // Show delta I/O since monitoring start |
507 | 34 | if self.io_baseline.is_none() { |
508 | | // First sample - establish baseline |
509 | 10 | self.io_baseline = Some(IoBaseline { |
510 | 10 | disk_read_bytes: current_disk_read, |
511 | 10 | disk_write_bytes: current_disk_write, |
512 | 10 | net_rx_bytes: current_net_rx, |
513 | 10 | net_tx_bytes: current_net_tx, |
514 | 10 | }); |
515 | 10 | (0, 0, 0, 0) // First sample shows 0 delta |
516 | | } else { |
517 | | // Calculate delta from baseline |
518 | 24 | let baseline = self.io_baseline.as_ref().unwrap(); |
519 | 24 | ( |
520 | 24 | current_disk_read.saturating_sub(baseline.disk_read_bytes), |
521 | 24 | current_disk_write.saturating_sub(baseline.disk_write_bytes), |
522 | 24 | current_net_rx.saturating_sub(baseline.net_rx_bytes), |
523 | 24 | current_net_tx.saturating_sub(baseline.net_tx_bytes), |
524 | 24 | ) |
525 | | } |
526 | | }; |
527 | | |
528 | 35 | let ts_ms = SystemTime::now() |
529 | 35 | .duration_since(UNIX_EPOCH) |
530 | 35 | .expect("Time went backwards") |
531 | 35 | .as_millis() as u64; |
532 | 35 | |
533 | 35 | Some(Metrics { |
534 | 35 | ts_ms, |
535 | 35 | cpu_usage, |
536 | 35 | mem_rss_kb, |
537 | 35 | mem_vms_kb, |
538 | 35 | disk_read_bytes, |
539 | 35 | disk_write_bytes, |
540 | 35 | net_rx_bytes, |
541 | 35 | net_tx_bytes, |
542 | 35 | thread_count: get_thread_count(proc.pid().as_u32() as usize), |
543 | 35 | uptime_secs: proc.run_time(), |
544 | 35 | cpu_core: Self::get_process_cpu_core(self.pid), |
545 | 35 | }) |
546 | | } else { |
547 | 0 | None |
548 | | } |
549 | 35 | } |
550 | | |
551 | 53 | pub fn is_running(&mut self) -> bool { |
552 | | // If we have a child process, use try_wait to check its status |
553 | 53 | if let Some(child) = &mut self.child { |
554 | 53 | match child.try_wait() { |
555 | 5 | Ok(Some(_)) => false, |
556 | 48 | Ok(None) => true, |
557 | 0 | Err(_) => false, |
558 | | } |
559 | | } else { |
560 | | // For existing processes, check if it still exists |
561 | 0 | let pid = Pid::from_u32(self.pid as u32); |
562 | 0 | |
563 | 0 | // First try with specific process refresh |
564 | 0 | self.sys.refresh_processes_specifics( |
565 | 0 | ProcessesToUpdate::Some(&[pid]), |
566 | 0 | false, |
567 | 0 | ProcessRefreshKind::everything(), |
568 | 0 | ); |
569 | 0 | |
570 | 0 | // If specific refresh doesn't work, try refreshing all processes |
571 | 0 | if self.sys.process(pid).is_none() { |
572 | 0 | self.sys.refresh_processes(ProcessesToUpdate::All, true); |
573 | 0 | |
574 | 0 | // Give a small amount of time for the process to be detected |
575 | 0 | // This helps with the test reliability |
576 | 0 | std::thread::sleep(system::PROCESS_DETECTION); |
577 | 0 | } |
578 | | |
579 | 0 | self.sys.process(pid).is_some() |
580 | | } |
581 | 53 | } |
582 | | |
583 | | // Get the process ID |
584 | 1 | pub fn get_pid(&self) -> usize { |
585 | 1 | self.pid |
586 | 1 | } |
587 | | |
588 | | /// Set whether to include children processes in monitoring |
589 | 1 | pub fn set_include_children(&mut self, include_children: bool) -> &mut Self { |
590 | 1 | self._include_children = include_children; |
591 | 1 | self |
592 | 1 | } |
593 | | |
594 | | /// Get whether children processes are included in monitoring |
595 | 0 | pub fn get_include_children(&self) -> bool { |
596 | 0 | self._include_children |
597 | 0 | } |
598 | | |
599 | | /// Returns metadata about the monitored process |
600 | | // Get process metadata (static information) |
601 | 2 | pub fn get_metadata(&mut self) -> Option<ProcessMetadata> { |
602 | 2 | let pid = Pid::from_u32(self.pid as u32); |
603 | 2 | self.sys.refresh_processes_specifics( |
604 | 2 | ProcessesToUpdate::Some(&[pid]), |
605 | 2 | false, |
606 | 2 | ProcessRefreshKind::everything(), |
607 | 2 | ); |
608 | | |
609 | 2 | if let Some(proc) = self.sys.process(pid) { |
610 | | // Convert OsString to String with potential data loss on invalid UTF-8 |
611 | 2 | let cmd: Vec<String> = proc |
612 | 2 | .cmd() |
613 | 2 | .iter() |
614 | 2 | .map(|os_str| os_str.to_string_lossy().to_string()) |
615 | 2 | .collect(); |
616 | 2 | |
617 | 2 | // Handle exe which is now Option<&Path> |
618 | 2 | let executable = proc |
619 | 2 | .exe() |
620 | 2 | .map(|path| path.to_string_lossy().to_string()) |
621 | 2 | .unwrap_or_default(); |
622 | 2 | |
623 | 2 | Some(ProcessMetadata { |
624 | 2 | pid: self.pid, |
625 | 2 | cmd, |
626 | 2 | executable, |
627 | 2 | t0_ms: self.t0_ms, |
628 | 2 | }) |
629 | | } else { |
630 | 0 | None |
631 | | } |
632 | 2 | } |
633 | | |
634 | | // Get all child processes recursively |
635 | 31 | pub fn get_child_pids(&mut self) -> Vec<usize> { |
636 | 31 | self.sys.refresh_processes(ProcessesToUpdate::All, true); |
637 | 31 | let mut children = Vec::new(); |
638 | 31 | self.find_children_recursive(self.pid, &mut children); |
639 | 31 | children |
640 | 31 | } |
641 | | |
642 | | // Recursively find all descendants of a process |
643 | 37 | fn find_children_recursive(&self, parent_pid: usize, children: &mut Vec<usize>) { |
644 | 37 | let parent_pid_sys = Pid::from_u32(parent_pid as u32); |
645 | 261k | for (pid, process) in self.sys.processes() { |
646 | 261k | if let Some(ppid) = process.parent() { |
647 | 261k | if ppid == parent_pid_sys { |
648 | 6 | let child_pid = pid.as_u32() as usize; |
649 | 6 | children.push(child_pid); |
650 | 6 | // Recursively find grandchildren |
651 | 6 | self.find_children_recursive(child_pid, children); |
652 | 261k | } |
653 | 74 | } |
654 | | } |
655 | 37 | } |
656 | | |
657 | | // Sample metrics including child processes |
658 | 28 | pub fn sample_tree_metrics(&mut self) -> ProcessTreeMetrics { |
659 | 28 | let tree_ts_ms = SystemTime::now() |
660 | 28 | .duration_since(UNIX_EPOCH) |
661 | 28 | .expect("Time went backwards") |
662 | 28 | .as_millis() as u64; |
663 | 28 | |
664 | 28 | // Get parent metrics |
665 | 28 | let parent_metrics = self.sample_metrics(); |
666 | 28 | |
667 | 28 | // Get child PIDs and their metrics |
668 | 28 | let child_pids = self.get_child_pids(); |
669 | 28 | let mut child_metrics = Vec::new(); |
670 | | |
671 | 28 | for child_pid in child_pids.iter() { |
672 | | // We no longer need delays between child measurements for Linux with our new CPU sampler |
673 | | // But we still need to refresh process info for other metrics |
674 | 6 | let pid = Pid::from_u32(*child_pid as u32); |
675 | 6 | self.sys.refresh_processes_specifics( |
676 | 6 | ProcessesToUpdate::Some(&[pid]), |
677 | 6 | false, |
678 | 6 | ProcessRefreshKind::everything(), |
679 | 6 | ); |
680 | | |
681 | 6 | if let Some(proc) = self.sys.process(pid) { |
682 | 6 | let command = proc.name().to_string_lossy().to_string(); |
683 | 6 | |
684 | 6 | // Get I/O stats for child |
685 | 6 | let current_disk_read = proc.disk_usage().total_read_bytes; |
686 | 6 | let current_disk_write = proc.disk_usage().total_written_bytes; |
687 | 6 | let current_net_rx = 0; // TODO: Implement for children |
688 | 6 | let current_net_tx = 0; |
689 | | |
690 | | // Handle I/O baseline for child processes |
691 | 6 | let (disk_read_bytes, disk_write_bytes, net_rx_bytes, net_tx_bytes) = |
692 | 6 | if self.since_process_start { |
693 | | // Show cumulative I/O since process start |
694 | 0 | ( |
695 | 0 | current_disk_read, |
696 | 0 | current_disk_write, |
697 | 0 | current_net_rx, |
698 | 0 | current_net_tx, |
699 | 0 | ) |
700 | | } else { |
701 | | // Show delta I/O since monitoring start |
702 | 6 | match self.child_io_baselines.entry(*child_pid) { |
703 | 2 | std::collections::hash_map::Entry::Vacant(e) => { |
704 | 2 | // First time seeing this child - establish baseline |
705 | 2 | e.insert(ChildIoBaseline { |
706 | 2 | pid: *child_pid, |
707 | 2 | disk_read_bytes: current_disk_read, |
708 | 2 | disk_write_bytes: current_disk_write, |
709 | 2 | net_rx_bytes: current_net_rx, |
710 | 2 | net_tx_bytes: current_net_tx, |
711 | 2 | }); |
712 | 2 | (0, 0, 0, 0) // First sample shows 0 delta |
713 | | } |
714 | 4 | std::collections::hash_map::Entry::Occupied(e) => { |
715 | 4 | // Calculate delta from baseline |
716 | 4 | let baseline = e.get(); |
717 | 4 | ( |
718 | 4 | current_disk_read.saturating_sub(baseline.disk_read_bytes), |
719 | 4 | current_disk_write.saturating_sub(baseline.disk_write_bytes), |
720 | 4 | current_net_rx.saturating_sub(baseline.net_rx_bytes), |
721 | 4 | current_net_tx.saturating_sub(baseline.net_tx_bytes), |
722 | 4 | ) |
723 | | } |
724 | | } |
725 | | }; |
726 | | |
727 | 6 | let child_ts_ms = SystemTime::now() |
728 | 6 | .duration_since(UNIX_EPOCH) |
729 | 6 | .expect("Time went backwards") |
730 | 6 | .as_millis() as u64; |
731 | 6 | |
732 | 6 | // Use different CPU measurement methods based on platform |
733 | 6 | #[cfg(target_os = "linux")] |
734 | 6 | let cpu_usage = self.cpu_sampler.get_cpu_usage(*child_pid).unwrap_or(0.0); |
735 | 6 | |
736 | 6 | #[cfg(not(target_os = "linux"))] |
737 | 6 | let cpu_usage = proc.cpu_usage(); |
738 | 6 | |
739 | 6 | let metrics = Metrics { |
740 | 6 | ts_ms: child_ts_ms, |
741 | 6 | cpu_usage, |
742 | 6 | mem_rss_kb: proc.memory() / 1024, |
743 | 6 | mem_vms_kb: proc.virtual_memory() / 1024, |
744 | 6 | disk_read_bytes, |
745 | 6 | disk_write_bytes, |
746 | 6 | net_rx_bytes, |
747 | 6 | net_tx_bytes, |
748 | 6 | thread_count: get_thread_count(*child_pid), |
749 | 6 | uptime_secs: proc.run_time(), |
750 | 6 | cpu_core: Self::get_process_cpu_core(*child_pid), |
751 | 6 | }; |
752 | 6 | |
753 | 6 | child_metrics.push(ChildProcessMetrics { |
754 | 6 | pid: *child_pid, |
755 | 6 | command, |
756 | 6 | metrics, |
757 | 6 | }); |
758 | 0 | } |
759 | | } |
760 | | |
761 | | // Cleanup stale entries in the CPU sampler |
762 | | #[cfg(target_os = "linux")] |
763 | 28 | { |
764 | 28 | let all_pids = std::iter::once(self.pid) |
765 | 28 | .chain(child_pids.iter().copied()) |
766 | 28 | .collect::<Vec<_>>(); |
767 | 28 | self.cpu_sampler.cleanup_stale_entries(&all_pids); |
768 | 28 | } |
769 | | |
770 | | // Create aggregated metrics |
771 | 28 | let aggregated = if let Some(ref parent) = parent_metrics { |
772 | 28 | let mut agg = AggregatedMetrics { |
773 | 28 | ts_ms: tree_ts_ms, |
774 | 28 | cpu_usage: parent.cpu_usage, |
775 | 28 | mem_rss_kb: parent.mem_rss_kb, |
776 | 28 | mem_vms_kb: parent.mem_vms_kb, |
777 | 28 | disk_read_bytes: parent.disk_read_bytes, |
778 | 28 | disk_write_bytes: parent.disk_write_bytes, |
779 | 28 | net_rx_bytes: parent.net_rx_bytes, |
780 | 28 | net_tx_bytes: parent.net_tx_bytes, |
781 | 28 | thread_count: parent.thread_count, |
782 | 28 | process_count: 1, // Parent |
783 | 28 | uptime_secs: parent.uptime_secs, |
784 | 28 | ebpf: None, // Will be populated below if eBPF is enabled |
785 | 28 | }; |
786 | | |
787 | | // Add child metrics |
788 | 34 | for child in &child_metrics { |
789 | 6 | agg.cpu_usage += child.metrics.cpu_usage; |
790 | 6 | agg.mem_rss_kb += child.metrics.mem_rss_kb; |
791 | 6 | agg.mem_vms_kb += child.metrics.mem_vms_kb; |
792 | 6 | agg.disk_read_bytes += child.metrics.disk_read_bytes; |
793 | 6 | agg.disk_write_bytes += child.metrics.disk_write_bytes; |
794 | 6 | agg.net_rx_bytes += child.metrics.net_rx_bytes; |
795 | 6 | agg.net_tx_bytes += child.metrics.net_tx_bytes; |
796 | 6 | agg.thread_count += child.metrics.thread_count; |
797 | 6 | agg.process_count += 1; |
798 | 6 | } |
799 | | |
800 | | // Collect eBPF metrics if enabled |
801 | | #[cfg(feature = "ebpf")] |
802 | | if self.enable_ebpf { |
803 | | if let Some(ref mut tracker) = self.ebpf_tracker { |
804 | | // Update PIDs in case the process tree changed |
805 | | let all_pids: Vec<u32> = std::iter::once(self.pid as u32) |
806 | | .chain(child_pids.iter().map(|&pid| pid as u32)) |
807 | | .collect(); |
808 | | |
809 | | if let Err(e) = tracker.update_pids(all_pids) { |
810 | | log::warn!("Failed to update eBPF PIDs: {}", e); |
811 | | } |
812 | | |
813 | | // Get eBPF metrics with enhanced analysis |
814 | | let mut ebpf_metrics = tracker.get_metrics(); |
815 | | |
816 | | // Add enhanced analysis if we have syscall data |
817 | | #[cfg(feature = "ebpf")] |
818 | | if let Some(ref mut syscalls) = ebpf_metrics.syscalls { |
819 | | let elapsed_time = (tree_ts_ms - self.t0_ms) as f64 / 1000.0; |
820 | | syscalls.analysis = Some(crate::ebpf::metrics::generate_syscall_analysis( |
821 | | syscalls, |
822 | | agg.cpu_usage, |
823 | | elapsed_time, |
824 | | )); |
825 | | } |
826 | | |
827 | | agg.ebpf = Some(ebpf_metrics); |
828 | | } |
829 | | } |
830 | | |
831 | | #[cfg(not(feature = "ebpf"))] |
832 | 28 | { |
833 | 28 | // eBPF is already None from initialization |
834 | 28 | } |
835 | 28 | |
836 | 28 | Some(agg) |
837 | | } else { |
838 | 0 | None |
839 | | }; |
840 | | |
841 | 28 | ProcessTreeMetrics { |
842 | 28 | ts_ms: tree_ts_ms, |
843 | 28 | parent: parent_metrics, |
844 | 28 | children: child_metrics, |
845 | 28 | aggregated, |
846 | 28 | } |
847 | 28 | } |
848 | | |
849 | | // Get network receive bytes for the process |
850 | 35 | fn get_process_net_rx_bytes(&self) -> u64 { |
851 | 35 | #[cfg(target_os = "linux")] |
852 | 35 | { |
853 | 35 | self.get_linux_process_net_stats().0 |
854 | 35 | } |
855 | 35 | #[cfg(not(target_os = "linux"))] |
856 | 35 | { |
857 | 35 | 0 // Not implemented for non-Linux platforms yet |
858 | 35 | } |
859 | 35 | } |
860 | | |
861 | | // Get network transmit bytes for the process |
862 | 35 | fn get_process_net_tx_bytes(&self) -> u64 { |
863 | 35 | #[cfg(target_os = "linux")] |
864 | 35 | { |
865 | 35 | self.get_linux_process_net_stats().1 |
866 | 35 | } |
867 | 35 | #[cfg(not(target_os = "linux"))] |
868 | 35 | { |
869 | 35 | 0 // Not implemented for non-Linux platforms yet |
870 | 35 | } |
871 | 35 | } |
872 | | |
873 | | #[cfg(target_os = "linux")] |
874 | 70 | fn get_linux_process_net_stats(&self) -> (u64, u64) { |
875 | 70 | // Parse /proc/[pid]/net/dev if it exists (in network namespaces) |
876 | 70 | // Fall back to system-wide /proc/net/dev as approximation |
877 | 70 | |
878 | 70 | let net_dev_path = format!("/proc/{}/net/dev", self.pid); |
879 | 70 | let net_stats = if std::path::Path::new(&net_dev_path).exists() { |
880 | 30 | self.parse_net_dev(&net_dev_path) |
881 | | } else { |
882 | | // Fall back to system-wide stats |
883 | | // This is less accurate but better than nothing |
884 | 40 | self.parse_net_dev("/proc/net/dev") |
885 | | }; |
886 | | |
887 | | // Get interface statistics (sum all interfaces except loopback) |
888 | 70 | let mut total_rx = 0u64; |
889 | 70 | let mut total_tx = 0u64; |
890 | | |
891 | 420 | for (interface, (rx, tx350 )) in net_stats { |
892 | 350 | if interface != "lo" { |
893 | 280 | // Skip loopback |
894 | 280 | total_rx += rx; |
895 | 280 | total_tx += tx; |
896 | 280 | }70 |
897 | | } |
898 | | |
899 | 70 | (total_rx, total_tx) |
900 | 70 | } |
901 | | |
902 | | #[cfg(target_os = "linux")] |
903 | 70 | fn parse_net_dev(&self, path: &str) -> HashMap<String, (u64, u64)> { |
904 | 70 | let mut stats = HashMap::new(); |
905 | | |
906 | 70 | if let Ok(mut file) = std::fs::File::open(path) { |
907 | 70 | let mut contents = String::new(); |
908 | 70 | if std::io::Read::read_to_string(&mut file, &mut contents).is_ok() { |
909 | 350 | for line in contents.lines().skip(2)70 { |
910 | | // Skip header lines |
911 | 350 | let parts: Vec<&str> = line.split_whitespace().collect(); |
912 | 350 | if parts.len() >= 10 { |
913 | 350 | if let Some(interface) = parts[0].strip_suffix(':') { |
914 | 350 | if let (Ok(rx_bytes), Ok(tx_bytes)) = |
915 | 350 | (parts[1].parse::<u64>(), parts[9].parse::<u64>()) |
916 | 350 | { |
917 | 350 | stats.insert(interface.to_string(), (rx_bytes, tx_bytes)); |
918 | 350 | }0 |
919 | 0 | } |
920 | 0 | } |
921 | | } |
922 | 0 | } |
923 | 0 | } |
924 | | |
925 | 70 | stats |
926 | 70 | } |
927 | | |
928 | | /// Get the CPU core a process is currently running on (Linux only) |
929 | | #[cfg(target_os = "linux")] |
930 | 41 | fn get_process_cpu_core(pid: usize) -> Option<u32> { |
931 | 41 | // Read /proc/[pid]/stat to get the last CPU the process ran on |
932 | 41 | let stat_path = format!("/proc/{pid}/stat"); |
933 | 41 | if let Ok(contents) = std::fs::read_to_string(&stat_path) { |
934 | | // The CPU field is the 39th field in /proc/[pid]/stat |
935 | | // Format: pid (comm) state ppid pgrp session tty_nr tpgid flags... |
936 | | // We need to handle the command field which can contain spaces and parentheses |
937 | 41 | if let Some(last_paren) = contents.rfind(')') { |
938 | 41 | let after_comm = &contents[last_paren + 1..]; |
939 | 41 | let fields: Vec<&str> = after_comm.split_whitespace().collect(); |
940 | 41 | // CPU is the 37th field after the command (0-indexed) |
941 | 41 | if fields.len() > 36 { |
942 | 41 | if let Ok(cpu) = fields[36].parse::<u32>() { |
943 | 41 | return Some(cpu); |
944 | 0 | } |
945 | 0 | } |
946 | 0 | } |
947 | 0 | } |
948 | 0 | None |
949 | 41 | } |
950 | | |
951 | | #[cfg(not(target_os = "linux"))] |
952 | | fn get_process_cpu_core(_pid: usize) -> Option<u32> { |
953 | | None // Not implemented for non-Linux platforms |
954 | | } |
955 | | } |
956 | | |
957 | | #[cfg(test)] |
958 | | mod tests { |
959 | | use super::*; |
960 | | use crate::core::constants::{defaults, delays, sampling}; |
961 | | use std::thread; |
962 | | |
963 | | // Helper function for creating a test monitor with standard parameters |
964 | | // Test fixture for process monitoring tests |
965 | | struct ProcessTestFixture { |
966 | | cmd: Vec<String>, |
967 | | base_interval: Duration, |
968 | | max_interval: Duration, |
969 | | ready_timeout: Duration, |
970 | | } |
971 | | |
972 | | impl ProcessTestFixture { |
973 | 14 | fn new(cmd: Vec<String>) -> Self { |
974 | 14 | Self { |
975 | 14 | cmd, |
976 | 14 | base_interval: defaults::BASE_INTERVAL, |
977 | 14 | max_interval: defaults::MAX_INTERVAL, |
978 | 14 | ready_timeout: delays::STARTUP, |
979 | 14 | } |
980 | 14 | } |
981 | | |
982 | 15 | fn create_monitor(&self) -> Result<ProcessMonitor, std::io::Error> { |
983 | 15 | ProcessMonitor::new(self.cmd.clone(), self.base_interval, self.max_interval) |
984 | 15 | } |
985 | | |
986 | 0 | fn create_monitor_from_pid(&self, pid: usize) -> Result<ProcessMonitor, std::io::Error> { |
987 | 0 | ProcessMonitor::from_pid(pid, self.base_interval, self.max_interval) |
988 | 0 | } |
989 | | |
990 | | // Create a monitor and wait until the process is reliably detected |
991 | 1 | fn create_and_verify_running(&self) -> Result<(ProcessMonitor, usize), std::io::Error> { |
992 | 1 | let mut monitor = self.create_monitor()?0 ; |
993 | 1 | let pid = monitor.get_pid(); |
994 | 1 | |
995 | 1 | // Give the process a small amount of time to start |
996 | 1 | std::thread::sleep(delays::STANDARD); |
997 | 1 | |
998 | 1 | // Verify the process is running using a retry strategy |
999 | 1 | if !self.wait_for_condition(|| monitor.is_running()) { |
1000 | 0 | return Err(std::io::Error::new( |
1001 | 0 | std::io::ErrorKind::TimedOut, |
1002 | 0 | "Process did not start or was not detected", |
1003 | 0 | )); |
1004 | 1 | } |
1005 | 1 | |
1006 | 1 | Ok((monitor, pid)) |
1007 | 1 | } |
1008 | | |
1009 | | // Utility method for waiting with exponential backoff |
1010 | | // Wait for a condition to become true with exponential backoff |
1011 | | // This approach is more reliable than fixed sleeps and handles |
1012 | | // timing variations in test environments |
1013 | 3 | fn wait_for_condition<F>(&self, mut condition: F) -> bool |
1014 | 3 | where |
1015 | 3 | F: FnMut() -> bool, |
1016 | 3 | { |
1017 | 3 | let start = std::time::Instant::now(); |
1018 | 3 | let mut delay_ms = 1; |
1019 | | |
1020 | 47 | while start.elapsed() < self.ready_timeout { |
1021 | 47 | if condition() { |
1022 | 3 | return true; |
1023 | 44 | } |
1024 | 44 | |
1025 | 44 | // Exponential backoff with a maximum delay |
1026 | 44 | std::thread::sleep(Duration::from_millis(delay_ms)); |
1027 | 44 | delay_ms = std::cmp::min(delay_ms * 2, 50); |
1028 | | } |
1029 | | |
1030 | 0 | false |
1031 | 3 | } |
1032 | | } |
1033 | | |
1034 | | // Helper function for creating a test monitor |
1035 | 13 | fn create_test_monitor(cmd: Vec<String>) -> Result<ProcessMonitor, std::io::Error> { |
1036 | 13 | ProcessTestFixture::new(cmd).create_monitor() |
1037 | 13 | } |
1038 | | |
1039 | | // This function is intentionally left in place for future reference, but is currently |
1040 | | // not used directly as the fixture pattern provides better test isolation |
1041 | | #[allow(dead_code)] |
1042 | 0 | fn create_test_monitor_from_pid(pid: usize) -> Result<ProcessMonitor, std::io::Error> { |
1043 | 0 | let fixture = ProcessTestFixture { |
1044 | 0 | cmd: vec![], |
1045 | 0 | base_interval: defaults::BASE_INTERVAL, |
1046 | 0 | max_interval: defaults::MAX_INTERVAL, |
1047 | 0 | ready_timeout: delays::STARTUP, |
1048 | 0 | }; |
1049 | 0 | fixture.create_monitor_from_pid(pid) |
1050 | 0 | } |
1051 | | |
1052 | | // Test attaching to existing process |
1053 | | #[test] |
1054 | 0 | fn test_from_pid() { |
1055 | | // Create a test fixture with a longer-running process |
1056 | 0 | let cmd = if cfg!(target_os = "windows") { |
1057 | 0 | vec![ |
1058 | 0 | "powershell".to_string(), |
1059 | 0 | "-Command".to_string(), |
1060 | 0 | "Start-Sleep -Seconds 5".to_string(), |
1061 | 0 | ] |
1062 | | } else { |
1063 | 0 | vec!["sleep".to_string(), "5".to_string()] |
1064 | | }; |
1065 | | |
1066 | 0 | let fixture = ProcessTestFixture::new(cmd); |
1067 | 0 |
|
1068 | 0 | // Create and verify the direct monitor is running |
1069 | 0 | let (_, pid) = fixture.create_and_verify_running().unwrap(); |
1070 | 0 |
|
1071 | 0 | // Create a monitor for the existing process |
1072 | 0 | let pid_monitor = fixture.create_monitor_from_pid(pid); |
1073 | 0 | assert!( |
1074 | 0 | pid_monitor.is_ok(), |
1075 | 0 | "Should be able to attach to running process" |
1076 | | ); |
1077 | | |
1078 | 0 | let mut pid_monitor = pid_monitor.unwrap(); |
1079 | 0 |
|
1080 | 0 | // Verify the PID monitor can detect the process |
1081 | 0 | assert!( |
1082 | 0 | fixture.wait_for_condition(|| pid_monitor.is_running()), |
1083 | 0 | "PID monitor should detect the running process" |
1084 | | ); |
1085 | 0 | } |
1086 | | |
1087 | | #[test] |
1088 | 1 | fn test_adaptive_interval() { |
1089 | 1 | let cmd = vec!["sleep".to_string(), "10".to_string()]; |
1090 | 1 | let monitor = create_test_monitor(cmd).unwrap(); |
1091 | 1 | |
1092 | 1 | let base_interval = monitor.base_interval; |
1093 | 1 | |
1094 | 1 | // Initial interval should be close to base_interval |
1095 | 1 | let initial = monitor.adaptive_interval(); |
1096 | 1 | assert!(initial >= base_interval); |
1097 | 1 | assert!(initial <= base_interval * 2); // Allow for some time passing during test |
1098 | | |
1099 | | // After waiting, interval should increase but not exceed max |
1100 | 1 | thread::sleep(Duration::from_secs(2)); |
1101 | 1 | let later = monitor.adaptive_interval(); |
1102 | 1 | assert!(later > initial); // Should increase |
1103 | 1 | assert!(later <= monitor.max_interval); // Should not exceed max |
1104 | 1 | } |
1105 | | |
1106 | | #[test] |
1107 | 1 | fn test_is_running() { |
1108 | 1 | // Test with a short-lived process |
1109 | 1 | let fixture = ProcessTestFixture::new(vec!["echo".to_string(), "hello".to_string()]); |
1110 | 1 | let mut monitor = fixture.create_monitor().unwrap(); |
1111 | 1 | |
1112 | 1 | // Wait for the process to terminate |
1113 | 1 | assert!( |
1114 | 1 | fixture.wait_for_condition(|| !monitor.is_running()), |
1115 | 0 | "Short-lived process should terminate" |
1116 | | ); |
1117 | | |
1118 | | // Test with a longer running process |
1119 | 1 | let fixture = ProcessTestFixture { |
1120 | 1 | cmd: vec!["sleep".to_string(), "2".to_string()], // Increased sleep time for reliability |
1121 | 1 | base_interval: defaults::BASE_INTERVAL, |
1122 | 1 | max_interval: defaults::MAX_INTERVAL, |
1123 | 1 | ready_timeout: Duration::from_secs(5), // Longer timeout for this test |
1124 | 1 | }; |
1125 | 1 | let (mut monitor, _) = fixture.create_and_verify_running().unwrap(); |
1126 | 1 | |
1127 | 1 | // Verify it's running (this is already done by create_and_verify_running, but we're being explicit) |
1128 | 1 | assert!(monitor.is_running(), "Process should be running initially"0 ); |
1129 | | |
1130 | | // Now wait for it to terminate |
1131 | 1 | assert!( |
1132 | 45 | fixture.wait_for_condition(|| !monitor.is_running())1 , |
1133 | 0 | "Process should terminate within the timeout period" |
1134 | | ); |
1135 | 1 | } |
1136 | | |
1137 | | #[test] |
1138 | 1 | fn test_metrics_collection() { |
1139 | | // Start a simple CPU-bound process |
1140 | 1 | let cmd = if cfg!(target_os = "windows") { |
1141 | 0 | vec![ |
1142 | 0 | "powershell".to_string(), |
1143 | 0 | "-Command".to_string(), |
1144 | 0 | "Start-Sleep -Seconds 3".to_string(), |
1145 | 0 | ] |
1146 | | } else { |
1147 | 1 | vec!["sleep".to_string(), "3".to_string()] |
1148 | | }; |
1149 | | |
1150 | 1 | let mut monitor = create_test_monitor(cmd).unwrap(); |
1151 | 1 | |
1152 | 1 | // Allow more time for the process to start and register uptime |
1153 | 1 | thread::sleep(delays::STARTUP); |
1154 | 1 | |
1155 | 1 | // Sample metrics |
1156 | 1 | let metrics = monitor.sample_metrics(); |
1157 | 1 | assert!( |
1158 | 1 | metrics.is_some(), |
1159 | 0 | "Should collect metrics from running process" |
1160 | | ); |
1161 | | |
1162 | 1 | if let Some(m) = metrics { |
1163 | | // Check thread count first |
1164 | 1 | assert!( |
1165 | 1 | m.thread_count > 0, |
1166 | 0 | "Process should have at least one thread" |
1167 | | ); |
1168 | | |
1169 | | // Handle uptime which might be platform-dependent |
1170 | 1 | if m.uptime_secs == 0 { |
1171 | | // On some platforms (especially macOS), uptime might not be reliably reported |
1172 | | // If uptime is 0, wait a bit and check again to see if it increases |
1173 | 1 | thread::sleep(Duration::from_secs(1)); |
1174 | 1 | if let Some(m2) = monitor.sample_metrics() { |
1175 | | // We don't assert here - just log the value to debug |
1176 | 1 | println!("Process uptime after delay: {} seconds", m2.uptime_secs); |
1177 | 1 | |
1178 | 1 | // On macOS, uptime might still be 0 - that's OK |
1179 | 1 | #[cfg(target_os = "linux")] |
1180 | 1 | { |
1181 | 1 | // On Linux specifically, we expect uptime to work reliably |
1182 | 1 | assert!( |
1183 | 1 | m2.uptime_secs > 0, |
1184 | 0 | "Process uptime should increase after delay on Linux" |
1185 | | ); |
1186 | | } |
1187 | 0 | } |
1188 | 0 | } else { |
1189 | 0 | // Uptime is already positive, which is good on any platform |
1190 | 0 | println!("Process uptime: {} seconds", m.uptime_secs); |
1191 | 0 | } |
1192 | 0 | } |
1193 | 1 | } |
1194 | | |
1195 | | #[test] |
1196 | 1 | fn test_child_process_detection() { |
1197 | | // Start a process that spawns children |
1198 | 1 | let cmd = if cfg!(target_os = "windows") { |
1199 | 0 | vec![ |
1200 | 0 | "cmd".to_string(), |
1201 | 0 | "/C".to_string(), |
1202 | 0 | "timeout 2 >nul & echo child".to_string(), |
1203 | 0 | ] |
1204 | | } else { |
1205 | 1 | vec![ |
1206 | 1 | "sh".to_string(), |
1207 | 1 | "-c".to_string(), |
1208 | 1 | "sleep 2 & echo child".to_string(), |
1209 | 1 | ] |
1210 | | }; |
1211 | | |
1212 | 1 | let mut monitor = create_test_monitor(cmd).unwrap(); |
1213 | 1 | |
1214 | 1 | // Allow time for child processes to start |
1215 | 1 | thread::sleep(sampling::SLOW); |
1216 | 1 | |
1217 | 1 | // Get child PIDs |
1218 | 1 | let children = monitor.get_child_pids(); |
1219 | 1 | |
1220 | 1 | // We might not always detect children due to timing, so just verify the method works |
1221 | 1 | // The assertion here is mainly to document that the method should return a Vec |
1222 | 1 | assert!( |
1223 | 1 | children.is_empty() || !children.is_empty()0 , |
1224 | 0 | "Should return a list of child PIDs (possibly empty)" |
1225 | | ); |
1226 | 1 | } |
1227 | | |
1228 | | #[test] |
1229 | 1 | fn test_tree_metrics_structure() { |
1230 | 1 | // Test the tree metrics structure with a simple process |
1231 | 1 | let cmd = vec!["sleep".to_string(), "1".to_string()]; |
1232 | 1 | let mut monitor = create_test_monitor(cmd).unwrap(); |
1233 | 1 | |
1234 | 1 | // Allow time for process to start |
1235 | 1 | thread::sleep(sampling::STANDARD); |
1236 | 1 | |
1237 | 1 | // Sample tree metrics |
1238 | 1 | let tree_metrics = monitor.sample_tree_metrics(); |
1239 | 1 | |
1240 | 1 | // Should have parent metrics |
1241 | 1 | assert!(tree_metrics.parent.is_some(), "Should have parent metrics"0 ); |
1242 | | |
1243 | | // Should have aggregated metrics |
1244 | 1 | assert!( |
1245 | 1 | tree_metrics.aggregated.is_some(), |
1246 | 0 | "Should have aggregated metrics" |
1247 | | ); |
1248 | | |
1249 | 1 | if let Some(agg) = tree_metrics.aggregated { |
1250 | 1 | assert!( |
1251 | 1 | agg.process_count >= 1, |
1252 | 0 | "Should count at least the parent process" |
1253 | | ); |
1254 | 1 | assert!(agg.thread_count > 0, "Should have at least one thread"0 ); |
1255 | 0 | } |
1256 | 1 | } |
1257 | | |
1258 | | #[test] |
1259 | 1 | fn test_child_process_aggregation() { |
1260 | 1 | // This test is hard to make deterministic since we can't guarantee child processes |
1261 | 1 | // But we can test the aggregation logic with the structure |
1262 | 1 | let cmd = vec!["sleep".to_string(), "1".to_string()]; |
1263 | 1 | let mut monitor = create_test_monitor(cmd).unwrap(); |
1264 | 1 | |
1265 | 1 | thread::sleep(Duration::from_millis(100)); |
1266 | 1 | |
1267 | 1 | let tree_metrics = monitor.sample_tree_metrics(); |
1268 | | |
1269 | 1 | if let (Some(parent), Some(agg)) = (tree_metrics.parent, tree_metrics.aggregated) { |
1270 | | // Aggregated metrics should include at least the parent |
1271 | 1 | assert!( |
1272 | 1 | agg.cpu_usage >= parent.cpu_usage, |
1273 | 0 | "Aggregated CPU should be >= parent CPU" |
1274 | | ); |
1275 | 1 | assert!( |
1276 | 1 | agg.mem_rss_kb >= parent.mem_rss_kb, |
1277 | 0 | "Aggregated memory should be >= parent memory" |
1278 | | ); |
1279 | 1 | assert!( |
1280 | 1 | agg.thread_count >= parent.thread_count, |
1281 | 0 | "Aggregated threads should be >= parent threads" |
1282 | | ); |
1283 | | |
1284 | | // Process count should be at least 1 (the parent) |
1285 | 1 | assert!( |
1286 | 1 | agg.process_count >= 1, |
1287 | 0 | "Should count at least the parent process" |
1288 | | ); |
1289 | 0 | } |
1290 | 1 | } |
1291 | | |
1292 | | #[test] |
1293 | 1 | fn test_empty_process_tree() { |
1294 | 1 | // Test behavior when monitoring a process with no children |
1295 | 1 | let cmd = vec!["sleep".to_string(), "1".to_string()]; |
1296 | 1 | let mut monitor = create_test_monitor(cmd).unwrap(); |
1297 | 1 | |
1298 | 1 | thread::sleep(Duration::from_millis(50)); |
1299 | 1 | |
1300 | 1 | let tree_metrics = monitor.sample_tree_metrics(); |
1301 | 1 | |
1302 | 1 | // Should have parent metrics |
1303 | 1 | assert!( |
1304 | 1 | tree_metrics.parent.is_some(), |
1305 | 0 | "Should have parent metrics even with no children" |
1306 | | ); |
1307 | | |
1308 | | // Children list might be empty (which is fine) |
1309 | | // Length is always non-negative, so just verify it's accessible |
1310 | | |
1311 | | // Aggregated should exist and equal parent (since no children) |
1312 | 1 | if let (Some(parent), Some(agg)) = (tree_metrics.parent, tree_metrics.aggregated) { |
1313 | 1 | assert_eq!( |
1314 | 1 | agg.process_count, |
1315 | 1 | 1 + tree_metrics.children.len(), |
1316 | 0 | "Process count should be parent + actual children" |
1317 | | ); |
1318 | | |
1319 | 1 | if tree_metrics.children.is_empty() { |
1320 | | // If no children, aggregated should equal parent |
1321 | 1 | assert_eq!( |
1322 | | agg.cpu_usage, parent.cpu_usage, |
1323 | 0 | "CPU should match parent when no children" |
1324 | | ); |
1325 | 1 | assert_eq!( |
1326 | | agg.mem_rss_kb, parent.mem_rss_kb, |
1327 | 0 | "Memory should match parent when no children" |
1328 | | ); |
1329 | 1 | assert_eq!( |
1330 | | agg.thread_count, parent.thread_count, |
1331 | 0 | "Threads should match parent when no children" |
1332 | | ); |
1333 | 0 | } |
1334 | 0 | } |
1335 | 1 | } |
1336 | | |
1337 | | #[test] |
1338 | 1 | fn test_recursive_child_detection() { |
1339 | | // Test that we can find children recursively in a more complex process tree |
1340 | 1 | let cmd = if cfg!(target_os = "windows") { |
1341 | 0 | vec![ |
1342 | 0 | "cmd".to_string(), |
1343 | 0 | "/C".to_string(), |
1344 | 0 | "timeout 3 >nul & (timeout 2 >nul & timeout 1 >nul)".to_string(), |
1345 | 0 | ] |
1346 | | } else { |
1347 | 1 | vec![ |
1348 | 1 | "sh".to_string(), |
1349 | 1 | "-c".to_string(), |
1350 | 1 | "sleep 3 & (sleep 2 & sleep 1 &)".to_string(), |
1351 | 1 | ] |
1352 | | }; |
1353 | | |
1354 | 1 | let mut monitor = create_test_monitor(cmd).unwrap(); |
1355 | 1 | |
1356 | 1 | // Allow time for the process tree to establish |
1357 | 1 | thread::sleep(Duration::from_millis(300)); |
1358 | 1 | |
1359 | 1 | let _children = monitor.get_child_pids(); |
1360 | 1 | |
1361 | 1 | // We might detect children (timing dependent), but the method should work |
1362 | 1 | // Just verify the method returns successfully (length is always valid) |
1363 | 1 | |
1364 | 1 | // Test that repeated calls work |
1365 | 1 | let _children2 = monitor.get_child_pids(); |
1366 | 1 | // Both calls should succeed and return valid vectors |
1367 | 1 | } |
1368 | | |
1369 | | #[test] |
1370 | 1 | fn test_child_process_lifecycle() { |
1371 | | // Test monitoring during child process lifecycle changes |
1372 | 1 | let cmd = if cfg!(target_os = "windows") { |
1373 | 0 | vec![ |
1374 | 0 | "cmd".to_string(), |
1375 | 0 | "/C".to_string(), |
1376 | 0 | "start /b ping 127.0.0.1 -n 3 >nul".to_string(), |
1377 | 0 | ] |
1378 | | } else { |
1379 | 1 | vec![ |
1380 | 1 | "sh".to_string(), |
1381 | 1 | "-c".to_string(), |
1382 | 1 | // Create multiple child processes that run long enough to be detected |
1383 | 1 | "for i in 1 2 3; do sleep $i & done; sleep 0.5; wait".to_string(), |
1384 | 1 | ] |
1385 | | }; |
1386 | | |
1387 | 1 | let mut monitor = create_test_monitor(cmd).unwrap(); |
1388 | 1 | |
1389 | 1 | // Enable child process monitoring explicitly |
1390 | 1 | monitor.set_include_children(true); |
1391 | 1 | |
1392 | 1 | // First, take multiple initial samples and find the stable baseline |
1393 | 1 | // (since environment might have background processes that come and go) |
1394 | 1 | println!("Measuring baseline process count..."); |
1395 | 1 | let mut baseline_samples = Vec::new(); |
1396 | 6 | for i5 in 0..5 { |
1397 | 5 | let metrics = monitor.sample_tree_metrics(); |
1398 | 5 | let count = metrics |
1399 | 5 | .aggregated |
1400 | 5 | .as_ref() |
1401 | 5 | .map(|a| a.process_count) |
1402 | 5 | .unwrap_or(1); |
1403 | 5 | baseline_samples.push(count); |
1404 | 5 | println!("Baseline sample {}: process count: {}", i + 1, count); |
1405 | 5 | thread::sleep(Duration::from_millis(100)); |
1406 | 5 | } |
1407 | | |
1408 | | // Calculate mode (most common value) as our baseline |
1409 | 1 | let mut counts = std::collections::HashMap::new(); |
1410 | 6 | for &count5 in &baseline_samples { |
1411 | 5 | *counts.entry(count).or_insert(0) += 1; |
1412 | 5 | } |
1413 | 1 | let baseline_count = counts |
1414 | 1 | .into_iter() |
1415 | 3 | .max_by_key(|&(_, count)| count) |
1416 | 1 | .map(|(val, _)| val) |
1417 | 1 | .unwrap_or(1); |
1418 | 1 | |
1419 | 1 | println!("Established baseline process count: {}", baseline_count); |
1420 | 1 | |
1421 | 1 | // Now create our command which should spawn child processes |
1422 | 1 | // Sample multiple times to catch process count changes |
1423 | 1 | let mut max_count = baseline_count; |
1424 | 1 | let mut min_count_after_max = usize::MAX; |
1425 | 1 | let mut saw_increase = false; |
1426 | 1 | let mut saw_decrease = false; |
1427 | 1 | |
1428 | 1 | println!("Starting sampling to detect process lifecycle..."); |
1429 | 16 | for i15 in 0..15 { |
1430 | 15 | thread::sleep(Duration::from_millis(200)); |
1431 | 15 | |
1432 | 15 | let metrics = monitor.sample_tree_metrics(); |
1433 | 15 | let count = metrics |
1434 | 15 | .aggregated |
1435 | 15 | .as_ref() |
1436 | 15 | .map(|a| a.process_count) |
1437 | 15 | .unwrap_or(1); |
1438 | 15 | |
1439 | 15 | println!("Sample {}: process count: {}", i + 1, count); |
1440 | 15 | |
1441 | 15 | // If we see an increase from baseline, note it |
1442 | 15 | if count > baseline_count && !saw_increase0 { |
1443 | 0 | saw_increase = true; |
1444 | 0 | println!( |
1445 | 0 | "Detected process count increase: {} -> {}", |
1446 | 0 | baseline_count, count |
1447 | 0 | ); |
1448 | 15 | } |
1449 | | |
1450 | | // Update maximum count observed |
1451 | 15 | if count > max_count { |
1452 | 0 | max_count = count; |
1453 | 15 | } |
1454 | | |
1455 | | // If we've seen an increase and now count is decreasing, note it |
1456 | 15 | if saw_increase && count < max_count0 { |
1457 | 0 | saw_decrease = true; |
1458 | 0 | min_count_after_max = min_count_after_max.min(count); |
1459 | 0 | println!( |
1460 | 0 | "Detected process count decrease: {} -> {}", |
1461 | 0 | max_count, count |
1462 | 0 | ); |
1463 | 15 | } |
1464 | | } |
1465 | | |
1466 | | // Final sample after waiting for processes to finish |
1467 | 1 | thread::sleep(Duration::from_millis(1000)); |
1468 | 1 | |
1469 | 1 | let final_metrics = monitor.sample_tree_metrics(); |
1470 | 1 | let final_count = final_metrics |
1471 | 1 | .aggregated |
1472 | 1 | .as_ref() |
1473 | 1 | .map(|a| a.process_count) |
1474 | 1 | .unwrap_or(1); |
1475 | 1 | |
1476 | 1 | println!("Final process count: {}", final_count); |
1477 | 1 | println!( |
1478 | 1 | "Test summary: baseline={}, max={}, min_after_max={}, final={}", |
1479 | 1 | baseline_count, max_count, min_count_after_max, final_count |
1480 | 1 | ); |
1481 | 1 | |
1482 | 1 | // Assert proper functioning |
1483 | 1 | if saw_increase { |
1484 | 0 | println!("✓ Successfully detected process count increase"); |
1485 | 1 | } else { |
1486 | 1 | println!("⚠ Did not detect any process count increase"); |
1487 | 1 | } |
1488 | | |
1489 | 1 | if saw_decrease { |
1490 | 0 | println!("✓ Successfully detected process count decrease"); |
1491 | 1 | } else { |
1492 | 1 | println!("⚠ Did not detect any process count decrease"); |
1493 | 1 | } |
1494 | | |
1495 | | // Make a loose assertion - the test mainly provides diagnostic output |
1496 | | // We don't want it to fail in CI with timing differences |
1497 | 1 | assert!( |
1498 | 1 | max_count >= baseline_count, |
1499 | 0 | "Process monitoring should detect at least the baseline count" |
1500 | | ); |
1501 | | |
1502 | | // All samples should have valid structure |
1503 | 1 | assert!( |
1504 | 1 | final_metrics.aggregated.is_some(), |
1505 | 0 | "Final aggregated metrics should exist" |
1506 | | ); |
1507 | 1 | } |
1508 | | |
1509 | | #[test] |
1510 | 1 | fn test_network_io_limitation_for_children() { |
1511 | | // Test that the current limitation of network I/O for children is handled properly |
1512 | 1 | let cmd = if cfg!(target_os = "windows") { |
1513 | 0 | vec![ |
1514 | 0 | "cmd".to_string(), |
1515 | 0 | "/C".to_string(), |
1516 | 0 | "timeout 1 >nul & echo test".to_string(), |
1517 | 0 | ] |
1518 | | } else { |
1519 | 1 | vec![ |
1520 | 1 | "sh".to_string(), |
1521 | 1 | "-c".to_string(), |
1522 | 1 | "sleep 1 & echo test".to_string(), |
1523 | 1 | ] |
1524 | | }; |
1525 | | |
1526 | 1 | let mut monitor = create_test_monitor(cmd).unwrap(); |
1527 | 1 | |
1528 | 1 | thread::sleep(Duration::from_millis(200)); |
1529 | 1 | |
1530 | 1 | let tree_metrics = monitor.sample_tree_metrics(); |
1531 | | |
1532 | | // Check that all children have 0 network I/O (current limitation) |
1533 | 1 | for child0 in &tree_metrics.children { |
1534 | 0 | assert_eq!( |
1535 | | child.metrics.net_rx_bytes, 0, |
1536 | 0 | "Child network RX should be 0 (known limitation)" |
1537 | | ); |
1538 | 0 | assert_eq!( |
1539 | | child.metrics.net_tx_bytes, 0, |
1540 | 0 | "Child network TX should be 0 (known limitation)" |
1541 | | ); |
1542 | | } |
1543 | | |
1544 | | // Parent might have network I/O, children should not |
1545 | 1 | if let Some(parent) = tree_metrics.parent { |
1546 | | // Parent could have network activity, that's fine |
1547 | 1 | if let Some(agg) = tree_metrics.aggregated { |
1548 | | // Aggregated network should equal parent network (since children are 0) |
1549 | 1 | assert_eq!( |
1550 | | agg.net_rx_bytes, parent.net_rx_bytes, |
1551 | 0 | "Aggregated network RX should equal parent (children are 0)" |
1552 | | ); |
1553 | 1 | assert_eq!( |
1554 | | agg.net_tx_bytes, parent.net_tx_bytes, |
1555 | 0 | "Aggregated network TX should equal parent (children are 0)" |
1556 | | ); |
1557 | 0 | } |
1558 | 0 | } |
1559 | 1 | } |
1560 | | |
1561 | | #[test] |
1562 | 1 | fn test_aggregation_arithmetic() { |
1563 | 1 | // Test that aggregation arithmetic is correct when we have known values |
1564 | 1 | let cmd = vec!["sleep".to_string(), "2".to_string()]; |
1565 | 1 | let mut monitor = create_test_monitor(cmd).unwrap(); |
1566 | 1 | |
1567 | 1 | thread::sleep(Duration::from_millis(100)); |
1568 | 1 | |
1569 | 1 | let tree_metrics = monitor.sample_tree_metrics(); |
1570 | | |
1571 | 1 | if let (Some(parent), Some(agg)) = (tree_metrics.parent, tree_metrics.aggregated) { |
1572 | | // Calculate expected values |
1573 | 1 | let expected_mem = parent.mem_rss_kb |
1574 | 1 | + tree_metrics |
1575 | 1 | .children |
1576 | 1 | .iter() |
1577 | 1 | .map(|c| c.metrics.mem_rss_kb0 ) |
1578 | 1 | .sum::<u64>(); |
1579 | 1 | let expected_threads = parent.thread_count |
1580 | 1 | + tree_metrics |
1581 | 1 | .children |
1582 | 1 | .iter() |
1583 | 1 | .map(|c| c.metrics.thread_count0 ) |
1584 | 1 | .sum::<usize>(); |
1585 | 1 | let expected_cpu = parent.cpu_usage |
1586 | 1 | + tree_metrics |
1587 | 1 | .children |
1588 | 1 | .iter() |
1589 | 1 | .map(|c| c.metrics.cpu_usage0 ) |
1590 | 1 | .sum::<f32>(); |
1591 | 1 | let expected_processes = 1 + tree_metrics.children.len(); |
1592 | 1 | |
1593 | 1 | assert_eq!( |
1594 | | agg.mem_rss_kb, expected_mem, |
1595 | 0 | "Memory aggregation should sum parent + children" |
1596 | | ); |
1597 | 1 | assert_eq!( |
1598 | | agg.thread_count, expected_threads, |
1599 | 0 | "Thread aggregation should sum parent + children" |
1600 | | ); |
1601 | 1 | assert_eq!( |
1602 | | agg.process_count, expected_processes, |
1603 | 0 | "Process count should be parent + children" |
1604 | | ); |
1605 | | |
1606 | | // CPU might have floating point precision issues, use approximate equality |
1607 | 1 | assert!( |
1608 | 1 | (agg.cpu_usage - expected_cpu).abs() < 0.01, |
1609 | 0 | "CPU aggregation should approximately sum parent + children" |
1610 | | ); |
1611 | 0 | } |
1612 | 1 | } |
1613 | | |
1614 | | #[test]
1615 | 1 | fn test_timestamp_functionality() {
1616 | | use std::thread;
1617 | | use std::time::{SystemTime, UNIX_EPOCH};
1618 | | // Checks that `ts_ms` on plain samples and on tree/parent/aggregated metrics is a plausible Unix-epoch millisecond value.
1619 | 1 | let cmd = vec!["sleep".to_string(), "2".to_string()];
1620 | 1 | let mut monitor = create_test_monitor(cmd).unwrap();
1621 | 1 | 
1622 | 1 | thread::sleep(Duration::from_millis(100));
1623 | 1 | 
1624 | 1 | // Collect two samples, 50 ms apart
1625 | 1 | let sample1 = monitor.sample_metrics().unwrap();
1626 | 1 | thread::sleep(Duration::from_millis(50));
1627 | 1 | let sample2 = monitor.sample_metrics().unwrap();
1628 | 1 | 
1629 | 1 | // Each sample must be no later than `now_ms` and less than 60 s older; the `<=` asserts run first, so the u64 subtractions after them cannot underflow
1630 | 1 | let now_ms = SystemTime::now()
1631 | 1 | .duration_since(UNIX_EPOCH)
1632 | 1 | .unwrap()
1633 | 1 | .as_millis() as u64;
1634 | 1 | 
1635 | 1 | assert!(
1636 | 1 | sample1.ts_ms <= now_ms,
1637 | 0 | "Sample1 timestamp should not be in future"
1638 | | );
1639 | 1 | assert!(
1640 | 1 | sample2.ts_ms <= now_ms,
1641 | 0 | "Sample2 timestamp should not be in future"
1642 | | );
1643 | 1 | assert!(
1644 | 1 | now_ms - sample1.ts_ms < 60000,
1645 | 0 | "Sample1 timestamp should be recent"
1646 | | );
1647 | 1 | assert!(
1648 | 1 | now_ms - sample2.ts_ms < 60000,
1649 | 0 | "Sample2 timestamp should be recent"
1650 | | );
1651 | |
1652 | | // The later sample must not carry an earlier timestamp (non-decreasing, equality allowed)
1653 | 1 | assert!(
1654 | 1 | sample2.ts_ms >= sample1.ts_ms,
1655 | 0 | "Timestamps should be monotonic"
1656 | | );
1657 | |
1658 | | // Tree-level timestamps are compared against a fresh clock reading with 1000 ms of slack
1659 | 1 | let tree_metrics = monitor.sample_tree_metrics();
1660 | 1 | let now_ms2 = SystemTime::now()
1661 | 1 | .duration_since(UNIX_EPOCH)
1662 | 1 | .unwrap()
1663 | 1 | .as_millis() as u64;
1664 | 1 | 
1665 | 1 | assert!(
1666 | 1 | tree_metrics.ts_ms <= now_ms2 + 1000,
1667 | 0 | "Tree timestamp should be reasonable"
1668 | | );
1669 | | // `parent` and `aggregated` are optional; each is checked only when present
1670 | 1 | if let Some(parent) = tree_metrics.parent {
1671 | 1 | assert!(
1672 | 1 | parent.ts_ms <= now_ms2 + 1000,
1673 | 0 | "Parent timestamp should be reasonable"
1674 | | );
1675 | 0 | }
1676 | |
1677 | 1 | if let Some(agg) = tree_metrics.aggregated {
1678 | 1 | assert!(
1679 | 1 | agg.ts_ms <= now_ms2 + 1000,
1680 | 0 | "Aggregated timestamp should be reasonable"
1681 | | );
1682 | 0 | }
1683 | 1 | }
1684 | | |
1685 | | #[test]
1686 | 1 | fn test_enhanced_memory_metrics() {
1687 | | use std::thread;
1688 | | use std::time::{SystemTime, UNIX_EPOCH};
1689 | | // Sanity-checks the RSS/VMS memory fields on single-process and tree samples, plus the metadata start time `t0_ms`.
1690 | 1 | let cmd = vec!["sleep".to_string(), "2".to_string()];
1691 | 1 | let mut monitor = create_test_monitor(cmd).unwrap();
1692 | 1 | 
1693 | 1 | thread::sleep(Duration::from_millis(200));
1694 | 1 | 
1695 | 1 | // Re-sample up to 5 more times, 100 ms apart, stopping as soon as a non-zero RSS is reported
1696 | 1 | let mut metrics = monitor.sample_metrics().unwrap();
1697 | 1 | for _ in 0..5 {
1698 | 1 | if metrics.mem_rss_kb > 0 {
1699 | 1 | break;
1700 | 0 | }
1701 | 0 | thread::sleep(Duration::from_millis(100));
1702 | 0 | metrics = monitor.sample_metrics().unwrap();
1703 | | }
1704 | |
1705 | | // Test that new memory fields exist and are reasonable
1706 | | // Note: Memory reporting can be unreliable in test environments,
1707 | | // so the VMS >= RSS relation is asserted only when both values are non-zero
1708 | 1 | if metrics.mem_rss_kb > 0 && metrics.mem_vms_kb > 0 {
1709 | 1 | assert!(
1710 | 1 | metrics.mem_vms_kb >= metrics.mem_rss_kb,
1711 | 0 | "Virtual memory should be >= RSS when both > 0"
1712 | | );
1713 | 0 | }
1714 | |
1715 | | // Missing memory data only prints a warning, it never fails the test. NOTE(review): `00` below is the integer literal 0 written with a stray extra digit.
1716 | 1 | let has_memory_data = metrics.mem_rss_kb > 0 || metrics.mem_vms_kb > 00 ;
1717 | 1 | if !has_memory_data {
1718 | 0 | println!("Warning: No memory data available from sysinfo - this can happen in test environments");
1719 | 1 | }
1720 | |
1721 | | // Metadata check: `t0_ms` must be no later than now and less than 60 s old (the `<=` assert guards the u64 subtraction)
1722 | 1 | let metadata = monitor.get_metadata().unwrap();
1723 | 1 | let now_ms = SystemTime::now()
1724 | 1 | .duration_since(UNIX_EPOCH)
1725 | 1 | .expect("Time went backwards")
1726 | 1 | .as_millis() as u64;
1727 | 1 | 
1728 | 1 | assert!(
1729 | 1 | metadata.t0_ms <= now_ms,
1730 | 0 | "Start time should not be in future"
1731 | | );
1732 | 1 | assert!(
1733 | 1 | now_ms - metadata.t0_ms < 60000,
1734 | 0 | "Start time should be recent (within 60 seconds)"
1735 | | );
1736 | |
1737 | | // Tree samples: VMS >= RSS is asserted for parent and aggregate whenever they are present (no non-zero guard here, unlike the single-process check)
1738 | 1 | let tree_metrics = monitor.sample_tree_metrics();
1739 | |
1740 | 1 | if let Some(parent) = tree_metrics.parent {
1741 | 1 | assert!(
1742 | 1 | parent.mem_vms_kb >= parent.mem_rss_kb,
1743 | 0 | "Parent VMS should be >= RSS"
1744 | | );
1745 | 0 | }
1746 | |
1747 | 1 | if let Some(agg) = tree_metrics.aggregated {
1748 | 1 | assert!(
1749 | 1 | agg.mem_vms_kb >= agg.mem_rss_kb,
1750 | 0 | "Aggregated VMS should be >= RSS"
1751 | | );
1752 | 0 | }
1753 | 1 | }
1754 | | |
1755 | | #[test]
1756 | 0 | fn test_process_metadata() {
1757 | | use std::thread;
1758 | | use std::time::{SystemTime, UNIX_EPOCH};
1759 | | // Checks the fields returned by `get_metadata()` (pid, cmd, executable, t0_ms) and that a tree sample includes parent metrics.
1760 | 0 | let cmd = vec!["sleep".to_string(), "2".to_string()];
1761 | 0 | let mut monitor = create_test_monitor(cmd).unwrap();
1762 | 0 |

1763 | 0 | thread::sleep(Duration::from_millis(100));
1764 | 0 |

1765 | 0 | // Fetch the metadata once; every check up to the tree sample reads this value
1766 | 0 | let metadata = monitor.get_metadata().unwrap();
1767 | 0 |

1768 | 0 | // Basic fields: positive pid, non-empty command whose first element is "sleep", non-empty executable
1769 | 0 | assert!(metadata.pid > 0, "PID should be positive");
1770 | 0 | assert!(!metadata.cmd.is_empty(), "Command should not be empty");
1771 | 0 | assert_eq!(
1772 | 0 | metadata.cmd[0], "sleep",
1773 | 0 | "First command arg should be 'sleep'"
1774 | | );
1775 | 0 | assert!(
1776 | 0 | !metadata.executable.is_empty(),
1777 | 0 | "Executable path should not be empty"
1778 | | );
1779 | |
1780 | | // Start time must be no later than now and less than 60 s old (the `<=` assert guards the u64 subtraction)
1781 | 0 | let now_ms = SystemTime::now()
1782 | 0 | .duration_since(UNIX_EPOCH)
1783 | 0 | .expect("Time went backwards")
1784 | 0 | .as_millis() as u64;
1785 | 0 |

1786 | 0 | assert!(
1787 | 0 | metadata.t0_ms <= now_ms,
1788 | 0 | "Start time should not be in future"
1789 | | );
1790 | 0 | assert!(
1791 | 0 | now_ms - metadata.t0_ms < 60000,
1792 | 0 | "Start time should be recent (within 60 seconds)"
1793 | | );
1794 | |
1795 | | // Millisecond-precision probe: `t0_ms % 1000` would be 0 if the value were just seconds * 1000
1796 | | // NOTE: the remainder is only printed for inspection; nothing below asserts on it
1797 | 0 | let remainder = metadata.t0_ms % 1000;
1798 | 0 | // A start that lands exactly on a second boundary legitimately yields 0,
1799 | 0 | // which is why the value is logged rather than asserted
1800 | 0 | println!("t0_ms: {}, remainder: {}", metadata.t0_ms, remainder);
1801 | 0 |

1802 | 0 | // A tree sample taken after the metadata call must still include parent metrics
1803 | 0 | let tree_metrics = monitor.sample_tree_metrics();
1804 | 0 | assert_eq!(
1805 | 0 | tree_metrics.parent.is_some(),
1806 | | true,
1807 | 0 | "Tree should have parent metrics"
1808 | | );
1809 | 0 | }
1810 | | |
1811 | | #[test]
1812 | 1 | fn test_t0_ms_precision() {
1813 | | use std::thread;
1814 | | use std::time::{SystemTime, UNIX_EPOCH};
1815 | | // Brackets monitor creation between two wall-clock readings and checks that `t0_ms` falls inside that window.
1816 | | // Lower bound: wall-clock reading taken before the monitor is created
1817 | 1 | let before_ms = SystemTime::now()
1818 | 1 | .duration_since(UNIX_EPOCH)
1819 | 1 | .expect("Time went backwards")
1820 | 1 | .as_millis() as u64;
1821 | 1 | 
1822 | 1 | let cmd = vec!["sleep".to_string(), "0.1".to_string()];
1823 | 1 | let mut monitor = create_test_monitor(cmd).unwrap();
1824 | 1 | 
1825 | 1 | // Upper bound: wall-clock reading taken right after the monitor is created
1826 | 1 | let after_ms = SystemTime::now()
1827 | 1 | .duration_since(UNIX_EPOCH)
1828 | 1 | .expect("Time went backwards")
1829 | 1 | .as_millis() as u64;
1830 | 1 | 
1831 | 1 | // Wait a small amount to let process start
1832 | 1 | thread::sleep(Duration::from_millis(50));
1833 | 1 | 
1834 | 1 | let metadata = monitor.get_metadata().unwrap();
1835 | 1 | 
1836 | 1 | // 1_000_000_000_000 ms after the epoch is September 2001; a seconds-resolution timestamp would fall below this threshold
1837 | 1 | assert!(
1838 | 1 | metadata.t0_ms > 1000000000000,
1839 | 0 | "t0_ms should be a reasonable Unix timestamp in milliseconds"
1840 | | );
1841 | 1 | assert!(
1842 | 1 | metadata.t0_ms >= before_ms,
1843 | 0 | "t0_ms should be after we started creating the monitor"
1844 | | );
1845 | 1 | assert!(
1846 | 1 | metadata.t0_ms <= after_ms,
1847 | 0 | "t0_ms should be before we finished creating the monitor"
1848 | | );
1849 | |
1850 | | // Millisecond-precision probe: `t0_ms % 1000` would be 0 if the value were just seconds * 1000
1851 | | // NOTE: the remainder is only printed for inspection; it is not asserted on
1852 | 1 | let remainder = metadata.t0_ms % 1000;
1853 | 1 | println!("t0_ms: {}, remainder: {}", metadata.t0_ms, remainder);
1854 | 1 | 
1855 | 1 | // Looser restatement of the upper bound with 1 s of tolerance.
1856 | 1 | // NOTE(review): already implied by the `t0_ms <= after_ms` assertion earlier in this test
1857 | 1 | assert!(
1858 | 1 | metadata.t0_ms <= after_ms + 1000,
1859 | 0 | "t0_ms should be close to creation time (within 1 second tolerance)"
1860 | | );
1861 | |
1862 | | // Final format check: the timestamp must be positive.
1863 | | // On some systems, clocks may have lower resolution, so only validity is verified here.
1864 | | // NOTE(review): already implied by the `> 1000000000000` assertion earlier in this test
1865 | 1 | assert!(
1866 | 1 | metadata.t0_ms > 0,
1867 | 0 | "t0_ms should be a valid positive timestamp"
1868 | | );
1869 | 1 | }
1870 | | } |