Skip to content

Commit

Permalink
Merge pull request NordicHPC#146 from lars-t-hansen/w-142-sysinfo
Browse files Browse the repository at this point in the history
Fix NordicHPC#142 - implement sysinfo
  • Loading branch information
bast authored Feb 25, 2024
2 parents 7be0d99 + 5d7328d commit 2538468
Show file tree
Hide file tree
Showing 8 changed files with 802 additions and 54 deletions.
21 changes: 17 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,5 @@ env_logger = "0.11"
page_size = "0.6"
libc = "0.2"
signal-hook = "0.3"
serde_json = "1.0.114"
serde = { version = "1.0.197", features = ["derive"] }
72 changes: 57 additions & 15 deletions src/amd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
/// about the usage of various processes on the various devices. We divide the utilization of a
/// device by the number of processes on the device. This is approximate.
use crate::command::{self, CmdError};
use crate::nvidia;
use crate::gpu;
use crate::ps::UserTable;
use crate::TIMEOUT_SECONDS;

Expand All @@ -23,12 +23,54 @@ use std::cmp::Ordering;
#[cfg(test)]
use crate::util::map;

// We only have one machine with AMD GPUs at UiO, and rocm-smi is unable to show e.g. how much
// memory is installed on each card on this machine, so this is pretty limited. But we are at
// least able to extract gross information about the installed cards.
//
// `rocm-smi --showproductname` lists the cards. The "Card series" line has the card number and
// model name. There is no memory information, so record it as zero.
//
// TODO: It may be possible to find memory sizes using lspci. Run `lspci -v` and capture the
// output. Now look for the line "Kernel modules: amdgpu". The lines that are part of that block
// of info will have a couple of `Memory at ... ` lines that have memory block sizes, and the first
// line of the info block will have the GPU model. The largest memory block size is likely the one
// we want.
//
// (It does not appear that the lspci trick works with the nvidia cards - the memory block sizes are
// too small. This is presumably all driver dependent.)

/// List the AMD cards reported by `rocm-smi --showproductname`.
///
/// Every output line that starts with `GPU[` and contains `Card series:` contributes one card,
/// whose model is the trimmed text following that label. The memory size is always recorded as
/// zero since this command provides no memory information.
///
/// Returns `None` if the command fails or if no cards were found.
pub fn get_amd_configuration() -> Option<Vec<gpu::Card>> {
    let output = command::safe_command("rocm-smi --showproductname", TIMEOUT_SECONDS).ok()?;
    // Conceptually we match /^GPU\[(\d+)\].*Card series:\s*(.*)$/ and keep only the second
    // capture, which is the card description.
    let found: Vec<gpu::Card> = output
        .lines()
        .filter(|line| line.starts_with("GPU["))
        .filter_map(|line| line.split_once("Card series:"))
        .map(|(_, description)| gpu::Card {
            model: description.trim().to_string(),
            mem_size_kib: 0,
        })
        .collect();
    if found.is_empty() {
        None
    } else {
        Some(found)
    }
}

/// Get information about AMD cards.
///
/// Err(e) really means the command started running but failed, for the reason given. If the
/// command could not be found, we return Ok(vec![]).

pub fn get_amd_information(user_by_pid: &UserTable) -> Result<Vec<nvidia::Process>, String> {
pub fn get_amd_information(user_by_pid: &UserTable) -> Result<Vec<gpu::Process>, String> {
// I've not been able to combine the two invocations of rocm-smi yet; we have to run the command
// twice. Not a happy situation.

Expand All @@ -54,7 +96,7 @@ fn extract_amd_information(
concise_raw_text: &str,
showpidgpus_raw_text: &str,
user_by_pid: &UserTable,
) -> Result<Vec<nvidia::Process>, String> {
) -> Result<Vec<gpu::Process>, String> {
let per_device_info = parse_concise_command(concise_raw_text)?; // device -> (gpu%, mem%)
let per_pid_info = parse_showpidgpus_command(showpidgpus_raw_text)?; // pid -> [device, ...]
let mut num_processes_per_device = vec![0; per_device_info.len()];
Expand All @@ -70,9 +112,9 @@ fn extract_amd_information(
let (user, uid) = if let Some((user, uid)) = user_by_pid.get(pid) {
(user.to_string(), *uid)
} else {
("_zombie_".to_owned() + &pid.to_string(), nvidia::ZOMBIE_UID)
("_zombie_".to_owned() + &pid.to_string(), gpu::ZOMBIE_UID)
};
processes.push(nvidia::Process {
processes.push(gpu::Process {
device: Some(*dev),
pid: *pid,
user,
Expand All @@ -98,15 +140,15 @@ fn extract_amd_information(
#[cfg(test)]
macro_rules! proc(
{ $a:expr, $b:expr, $c:expr, $d:expr, $e: expr, $f: expr } => {
nvidia::Process { device: $a,
pid: $b,
user: $c.to_string(),
uid: $d,
gpu_pct: $e,
mem_pct: $f,
mem_size_kib: 0,
command: "_noinfo_".to_string()
}
gpu::Process { device: $a,
pid: $b,
user: $c.to_string(),
uid: $d,
gpu_pct: $e,
mem_pct: $f,
mem_size_kib: 0,
command: "_noinfo_".to_string()
}
});

#[test]
Expand All @@ -131,7 +173,7 @@ PID 28154 is using 1 DRM device(s):
};
let zs = extract_amd_information(concise, pidgpu, &users).unwrap();
assert!(zs.eq(&vec![
proc! { Some(0), 28154, "_zombie_28154", nvidia::ZOMBIE_UID, 99.0/2.0, 57.0/2.0 },
proc! { Some(0), 28154, "_zombie_28154", gpu::ZOMBIE_UID, 99.0/2.0, 57.0/2.0 },
proc! { Some(0), 28156, "bob", 1001, 99.0/2.0, 57.0/2.0 },
proc! { Some(1), 28156, "bob", 1001, 63.0, 5.0 },
]));
Expand Down
19 changes: 19 additions & 0 deletions src/gpu.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/// One sample of a process's GPU usage.
#[derive(PartialEq)]
pub struct Process {
    /// Device ID
    pub device: Option<usize>,
    /// Process ID
    pub pid: usize,
    /// User name, _zombie_PID for zombies
    pub user: String,
    /// User ID, 666666 for zombies
    pub uid: usize,
    /// Percent of GPU /for this sample/, 0.0 for zombies
    pub gpu_pct: f64,
    /// Percent of memory /for this sample/, 0.0 for zombies
    pub mem_pct: f64,
    /// Memory use in KiB /for this sample/, _not_ zero for zombies
    pub mem_size_kib: usize,
    /// The command, _unknown_ for zombies, _noinfo_ if not known
    pub command: String,
}

/// User ID recorded for zombie processes, i.e. those whose PID has no entry in the user table.
pub const ZOMBIE_UID: usize = 666_666;

/// Static description of one installed GPU card.
#[derive(PartialEq)]
pub struct Card {
    /// Model description of the card
    pub model: String,
    /// Installed memory in KiB
    pub mem_size_kib: i64,
}
7 changes: 7 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ use clap::{Parser, Subcommand};
mod amd;
mod batchless;
mod command;
mod gpu;
mod jobs;
mod nvidia;
mod procfs;
mod procfsapi;
mod ps;
mod slurm;
mod sysinfo;
mod users;
mod util;

Expand Down Expand Up @@ -66,6 +68,8 @@ enum Commands {
#[arg(long)]
lockdir: Option<String>,
},
/// Extract system information
Sysinfo {},
/// Not yet implemented
Analyze {},
}
Expand Down Expand Up @@ -120,6 +124,9 @@ fn main() {
ps::create_snapshot(&mut jm, &opts, &timestamp);
}
}
Commands::Sysinfo {} => {
sysinfo::show_system(&timestamp);
}
Commands::Analyze {} => {
println!("sonar analyze not yet completed");
}
Expand Down
Loading

0 comments on commit 2538468

Please sign in to comment.