// board.rs

// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#[cfg(target_arch = "aarch64")]16
#[path = "board_aarch64.rs"]17
mod aarch64;18
#[cfg(target_arch = "x86_64")]19
#[path = "board_x86_64/board_x86_64.rs"]20
mod x86_64;21
22
#[cfg(target_os = "linux")]
use std::collections::HashMap;
use std::ffi::CStr;
use std::sync::Arc;
use std::sync::mpsc::Sender;
use std::thread::JoinHandle;

use libc::{MAP_PRIVATE, MAP_SHARED};
use parking_lot::{Condvar, Mutex, RwLock, RwLockReadGuard};
use serde::Deserialize;
use serde_aco::Help;
use snafu::{ResultExt, Snafu};

#[cfg(target_arch = "x86_64")]
use crate::arch::cpuid::CpuidIn;
#[cfg(target_arch = "x86_64")]
use crate::arch::layout::PORT_PCI_ADDRESS;
use crate::arch::layout::{
    MEM_64_START, PCIE_CONFIG_START, PCIE_MMIO_32_NON_PREFETCHABLE_END,
    PCIE_MMIO_32_NON_PREFETCHABLE_START, PCIE_MMIO_32_PREFETCHABLE_END,
    PCIE_MMIO_32_PREFETCHABLE_START, RAM_32_SIZE,
};
use crate::device::MmioDev;
#[cfg(target_arch = "x86_64")]
use crate::device::fw_cfg::FwCfg;
use crate::errors::{DebugTrace, trace_error};
use crate::hv::{Coco, Hypervisor, Vcpu, Vm, VmConfig, VmEntry, VmExit};
#[cfg(target_arch = "x86_64")]
use crate::loader::xen;
use crate::loader::{Executable, InitState, Payload, linux};
use crate::mem::mapped::ArcMemPages;
use crate::mem::{MemBackend, MemConfig, MemRegion, MemRegionType, Memory};
use crate::pci::bus::PciBus;
#[cfg(target_os = "linux")]
use crate::vfio::container::Container;
#[cfg(target_os = "linux")]
use crate::vfio::iommu::Ioas;

#[cfg(target_arch = "aarch64")]
use self::aarch64::ArchBoard;
#[cfg(target_arch = "x86_64")]
use self::x86_64::ArchBoard;

#[trace_error]66
#[derive(Snafu, DebugTrace)]67
#[snafu(module, context(suffix(false)))]68
pub enum Error {69
#[snafu(display("Hypervisor internal error"), context(false))]70
HvError { source: Box<crate::hv::Error> },71
#[snafu(display("Failed to access guest memory"), context(false))]72
Memory { source: Box<crate::mem::Error> },73
#[snafu(display("Failed to load payload"), context(false))]74
Loader { source: Box<crate::loader::Error> },75
#[snafu(display("Invalid CPU topology"))]76
InvalidCpuTopology,77
#[snafu(display("Failed to create VCPU-{index}"))]78
CreateVcpu {79
index: u16,80
source: Box<crate::hv::Error>,81
},82
#[snafu(display("Failed to run VCPU-{index}"))]83
RunVcpu {84
index: u16,85
source: Box<crate::hv::Error>,86
},87
#[snafu(display("Failed to stop VCPU-{index}"))]88
StopVcpu {89
index: u16,90
source: Box<crate::hv::Error>,91
},92
#[snafu(display("Failed to reset PCI devices"))]93
ResetPci { source: Box<crate::pci::Error> },94
#[snafu(display("Failed to configure firmware"))]95
FwCfg { error: std::io::Error },96
#[snafu(display("Missing payload"))]97
MissingPayload,98
#[snafu(display("Failed to notify the VMM thread"))]99
NotifyVmm,100
#[snafu(display("Another VCPU thread has signaled failure"))]101
PeerFailure,102
#[snafu(display("Unexpected state: {state:?}, want {want:?}"))]103
UnexpectedState { state: BoardState, want: BoardState },104
#[cfg(target_arch = "x86_64")]105
#[snafu(display("Missing CPUID leaf {leaf:x?}"))]106
MissingCpuid { leaf: CpuidIn },107
#[snafu(display("Firmware error"), context(false))]108
Firmware { source: Box<crate::firmware::Error> },109
#[snafu(display("Unknown firmware metadata"))]110
UnknownFirmwareMetadata,111
}112
113
type Result<T, E = Error> = std::result::Result<T, E>;114
115
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Deserialize, Help)]116
pub struct CpuTopology {117
#[serde(default)]118
/// Enable SMT (Hyperthreading).119
pub smt: bool,120
#[serde(default)]121
/// Number of cores per socket.122
pub cores: u16,123
#[serde(default)]124
/// Number of sockets.125
pub sockets: u8,126
}127
128
impl CpuTopology {129
pub fn encode(&self, index: u16) -> (u8, u16, u8) {42x130
let total_cores = self.cores * self.sockets as u16;42x131
let thread_id = index / total_cores;42x132
let core_id = index % total_cores % self.cores;42x133
let socket_id = index % total_cores / self.cores;42x134
(socket_id as u8, core_id, thread_id as u8)42x135
}42x136
137
pub fn decode(&self, socket_id: u8, core_id: u16, thread_id: u8) -> u16 {27x138
let total_cores = self.cores * self.sockets as u16;27x139
thread_id as u16 * total_cores + core_id + socket_id as u16 * self.cores27x140
}27x141
}142
143
/// Default VCPU count used when the user does not specify one.
const fn default_cpu_count() -> u16 {
    1
}

#[derive(Debug, Default, PartialEq, Eq, Deserialize, Help)]148
pub struct CpuConfig {149
/// Number of VCPUs assigned to the guest. [default: 1]150
#[serde(default = "default_cpu_count")]151
pub count: u16,152
/// Architecture specific CPU topology.153
#[serde(default)]154
pub topology: CpuTopology,155
}156
157
impl CpuConfig {158
pub fn fixup(&mut self) -> Result<()> {6x159
if self.topology.sockets == 0 {6x160
self.topology.sockets = 1;3x161
}3x162
let vcpus_per_core = 1 + self.topology.smt as u16;6x163
if self.topology.cores == 0 {6x164
self.topology.cores = self.count / self.topology.sockets as u16 / vcpus_per_core;3x165
}3x166
let vcpus_per_socket = self.topology.cores * vcpus_per_core;6x167
let count = self.topology.sockets as u16 * vcpus_per_socket;6x168
if count != self.count {6x169
return error::InvalidCpuTopology.fail();3x170
}3x171
Ok(())3x172
}6x173
}174
175
/// Lifecycle state of the board, shared by all VCPU threads via `MpSync`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BoardState {
    Paused,
    Running,
    Shutdown,
    RebootPending,
}

#[derive(Debug)]184
struct MpSync {185
state: BoardState,186
fatal: bool,187
count: u16,188
}189
190
/// Size of the 64-bit PCIe MMIO window (1 TiB).
pub const PCIE_MMIO_64_SIZE: u64 = 1 << 40;

#[derive(Debug, Default, PartialEq, Eq, Deserialize)]193
pub struct BoardConfig {194
pub mem: MemConfig,195
pub cpu: CpuConfig,196
pub coco: Option<Coco>,197
}198
199
impl BoardConfig {200
pub fn pcie_mmio_64_start(&self) -> u64 {201
(self.mem.size.saturating_sub(RAM_32_SIZE) + MEM_64_START).next_power_of_two()202
}203
204
pub fn config_fixup(&mut self) -> Result<()> {205
self.cpu.fixup()206
}207
}208
209
type VcpuGuard<'a> = RwLockReadGuard<'a, Vec<VcpuHandle>>;210
type VcpuHandle = JoinHandle<Result<()>>;211
212
pub struct Board<V>213
where214
V: Vm,215
{216
pub vm: V,217
pub memory: Memory,218
pub vcpus: Arc<RwLock<Vec<VcpuHandle>>>,219
pub arch: ArchBoard<V>,220
pub config: BoardConfig,221
pub payload: RwLock<Option<Payload>>,222
pub io_devs: RwLock<Vec<(u16, Arc<dyn MmioDev>)>>,223
pub mmio_devs: RwLock<Vec<(u64, Arc<dyn MmioDev>)>>,224
pub pci_bus: PciBus,225
#[cfg(target_arch = "x86_64")]226
pub fw_cfg: Mutex<Option<Arc<Mutex<FwCfg>>>>,227
#[cfg(target_os = "linux")]228
pub vfio_ioases: Mutex<HashMap<Box<str>, Arc<Ioas>>>,229
#[cfg(target_os = "linux")]230
pub vfio_containers: Mutex<HashMap<Box<str>, Arc<Container>>>,231
232
mp_sync: Mutex<MpSync>,233
cond_var: Condvar,234
}235
236
impl<V> Board<V>237
where238
V: Vm,239
{240
pub fn new<H>(hv: &H, mut config: BoardConfig) -> Result<Self>241
where242
H: Hypervisor<Vm = V>,243
{244
config.config_fixup()?;245
246
let vm_config = VmConfig {247
coco: config.coco.clone(),248
};249
let mut vm = hv.create_vm(&vm_config)?;250
let vm_memory = Arc::new(vm.create_vm_memory()?);251
let arch = ArchBoard::new(hv, &vm, &config)?;252
253
let board = Board {254
vm,255
memory: Memory::new(vm_memory.clone()),256
arch,257
config,258
payload: RwLock::new(None),259
vcpus: Arc::new(RwLock::new(Vec::new())),260
io_devs: RwLock::new(Vec::new()),261
mmio_devs: RwLock::new(Vec::new()),262
pci_bus: PciBus::new(),263
#[cfg(target_arch = "x86_64")]264
fw_cfg: Mutex::new(None),265
#[cfg(target_os = "linux")]266
vfio_ioases: Mutex::new(HashMap::new()),267
#[cfg(target_os = "linux")]268
vfio_containers: Mutex::new(HashMap::new()),269
270
mp_sync: Mutex::new(MpSync {271
state: BoardState::Paused,272
count: 0,273
fatal: false,274
}),275
cond_var: Condvar::new(),276
};277
278
board.coco_init(vm_memory)?;279
280
Ok(board)281
}282
283
pub fn boot(&self) -> Result<()> {284
self.resume()285
}286
287
pub fn resume(&self) -> Result<()> {288
let mut mp_sync = self.mp_sync.lock();289
if mp_sync.state == BoardState::Paused {290
mp_sync.state = BoardState::Running;291
} else {292
return error::UnexpectedState {293
state: mp_sync.state,294
want: BoardState::Paused,295
}296
.fail();297
}298
self.cond_var.notify_all();299
Ok(())300
}301
302
pub fn pause(&self) -> Result<()> {303
let vcpus = self.vcpus.read();304
let mut mp_sync = self.mp_sync.lock();305
if mp_sync.state != BoardState::Running {306
return error::UnexpectedState {307
state: mp_sync.state,308
want: BoardState::Running,309
}310
.fail();311
}312
mp_sync.state = BoardState::Paused;313
self.stop_other_vcpus(None, &vcpus)?;314
Ok(())315
}316
317
fn load_payload(&self, vcpu: &mut V::Vcpu) -> Result<InitState, Error> {318
let payload = self.payload.read();319
let Some(payload) = payload.as_ref() else {320
return error::MissingPayload.fail();321
};322
323
if let Some(fw) = payload.firmware.as_ref() {324
return self.setup_firmware(fw, payload, vcpu);325
}326
327
let Some(exec) = &payload.executable else {328
return error::MissingPayload.fail();329
};330
let mem_regions = self.memory.mem_region_entries();331
let init_state = match exec {332
Executable::Linux(image) => linux::load(333
&self.memory.ram_bus(),334
&mem_regions,335
image.as_ref(),336
payload.cmdline.as_deref(),337
payload.initramfs.as_deref(),338
),339
#[cfg(target_arch = "x86_64")]340
Executable::Pvh(image) => xen::load(341
&self.memory.ram_bus(),342
&mem_regions,343
image.as_ref(),344
payload.cmdline.as_deref(),345
payload.initramfs.as_deref(),346
),347
}?;348
Ok(init_state)349
}350
351
fn add_pci_devs(&self) -> Result<()> {352
#[cfg(target_arch = "x86_64")]353
self.memory354
.add_io_dev(PORT_PCI_ADDRESS, self.pci_bus.io_bus.clone())?;355
self.memory.add_region(356
PCIE_CONFIG_START,357
Arc::new(MemRegion::with_emulated(358
self.pci_bus.segment.clone(),359
MemRegionType::Reserved,360
)),361
)?;362
let pcie_mmio_64_start = self.config.pcie_mmio_64_start();363
self.pci_bus.segment.assign_resources(&[364
(0x1000, 0x10000),365
(366
PCIE_MMIO_32_NON_PREFETCHABLE_START,367
PCIE_MMIO_32_NON_PREFETCHABLE_END,368
),369
(370
PCIE_MMIO_32_PREFETCHABLE_START,371
PCIE_MMIO_32_PREFETCHABLE_END,372
),373
(pcie_mmio_64_start, pcie_mmio_64_start + PCIE_MMIO_64_SIZE),374
]);375
Ok(())376
}377
378
fn vcpu_loop(&self, vcpu: &mut <V as Vm>::Vcpu, index: u16) -> Result<BoardState> {379
let mut vm_entry = VmEntry::None;380
loop {381
let vm_exit = vcpu.run(vm_entry).context(error::RunVcpu { index })?;382
vm_entry = match vm_exit {383
#[cfg(target_arch = "x86_64")]384
VmExit::Io { port, write, size } => self.memory.handle_io(port, write, size)?,385
VmExit::Mmio { addr, write, size } => self.memory.handle_mmio(addr, write, size)?,386
VmExit::Shutdown => break Ok(BoardState::Shutdown),387
VmExit::Reboot => break Ok(BoardState::RebootPending),388
VmExit::Paused => break Ok(BoardState::Paused),389
VmExit::Interrupted => {390
let mp_sync = self.mp_sync.lock();391
match mp_sync.state {392
BoardState::Shutdown => VmEntry::Shutdown,393
BoardState::RebootPending => VmEntry::Reboot,394
BoardState::Paused => VmEntry::Pause,395
BoardState::Running => VmEntry::None,396
}397
}398
VmExit::ConvertMemory { gpa, size, private } => {399
self.memory.mark_private_memory(gpa, size, private)?;400
VmEntry::None401
}402
};403
}404
}405
406
fn sync_vcpus(&self, vcpus: &VcpuGuard) -> Result<()> {407
let mut mp_sync = self.mp_sync.lock();408
if mp_sync.fatal {409
return error::PeerFailure.fail();410
}411
412
mp_sync.count += 1;413
if mp_sync.count == vcpus.len() as u16 {414
mp_sync.count = 0;415
self.cond_var.notify_all();416
} else {417
self.cond_var.wait(&mut mp_sync)418
}419
420
if mp_sync.fatal {421
return error::PeerFailure.fail();422
}423
424
Ok(())425
}426
427
fn notify_vmm(&self, index: u16, event_tx: &Sender<u16>) -> Result<()> {428
if event_tx.send(index).is_err() {429
error::NotifyVmm.fail()430
} else {431
Ok(())432
}433
}434
435
fn boot_init_sync(&self, index: u16, vcpu: &mut V::Vcpu) -> Result<()> {436
let vcpus = self.vcpus.read();437
if index == 0 {438
self.create_ram()?;439
for (port, dev) in self.io_devs.read().iter() {440
self.memory.add_io_dev(*port, dev.clone())?;441
}442
for (addr, dev) in self.mmio_devs.read().iter() {443
self.memory.add_mmio_dev(*addr, dev.clone())?;444
}445
self.add_pci_devs()?;446
let init_state = self.load_payload(vcpu)?;447
self.init_boot_vcpu(vcpu, &init_state)?;448
self.create_firmware_data(&init_state)?;449
}450
self.init_ap(index, vcpu, &vcpus)?;451
self.coco_finalize(index, &vcpus)?;452
self.sync_vcpus(&vcpus)453
}454
455
fn stop_other_vcpus(&self, current: Option<u16>, vcpus: &VcpuGuard) -> Result<()> {456
for (index, handle) in vcpus.iter().enumerate() {457
let index = index as u16;458
if let Some(current) = current {459
if current == index {460
continue;461
}462
log::info!("VCPU-{current}: stopping VCPU-{index}");463
} else {464
log::info!("Stopping VCPU-{index}");465
}466
let identity = self.encode_cpu_identity(index);467
self.vm468
.stop_vcpu(identity, handle)469
.context(error::StopVcpu { index })?;470
}471
Ok(())472
}473
474
fn run_vcpu_inner(&self, index: u16, event_tx: &Sender<u16>) -> Result<(), Error> {475
let mut vcpu = self.create_vcpu(index)?;476
self.notify_vmm(index, event_tx)?;477
self.init_vcpu(index, &mut vcpu)?;478
479
'reboot: loop {480
let mut mp_sync = self.mp_sync.lock();481
loop {482
match mp_sync.state {483
BoardState::Paused => self.cond_var.wait(&mut mp_sync),484
BoardState::Running => break,485
BoardState::Shutdown => break 'reboot Ok(()),486
BoardState::RebootPending => mp_sync.state = BoardState::Running,487
}488
}489
drop(mp_sync);490
491
self.boot_init_sync(index, &mut vcpu)?;492
493
let request = 'pause: loop {494
let request = self.vcpu_loop(&mut vcpu, index);495
496
let vcpus = self.vcpus.read();497
let mut mp_sync = self.mp_sync.lock();498
if mp_sync.state == BoardState::Running {499
mp_sync.state = match request {500
Ok(BoardState::RebootPending) => BoardState::RebootPending,501
Ok(BoardState::Paused) => BoardState::Paused,502
_ => BoardState::Shutdown,503
};504
log::trace!("VCPU-{index}: change state to {:?}", mp_sync.state);505
self.stop_other_vcpus(Some(index), &vcpus)?;506
}507
loop {508
match mp_sync.state {509
BoardState::Running => break,510
BoardState::Paused => self.cond_var.wait(&mut mp_sync),511
BoardState::RebootPending | BoardState::Shutdown => break 'pause request,512
}513
}514
};515
516
if index == 0 {517
self.pci_bus.segment.reset().context(error::ResetPci)?;518
self.memory.reset()?;519
}520
self.reset_vcpu(index, &mut vcpu)?;521
522
request?;523
524
let vcpus = self.vcpus.read();525
self.sync_vcpus(&vcpus)?;526
}527
}528
529
fn create_vcpu(&self, index: u16) -> Result<V::Vcpu> {530
let identity = self.encode_cpu_identity(index);531
let vcpu = self532
.vm533
.create_vcpu(index, identity)534
.context(error::CreateVcpu { index })?;535
Ok(vcpu)536
}537
538
pub fn run_vcpu(&self, index: u16, event_tx: Sender<u16>) -> Result<(), Error> {539
let ret = self.run_vcpu_inner(index, &event_tx);540
541
let _ = self.notify_vmm(index, &event_tx);542
543
if matches!(ret, Ok(_) | Err(Error::PeerFailure { .. })) {544
return Ok(());545
}546
547
log::warn!("VCPU-{index} reported error {ret:?}, unblocking other VCPUs...");548
let mut mp_sync = self.mp_sync.lock();549
mp_sync.fatal = true;550
if mp_sync.count > 0 {551
self.cond_var.notify_all();552
}553
ret554
}555
556
fn create_ram_pages(557
&self,558
size: u64,559
#[cfg_attr(not(target_os = "linux"), allow(unused_variables))] name: &CStr,560
) -> Result<ArcMemPages> {561
let mmap_flag = if self.config.mem.shared {562
Some(MAP_SHARED)563
} else {564
Some(MAP_PRIVATE)565
};566
let pages = match self.config.mem.backend {567
#[cfg(target_os = "linux")]568
MemBackend::Memfd => ArcMemPages::from_memfd(name, size as usize, None),569
MemBackend::Anonymous => ArcMemPages::from_anonymous(size as usize, None, mmap_flag),570
}?;571
#[cfg(target_os = "linux")]572
if self.config.mem.transparent_hugepage {573
pages.madvise_hugepage()?;574
}575
Ok(pages)576
}577
}578
579
#[cfg(test)]
#[path = "board_test.rs"]
mod tests;