// pci.rs
// Copyright 2024 Google LLC2
//3
// Licensed under the Apache License, Version 2.0 (the "License");4
// you may not use this file except in compliance with the License.5
// You may obtain a copy of the License at6
//7
// https://www.apache.org/licenses/LICENSE-2.08
//9
// Unless required by applicable law or agreed to in writing, software10
// distributed under the License is distributed on an "AS IS" BASIS,11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.12
// See the License for the specific language governing permissions and13
// limitations under the License.14
15
use std::cmp::min;16
use std::fs::File;17
use std::iter::zip;18
use std::mem::size_of;19
use std::ops::Range;20
use std::os::fd::{AsFd, AsRawFd};21
use std::os::unix::fs::FileExt;22
use std::sync::Arc;23
use std::sync::atomic::AtomicU64;24
25
use libc::{PROT_READ, PROT_WRITE};26
use parking_lot::{Mutex, RwLock};27
use zerocopy::{FromBytes, IntoBytes};28
29
use crate::device::Pause;30
use crate::errors::BoxTrace;31
use crate::hv::{IrqFd, MsiSender};32
use crate::mem::emulated::{Action, Mmio, MmioBus};33
use crate::mem::mapped::ArcMemPages;34
use crate::mem::{IoRegion, MemRange, MemRegion, MemRegionEntry, MemRegionType};35
use crate::pci::cap::{36
MsiCapHdr, MsiCapMmio, MsixCap, MsixCapMmio, MsixTableEntry, MsixTableMmio, MsixTableMmioEntry,37
NullCap, PciCap, PciCapHdr, PciCapId,38
};39
use crate::pci::config::{40
BAR_IO, Command, CommonHeader, ConfigHeader, DeviceHeader, EmulatedHeader, HeaderType,41
PciConfig, PciConfigArea, Status,42
};43
use crate::pci::{self, Pci, PciBar};44
use crate::sys::vfio::{45
VfioDeviceInfoFlag, VfioIrqSet, VfioIrqSetData, VfioIrqSetFlag, VfioPciIrq, VfioPciRegion,46
VfioRegionInfo, VfioRegionInfoFlag,47
};48
use crate::vfio::device::Device;49
use crate::vfio::{Result, error};50
use crate::{align_down, align_up, mem};51
52
fn round_up_range(range: Range<usize>) -> Range<usize> {53
(align_down!(range.start, 12))..(align_up!(range.end, 12))54
}55
56
fn create_mapped_bar_pages(57
fd: &File,58
region_flags: VfioRegionInfoFlag,59
offset: i64,60
size: usize,61
) -> Result<ArcMemPages> {62
let mut prot = 0;63
if region_flags.contains(VfioRegionInfoFlag::READ) {64
prot |= PROT_READ;65
}66
if region_flags.contains(VfioRegionInfoFlag::WRITE) {67
prot |= PROT_WRITE;68
}69
let mapped_pages = ArcMemPages::from_file(fd.try_clone()?, offset, size, prot)?;70
Ok(mapped_pages)71
}72
73
/// Builds a `MemRegion` for a BAR, carving out the pages that contain the
/// MSI-X table and/or PBA so they can be emulated, while the rest of the
/// BAR is memory-mapped directly from the VFIO device fd.
///
/// `table_range` / `pba_range` are byte ranges within the BAR; `0..0` means
/// the table/PBA does not live in this BAR. Excluded ranges are rounded out
/// to 4 KiB page boundaries and merged into one range when they touch.
fn create_splitted_bar_region<I, M, D>(
    dev: Arc<VfioDev<D>>,
    region_info: &VfioRegionInfo,
    table_range: Range<usize>,
    pba_range: Range<usize>,
    msix_table: Arc<MsixTableMmio<I>>,
    msi_sender: Arc<M>,
) -> Result<MemRegion>
where
    I: IrqFd,
    M: MsiSender<IrqFd = I>,
    D: Device,
{
    let table_pages = round_up_range(table_range.clone());
    let pba_pages = round_up_range(pba_range.clone());
    // Normalize to two ordered, disjoint page ranges; empty ranges collapse
    // into `excluded_page2` so that `excluded_page1` is empty when there is
    // only one excluded range.
    // NOTE(review): the merged branches take `first.start..second.end`, which
    // assumes the second range ends at or after the first; this holds for
    // spec-compliant non-overlapping table/PBA layouts — confirm for devices
    // with unusual capability layouts.
    let (excluded_page1, excluded_page2) = if table_pages.clone().eq(0..0) {
        (0..0, pba_pages)
    } else if pba_pages.clone().eq(0..0) {
        (0..0, table_pages)
    } else if table_pages.start <= pba_pages.start && table_pages.end >= pba_pages.start {
        // Table pages overlap or touch the PBA pages: merge into one range.
        (0..0, table_pages.start..pba_pages.end)
    } else if pba_pages.start <= table_pages.start && pba_pages.end >= table_pages.start {
        // PBA pages overlap or touch the table pages: merge into one range.
        (0..0, pba_pages.start..table_pages.end)
    } else if table_pages.end < pba_pages.start {
        (table_pages, pba_pages)
    } else {
        (pba_pages, table_pages)
    };
    let mut region = MemRegion {
        callbacks: Mutex::new(vec![]),
        entries: vec![MemRegionEntry {
            size: region_info.size,
            type_: MemRegionType::Hidden,
        }],
        ranges: vec![],
    };
    // Direct-mapped prefix before the first excluded range.
    if excluded_page1.start > 0 {
        region.ranges.push(MemRange::DevMem(create_mapped_bar_pages(
            dev.dev.fd(),
            region_info.flags,
            region_info.offset as i64,
            excluded_page1.start,
        )?));
    }
    // Emulated segment covering the first excluded range.
    if excluded_page1.end - excluded_page1.start > 0 {
        region.ranges.push(MemRange::Emulated(Arc::new(MsixBarMmio {
            table: msix_table.clone(),
            table_range: table_range.clone(),
            msi_sender: msi_sender.clone(),
            pba: Arc::new([]),
            pba_range: pba_range.clone(),
            cdev: dev.clone(),
            cdev_offset: region_info.offset,
            region_start: excluded_page1.start,
            region_size: excluded_page1.end - excluded_page1.start,
        })));
    }
    // Direct-mapped gap between the two excluded ranges.
    if excluded_page2.start - excluded_page1.end > 0 {
        region.ranges.push(MemRange::DevMem(create_mapped_bar_pages(
            dev.dev.fd(),
            region_info.flags,
            region_info.offset as i64 + excluded_page1.end as i64,
            excluded_page2.start - excluded_page1.end,
        )?));
    }
    // Emulated segment covering the second excluded range.
    if excluded_page2.end - excluded_page2.start > 0 {
        region.ranges.push(MemRange::Emulated(Arc::new(MsixBarMmio {
            table: msix_table,
            table_range,
            msi_sender,
            pba: Arc::new([]),
            pba_range,
            cdev: dev.clone(),
            cdev_offset: region_info.offset,
            region_start: excluded_page2.start,
            region_size: excluded_page2.end - excluded_page2.start,
        })));
    }
    // Direct-mapped tail after the last excluded range.
    if excluded_page2.end < region_info.size as usize {
        region.ranges.push(MemRange::DevMem(create_mapped_bar_pages(
            dev.dev.fd(),
            region_info.flags,
            region_info.offset as i64 + excluded_page2.end as i64,
            region_info.size as usize - excluded_page2.end,
        )?));
    }
    Ok(region)
}
162
fn create_mappable_bar_region<I, M, D>(163
cdev: Arc<VfioDev<D>>,164
index: u32,165
region_info: &VfioRegionInfo,166
msix_cap: Option<&MsixCap>,167
msix_table: Arc<MsixTableMmio<I>>,168
msi_sender: Arc<M>,169
) -> Result<MemRegion>170
where171
I: IrqFd,172
M: MsiSender<IrqFd = I>,173
D: Device,174
{175
let (msix_table_offset, msix_pba_offset, msix_control) = if let Some(msix_cap) = msix_cap {176
(msix_cap.table_offset, msix_cap.pba_offset, msix_cap.control)177
} else {178
return create_splitted_bar_region(cdev, region_info, 0..0, 0..0, msix_table, msi_sender);179
};180
let num_msix_entries = msix_control.table_len() as usize + 1;181
let table_offset = msix_table_offset.0 as usize & !0b111;182
let pba_offset = msix_pba_offset.0 as usize & !0b111;183
let table_range = table_offset..(table_offset + size_of::<MsixTableEntry>() * num_msix_entries);184
let pba_range = pba_offset..(pba_offset + (align_up!(num_msix_entries, 6) >> 3));185
186
if msix_table_offset.bar() == index && msix_pba_offset.bar() == index {187
create_splitted_bar_region(188
cdev,189
region_info,190
table_range,191
pba_range,192
msix_table,193
msi_sender,194
)195
} else if msix_table_offset.bar() == index {196
create_splitted_bar_region(cdev, region_info, table_range, 0..0, msix_table, msi_sender)197
} else if msix_pba_offset.bar() == index {198
create_splitted_bar_region(cdev, region_info, 0..0, pba_range, msix_table, msi_sender)199
} else {200
create_splitted_bar_region(cdev, region_info, 0..0, 0..0, msix_table, msi_sender)201
}202
}203
204
/// A VFIO device handle bundled with its identity and capability flags.
#[derive(Debug)]
struct VfioDev<D> {
    // Device name, used as a prefix in log messages.
    name: Arc<str>,
    // Underlying VFIO device.
    dev: D,
    // Device-info flags reported by VFIO (e.g. whether reset is supported).
    flags: VfioDeviceInfoFlag,
}
211
impl<D> VfioDev<D>212
where213
D: Device,214
{215
fn reset(&self) -> Result<()> {216
if self.flags.contains(VfioDeviceInfoFlag::RESET) {217
self.dev.reset()?;218
}219
Ok(())220
}221
}222
223
/// A pass-through window into the device's config space: all accesses are
/// forwarded to the VFIO device starting at `offset`.
#[derive(Debug)]
struct PthConfigArea<D> {
    offset: u64, // absolute offset into the VFIO device fd
    size: u64,   // window size in bytes
    dev: Arc<VfioDev<D>>,
}
230
impl<D> Mmio for PthConfigArea<D>231
where232
D: Device,233
{234
fn size(&self) -> u64 {235
self.size236
}237
238
fn read(&self, offset: u64, size: u8) -> mem::Result<u64> {239
self.dev.dev.read(self.offset + offset, size)240
}241
242
fn write(&self, offset: u64, size: u8, val: u64) -> mem::Result<Action> {243
self.dev.dev.write(self.offset + offset, size, val)?;244
Ok(Action::None)245
}246
}247
248
impl<D> PciConfigArea for PthConfigArea<D>
where
    D: Device,
{
    // Pass-through areas hold no emulated state, so reset is a no-op.
    fn reset(&self) -> pci::Result<()> {
        Ok(())
    }
}
257
/// Emulated PCI config space for a pass-through device: an emulated header
/// plus a bus of extra areas (emulated capabilities and pass-through gaps).
#[derive(Debug)]
pub struct PciPthConfig<D> {
    header: EmulatedHeader,
    extra: MmioBus<Box<dyn PciConfigArea>>,
    dev: Arc<VfioDev<D>>,
}
264
impl<D> Mmio for PciPthConfig<D>265
where266
D: Device,267
{268
fn read(&self, offset: u64, size: u8) -> mem::Result<u64> {269
if offset < self.header.size() {270
Mmio::read(&self.header, offset, size)271
} else {272
self.extra.read(offset, size)273
}274
}275
276
fn size(&self) -> u64 {277
4096278
}279
280
fn write(&self, offset: u64, size: u8, val: u64) -> mem::Result<Action> {281
if offset < self.header.size() {282
Mmio::write(&self.header, offset, size, val)283
} else {284
self.extra.write(offset, size, val)285
}286
}287
}288
289
impl<D> PciConfig for PciPthConfig<D>290
where291
D: Device,292
{293
fn get_header(&self) -> &EmulatedHeader {294
&self.header295
}296
297
fn reset(&self) -> pci::Result<()> {298
self.header.reset()?;299
for (_, area) in self.extra.inner.iter() {300
area.reset()?;301
}302
Ok(())303
}304
}305
306
/// Fully emulated BAR region: every access is forwarded to the VFIO device
/// fd (used when VFIO does not allow the region to be mmapped).
#[derive(Debug)]
pub struct PthBarRegion<D> {
    cdev: Arc<VfioDev<D>>,
    // Region size in bytes.
    size: usize,
    // Offset of this region within the VFIO device fd.
    offset: u64,
}
313
impl<D> Mmio for PthBarRegion<D>314
where315
D: Device,316
{317
fn size(&self) -> u64 {318
self.size as u64319
}320
321
fn read(&self, offset: u64, size: u8) -> mem::Result<u64> {322
log::trace!(323
"{}: emulated read at {offset:#x}, size={size}",324
self.cdev.name325
);326
self.cdev.dev.read(self.offset + offset, size)327
}328
329
fn write(&self, offset: u64, size: u8, val: u64) -> mem::Result<Action> {330
log::trace!(331
"{}: emulated write at {offset:#x}, val={val:#x}, size={size}",332
self.cdev.name333
);334
self.cdev.dev.write(self.offset + offset, size, val)?;335
Ok(Action::None)336
}337
}338
339
/// A PCI device passed through to the guest via VFIO.
#[derive(Debug)]
pub struct VfioPciDev<M, D>
where
    M: MsiSender,
{
    // Emulated config space backed by the physical device.
    config: PciPthConfig<D>,
    // Emulated MSI-X table, shared with the BAR emulation.
    msix_table: Arc<MsixTableMmio<M::IrqFd>>,
}
348
// The trait's default behavior is used unchanged: no device-specific work
// is needed here to pause/resume a pass-through device.
impl<M, D> Pause for VfioPciDev<M, D>
where
    M: MsiSender,
    D: Device,
{
}
355
impl<M, D> Pci for VfioPciDev<M, D>356
where357
D: Device,358
M: MsiSender,359
{360
fn name(&self) -> &str {361
&self.config.dev.name362
}363
364
fn config(&self) -> &dyn PciConfig {365
&self.config366
}367
368
fn reset(&self) -> pci::Result<()> {369
let ret = VfioPciDev::reset(self);370
ret.box_trace(pci::error::Reset)?;371
Ok(())372
}373
}374
375
impl<M, D> VfioPciDev<M, D>376
where377
M: MsiSender,378
D: Device,379
{380
pub fn new(name: Arc<str>, dev: D, msi_sender: M) -> Result<VfioPciDev<M, D>> {381
let flags = dev.get_info()?.flags;382
383
let cdev = Arc::new(VfioDev { dev, name, flags });384
385
let msi_sender = Arc::new(msi_sender);386
387
let region_config = cdev.dev.get_region_info(VfioPciRegion::CONFIG.raw())?;388
389
let pci_command = Command::IO | Command::MEM | Command::BUS_MASTER | Command::INTX_DISABLE;390
cdev.dev.write(391
region_config.offset + CommonHeader::OFFSET_COMMAND as u64,392
CommonHeader::SIZE_COMMAND as u8,393
pci_command.bits() as _,394
)?;395
396
let mut buf = vec![0u32; region_config.size as usize >> 2];397
let buf = buf.as_mut_bytes();398
cdev.dev.fd().read_at(buf, region_config.offset)?;399
400
let (mut dev_header, _) = DeviceHeader::read_from_prefix(buf).unwrap();401
let header_type = dev_header.common.header_type.raw() & !(1 << 7);402
if header_type != HeaderType::DEVICE.raw() {403
return error::NotSupportedHeader { ty: header_type }.fail();404
}405
dev_header.common.header_type = HeaderType::DEVICE;406
dev_header.intx_pin = 0;407
dev_header.common.command = Command::empty();408
409
let mut masked_caps: Vec<(u64, Box<dyn PciConfigArea>)> = vec![];410
let mut msix_info = None;411
let mut msi_info = None;412
413
if dev_header.common.status.contains(Status::CAP) {414
let mut cap_offset = dev_header.capability_pointer as usize;415
while cap_offset != 0 {416
let Some(cap_buf) = buf.get(cap_offset..) else {417
log::error!("{}: invalid cap offset: {cap_offset:#x}", cdev.name);418
break;419
};420
let (cap_header, _) = PciCapHdr::ref_from_prefix(cap_buf).unwrap();421
if cap_header.id == PciCapId::MSIX {422
let Ok((mut c, _)) = MsixCap::read_from_prefix(cap_buf) else {423
log::error!(424
"{}: MSIX capability is at an invalid offset: {cap_offset:#x}",425
cdev.name426
);427
continue;428
};429
c.control.set_enabled(false);430
c.control.set_masked(false);431
msix_info = Some((cap_offset, c.clone()));432
} else if cap_header.id == PciCapId::MSI {433
let Ok((mut c, _)) = MsiCapHdr::read_from_prefix(cap_buf) else {434
log::error!(435
"{}: MSI capability is at an invalid offset: {cap_offset:#x}",436
cdev.name437
);438
continue;439
};440
log::info!("{}: MSI cap header: {c:#x?}", cdev.name);441
c.control.set_enable(false);442
c.control.set_ext_msg_data_cap(true);443
let multi_msg_cap = min(5, c.control.multi_msg_cap());444
c.control.set_multi_msg_cap(multi_msg_cap);445
msi_info = Some((cap_offset, c));446
}447
cap_offset = cap_header.next as usize;448
}449
}450
451
let mut msix_cap = None;452
if let Some((offset, cap)) = msix_info {453
msix_cap = Some(cap.clone());454
let msix_cap_mmio = MsixCapMmio::new(cap);455
masked_caps.push((offset as u64, Box::new(msix_cap_mmio)));456
if let Some((offset, hdr)) = msi_info {457
let null_cap = NullCap {458
size: hdr.control.cap_size(),459
next: hdr.header.next,460
};461
masked_caps.push((offset as u64, Box::new(null_cap)));462
}463
} else if let Some((offset, hdr)) = msi_info {464
let count = 1 << hdr.control.multi_msg_cap();465
let irqfds = (0..count)466
.map(|_| msi_sender.create_irqfd())467
.collect::<Result<Box<_>, _>>()?;468
469
let mut eventfds = [-1; 32];470
for (fd, irqfd) in zip(&mut eventfds, &irqfds) {471
*fd = irqfd.as_fd().as_raw_fd();472
}473
let set_eventfd = VfioIrqSet {474
argsz: (size_of::<VfioIrqSet<0>>() + size_of::<i32>() * count) as u32,475
flags: VfioIrqSetFlag::DATA_EVENTFD | VfioIrqSetFlag::ACTION_TRIGGER,476
index: VfioPciIrq::MSI.raw(),477
start: 0,478
count: count as u32,479
data: VfioIrqSetData { eventfds },480
};481
cdev.dev.set_irqs(&set_eventfd)?;482
483
let mut msi_cap_mmio = MsiCapMmio::new(hdr.control, irqfds);484
msi_cap_mmio.set_next(hdr.header.next);485
masked_caps.push((offset as u64, Box::new(msi_cap_mmio)));486
}487
488
let mut extra_areas: MmioBus<Box<dyn PciConfigArea>> = MmioBus::new();489
masked_caps.sort_by_key(|(offset, _)| *offset);490
let mut area_end = 0x40;491
for (offset, cap) in masked_caps {492
if area_end < offset {493
extra_areas.add(494
area_end,495
Box::new(PthConfigArea {496
offset: region_config.offset + area_end,497
size: offset - area_end,498
dev: cdev.clone(),499
}),500
)?;501
}502
area_end = offset + Mmio::size(&*cap);503
extra_areas.add(offset, cap)?;504
}505
if area_end < region_config.size {506
extra_areas.add(507
area_end,508
Box::new(PthConfigArea {509
offset: region_config.offset + area_end,510
size: region_config.size - area_end,511
dev: cdev.clone(),512
}),513
)?;514
}515
516
let config_header = ConfigHeader::Device(dev_header);517
518
cdev.reset()?;519
520
let msix_count = match &msix_cap {521
Some(cap) => cap.control.table_len() + 1,522
None => 0,523
};524
let msix_entries = RwLock::new(525
(0..msix_count)526
.map(|_| MsixTableMmioEntry::Entry(MsixTableEntry::default()))527
.collect(),528
);529
530
let msix_table = Arc::new(MsixTableMmio {531
entries: msix_entries,532
});533
534
let mut bars = [const { PciBar::Empty }; 6];535
let bar_vals = config_header.bars();536
537
for index in VfioPciRegion::BAR0.raw()..=VfioPciRegion::BAR5.raw() {538
let region_info = cdev.dev.get_region_info(index)?;539
if region_info.size == 0 {540
continue;541
}542
let region = if region_info.flags.contains(VfioRegionInfoFlag::MMAP) {543
create_mappable_bar_region(544
cdev.clone(),545
index,546
®ion_info,547
msix_cap.as_ref(),548
msix_table.clone(),549
msi_sender.clone(),550
)?551
} else {552
MemRegion::with_emulated(553
Arc::new(PthBarRegion {554
cdev: cdev.clone(),555
size: region_info.size as usize,556
offset: region_info.offset,557
}),558
MemRegionType::Hidden,559
)560
};561
let index = index as usize;562
let bar_val = bar_vals[index];563
if bar_val & BAR_IO == BAR_IO {564
let MemRange::Emulated(range) = ®ion.ranges[0] else {565
unreachable!()566
};567
bars[index] = PciBar::Io(Arc::new(IoRegion {568
range: range.clone(),569
callbacks: Mutex::new(vec![]),570
}))571
} else {572
bars[index] = PciBar::Mem(Arc::new(region));573
}574
}575
576
Ok(VfioPciDev {577
config: PciPthConfig {578
header: EmulatedHeader::new(config_header, bars),579
extra: extra_areas,580
dev: cdev,581
},582
msix_table,583
})584
}585
586
fn reset(&self) -> Result<()> {587
let is_irqfd = |e| matches!(e, &MsixTableMmioEntry::IrqFd(_));588
if self.msix_table.entries.read().iter().any(is_irqfd) {589
let dev = &self.config.dev;590
if let Err(e) = dev.dev.disable_all_irqs(VfioPciIrq::MSIX) {591
log::error!("{}: failed to disable MSIX IRQs: {e:?}", dev.name)592
}593
}594
595
self.msix_table.reset();596
self.config.dev.reset()597
}598
}599
600
/// One emulated segment of a BAR that contains the MSI-X table and/or PBA.
#[derive(Debug)]
struct MsixBarMmio<M, D>
where
    M: MsiSender,
{
    // Emulated MSI-X table, shared with the config-space capability.
    table: Arc<MsixTableMmio<M::IrqFd>>,
    // Used to create irqfds when the guest unmasks a vector.
    msi_sender: Arc<M>,
    // Byte range of the MSI-X table within the whole BAR.
    table_range: Range<usize>,
    #[allow(dead_code)]
    pba: Arc<[AtomicU64]>, // TODO
    // Byte range of the PBA within the whole BAR.
    pba_range: Range<usize>,
    cdev: Arc<VfioDev<D>>,
    // Offset of the BAR within the VFIO device fd.
    cdev_offset: u64,
    // Start of this emulated segment within the BAR.
    region_start: usize,
    // Size of this emulated segment in bytes.
    region_size: usize,
}
617
impl<M, D> MsixBarMmio<M, D>
where
    M: MsiSender,
    D: Device,
{
    /// Upgrades the software-emulated entry for MSI-X vector `index` into an
    /// irqfd and (re-)registers all active irqfds with VFIO.
    ///
    /// Called after a guest write to the MSI-X table. It is a no-op when the
    /// index is out of range, the entry is already an irqfd, or the vector is
    /// still masked.
    fn enable_irqfd(&self, index: usize) -> Result<()> {
        let mut entries = self.table.entries.write();
        let Some(entry) = entries.get_mut(index) else {
            log::error!(
                "{}: MSIX-X index {index} is out of range ({})",
                self.cdev.name,
                entries.len()
            );
            return Ok(());
        };
        // Only plain entries are upgraded; existing irqfds are left alone.
        let MsixTableMmioEntry::Entry(e) = &*entry else {
            return Ok(());
        };
        if e.control.masked() {
            return Ok(());
        }

        // Program a new irqfd with the entry's address/data and swap it in.
        log::debug!("{}: enabling irqfd for MSI-X {index}", self.cdev.name);
        let irqfd = self.msi_sender.create_irqfd()?;
        irqfd.set_addr_hi(e.addr_hi)?;
        irqfd.set_addr_lo(e.addr_lo)?;
        irqfd.set_data(e.data)?;
        irqfd.set_masked(false)?;
        *entry = MsixTableMmioEntry::IrqFd(irqfd);

        // If a device IRQ has flag NORESIZE, it must be disabled before a new
        // subindex can be enabled.
        // However if this IRQ has been disabled, VFIO returns error if we try
        // to call disable_all_irqs(). This happens when the guest enables a
        // subindex for the first time.
        // As long as the following set_irqs() succeeds, we can safely ignore
        // the error here.
        let _ = self.cdev.dev.disable_all_irqs(VfioPciIrq::MSIX);

        // Gather eventfds for all currently enabled vectors; -1 marks vectors
        // without an irqfd. NOTE(review): the 2048 array size looks like the
        // MSI-X architectural table limit — confirm `entries` can never
        // exceed it.
        let mut eventfds = [-1; 2048];
        let mut count = 0;
        for (index, (entry, fd)) in std::iter::zip(entries.iter(), &mut eventfds).enumerate() {
            let MsixTableMmioEntry::IrqFd(irqfd) = entry else {
                continue;
            };
            // `count` ends up covering up to the highest enabled vector.
            count = index + 1;
            *fd = irqfd.as_fd().as_raw_fd();
        }
        let vfio_irq_set_eventfd = VfioIrqSet {
            argsz: (size_of::<VfioIrqSet<0>>() + size_of::<i32>() * count) as u32,
            flags: VfioIrqSetFlag::DATA_EVENTFD | VfioIrqSetFlag::ACTION_TRIGGER,
            index: VfioPciIrq::MSIX.raw(),
            start: 0,
            count: count as u32,
            data: VfioIrqSetData { eventfds },
        };
        self.cdev.dev.set_irqs(&vfio_irq_set_eventfd)
    }
}
677
impl<M, D> Mmio for MsixBarMmio<M, D>
where
    M: MsiSender,
    D: Device,
{
    fn size(&self) -> u64 {
        self.region_size as u64
    }

    /// Routes a read to the emulated MSI-X table, the (unimplemented) PBA, or
    /// the pass-through device, based on where it lands within the BAR.
    fn read(&self, offset: u64, size: u8) -> mem::Result<u64> {
        // Translate from segment-relative to whole-BAR offset.
        let offset = self.region_start + offset as usize;
        let name = &self.cdev.name;
        // Any access overlapping the table goes to the emulated table.
        // NOTE(review): an access straddling table_range.start (offset <
        // start but offset + size > start) would make the subtraction below
        // underflow — confirm such accesses cannot occur here.
        if offset < self.table_range.end && offset + size as usize > self.table_range.start {
            let offset = offset - self.table_range.start;
            self.table.read(offset as u64, size)
        } else if self.pba_range.contains(&offset) {
            // PBA emulation is not implemented yet; reads return 0.
            log::error!("{name}: reading pba at {offset:#x}, size={size}: unimplemented",);
            Ok(0)
        } else {
            // Everything else in this segment falls through to the device.
            log::trace!("{name}: emulated BAR read at {offset:#x}, size={size}",);
            self.cdev.dev.read(self.cdev_offset + offset as u64, size)
        }
    }

    /// Routes a write like `read`; a table write that enables a vector also
    /// triggers irqfd setup.
    fn write(&self, offset: u64, size: u8, val: u64) -> mem::Result<Action> {
        // Translate from segment-relative to whole-BAR offset.
        let offset = self.region_start + offset as usize;
        let name = &self.cdev.name;
        // NOTE(review): same straddling-access underflow concern as read().
        if offset < self.table_range.end && offset + size as usize > self.table_range.start {
            let offset = offset - self.table_range.start;
            // write_val returns true when the entry may now need an irqfd.
            if self.table.write_val(offset as u64, size, val)? {
                self.enable_irqfd(offset / size_of::<MsixTableEntry>())
                    .box_trace(mem::error::Mmio)?;
            }
        } else if self.pba_range.contains(&offset) {
            // PBA writes are dropped (unimplemented).
            log::error!(
                "{name}: writing pba at {offset:#x}, size={size}, val={val:#x}: unimplemented",
            );
        } else {
            log::trace!("{name}: emulated BAR write at {offset:#x}, size={size}, val={val:#x}",);
            self.cdev
                .dev
                .write(self.cdev_offset + offset as u64, size, val)?;
        }
        Ok(Action::None)
    }
}