// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
15
use std::cmp::min;16
use std::fs::File;17
use std::iter::zip;18
use std::mem::size_of;19
use std::ops::Range;20
use std::os::fd::{AsFd, AsRawFd};21
use std::os::unix::fs::FileExt;22
use std::sync::Arc;23
use std::sync::atomic::AtomicU64;24
25
use libc::{PROT_READ, PROT_WRITE};26
use parking_lot::{Mutex, RwLock};27
use zerocopy::{FromBytes, transmute};28
29
use crate::device::Pause;30
use crate::errors::BoxTrace;31
use crate::hv::{IrqFd, MsiSender};32
use crate::mem::emulated::{Action, Mmio, MmioBus};33
use crate::mem::mapped::ArcMemPages;34
use crate::mem::{IoRegion, MemRange, MemRegion, MemRegionEntry, MemRegionType};35
use crate::pci::cap::{36
MsiCapHdr, MsiCapMmio, MsixCap, MsixCapMmio, MsixTableEntry, MsixTableMmio, MsixTableMmioEntry,37
NullCap, PciCap, PciCapHdr, PciCapId,38
};39
use crate::pci::config::{40
BAR_IO, Command, CommonHeader, ConfigHeader, DeviceHeader, EmulatedHeader, HeaderType,41
PciConfig, PciConfigArea, Status,42
};43
use crate::pci::{self, Pci, PciBar};44
use crate::sys::vfio::{45
VfioIrqSet, VfioIrqSetData, VfioIrqSetFlag, VfioPciIrq, VfioPciRegion, VfioRegionInfo,46
VfioRegionInfoFlag,47
};48
use crate::vfio::device::Device;49
use crate::vfio::{Result, error};50
use crate::{align_down, align_up, mem};51
52
fn round_up_range(range: Range<usize>) -> Range<usize> {53
(align_down!(range.start, 12))..(align_up!(range.end, 12))54
}55
56
fn create_mapped_bar_pages(57
fd: &File,58
region_flags: VfioRegionInfoFlag,59
offset: i64,60
size: usize,61
) -> Result<ArcMemPages> {62
let mut prot = 0;63
if region_flags.contains(VfioRegionInfoFlag::READ) {64
prot |= PROT_READ;65
}66
if region_flags.contains(VfioRegionInfoFlag::WRITE) {67
prot |= PROT_WRITE;68
}69
let mapped_pages = ArcMemPages::from_file(fd.try_clone()?, offset, size, prot)?;70
Ok(mapped_pages)71
}72
73
/// Builds a `MemRegion` for a mmap-able BAR, carving out up to two
/// page-aligned windows (`excluded_page1`, `excluded_page2`) that must be
/// emulated because they contain the MSI-X table and/or PBA. Everything
/// outside those windows is mapped directly from the device fd.
///
/// `table_range` / `pba_range` are byte ranges relative to the BAR start;
/// an empty range (`0..0`) means "not located in this BAR".
fn create_splitted_bar_region<I, M, D>(
    dev: Arc<VfioDev<D>>,
    region_info: &VfioRegionInfo,
    table_range: Range<usize>,
    pba_range: Range<usize>,
    msix_table: Arc<MsixTableMmio<I>>,
    msi_sender: Arc<M>,
) -> Result<MemRegion>
where
    I: IrqFd,
    M: MsiSender<IrqFd = I>,
    D: Device,
{
    // Expand both ranges to page granularity; emulation happens per page.
    let table_pages = round_up_range(table_range.clone());
    let pba_pages = round_up_range(pba_range.clone());
    // Reduce to at most two disjoint, ordered emulated windows. When the
    // table and PBA pages touch or overlap, merge them into a single
    // window carried in `excluded_page2`; an empty window is `0..0`.
    // NOTE: `.eq(0..0)` is `Iterator::eq`, so it holds for ANY empty range.
    let (excluded_page1, excluded_page2) = if table_pages.clone().eq(0..0) {
        (0..0, pba_pages)
    } else if pba_pages.clone().eq(0..0) {
        (0..0, table_pages)
    } else if table_pages.start <= pba_pages.start && table_pages.end >= pba_pages.start {
        // Table first, touching/overlapping the PBA: one merged window.
        (0..0, table_pages.start..pba_pages.end)
    } else if pba_pages.start <= table_pages.start && pba_pages.end >= table_pages.start {
        // PBA first, touching/overlapping the table: one merged window.
        (0..0, pba_pages.start..table_pages.end)
    } else if table_pages.end < pba_pages.start {
        (table_pages, pba_pages)
    } else {
        (pba_pages, table_pages)
    };
    // One entry covering the whole BAR; `ranges` below subdivides it.
    let mut region = MemRegion {
        callbacks: Mutex::new(vec![]),
        entries: vec![MemRegionEntry {
            size: region_info.size,
            type_: MemRegionType::Hidden,
        }],
        ranges: vec![],
    };
    // Directly mapped pages before the first emulated window.
    if excluded_page1.start > 0 {
        region.ranges.push(MemRange::DevMem(create_mapped_bar_pages(
            dev.dev.fd(),
            region_info.flags,
            region_info.offset as i64,
            excluded_page1.start,
        )?));
    }
    // First emulated window (non-empty only when table and PBA windows
    // are disjoint within this BAR).
    if excluded_page1.end - excluded_page1.start > 0 {
        region.ranges.push(MemRange::Emulated(Arc::new(MsixBarMmio {
            table: msix_table.clone(),
            table_range: table_range.clone(),
            msi_sender: msi_sender.clone(),
            pba: Arc::new([]),
            pba_range: pba_range.clone(),
            cdev: dev.clone(),
            cdev_offset: region_info.offset,
            region_start: excluded_page1.start,
            region_size: excluded_page1.end - excluded_page1.start,
        })));
    }
    // Directly mapped pages between the two emulated windows.
    if excluded_page2.start - excluded_page1.end > 0 {
        region.ranges.push(MemRange::DevMem(create_mapped_bar_pages(
            dev.dev.fd(),
            region_info.flags,
            region_info.offset as i64 + excluded_page1.end as i64,
            excluded_page2.start - excluded_page1.end,
        )?));
    }
    // Second (or only) emulated window.
    if excluded_page2.end - excluded_page2.start > 0 {
        region.ranges.push(MemRange::Emulated(Arc::new(MsixBarMmio {
            table: msix_table,
            table_range,
            msi_sender,
            pba: Arc::new([]),
            pba_range,
            cdev: dev.clone(),
            cdev_offset: region_info.offset,
            region_start: excluded_page2.start,
            region_size: excluded_page2.end - excluded_page2.start,
        })));
    }
    // Directly mapped pages after the last emulated window.
    if excluded_page2.end < region_info.size as usize {
        region.ranges.push(MemRange::DevMem(create_mapped_bar_pages(
            dev.dev.fd(),
            region_info.flags,
            region_info.offset as i64 + excluded_page2.end as i64,
            region_info.size as usize - excluded_page2.end,
        )?));
    }
    Ok(region)
}
162
/// Builds the `MemRegion` for mmap-able BAR `index`, emulating the
/// MSI-X table/PBA portions when they live in this BAR and passing the
/// rest through as directly mapped pages.
fn create_mappable_bar_region<I, M, D>(
    cdev: Arc<VfioDev<D>>,
    index: u32,
    region_info: &VfioRegionInfo,
    msix_cap: Option<&MsixCap>,
    msix_table: Arc<MsixTableMmio<I>>,
    msi_sender: Arc<M>,
) -> Result<MemRegion>
where
    I: IrqFd,
    M: MsiSender<IrqFd = I>,
    D: Device,
{
    let (msix_table_offset, msix_pba_offset, msix_control) = if let Some(msix_cap) = msix_cap {
        (msix_cap.table_offset, msix_cap.pba_offset, msix_cap.control)
    } else {
        // No MSI-X capability: the whole BAR can be mapped directly.
        return create_splitted_bar_region(cdev, region_info, 0..0, 0..0, msix_table, msi_sender);
    };
    // The MSI-X Table Size field is N-1 encoded, hence the +1.
    let num_msix_entries = msix_control.table_len() as usize + 1;
    // The low 3 bits of the table/PBA offset registers encode the BAR
    // indicator (BIR); mask them off to get the byte offset.
    let table_offset = msix_table_offset.0 as usize & !0b111;
    let pba_offset = msix_pba_offset.0 as usize & !0b111;
    let table_range = table_offset..(table_offset + size_of::<MsixTableEntry>() * num_msix_entries);
    // PBA size: one pending bit per vector, rounded up to 64-bit words,
    // converted to bytes (>> 3).
    let pba_range = pba_offset..(pba_offset + (align_up!(num_msix_entries, 6) >> 3));

    // Only exclude the ranges that actually live in this BAR.
    if msix_table_offset.bar() == index && msix_pba_offset.bar() == index {
        create_splitted_bar_region(
            cdev,
            region_info,
            table_range,
            pba_range,
            msix_table,
            msi_sender,
        )
    } else if msix_table_offset.bar() == index {
        create_splitted_bar_region(cdev, region_info, table_range, 0..0, msix_table, msi_sender)
    } else if msix_pba_offset.bar() == index {
        create_splitted_bar_region(cdev, region_info, 0..0, pba_range, msix_table, msi_sender)
    } else {
        create_splitted_bar_region(cdev, region_info, 0..0, 0..0, msix_table, msi_sender)
    }
}
204
/// A named VFIO device handle, shared via `Arc` between the config
/// space, BAR regions, and MSI-X emulation.
#[derive(Debug)]
struct VfioDev<D> {
    // Device name, used as a prefix in log messages.
    name: Arc<str>,
    // The underlying VFIO device.
    dev: D,
}
210
/// A passthrough window of PCI config space: accesses are forwarded
/// verbatim to the VFIO device at a fixed offset.
#[derive(Debug)]
struct PthConfigArea<D> {
    offset: u64, // offset into the device fd (config region base + window start)
    size: u64,   // window length in bytes
    dev: Arc<VfioDev<D>>,
}
217
impl<D> Mmio for PthConfigArea<D>218
where219
D: Device,220
{221
fn size(&self) -> u64 {222
self.size223
}224
225
fn read(&self, offset: u64, size: u8) -> mem::Result<u64> {226
self.dev.dev.read(self.offset + offset, size)227
}228
229
fn write(&self, offset: u64, size: u8, val: u64) -> mem::Result<Action> {230
self.dev.dev.write(self.offset + offset, size, val)?;231
Ok(Action::None)232
}233
}234
235
impl<D> PciConfigArea for PthConfigArea<D>
where
    D: Device,
{
    // Nothing to do: this window holds no emulated state; the device
    // itself is reset elsewhere (see VfioPciDev::reset).
    fn reset(&self) -> pci::Result<()> {
        Ok(())
    }
}
244
/// Guest-visible PCI config space of a passthrough device: an emulated
/// standard header plus a bus of areas covering the rest of the 4 KiB
/// space (emulated capabilities and passthrough gaps).
#[derive(Debug)]
pub struct PciPthConfig<D> {
    // Emulated standard configuration header.
    header: EmulatedHeader,
    // Everything past the header: emulated caps + passthrough windows.
    extra: MmioBus<Box<dyn PciConfigArea>>,
    dev: Arc<VfioDev<D>>,
}
251
impl<D> Mmio for PciPthConfig<D>252
where253
D: Device,254
{255
fn read(&self, offset: u64, size: u8) -> mem::Result<u64> {256
if offset < self.header.size() {257
Mmio::read(&self.header, offset, size)258
} else {259
self.extra.read(offset, size)260
}261
}262
263
fn size(&self) -> u64 {264
4096265
}266
267
fn write(&self, offset: u64, size: u8, val: u64) -> mem::Result<Action> {268
if offset < self.header.size() {269
Mmio::write(&self.header, offset, size, val)270
} else {271
self.extra.write(offset, size, val)272
}273
}274
}275
276
impl<D> PciConfig for PciPthConfig<D>277
where278
D: Device,279
{280
fn get_header(&self) -> &EmulatedHeader {281
&self.header282
}283
284
fn reset(&self) -> pci::Result<()> {285
self.header.reset()?;286
for (_, area) in self.extra.inner.iter() {287
area.reset()?;288
}289
Ok(())290
}291
}292
293
/// A fully emulated BAR: every access is forwarded to the VFIO device.
/// Used for BAR regions that do not support mmap.
#[derive(Debug)]
pub struct PthBarRegion<D> {
    cdev: Arc<VfioDev<D>>,
    // BAR size in bytes.
    size: usize,
    // Offset of this BAR region within the device fd.
    offset: u64,
}
300
impl<D> Mmio for PthBarRegion<D>301
where302
D: Device,303
{304
fn size(&self) -> u64 {305
self.size as u64306
}307
308
fn read(&self, offset: u64, size: u8) -> mem::Result<u64> {309
log::trace!(310
"{}: emulated read at {offset:#x}, size={size}",311
self.cdev.name312
);313
self.cdev.dev.read(self.offset + offset, size)314
}315
316
fn write(&self, offset: u64, size: u8, val: u64) -> mem::Result<Action> {317
log::trace!(318
"{}: emulated write at {offset:#x}, val={val:#x}, size={size}",319
self.cdev.name320
);321
self.cdev.dev.write(self.offset + offset, size, val)?;322
Ok(Action::None)323
}324
}325
326
/// A PCI device passed through to the guest via VFIO.
#[derive(Debug)]
pub struct VfioPciDev<M, D>
where
    M: MsiSender,
{
    // Guest-visible configuration space.
    config: PciPthConfig<D>,
    // Emulated MSI-X table, shared with the BAR MMIO regions.
    msix_table: Arc<MsixTableMmio<M::IrqFd>>,
}
335
// Relies entirely on `Pause`'s provided defaults — no device-specific
// pause/resume handling is needed for passthrough.
impl<M, D> Pause for VfioPciDev<M, D>
where
    M: MsiSender,
    D: Device,
{
}
342
impl<M, D> Pci for VfioPciDev<M, D>343
where344
D: Device,345
M: MsiSender,346
{347
fn name(&self) -> &str {348
&self.config.dev.name349
}350
351
fn config(&self) -> &dyn PciConfig {352
&self.config353
}354
355
fn reset(&self) -> pci::Result<()> {356
let ret = VfioPciDev::reset(self);357
ret.box_trace(pci::error::Reset)?;358
Ok(())359
}360
}361
362
impl<M, D> VfioPciDev<M, D>363
where364
M: MsiSender,365
D: Device,366
{367
pub fn new(name: Arc<str>, dev: D, msi_sender: M) -> Result<VfioPciDev<M, D>> {368
let cdev = Arc::new(VfioDev { dev, name });369
370
let msi_sender = Arc::new(msi_sender);371
372
let region_config = cdev.dev.get_region_info(VfioPciRegion::CONFIG.raw())?;373
374
let pci_command = Command::IO | Command::MEM | Command::BUS_MASTER | Command::INTX_DISABLE;375
cdev.dev.write(376
region_config.offset + CommonHeader::OFFSET_COMMAND as u64,377
CommonHeader::SIZE_COMMAND as u8,378
pci_command.bits() as _,379
)?;380
let mut buf: [u8; 4096] = transmute!([0u32; 1024]);381
cdev.dev.fd().read_at(&mut buf, region_config.offset)?;382
383
let (mut dev_header, _) = DeviceHeader::read_from_prefix(&buf).unwrap();384
let header_type = dev_header.common.header_type.raw() & !(1 << 7);385
if header_type != HeaderType::DEVICE.raw() {386
return error::NotSupportedHeader { ty: header_type }.fail();387
}388
dev_header.common.header_type = HeaderType::DEVICE;389
dev_header.intx_pin = 0;390
dev_header.common.command = Command::empty();391
392
let mut masked_caps: Vec<(u64, Box<dyn PciConfigArea>)> = vec![];393
let mut msix_info = None;394
let mut msi_info = None;395
396
if dev_header.common.status.contains(Status::CAP) {397
let mut cap_offset = dev_header.capability_pointer as usize;398
while cap_offset != 0 {399
let Some(cap_buf) = buf.get(cap_offset..) else {400
log::error!("{}: invalid cap offset: {cap_offset:#x}", cdev.name);401
break;402
};403
let (cap_header, _) = PciCapHdr::ref_from_prefix(cap_buf).unwrap();404
if cap_header.id == PciCapId::MSIX {405
let Ok((mut c, _)) = MsixCap::read_from_prefix(cap_buf) else {406
log::error!(407
"{}: MSIX capability is at an invalid offset: {cap_offset:#x}",408
cdev.name409
);410
continue;411
};412
c.control.set_enabled(false);413
c.control.set_masked(false);414
msix_info = Some((cap_offset, c.clone()));415
} else if cap_header.id == PciCapId::MSI {416
let Ok((mut c, _)) = MsiCapHdr::read_from_prefix(cap_buf) else {417
log::error!(418
"{}: MSI capability is at an invalid offset: {cap_offset:#x}",419
cdev.name420
);421
continue;422
};423
log::info!("{}: MSI cap header: {c:#x?}", cdev.name);424
c.control.set_enable(false);425
c.control.set_ext_msg_data_cap(true);426
let multi_msg_cap = min(5, c.control.multi_msg_cap());427
c.control.set_multi_msg_cap(multi_msg_cap);428
msi_info = Some((cap_offset, c));429
}430
cap_offset = cap_header.next as usize;431
}432
}433
434
let mut msix_cap = None;435
if let Some((offset, cap)) = msix_info {436
msix_cap = Some(cap.clone());437
let msix_cap_mmio = MsixCapMmio::new(cap);438
masked_caps.push((offset as u64, Box::new(msix_cap_mmio)));439
if let Some((offset, hdr)) = msi_info {440
let null_cap = NullCap {441
size: hdr.control.cap_size(),442
next: hdr.header.next,443
};444
masked_caps.push((offset as u64, Box::new(null_cap)));445
}446
} else if let Some((offset, hdr)) = msi_info {447
let count = 1 << hdr.control.multi_msg_cap();448
let irqfds = (0..count)449
.map(|_| msi_sender.create_irqfd())450
.collect::<Result<Box<_>, _>>()?;451
452
let mut eventfds = [-1; 32];453
for (fd, irqfd) in zip(&mut eventfds, &irqfds) {454
*fd = irqfd.as_fd().as_raw_fd();455
}456
let set_eventfd = VfioIrqSet {457
argsz: (size_of::<VfioIrqSet<0>>() + size_of::<i32>() * count) as u32,458
flags: VfioIrqSetFlag::DATA_EVENTFD | VfioIrqSetFlag::ACTION_TRIGGER,459
index: VfioPciIrq::MSI.raw(),460
start: 0,461
count: count as u32,462
data: VfioIrqSetData { eventfds },463
};464
cdev.dev.set_irqs(&set_eventfd)?;465
466
let mut msi_cap_mmio = MsiCapMmio::new(hdr.control, irqfds);467
msi_cap_mmio.set_next(hdr.header.next);468
masked_caps.push((offset as u64, Box::new(msi_cap_mmio)));469
}470
471
let mut extra_areas: MmioBus<Box<dyn PciConfigArea>> = MmioBus::new();472
masked_caps.sort_by_key(|(offset, _)| *offset);473
let mut area_end = 0x40;474
for (offset, cap) in masked_caps {475
if area_end < offset {476
extra_areas.add(477
area_end,478
Box::new(PthConfigArea {479
offset: region_config.offset + area_end,480
size: offset - area_end,481
dev: cdev.clone(),482
}),483
)?;484
}485
area_end = offset + Mmio::size(&*cap);486
extra_areas.add(offset, cap)?;487
}488
if area_end < region_config.size {489
extra_areas.add(490
area_end,491
Box::new(PthConfigArea {492
offset: region_config.offset + area_end,493
size: region_config.size - area_end,494
dev: cdev.clone(),495
}),496
)?;497
}498
499
let config_header = ConfigHeader::Device(dev_header);500
501
cdev.dev.reset()?;502
503
let msix_count = match &msix_cap {504
Some(cap) => cap.control.table_len() + 1,505
None => 0,506
};507
let msix_entries = RwLock::new(508
(0..msix_count)509
.map(|_| MsixTableMmioEntry::Entry(MsixTableEntry::default()))510
.collect(),511
);512
513
let msix_table = Arc::new(MsixTableMmio {514
entries: msix_entries,515
});516
517
let mut bars = [const { PciBar::Empty }; 6];518
let bar_vals = config_header.bars();519
520
for index in VfioPciRegion::BAR0.raw()..=VfioPciRegion::BAR5.raw() {521
let region_info = cdev.dev.get_region_info(index)?;522
if region_info.size == 0 {523
continue;524
}525
let region = if region_info.flags.contains(VfioRegionInfoFlag::MMAP) {526
create_mappable_bar_region(527
cdev.clone(),528
index,529
®ion_info,530
msix_cap.as_ref(),531
msix_table.clone(),532
msi_sender.clone(),533
)?534
} else {535
MemRegion::with_emulated(536
Arc::new(PthBarRegion {537
cdev: cdev.clone(),538
size: region_info.size as usize,539
offset: region_info.offset,540
}),541
MemRegionType::Hidden,542
)543
};544
let index = index as usize;545
let bar_val = bar_vals[index];546
if bar_val & BAR_IO == BAR_IO {547
let MemRange::Emulated(range) = ®ion.ranges[0] else {548
unreachable!()549
};550
bars[index] = PciBar::Io(Arc::new(IoRegion {551
range: range.clone(),552
callbacks: Mutex::new(vec![]),553
}))554
} else {555
bars[index] = PciBar::Mem(Arc::new(region));556
}557
}558
559
Ok(VfioPciDev {560
config: PciPthConfig {561
header: EmulatedHeader::new(config_header, bars),562
extra: extra_areas,563
dev: cdev,564
},565
msix_table,566
})567
}568
569
fn reset(&self) -> Result<()> {570
let is_irqfd = |e| matches!(e, &MsixTableMmioEntry::IrqFd(_));571
if self.msix_table.entries.read().iter().any(is_irqfd) {572
let dev = &self.config.dev;573
if let Err(e) = dev.dev.disable_all_irqs(VfioPciIrq::MSIX) {574
log::error!("{}: failed to disable MSIX IRQs: {e:?}", dev.name)575
}576
}577
578
self.msix_table.reset();579
self.config.dev.dev.reset()580
}581
}582
583
/// Emulated chunk of a BAR containing the MSI-X table and/or PBA.
/// Accesses inside `table_range` hit the emulated table, accesses inside
/// `pba_range` are unimplemented, and everything else is forwarded to
/// the device.
#[derive(Debug)]
struct MsixBarMmio<M, D>
where
    M: MsiSender,
{
    // Emulated MSI-X table, shared with VfioPciDev.
    table: Arc<MsixTableMmio<M::IrqFd>>,
    msi_sender: Arc<M>,
    // MSI-X table byte range, relative to the BAR start.
    table_range: Range<usize>,
    #[allow(dead_code)]
    pba: Arc<[AtomicU64]>, // TODO: back PBA accesses with this storage
    // MSI-X PBA byte range, relative to the BAR start.
    pba_range: Range<usize>,
    cdev: Arc<VfioDev<D>>,
    // Offset of the containing BAR region within the device fd.
    cdev_offset: u64,
    // Start of this emulated chunk, relative to the BAR start.
    region_start: usize,
    // Length of this emulated chunk in bytes.
    region_size: usize,
}
600
impl<M, D> MsixBarMmio<M, D>
where
    M: MsiSender,
    D: Device,
{
    /// Promotes MSI-X table entry `index` from a plain `Entry` to an
    /// active irqfd, then (re-)registers all active irqfds with VFIO.
    ///
    /// Called after the guest writes a table entry. No-op when the index
    /// is out of range, the slot is already an irqfd, or the entry is
    /// still masked.
    fn enable_irqfd(&self, index: usize) -> Result<()> {
        let mut entries = self.table.entries.write();
        let Some(entry) = entries.get_mut(index) else {
            log::error!(
                "{}: MSIX-X index {index} is out of range ({})",
                self.cdev.name,
                entries.len()
            );
            return Ok(());
        };
        // Only promote plain entries; an existing IrqFd is already wired.
        let MsixTableMmioEntry::Entry(e) = &*entry else {
            return Ok(());
        };
        if e.control.masked() {
            return Ok(());
        }

        log::debug!("{}: enabling irqfd for MSI-X {index}", self.cdev.name);
        // Program the irqfd with the guest-written message before swapping
        // it into the table.
        let irqfd = self.msi_sender.create_irqfd()?;
        irqfd.set_addr_hi(e.addr_hi)?;
        irqfd.set_addr_lo(e.addr_lo)?;
        irqfd.set_data(e.data)?;
        irqfd.set_masked(false)?;
        *entry = MsixTableMmioEntry::IrqFd(irqfd);

        // If a device IRQ has flag NORESIZE, it must be disabled before a new
        // subindex can be enabled.
        // However if this IRQ has been disabled, VFIO returns error if we try
        // to call disable_all_irqs(). This happens when the guest enables a
        // subindex for the first time.
        // As long as the following set_irqs() succeeds, we can safely ignore
        // the error here.
        let _ = self.cdev.dev.disable_all_irqs(VfioPciIrq::MSIX);

        // Gather eventfds for vectors 0..=highest active index; inactive
        // slots stay -1. 2048 is the MSI-X architectural maximum.
        let mut eventfds = [-1; 2048];
        let mut count = 0;
        for (index, (entry, fd)) in std::iter::zip(entries.iter(), &mut eventfds).enumerate() {
            let MsixTableMmioEntry::IrqFd(irqfd) = entry else {
                continue;
            };
            count = index + 1;
            *fd = irqfd.as_fd().as_raw_fd();
        }
        let vfio_irq_set_eventfd = VfioIrqSet {
            // argsz only covers the first `count` eventfds.
            argsz: (size_of::<VfioIrqSet<0>>() + size_of::<i32>() * count) as u32,
            flags: VfioIrqSetFlag::DATA_EVENTFD | VfioIrqSetFlag::ACTION_TRIGGER,
            index: VfioPciIrq::MSIX.raw(),
            start: 0,
            count: count as u32,
            data: VfioIrqSetData { eventfds },
        };
        self.cdev.dev.set_irqs(&vfio_irq_set_eventfd)
    }
}
660
impl<M, D> Mmio for MsixBarMmio<M, D>
where
    M: MsiSender,
    D: Device,
{
    fn size(&self) -> u64 {
        self.region_size as u64
    }

    /// Reads the emulated MSI-X table, the (unimplemented) PBA, or the
    /// underlying device, depending on where the access lands.
    fn read(&self, offset: u64, size: u8) -> mem::Result<u64> {
        // Translate from chunk-relative to BAR-relative offset.
        let offset = self.region_start + offset as usize;
        let name = &self.cdev.name;
        // Overlap test: [offset, offset + size) intersects the table.
        if offset < self.table_range.end && offset + size as usize > self.table_range.start {
            let offset = offset - self.table_range.start;
            self.table.read(offset as u64, size)
        } else if self.pba_range.contains(&offset) {
            // TODO: PBA emulation; reads currently return all-clear.
            log::error!("{name}: reading pba at {offset:#x}, size={size}: unimplemented",);
            Ok(0)
        } else {
            log::trace!("{name}: emulated BAR read at {offset:#x}, size={size}",);
            self.cdev.dev.read(self.cdev_offset + offset as u64, size)
        }
    }

    fn write(&self, offset: u64, size: u8, val: u64) -> mem::Result<Action> {
        // Translate from chunk-relative to BAR-relative offset.
        let offset = self.region_start + offset as usize;
        let name = &self.cdev.name;
        if offset < self.table_range.end && offset + size as usize > self.table_range.start {
            let offset = offset - self.table_range.start;
            // write_val()'s `true` presumably means the entry changed such
            // that its vector may need (re)arming — confirm in MsixTableMmio.
            if self.table.write_val(offset as u64, size, val)? {
                self.enable_irqfd(offset / size_of::<MsixTableEntry>())
                    .box_trace(mem::error::Mmio)?;
            }
        } else if self.pba_range.contains(&offset) {
            log::error!(
                "{name}: writing pba at {offset:#x}, size={size}, val={val:#x}: unimplemented",
            );
        } else {
            log::trace!("{name}: emulated BAR write at {offset:#x}, size={size}, val={val:#x}",);
            self.cdev
                .dev
                .write(self.cdev_offset + offset as u64, size, val)?;
        }
        Ok(Action::None)
    }
}