Skip to content

Commit

Permalink
feat: 添加最有匹配的偏移计算器,并为 cuda 添加池化流分配器
Browse files Browse the repository at this point in the history
Signed-off-by: YdrMaster <ydrml@hotmail.com>
  • Loading branch information
YdrMaster committed Jan 3, 2025
1 parent 6f51afa commit 94a081e
Show file tree
Hide file tree
Showing 3 changed files with 189 additions and 3 deletions.
107 changes: 107 additions & 0 deletions operators/src/common/calculator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
use std::{
cmp::Ordering,
collections::{BTreeSet, HashMap},
ops::Range,
};

#[derive(Debug)]
pub struct OffsetCalculator {
alignment: usize,
free_list: BTreeSet<Area>,
heads: HashMap<usize, usize>,
tails: HashMap<usize, usize>,
}

impl OffsetCalculator {
pub fn new(alignment: usize) -> Self {
Self {
alignment,
free_list: BTreeSet::new(),
heads: HashMap::new(),
tails: HashMap::new(),
}
}

pub fn put(&mut self, range: &Range<usize>) {
let len = range.len().div_ceil(self.alignment) * self.alignment;
if len == 0 {
return;
}

let mut head = range.start;
let mut tail = head + len;
if let Some(len_) = self.tails.remove(&head) {
head -= len_;
assert!(self.free_list.remove(&Area {
off: head,
len: len_,
}));
assert_eq!(self.heads.remove(&head), Some(len_));
}
if let Some(len_) = self.heads.remove(&tail) {
assert!(self.free_list.remove(&Area {
off: tail,
len: len_,
}));
tail += len_;
assert_eq!(self.tails.remove(&tail), Some(len_));
}

self.insert_area(Area {
off: head,
len: tail - head,
})
}

pub fn take(&mut self, expect: usize) -> Option<Range<usize>> {
let len = expect.div_ceil(self.alignment) * self.alignment;
if len == 0 {
return Some(usize::MAX..usize::MAX);
}

let &free = self.free_list.range(Area { off: 0, len }..).next()?;

let head = free.off;
let tail = free.off + free.len;

self.free_list.remove(&free);
self.heads.remove(&head);
self.tails.remove(&tail);

if free.len > len {
self.insert_area(Area {
off: free.off + len,
len: free.len - len,
})
}

Some(head..head + expect)
}

fn insert_area(&mut self, area: Area) {
self.free_list.insert(area);
self.heads.insert(area.off, area.len);
self.tails.insert(area.off + area.len, area.len);
}
}

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct Area {
off: usize,
len: usize,
}

impl PartialOrd for Area {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}

impl Ord for Area {
fn cmp(&self, other: &Self) -> Ordering {
match self.len.cmp(&other.len) {
Ordering::Equal => self.off.cmp(&other.off),
ord => ord,
}
}
}
2 changes: 2 additions & 0 deletions operators/src/common/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
mod blob;
mod calculator;
mod error;
mod maybe_dyn;
mod pool;
Expand All @@ -7,6 +8,7 @@ mod unsigned;
mod workspace;

pub use blob::Blob;
pub use calculator::OffsetCalculator;
pub use error::{functions::*, LaunchError, LaunchErrorKind, SchemeError, SchemeErrorKind};
pub use maybe_dyn::{dyn_, DynVal, MaybeDyn};
pub use pool::Pool;
Expand Down
83 changes: 80 additions & 3 deletions operators/src/handle/cuda/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,22 @@ mod module;
#[cfg(use_nccl)]
mod nccl;

use crate::{Alloc, Hardware, Pool, QueueAlloc, QueueOf, SchemeDiversity};
use crate::{Alloc, Hardware, OffsetCalculator, Pool, QueueAlloc, QueueOf, SchemeDiversity};
use cublas::{Cublas, CublasSpore};
use cuda::{
self, AsRaw, Context, ContextResource, ContextSpore, CurrentCtx, DevMem, Device, Stream,
Version,
self, AsRaw, Context, ContextResource, ContextSpore, CurrentCtx, DevByte, DevMem, Device,
Stream, Version,
};
use digit_layout::DigitLayout;
use libloading::Library;
use lru::LruCache;
use std::{
cell::RefCell,
collections::HashMap,
hash::Hash,
num::NonZeroUsize,
ops::{Deref, DerefMut, Range},
rc::Rc,
sync::{Arc, Mutex, RwLock, Weak},
};

Expand All @@ -35,6 +38,80 @@ impl Hardware for Gpu {
type Queue<'ctx> = cuda::Stream<'ctx>;
}

pub struct StreamMemPool<'ctx> {
stream: Stream<'ctx>,
mem_pool: Rc<RefCell<MemPool<'ctx>>>,
}

pub struct MemPool<'ctx> {
pool: Vec<DevMem<'ctx>>,
recorder: OffsetCalculator,
}

pub struct MemPoolBlob(Range<usize>);

impl Deref for MemPoolBlob {
type Target = [DevByte];
#[inline]
fn deref(&self) -> &Self::Target {
unsafe { std::slice::from_raw_parts(self.0.start as _, self.0.len()) }
}
}

impl DerefMut for MemPoolBlob {
fn deref_mut(&mut self) -> &mut Self::Target {
unsafe { std::slice::from_raw_parts_mut(self.0.start as _, self.0.len()) }
}
}

impl<'ctx> StreamMemPool<'ctx> {
pub fn new(stream: Stream<'ctx>) -> Self {
let alignment = stream.ctx().dev().alignment();
Self {
stream,
mem_pool: Rc::new(RefCell::new(MemPool {
pool: Vec::new(),
recorder: OffsetCalculator::new(alignment),
})),
}
}

pub fn put(&self, size: usize) {
let blob = self.stream.ctx().malloc::<u8>(size);
let area = blob.as_ptr_range();
let mut mem_pool = self.mem_pool.borrow_mut();
mem_pool.pool.push(blob);
mem_pool.recorder.put(&(area.start as _..area.end as _));
}
}

impl Alloc<MemPoolBlob> for StreamMemPool<'_> {
#[inline]
fn alloc(&self, size: usize) -> MemPoolBlob {
MemPoolBlob(
self.mem_pool
.borrow_mut()
.recorder
.take(size)
.expect("out of memory"),
)
}

#[inline]
fn free(&self, mem: MemPoolBlob) {
self.mem_pool.borrow_mut().recorder.put(&mem.0)
}
}

impl QueueAlloc for StreamMemPool<'_> {
type Hardware = Gpu;
type DevMem = MemPoolBlob;
#[inline]
fn queue(&self) -> &QueueOf<Self::Hardware> {
&self.stream
}
}

impl<'ctx> Alloc<DevMem<'ctx>> for &'ctx CurrentCtx {
#[inline]
fn alloc(&self, size: usize) -> DevMem<'ctx> {
Expand Down

0 comments on commit 94a081e

Please # to comment.