Skip to content

Commit

Permalink
Some improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Jan 7, 2025
1 parent 6684e73 commit 422fecb
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 30 deletions.
15 changes: 5 additions & 10 deletions src/neon/plane_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,11 @@ macro_rules! accumulate_8_horiz {

/// Loads four consecutive pixel values starting at `$ptr`, widens them to
/// 16-bit lanes and multiply-accumulates them against `$weights` into `$store`.
///
/// * `$store` - accumulator; it is read and then reassigned with the result of
///   `vmlal_s16`, so it must be an assignable place expression.
/// * `$ptr` - raw pointer to the first of the four pixels
///   (presumably `*const u8`, given the `vmovl_u8` widening; confirm at call sites).
/// * `$weights` - 16-bit weight vector passed as the multiplier to `vmlal_s16`.
///
/// The expansion dereferences raw pointers and calls NEON intrinsics, so it has
/// to be used inside an `unsafe` context.
// NOTE(review): this body contains two back-to-back loads that bind the same
// names (`pixel_colors`, `px_16`). The second pair shadows the first, so only
// the single-lane `u32` load below actually reaches `vmlal_s16`; the scalar
// gather is dead code as written.
macro_rules! accumulate_4_horiz {
($store: expr, $ptr: expr, $weights: expr) => {{
// Scalar gather: four element-wise unaligned reads, each cast to u16, packed
// into a temporary array and loaded as one 4 x u16 vector.
let pixel_colors = vld1_u16(
[
$ptr.read_unaligned() as u16,
$ptr.add(1).read_unaligned() as u16,
$ptr.add(2).read_unaligned() as u16,
$ptr.add(3).read_unaligned() as u16,
]
.as_ptr(),
);
// This `px_16` is shadowed below and never read.
let px_16 = vreinterpret_s16_u16(pixel_colors);
// Single-load form: read 32 bits from `$ptr` into lane 0 of a zeroed vector,
// view them as bytes and widen every byte to u16.
// NOTE(review): the `as *const u32` cast assumes the lane load tolerates the
// alignment of `$ptr` — confirm.
let pixel_colors = vmovl_u8(vreinterpret_u8_u32(vld1_lane_u32::<0>(
$ptr as *const u32,
vdup_n_u32(0),
)));
// Only the low four u16 lanes carry loaded data; the rest come from the zero fill.
let px_16 = vreinterpret_s16_u16(vget_low_u16(pixel_colors));

// Widening multiply-accumulate: $store += px_16 * $weights, lane by lane.
$store = vmlal_s16($store, px_16, $weights);
}};
Expand Down
35 changes: 15 additions & 20 deletions src/neon/vertical_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -690,8 +690,7 @@ fn convolve_vertical_neon_row_full(
if bounds_size == 2 {
let py = bounds.start;
let weight = weight.get_unchecked(0..2);
let mut v_weight = vld1_dup_s16(weight.as_ptr());
v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32));
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);

Expand Down Expand Up @@ -719,8 +718,8 @@ fn convolve_vertical_neon_row_full(
} else if bounds_size == 3 {
let py = bounds.start;
let weight = weight.get_unchecked(0..3);
let mut v_weight = vld1_dup_s16(weight.as_ptr());
v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
let mut v_weight =
vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32));
v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight);
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
Expand Down Expand Up @@ -855,8 +854,7 @@ fn convolve_vertical_neon_row_full(
if bounds_size == 2 {
let py = bounds.start;
let weight = weight.get_unchecked(0..2);
let mut v_weight = vld1_dup_s16(weight.as_ptr());
v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32));
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
let items0 = xvld1q_u8_x2(src_ptr0.as_ptr());
Expand All @@ -871,8 +869,8 @@ fn convolve_vertical_neon_row_full(
} else if bounds_size == 3 {
let py = bounds.start;
let weight = weight.get_unchecked(0..3);
let mut v_weight = vld1_dup_s16(weight.as_ptr());
v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
let mut v_weight =
vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32));
v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight);
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
Expand Down Expand Up @@ -955,8 +953,7 @@ fn convolve_vertical_neon_row_full(
if bounds_size == 2 {
let py = bounds.start;
let weight = weight.get_unchecked(0..2);
let mut v_weight = vld1_dup_s16(weight.as_ptr());
v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32));
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
let item_row0 = vld1q_u8(src_ptr0.as_ptr());
Expand All @@ -966,8 +963,8 @@ fn convolve_vertical_neon_row_full(
} else if bounds_size == 3 {
let py = bounds.start;
let weight = weight.get_unchecked(0..3);
let mut v_weight = vld1_dup_s16(weight.as_ptr());
v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
let mut v_weight =
vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32));
v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight);
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
Expand Down Expand Up @@ -1025,8 +1022,7 @@ fn convolve_vertical_neon_row_full(
if bounds_size == 2 {
let py = bounds.start;
let weight = weight.get_unchecked(0..2);
let mut v_weight = vld1_dup_s16(weight.as_ptr());
v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32));
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
let item_row0 = vld1_u8(src_ptr0.as_ptr());
Expand All @@ -1041,8 +1037,8 @@ fn convolve_vertical_neon_row_full(
} else if bounds_size == 3 {
let py = bounds.start;
let weight = weight.get_unchecked(0..3);
let mut v_weight = vld1_dup_s16(weight.as_ptr());
v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
let mut v_weight =
vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32));
v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight);
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
Expand Down Expand Up @@ -1123,8 +1119,7 @@ fn convolve_vertical_neon_row_full(
if bounds_size == 2 {
let py = bounds.start;
let weight = weight.get_unchecked(0..2);
let mut v_weight = vld1_dup_s16(weight.as_ptr());
v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32));
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
let item_row0 = vld1_dup_u8(src_ptr0.as_ptr());
Expand All @@ -1137,8 +1132,8 @@ fn convolve_vertical_neon_row_full(
} else if bounds_size == 3 {
let py = bounds.start;
let weight = weight.get_unchecked(0..3);
let mut v_weight = vld1_dup_s16(weight.as_ptr());
v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
let mut v_weight =
vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32));
v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight);
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
Expand Down

0 comments on commit 422fecb

Please sign in to comment.