Skip to content

Commit 5276198

Browse files
committed
Adding explicit type casting for vector optimization in LoongArch architecture
1 parent 5da6e04 commit 5276198

File tree

4 files changed

+104
-102
lines changed

4 files changed

+104
-102
lines changed

bin/activate-emsdk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ EMSDK_PATH = os.path.join(EMSDK_ROOT, 'emsdk.py')
1818
EMSDK_VERSION = '4.0.7'
1919

2020
def main():
21-
if sysconfig.get_platform() in ['linux-aarch64', 'linux-arm64']:
21+
if sysconfig.get_platform() in ['linux-aarch64', 'linux-arm64', 'linux-loongarch64']:
2222
# This platform cannot install emsdk at the provided version. See
2323
# https://github.com/emscripten-core/emsdk/blob/main/emscripten-releases-tags.json#L5
2424
# for the latest version

bin/fetch-gn

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ os.chdir(os.path.join(os.path.dirname(__file__), os.pardir))
2020
gnzip = os.path.join(tempfile.mkdtemp(), 'gn.zip')
2121
with open(gnzip, 'wb') as f:
2222
OS = {'darwin': 'mac', 'linux': 'linux', 'linux2': 'linux', 'win32': 'windows'}[sys.platform]
23-
cpu = {'aarch64': 'arm64', 'amd64': 'amd64', 'arm64': 'arm64', 'x86_64': 'amd64'}[platform.machine().lower()]
23+
cpu = {'aarch64': 'arm64', 'amd64': 'amd64', 'arm64': 'arm64', 'x86_64': 'amd64', 'loongarch64': 'loongarch64'}[platform.machine().lower()]
2424

2525
rev = 'b2afae122eeb6ce09c52d63f67dc53fc517dbdc8'
2626
url = 'https://chrome-infra-packages.appspot.com/dl/gn/gn/{}-{}/+/git_revision:{}'.format(

src/core/SkBlurEngine.cpp

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -556,62 +556,64 @@ class ThreeBoxApproxPass final : public Pass {
556556
skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor;
557557
skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor;
558558
skvx::Vec<4, uint32_t>* buffer2Cursor = fBuffer2Cursor;
559-
v4u32 sum0 = __lsx_vld(fSum0, 0); // same as skvx::Vec<4, uint32_t>::Load(fSum0);
560-
v4u32 sum1 = __lsx_vld(fSum1, 0);
561-
v4u32 sum2 = __lsx_vld(fSum2, 0);
559+
v4u32 sum0 = (v4u32)__lsx_vld(fSum0, 0); // same as skvx::Vec<4, uint32_t>::Load(fSum0);
560+
v4u32 sum1 = (v4u32)__lsx_vld(fSum1, 0);
561+
v4u32 sum2 = (v4u32)__lsx_vld(fSum2, 0);
562562

563563
auto processValue = [&](v4u32& vLeadingEdge){
564564
sum0 += vLeadingEdge;
565565
sum1 += sum0;
566566
sum2 += sum1;
567567

568-
v4u32 divisorFactor = __lsx_vreplgr2vr_w(fDivider.divisorFactor());
569-
v4u32 blurred = __lsx_vmuh_w(divisorFactor, sum2);
568+
v4u32 divisorFactor = (v4u32)__lsx_vreplgr2vr_w(fDivider.divisorFactor());
569+
v4u32 blurred = (v4u32)__lsx_vmuh_w((__m128i)divisorFactor, (__m128i)sum2);
570570

571-
v4u32 buffer2Value = __lsx_vld(buffer2Cursor, 0); //Not fBuffer0Cursor, out of bounds.
571+
v4u32 buffer2Value = (v4u32)__lsx_vld(buffer2Cursor, 0); //Not fBuffer0Cursor, out of bounds.
572572
sum2 -= buffer2Value;
573-
__lsx_vst(sum1, (void *)buffer2Cursor, 0);
573+
__lsx_vst((__m128i)sum1, (void *)buffer2Cursor, 0);
574574
buffer2Cursor = (buffer2Cursor + 1) < fBuffersEnd ? buffer2Cursor + 1 : fBuffer2;
575-
v4u32 buffer1Value = __lsx_vld(buffer1Cursor, 0);
575+
v4u32 buffer1Value = (v4u32)__lsx_vld(buffer1Cursor, 0);
576576
sum1 -= buffer1Value;
577-
__lsx_vst(sum0, (void *)buffer1Cursor, 0);
577+
__lsx_vst((__m128i)sum0, (void *)buffer1Cursor, 0);
578578
buffer1Cursor = (buffer1Cursor + 1) < fBuffer2 ? buffer1Cursor + 1 : fBuffer1;
579-
v4u32 buffer0Value = __lsx_vld(buffer0Cursor, 0);
579+
v4u32 buffer0Value = (v4u32)__lsx_vld(buffer0Cursor, 0);
580580
sum0 -= buffer0Value;
581-
__lsx_vst(vLeadingEdge, (void *)buffer0Cursor, 0);
581+
__lsx_vst((__m128i)vLeadingEdge, (void *)buffer0Cursor, 0);
582582
buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0;
583583

584584
v16u8 shuf = {0x0,0x4,0x8,0xc,0x0};
585-
v16u8 ret = __lsx_vshuf_b(blurred, blurred, shuf);
585+
v16u8 ret = (v16u8)__lsx_vshuf_b((__m128i)blurred, (__m128i)blurred, (__m128i)shuf);
586586
return ret;
587587
};
588588

589-
v4u32 zero = __lsx_vldi(0x0);
589+
v4u32 zero = (v4u32)__lsx_vldi(0x0);
590590
if (!src32 && !dst32) {
591591
while (n --> 0) {
592592
(void)processValue(zero);
593593
}
594594
} else if (src32 && !dst32) {
595595
while (n --> 0) {
596-
v4u32 edge = __lsx_vinsgr2vr_w(zero, *src32, 0);
597-
edge = __lsx_vilvl_b(zero, edge);
598-
edge = __lsx_vilvl_h(zero, edge);
596+
v4u32 edge = (v4u32)__lsx_vinsgr2vr_w((__m128i)zero, *src32, 0);
597+
edge = (v4u32)__lsx_vilvl_b((__m128i)zero, (__m128i)edge);
598+
edge = (v4u32)__lsx_vilvl_h((__m128i)zero, (__m128i)edge);
599599
(void)processValue(edge);
600600
src32 += srcStride;
601601
}
602602
} else if (!src32 && dst32) {
603603
while (n --> 0) {
604-
v4u32 ret = processValue(zero);
605-
__lsx_vstelm_w(ret, dst32, 0, 0); // 3rd is offset, 4th is idx.
604+
v16u8 ret_vec = processValue(zero);
605+
v4u32 ret = (v4u32)ret_vec;
606+
__lsx_vstelm_w((__m128i)ret, dst32, 0, 0); // 3rd is offset, 4th is idx.
606607
dst32 += dstStride;
607608
}
608609
} else if (src32 && dst32) {
609610
while (n --> 0) {
610-
v4u32 edge = __lsx_vinsgr2vr_w(zero, *src32, 0);
611-
edge = __lsx_vilvl_b(zero, edge);
612-
edge = __lsx_vilvl_h(zero, edge);
613-
v4u32 ret = processValue(edge);
614-
__lsx_vstelm_w(ret, dst32, 0, 0);
611+
v4u32 edge = (v4u32)__lsx_vinsgr2vr_w((__m128i)zero, *src32, 0);
612+
edge = (v4u32)__lsx_vilvl_b((__m128i)zero, (__m128i)edge);
613+
edge = (v4u32)__lsx_vilvl_h((__m128i)zero, (__m128i)edge);
614+
v16u8 ret_vec = processValue(edge);
615+
v4u32 ret = (v4u32)ret_vec;
616+
__lsx_vstelm_w((__m128i)ret, dst32, 0, 0);
615617
src32 += srcStride;
616618
dst32 += dstStride;
617619
}
@@ -622,9 +624,9 @@ class ThreeBoxApproxPass final : public Pass {
622624
fBuffer1Cursor = buffer1Cursor;
623625
fBuffer2Cursor = buffer2Cursor;
624626

625-
__lsx_vst(sum0, fSum0, 0);
626-
__lsx_vst(sum1, fSum1, 0);
627-
__lsx_vst(sum2, fSum2, 0);
627+
__lsx_vst((__m128i)sum0, fSum0, 0);
628+
__lsx_vst((__m128i)sum1, fSum1, 0);
629+
__lsx_vst((__m128i)sum2, fSum2, 0);
628630
#else
629631
skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor;
630632
skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor;

0 commit comments

Comments
 (0)