diff --git a/api/next/61642.txt b/api/next/61642.txt new file mode 100644 index 00000000000000..dd67874ab9fba4 --- /dev/null +++ b/api/next/61642.txt @@ -0,0 +1 @@ +pkg net/netip, method (Prefix) Compare(Prefix) int #61642 diff --git a/doc/next/6-stdlib/99-minor/net/netip/61642.md b/doc/next/6-stdlib/99-minor/net/netip/61642.md new file mode 100644 index 00000000000000..3d79f2e76aee04 --- /dev/null +++ b/doc/next/6-stdlib/99-minor/net/netip/61642.md @@ -0,0 +1 @@ +The new [Prefix.Compare] method compares two prefixes. diff --git a/src/bufio/bufio.go b/src/bufio/bufio.go index 5244ce2e0ca943..141a9a1a2a2305 100644 --- a/src/bufio/bufio.go +++ b/src/bufio/bufio.go @@ -311,10 +311,7 @@ func (b *Reader) ReadRune() (r rune, size int, err error) { if b.r == b.w { return 0, 0, b.readErr() } - r, size = rune(b.buf[b.r]), 1 - if r >= utf8.RuneSelf { - r, size = utf8.DecodeRune(b.buf[b.r:b.w]) - } + r, size = utf8.DecodeRune(b.buf[b.r:b.w]) b.r += size b.lastByte = int(b.buf[b.r-1]) b.lastRuneSize = size diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go index ce2e0049102234..9a7f4ee3c93afb 100644 --- a/src/bytes/bytes.go +++ b/src/bytes/bytes.go @@ -528,11 +528,7 @@ func FieldsFunc(s []byte, f func(rune) bool) [][]byte { // more efficient, possibly due to cache effects. start := -1 // valid span start if >= 0 for i := 0; i < len(s); { - size := 1 - r := rune(s[i]) - if r >= utf8.RuneSelf { - r, size = utf8.DecodeRune(s[i:]) - } + r, size := utf8.DecodeRune(s[i:]) if f(r) { if start >= 0 { spans = append(spans, span{start, i}) @@ -614,11 +610,7 @@ func Map(mapping func(r rune) rune, s []byte) []byte { // fine. It could also shrink but that falls out naturally. b := make([]byte, 0, len(s)) for i := 0; i < len(s); { - wid := 1 - r := rune(s[i]) - if r >= utf8.RuneSelf { - r, wid = utf8.DecodeRune(s[i:]) - } + r, wid := utf8.DecodeRune(s[i:]) r = mapping(r) if r >= 0 { b = utf8.AppendRune(b, r) @@ -917,11 +909,7 @@ func LastIndexFunc(s []byte, f func(r rune) bool) int { func indexFunc(s []byte, f func(r rune) bool, truth bool) int { start := 0 for start < len(s) { - wid := 1 - r := rune(s[start]) - if r >= utf8.RuneSelf { - r, wid = utf8.DecodeRune(s[start:]) - } + r, wid := utf8.DecodeRune(s[start:]) if f(r) == truth { return start } @@ -1052,10 +1040,7 @@ func trimLeftASCII(s []byte, as *asciiSet) []byte { func trimLeftUnicode(s []byte, cutset string) []byte { for len(s) > 0 { - r, n := rune(s[0]), 1 - if r >= utf8.RuneSelf { - r, n = utf8.DecodeRune(s) - } + r, n := utf8.DecodeRune(s) if !containsRune(cutset, r) { break } @@ -1251,19 +1236,10 @@ hasUnicode: t = t[i:] for len(s) != 0 && len(t) != 0 { // Extract first rune from each. - var sr, tr rune - if s[0] < utf8.RuneSelf { - sr, s = rune(s[0]), s[1:] - } else { - r, size := utf8.DecodeRune(s) - sr, s = r, s[size:] - } - if t[0] < utf8.RuneSelf { - tr, t = rune(t[0]), t[1:] - } else { - r, size := utf8.DecodeRune(t) - tr, t = r, t[size:] - } + sr, size := utf8.DecodeRune(s) + s = s[size:] + tr, size := utf8.DecodeRune(t) + t = t[size:] // If they match, keep going; if not, return false. 
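As a quick usage illustration of the new netip.Prefix.Compare method added above (a minimal sketch, not part of the CL; it relies only on the ordering documented later in this change: masked address first, then prefix length, then unmasked address):

	package main

	import (
		"fmt"
		"net/netip"
		"slices"
	)

	func main() {
		prefixes := []netip.Prefix{
			netip.MustParsePrefix("1.2.3.0/28"),
			netip.MustParsePrefix("fe80::/64"),
			netip.MustParsePrefix("1.2.3.0/24"),
		}
		// The method expression is usable directly as a slices.SortFunc comparison.
		slices.SortFunc(prefixes, netip.Prefix.Compare)
		fmt.Println(prefixes) // [1.2.3.0/24 1.2.3.0/28 fe80::/64]
	}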
diff --git a/src/bytes/iter.go b/src/bytes/iter.go index b2abb2c9ba3dc6..a4ece881d20fa1 100644 --- a/src/bytes/iter.go +++ b/src/bytes/iter.go @@ -117,11 +117,7 @@ func FieldsFuncSeq(s []byte, f func(rune) bool) iter.Seq[[]byte] { return func(yield func([]byte) bool) { start := -1 for i := 0; i < len(s); { - size := 1 - r := rune(s[i]) - if r >= utf8.RuneSelf { - r, size = utf8.DecodeRune(s[i:]) - } + r, size := utf8.DecodeRune(s[i:]) if f(r) { if start >= 0 { if !yield(s[start:i:i]) { diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s index 72e65734666c2a..63676cc785967c 100644 --- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s +++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s @@ -260,6 +260,28 @@ lable2: MOVV FCC0, R4 // 04dc1401 MOVV R4, FCC0 // 80d81401 + // LDPTR.{W/D} and STPTR.{W/D} instructions + MOVWP R5, -32768(R4) // 85008025 + MOVWP R5, 32764(R4) // 85fc7f25 + MOVWP R5, 32(R4) // 85200025 + MOVWP R5, 4(R4) // 85040025 + MOVWP R5, (R4) // 85000025 + MOVVP R5, -32768(R4) // 85008027 + MOVVP R5, 32764(R4) // 85fc7f27 + MOVVP R5, 32(R4) // 85200027 + MOVVP R5, 4(R4) // 85040027 + MOVVP R5, (R4) // 85000027 + MOVWP -32768(R5), R4 // a4008024 + MOVWP 32764(R5), R4 // a4fc7f24 + MOVWP 32(R5), R4 // a4200024 + MOVWP 4(R5), R4 // a4040024 + MOVWP (R5), R4 // a4000024 + MOVVP -32768(R5), R4 // a4008026 + MOVVP 32764(R5), R4 // a4fc7f26 + MOVVP 32(R5), R4 // a4200026 + MOVVP 4(R5), R4 // a4040026 + MOVVP (R5), R4 // a4000026 + // Loong64 atomic memory access instructions AMSWAPB R14, (R13), R12 // ac395c38 AMSWAPH R14, (R13), R12 // acb95c38 diff --git a/src/cmd/compile/internal/dwarfgen/dwarf.go b/src/cmd/compile/internal/dwarfgen/dwarf.go index 6ab39d2aaad1cf..9d975e0bc1ac7d 100644 --- a/src/cmd/compile/internal/dwarfgen/dwarf.go +++ b/src/cmd/compile/internal/dwarfgen/dwarf.go @@ -128,14 +128,29 @@ func Info(ctxt *obj.Link, fnsym *obj.LSym, infosym *obj.LSym, curfn obj.Func) (s // already referenced by a dwarf var, attach an R_USETYPE relocation to // the function symbol to insure that the type included in DWARF // processing during linking. + // Do the same with R_USEIFACE relocations from the function symbol for the + // same reason. + // All these R_USETYPE relocations are only looked at if the function + // survives deadcode elimination in the linker. 
typesyms := []*obj.LSym{} for t := range fnsym.Func().Autot { typesyms = append(typesyms, t) } + for i := range fnsym.R { + if fnsym.R[i].Type == objabi.R_USEIFACE && !strings.HasPrefix(fnsym.R[i].Sym.Name, "go:itab.") { + // Types referenced through itab will be referenced from somewhere else + typesyms = append(typesyms, fnsym.R[i].Sym) + } + } slices.SortFunc(typesyms, func(a, b *obj.LSym) int { return strings.Compare(a.Name, b.Name) }) + var lastsym *obj.LSym for _, sym := range typesyms { + if sym == lastsym { + continue + } + lastsym = sym infosym.AddRel(ctxt, obj.Reloc{Type: objabi.R_USETYPE, Sym: sym}) } fnsym.Func().Autot = nil diff --git a/src/cmd/compile/internal/loong64/ssa.go b/src/cmd/compile/internal/loong64/ssa.go index 895eadd07261d6..3959f8a7c11eb9 100644 --- a/src/cmd/compile/internal/loong64/ssa.go +++ b/src/cmd/compile/internal/loong64/ssa.go @@ -185,7 +185,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { ssa.OpLOONG64MULD, ssa.OpLOONG64DIVF, ssa.OpLOONG64DIVD, - ssa.OpLOONG64MULV, ssa.OpLOONG64MULHV, ssa.OpLOONG64MULHVU, + ssa.OpLOONG64MULV, ssa.OpLOONG64MULHV, ssa.OpLOONG64MULHVU, ssa.OpLOONG64MULH, ssa.OpLOONG64MULHU, ssa.OpLOONG64DIVV, ssa.OpLOONG64REMV, ssa.OpLOONG64DIVVU, ssa.OpLOONG64REMVU, ssa.OpLOONG64FCOPYSGD: p := s.Prog(v.Op.Asm()) @@ -560,28 +560,97 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p.To.Sym = ir.Syms.Duffzero p.To.Offset = v.AuxInt case ssa.OpLOONG64LoweredZero: - // MOVx R0, (Rarg0) - // ADDV $sz, Rarg0 - // BGEU Rarg1, Rarg0, -2(PC) - mov, sz := largestMove(v.AuxInt) - p := s.Prog(mov) - p.From.Type = obj.TYPE_REG - p.From.Reg = loong64.REGZERO - p.To.Type = obj.TYPE_MEM - p.To.Reg = v.Args[0].Reg() + ptrReg := v.Args[0].Reg() + n := v.AuxInt + if n < 16 { + v.Fatalf("Zero too small %d", n) + } - p2 := s.Prog(loong64.AADDVU) - p2.From.Type = obj.TYPE_CONST - p2.From.Offset = sz - p2.To.Type = obj.TYPE_REG - p2.To.Reg = v.Args[0].Reg() + // Generate zeroing instructions. + var off int64 + for n >= 8 { + // MOVV ZR, off(ptrReg) + zero8(s, ptrReg, off) + off += 8 + n -= 8 + } + if n != 0 { + // MOVV ZR, off+n-8(ptrReg) + zero8(s, ptrReg, off+n-8) + } + case ssa.OpLOONG64LoweredZeroLoop: + ptrReg := v.Args[0].Reg() + countReg := v.RegTmp() + var off int64 + n := v.AuxInt + loopSize := int64(64) + if n < 3*loopSize { + // - a loop count of 0 won't work. + // - a loop count of 1 is useless. + // - a loop count of 2 is a code size ~tie + // 4 instructions to implement the loop + // 8 instructions in the loop body + // vs + // 16 instructions in the straightline code + // Might as well use straightline code. + v.Fatalf("ZeroLoop size too small %d", n) + } - p3 := s.Prog(loong64.ABGEU) - p3.From.Type = obj.TYPE_REG - p3.From.Reg = v.Args[1].Reg() - p3.Reg = v.Args[0].Reg() - p3.To.Type = obj.TYPE_BRANCH - p3.To.SetTarget(p) + // Put iteration count in a register. + // MOVV $n/loopSize, countReg + p := s.Prog(loong64.AMOVV) + p.From.Type = obj.TYPE_CONST + p.From.Offset = n / loopSize + p.To.Type = obj.TYPE_REG + p.To.Reg = countReg + cntInit := p + + // Zero loopSize bytes starting at ptrReg. + for range loopSize / 8 { + // MOVV ZR, off(ptrReg) + zero8(s, ptrReg, off) + off += 8 + } + + // Increment ptrReg by loopSize. + // ADDV $loopSize, ptrReg + p = s.Prog(loong64.AADDV) + p.From.Type = obj.TYPE_CONST + p.From.Offset = loopSize + p.To.Type = obj.TYPE_REG + p.To.Reg = ptrReg + + // Decrement loop count.
+ // SUBV $1, countReg + p = s.Prog(loong64.ASUBV) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 1 + p.To.Type = obj.TYPE_REG + p.To.Reg = countReg + + // Jump to loop header if we're not done yet. + // BNE countReg, loop header + p = s.Prog(loong64.ABNE) + p.From.Type = obj.TYPE_REG + p.From.Reg = countReg + p.To.Type = obj.TYPE_BRANCH + p.To.SetTarget(cntInit.Link) + + // Multiples of the loop size are now done. + n %= loopSize + + off = 0 + // Write any fractional portion. + for n >= 8 { + // MOVV ZR, off(ptrReg) + zero8(s, ptrReg, off) + off += 8 + n -= 8 + } + + if n != 0 { + zero8(s, ptrReg, off+n-8) + } case ssa.OpLOONG64DUFFCOPY: p := s.Prog(obj.ADUFFCOPY) @@ -590,42 +659,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p.To.Sym = ir.Syms.Duffcopy p.To.Offset = v.AuxInt case ssa.OpLOONG64LoweredMove: - // MOVx (Rarg1), Rtmp - // MOVx Rtmp, (Rarg0) - // ADDV $sz, Rarg1 - // ADDV $sz, Rarg0 - // BGEU Rarg2, Rarg0, -4(PC) - mov, sz := largestMove(v.AuxInt) - p := s.Prog(mov) - p.From.Type = obj.TYPE_MEM - p.From.Reg = v.Args[1].Reg() + dstReg := v.Args[0].Reg() + srcReg := v.Args[1].Reg() + if dstReg == srcReg { + break + } + tmpReg := int16(loong64.REG_R20) + n := v.AuxInt + if n < 16 { + v.Fatalf("Move too small %d", n) + } + + var off int64 + for n >= 8 { + // MOVV off(srcReg), tmpReg + // MOVV tmpReg, off(dstReg) + move8(s, srcReg, dstReg, tmpReg, off) + off += 8 + n -= 8 + } + + if n != 0 { + // MOVV off+n-8(srcReg), tmpReg + // MOVV tmpReg, off+n-8(dstReg) + move8(s, srcReg, dstReg, tmpReg, off+n-8) + } + case ssa.OpLOONG64LoweredMoveLoop: + dstReg := v.Args[0].Reg() + srcReg := v.Args[1].Reg() + if dstReg == srcReg { + break + } + countReg := int16(loong64.REG_R20) + tmpReg := int16(loong64.REG_R21) + var off int64 + n := v.AuxInt + loopSize := int64(64) + if n < 3*loopSize { + // - a loop count of 0 won't work. + // - a loop count of 1 is useless. + // - a loop count of 2 is a code size ~tie + // 4 instructions to implement the loop + // 8 instructions in the loop body + // vs + // 16 instructions in the straightline code + // Might as well use straightline code. + v.Fatalf("MoveLoop size too small %d", n) + } + + // Put iteration count in a register. + // MOVV $n/loopSize, countReg + p := s.Prog(loong64.AMOVV) + p.From.Type = obj.TYPE_CONST + p.From.Offset = n / loopSize p.To.Type = obj.TYPE_REG - p.To.Reg = loong64.REGTMP + p.To.Reg = countReg + cntInit := p + + // Move loopSize bytes starting at srcReg to dstReg. + for range loopSize / 8 { + // MOVV off(srcReg), tmpReg + // MOVV tmpReg, off(dstReg) + move8(s, srcReg, dstReg, tmpReg, off) + off += 8 + } - p2 := s.Prog(mov) - p2.From.Type = obj.TYPE_REG - p2.From.Reg = loong64.REGTMP - p2.To.Type = obj.TYPE_MEM - p2.To.Reg = v.Args[0].Reg() + // Increment srcReg and dstReg by loopSize. + // ADDV $loopSize, srcReg + p = s.Prog(loong64.AADDV) + p.From.Type = obj.TYPE_CONST + p.From.Offset = loopSize + p.To.Type = obj.TYPE_REG + p.To.Reg = srcReg + // ADDV $loopSize, dstReg + p = s.Prog(loong64.AADDV) + p.From.Type = obj.TYPE_CONST + p.From.Offset = loopSize + p.To.Type = obj.TYPE_REG + p.To.Reg = dstReg - p3 := s.Prog(loong64.AADDVU) - p3.From.Type = obj.TYPE_CONST - p3.From.Offset = sz - p3.To.Type = obj.TYPE_REG - p3.To.Reg = v.Args[1].Reg() + // Decrement loop count.
+ // SUBV $1, countReg + p = s.Prog(loong64.ASUBV) + p.From.Type = obj.TYPE_CONST + p.From.Offset = 1 + p.To.Type = obj.TYPE_REG + p.To.Reg = countReg - p4 := s.Prog(loong64.AADDVU) - p4.From.Type = obj.TYPE_CONST - p4.From.Offset = sz - p4.To.Type = obj.TYPE_REG - p4.To.Reg = v.Args[0].Reg() + // Jump to loop header if we're not done yet. + // BNE countReg, loop header + p = s.Prog(loong64.ABNE) + p.From.Type = obj.TYPE_REG + p.From.Reg = countReg + p.To.Type = obj.TYPE_BRANCH + p.To.SetTarget(cntInit.Link) + + // Multiples of the loop size are now done. + n %= loopSize + + off = 0 + // Copy any fractional portion. + for n >= 8 { + // MOVV off(srcReg), tmpReg + // MOVV tmpReg, off(dstReg) + move8(s, srcReg, dstReg, tmpReg, off) + off += 8 + n -= 8 + } - p5 := s.Prog(loong64.ABGEU) - p5.From.Type = obj.TYPE_REG - p5.From.Reg = v.Args[2].Reg() - p5.Reg = v.Args[1].Reg() - p5.To.Type = obj.TYPE_BRANCH - p5.To.SetTarget(p) + if n != 0 { + // MOVV off+n-8(srcReg), tmpReg + // MOVV tmpReg, off+n-8(dstReg) + move8(s, srcReg, dstReg, tmpReg, off+n-8) + } case ssa.OpLOONG64CALLstatic, ssa.OpLOONG64CALLclosure, ssa.OpLOONG64CALLinter: s.Call(v) @@ -1155,3 +1301,32 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in p.Pos = p.Pos.WithNotStmt() return p } + +// move8 copies 8 bytes at src+off to dst+off. +func move8(s *ssagen.State, src, dst, tmp int16, off int64) { + // MOVV off(src), tmp + ld := s.Prog(loong64.AMOVV) + ld.From.Type = obj.TYPE_MEM + ld.From.Reg = src + ld.From.Offset = off + ld.To.Type = obj.TYPE_REG + ld.To.Reg = tmp + // MOVV tmp, off(dst) + st := s.Prog(loong64.AMOVV) + st.From.Type = obj.TYPE_REG + st.From.Reg = tmp + st.To.Type = obj.TYPE_MEM + st.To.Reg = dst + st.To.Offset = off +} + +// zero8 zeroes 8 bytes at reg+off. +func zero8(s *ssagen.State, reg int16, off int64) { + // MOVV ZR, off(reg) + p := s.Prog(loong64.AMOVV) + p.From.Type = obj.TYPE_REG + p.From.Reg = loong64.REGZERO + p.To.Type = obj.TYPE_MEM + p.To.Reg = reg + p.To.Offset = off +} diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules index ca04bdcd42307d..3fa4f363f65515 100644 --- a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules +++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules @@ -17,8 +17,8 @@ (Hmul64 ...) => (MULHV ...) (Hmul64u ...) => (MULHVU ...) -(Hmul32 x y) => (SRAVconst (MULV (SignExt32to64 x) (SignExt32to64 y)) [32]) -(Hmul32u x y) => (SRLVconst (MULV (ZeroExt32to64 x) (ZeroExt32to64 y)) [32]) +(Hmul32 ...) => (MULH ...) +(Hmul32u ...) => (MULHU ...) (Div64 x y) => (DIVV x y) (Div64u ...) => (DIVVU ...)
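The straight-line Zero and Move lowerings above finish any remainder that is not a multiple of 8 with a single overlapping 8-byte access at off+n-8 instead of narrower operations. A standalone sketch of the offsets this produces (the zeroOffsets helper is hypothetical, for illustration only, not part of the CL):

	package main

	import "fmt"

	// zeroOffsets mirrors the straight-line LoweredZero case above:
	// 8-byte stores at increasing offsets, then one overlapping store
	// for any tail smaller than 8 bytes.
	func zeroOffsets(n int64) (offs []int64) {
		var off int64
		for n >= 8 {
			offs = append(offs, off)
			off += 8
			n -= 8
		}
		if n != 0 {
			offs = append(offs, off+n-8) // overlaps the previous store
		}
		return offs
	}

	func main() {
		fmt.Println(zeroOffsets(20)) // [0 8 12]: bytes 12-15 are written twice
	}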
@@ -373,24 +373,8 @@ (MOVVstore [8] ptr (MOVVconst [0]) (MOVVstore ptr (MOVVconst [0]) mem)) -// strip off fractional word zeroing -(Zero [s] ptr mem) && s%8 != 0 && s > 16 => - (Zero [s%8] - (OffPtr ptr [s-s%8]) - (Zero [s-s%8] ptr mem)) - -// medium zeroing uses a duff device -(Zero [s] ptr mem) - && s%8 == 0 && s > 16 && s <= 8*128 => - (DUFFZERO [8 * (128 - s/8)] ptr mem) - -// large zeroing uses a loop -(Zero [s] ptr mem) - && s%8 == 0 && s > 8*128 => - (LoweredZero - ptr - (ADDVconst ptr [s-8]) - mem) +(Zero [s] ptr mem) && s > 16 && s < 192 => (LoweredZero [s] ptr mem) +(Zero [s] ptr mem) && s >= 192 => (LoweredZeroLoop [s] ptr mem) // moves (Move [0] _ _ mem) => mem @@ -435,34 +419,8 @@ (MOVVstore [8] dst (MOVVload [8] src mem) (MOVVstore dst (MOVVload src mem) mem)) -// strip off fractional word move -(Move [s] dst src mem) && s%8 != 0 && s > 16 => - (Move [s%8] - (OffPtr dst [s-s%8]) - (OffPtr src [s-s%8]) - (Move [s-s%8] dst src mem)) - -// medium move uses a duff device -(Move [s] dst src mem) - && s%8 == 0 && s > 16 && s <= 8*128 - && logLargeCopy(v, s) => - (DUFFCOPY [16 * (128 - s/8)] dst src mem) -// 16 and 128 are magic constants. 16 is the number of bytes to encode: -// MOVV (R20), R30 -// ADDV $8, R20 -// MOVV R30, (R21) -// ADDV $8, R21 -// and 128 is the number of such blocks. See runtime/duff_loong64.s:duffcopy. - -// large move uses a loop -(Move [s] dst src mem) - && s%8 == 0 && s > 1024 && logLargeCopy(v, s) => - (LoweredMove - dst - src - (ADDVconst src [s-8]) - mem) - +(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem) +(Move [s] dst src mem) && s >= 192 && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem) // float <=> int register moves, with no conversion. // These come up when compiling math.{Float64bits, Float64frombits, Float32bits, Float32frombits}. @@ -471,6 +429,10 @@ (MOVWUload [off] {sym} ptr (MOVFstore [off] {sym} ptr val _)) => (ZeroExt32to64 (MOVWfpgp val)) (MOVFload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) => (MOVWgpfp val) +// If the memory load and store operations use the same ptr, they are combined into a direct move operation between registers. +(MOV(V|W|H|B)load [off] {sym} ptr (MOV(V|W|H|B)store [off] {sym} ptr x _)) => (MOV(V|W|H|B)reg x) +(MOV(W|H|B)Uload [off] {sym} ptr (MOV(W|H|B)store [off] {sym} ptr x _)) => (MOV(W|H|B)Ureg x) + // Similarly for stores, if we see a store after FPR <=> GPR move, then redirect store to use the other register set. 
(MOVVstore [off] {sym} ptr (MOVVfpgp val) mem) => (MOVDstore [off] {sym} ptr val mem) (MOVDstore [off] {sym} ptr (MOVVgpfp val) mem) => (MOVVstore [off] {sym} ptr val mem) diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go index ccd9721498232a..cc6ae8fb8e65de 100644 --- a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go @@ -197,6 +197,8 @@ func init() { {name: "MULV", argLength: 2, reg: gp21, asm: "MULV", commutative: true, typ: "Int64"}, // arg0 * arg1 {name: "MULHV", argLength: 2, reg: gp21, asm: "MULHV", commutative: true, typ: "Int64"}, // (arg0 * arg1) >> 64, signed {name: "MULHVU", argLength: 2, reg: gp21, asm: "MULHVU", commutative: true, typ: "UInt64"}, // (arg0 * arg1) >> 64, unsigned + {name: "MULH", argLength: 2, reg: gp21, asm: "MULH", commutative: true, typ: "Int32"}, // (arg0 * arg1) >> 32, signed + {name: "MULHU", argLength: 2, reg: gp21, asm: "MULHU", commutative: true, typ: "UInt32"}, // (arg0 * arg1) >> 32, unsigned {name: "DIVV", argLength: 2, reg: gp21, asm: "DIVV", typ: "Int64"}, // arg0 / arg1, signed {name: "DIVVU", argLength: 2, reg: gp21, asm: "DIVVU", typ: "UInt64"}, // arg0 / arg1, unsigned {name: "REMV", argLength: 2, reg: gp21, asm: "REMV", typ: "Int64"}, // arg0 / arg1, signed @@ -376,6 +378,21 @@ func init() { faultOnNilArg0: true, }, + // medium zeroing + // arg0 = address of memory to zero + // arg1 = mem + // auxint = number of bytes to zero + // returns mem + { + name: "LoweredZero", + aux: "Int64", + argLength: 2, + reg: regInfo{ + inputs: []regMask{gp}, + }, + faultOnNilArg0: true, + }, + // duffcopy // arg0 = address of dst memory (in R21, changed as side effect) // arg1 = address of src memory (in R20, changed as side effect) @@ -395,48 +412,57 @@ func init() { faultOnNilArg1: true, }, - // large or unaligned zeroing - // arg0 = address of memory to zero (in R20, changed as side effect) - // arg1 = address of the last element to zero + // large zeroing + // arg0 = address of memory to zero + // arg1 = mem + // auxint = number of bytes to zero + // returns mem + { + name: "LoweredZeroLoop", + aux: "Int64", + argLength: 2, + reg: regInfo{ + inputs: []regMask{gp}, + clobbersArg0: true, + }, + faultOnNilArg0: true, + needIntTemp: true, + }, + + // medium copying + // arg0 = address of dst memory + // arg1 = address of src memory // arg2 = mem - // auxint = alignment + // auxint = number of bytes to copy // returns mem - // MOVx R0, (R20) - // ADDV $sz, R20 - // BGEU Rarg1, R20, -2(PC) { - name: "LoweredZero", + name: "LoweredMove", aux: "Int64", argLength: 3, reg: regInfo{ - inputs: []regMask{buildReg("R20"), gp}, + inputs: []regMask{gp &^ buildReg("R20"), gp &^ buildReg("R20")}, clobbers: buildReg("R20"), }, - typ: "Mem", faultOnNilArg0: true, + faultOnNilArg1: true, }, - // large or unaligned move - // arg0 = address of dst memory (in R21, changed as side effect) - // arg1 = address of src memory (in R20, changed as side effect) - // arg2 = address of the last element of src - // arg3 = mem - // auxint = alignment + // large copying + // arg0 = address of dst memory + // arg1 = address of src memory + // arg2 = mem + // auxint = number of bytes to copy // returns mem - // MOVx (R20), Rtmp - // MOVx Rtmp, (R21) - // ADDV $sz, R20 - // ADDV $sz, R21 - // BGEU Rarg2, R20, -4(PC) { - name: "LoweredMove", + name: "LoweredMoveLoop", aux: "Int64", - argLength: 4, + argLength: 3, reg: regInfo{ - inputs: []regMask{buildReg("R21"), buildReg("R20"), 
gp}, - clobbers: buildReg("R20 R21"), + inputs: []regMask{gp &^ buildReg("R20 R21"), gp &^ buildReg("R20 R21")}, + clobbers: buildReg("R20 R21"), + clobbersArg0: true, + clobbersArg1: true, }, - typ: "Mem", faultOnNilArg0: true, faultOnNilArg1: true, }, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 126682b9866849..f42d64228fae3a 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1795,6 +1795,8 @@ const ( OpLOONG64MULV OpLOONG64MULHV OpLOONG64MULHVU + OpLOONG64MULH + OpLOONG64MULHU OpLOONG64DIVV OpLOONG64DIVVU OpLOONG64REMV @@ -1923,9 +1925,11 @@ const ( OpLOONG64CALLclosure OpLOONG64CALLinter OpLOONG64DUFFZERO - OpLOONG64DUFFCOPY OpLOONG64LoweredZero + OpLOONG64DUFFCOPY + OpLOONG64LoweredZeroLoop OpLOONG64LoweredMove + OpLOONG64LoweredMoveLoop OpLOONG64LoweredAtomicLoad8 OpLOONG64LoweredAtomicLoad32 OpLOONG64LoweredAtomicLoad64 @@ -24138,6 +24142,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "MULH", + argLen: 2, + commutative: true, + asm: loong64.AMULH, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 + {1, 1073741817}, // ZERO R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 + }, + outputs: []outputInfo{ + {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 + }, + }, + }, + { + name: "MULHU", + argLen: 2, + commutative: true, + asm: loong64.AMULHU, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 + {1, 1073741817}, // ZERO R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 + }, + outputs: []outputInfo{ + {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 + }, + }, + }, { name: "DIVV", argLen: 2, @@ -25912,6 +25946,17 @@ var opcodeTable = [...]opInfo{ clobbers: 524290, // R1 R20 }, }, + { + name: "LoweredZero", + auxType: auxInt64, + argLen: 2, + faultOnNilArg0: true, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 + }, + }, + }, { name: "DUFFCOPY", auxType: auxInt64, @@ -25927,31 +25972,46 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "LoweredZero", + name: "LoweredZeroLoop", + auxType: auxInt64, + argLen: 2, + needIntTemp: true, + faultOnNilArg0: true, + reg: regInfo{ + inputs: []inputInfo{ + {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 + }, + clobbersArg0: true, + }, + }, + { + name: "LoweredMove", auxType: auxInt64, argLen: 3, faultOnNilArg0: true, + faultOnNilArg1: true, reg: regInfo{ inputs: []inputInfo{ - {0, 524288}, // R20 - {1, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 + {0, 1071120376}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R21 R23 R24 R25 R26 R27 R28 R29 R31 + {1, 1071120376}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R21 R23 R24 R25 R26 R27 R28 R29 R31 }, clobbers: 524288, // R20 }, }, { - name: "LoweredMove", + name: "LoweredMoveLoop", auxType: auxInt64, - argLen: 4, + argLen: 3, faultOnNilArg0: true, faultOnNilArg1: 
true, reg: regInfo{ inputs: []inputInfo{ - {0, 1048576}, // R21 - {1, 524288}, // R20 - {2, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 + {0, 1070071800}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R23 R24 R25 R26 R27 R28 R29 R31 + {1, 1070071800}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R23 R24 R25 R26 R27 R28 R29 R31 }, - clobbers: 1572864, // R20 R21 + clobbers: 1572864, // R20 R21 + clobbersArg0: true, + clobbersArg1: true, }, }, { diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64.go b/src/cmd/compile/internal/ssa/rewriteLOONG64.go index eb134789f74131..5890fe050a222b 100644 --- a/src/cmd/compile/internal/ssa/rewriteLOONG64.go +++ b/src/cmd/compile/internal/ssa/rewriteLOONG64.go @@ -296,9 +296,11 @@ func rewriteValueLOONG64(v *Value) bool { v.Op = OpLOONG64LoweredGetClosurePtr return true case OpHmul32: - return rewriteValueLOONG64_OpHmul32(v) + v.Op = OpLOONG64MULH + return true case OpHmul32u: - return rewriteValueLOONG64_OpHmul32u(v) + v.Op = OpLOONG64MULHU + return true case OpHmul64: v.Op = OpLOONG64MULHV return true @@ -1576,50 +1578,6 @@ func rewriteValueLOONG64_OpEqPtr(v *Value) bool { return true } } -func rewriteValueLOONG64_OpHmul32(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - typ := &b.Func.Config.Types - // match: (Hmul32 x y) - // result: (SRAVconst (MULV (SignExt32to64 x) (SignExt32to64 y)) [32]) - for { - x := v_0 - y := v_1 - v.reset(OpLOONG64SRAVconst) - v.AuxInt = int64ToAuxInt(32) - v0 := b.NewValue0(v.Pos, OpLOONG64MULV, typ.Int64) - v1 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) - v1.AddArg(x) - v2 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) - v2.AddArg(y) - v0.AddArg2(v1, v2) - v.AddArg(v0) - return true - } -} -func rewriteValueLOONG64_OpHmul32u(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - typ := &b.Func.Config.Types - // match: (Hmul32u x y) - // result: (SRLVconst (MULV (ZeroExt32to64 x) (ZeroExt32to64 y)) [32]) - for { - x := v_0 - y := v_1 - v.reset(OpLOONG64SRLVconst) - v.AuxInt = int64ToAuxInt(32) - v0 := b.NewValue0(v.Pos, OpLOONG64MULV, typ.Int64) - v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) - v1.AddArg(x) - v2 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) - v2.AddArg(y) - v0.AddArg2(v1, v2) - v.AddArg(v0) - return true - } -} func rewriteValueLOONG64_OpIsInBounds(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -2371,6 +2329,23 @@ func rewriteValueLOONG64_OpLOONG64MOVBUload(v *Value) bool { v_0 := v.Args[0] b := v.Block config := b.Func.Config + // match: (MOVBUload [off] {sym} ptr (MOVBstore [off] {sym} ptr x _)) + // result: (MOVBUreg x) + for { + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + ptr := v_0 + if v_1.Op != OpLOONG64MOVBstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { + break + } + x := v_1.Args[1] + if ptr != v_1.Args[0] { + break + } + v.reset(OpLOONG64MOVBUreg) + v.AddArg(x) + return true + } // match: (MOVBUload [off1] {sym} (ADDVconst [off2] ptr) mem) // cond: is32Bit(int64(off1)+off2) && (ptr.Op != OpSB || !config.ctxt.Flag_dynlink) // result: (MOVBUload [off1+int32(off2)] {sym} ptr mem) @@ -2648,6 +2623,23 @@ func rewriteValueLOONG64_OpLOONG64MOVBload(v *Value) bool { v_0 := v.Args[0] b := v.Block config := b.Func.Config + // match: (MOVBload [off] {sym} ptr (MOVBstore [off] {sym} ptr x _)) + // result: (MOVBreg x) + for { + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + ptr := v_0 + if 
v_1.Op != OpLOONG64MOVBstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { + break + } + x := v_1.Args[1] + if ptr != v_1.Args[0] { + break + } + v.reset(OpLOONG64MOVBreg) + v.AddArg(x) + return true + } // match: (MOVBload [off1] {sym} (ADDVconst [off2] ptr) mem) // cond: is32Bit(int64(off1)+off2) && (ptr.Op != OpSB || !config.ctxt.Flag_dynlink) // result: (MOVBload [off1+int32(off2)] {sym} ptr mem) @@ -3568,6 +3560,23 @@ func rewriteValueLOONG64_OpLOONG64MOVHUload(v *Value) bool { v_0 := v.Args[0] b := v.Block config := b.Func.Config + // match: (MOVHUload [off] {sym} ptr (MOVHstore [off] {sym} ptr x _)) + // result: (MOVHUreg x) + for { + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + ptr := v_0 + if v_1.Op != OpLOONG64MOVHstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { + break + } + x := v_1.Args[1] + if ptr != v_1.Args[0] { + break + } + v.reset(OpLOONG64MOVHUreg) + v.AddArg(x) + return true + } // match: (MOVHUload [off1] {sym} (ADDVconst [off2] ptr) mem) // cond: is32Bit(int64(off1)+off2) && (ptr.Op != OpSB || !config.ctxt.Flag_dynlink) // result: (MOVHUload [off1+int32(off2)] {sym} ptr mem) @@ -3807,6 +3816,23 @@ func rewriteValueLOONG64_OpLOONG64MOVHload(v *Value) bool { v_0 := v.Args[0] b := v.Block config := b.Func.Config + // match: (MOVHload [off] {sym} ptr (MOVHstore [off] {sym} ptr x _)) + // result: (MOVHreg x) + for { + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + ptr := v_0 + if v_1.Op != OpLOONG64MOVHstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { + break + } + x := v_1.Args[1] + if ptr != v_1.Args[0] { + break + } + v.reset(OpLOONG64MOVHreg) + v.AddArg(x) + return true + } // match: (MOVHload [off1] {sym} (ADDVconst [off2] ptr) mem) // cond: is32Bit(int64(off1)+off2) && (ptr.Op != OpSB || !config.ctxt.Flag_dynlink) // result: (MOVHload [off1+int32(off2)] {sym} ptr mem) @@ -4250,6 +4276,23 @@ func rewriteValueLOONG64_OpLOONG64MOVVload(v *Value) bool { v.AddArg(val) return true } + // match: (MOVVload [off] {sym} ptr (MOVVstore [off] {sym} ptr x _)) + // result: (MOVVreg x) + for { + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + ptr := v_0 + if v_1.Op != OpLOONG64MOVVstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { + break + } + x := v_1.Args[1] + if ptr != v_1.Args[0] { + break + } + v.reset(OpLOONG64MOVVreg) + v.AddArg(x) + return true + } // match: (MOVVload [off1] {sym} (ADDVconst [off2] ptr) mem) // cond: is32Bit(int64(off1)+off2) && (ptr.Op != OpSB || !config.ctxt.Flag_dynlink) // result: (MOVVload [off1+int32(off2)] {sym} ptr mem) @@ -4558,6 +4601,23 @@ func rewriteValueLOONG64_OpLOONG64MOVWUload(v *Value) bool { v.AddArg(v0) return true } + // match: (MOVWUload [off] {sym} ptr (MOVWstore [off] {sym} ptr x _)) + // result: (MOVWUreg x) + for { + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + ptr := v_0 + if v_1.Op != OpLOONG64MOVWstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { + break + } + x := v_1.Args[1] + if ptr != v_1.Args[0] { + break + } + v.reset(OpLOONG64MOVWUreg) + v.AddArg(x) + return true + } // match: (MOVWUload [off1] {sym} (ADDVconst [off2] ptr) mem) // cond: is32Bit(int64(off1)+off2) && (ptr.Op != OpSB || !config.ctxt.Flag_dynlink) // result: (MOVWUload [off1+int32(off2)] {sym} ptr mem) @@ -4830,6 +4890,23 @@ func rewriteValueLOONG64_OpLOONG64MOVWload(v *Value) bool { v_0 := v.Args[0] b := v.Block config := b.Func.Config + // match: (MOVWload [off] {sym} ptr (MOVWstore [off] {sym} ptr x _)) 
+ // result: (MOVWreg x) + for { + off := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + ptr := v_0 + if v_1.Op != OpLOONG64MOVWstore || auxIntToInt32(v_1.AuxInt) != off || auxToSym(v_1.Aux) != sym { + break + } + x := v_1.Args[1] + if ptr != v_1.Args[0] { + break + } + v.reset(OpLOONG64MOVWreg) + v.AddArg(x) + return true + } // match: (MOVWload [off1] {sym} (ADDVconst [off2] ptr) mem) // cond: is32Bit(int64(off1)+off2) && (ptr.Op != OpSB || !config.ctxt.Flag_dynlink) // result: (MOVWload [off1+int32(off2)] {sym} ptr mem) @@ -9056,62 +9133,35 @@ func rewriteValueLOONG64_OpMove(v *Value) bool { return true } // match: (Move [s] dst src mem) - // cond: s%8 != 0 && s > 16 - // result: (Move [s%8] (OffPtr dst [s-s%8]) (OffPtr src [s-s%8]) (Move [s-s%8] dst src mem)) - for { - s := auxIntToInt64(v.AuxInt) - dst := v_0 - src := v_1 - mem := v_2 - if !(s%8 != 0 && s > 16) { - break - } - v.reset(OpMove) - v.AuxInt = int64ToAuxInt(s % 8) - v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type) - v0.AuxInt = int64ToAuxInt(s - s%8) - v0.AddArg(dst) - v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type) - v1.AuxInt = int64ToAuxInt(s - s%8) - v1.AddArg(src) - v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem) - v2.AuxInt = int64ToAuxInt(s - s%8) - v2.AddArg3(dst, src, mem) - v.AddArg3(v0, v1, v2) - return true - } - // match: (Move [s] dst src mem) - // cond: s%8 == 0 && s > 16 && s <= 8*128 && logLargeCopy(v, s) - // result: (DUFFCOPY [16 * (128 - s/8)] dst src mem) + // cond: s > 16 && s < 192 && logLargeCopy(v, s) + // result: (LoweredMove [s] dst src mem) for { s := auxIntToInt64(v.AuxInt) dst := v_0 src := v_1 mem := v_2 - if !(s%8 == 0 && s > 16 && s <= 8*128 && logLargeCopy(v, s)) { + if !(s > 16 && s < 192 && logLargeCopy(v, s)) { break } - v.reset(OpLOONG64DUFFCOPY) - v.AuxInt = int64ToAuxInt(16 * (128 - s/8)) + v.reset(OpLOONG64LoweredMove) + v.AuxInt = int64ToAuxInt(s) v.AddArg3(dst, src, mem) return true } // match: (Move [s] dst src mem) - // cond: s%8 == 0 && s > 1024 && logLargeCopy(v, s) - // result: (LoweredMove dst src (ADDVconst src [s-8]) mem) + // cond: s >= 192 && logLargeCopy(v, s) + // result: (LoweredMoveLoop [s] dst src mem) for { s := auxIntToInt64(v.AuxInt) dst := v_0 src := v_1 mem := v_2 - if !(s%8 == 0 && s > 1024 && logLargeCopy(v, s)) { + if !(s >= 192 && logLargeCopy(v, s)) { break } - v.reset(OpLOONG64LoweredMove) - v0 := b.NewValue0(v.Pos, OpLOONG64ADDVconst, src.Type) - v0.AuxInt = int64ToAuxInt(s - 8) - v0.AddArg(src) - v.AddArg4(dst, src, v0, mem) + v.reset(OpLOONG64LoweredMoveLoop) + v.AuxInt = int64ToAuxInt(s) + v.AddArg3(dst, src, mem) return true } return false @@ -11497,56 +11547,33 @@ func rewriteValueLOONG64_OpZero(v *Value) bool { return true } // match: (Zero [s] ptr mem) - // cond: s%8 != 0 && s > 16 - // result: (Zero [s%8] (OffPtr ptr [s-s%8]) (Zero [s-s%8] ptr mem)) + // cond: s > 16 && s < 192 + // result: (LoweredZero [s] ptr mem) for { s := auxIntToInt64(v.AuxInt) ptr := v_0 mem := v_1 - if !(s%8 != 0 && s > 16) { + if !(s > 16 && s < 192) { break } - v.reset(OpZero) - v.AuxInt = int64ToAuxInt(s % 8) - v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type) - v0.AuxInt = int64ToAuxInt(s - s%8) - v0.AddArg(ptr) - v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem) - v1.AuxInt = int64ToAuxInt(s - s%8) - v1.AddArg2(ptr, mem) - v.AddArg2(v0, v1) - return true - } - // match: (Zero [s] ptr mem) - // cond: s%8 == 0 && s > 16 && s <= 8*128 - // result: (DUFFZERO [8 * (128 - s/8)] ptr mem) - for { - s := auxIntToInt64(v.AuxInt) - ptr := v_0 - mem := v_1 - if !(s%8 == 0 && s > 16 && s 
<= 8*128) { - break - } - v.reset(OpLOONG64DUFFZERO) - v.AuxInt = int64ToAuxInt(8 * (128 - s/8)) + v.reset(OpLOONG64LoweredZero) + v.AuxInt = int64ToAuxInt(s) v.AddArg2(ptr, mem) return true } // match: (Zero [s] ptr mem) - // cond: s%8 == 0 && s > 8*128 - // result: (LoweredZero ptr (ADDVconst ptr [s-8]) mem) + // cond: s >= 192 + // result: (LoweredZeroLoop [s] ptr mem) for { s := auxIntToInt64(v.AuxInt) ptr := v_0 mem := v_1 - if !(s%8 == 0 && s > 8*128) { + if !(s >= 192) { break } - v.reset(OpLOONG64LoweredZero) - v0 := b.NewValue0(v.Pos, OpLOONG64ADDVconst, ptr.Type) - v0.AuxInt = int64ToAuxInt(s - 8) - v0.AddArg(ptr) - v.AddArg3(ptr, v0, mem) + v.reset(OpLOONG64LoweredZeroLoop) + v.AuxInt = int64ToAuxInt(s) + v.AddArg2(ptr, mem) return true } return false diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go index eda6084b48e7cc..a49cd767db43d8 100644 --- a/src/cmd/compile/internal/test/inl_test.go +++ b/src/cmd/compile/internal/test/inl_test.go @@ -125,6 +125,8 @@ func TestIntendedInlining(t *testing.T) { "assemble64", }, "unicode/utf8": { + "DecodeRune", + "DecodeRuneInString", "FullRune", "FullRuneInString", "RuneLen", diff --git a/src/cmd/dist/test.go b/src/cmd/dist/test.go index dfa4ffb522fa65..91e3716f07b1c5 100644 --- a/src/cmd/dist/test.go +++ b/src/cmd/dist/test.go @@ -677,7 +677,7 @@ func (t *tester) registerTests() { } t.registerStdTest(pkg) } - if t.race { + if t.race && !t.short { for _, pkg := range pkgs { if t.packageHasBenchmarks(pkg) { t.registerRaceBenchTest(pkg) diff --git a/src/cmd/gofmt/gofmt.go b/src/cmd/gofmt/gofmt.go index d91a75b1050e20..bbb8b4fd15c2f7 100644 --- a/src/cmd/gofmt/gofmt.go +++ b/src/cmd/gofmt/gofmt.go @@ -87,10 +87,8 @@ func initParserMode() { } } -func isGoFile(f fs.DirEntry) bool { - // ignore non-Go files - name := f.Name() - return !strings.HasPrefix(name, ".") && strings.HasSuffix(name, ".go") && !f.IsDir() +func isGoFilename(name string) bool { + return !strings.HasPrefix(name, ".") && strings.HasSuffix(name, ".go") } // A sequencer performs concurrent tasks that may write output, but emits that @@ -411,34 +409,30 @@ func gofmtMain(s *sequencer) { } for _, arg := range args { - switch info, err := os.Stat(arg); { - case err != nil: - s.AddReport(err) - case !info.IsDir(): - // Non-directory arguments are always formatted. - arg := arg - s.Add(fileWeight(arg, info), func(r *reporter) error { - return processFile(arg, info, nil, r) - }) - default: - // Directories are walked, ignoring non-Go files. - err := filepath.WalkDir(arg, func(path string, f fs.DirEntry, err error) error { - if err != nil || !isGoFile(f) { - return err - } - info, err := f.Info() - if err != nil { - s.AddReport(err) - return nil - } - s.Add(fileWeight(path, info), func(r *reporter) error { - return processFile(path, info, nil, r) - }) - return nil - }) + // Walk each given argument as a directory tree. + // If the argument is not a directory, it's always formatted as a Go file. + // If the argument is a directory, we walk it, ignoring non-Go files. 
+ if err := filepath.WalkDir(arg, func(path string, d fs.DirEntry, err error) error { + switch { + case err != nil: + return err + case d.IsDir(): + return nil // simply recurse into directories + case path == arg: + // non-directories given as explicit arguments are always formatted + case !isGoFilename(d.Name()): + return nil // skip walked non-Go files + } + info, err := d.Info() if err != nil { - s.AddReport(err) + return err } + s.Add(fileWeight(path, info), func(r *reporter) error { + return processFile(path, info, nil, r) + }) + return nil + }); err != nil { + s.AddReport(err) } } } diff --git a/src/cmd/gofmt/long_test.go b/src/cmd/gofmt/long_test.go index 21a01196cf6cc2..372e324387843d 100644 --- a/src/cmd/gofmt/long_test.go +++ b/src/cmd/gofmt/long_test.go @@ -115,7 +115,7 @@ func genFilenames(t *testing.T, filenames chan<- string) { return nil } // don't descend into testdata directories - if isGoFile(d) && !strings.Contains(filepath.ToSlash(filename), "/testdata/") { + if !d.IsDir() && isGoFilename(d.Name()) && !strings.Contains(filepath.ToSlash(filename), "/testdata/") { filenames <- filename nfiles++ } diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go index f5d20cfabe76d5..8e651cdfef0e21 100644 --- a/src/cmd/internal/obj/loong64/a.out.go +++ b/src/cmd/internal/obj/loong64/a.out.go @@ -666,6 +666,10 @@ const ( ABSTRPICKW ABSTRPICKV + // 2.2.5.3 + AMOVWP + AMOVVP + // 2.2.5.4. Prefetch Instructions APRELD APRELDX diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go index 67b5f2fc809927..c629553d5598af 100644 --- a/src/cmd/internal/obj/loong64/anames.go +++ b/src/cmd/internal/obj/loong64/anames.go @@ -202,6 +202,8 @@ var Anames = []string{ "BSTRINSV", "BSTRPICKW", "BSTRPICKV", + "MOVWP", + "MOVVP", "PRELD", "PRELDX", "CRCWBW", diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go index 5d85585ebec11e..1b982f6c86fa53 100644 --- a/src/cmd/internal/obj/loong64/asm.go +++ b/src/cmd/internal/obj/loong64/asm.go @@ -212,6 +212,8 @@ var optab = []Optab{ {AMOVV, C_REG, C_NONE, C_NONE, C_TLS_LE, C_NONE, 53, 16, 0, 0}, {AMOVB, C_REG, C_NONE, C_NONE, C_TLS_LE, C_NONE, 53, 16, 0, 0}, {AMOVBU, C_REG, C_NONE, C_NONE, C_TLS_LE, C_NONE, 53, 16, 0, 0}, + {AMOVWP, C_REG, C_NONE, C_NONE, C_SOREG, C_NONE, 73, 4, 0, 0}, + {AMOVWP, C_REG, C_NONE, C_NONE, C_LOREG, C_NONE, 73, 4, 0, 0}, {AMOVW, C_LAUTO, C_NONE, C_NONE, C_REG, C_NONE, 36, 12, REGSP, 0}, {AMOVWU, C_LAUTO, C_NONE, C_NONE, C_REG, C_NONE, 36, 12, REGSP, 0}, @@ -233,6 +235,8 @@ var optab = []Optab{ {AMOVV, C_TLS_LE, C_NONE, C_NONE, C_REG, C_NONE, 54, 16, 0, 0}, {AMOVB, C_TLS_LE, C_NONE, C_NONE, C_REG, C_NONE, 54, 16, 0, 0}, {AMOVBU, C_TLS_LE, C_NONE, C_NONE, C_REG, C_NONE, 54, 16, 0, 0}, + {AMOVWP, C_SOREG, C_NONE, C_NONE, C_REG, C_NONE, 74, 4, 0, 0}, + {AMOVWP, C_LOREG, C_NONE, C_NONE, C_REG, C_NONE, 74, 4, 0, 0}, {AMOVW, C_SACON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGSP, 0}, {AMOVV, C_SACON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGSP, 0}, @@ -1437,6 +1441,9 @@ func buildop(ctxt *obj.Link) { case AMOVBU: opset(AMOVHU, r0) + case AMOVWP: + opset(AMOVVP, r0) + case AMUL: opset(AMULU, r0) opset(AMULH, r0) @@ -1964,6 +1971,10 @@ func OP_16IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { return op | (i&0xFFFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 } +func OP_14IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { + return op | (i&0x3FFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 +} + func OP_12IR_5I(op uint32, i1 uint32, r2 uint32, i2 
uint32) uint32 { return op | (i1&0xFFF)<<10 | (r2&0x1F)<<5 | (i2&0x1F)<<0 } @@ -2893,6 +2904,20 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) } o4 = OP_RRR(c.oprrr(p.As), uint32(REGTMP), uint32(r), uint32(p.To.Reg)) + + case 73: + v := c.regoff(&p.To) + if v&3 != 0 { + c.ctxt.Diag("%v: offset must be a multiple of 4.\n", p) + } + o1 = OP_14IRR(c.opirr(p.As), uint32(v>>2), uint32(p.To.Reg), uint32(p.From.Reg)) + + case 74: + v := c.regoff(&p.From) + if v&3 != 0 { + c.ctxt.Diag("%v: offset must be a multiple of 4.\n", p) + } + o1 = OP_14IRR(c.opirr(-p.As), uint32(v>>2), uint32(p.From.Reg), uint32(p.To.Reg)) } out[0] = o1 @@ -4026,6 +4051,10 @@ func (c *ctxt0) opirr(a obj.As) uint32 { return 0x0ad << 22 case AMOVD: return 0x0af << 22 + case AMOVVP: + return 0x27 << 24 // stptr.d + case AMOVWP: + return 0x25 << 24 // stptr.w case -AMOVB: return 0x0a0 << 22 case -AMOVBU: @@ -4044,6 +4073,10 @@ func (c *ctxt0) opirr(a obj.As) uint32 { return 0x0ac << 22 case -AMOVD: return 0x0ae << 22 + case -AMOVVP: + return 0x26 << 24 // ldptr.d + case -AMOVWP: + return 0x24 << 24 // ldptr.w case -AVMOVQ: return 0x0b0 << 22 // vld case -AXVMOVQ: diff --git a/src/cmd/internal/obj/loong64/doc.go b/src/cmd/internal/obj/loong64/doc.go index 64bb41ae5a2219..6c8f2618a2cb73 100644 --- a/src/cmd/internal/obj/loong64/doc.go +++ b/src/cmd/internal/obj/loong64/doc.go @@ -289,6 +289,34 @@ Note: In the following sections 3.1 to 3.6, "ui4" (4-bit unsigned int immediate) Go assembly | instruction Encoding ALSLV $4, r4, r5, R6 | 002d9486 + +5. Notes on special memory access instructions + Instruction format: + MOVWP offset(Rj), Rd + MOVVP offset(Rj), Rd + MOVWP Rd, offset(Rj) + MOVVP Rd, offset(Rj) + + Mapping between Go and platform assembly: + Go assembly | platform assembly + MOVWP offset(Rj), Rd | ldptr.w rd, rj, si14 + MOVVP offset(Rj), Rd | ldptr.d rd, rj, si14 + MOVWP Rd, offset(Rj) | stptr.w rd, rj, si14 + MOVVP Rd, offset(Rj) | stptr.d rd, rj, si14 + + Note: In Go assembly, for ease of understanding, offset is a 16-bit immediate number representing + the actual address offset, but in platform assembly it needs a 14-bit immediate number: + si14 = offset>>2 + + The addressing calculation for the above instructions involves logically left-shifting the 14-bit + immediate number si14 by 2 bits, then sign-extending it, and finally adding it to the value in the + general-purpose register rj to form the memory address.
+ + For example: + + Go assembly | platform assembly + MOVWP 8(R4), R5 | ldptr.w r5, r4, $2 + */ package loong64 diff --git a/src/encoding/json/decode.go b/src/encoding/json/decode.go index 70885a517e1876..fc29296c0f464f 100644 --- a/src/encoding/json/decode.go +++ b/src/encoding/json/decode.go @@ -1214,10 +1214,6 @@ func unquoteBytes(s []byte) (t []byte, ok bool) { if c == '\\' || c == '"' || c < ' ' { break } - if c < utf8.RuneSelf { - r++ - continue - } rr, size := utf8.DecodeRune(s[r:]) if rr == utf8.RuneError && size == 1 { break diff --git a/src/fmt/format.go b/src/fmt/format.go index 90e18cd696375f..334a94e2983e63 100644 --- a/src/fmt/format.go +++ b/src/fmt/format.go @@ -346,10 +346,7 @@ func (f *fmt) truncate(b []byte) []byte { if n < 0 { return b[:i] } - wid := 1 - if b[i] >= utf8.RuneSelf { - _, wid = utf8.DecodeRune(b[i:]) - } + _, wid := utf8.DecodeRune(b[i:]) i += wid } } diff --git a/src/fmt/print.go b/src/fmt/print.go index 155218046f47ce..01cfa1a1c7d7b4 100644 --- a/src/fmt/print.go +++ b/src/fmt/print.go @@ -1145,10 +1145,7 @@ formatLoop: break } - verb, size := rune(format[i]), 1 - if verb >= utf8.RuneSelf { - verb, size = utf8.DecodeRuneInString(format[i:]) - } + verb, size := utf8.DecodeRuneInString(format[i:]) i += size switch { diff --git a/src/go/doc/comment_test.go b/src/go/doc/comment_test.go index 004ae9d13d6de6..0e7de3eb78f38f 100644 --- a/src/go/doc/comment_test.go +++ b/src/go/doc/comment_test.go @@ -24,12 +24,12 @@ func TestComment(t *testing.T) { pkg := New(pkgs["pkgdoc"], "testdata/pkgdoc", 0) var ( - input = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link. [rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods.\n" - wantHTML = `
<p><a href="#T">T</a> and <a href="#U">U</a> are types, and <a href="#T.M">T.M</a> is a method, but [V] is a broken link. <a href="/math/rand#Int">rand.Int</a> and <a href="/crypto/rand#Reader">crand.Reader</a> are things. <a href="#G.M1">G.M1</a> and <a href="#G.M2">G.M2</a> are generic methods.` + "\n" - wantOldHTML = "<p>
[T] and [U] are types, and [T.M] is a method, but [V] is a broken link. [rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods.\n" - wantMarkdown = "[T](#T) and [U](#U) are types, and [T.M](#T.M) is a method, but \\[V] is a broken link. [rand.Int](/math/rand#Int) and [crand.Reader](/crypto/rand#Reader) are things. [G.M1](#G.M1) and [G.M2](#G.M2) are generic methods.\n" - wantText = "T and U are types, and T.M is a method, but [V] is a broken link. rand.Int and\ncrand.Reader are things. G.M1 and G.M2 are generic methods.\n" - wantOldText = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link.\n[rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods.\n" + input = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link. [rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods. [I.F] is an interface method and [I.V] is a broken link.\n" + wantHTML = `
<p><a href="#T">T</a> and <a href="#U">U</a> are types, and <a href="#T.M">T.M</a> is a method, but [V] is a broken link. <a href="/math/rand#Int">rand.Int</a> and <a href="/crypto/rand#Reader">crand.Reader</a> are things. <a href="#G.M1">G.M1</a> and <a href="#G.M2">G.M2</a> are generic methods. <a href="#I.F">I.F</a> is an interface method and [I.V] is a broken link.` + "\n" + wantOldHTML = "<p>
[T] and [U] are types, and [T.M] is a method, but [V] is a broken link. [rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods. [I.F] is an interface method and [I.V] is a broken link.\n" + wantMarkdown = "[T](#T) and [U](#U) are types, and [T.M](#T.M) is a method, but \\[V] is a broken link. [rand.Int](/math/rand#Int) and [crand.Reader](/crypto/rand#Reader) are things. [G.M1](#G.M1) and [G.M2](#G.M2) are generic methods. [I.F](#I.F) is an interface method and \\[I.V] is a broken link.\n" + wantText = "T and U are types, and T.M is a method, but [V] is a broken link. rand.Int and\ncrand.Reader are things. G.M1 and G.M2 are generic methods. I.F is an interface\nmethod and [I.V] is a broken link.\n" + wantOldText = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link.\n[rand.Int] and [crand.Reader] are things. [G.M1] and [G.M2] are generic methods.\n[I.F] is an interface method and [I.V] is a broken link.\n" wantSynopsis = "T and U are types, and T.M is a method, but [V] is a broken link." wantOldSynopsis = "[T] and [U] are types, and [T.M] is a method, but [V] is a broken link." ) diff --git a/src/go/doc/doc.go b/src/go/doc/doc.go index f7e3c1bad8207b..0c23f1a46c87fd 100644 --- a/src/go/doc/doc.go +++ b/src/go/doc/doc.go @@ -167,6 +167,7 @@ func (p *Package) collectTypes(types []*Type) { p.collectValues(t.Vars) p.collectFuncs(t.Funcs) p.collectFuncs(t.Methods) + p.collectInterfaceMethods(t) } } @@ -184,6 +185,33 @@ func (p *Package) collectFuncs(funcs []*Func) { } } +// collectInterfaceMethods adds methods of interface types within t to p.syms. +// Note that t.Methods will contain methods of non-interface types, but not interface types. +// Adding interface methods to t.Methods might make sense, but would cause us to +// include those methods in the documentation index. Adding interface methods to p.syms +// here allows us to linkify references like [io.Reader.Read] without making any other +// changes to the documentation formatting at this time. +// +// If we do start adding interface methods to t.Methods in the future, +// collectInterfaceMethods can be dropped as redundant with collectFuncs(t.Methods). +func (p *Package) collectInterfaceMethods(t *Type) { + for _, s := range t.Decl.Specs { + spec, ok := s.(*ast.TypeSpec) + if !ok { + continue + } + list, isStruct := fields(spec.Type) + if isStruct { + continue + } + for _, field := range list { + for _, name := range field.Names { + p.syms[t.Name+"."+name.Name] = true + } + } + } +} + // NewFromFiles computes documentation for a package. 
// // The package is specified by a list of *ast.Files and corresponding diff --git a/src/go/doc/testdata/pkgdoc/doc.go b/src/go/doc/testdata/pkgdoc/doc.go index 3f822c75546c63..d542dc2cdd0cb6 100644 --- a/src/go/doc/testdata/pkgdoc/doc.go +++ b/src/go/doc/testdata/pkgdoc/doc.go @@ -20,5 +20,9 @@ var _ = crand.Reader type G[T any] struct{ x T } -func (g G[T]) M1() {} +func (g G[T]) M1() {} func (g *G[T]) M2() {} + +type I interface { + F() +} diff --git a/src/internal/runtime/atomic/atomic_loong64.s b/src/internal/runtime/atomic/atomic_loong64.s index 95d3e2bdab8c86..4215af24febaa2 100644 --- a/src/internal/runtime/atomic/atomic_loong64.s +++ b/src/internal/runtime/atomic/atomic_loong64.s @@ -19,7 +19,7 @@ TEXT ·Cas(SB), NOSPLIT, $0-17 MOVW new+12(FP), R6 MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLAMCAS(SB), R8 - BEQ R8, cas_again + BEQ R8, ll_sc MOVV R5, R7 // backup old value AMCASDBW R6, (R4), R5 BNE R7, R5, cas_fail0 @@ -30,6 +30,7 @@ cas_fail0: MOVB R0, ret+16(FP) RET +ll_sc: // Implemented using the ll-sc instruction pair DBAR $0x14 // LoadAcquire barrier cas_again: @@ -60,7 +61,7 @@ TEXT ·Cas64(SB), NOSPLIT, $0-25 MOVV new+16(FP), R6 MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLAMCAS(SB), R8 - BEQ R8, cas64_again + BEQ R8, ll_sc_64 MOVV R5, R7 // backup old value AMCASDBV R6, (R4), R5 BNE R7, R5, cas64_fail0 @@ -71,6 +72,7 @@ cas64_fail0: MOVB R0, ret+24(FP) RET +ll_sc_64: // Implemented using the ll-sc instruction pair DBAR $0x14 cas64_again: diff --git a/src/math/modf.go b/src/math/modf.go index ab73e2dc36831e..12630958e969b7 100644 --- a/src/math/modf.go +++ b/src/math/modf.go @@ -11,8 +11,8 @@ package math // // Modf(±Inf) = ±Inf, NaN // Modf(NaN) = NaN, NaN -func Modf(f float64) (int float64, frac float64) { - int = Trunc(f) - frac = Copysign(f-int, f) +func Modf(f float64) (integer float64, fractional float64) { + integer = Trunc(f) + fractional = Copysign(f-integer, f) return } diff --git a/src/net/http/server.go b/src/net/http/server.go index cf0bd0a91d7624..6fdcd51c0a6777 100644 --- a/src/net/http/server.go +++ b/src/net/http/server.go @@ -2759,9 +2759,12 @@ func (mux *ServeMux) matchOrRedirect(host, method, path string, u *url.URL) (_ * defer mux.mu.RUnlock() n, matches := mux.tree.match(host, method, path) - // If we have an exact match, or we were asked not to try trailing-slash redirection, - // or the URL already has a trailing slash, then we're done. - if !exactMatch(n, path) && u != nil && !strings.HasSuffix(path, "/") { + // We can terminate here if any of the following is true: + // - We have an exact match already. + // - We were asked not to try trailing slash redirection. + // - The path already has a trailing slash. + // - The path is an empty string. + if !exactMatch(n, path) && u != nil && !strings.HasSuffix(path, "/") && path != "" { // If there is an exact match with a trailing slash, then redirect.
path += "/" n2, _ := mux.tree.match(host, method, path) diff --git a/src/net/http/server_test.go b/src/net/http/server_test.go index f4aafc853bd5d6..832f9688b63d9c 100644 --- a/src/net/http/server_test.go +++ b/src/net/http/server_test.go @@ -97,6 +97,7 @@ func TestFindHandler(t *testing.T) { {"GET", "/foo/x", "&http.handler{i:2}"}, {"GET", "/bar/x", "&http.handler{i:4}"}, {"GET", "/bar", `&http.redirectHandler{url:"/bar/", code:301}`}, + {"CONNECT", "", "(http.HandlerFunc)(.*)"}, {"CONNECT", "/", "&http.handler{i:1}"}, {"CONNECT", "//", "&http.handler{i:1}"}, {"CONNECT", "//foo", "&http.handler{i:5}"}, @@ -112,7 +113,7 @@ func TestFindHandler(t *testing.T) { r.URL = &url.URL{Path: test.path} gotH, _, _, _ := mux.findHandler(&r) got := fmt.Sprintf("%#v", gotH) - if got != test.wantHandler { + if !regexp.MustCompile(test.wantHandler).MatchString(got) { t.Errorf("%s %q: got %q, want %q", test.method, test.path, got, test.wantHandler) } } diff --git a/src/net/netip/export_test.go b/src/net/netip/export_test.go index b2fae1aa47eedc..777a76a6b26401 100644 --- a/src/net/netip/export_test.go +++ b/src/net/netip/export_test.go @@ -34,5 +34,3 @@ var TestAppendToMarshal = testAppendToMarshal func (a Addr) IsZero() bool { return a.isZero() } func (p Prefix) IsZero() bool { return p.isZero() } - -func (p Prefix) Compare(p2 Prefix) int { return p.compare(p2) } diff --git a/src/net/netip/netip.go b/src/net/netip/netip.go index 35abfd3241bc13..b1b15b47287de2 100644 --- a/src/net/netip/netip.go +++ b/src/net/netip/netip.go @@ -1330,21 +1330,23 @@ func (p Prefix) isZero() bool { return p == Prefix{} } // IsSingleIP reports whether p contains exactly one IP. func (p Prefix) IsSingleIP() bool { return p.IsValid() && p.Bits() == p.ip.BitLen() } -// compare returns an integer comparing two prefixes. +// Compare returns an integer comparing two prefixes. // The result will be 0 if p == p2, -1 if p < p2, and +1 if p > p2. // Prefixes sort first by validity (invalid before valid), then -// address family (IPv4 before IPv6), then prefix length, then -// address. -// -// Unexported for Go 1.22 because we may want to compare by p.Addr first. -// See post-acceptance discussion on go.dev/issue/61642. -func (p Prefix) compare(p2 Prefix) int { - if c := cmp.Compare(p.Addr().BitLen(), p2.Addr().BitLen()); c != 0 { +// address family (IPv4 before IPv6), then masked prefix address, then +// prefix length, then unmasked address. +func (p Prefix) Compare(p2 Prefix) int { + // Aside from sorting based on the masked address, this use of + // Addr.Compare also enforces the valid vs. invalid and address + // family ordering for the prefix. 
+ if c := p.Masked().Addr().Compare(p2.Masked().Addr()); c != 0 { return c } + if c := cmp.Compare(p.Bits(), p2.Bits()); c != 0 { return c } + return p.Addr().Compare(p2.Addr()) } diff --git a/src/net/netip/netip_test.go b/src/net/netip/netip_test.go index ea03f9a9e72473..71e39021ca8969 100644 --- a/src/net/netip/netip_test.go +++ b/src/net/netip/netip_test.go @@ -1123,6 +1123,9 @@ func TestPrefixCompare(t *testing.T) { {mustPrefix("fe80::/48"), mustPrefix("fe80::/64"), -1}, {mustPrefix("1.2.3.0/24"), mustPrefix("fe80::/8"), -1}, + + {mustPrefix("1.2.3.0/24"), mustPrefix("1.2.3.4/24"), -1}, + {mustPrefix("1.2.3.0/24"), mustPrefix("1.2.3.0/28"), -1}, } for _, tt := range tests { got := tt.a.Compare(tt.b) @@ -1148,10 +1151,70 @@ func TestPrefixCompare(t *testing.T) { Prefix{}, mustPrefix("fe80::/48"), mustPrefix("1.2.0.0/24"), + mustPrefix("1.2.3.4/24"), + mustPrefix("1.2.3.0/28"), } slices.SortFunc(values, Prefix.Compare) got := fmt.Sprintf("%s", values) - want := `[invalid Prefix 1.2.0.0/16 1.2.0.0/24 1.2.3.0/24 fe80::/48 fe80::/64 fe90::/64]` + want := `[invalid Prefix 1.2.0.0/16 1.2.0.0/24 1.2.3.0/24 1.2.3.4/24 1.2.3.0/28 fe80::/48 fe80::/64 fe90::/64]` + if got != want { + t.Errorf("unexpected sort\n got: %s\nwant: %s\n", got, want) + } + + // Lists from + // https://www.iana.org/assignments/iana-ipv4-special-registry/iana-ipv4-special-registry.xhtml and + // https://www.iana.org/assignments/ipv6-address-space/ipv6-address-space.xhtml, + // to verify that the sort order matches IANA's conventional + // ordering. + values = []Prefix{ + mustPrefix("0.0.0.0/8"), + mustPrefix("127.0.0.0/8"), + mustPrefix("10.0.0.0/8"), + mustPrefix("203.0.113.0/24"), + mustPrefix("169.254.0.0/16"), + mustPrefix("192.0.0.0/24"), + mustPrefix("240.0.0.0/4"), + mustPrefix("192.0.2.0/24"), + mustPrefix("192.0.0.170/32"), + mustPrefix("198.18.0.0/15"), + mustPrefix("192.0.0.8/32"), + mustPrefix("0.0.0.0/32"), + mustPrefix("192.0.0.9/32"), + mustPrefix("198.51.100.0/24"), + mustPrefix("192.168.0.0/16"), + mustPrefix("192.0.0.10/32"), + mustPrefix("192.175.48.0/24"), + mustPrefix("192.52.193.0/24"), + mustPrefix("100.64.0.0/10"), + mustPrefix("255.255.255.255/32"), + mustPrefix("192.31.196.0/24"), + mustPrefix("172.16.0.0/12"), + mustPrefix("192.0.0.0/29"), + mustPrefix("192.88.99.0/24"), + mustPrefix("fec0::/10"), + mustPrefix("6000::/3"), + mustPrefix("fe00::/9"), + mustPrefix("8000::/3"), + mustPrefix("0000::/8"), + mustPrefix("0400::/6"), + mustPrefix("f800::/6"), + mustPrefix("e000::/4"), + mustPrefix("ff00::/8"), + mustPrefix("a000::/3"), + mustPrefix("fc00::/7"), + mustPrefix("1000::/4"), + mustPrefix("0800::/5"), + mustPrefix("4000::/3"), + mustPrefix("0100::/8"), + mustPrefix("c000::/3"), + mustPrefix("fe80::/10"), + mustPrefix("0200::/7"), + mustPrefix("f000::/5"), + mustPrefix("2000::/3"), + } + slices.SortFunc(values, func(a, b Prefix) int { return a.Compare(b) }) + got = fmt.Sprintf("%s", values) + want = `[0.0.0.0/8 0.0.0.0/32 10.0.0.0/8 100.64.0.0/10 127.0.0.0/8 169.254.0.0/16 172.16.0.0/12 192.0.0.0/24 192.0.0.0/29 192.0.0.8/32 192.0.0.9/32 192.0.0.10/32 192.0.0.170/32 192.0.2.0/24 192.31.196.0/24 192.52.193.0/24 192.88.99.0/24 192.168.0.0/16 192.175.48.0/24 198.18.0.0/15 198.51.100.0/24 203.0.113.0/24 240.0.0.0/4 255.255.255.255/32 ::/8 100::/8 200::/7 400::/6 800::/5 1000::/4 2000::/3 4000::/3 6000::/3 8000::/3 a000::/3 c000::/3 e000::/4 f000::/5 f800::/6 fc00::/7 fe00::/9 fe80::/10 fec0::/10 ff00::/8]` if got != want { t.Errorf("unexpected sort\n got: %s\nwant: %s\n", got, want) } diff --git 
diff --git a/src/net/url/url.go b/src/net/url/url.go
index 2a57659460373d..7021f343972ea2 100644
--- a/src/net/url/url.go
+++ b/src/net/url/url.go
@@ -661,6 +661,13 @@ func parseHost(host string) (string, error) {
 			return host1 + host2 + host3, nil
 		}
 	} else if i := strings.LastIndex(host, ":"); i != -1 {
+		if j := strings.LastIndex(host[:i], ":"); j != -1 { // multiple colons
+			if k := strings.LastIndex(host[:j], ":"); k == -1 { // only one other colon
+				if port := host[j:i]; validOptionalPort(port) { // see issue #75223
+					return "", fmt.Errorf("a colon after port %q is not allowed", port)
+				}
+			}
+		}
 		colonPort := host[i:]
 		if !validOptionalPort(colonPort) {
 			return "", fmt.Errorf("invalid port %q after host", colonPort)
diff --git a/src/net/url/url_test.go b/src/net/url/url_test.go
index 16e08b63c6d098..6c16f8fc057933 100644
--- a/src/net/url/url_test.go
+++ b/src/net/url/url_test.go
@@ -707,6 +707,13 @@ var parseRequestURLTests = []struct {
 	// RFC 6874.
 	{"http://[fe80::1%en0]/", false},
 	{"http://[fe80::1%en0]:8080/", false},
+
+	{"http://x:x:", true},             // malformed IPv6 but still accepted
+	{"http://x::", false},             // a colon after empty port is not allowed
+	{"http://x:1:", false},            // a colon after the port is not allowed
+	{"http://x:12:", false},           // a colon after the port is not allowed
+	{"http://x:123:", false},          // a colon after the port is not allowed
+	{"http://127.0.0.1:8080:", false}, // a colon after the port is not allowed
 }
 
 func TestParseRequestURI(t *testing.T) {
@@ -1643,6 +1650,13 @@ func TestParseErrors(t *testing.T) {
 		{"cache_object:foo", true},
 		{"cache_object:foo/bar", true},
 		{"cache_object/:foo/bar", false},
+
+		{"http://x:x:", false},           // malformed IPv6 but still accepted
+		{"http://x::", true},             // a colon after empty port is not allowed
+		{"http://x:1:", true},            // a colon after the port is not allowed
+		{"http://x:12:", true},           // a colon after the port is not allowed
+		{"http://x:123:", true},          // a colon after the port is not allowed
+		{"http://127.0.0.1:8080:", true}, // a colon after the port is not allowed
 	}
 	for _, tt := range tests {
 		u, err := Parse(tt.in)
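To make the parseHost change concrete, here is a minimal sketch of the new behavior, assuming a toolchain that includes this patch; note that url.Parse wraps the underlying message in a *url.Error, so only the cause is shown in the comments:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// A colon following an otherwise valid port is now rejected,
	// while a normal authority still parses.
	for _, in := range []string{
		"http://127.0.0.1:8080:", // cause: a colon after port ":8080" is not allowed
		"http://x:1:",            // cause: a colon after port ":1" is not allowed
		"http://x:8080/",         // ok: err is nil
	} {
		_, err := url.Parse(in)
		fmt.Printf("%-24s err: %v\n", in, err)
	}
}
```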
diff --git a/src/regexp/regexp.go b/src/regexp/regexp.go
index 253415fb6a44c6..66c73693995a42 100644
--- a/src/regexp/regexp.go
+++ b/src/regexp/regexp.go
@@ -384,10 +384,6 @@ type inputString struct {
 
 func (i *inputString) step(pos int) (rune, int) {
 	if pos < len(i.str) {
-		c := i.str[pos]
-		if c < utf8.RuneSelf {
-			return rune(c), 1
-		}
 		return utf8.DecodeRuneInString(i.str[pos:])
 	}
 	return endOfText, 0
@@ -409,17 +405,11 @@ func (i *inputString) context(pos int) lazyFlag {
 	r1, r2 := endOfText, endOfText
 	// 0 < pos && pos <= len(i.str)
 	if uint(pos-1) < uint(len(i.str)) {
-		r1 = rune(i.str[pos-1])
-		if r1 >= utf8.RuneSelf {
-			r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
-		}
+		r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
 	}
 	// 0 <= pos && pos < len(i.str)
 	if uint(pos) < uint(len(i.str)) {
-		r2 = rune(i.str[pos])
-		if r2 >= utf8.RuneSelf {
-			r2, _ = utf8.DecodeRuneInString(i.str[pos:])
-		}
+		r2, _ = utf8.DecodeRuneInString(i.str[pos:])
 	}
 	return newLazyFlag(r1, r2)
 }
@@ -431,10 +421,6 @@ type inputBytes struct {
 
 func (i *inputBytes) step(pos int) (rune, int) {
 	if pos < len(i.str) {
-		c := i.str[pos]
-		if c < utf8.RuneSelf {
-			return rune(c), 1
-		}
 		return utf8.DecodeRune(i.str[pos:])
 	}
 	return endOfText, 0
@@ -456,17 +442,11 @@ func (i *inputBytes) context(pos int) lazyFlag {
 	r1, r2 := endOfText, endOfText
 	// 0 < pos && pos <= len(i.str)
 	if uint(pos-1) < uint(len(i.str)) {
-		r1 = rune(i.str[pos-1])
-		if r1 >= utf8.RuneSelf {
-			r1, _ = utf8.DecodeLastRune(i.str[:pos])
-		}
+		r1, _ = utf8.DecodeLastRune(i.str[:pos])
 	}
 	// 0 <= pos && pos < len(i.str)
 	if uint(pos) < uint(len(i.str)) {
-		r2 = rune(i.str[pos])
-		if r2 >= utf8.RuneSelf {
-			r2, _ = utf8.DecodeRune(i.str[pos:])
-		}
+		r2, _ = utf8.DecodeRune(i.str[pos:])
 	}
 	return newLazyFlag(r1, r2)
 }
diff --git a/src/runtime/string.go b/src/runtime/string.go
index 44d586bc53ee7d..3726d9235bfa4b 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -59,6 +59,9 @@ func concatstrings(buf *tmpBuf, a []string) string {
 	return s
 }
 
+// concatstring2 helps make the callsite smaller (compared to concatstrings),
+// and we think this is currently more valuable than omitting one call in the
+// chain; the same goes for concatstring{3,4,5}.
 func concatstring2(buf *tmpBuf, a0, a1 string) string {
 	return concatstrings(buf, []string{a0, a1})
 }
@@ -108,6 +111,9 @@ func concatbytes(buf *tmpBuf, a []string) []byte {
 	return b
 }
 
+// concatbyte2 helps make the callsite smaller (compared to concatbytes),
+// and we think this is currently more valuable than omitting one call in
+// the chain; the same goes for concatbyte{3,4,5}.
 func concatbyte2(buf *tmpBuf, a0, a1 string) []byte {
 	return concatbytes(buf, []string{a0, a1})
 }
diff --git a/src/runtime/tagptr_64bit.go b/src/runtime/tagptr_64bit.go
index 3d79332e2dcaff..76733cc1d64630 100644
--- a/src/runtime/tagptr_64bit.go
+++ b/src/runtime/tagptr_64bit.go
@@ -22,10 +22,17 @@ const (
 	// On AMD64, virtual addresses are 48-bit (or 57-bit) sign-extended.
 	// Other archs are 48-bit zero-extended.
 	//
+	// We use one extra bit to placate systems which simulate amd64 binaries on
+	// an arm64 host. Allocated arm64 addresses could be as high as 1<<48-1,
+	// which would be invalid if we assumed 48-bit sign-extended addresses.
+	// See issue 69255.
+	// (Note that this does not help the other way around, simulating arm64
+	// on amd64, but we don't have that problem at the moment.)
+	//
 	// On s390x, virtual addresses are 64-bit. There's not much we
 	// can do about this, so we just hope that the kernel doesn't
 	// get to really high addresses and panic if it does.
-	defaultAddrBits = 48
+	defaultAddrBits = 48 + 1
 
 	// On AIX, 64-bit addresses are split into 36-bit segment number and 28-bit
 	// offset in segment. Segment numbers in the range 0x0A0000000-0x0AFFFFFFF(LSA)
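For intuition about the defaultAddrBits change: one extra address bit leaves 64 - 49 = 15 high bits of tag space, before counting the alignment bits the runtime reclaims elsewhere. The toy program below uses made-up names and a simplified zero-extended layout, not the runtime's actual packing, just to show the arithmetic:

```go
package main

import "fmt"

// Hypothetical stand-ins for the runtime's constants: 48 address
// bits plus the one guard bit motivated by issue 69255.
const (
	addrBits = 48 + 1
	tagBits  = 64 - addrBits // 15 bits remain for the tag
)

// pack stores a zero-extended address in the low addrBits and a
// small counter tag in the high tagBits.
func pack(addr, tag uint64) uint64 {
	return addr<<tagBits>>tagBits | tag<<addrBits
}

func addrOf(t uint64) uint64 { return t << tagBits >> tagBits }
func tagOf(t uint64) uint64  { return t >> addrBits }

func main() {
	t := pack(0x0000_ffff_dead_beef, 1234)
	fmt.Printf("addr=%#x tag=%d (max tag %d)\n",
		addrOf(t), tagOf(t), uint64(1)<<tagBits-1)
	// addr=0xffffdeadbeef tag=1234 (max tag 32767)
}
```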
diff --git a/src/strconv/quote.go b/src/strconv/quote.go
index 99c292a8ed5884..da2325647d3817 100644
--- a/src/strconv/quote.go
+++ b/src/strconv/quote.go
@@ -37,12 +37,8 @@ func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly b
 		buf = nBuf
 	}
 	buf = append(buf, quote)
-	for width := 0; len(s) > 0; s = s[width:] {
-		r := rune(s[0])
-		width = 1
-		if r >= utf8.RuneSelf {
-			r, width = utf8.DecodeRuneInString(s)
-		}
+	for r, width := rune(0), 0; len(s) > 0; s = s[width:] {
+		r, width = utf8.DecodeRuneInString(s)
 		if width == 1 && r == utf8.RuneError {
 			buf = append(buf, `\x`...)
 			buf = append(buf, lowerhex[s[0]>>4])
diff --git a/src/strings/iter.go b/src/strings/iter.go
index 69fe031739628c..84e763a8343df4 100644
--- a/src/strings/iter.go
+++ b/src/strings/iter.go
@@ -117,11 +117,7 @@ func FieldsFuncSeq(s string, f func(rune) bool) iter.Seq[string] {
 	return func(yield func(string) bool) {
 		start := -1
 		for i := 0; i < len(s); {
-			size := 1
-			r := rune(s[i])
-			if r >= utf8.RuneSelf {
-				r, size = utf8.DecodeRuneInString(s[i:])
-			}
+			r, size := utf8.DecodeRuneInString(s[i:])
 			if f(r) {
 				if start >= 0 {
 					if !yield(s[start:i]) {
diff --git a/src/strings/reader.go b/src/strings/reader.go
index 497ffb7a39c635..f12c9b18b36d43 100644
--- a/src/strings/reader.go
+++ b/src/strings/reader.go
@@ -90,10 +90,6 @@ func (r *Reader) ReadRune() (ch rune, size int, err error) {
 		return 0, 0, io.EOF
 	}
 	r.prevRune = int(r.i)
-	if c := r.s[r.i]; c < utf8.RuneSelf {
-		r.i++
-		return rune(c), 1, nil
-	}
 	ch, size = utf8.DecodeRuneInString(r.s[r.i:])
 	r.i += int64(size)
 	return
diff --git a/src/strings/strings.go b/src/strings/strings.go
index 74007977d911f0..3cc3e79f982248 100644
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@@ -896,7 +896,7 @@ func TrimLeftFunc(s string, f func(rune) bool) string {
 // Unicode code points c satisfying f(c) removed.
 func TrimRightFunc(s string, f func(rune) bool) string {
 	i := lastIndexFunc(s, f, false)
-	if i >= 0 && s[i] >= utf8.RuneSelf {
+	if i >= 0 {
 		_, wid := utf8.DecodeRuneInString(s[i:])
 		i += wid
 	} else {
@@ -1028,10 +1028,7 @@ func trimLeftASCII(s string, as *asciiSet) string {
 
 func trimLeftUnicode(s, cutset string) string {
 	for len(s) > 0 {
-		r, n := rune(s[0]), 1
-		if r >= utf8.RuneSelf {
-			r, n = utf8.DecodeRuneInString(s)
-		}
+		r, n := utf8.DecodeRuneInString(s)
 		if !ContainsRune(cutset, r) {
 			break
 		}
@@ -1224,13 +1221,8 @@ hasUnicode:
 		}
 		// Extract first rune from second string.
-		var tr rune
-		if t[0] < utf8.RuneSelf {
-			tr, t = rune(t[0]), t[1:]
-		} else {
-			r, size := utf8.DecodeRuneInString(t)
-			tr, t = r, t[size:]
-		}
+		tr, size := utf8.DecodeRuneInString(t)
+		t = t[size:]
 
 		// If they match, keep going; if not, return false.
diff --git a/src/unicode/utf8/utf8.go b/src/unicode/utf8/utf8.go
index 01cad1cc81f880..68283341d92ace 100644
--- a/src/unicode/utf8/utf8.go
+++ b/src/unicode/utf8/utf8.go
@@ -155,6 +155,20 @@ func FullRuneInString(s string) bool {
 // out of range, or is not the shortest possible UTF-8 encoding for the
 // value. No other validation is performed.
 func DecodeRune(p []byte) (r rune, size int) {
+	// Inlineable fast path for ASCII characters; see #48195.
+	// This implementation is weird but effective at rendering the
+	// function inlineable.
+	for _, b := range p {
+		if b < RuneSelf {
+			return rune(b), 1
+		}
+		break
+	}
+	r, size = decodeRuneSlow(p)
+	return
+}
+
+func decodeRuneSlow(p []byte) (r rune, size int) {
 	n := len(p)
 	if n < 1 {
 		return RuneError, 0
@@ -203,6 +217,18 @@ func DecodeRune(p []byte) (r rune, size int) {
 // out of range, or is not the shortest possible UTF-8 encoding for the
 // value. No other validation is performed.
 func DecodeRuneInString(s string) (r rune, size int) {
+	// Inlineable fast path for ASCII characters; see #48195.
+	// This implementation is a bit weird but effective at rendering the
+	// function inlineable.
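+	// (An early return around a call to the slow path would read more
+	// naturally, but this shape apparently scores better with the
+	// inliner's cost model; see the discussion on #48195.)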
+ if s != "" && s[0] < RuneSelf { + return rune(s[0]), 1 + } else { + r, size = decodeRuneInStringSlow(s) + } + return +} + +func decodeRuneInStringSlow(s string) (rune, int) { n := len(s) if n < 1 { return RuneError, 0 diff --git a/src/unicode/utf8/utf8_test.go b/src/unicode/utf8/utf8_test.go index aece0fab731f41..bf4f074ffd0f5f 100644 --- a/src/unicode/utf8/utf8_test.go +++ b/src/unicode/utf8/utf8_test.go @@ -747,18 +747,37 @@ func BenchmarkAppendInvalidRuneNegative(b *testing.B) { func BenchmarkDecodeASCIIRune(b *testing.B) { a := []byte{'a'} - for i := 0; i < b.N; i++ { - DecodeRune(a) + for range b.N { + runeSink, sizeSink = DecodeRune(a) } } func BenchmarkDecodeJapaneseRune(b *testing.B) { nihon := []byte("本") - for i := 0; i < b.N; i++ { - DecodeRune(nihon) + for range b.N { + runeSink, sizeSink = DecodeRune(nihon) + } +} + +func BenchmarkDecodeASCIIRuneInString(b *testing.B) { + a := "a" + for range b.N { + runeSink, sizeSink = DecodeRuneInString(a) } } +func BenchmarkDecodeJapaneseRuneInString(b *testing.B) { + nihon := "本" + for range b.N { + runeSink, sizeSink = DecodeRuneInString(nihon) + } +} + +var ( + runeSink rune + sizeSink int +) + // boolSink is used to reference the return value of benchmarked // functions to avoid dead code elimination. var boolSink bool