patch-2.3.23 linux/arch/arm/lib/string.S

Next file: linux/arch/arm/lib/system.S
Previous file: linux/arch/arm/lib/memcpy.S
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.3.22/linux/arch/arm/lib/string.S linux/arch/arm/lib/string.S
@@ -1,69 +1,112 @@
 /*
  * linux/arch/arm/lib/string.S
  *
- * Copyright (C) 1995-1998 Russell King
+ * Copyright (C) 1995-1999 Russell King
  *
- * This is commonly used to clear the frame buffer and the frame
- * backing buffer.  As such, it will be rarely called with r2 < 32.
+ * ASM optimised string functions
  *
- * Optimisations by Matthew Wilcox
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include "constants.h"
+
 		.text
-# Prototype: char *strrchr(const char *s,char c);
 
 /*
  * Prototype: void memzero(void *d, size_t n)
  */
-ENTRY(memzero)
-		mov	r2, r1
-		mov	r1, #0
+1:		@ 4 <= r1, ip = r0 & 3 != 0: zero 4 - ip bytes so that r0 becomes word aligned
+		cmp	ip, #2				@	1
+		strltb	r2, [r0], #1			@	1
+		strleb	r2, [r0], #1			@	1
+		strb	r2, [r0], #1			@	1
+		rsb	ip, ip, #4			@	1	@ ip = bytes just stored
+		sub	r1, r1, ip			@	1
+		cmp	r1, #3				@	1
+		bgt	2f				@	1	@ +8
+		b	4f				@	1	@ +9
+
+		.align	5
+/* In: r0 = d, r1 = n.  Zeroes n bytes starting at d; n = 0 stores nothing.  Uses r2, r3, ip. */
+ENTRY(__memzero)
+		mov	r2, #0				@	1
+		cmp	r1, #4				@	1
+		blt	4f				@	1	@ = 3
+
+		@ r1 >= 4
+		ands	ip, r0, #3			@	1
+		bne	1b				@	1	@ = 5
+
+2:		@ r1 >= 4 && (r0 & 3) = 0				@ = 5 or 11
+
+		str	lr, [sp, #-4]!			@	1	@ lr is used as a fourth zero register
+		mov	r3, #0				@	1
+		mov	ip, #0				@	1
+		mov	lr, #0				@	1
+
+		@ r1 >= 4; each pass zeroes 32 bytes			@ = 9 or 15
+
+3:		subs	r1, r1, #32			@	1
+		stmgeia	r0!, {r2, r3, ip, lr}		@	4
+		stmgeia	r0!, {r2, r3, ip, lr}		@	4
+		bgt	3b				@	1
+		LOADREGS(eqfd, sp!, {pc})		@	1/2
+
+		@ -31 <= r1 <= -1, bytes left = r1 + 32
+
+		cmp	r1, #-16			@	1
+		stmgeia	r0!, {r2, r3, ip, lr}		@	4
+		ldr	lr, [sp], #4			@	1	@ does not touch the flags
+		addlts	r1, r1, #16			@	1
+		RETINSTR(moveq,pc,lr)			@	1
+
+		@ -15 <= r1 <= -1, bytes left = r1 + 16
+
+		cmp	r1, #-8				@	1
+		stmgeia	r0!, {r2, r3}			@	2
+		addlts	r1, r1, #8			@	1
+		RETINSTR(moveq,pc,lr)			@	1
+
+		@ -7 <= r1 <= -1, bytes left = r1 + 8
+		cmp	r1, #-4				@	1
+		strge	r2, [r0], #4			@	1
+		addlt	r1, r1, #4			@	1	@ no word stored: keep those 4 in the count
+		adds	r1, r1, #4			@	1	@ r1 = bytes left, 0 <= r1 <= 3
+		RETINSTR(moveq,pc,lr)			@	1
+
+4:		@ 0 <= r1 <= 3 (0 only when called with n = 0)
+		cmp	r1, #2				@	1
+		strgtb	r2, [r0], #1			@	1
+		strgeb	r2, [r0], #1			@	1
+		teq	r1, #0				@	1
+		strneb	r2, [r0], #1			@	1	@ skipped for r1 = 0
+		RETINSTR(mov,pc,lr)			@	1
+
 /*
- * Prototype: void memsetl(unsigned long *d, unsigned long c, size_t n)
+ * StrongARM optimised copy_page routine
+ * now 1.72 bytes/cycle, was 1.60 bytes/cycle
+ * (50MHz bus -> 86MB/s)
  */
-ENTRY(memsetl)
-		teq	r2, #0
-		RETINSTR(moveq,pc,lr)
-		stmfd	sp!, {lr}
-		mov	lr, r1
-		mov	r3, r1
-		mov	ip, r1
-
-		@ r2 = {32 ... 4}
-
-1:		subs	r2, r2, #32
-		stmgeia	r0!, {r1, r3, ip, lr}
-		stmgeia	r0!, {r1, r3, ip, lr}
-		bgt	1b
-		LOADREGS(eqfd, sp!, {pc})
-
-		@ r2 can be {-4 ... -28}
 
-		cmp	r2, #-16
-		stmgeia	r0!, {r1, r3, ip, lr}
-		addlts	r2, r2, #16
-		LOADREGS(eqfd, sp!, {pc})
-
-		@ r2 can be {-4 ... -12}
-
-		cmp	r2, #-8
-		stmgeia	r0!, {r1, r3}
-		strne	r1, [r0]
-		LOADREGS(fd, sp!, {pc})
-
-		.global	__page_memcpy
-__page_memcpy:	stmfd	sp!, {r4, r5, lr}
-1:		subs	r2, r2, #4*8
-		ldmgeia	r1!, {r3, r4, r5, ip}
-		stmgeia	r0!, {r3, r4, r5, ip}
-		ldmgeia	r1!, {r3, r4, r5, ip}
-		stmgeia	r0!, {r3, r4, r5, ip}
-		bgt	1b
-		LOADREGS(fd, sp!, {r4, r5, pc})
-
-		.global	memset
-memset:		mov	r3, r0
+ENTRY(copy_page)		/* r0 = destination, r1 = source; copies PAGE_SZ/64 blocks of 64 bytes (a whole page if PAGE_SZ is a multiple of 64) */
+		stmfd	sp!, {r4, lr}			@	2	@ r4 and lr are reused as data registers below
+		mov	r2, #PAGE_SZ/64			@	1	@ r2 = number of 64-byte passes
+1:		ldmia	r1!, {r3, r4, ip, lr}		@	4	@ 16 bytes per ldm/stm pair, four pairs per pass
+		subs	r2, r2, #1			@	1	@ sets Z for the bne at the end of the pass
+		stmia	r0!, {r3, r4, ip, lr}		@	4
+		ldmia	r1!, {r3, r4, ip, lr}		@	4+1
+		stmia	r0!, {r3, r4, ip, lr}		@	4
+		ldmia	r1!, {r3, r4, ip, lr}		@	4+1
+		stmia	r0!, {r3, r4, ip, lr}		@	4
+		ldmia	r1!, {r3, r4, ip, lr}		@	4+1
+		stmia	r0!, {r3, r4, ip, lr}		@	4
+		bne	1b				@	1	@ loop until r2 reaches 0
+		LOADREGS(fd, sp!, {r4, pc})		@	3
+
+		.align	5
+ENTRY(memset)		/* needed for some versions of gcc */
+ENTRY(__memset)
+		mov	r3, r0
 		cmp	r2, #16
 		blt	6f
 		ands	ip, r3, #3
@@ -168,3 +211,309 @@
 2:		movne	r0, #0
 		subeq	r0, r0, #1
 		LOADREGS(fd, sp!, {pc})
+/* ENTER: ip = sp on entry; pushes r4-r9, fp, ip, lr, pc and sets fp = entry sp - 4. */
+/* EXIT / EXITEQ: pass r4-r9, fp, sp, pc to LOADREGS with base fp; EXITEQ uses the eq-prefixed form. */
+#define ENTER	\
+		mov	ip,sp	;\
+		stmfd	sp!,{r4-r9,fp,ip,lr,pc}	;\
+		sub	fp,ip,#4
+
+#define EXIT	\
+		LOADREGS(ea, fp, {r4 - r9, fp, sp, pc})
+
+#define EXITEQ	\
+		LOADREGS(eqea, fp, {r4 - r9, fp, sp, pc})
+
+/* memcpy/memmove: copy n (r2) bytes from 'from' (r1) to 'to' (r0); copies downwards from the end when from < to.
+ * Prototype: void memcpy(void *to,const void *from,unsigned long n);  (the forward path leaves r0 advanced, not restored)
+ * ARM3: can't use memcopy here!!!
+ */
+ENTRY(memcpy)
+ENTRY(memmove)
+		ENTER
+		cmp	r1, r0
+		bcc	19f				@ from < to (unsigned): copy downwards
+		subs	r2, r2, #4			@ r2 = n - 4
+		blt	6f				@ fewer than 4 bytes: byte copy only
+		ands	ip, r0, #3
+		bne	7f				@ destination not word aligned
+		ands	ip, r1, #3
+		bne	8f				@ source not word aligned
+		@ both pointers word aligned; r2 = bytes left - 4
+1:		subs	r2, r2, #8
+		blt	5f				@ fewer than 12 bytes left
+		subs	r2, r2, #0x14
+		blt	3f				@ fewer than 32 bytes left
+2:		ldmia	r1!,{r3 - r9, ip}		@ 32 bytes per pass
+		stmia	r0!,{r3 - r9, ip}
+		subs	r2, r2, #32
+		bge	2b
+		cmn	r2, #16
+		ldmgeia	r1!, {r3 - r6}			@ 16 more bytes if available
+		stmgeia	r0!, {r3 - r6}
+		subge	r2, r2, #0x10
+3:		adds	r2, r2, #0x14
+4:		ldmgeia	r1!, {r3 - r5}			@ 12 bytes per pass
+		stmgeia	r0!, {r3 - r5}
+		subges	r2, r2, #12
+		bge	4b
+5:		adds	r2, r2, #8			@ r2 = bytes left - 4 again
+		blt	6f
+		subs	r2, r2, #4
+		ldrlt	r3, [r1], #4			@ 4..7 bytes left: one word
+		strlt	r3, [r0], #4
+		ldmgeia	r1!, {r3, r4}			@ 8..11 bytes left: two words
+		stmgeia	r0!, {r3, r4}
+		subge	r2, r2, #4
+		@ byte tail: r2 + 4 = bytes left (0..3)
+6:		adds	r2, r2, #4
+		EXITEQ
+		cmp	r2, #2
+		ldrb	r3, [r1], #1
+		strb	r3, [r0], #1
+		ldrgeb	r3, [r1], #1
+		strgeb	r3, [r0], #1
+		ldrgtb	r3, [r1], #1
+		strgtb	r3, [r0], #1
+		EXIT
+		@ destination unaligned (ip = r0 & 3): copy 4 - ip bytes to align it
+7:		rsb	ip, ip, #4
+		cmp	ip, #2
+		ldrb	r3, [r1], #1
+		strb	r3, [r0], #1
+		ldrgeb	r3, [r1], #1
+		strgeb	r3, [r0], #1
+		ldrgtb	r3, [r1], #1
+		strgtb	r3, [r0], #1
+		subs	r2, r2, ip
+		blt	6b				@ fewer than 4 bytes left
+		ands	ip, r1, #3
+		beq	1b				@ source is now aligned as well
+		@ source not word aligned (ip = r1 & 3): load whole words and merge neighbours with shifts
+8:		bic	r1, r1, #3
+		ldr	r7, [r1], #4			@ r7 = word holding the first source bytes
+		cmp	ip, #2
+		bgt	15f				@ ip = 3
+		beq	11f				@ ip = 2, otherwise ip = 1 falls through
+		cmp	r2, #12
+		blt	10f
+		sub	r2, r2, #12
+9:		mov	r3, r7, lsr #8			@ 16 bytes per pass
+		ldmia	r1!, {r4 - r7}
+		orr	r3, r3, r4, lsl #24
+		mov	r4, r4, lsr #8
+		orr	r4, r4, r5, lsl #24
+		mov	r5, r5, lsr #8
+		orr	r5, r5, r6, lsl #24
+		mov	r6, r6, lsr #8
+		orr	r6, r6, r7, lsl #24
+		stmia	r0!, {r3 - r6}
+		subs	r2, r2, #16
+		bge	9b
+		adds	r2, r2, #12
+		blt	100f
+10:		mov	r3, r7, lsr #8			@ one word per pass
+		ldr	r7, [r1], #4
+		orr	r3, r3, r7, lsl #24
+		str	r3, [r0], #4
+		subs	r2, r2, #4
+		bge	10b
+100:		sub	r1, r1, #3			@ r1 back to the next unread source byte
+		b	6b
+		@ source offset 2: same scheme with 16-bit shifts
+11:		cmp	r2, #12
+		blt	13f		/* */
+		sub	r2, r2, #12
+12:		mov	r3, r7, lsr #16
+		ldmia	r1!, {r4 - r7}
+		orr	r3, r3, r4, lsl #16
+		mov	r4, r4, lsr #16
+		orr	r4, r4, r5, lsl #16
+		mov	r5, r5, lsr #16
+		orr	r5, r5, r6, lsl #16
+		mov	r6, r6, lsr #16
+		orr	r6, r6, r7,LSL#16
+		stmia	r0!, {r3 - r6}
+		subs	r2, r2, #16
+		bge	12b
+		adds	r2, r2, #12
+		blt	14f
+13:		mov	r3, r7, lsr #16
+		ldr	r7, [r1], #4
+		orr	r3, r3, r7, lsl #16
+		str	r3, [r0], #4
+		subs	r2, r2, #4
+		bge	13b
+14:		sub	r1, r1, #2			@ r1 back to the next unread source byte
+		b	6b
+		@ source offset 3: same scheme with lsr #24 / lsl #8
+15:		cmp	r2, #12
+		blt	17f
+		sub	r2, r2, #12
+16:		mov	r3, r7, lsr #24
+		ldmia	r1!,{r4 - r7}
+		orr	r3, r3, r4, lsl #8
+		mov	r4, r4, lsr #24
+		orr	r4, r4, r5, lsl #8
+		mov	r5, r5, lsr #24
+		orr	r5, r5, r6, lsl #8
+		mov	r6, r6, lsr #24
+		orr	r6, r6, r7, lsl #8
+		stmia	r0!, {r3 - r6}
+		subs	r2, r2, #16
+		bge	16b
+		adds	r2, r2, #12
+		blt	18f
+17:		mov	r3, r7, lsr #24
+		ldr	r7, [r1], #4
+		orr	r3, r3, r7, lsl#8
+		str	r3, [r0], #4
+		subs	r2, r2, #4
+		bge	17b
+18:		sub	r1, r1, #1			@ r1 back to the next unread source byte
+		b	6b
+
+		@ from < to: start at the end of both buffers and copy downwards
+19:		add	r1, r1, r2			@ r1 = from + n
+		add	r0, r0, r2			@ r0 = to + n
+		subs	r2, r2, #4			@ r2 = n - 4
+		blt	24f
+		ands	ip, r0, #3
+		bne	25f				@ destination end not word aligned
+		ands	ip, r1, #3
+		bne	26f				@ source end not word aligned
+		@ both end pointers word aligned; r2 = bytes left - 4
+20:		subs	r2, r2, #8
+		blt	23f
+		subs	r2, r2, #0x14
+		blt	22f
+21:		ldmdb	r1!, {r3 - r9, ip}		@ 32 bytes per pass
+		stmdb	r0!, {r3 - r9, ip}
+		subs	r2, r2, #32
+		bge	21b
+22:		cmn	r2, #16
+		ldmgedb	r1!, {r3 - r6}
+		stmgedb	r0!, {r3 - r6}
+		subge	r2, r2, #16
+		adds	r2, r2, #20
+		ldmgedb	r1!, {r3 - r5}
+		stmgedb	r0!, {r3 - r5}
+		subge	r2, r2, #12
+23:		adds	r2, r2, #8
+		blt	24f
+		subs	r2, r2, #4
+		ldrlt	r3, [r1, #-4]!
+		strlt	r3, [r0, #-4]!
+		ldmgedb	r1!, {r3, r4}
+		stmgedb	r0!, {r3, r4}
+		subge	r2, r2, #4
+		@ byte tail: r2 + 4 = bytes left (0..3)
+24:		adds	r2, r2, #4
+		EXITEQ
+		cmp	r2, #2
+		ldrb	r3, [r1, #-1]!
+		strb	r3, [r0, #-1]!
+		ldrgeb	r3, [r1, #-1]!
+		strgeb	r3, [r0, #-1]!
+		ldrgtb	r3, [r1, #-1]!
+		strgtb	r3, [r0, #-1]!
+		EXIT
+		@ destination end unaligned: copy ip = r0 & 3 bytes to align it
+25:		cmp	ip, #2
+		ldrb	r3, [r1, #-1]!
+		strb	r3, [r0, #-1]!
+		ldrgeb	r3, [r1, #-1]!
+		strgeb	r3, [r0, #-1]!
+		ldrgtb	r3, [r1, #-1]!
+		strgtb	r3, [r0, #-1]!
+		subs	r2, r2, ip
+		blt	24b
+		ands	ip, r1, #3
+		beq	20b
+		@ source end not word aligned (ip = r1 & 3): load whole words and merge neighbours with shifts
+26:		bic	r1, r1, #3
+		ldr	r3, [r1], #0			@ r3 = word holding the last ip source bytes
+		cmp	ip, #2
+		blt	34f				@ ip = 1
+		beq	30f				@ ip = 2, otherwise ip = 3 falls through
+		cmp	r2, #12
+		blt	28f
+		sub	r2, r2, #12
+27:		mov	r7, r3, lsl #8			@ 16 bytes per pass
+		ldmdb	r1!, {r3, r4, r5, r6}
+		orr	r7, r7, r6, lsr #24
+		mov	r6, r6, lsl #8
+		orr	r6, r6, r5, lsr #24
+		mov	r5, r5, lsl #8
+		orr	r5, r5, r4, lsr #24
+		mov	r4, r4, lsl #8
+		orr	r4, r4, r3, lsr #24
+		stmdb	r0!, {r4, r5, r6, r7}
+		subs	r2, r2, #16
+		bge	27b
+		adds	r2, r2, #12
+		blt	29f
+28:		mov	ip, r3, lsl #8			@ one word per pass
+		ldr	r3, [r1, #-4]!
+		orr	ip, ip, r3, lsr #24
+		str	ip, [r0, #-4]!
+		subs	r2, r2, #4
+		bge	28b
+29:		add	r1, r1, #3			@ r1 back to just past the last uncopied source byte
+		b	24b
+		@ source end offset 2: same scheme with 16-bit shifts
+30:		cmp	r2, #12
+		blt	32f
+		sub	r2, r2, #12
+31:		mov	r7, r3, lsl #16
+		ldmdb	r1!, {r3, r4, r5, r6}
+		orr	r7, r7, r6, lsr #16
+		mov	r6, r6, lsl #16
+		orr	r6, r6, r5, lsr #16
+		mov	r5, r5, lsl #16
+		orr	r5, r5, r4, lsr #16
+		mov	r4, r4, lsl #16
+		orr	r4, r4, r3, lsr #16
+		stmdb	r0!, {r4, r5, r6, r7}
+		subs	r2, r2, #16
+		bge	31b
+		adds	r2, r2, #12
+		blt	33f
+32:		mov	ip, r3, lsl #16
+		ldr	r3, [r1, #-4]!
+		orr	ip, ip, r3, lsr #16
+		str	ip, [r0, #-4]!
+		subs	r2, r2, #4
+		bge	32b
+33:		add	r1, r1, #2			@ r1 back to just past the last uncopied source byte
+		b	24b
+		@ source end offset 1: same scheme with lsl #24 / lsr #8
+34:		cmp	r2, #12
+		blt	36f
+		sub	r2, r2, #12
+35:		mov	r7, r3, lsl #24
+		ldmdb	r1!, {r3, r4, r5, r6}
+		orr	r7, r7, r6, lsr #8
+		mov	r6, r6, lsl #24
+		orr	r6, r6, r5, lsr #8
+		mov	r5, r5, lsl #24
+		orr	r5, r5, r4, lsr #8
+		mov	r4, r4, lsl #24
+		orr	r4, r4, r3, lsr #8
+		stmdb	r0!, {r4, r5, r6, r7}
+		subs	r2, r2, #16
+		bge	35b
+		adds	r2, r2, #12
+		blt	37f
+36:		mov	ip, r3, lsl #24
+		ldr	r3, [r1, #-4]!
+		orr	ip, ip, r3, lsr #8
+		str	ip, [r0, #-4]!
+		subs	r2, r2, #4
+		bge	36b
+37:		add	r1, r1, #1			@ r1 back to just past the last uncopied source byte
+		b	24b
+
+		.align
+
+

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)