patch-2.3.43 linux/arch/ia64/lib/do_csum.S

diff -u --recursive --new-file v2.3.42/linux/arch/ia64/lib/do_csum.S linux/arch/ia64/lib/do_csum.S
@@ -0,0 +1,230 @@
+/*
+ *
+ * Optimized version of the standard do_csum() function
+ *
+ * Return: a 64-bit quantity containing the 16-bit Internet checksum
+ *
+ * Inputs:
+ *	in0: address of buffer to checksum (char *)
+ *	in1: length of the buffer (int)
+ * 
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ *
+ */
+
+//
+// Theory of operations:
+//	The goal is to get as quickly as possible to the point where
+//	we can checksum 8 bytes per loop iteration. Before reaching that
+//	point we must take care of a possibly misaligned first byte.
+//
+//	The code hereafter also takes care of the "tail" part of the buffer
+//	before entering the core loop, if any. Since the checksum is a plain
+//	sum, its operations commute, so we can do the "head" and "tail" first
+//	and finish at full speed in the body. Once we have the head and tail
+//	values, we feed them into the pipeline, which makes for a very handy
+//	initialization.
+//
+//	Of course we deal with the special case where the whole buffer fits
+//	into one 8 byte word. In this case we have only one entry in the pipeline.
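+//
+//	In rough C terms the decomposition looks like the sketch below
+//	(illustrative only, not a literal transcription of the assembly:
+//	carries are ignored here and accumulated separately in the real
+//	loop, and little-endian 8-byte loads are assumed):
+//
+//		unsigned long a = (unsigned long) buf;
+//		unsigned long *first = (unsigned long *)(a & ~7UL);
+//		unsigned long *last = (unsigned long *)((a + len - 1) & ~7UL);
+//		unsigned long hmask = -1UL << (8*(a & 7));
+//		unsigned long tmask = -1UL >> (8*((8 - ((a + len) & 7)) & 7));
+//		unsigned long sum;
+//
+//		if (first == last)	/* whole buffer in one word */
+//			sum = *first & hmask & tmask;
+//		else {
+//			sum = (*first & hmask) + (*last & tmask);
+//			while (++first < last)	/* core loop */
+//				sum += *first;	/* 8 bytes/iteration */
+//		}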
+//
+//	We use a (3+1)-stage pipeline in the loop to account for possible
+//	load latency and also to accommodate the head and tail values.
+//
+//	The end of the function deals with folding the checksum from 64 bits
+//	down to 16 bits, taking care of the carries.
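+//
+//	The fold is, in rough C terms (a sketch only):
+//
+//		sum = (sum & 0xffffffffUL) + (sum >> 32); /* 64 -> ~33 bits  */
+//		sum = (sum & 0xffff) + (sum >> 16);       /* fold high half, */
+//		sum = (sum & 0xffff) + (sum >> 16);       /* repeated to     */
+//		sum = (sum & 0xffff) + (sum >> 16);       /* absorb carries  */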
+//
+//	This version avoids synchronization in the core loop by also using a
+//	pipeline for the accumulation of the checksum in result[].
+//
+//	 p[]     
+//	|---|
+//     0|   | r32 : new value loaded in pipeline
+//	|---|
+//     1|   | r33 : in transit data
+//	|---|
+//     2|   | r34 : current value to add to checksum
+//	|---|
+//     3|   | r35 : previous value added to checksum (previous iteration)
+//	|---|
+//
+//	result[] 
+//	|---|
+//     0|   | r36 : new checksum
+//	|---|
+//     1|   | r37 : previous value of checksum
+//	|---|
+//     2|   | r38 : final checksum when out of the loop (after 2 epilogue rots)
+//	|---|
+//
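+//	In rough C terms the loop accumulates as sketched below; in the
+//	assembly the compare runs one pipeline stage behind the add, which
+//	is what removes the synchronization (p and words are assumed to be
+//	set up from buf/len):
+//
+//		unsigned long sum = 0, carry = 0, val;
+//		while (words--) {
+//			val = *p++;
+//			sum += val;
+//			carry += (sum < val);	/* unsigned wrap => carry */
+//		}
+//		sum += carry;
+//		if (sum < carry)	/* adding the carries may carry too */
+//			sum++;
+//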
+//
+// NOT YET DONE:
+//	- Take advantage of the MMI bandwidth to load more than 8 bytes per
+//	  loop iteration
+//	- Use the lfetch instruction to improve the chances of the data being
+//	  in the cache when we need it.
+//	- Maybe another algorithm which would handle the folding at the
+//	  end in a different manner
+//	- Work with people more knowledgeable than me on the network stack
+//	  to figure out whether we could split the function depending on the
+//	  type of packet or alignment we get, like the ip_fast_csum() routine,
+//	  where we know we have at least 20 bytes worth of data to checksum.
+//	- Look at RFCs about checksums to see whether or not we can do better
+//
+//	- Do a better job of handling small packets.
+//
+#define saved_pfs	r11
+#define hmask		r16
+#define tmask		r17
+#define first		r18
+#define firstval	r19
+#define firstoff	r20
+#define last		r21
+#define lastval		r22
+#define lastoff		r23
+#define saved_lc	r24
+#define saved_pr	r25
+#define tmp1		r26
+#define tmp2		r27
+#define tmp3		r28
+#define carry		r29
+
+#define buf		in0
+#define len		in1
+
+
+	.text
+	.psr abi64
+	.psr lsb
+	.lsb
+
+// unsigned long do_csum(unsigned char *buf, int len)
+
+	.align 32
+	.global do_csum
+	.proc do_csum
+do_csum:
+	alloc saved_pfs=ar.pfs,2,8,0,8
+
+	.rotr p[4], result[3]
+	mov ret0=r0		// in case we have zero length
+	cmp4.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
+	;;			// avoid WAW on CFM
+	mov tmp3=0x7		// a temporary mask/value
+	add tmp1=buf,len	// last byte's address
+(p6)	br.ret.spnt.few rp	// return if true (hope we can avoid that)
+
+	and firstoff=7,buf	// how many bytes off for first element
+	tbit.nz p10,p0=buf,0	// is buf an odd address ?
+	mov hmask=-1		// initialize head mask
+	;;
+
+	andcm first=buf,tmp3	// 8-byte-aligned (rounded-down) address of first element
+	mov tmask=-1		// initialize tail mask
+	adds tmp2=-1,tmp1	// last-1
+	;;
+	and lastoff=7,tmp1	// how many bytes off for last element
+	andcm last=tmp2,tmp3	// address of word containing last byte
+	mov saved_pr=pr		// preserve predicates (rotation)
+	;;
+	sub tmp3=last,first	// tmp3=distance from first to last
+	cmp.eq p8,p9=last,first	// everything fits in one word ?
+	sub tmp1=8,lastoff	// complement to lastoff
+
+	ld8 firstval=[first],8	// load, ahead of time, the "first" word
+	shl tmp2=firstoff,3	// number of bits
+	;;
+	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
+
+(p9)	ld8 lastval=[last]	// load, ahead of time, the "last" word, if needed
+(p8)	mov lastval=r0		// we don't need lastval if first==last
+	mov result[1]=r0	// initialize result
+	;;
+
+	shl tmp1=tmp1,3		// number of bits
+	shl hmask=hmask,tmp2 	// build head mask, mask off [0,firstoff[
+	;;
+	shr.u tmask=tmask,tmp1	// build tail mask, mask off [lastoff,8[
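+	// e.g., with illustrative values buf=0x...b, len=10: firstoff=3,
+	// lastoff=5, so hmask=0xffffffffff000000 (drops the 3 bytes below
+	// buf) and tmask=0x000000ffffffffff (keeps the 5 tail bytes)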
+	mov saved_lc=ar.lc	// save lc
+	;;
+(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
+(p9)	and p[1]=lastval,tmask	// mask last value as appropriate
+	shr.u tmp3=tmp3,3	// we do 8 bytes per loop
+	;;
+	cmp.lt p6,p7=2,tmp3	// tmp3 > 2 ?
+	and p[2]=firstval,hmask	// and mask it as appropriate
+	add tmp1=-2,tmp3	// -2 = -1 (br.ctop) -1 (last-first)
+	;;
+	// XXX Fixme: not very nice initialization here
+	//
+	// Setup loop control registers: 
+	//
+	// tmp3=0 (1 word)   : lc=0, ec=2, p16=F
+	// tmp3=1 (2 words)  : lc=0, ec=3, p16=F
+	// tmp3=2 (3 words)  : lc=0, ec=4, p16=T
+	// tmp3>2 (4 or more): lc=tmp3-2, ec=4, p16=T
+	//
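+	// (rationale: the head and tail words are already sitting in p[2]
+	// and p[1], so only tmp3-1 words remain to be loaded; p16 enables
+	// the loads, ec drains the adds that trail the loads by 2 stages)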
+	cmp.eq p8,p9=r0,tmp3	// tmp3 == 0 ?
+(p6)	mov ar.lc=tmp1
+(p7)	mov ar.lc=0
+	;;
+	cmp.lt p6,p7=1,tmp3	// tmp3 > 1 ?
+(p8)	mov ar.ec=2		// we need the extra rotation on result[]
+(p9)	mov ar.ec=3		// hard not to set it twice sometimes
+	;;
+	mov carry=r0			// initialize carry
+(p6)	mov ar.ec=4
+(p6)	mov pr.rot=0xffffffffffff0000	// p16=T, p18=T
+
+	cmp.ne p8,p0=r0,r0		// p8 is false
+	mov p[3]=r0			// make sure first compare fails
+(p7)	mov pr.rot=0xfffffffffffe0000	// p16=F, p18=T
+	;;
+1:
+(p16)	ld8 p[0]=[first],8		// load next
+(p8)	adds carry=1,carry		// add carry on prev_prev_value
+(p18)	add result[0]=result[1],p[2]	// new_res = prev_res + cur_val
+	cmp.ltu p8,p0=result[1],p[3]	// p8= prev_result < prev_val
+	br.ctop.dptk.few 1b		// loop until lc--==0
+	;;				// RAW on carry when loop exits
+(p8)	adds carry=1,carry;;		// correct for carry on prev_value
+	add result[2]=carry,result[2];;	// add carry to final result
+	cmp.ltu p6,p7=result[2], carry	// check for new carry
+	;;
+(p6)	adds result[2]=1,result[2]	// correct if required
+	movl tmp3=0xffffffff
+	;;
+	// XXX Fixme
+	//
+	// now fold 64 bits into 16 bits, taking care of the carries;
+	// this is not very good because it is highly sequential
+	//
+	and tmp1=result[2],tmp3
+	shr.u tmp2=result[2],32
+	;;
+	add result[2]=tmp1,tmp2
+	shr.u tmp3=tmp3,16
+	;;
+	and tmp1=result[2],tmp3
+	shr.u tmp2=result[2],16
+	;;
+	add result[2]=tmp1,tmp2
+	;;
+	and tmp1=result[2],tmp3
+	shr.u tmp2=result[2],16
+	;;
+	add result[2]=tmp1,tmp2
+	;;
+	and tmp1=result[2],tmp3
+	shr.u tmp2=result[2],16
+	;;
+	add ret0=tmp1,tmp2
+	mov pr=saved_pr,0xffffffffffff0000
+	;;
+	// if buf was odd then swap bytes 
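+	// (when buf is odd, each data byte lands in the opposite byte lane
+	// of its 16-bit group, so the final sum comes out byte-swapped; in
+	// rough C terms the fixup is: sum = ((sum & 0xff) << 8) | (sum >> 8);
+	// mux1 @rev reverses all 8 bytes, the shift brings the low 16 back)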
+	mov ar.pfs=saved_pfs		// restore ar.ec
+(p10)	mux1 ret0=ret0,@rev		// reverse word
+	;;
+	mov ar.lc=saved_lc
+(p10)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
+	br.ret.sptk.few rp
