patch-2.3.43 linux/arch/ia64/lib/do_csum.S
- Lines: 231
- Date: Sun Feb 6 18:42:40 2000
- Orig file: v2.3.42/linux/arch/ia64/lib/do_csum.S
- Orig date: Wed Dec 31 16:00:00 1969
diff -u --recursive --new-file v2.3.42/linux/arch/ia64/lib/do_csum.S linux/arch/ia64/lib/do_csum.S
@@ -0,0 +1,230 @@
+/*
+ *
+ * Optimized version of the standard do_csum() function
+ *
+ * Return: a 64bit quantity containing the 16bit Internet checksum
+ *
+ * Inputs:
+ * in0: address of buffer to checksum (char *)
+ * in1: length of the buffer (int)
+ *
+ * Copyright (C) 1999 Hewlett-Packard Co
+ * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ *
+ */
+
+//
+// Theory of operations:
+// The goal is to get as quickly as possible to the point where
+// we can checksum 8 bytes/loop. Before reaching that point we must
+// take care of any misalignment of the first byte.
+//
+// The code hereafter also takes care of the "tail" part of the buffer
+// before entering the core loop, if any. Because the checksum is a sum,
+// the operations commute, so we handle the "head" and "tail" first and
+// finish at full speed in the body. Once we have the head and tail
+// values, we feed them into the pipeline, a very handy initialization.
+//
+// Of course we deal with the special case where the whole buffer fits
+// into one 8 byte word. In this case we have only one entry in the pipeline.
+//
+// We use a (3+1)-stage pipeline in the loop to account for possible
+// load latency and also to accommodate the head and tail values.
+//
+// The end of the function deals with folding the checksum from 64 bits
+// down to 16 bits, taking care of the carry.
+//
+// This version avoids synchronization in the core loop by also using a
+// pipeline for the accumulation of the checksum in result[].
+//
+// p[]
+// |---|
+// 0| | r32 : new value loaded in pipeline
+// |---|
+// 1| | r33 : in transit data
+// |---|
+// 2| | r34 : current value to add to checksum
+// |---|
+// 3| | r35 : previous value added to checksum (previous iteration)
+// |---|
+//
+// result[]
+// |---|
+// 0| | r36 : new checksum
+// |---|
+// 1| | r37 : previous value of checksum
+// |---|
+// 2| | r38 : final checksum when out of the loop (after 2 epilogue rots)
+// |---|
+//
+//
+// NOT YET DONE:
+// - Take advantage of the MMI bandwidth to load more than 8 bytes per
+//   loop iteration
+// - Use the lfetch instruction to increase the chances of the data being
+//   in the cache when we need it.
+// - Maybe use another algorithm that would take care of the folding at
+//   the end in a different manner
+// - Work with people more knowledgeable than me on the network stack
+//   to figure out whether we could split the function depending on the
+//   type of packet or alignment we get, like the ip_fast_csum() routine
+//   where we know we have at least 20 bytes worth of data to checksum.
+// - Look at RFCs about checksums to see whether or not we can do better
+//
+// - Do a better job of handling small packets.
+//
+#define saved_pfs r11
+#define hmask r16
+#define tmask r17
+#define first r18
+#define firstval r19
+#define firstoff r20
+#define last r21
+#define lastval r22
+#define lastoff r23
+#define saved_lc r24
+#define saved_pr r25
+#define tmp1 r26
+#define tmp2 r27
+#define tmp3 r28
+#define carry r29
+
+#define buf in0
+#define len in1
+
+
+ .text
+ .psr abi64
+ .psr lsb
+ .lsb
+
+// unsigned long do_csum(unsigned char *buf, int len)
+
+ .align 32
+ .global do_csum
+ .proc do_csum
+do_csum:
+ alloc saved_pfs=ar.pfs,2,8,0,8
+
+ .rotr p[4], result[3]
+ mov ret0=r0 // in case we have zero length
+ cmp4.lt p0,p6=r0,len // check for zero length or negative (32bit len)
+ ;; // avoid WAW on CFM
+ mov tmp3=0x7 // a temporary mask/value
+ add tmp1=buf,len // last byte's address
+(p6) br.ret.spnt.few rp // return if true (hope we can avoid that)
+
+ and firstoff=7,buf // how many bytes off for first element
+ tbit.nz p10,p0=buf,0 // is buf an odd address ?
+ mov hmask=-1 // initialize head mask
+ ;;
+
+ andcm first=buf,tmp3 // 8byte aligned down address of first element
+ mov tmask=-1 // initialize tail mask
+ adds tmp2=-1,tmp1 // last-1
+ ;;
+ and lastoff=7,tmp1 // how many bytes off for last element
+ andcm last=tmp2,tmp3 // address of word containing last byte
+ mov saved_pr=pr // preserve predicates (rotation)
+ ;;
+ sub tmp3=last,first // tmp3=distance from first to last
+ cmp.eq p8,p9=last,first // everything fits in one word ?
+ sub tmp1=8,lastoff // complement to lastoff
+
+ ld8 firstval=[first],8 // load, ahead of time, the "first" word
+ shl tmp2=firstoff,3 // number of bits
+ ;;
+ and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
+
+(p9) ld8 lastval=[last] // load, ahead of time, the "last" word, if needed
+(p8) mov lastval=r0 // we don't need lastval if first==last
+ mov result[1]=r0 // initialize result
+ ;;
+
+ shl tmp1=tmp1,3 // number of bits
+ shl hmask=hmask,tmp2 // build head mask, mask off [0,firstoff[
+ ;;
+ shr.u tmask=tmask,tmp1 // build tail mask, mask off [lastoff,8[
+ mov saved_lc=ar.lc // save lc
+ ;;
+(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
+(p9) and p[1]=lastval,tmask // mask the last word as appropriate
+ shr.u tmp3=tmp3,3 // we do 8 bytes per loop
+ ;;
+ cmp.lt p6,p7=2,tmp3 // tmp3 > 2 ?
+ and p[2]=firstval,hmask // and mask it as appropriate
+ add tmp1=-2,tmp3 // -2 = -1 (br.ctop) -1 (last-first)
+ ;;
+ // XXX Fixme: not very nice initialization here
+ //
+ // Setup loop control registers:
+ //
+ // tmp3=0 (1 word) : lc=0, ec=2, p16=F
+ // tmp3=1 (2 words) : lc=0, ec=3, p16=F
+ // tmp3=2 (3 words) : lc=0, ec=4, p16=T
+ // tmp3>2 (4 or more): lc=tmp3-2, ec=4, p16=T
+ //
+ cmp.eq p8,p9=r0,tmp3 // tmp3 == 0 ?
+(p6) mov ar.lc=tmp1
+(p7) mov ar.lc=0
+ ;;
+ cmp.lt p6,p7=1,tmp3 // tmp3 > 1 ?
+(p8) mov ar.ec=2 // we need the extra rotation on result[]
+(p9) mov ar.ec=3 // hard not to set it twice sometimes
+ ;;
+ mov carry=r0 // initialize carry
+(p6) mov ar.ec=4
+(p6) mov pr.rot=0xffffffffffff0000 // p16=T, p18=T
+
+ cmp.ne p8,p0=r0,r0 // p8 is false
+ mov p[3]=r0 // make sure first compare fails
+(p7) mov pr.rot=0xfffffffffffe0000 // p16=F, p18=T
+ ;;
+1:
+(p16) ld8 p[0]=[first],8 // load next
+(p8) adds carry=1,carry // add carry on prev_prev_value
+(p18) add result[0]=result[1],p[2] // new_res = prev_res + cur_val
+ cmp.ltu p8,p0=result[1],p[3] // p8= prev_result < prev_val
+ br.ctop.dptk.few 1b // loop until lc--==0
+ ;; // RAW on carry when loop exits
+ (p8) adds carry=1,carry;; // correct for carry on prev_value
+ add result[2]=carry,result[2];; // add carry to final result
+ cmp.ltu p6,p7=result[2], carry // check for new carry
+ ;;
+(p6) adds result[2]=1,result[1] // correct if required
+ movl tmp3=0xffffffff
+ ;;
+ // XXX Fixme
+ //
+ // now fold the 64-bit sum into 16 bits, taking care of the carry
+ // that's not very good because it is highly sequential
+ //
+ and tmp1=result[2],tmp3
+ shr.u tmp2=result[2],32
+ ;;
+ add result[2]=tmp1,tmp2
+ shr.u tmp3=tmp3,16
+ ;;
+ and tmp1=result[2],tmp3
+ shr.u tmp2=result[2],16
+ ;;
+ add result[2]=tmp1,tmp2
+ ;;
+ and tmp1=result[2],tmp3
+ shr.u tmp2=result[2],16
+ ;;
+ add result[2]=tmp1,tmp2
+ ;;
+ and tmp1=result[2],tmp3
+ shr.u tmp2=result[2],16
+ ;;
+ add ret0=tmp1,tmp2
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ // if buf was odd then swap bytes
+ mov ar.pfs=saved_pfs // restore ar.ec
+(p10) mux1 ret0=ret0,@rev // reverse word
+ ;;
+ mov ar.lc=saved_lc
+(p10) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
+ br.ret.sptk.few rp
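
For reference, here is a rough C sketch of the head/tail masking and 8-bytes-per-iteration
accumulation that the theory-of-operations comment in the patch describes. It is not part of
the patch: the function and helper names (do_csum_sketch, add_with_carry) are made up, and it
assumes a little-endian machine where reading the whole aligned words containing the first and
last bytes of the buffer is safe.

#include <stdint.h>

/* One's-complement add: fold any carry out of bit 63 back into bit 0. */
static uint64_t add_with_carry(uint64_t sum, uint64_t v)
{
	sum += v;
	if (sum < v)		/* wrapped around: end-around carry */
		sum++;
	return sum;
}

/* 64-bit one's-complement sum of [buf, buf+len), not yet folded to 16 bits. */
uint64_t do_csum_sketch(const unsigned char *buf, int len)
{
	uintptr_t addr = (uintptr_t)buf;
	const uint64_t *first, *last, *p;
	unsigned int firstoff = addr & 7;	/* bytes to drop at the head */
	unsigned int lastoff;			/* bytes to keep in the tail word */
	uint64_t hmask, tmask, sum;

	if (len <= 0)
		return 0;

	first = (const uint64_t *)(addr & ~(uintptr_t)7);
	last  = (const uint64_t *)((addr + len - 1) & ~(uintptr_t)7);
	lastoff = (addr + len) & 7;

	hmask = ~(uint64_t)0 << (firstoff * 8);	/* zero the bytes before buf */
	tmask = lastoff ? ~(uint64_t)0 >> ((8 - lastoff) * 8) : ~(uint64_t)0;

	if (first == last)			/* whole buffer fits in one word */
		return *first & hmask & tmask;

	/* head and tail first, then the aligned body at full speed */
	sum = add_with_carry(*first & hmask, *last & tmask);
	for (p = first + 1; p < last; p++)
		sum = add_with_carry(sum, *p);
	return sum;
}

This is essentially the scheme the assembly implements; the difference is that the loop above
is serial, whereas the assembly keeps several loads and partial sums in flight using the
rotating registers p[] and result[].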
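
A matching sketch of the final fold (the "XXX Fixme" sequence at the end of the routine): the
64-bit sum is reduced to 32 bits and then to 16 bits in three rounds, each round re-adding the
bits shifted out so no carry is lost, and the result is byte-swapped if the buffer started on
an odd address. Again the name fold_csum_sketch is illustrative only.

#include <stdint.h>

/* Fold a 64-bit one's-complement sum down to the final 16-bit checksum.
 * Mirrors the shift/mask/add sequence at the end of do_csum(). */
static uint16_t fold_csum_sketch(uint64_t sum, int buf_was_odd)
{
	sum = (sum & 0xffffffffULL) + (sum >> 32);	/* 64 -> at most 33 bits */
	sum = (sum & 0xffff) + (sum >> 16);		/* 33 -> at most 18 bits */
	sum = (sum & 0xffff) + (sum >> 16);		/* 18 -> at most 17 bits */
	sum = (sum & 0xffff) + (sum >> 16);		/* 17 -> 16 bits */

	if (buf_was_odd)	/* bytes were summed shifted by one: swap them back */
		sum = ((sum & 0xff) << 8) | ((sum >> 8) & 0xff);

	return (uint16_t)sum;
}

In the assembly the byte swap is done with mux1 @rev (reverse the 8 bytes) followed by a
48-bit right shift, which leaves the two low bytes of the checksum in swapped order.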