Windows-Server-2003/base/published/xsum_ia64

342 lines
7.6 KiB
Plaintext

#include "ksia64.h"
//++
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// Routine:
//
// ULONG
// tcpxsum(
// IN ULONG ChkSum,
// IN PUCHAR Buffer,
// IN ULONG BufferLength
// )
//
// Routine Description:
//
// This function computes the checksum of the specified buffer.
//
// Arguments:
//
// a0: supplies the initial checksum value, in 16-bit form,
// with the high word set to 0.
//
// a1: supplies a pointer to the buffer.
//
// a2: supplies the length of the buffer in bytes.
//
//
// Return Value:
//
// The computed checksum, in 32-bit two-partial-accumulators form,
// added to the initial checksum, is returned as the function value.
//
// Author:
//
// Thierry Fevrier (Hewlett-Packard) for Microsoft Corporation.
//
// Notes:
//
// !!WARNING!! - Thierry - 07/10/2000
// The following code has been carefully optimized.
// Please consider this before making any modifications... Thank you.
//
//--
LEAF_ENTRY(tcpxsum)
//
// Register usage, as read from the code below:
//   a0  - seed checksum; added in xfold after the buffer sum is folded
//   a1  - buffer address; replaced in xUA by the next 8-byte boundary
//   a2  - byte count; reduced in xUA by the bytes consumed there
//   t20 - running 64-bit sum of the buffer data
//   t17 - prefetch address (a1 rounded down to 8, post-incremented by 64
//         on every use)
//   pt9 - set when a1 is odd; makes xfold swap the two result bytes
//   v0  - return value
//
// In the general path, sums are kept with end-around carry: after each
// add, an unsigned compare of the result against one addend detects the
// wrap, and 1 is added back in.
//
// The nop.m/nop.i/nop.b/nop.f instructions do no work; presumably they
// pad bundles for scheduling -- leave them in place.
//
and t1 = -4, a1 // t1 = a1 rounded down to a 4-byte boundary
and t2 = -4, a1 // t2 = same value; load pointer for the 20-byte path
brp.dptk.imp xUA, UAbrlabel // prediction hint for the branch to xUA at UAbrlabel
and t17 = -8, a1 // mod 8 the address
cmp.gtu pt2, pt3 = 96, a2 // is size < 96? pt2 = yes, pt3 = no
;;
add t3 = 8, t2 // t3 = second load pointer, 8 bytes past t2
// t16 is not read anywhere else in this routine.
// NOTE(review): the load presumably only touches the first cache line -- confirm.
(pt3) ld8 t16 = [t17], 64 // load first data needed for loop
cmp.eq pt0, pt1 = 20, a2 // is length 20 bytes ? pt0 = yes, pt1 = no
nop.i 0
mov t4 = 128;;
nop.m 0
cmp.gtu pt2 = a2, t4;; //is a2 > 128?
(pt2) lfetch [t17], 64;; // if yes, you can prefetch 4
(pt2) lfetch [t17], 64 // do prefetches of data needed
nop.i 0;;
nop.m 0
nop.i 0
(pt1) br.cond.dptk.few x32start;; // length != 20: take the general path
//
// Length is exactly 20 bytes: load five 32-bit words from the
// 4-byte-rounded address, at offsets 0, 8, 4, 12 and 16.
//
ld4 t11 = [t2], 4 // word at offset 0
tbit.nz pt9 = a1, 0 // pt9 = low bit of a1 (odd start address)
nop.b 0
ld4 t12 = [t3], 4 // word at offset 8
cmp.ltu pt1 = t1, a1 // if not 4 byte aligned
(pt1) br.cond.dpnt.few x32start;; // ... x32start restarts the sum from zero
ld4 t13 = [t2], 4 // word at offset 4
ld4 t14 = [t3], 4 // word at offset 12
nop.i 0;;
ld4 t15 = [t3] // word at offset 16
add t20 = t11, t12
add t21 = t13, t14;;
add t20 = t20, t21;;
add t20 = t20, t15 // t20 = sum of the five words
nop.i 0;;
//
// xfold: fold the 64-bit sum in t20 down to 16 bits, swap the two bytes
// if pt9 is set, add the seed a0, fold to 16 bits again and return the
// result in v0.
//
xfold:
addl t10 = 0xffff, zero // fold 64 bit into 16 bits
dep t0 = -1, zero, 0, 32 // t0 = mask of the low 32 bits
nop.i 0;;
and t1 = t20, t0 // low 32 bits
extr.u t2 = t20, 32, 32;; // high 32 bits
add t20 = t1, t2;; // sum may carry into bit 32
and t1 = t20, t0
extr.u t2 = t20, 32, 32;;
add t20 = t1, t2;; // carry folded back in: t20 fits in 32 bits
and t2 = t20, t10 // low 16 bits
extr.u t1 = t20, 16, 16;; // bits 16..31
add t20 = t1, t2;; // sum may carry into bit 16
and t2 = t20, t10
extr.u t1 = t20, 16, 1;; // the single carry bit (bit 16)
add t20 = t1, t2;; // t20 now fits in 16 bits
(pt9) nop.m 0 // swap bytes if necessary
(pt9) extr.u t1 = t20, 8, 8 // t1 = high byte of the 16-bit sum
(pt9) nop.i 0;;
(pt9) nop.m 0
(pt9) dep t20 = t20, t1, 8, 8 // t20 = (low byte << 8) | high byte
(pt9) nop.i 0;;
add t20 = a0, t20 // add seed, fold again
nop.i 0
nop.i 0;;
extr.u t1 = t20, 32, 1 // bit 32 of the sum
extr.u t2 = t20, 0, 32;; // low 32 bits
add t20 = t1, t2;;
and t1 = t20, t10 // low 16 bits
extr.u t2 = t20, 16, 16;; // bits 16..31
add t20 = t1, t2;;
and t1 = t20, t10
extr.u t2 = t20, 16, 1;; // remaining carry bit
add t20 = t1, t2;;
add v0 = zero, t20 // return value
nop.i 0
br.ret.sptk.few b0;;
//
// x32start: general path. Entered when the length is not 20, or when the
// length is 20 but a1 is not 4-byte aligned.
//
x32start: // not 20 bytes
and t1 = -8, a1 // t1 = a1 rounded down to an 8-byte boundary
cmp.eq pt3 = 1, zero // compare is always false: pt3 written as 0
cmp.eq pt4 = 1, zero // likewise for pt4
add t10 = a1, a2 // t10 = address just past the end of the buffer
mov t20 = zero // clear the running sum
tbit.nz pt9 = a1, 0;; // pt9 = start address is odd
cmp.ltu pt1 = t1, a1 // pt1 = a1 is not 8-byte aligned
brp.sptk.imp x32startA, x32Abrlabel // prediction hint for the branch at x32Abrlabel
UAbrlabel:
(pt1) br.cond.dptk.few xUA;; // unaligned start: consume the partial first word
//
// x32startA: t1 is 8-byte aligned and a2 is the number of bytes that
// remain from t1 onward. Reached by fall-through or from xUA.
//
x32startA: // now it is 8 byte aligned
and t10 = -8, t10 // t10 = end address rounded down to 8 (limit for whole-word loads)
dep t9 = zero, a2, 0, 6 // make last 6 bits of count 0
// 6 bits => 64 = # bytes consumed
// in one iteration
adds t2 = 8, t1;; // t2 = second load pointer (t1 + 8)
cmp.gtu pt2 = 96, a2 // count < 96
add t5 = t1, t9 // t5 = t1 + (count rounded down to a multiple of 64)
(pt2) br.cond.dpnt.few xLT32;; // short buffer: skip the unrolled loop
ld8 t3 = [t1], 16 // initial load can be eliminated. It may no
// longer be valid if alignment occurred, it
// was there to provide order
mov t4 = 128;; // t4 is a temporary constant here; reloaded with data below
cmp.gtu pt2 = a2, t4;; // is a2 > 128? (t4 holds 128 here)
ld8 t4 = [t2], 16
(pt2) lfetch [t17], 64
mov t14 = zero;; // t11..t14 = four partial sums, cleared before the loop
(pt2) lfetch [t17], 64
mov t11 = zero
mov t13 = zero
ld8 t18 = [t1], 16
ld8 t19 = [t2], 16
mov t12 = zero;;
//
// Each pass adds eight 64-bit words (64 bytes) into the four partial sums
// t11..t14: first the four words loaded ahead of time (t3, t4, t18, t19),
// then the four loaded at the top of the pass (t6, t7, t8, t9). t1 and t2
// are interleaved load pointers, each stepping by 16.
//
x32loop: // t5 = address to stop fetching at
// t17 = next addr to prefetch
ld8 t6 = [t1], 16 // modified main loop; unrolled a little more
// and using prefetches
ld8 t7 = [t2], 16
add t11 = t11, t3
add t12 = t12, t4
add t13 = t13, t18
add t14 = t14, t19;;
ld8 t8 = [t1], 16
ld8 t9 = [t2], 16
cmp.ltu pt1 = t11, t3 // pt1..pt4 = carry out of the four adds above
cmp.ltu pt2 = t12, t4
cmp.ltu pt3 = t13, t18
cmp.ltu pt4 = t14, t19;;
cmp.ltu pt0 = t1, t5 // pt0 = another 64-byte pass is needed
cmp.ltu pt5 = t17, t5 // pt5 = prefetch address is still below the stop address
(pt1) adds t11 = 1, t11 // end-around carry
(pt2) adds t12 = 1, t12
(pt3) adds t13 = 1, t13
(pt4) adds t14 = 1, t14;;
(pt0) ld8 t3 = [t1], 16 // preload first word for the next pass
(pt5) lfetch [t17], 64
add t11 = t11, t6
add t12 = t12, t7
add t13 = t13, t8
add t14 = t14, t9;;
(pt0) ld8 t4 = [t2], 16 // preload for the next pass
(pt0) ld8 t18 = [t1], 16
cmp.ltu pt1 = t11, t6 // pt1..pt4 = carry out of the second set of adds
cmp.ltu pt2 = t12, t7
cmp.ltu pt3 = t13, t8
cmp.ltu pt4 = t14, t9;;
(pt0) ld8 t19 = [t2], 16
(pt1) adds t11 = 1, t11 // end-around carry
(pt2) adds t12 = 1, t12
(pt3) adds t13 = 1, t13
(pt4) adds t14 = 1, t14
(pt0) br.cond.dptk.many x32loop;; // merge parallel adds
//
// Combine the four partial sums into t20, applying end-around carry
// after every add.
//
add t21 = t11, t12;;
nop.m 0
cmp.ltu pt8 = t21, t11;;
(pt8) adds t21 = 1, t21;;
nop.m 0
add t20 = t20, t21;;
cmp.ltu pt1 = t20, t21;;
add t21 = t13, t14
(pt1) adds t20 = 1, t20;;
cmp.ltu pt2 = t21, t13
nop.i 0;;
(pt2) adds t21 = 1, t21;;
add t20 = t20, t21
nop.i 0;;
cmp.ltu pt1 = t20, t21;;
(pt1) adds t20 = 1, t20
nop.i 0
nop.i 0
//
// xLT32: add the remaining whole 8-byte words, one per pass, while
// t1 < t10. Reached by fall-through after the unrolled loop, or directly
// from x32startA when the count is below 96.
// NOTE(review): the "< 32" in the label and its comment does not match
// the 96-byte threshold tested at x32startA.
//
xLT32: // < 32
nop.m 0
cmp.ltu pt0, pt1 = t1, t10 // pt1 = no whole word left
(pt1) br.cond.dpnt.few xtail
ld8 t11 = [t1], 8;;
add t20 = t20, t11
nop.i 0;;
cmp.ltu pt0 = t20, t11;; // carry out of the add
(pt0) adds t20 = 1, t20 // end-around carry
nop.i 0;;
nop.m 0
nop.f 0
br.cond.sptk.many xLT32
//
// xtail: add the final (a2 & 7) bytes, if any, then go to xfold.
//
xtail: // < 8
and t5 = 7, a2;; // t5 = count modulo 8 (trailing bytes)
cmp.eq pt0 = zero, t5
nop.i 0
nop.m 0
nop.f 0
(pt0) br.cond.sptk.many xfold // no trailing bytes
ld8 t11 = [t1] // full 8-byte load of the word holding the trailing bytes
sub t6 = 8, t5 // t6 = number of bytes to discard
adds t7 = -1, zero;; // t7 = all ones
nop.m 0
shl t6 = t6, 3 // bytes -> bits
nop.b 0;;
nop.m 0
shr.u t7 = t7, t6;; // t7 = mask keeping the low-order t5 bytes
and t11 = t11, t7;; // drop the bytes past the end of the buffer
add t20 = t20, t11
nop.i 0;;
cmp.ltu pt0 = t20, t11;; // carry out of the add
(pt0) adds t20 = 1, t20 // end-around carry
nop.f 0
br.cond.sptk.many xfold
//
// xUA: a1 is not 8-byte aligned. Load the aligned word containing the
// first byte, mask off the low-order bytes in front of the buffer (and,
// when the whole buffer ends inside that word, the high-order bytes
// behind it), start t20 with the result, then continue at x32startA with
// a1/t1 on the next 8-byte boundary and a2 reduced accordingly.
// (Low-order bytes correspond to lower addresses assuming little-endian
// loads -- TODO confirm.)
//
xUA: // unaligned
and t5 = 7, a1 // t5 = misalignment in bytes
dep t1 = zero, a1, 0, 3 // t1 = a1 with the low 3 bits cleared
adds t6 = -1, zero;; // t6 = all ones
ld8 t11 = [t1], 8 // load the first word; t1 now points at the next word
sub t7 = 8, t5 ;; // t7 = bytes from a1 to the next 8-byte boundary
cmp.ltu pt0, pt1 = a2, t7;; // pt0 = buffer ends inside this word, pt1 = it does not
(pt0) sub t9 = t7, a2 // t9 = bytes in this word past the end of the buffer
shl t8 = t5, 3;; // misalignment in bits
(pt0) shl t12 = t9, 3;; // excess bytes in bits
nop.m 0
(pt0) shr.u t14 = t6, t12 // t14 = mask clearing the high-order t9 bytes
shl t13 = t6, t8;; // t13 = mask clearing the low-order t5 bytes
and t20 = t11, t13;; // running sum starts with the masked first word
(pt0) and t20 = t20, t14
(pt0) mov a2 = zero // whole buffer consumed
(pt1) sub a2 = a2, t7 // otherwise t7 bytes consumed
mov a1 = t1 // a1 = next 8-byte boundary
x32Abrlabel:
br.cond.sptk.many x32startA
LEAF_EXIT(tcpxsum)