Page 288 - ARM 64 Bit Assembly Language
P. 288
Non-integral mathematics 277
96 .word 0x94603063, 26 // -1/15! as an S(-40,32)
97 .word 0x654B1DC1, 32 // 1/17! as an S(-48,32)
98 .equ tablen,(.-sintab) // set tablen to size of table
99 // The '.' refers to the current address counter value.
100 // Subtracting the address of sintab from the current
101 // address gives the size of the table.
102
103 .text
104 //-------------------------------------------------------------
105 // sinq(x)
106 // input: x -> S(1,30) s.t. 0 <= x <= pi/2
107 // returns sin(x) -> S(3,28)
108 // x0 : Sum of terms
109 // x1 : pointer to table
110 // x2 : next coefficient
111 // x3 : next shift
112 // x4 : x^2
113 // x5 : x^(2n-1)
114 // x6 : next term
115 // x7 : pointer to end of table
116 sinq: smull x4,w0,w0 // x4 will hold x^2
117 // x^2 is now an S(3,60) in x4 (0<= x^2 <= 2.467)
118 mov x5,x0 // x5 will keep x^(2n-1). Start with x
119 // x5 now contains x as an S(1,30)
120 // The first term in the Taylor series is simply x, so
121 // convert x to an S(2,61) by shifting it left
122 lsl x0,x0,#31 // x0 holds the sum
123 ldr x1,=(sintab+8) // get pointer to beginning of table
124 // but skip first entry
125 mov w2,#0xAAAAAAAA // Since first coefficient is a pattern
126 mov x3,#0 // we can load it more quickly this way
127 asr x4,x4,#31 // convert x^2 to an S(2,29)
128 add x7,x1,#(tablen-8) // get pointer to end of table
129 b firstmul // skip the first load
130
131 // We know that we will always execute the loop 6 times,
132 // so we use a post-test loop.
133 sloop: ldpsw x2,x3,[x1],#8 // Load two values from the table
134 // x2 now has 1/(2n+1)! sign extended to 64 bits
135 // x3 contains the correcting shift sign extended to 64 bits
136 // The multiply will take time, so start it now
137 firstmul:
138 smull x5,w4,w5 // x5 <- x^(2n+1) as an S(4,59)
139 cmp x1, x7 // performance: do loop test early
140 asr x5,x5,#31 // convert x^(2n+1) to S(3,28)
141 smull x6,w5,w2 // multiply by value from the table
142 add x6,x6,x6,lsr #63// if the result is negative, then add one
143 asr x6,x6,x3 // apply shift to make an S(2,61)
144 add x0,x0,x6 // add to running total