Page 327 - ARM 64 Bit Assembly Language
P. 327

316 Chapter 9


                18        .word 0x13A86D09, 0x3DE61246    //  1.605904383682161e-10
                19        .word 0xE733B81F, 0xBD6AE7F3    // -7.647163731819816e-13
                20        .word 0x7030AD4A, 0x3CE952C7    //  2.811457254345521e-15
                21        .word 0x46814157, 0xBC62F49B    // -8.220635246624329e-18
                22        .equ    TERMS,((. - ctab)/8)
                23  // ---------------------------------------------------------------
                24        .text
                25        .align 2
                26  // double sin_a_d(double x)
                27  // sin_a_d implements the sine function using IEEE double precision
                28  // floating point.  It computes sine by summing the first ten terms of
                29  // the Taylor series.  In/out: d0.  Overwrites x0, x3, d1, d3, d4, flags.
                30        .global sin_a_d
                31  sin_a_d:// in: d0 = x; d0 also accumulates the sum and holds the result
                32        ldr     x0,=ctab       // x0 <- address of coefficient table ctab
                33        // initialize variables (the sum in d0 starts as x, the first term)
                34        fmul    d1,d0,d0       // d1 <- x^2, multiplier that steps to the next odd power
                35        fmov    d3,d0          // d3 <- x, running odd power of x
                36        mov     x3,#TERMS      // x3 <- TERMS, number of 8-byte entries in ctab
                37        // loop over table, one coefficient per iteration
                38  loop:  fmul   d3,d1,d3       // d3 <- x^(2n+1)
                39        ldr     d4,[x0],#8     // d4 <- next coefficient; post-increment x0 by 8
                40        subs    x3,x3,#1       // decrement loop counter and set flags for bne
                41        fmadd   d0,d3,d4,d0    // d0 <- d0 + d3*d4 (add next term)
                42        bne     loop           // repeat until x3 reaches zero (TERMS iterations)
                43        ret                    // return with the sum in d0


                  Listing 9.2 shows a double precision floating point implementation of the sine function,
                  using the ARM FP/NEON instruction set. Again, there is a table of constants, each of which
                  is the reciprocal (with alternating sign) of one of the factorial divisors in the Taylor
                  series for sine. The subroutine calculates the powers of x one-by-one, and multiplies each
                  power by the next constant in the table, summing the results as it goes. Note that the table
                  of constants is longer than in the fixed point version of the code, because there are more
                  bits of precision in a double precision floating point number than there are in the fixed
                  point representation that was used previously.


                  9.8.1 Performance comparison

                  Table 9.2 shows the performance of different implementations of the sine function, with
                  and without compiler optimization. The Single Precision C and Double Precision C
                  implementations are the standard implementations provided by GCC. The comparison tests were
                  performed by timing each of the four cases on the same set of 100,000,000 random numbers,
                  with and without compiler optimization.
   322   323   324   325   326   327   328   329   330   331   332