You can make those functions significantly faster (at least on PC) by rearranging the expressions to cut down on dependencies between instructions. This lets the CPU pipeline work more efficiently.

The downside is that you lose a little accuracy. You may be able to get some of that back by tweaking the constants, and/or the bracketing for the adds.

Here's what I ended up with:

// Approximates sin(x) for a float argument.
//
// The argument is reduced by k * PI, where k is x / PI truncated toward zero,
// and an odd polynomial in the reduced argument (terms up to x^15) is
// evaluated. The result is negated when k is odd.
//
// The powers of x are built from a few short multiplication chains and the
// terms are summed in pairs, so the individual products do not all depend on
// one another. The grouping of the additions is significant for the rounding
// of the result and must not be rearranged casually.
//
// PI is not defined here; it must be provided by the surrounding code.
// NOTE(review): the float -> int32_t conversion is only defined when x / PI
// fits in an int32_t; larger magnitudes, infinities and NaN are not handled.
float Sin(float x) {
    // Polynomial coefficients, named after the power of x they belong to.
    constexpr float kS1  = static_cast<float>(1.00000000000000000000e+00);
    constexpr float kS3  = static_cast<float>(-1.66666671633720397949e-01);
    constexpr float kS5  = static_cast<float>(8.33333376795053482056e-03);
    constexpr float kS7  = static_cast<float>(-1.98412497411482036114e-04);
    constexpr float kS9  = static_cast<float>(2.75565571428160183132e-06);
    constexpr float kS11 = static_cast<float>(-2.50368472620721149724e-08);
    constexpr float kS13 = static_cast<float>(1.58849267073435385100e-10);
    constexpr float kS15 = static_cast<float>(-6.58925550841432672300e-13);

    // Number of whole multiples of PI contained in x (truncated toward zero).
    const int32_t halfTurns = static_cast<int32_t>(x * (1.0f / PI));
    x = x - static_cast<float>(halfTurns) * PI;

    // Even powers of the reduced argument.
    const float p2  = x * x;
    const float p4  = p2 * p2;
    const float p6  = p2 * p4;
    const float p8  = p4 * p4;
    const float p10 = p6 * p4;
    const float p12 = p6 * p6;
    const float p14 = p6 * p8;

    const float value = x * (kS1 + (p2 * kS3)
                             + ((p4 * kS5) + (p6 * kS7))
                             + ((p8 * kS9) + (p10 * kS11))
                             + ((p12 * kS13) + (p14 * kS15)));

    // Shifting by an odd multiple of PI flips the sign of the sine.
    if (halfTurns & 1) {
        return -value;
    }
    return value;
}

// Approximates cos(x) for a float argument.
//
// Uses the same reduction as Sin: x is reduced by k * PI with k = x / PI
// truncated toward zero, an even polynomial in the reduced argument (terms up
// to x^14) is evaluated, and the result is negated when k is odd.
//
// PI is not defined here; it must be provided by the surrounding code.
// NOTE(review): the float -> int32_t conversion is only defined when x / PI
// fits in an int32_t; larger magnitudes, infinities and NaN are not handled.
float Cos(float x) {
    // Polynomial coefficients, named after the power of x they belong to.
    // The constant term (1) is applied separately below.
    constexpr float kC2  = static_cast<float>(-5.00000000000000000000e-01);
    constexpr float kC4  = static_cast<float>(4.16666641831398010254e-02);
    constexpr float kC6  = static_cast<float>(-1.38888671062886714935e-03);
    constexpr float kC8  = static_cast<float>(2.48006890615215525031e-05);
    constexpr float kC10 = static_cast<float>(-2.75369927749125054106e-07);
    constexpr float kC12 = static_cast<float>(2.06207229069832465029e-09);
    constexpr float kC14 = static_cast<float>(-9.77507137733812925262e-12);

    // Number of whole multiples of PI contained in x (truncated toward zero).
    const int32_t halfTurns = static_cast<int32_t>(x * (1.0f / PI));
    x = x - static_cast<float>(halfTurns) * PI;

    // Even powers of the reduced argument.
    const float p2  = x * x;
    const float p4  = p2 * p2;
    const float p6  = p2 * p4;
    const float p8  = p4 * p4;
    const float p10 = p6 * p4;
    const float p12 = p6 * p6;
    const float p14 = p6 * p8;

    // Everything except the constant term of the polynomial.
    const float tail = (p2 * kC2)
                       + ((p4 * kC4) + (p6 * kC6))
                       + ((p8 * kC8) + (p10 * kC10))
                       + ((p12 * kC12) + (p14 * kC14));

    // Shifting by an odd multiple of PI flips the sign of the cosine. The two
    // forms are kept distinct (rather than negating 1 + tail) so the sign of a
    // zero result is unchanged.
    if (halfTurns & 1) {
        return static_cast<float>(-1.00000000000000000000e+00) - tail;
    }
    return static_cast<float>(1.00000000000000000000e+00) + tail;
}

I tested this in a VS 2015 x64 release build. YMMV.