// Normalizes this vector in place using the squared length of lanes 0-2 and
// returns a reference to it. When the squared length is not greater than
// 1e-5, every lane is set to zero instead of being scaled.
// The scale factor comes from _mm_rsqrt_ps, which is an approximate reciprocal
// square root, so the result is only approximately unit length.
inline const CVector3SSE& CVector3SSE::Normalize()
{
    // Threshold applied to the SQUARED length. A plain local avoids the
    // thread-safe initialization guard a function-local static would need.
    const __m128 almostZero = _mm_set1_ps(1e-5f);

    // Mask 0x7F: multiply lanes 0-2 only (high nibble 0x7) and broadcast the
    // sum to all four lanes (low nibble 0xF).
    const __m128 sqrLength = _mm_dp_ps(m_fValsSSE, m_fValsSSE, 0x7F);

    // All-ones lanes where the squared length exceeds the threshold.
    // (_mm_cmpgt_ps is the SSE greater-than compare; _mm_gt_ps does not exist.)
    const __m128 isLongEnough = _mm_cmpgt_ps(sqrLength, almostZero);

    // Approximate 1/sqrt(sqrLength) in every lane.
    const __m128 invLength = _mm_rsqrt_ps(sqrLength);

    // AND-ing with the mask turns the scale into 0 for near-zero vectors,
    // which also discards the infinite rsqrt result for a zero length.
    m_fValsSSE = _mm_mul_ps(m_fValsSSE, _mm_and_ps(invLength, isLongEnough));
    return *this;
}

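Although yours is the standard way to do the normalization, the dot product overflows for large components. If you need something that is robust for all finite floating-point inputs, first scale the vector by its maximum absolute component and then normalize the scaled vector, as in the following code: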

// Returns the largest absolute value among the four lanes of v, replicated
// into every lane of the result.
inline __m128 MaximumAbsoluteComponent (__m128 const v)
{
    // -0.0f has only the sign bit set (bit pattern 0x80000000). Passing the
    // integer 0x80000000u to _mm_set1_ps would instead convert it to the
    // float 2^31, whose bits are 0x4F000000, and the wrong bits would be
    // cleared below.
    __m128 const signMask = _mm_set1_ps(-0.0f);

    // Clear the sign bit of every lane: |v|.
    __m128 const vAbs = _mm_andnot_ps(signMask, v);

    // Broadcast each lane of |v| across a full register.
    __m128 const lane0 = _mm_shuffle_ps(vAbs, vAbs, _MM_SHUFFLE(0,0,0,0));
    __m128 const lane1 = _mm_shuffle_ps(vAbs, vAbs, _MM_SHUFFLE(1,1,1,1));
    __m128 const lane2 = _mm_shuffle_ps(vAbs, vAbs, _MM_SHUFFLE(2,2,2,2));
    __m128 const lane3 = _mm_shuffle_ps(vAbs, vAbs, _MM_SHUFFLE(3,3,3,3));

    // Pairwise reduction: max(max(l0, l1), max(l2, l3)).
    __m128 const max01 = _mm_max_ps(lane0, lane1);
    __m128 const max23 = _mm_max_ps(lane2, lane3);
    return _mm_max_ps(max01, max23);
}
// Normalizes v by first dividing by its maximum absolute component, so that
// squaring the components in the dot product cannot overflow for finite
// inputs. The length is computed from lanes 0-2 only (dot-product mask 0x7F),
// and all four lanes are divided by it.
// Returns all zeros when the maximum absolute component is zero or +infinity;
// the infinite case is considered an unexpected condition.
// NOTE(review): MaximumAbsoluteComponent also looks at lane 3, while the
// length uses lanes 0-2 only - confirm callers keep lane 3 at zero.
inline __m128 Normalize (__m128 const v)
{
    // Compute the maximum absolute value component (same value in all lanes).
    __m128 const maxComponent = MaximumAbsoluteComponent(v);

    // Masks: all-ones lanes when the maximum is nonzero, and when it is
    // additionally not +infinity (bit pattern 0x7F800000).
    __m128 const zero = _mm_setzero_ps();
    __m128 const infinity = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
    __m128 const nonZeroMask = _mm_cmpneq_ps(zero, maxComponent);
    __m128 const finiteNonZeroMask =
        _mm_and_ps(nonZeroMask, _mm_cmpneq_ps(infinity, maxComponent));

    // Divide by the maximum absolute component. This is potentially a divide
    // by zero, so set the result to zero when the original length is zero.
    __m128 normalized = _mm_div_ps(v, maxComponent);
    normalized = _mm_and_ps(nonZeroMask, normalized);

    // (sqrLength, sqrLength, sqrLength, sqrLength)
    __m128 const sqrLength = _mm_dp_ps(normalized, normalized, 0x7F);

    // (length, length, length, length)
    __m128 const length = _mm_sqrt_ps(sqrLength);

    // Divide by the length to normalize. This is potentially a divide by zero.
    normalized = _mm_div_ps(normalized, length);

    // Set to zero when the original length is zero or infinity. The previous
    // code reused the zero-only mask here, so an infinite component produced
    // NaN instead of the documented zero vector.
    normalized = _mm_and_ps(finiteNonZeroMask, normalized);
    return normalized;
}