Remember that /Zp16 must be set.
You don't need /Zp16 at all. The vector types are already set up with __declspec(align(16)) so they will get 16 byte aligned as local/global/static variables without any need for compiler flags. The only time you need to worry about alignment is on heap allocated data.
http://blogs.msdn.com/b/oldnewthing/archive/2007/12/27/6873648.aspx
XMMATRIX and the opaque vector types (XMVECTORF32, XMVECTORI32, XMVECTORU8, XMVECTORU32) are defined with __declspec(align(16)), but XMVECTOR itself is not:
//------------------------------------------------------------------------------
// Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte
// boundary and mapped to hardware vector registers
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
typedef __m128 XMVECTOR;
#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
typedef __n128 XMVECTOR;
#else
typedef __vector4 XMVECTOR;
#endif
EDIT: you're right, __m128 is itself 16-byte aligned:
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 {
float m128_f32[4];
unsigned __int64 m128_u64[2];
__int8 m128_i8[16];
__int16 m128_i16[8];
__int32 m128_i32[4];
__int64 m128_i64[2];
unsigned __int8 m128_u8[16];
unsigned __int16 m128_u16[8];
unsigned __int32 m128_u32[4];
} __m128;
where _CRT_ALIGN(16) is a CRT macro that expands to __declspec(align(16)):
#define _CRT_ALIGN(x) __declspec(align(x))
Nice to know that :p