Can you make the source available? I think it's some subtle trick that leads to this behaviour.

The code is as I described.

This is stripped down to a bare WinMain version, compiled with the same settings:

https://www.dropbox.com/s/ib4igh5qs85a156/test.zip

It does not crash, as it seems.

When I call it from within my program:

#include "fist.h"
#include <x86intrin.h>
/*
 * Scalar coefficients read by matrix_mul_float(): three rows (Right, Up, Dir)
 * of three components each. Every row is initialised to (1.1, 1.2, 1.3).
 */
float modelRight_x = 1.1, modelRight_y = 1.2, modelRight_z = 1.3;
float modelUp_x    = 1.1, modelUp_y    = 1.2, modelUp_z    = 1.3;
float modelDir_x   = 1.1, modelDir_y   = 1.2, modelDir_z   = 1.3;

/*
 * Output arrays, one per component, 100*1000 floats each.
 * 16-byte alignment is requested so that the aligned SSE loads/stores in
 * matrix_mul_sse() can address them four floats at a time.
 */
float normal_x[100 * 1000] __attribute__((aligned(16)));
float normal_y[100 * 1000] __attribute__((aligned(16)));
float normal_z[100 * 1000] __attribute__((aligned(16)));

/*
 * Input arrays, same size and alignment as the outputs; filled by
 * initialize_data_for_matrix_mul().
 */
float n_x[100 * 1000] __attribute__((aligned(16)));
float n_y[100 * 1000] __attribute__((aligned(16)));
float n_z[100 * 1000] __attribute__((aligned(16)));
/*
 * Fills the input arrays n_x, n_y and n_z with pseudo-random values.
 *
 * Only the first call does any work; a function-local static flag makes
 * every later call return without touching the arrays.
 *
 * Each element is (100 + rand() % 10000) / 1000, computed in double and
 * narrowed to float on assignment, so values lie in [0.1, 10.099].
 * rand() is not seeded here; the sequence depends on whatever seed is
 * in effect when this is first called.
 */
void initialize_data_for_matrix_mul()
{
    static int already_filled = 0;

    if (!already_filled)
    {
        already_filled = 1;

        int idx = 0;
        while (idx < 100*1000)
        {
            /* Keep the x, y, z draw order: it fixes which rand() value lands where. */
            n_x[idx] = (100. + rand() % 10000) / 1000.;
            n_y[idx] = (100. + rand() % 10000) / 1000.;
            n_z[idx] = (100. + rand() % 10000) / 1000.;
            idx++;
        }
    }
}
void matrix_mul_float()
{
for(int i=0; i<100*1000; i++)
{
normal_x[i] = n_x[i]*modelRight_x + n_y[i]*modelRight_y + n_z[i]*modelRight_z;
normal_y[i] = n_x[i]*modelUp_x + n_y[i]*modelUp_y + n_z[i]*modelUp_z;
normal_z[i] = n_x[i]*modelDir_x + n_y[i]*modelDir_y + n_z[i]*modelDir_z;
}
return;
}
/*
 * Four-wide copies of the scalar coefficients: each float4 repeats one value
 * in all four elements so matrix_mul_sse() can fetch it with a single aligned
 * 16-byte load.
 *
 * float4 is not declared in this file.
 * NOTE(review): presumably it comes from fist.h with the layout
 *   struct float4 { float x, y, z, w; };
 * -- confirm against the header.
 */
__attribute__ ((aligned (16))) float4 modelRight_4x = {1.1, 1.1, 1.1, 1.1 };
__attribute__ ((aligned (16))) float4 modelRight_4y = {1.2, 1.2, 1.2, 1.2 };
__attribute__ ((aligned (16))) float4 modelRight_4z = {1.3, 1.3, 1.3, 1.3 };
__attribute__ ((aligned (16))) float4 modelUp_4x = {1.1, 1.1, 1.1, 1.1 };
__attribute__ ((aligned (16))) float4 modelUp_4y = {1.2, 1.2, 1.2, 1.2 };
__attribute__ ((aligned (16))) float4 modelUp_4z = {1.3, 1.3, 1.3, 1.3 };
__attribute__ ((aligned (16))) float4 modelDir_4x = {1.1, 1.1, 1.1, 1.1 };
__attribute__ ((aligned (16))) float4 modelDir_4y = {1.2, 1.2, 1.2, 1.2 };
__attribute__ ((aligned (16))) float4 modelDir_4z = {1.3, 1.3, 1.3, 1.3 };
void matrix_mul_sse()
{
__m128 mRx = _mm_load_ps((const float*) &modelRight_4x);
__m128 mRy = _mm_load_ps((const float*) &modelRight_4y);
__m128 mRz = _mm_load_ps((const float*) &modelRight_4z);
__m128 mUx = _mm_load_ps((const float*) &modelUp_4x);
__m128 mUy = _mm_load_ps((const float*) &modelUp_4y);
__m128 mUz = _mm_load_ps((const float*) &modelUp_4z);
__m128 mDx = _mm_load_ps((const float*) &modelDir_4x);
__m128 mDy = _mm_load_ps((const float*) &modelDir_4y);
__m128 mDz = _mm_load_ps((const float*) &modelDir_4z);
for(int i=0; i<100*1000; i+=4)
{
__m128 nx = _mm_load_ps( &n_x[i]);
__m128 ny = _mm_load_ps( &n_y[i]);
__m128 nz = _mm_load_ps( &n_z[i]);
__m128 normalx = _mm_add_ps(_mm_add_ps(_mm_mul_ps(nx,mRx), _mm_mul_ps(ny,mRy)), _mm_mul_ps(nz,mRz));
__m128 normaly = _mm_add_ps(_mm_add_ps(_mm_mul_ps(nx,mUx), _mm_mul_ps(ny,mUy)), _mm_mul_ps(nz,mUz));
__m128 normalz = _mm_add_ps(_mm_add_ps(_mm_mul_ps(nx,mDx), _mm_mul_ps(ny,mDy)), _mm_mul_ps(nz,mDz));
_mm_store_ps( &normal_x[i], normalx);
_mm_store_ps( &normal_y[i], normaly);
// _mm_store_ps( &normal_z[i], normalz);
}
}
/*
 * Test driver: prepares the input data, runs the scalar and then the SSE
 * implementation, reporting progress through alert() before each step,
 * and finally terminates the process with exit status 0.
 *
 * This function does not return to its caller.
 * alert() is not declared in this file.
 * NOTE(review): presumably provided by fist.h -- confirm.
 */
void tests()
{
    alert("\nstart");
    initialize_data_for_matrix_mul();   /* does nothing after the first call */

    alert("\nmul float");
    matrix_mul_float();

    alert("\nmul sse");
    matrix_mul_sse();

    alert("\ndone");
    exit(0);
}

With the only change being renaming WinMain to tests and including the header of my framework (there should be nothing scary in it; I could comment it out) - this is linked as a separate .o and called - it crashes, as I said.

I commented out the header, so I am now calling the same code as the separate WinMain version that does not crash - the only change is renaming WinMain to tests and calling it from my application (the command-line scripts are the same, except that I am linking more objects in my application) - and it still crashes when called from my application. (I call it in the main loop; I could check what happens if I call it from the app setup.)

**Edited by fir, 30 June 2014 - 09:25 AM.**