diff --git a/main.cpp b/main.cpp
index 69a7ca9ac8dbf04cfd984b4a27ca9ca397bf3783..d1e015641cb6237944395873f7f3a4c78775f440 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,134 +1,11 @@
 
 #include <cstdio>
-#include <cmath>
-#include <cassert>
-#include <pthread.h>
-#include <utility>
 #include "miosix.h"
 
 using namespace std;
 using namespace miosix;
 
-volatile float f1=3.0f; //Volatile to prevent compiler optimization
-volatile float f2=2.0f; //from moving the computation out of the loop
-
-static float approxSqrt1()
-{
-    float result=f1;
-    for(int j=0;j<10;j++)
-    {
-        for(int i=0;i<1000000/10;i++) result=(result+f1/result)/2.0f;
-        delayMs(2); //To test code that first uses fp. then stops and restarts
-    }
-    return result;
-}
-
-static float approxSqrt2()
-{
-    float result=f2;
-    for(int i=0;i<1000000;i++) result=(result+f2/result)/2.0f;
-    return result;
-}
-
-void *thread(void*)
-{
-    for(;;)
-    {
-        volatile float value=approxSqrt1();
-        //assert(fabsf(value-sqrt(3.0f))<0.000001f);
-        printf("b: %12.10f\n",value);
-    }
-}
-
 int main()
 {
-    pthread_t t;
-    pthread_create(&t,0,thread,0);
-    for(;;)
-    {
-        volatile float value=approxSqrt2();
-        //assert(fabsf(value-sqrt(2.0f))<0.000001f);
-        printf("a: %12.10f\n",value);
-    }
-}
-
-//#define sarcazzo
-//
-//int exchange_and_add(volatile int* __mem, int __val)
-//  {
-//    int __result;
-//
-//    #ifdef sarcazzo
-//    int __ok;
-//    do {
-//      asm volatile("ldrex %0, [%1]"     : "=r"(__result) : "r"(__mem)             : "memory");
-//      int __tmp = __result + __val;
-//      asm volatile("strex %0, %1, [%2]" : "=r"(__ok)     : "r"(__tmp), "r"(__mem) : "memory");
-//    } while(__ok);
-//    #else
-//    __result = *__mem;
-//    *__mem += __val;
-//    #endif
-//
-//    return __result;
-//  }
-//
-//int atomic_add(volatile int* __mem, int __val)
-//{
-//	int result;
-//    #ifdef sarcazzo
-//    int __ok;
-//    do {
-//      int __tmp;
-//      asm volatile("ldrex %0, [%1]"     : "=r"(__tmp) : "r"(__mem)             : "memory");
-//      __tmp += __val;
-//      asm volatile("strex %0, %1, [%2]" : "=r"(__ok)  : "r"(__tmp), "r"(__mem) : "memory");
-//	  result++;
-//    } while(__ok);
-//    #else //sarcazzo
-//    int __tmp=*__mem;
-//	__tmp += __val;
-//	*__mem=__tmp;
-//	result=1;
-//    #endif //sarcazzo
-//	return result;
-//}
-//
-//int mazz;
-//int k;
-//int w;
-//
-//void *thread(void*)
-//{
-//	mazz=0;
-//	for(int i=0;i<1000;i++)
-//	{
-//		mazz=max(mazz,atomic_add(&k, 1));
-//		mazz=max(mazz,atomic_add(&k,-1));
-//		exchange_and_add(&w, 1);
-//		exchange_and_add(&w,-1);
-//	}
-//	return 0;
-//}
-//
-//int main()
-//{
-//	getchar();
-//	for(;;)
-//	{
-//		k=0;
-//		w=0;
-//		pthread_t t;
-//		pthread_create(&t,0,thread,0);
-//		int maz=0;
-//		for(int i=0;i<1000;i++)
-//		{
-//			maz=max(maz,atomic_add(&k, 1));
-//			maz=max(maz,atomic_add(&k,-1));
-//			exchange_and_add(&w, 1);
-//			exchange_and_add(&w,-1);
-//		}
-//		pthread_join(t,0);
-//		iprintf("Main: k=%d, w=%d, max1=%d max2=%d\n",k,w,maz,mazz);
-//	}
-//}
+    iprintf("Hello world, write your application here\n");
+}
\ No newline at end of file
diff --git a/miosix/arch/arm7_lpc2000/common/arch_settings.h b/miosix/arch/arm7_lpc2000/common/arch_settings.h
index 84d25cc1d1281466149bcdf7691b06005bc998cb..a5a8b94e5921568dfa7c4688c0552568a7ccb1b2 100644
--- a/miosix/arch/arm7_lpc2000/common/arch_settings.h
+++ b/miosix/arch/arm7_lpc2000/common/arch_settings.h
@@ -46,6 +46,9 @@ const unsigned int CTXSAVE_SIZE=17;
 /// MUST be divisible by 4.
 const unsigned int CTXSAVE_ON_STACK=0;
 
+/// \internal stack alignment (in bytes) for this specific architecture
+const unsigned int CTXSAVE_STACK_ALIGNMENT=4;
+
 /**
  * \}
  */
diff --git a/miosix/arch/cortexM3_stm32/common/arch_settings.h b/miosix/arch/cortexM3_stm32/common/arch_settings.h
index 47350c39dd6f3b40970c0926cddcd37318c8958e..3287b3b82a76bfb09bd2e04c5c537309ba890aba 100644
--- a/miosix/arch/cortexM3_stm32/common/arch_settings.h
+++ b/miosix/arch/cortexM3_stm32/common/arch_settings.h
@@ -47,6 +47,9 @@ const unsigned char CTXSAVE_SIZE=9;
 /// MUST be divisible by 4.
 const unsigned int CTXSAVE_ON_STACK=32;
 
+/// \internal stack alignment (in bytes) for this specific architecture
+const unsigned int CTXSAVE_STACK_ALIGNMENT=8;
+
 /**
  * \}
  */
diff --git a/miosix/arch/cortexM3_stm32f2/common/arch_settings.h b/miosix/arch/cortexM3_stm32f2/common/arch_settings.h
index 686e61a3c0dc043153b0d91426dc0b1fa8fcab54..73a851afb55f5efb87e0e8eb2102c248b7f4d7d6 100644
--- a/miosix/arch/cortexM3_stm32f2/common/arch_settings.h
+++ b/miosix/arch/cortexM3_stm32f2/common/arch_settings.h
@@ -47,6 +47,9 @@ const unsigned char CTXSAVE_SIZE=9;
 /// MUST be divisible by 4.
 const unsigned int CTXSAVE_ON_STACK=32;
 
+/// \internal stack alignment (in bytes) for this specific architecture
+const unsigned int CTXSAVE_STACK_ALIGNMENT=8;
+
 /**
  * \}
  */
diff --git a/miosix/arch/cortexM3_stm32l1/common/arch_settings.h b/miosix/arch/cortexM3_stm32l1/common/arch_settings.h
index 686e61a3c0dc043153b0d91426dc0b1fa8fcab54..73a851afb55f5efb87e0e8eb2102c248b7f4d7d6 100644
--- a/miosix/arch/cortexM3_stm32l1/common/arch_settings.h
+++ b/miosix/arch/cortexM3_stm32l1/common/arch_settings.h
@@ -47,6 +47,9 @@ const unsigned char CTXSAVE_SIZE=9;
 /// MUST be divisible by 4.
 const unsigned int CTXSAVE_ON_STACK=32;
 
+/// \internal stack alignment (in bytes) for this specific architecture
+const unsigned int CTXSAVE_STACK_ALIGNMENT=8;
+
 /**
  * \}
  */
diff --git a/miosix/arch/cortexM4_stm32f4/common/arch_settings.h b/miosix/arch/cortexM4_stm32f4/common/arch_settings.h
index 8d74c1e436f7c2403c3822c5030e09ae4d99fd88..ef8238cbc1bdc3e82a912a6748de34eadd15d389 100644
--- a/miosix/arch/cortexM4_stm32f4/common/arch_settings.h
+++ b/miosix/arch/cortexM4_stm32f4/common/arch_settings.h
@@ -35,22 +35,25 @@ namespace miosix {
  * \{
  */
 
-/// \internal Size of vector to store registers during ctx switch
-/// ((9+16+1)*4=104Bytes). Only sp, r4-r11 and s16-s31 are saved here, since
-/// r0-r3,r12,lr,pc,xPSR, old sp and s0-s15,fpscr are saved by hardware on the
-/// process stack on Cortex M4 CPUs. The +1 is to save the exception lr, that
-/// is, EXC_RETURN, as it is necessary to know if the thread has used fp regs
-const unsigned char CTXSAVE_SIZE=9+16+1;
+/// \internal size of vector to store registers during ctx switch
+/// ((10+16)*4=104Bytes). Only sp, r4-r11, EXC_RETURN and s16-s31 are saved
+/// here, since r0-r3,r12,lr,pc,xPSR, old sp and s0-s15,fpscr are saved by
+/// hardware on the process stack on Cortex M4F CPUs. EXC_RETURN (the lr value
+/// used to return from the exception) is saved too, because its bit #4 tells
+/// whether the thread has used the fp regs, which is specific to Cortex-M4F.
+const unsigned char CTXSAVE_SIZE=10+16;
 
 /// \internal some architectures save part of the context on their stack.
-/// This constant is used to increase the stack size by the size of context
-/// save frame. If zero, this architecture does not save anything on stack
-/// during context save. Size is in bytes, not words.
+/// This constant ((8+17)*4=100Bytes) is used to increase the stack size by
+/// the size of context save frame. If zero, this architecture does not save
+/// anything on stack during context save. Size is in bytes, not words.
 ///  8 registers=r0-r3,r12,lr,pc,xPSR
 /// 17 registers=s0-s15,fpscr
 /// MUST be divisible by 4.
-// FIXME: +1 because of alignment of the cortex m3!!
-const unsigned int CTXSAVE_ON_STACK=(8+17+1)*4;
+const unsigned int CTXSAVE_ON_STACK=(8+17)*4;
+
+/// \internal stack alignment (in bytes) for this specific architecture
+const unsigned int CTXSAVE_STACK_ALIGNMENT=8;
 
 /**
  * \}
diff --git a/miosix/arch/cortexM4_stm32f4/common/interfaces-impl/portability.cpp b/miosix/arch/cortexM4_stm32f4/common/interfaces-impl/portability.cpp
index 015a34c0feb293a0533a2c2df41ccd95dc1b01bc..9700002290dfd1129ca3bc0be80baa28527084e2 100644
--- a/miosix/arch/cortexM4_stm32f4/common/interfaces-impl/portability.cpp
+++ b/miosix/arch/cortexM4_stm32f4/common/interfaces-impl/portability.cpp
@@ -166,8 +166,8 @@ void initCtxsave(unsigned int *ctxsave, void *(*pc)(void *), unsigned int *sp,
 
     ctxsave[0]=reinterpret_cast<unsigned long>(stackPtr);             //--> psp
     //leaving the content of r4-r11 uninitialized
+    ctxsave[9]=0xfffffffd; //EXC_RETURN=thread mode, use psp, no floating ops
     //leaving the content of s16-s31 uninitialized
-    ctxsave[25]=0xfffffffd; //EXC_RETURN=thread mode, use psp, no floating ops
 }
 
 void IRQportableStartKernel()
diff --git a/miosix/arch/cortexM4_stm32f4/common/interfaces-impl/portability_impl.h b/miosix/arch/cortexM4_stm32f4/common/interfaces-impl/portability_impl.h
index 69b042116cb7dbb035ad1eec801dd9636d746e2d..e37f87b00c503f1c5d243049a922fd4ed15889c6 100644
--- a/miosix/arch/cortexM4_stm32f4/common/interfaces-impl/portability_impl.h
+++ b/miosix/arch/cortexM4_stm32f4/common/interfaces-impl/portability_impl.h
@@ -42,18 +42,19 @@
  * this is a pointer to a location where to store the thread's registers during
  * context switch. It requires C linkage to be used inside asm statement.
  * Registers are saved in the following order:
- * *ctxsave+96 --> s31
+ * *ctxsave+100 --> s31
  * ...
- * *ctxsave+36 --> s16
- * *ctxsave+32 --> r11
- * *ctxsave+28 --> r10
- * *ctxsave+24 --> r9
- * *ctxsave+20 --> r8
- * *ctxsave+16 --> r7
- * *ctxsave+12 --> r6
- * *ctxsave+8  --> r5
- * *ctxsave+4  --> r4
- * *ctxsave+0  --> psp
+ * *ctxsave+40  --> s16
+ * *ctxsave+36  --> lr (contains EXC_RETURN whose bit #4 tells if fpu is used)
+ * *ctxsave+32  --> r11
+ * *ctxsave+28  --> r10
+ * *ctxsave+24  --> r9
+ * *ctxsave+20  --> r8
+ * *ctxsave+16  --> r7
+ * *ctxsave+12  --> r6
+ * *ctxsave+8   --> r5
+ * *ctxsave+4   --> r4
+ * *ctxsave+0   --> psp
  */
 extern "C" {
 extern volatile unsigned int *ctxsave;
@@ -68,12 +69,14 @@ extern volatile unsigned int *ctxsave;
  */
 #define saveContext()                                                         \
 {                                                                              \
-    asm volatile("mrs     r1,  psp         \n\t" /*get PROCESS stack pointer*/ \
-                 "ldr     r0,  =ctxsave    \n\t" /*get current context*/       \
-                 "ldr     r0, [r0]         \n\t"                               \
-                 "stmia   r0!, {r1,r4-r11} \n\t" /*save PROCESS sp + r4-r11*/  \
-                 "vstmia.32 r0!, {s16-s31} \n\t" /*save s16-s31*/              \
-                 "str     lr, [r0]         \n\t"                               \
+    asm volatile("   mrs    r1,  psp            \n"/*get PROCESS stack ptr  */ \
+                 "   ldr    r0,  =ctxsave       \n"/*get current context    */ \
+                 "   ldr    r0,  [r0]           \n"/*r0=address of save area*/ \
+                 "   stmia  r0!, {r1,r4-r11,lr} \n"/*save r1(psp),r4-r11,lr */ \
+                 "   lsls   r2,  lr,  #27       \n"/*check if bit #4 is set */ \
+                 "   bmi    0f                  \n"/*set=no fp regs to save */ \
+                 "   vstmia.32 r0, {s16-s31}    \n"/*save s16-s31 if we need*/ \
+                 "0:                            \n"                            \
                  );                                                            \
 }
 
@@ -85,12 +88,14 @@ extern volatile unsigned int *ctxsave;
  */
 #define restoreContext()                                                      \
 {                                                                              \
-    asm volatile("ldr     r0,  =ctxsave    \n\t" /*get current context*/       \
-                 "ldr     r0,  [r0]        \n\t"                               \
-                 "ldmia   r0!, {r1,r4-r11} \n\t" /*restore r4-r11 + r1=psp*/   \
-                 "vldmia.32 r0!, {s16-s31} \n\t" /*restore s16-s31*/           \
-                 "msr     psp, r1          \n\t" /*restore PROCESS sp*/        \
-                 "ldmia   r0, {pc}         \n\t" /*return*/                    \
+    asm volatile("   ldr    r0,  =ctxsave       \n"/*get current context    */ \
+                 "   ldr    r0,  [r0]           \n"/*r0=address of save area*/ \
+                 "   ldmia  r0!, {r1,r4-r11,lr} \n"/*load r1(psp),r4-r11,lr */ \
+                 "   lsls   r2,  lr,  #27       \n"/*check if bit #4 is set */ \
+                 "   bmi    0f                  \n"/*set=no fp regs to load */ \
+                 "   vldmia.32 r0, {s16-s31}    \n"/*restore s16-s31 if need*/ \
+                 "0: msr    psp, r1             \n"/*restore PROCESS sp*/      \
+                 "   bx     lr                  \n"/*return*/                  \
                  );                                                            \
 }
 
diff --git a/miosix/config/Makefile.inc b/miosix/config/Makefile.inc
index 70ebc0283d6be97abfda8513ad354cad683016b3..7dddd5304c2cc639f9d57e6bee50665ed4a13e40 100644
--- a/miosix/config/Makefile.inc
+++ b/miosix/config/Makefile.inc
@@ -13,12 +13,12 @@
 ## architecture
 ##
 #OPT_BOARD := lpc2138_miosix_board
-#OPT_BOARD := stm32f103ze_stm3210e-eval
+OPT_BOARD := stm32f103ze_stm3210e-eval
 #OPT_BOARD := stm32f103ve_mp3v2
 #OPT_BOARD := stm32f100rb_stm32vldiscovery
 #OPT_BOARD := stm32f103ve_strive_mini
 #OPT_BOARD := stm32f103ze_redbull_v2
-OPT_BOARD := stm32f407vg_stm32f4discovery
+#OPT_BOARD := stm32f407vg_stm32f4discovery
 #OPT_BOARD := stm32f207ig_stm3220g-eval
 #OPT_BOARD := stm32f207zg_ethboard_v2
 #OPT_BOARD := stm32f207ze_als_camboard
@@ -31,8 +31,8 @@ OPT_BOARD := stm32f407vg_stm32f4discovery
 ## -O2 is recomended otherwise, as it provides a good balance between code
 ## size and speed
 ##
-OPT_OPTIMIZATION := -O0
-#OPT_OPTIMIZATION := -O2
+#OPT_OPTIMIZATION := -O0
+OPT_OPTIMIZATION := -O2
 #OPT_OPTIMIZATION := -O3
 #OPT_OPTIMIZATION := -Os
 
diff --git a/miosix/config/miosix_settings.h b/miosix/config/miosix_settings.h
index 82760f0e21325adaec8722dd9cb1c954c512d052..879e4424ba23ff24ee989c90f4a4da29d74a0f90 100644
--- a/miosix/config/miosix_settings.h
+++ b/miosix/config/miosix_settings.h
@@ -69,7 +69,7 @@ namespace miosix {
 /// \def WITH_FILESYSTEM
 /// Allows to enable/disable filesystem support.
 /// By default it is defined (filesystem support is enabled)
-//#define WITH_FILESYSTEM
+#define WITH_FILESYSTEM
     
 /// \def SYNC_AFTER_WRITE
 /// Increases filesystem write robustness. After each write operation the
@@ -119,7 +119,7 @@ const unsigned char MAX_OPEN_FILES=8;
  * mode, so to use debugging it is necessary to disble sleep in the idle thread.
  * By default it is not defined (idle thread calls sleep).
  */
-#define JTAG_DISABLE_SLEEP
+//#define JTAG_DISABLE_SLEEP
 
 /// Minimum stack size (MUST be divisible by 4)
 const unsigned int STACK_MIN=256;
diff --git a/miosix/doc/textdoc/Changelog.txt b/miosix/doc/textdoc/Changelog.txt
index 9b799481429029392acf30b9652129efed9e5485..0d96f3a873f7b7b0a6f3206783f6e7b6a06b6552 100644
--- a/miosix/doc/textdoc/Changelog.txt
+++ b/miosix/doc/textdoc/Changelog.txt
@@ -1,5 +1,12 @@
 Changelog for Miosix np embedded OS
 
+- Added test to testsuite to verify the implementation of atomic operations
+  as part of the GCC patches.
+- Added stack alignment requirement to all arch ports, as different
+  architectures have different requirements.
+- Modified context switch code for the Cortex-M4 architecture to preserve
+  floating point registers. The code uses lazy stacking to save fpu registers
+  only for threads that actually use the fpu. Added test to testsuite for this.
 - Fixed the CMSIS of all Cortex ports so as to prevent gcc from using the same
   register as the first and second argument of strex, strexh, strexb
 - Modified sync.h so as to make mandatory the new gcc 4.7.2 compiler. Not
diff --git a/miosix/kernel/kernel.cpp b/miosix/kernel/kernel.cpp
index 49f3c26555b9a7dd8a872d113e86e4514847d2fc..35c3284cb7b8771c50db08e53519856d983d18f6 100644
--- a/miosix/kernel/kernel.cpp
+++ b/miosix/kernel/kernel.cpp
@@ -166,22 +166,28 @@ void startKernel()
     //
     //Create the idle thread
     //
+    unsigned int fullStackSize=WATERMARK_LEN+CTXSAVE_ON_STACK+STACK_IDLE;
+    
+    //Align fullStackSize to the platform required stack alignment
+    fullStackSize+=CTXSAVE_STACK_ALIGNMENT-1;
+    fullStackSize/=CTXSAVE_STACK_ALIGNMENT;
+    fullStackSize*=CTXSAVE_STACK_ALIGNMENT;
+    
     unsigned int *base=static_cast<unsigned int*>(malloc(sizeof(Thread)+
-            STACK_IDLE+CTXSAVE_ON_STACK+WATERMARK_LEN));
+            fullStackSize));
     if(base==NULL)
     {
         errorHandler(OUT_OF_MEMORY);
         return;//Error
     }
     //At the top of thread memory allocate the Thread class with placement new
-    void *threadClass=base+((STACK_IDLE+CTXSAVE_ON_STACK+WATERMARK_LEN)/
-            sizeof(unsigned int));
+    void *threadClass=base+(fullStackSize/sizeof(unsigned int));
     Thread *idle=new (threadClass) Thread(base,STACK_IDLE);
 
     //Fill watermark and stack
     memset(base, WATERMARK_FILL, WATERMARK_LEN);
     base+=WATERMARK_LEN/sizeof(unsigned int);
-    memset(base, STACK_FILL, STACK_IDLE+CTXSAVE_ON_STACK);
+    memset(base, STACK_FILL, fullStackSize-WATERMARK_LEN);
 
     //On some architectures some registers are saved on the stack, therefore
     //initCtxsave *must* be called after filling the stack.
@@ -302,28 +308,30 @@ Thread *Thread::create(void *(*startfunc)(void *), unsigned int stacksize,
         errorHandler(INVALID_PARAMETERS);
         return NULL;
     }
-    //If stacksize is not divisible by 4, round it to a number divisible by 4
-    stacksize &= ~0x3;
+    
+    unsigned int fullStackSize=WATERMARK_LEN+CTXSAVE_ON_STACK+stacksize;
+    
+    //Align fullStackSize to the platform required stack alignment
+    fullStackSize+=CTXSAVE_STACK_ALIGNMENT-1;
+    fullStackSize/=CTXSAVE_STACK_ALIGNMENT;
+    fullStackSize*=CTXSAVE_STACK_ALIGNMENT;
+    
     //Allocate memory for the thread, return if fail
     unsigned int *base=static_cast<unsigned int*>(malloc(sizeof(Thread)+
-            stacksize+WATERMARK_LEN+CTXSAVE_ON_STACK));
+            fullStackSize));
     if(base==NULL)
     {
         errorHandler(OUT_OF_MEMORY);
         return NULL;//Error
     }
     //At the top of thread memory allocate the Thread class with placement new
-    void *threadClass=base+((stacksize+WATERMARK_LEN+CTXSAVE_ON_STACK)/
-            sizeof(unsigned int));
+    void *threadClass=base+(fullStackSize/sizeof(unsigned int));
     Thread *thread=new (threadClass) Thread(base,stacksize);
 
     //Fill watermark and stack
     memset(base, WATERMARK_FILL, WATERMARK_LEN);
     base+=WATERMARK_LEN/sizeof(unsigned int);
-    //Note: cortex-M4 has two layouts for ctxsave-on-stack, depending on
-    //whether fp regs are used, and they differ in size, so fill the entire
-    //stack or memory profiling may fail
-    memset(base, STACK_FILL, stacksize+CTXSAVE_ON_STACK);
+    memset(base, STACK_FILL, fullStackSize-WATERMARK_LEN);
 
     //On some architectures some registers are saved on the stack, therefore
     //initCtxsave *must* be called after filling the stack.
diff --git a/miosix/kernel/sync.h b/miosix/kernel/sync.h
index f2e5a248480c1191e21f7bc04f11b4e63ab8675b..0d487681cdc5161cc411144db12e1397f5c962cd 100644
--- a/miosix/kernel/sync.h
+++ b/miosix/kernel/sync.h
@@ -43,8 +43,8 @@
  * Now, since the new patches make the #define PTHREAD_MUTEX_RECURSIVE
  * available we can use it to check which compiler is being used.
  */
-#warning "You upgraded to gcc 4.5.2 and mandatory newlib patches, did you?"
-#warning "If not, see http://www.webalice.it/fede.tft/miosix/gcc-4.5.2.html"
+#warning "You upgraded to gcc 4.7.2 and mandatory newlib patches, did you?"
+#warning "If not, see http://www.webalice.it/fede.tft/miosix/gcc-4.7.2.html"
 #endif
 #ifndef _MIOSIX
 /*
diff --git a/miosix/testsuite/testsuite.cpp b/miosix/testsuite/testsuite.cpp
index c5f1ddf6c8253e19dc6dacb528f6222535e315cc..991af0856fa839f0b0e619592c1487571e073cb1 100644
--- a/miosix/testsuite/testsuite.cpp
+++ b/miosix/testsuite/testsuite.cpp
@@ -33,6 +33,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <math.h>
 #include <stdexcept>
 #include <algorithm>
 #include <vector>
@@ -40,6 +41,7 @@
 #include <pthread.h>
 #include <errno.h>
 #include <tr1/functional>
+#include <ext/atomicity.h>
 
 #include "miosix.h"
 #include "miosix/kernel/buffer_queue.h"
@@ -85,6 +87,8 @@ static void test_17();
 static void test_18();
 static void test_19();
 static void test_20();
+static void test_21();
+static void test_22();
 //Filesystem test functions
 #ifdef WITH_FILESYSTEM
 static void fs_test_1();
@@ -144,6 +148,8 @@ int main()
                 test_18();
                 test_19();
                 test_20();
+                test_21();
+                test_22();
                 
                 ledOff();
                 Thread::sleep(500);//Ensure all threads are deleted.
@@ -2751,6 +2757,118 @@ static void test_20()
     pass();
 }
 
+//
+// Test 21
+//
+/*
+tests:
+floating point access from multiple threads (mostly of interest for
+architectures with hardware floating point whose state has to be preserved
+among context switches)
+*/
+
+/**
+ * Approximates the square root of 3 by iterating x=(x+3/x)/2 10000 times
+ * \return the approximated square root
+ */
+static float t21_f1()
+{
+    static volatile float f1=3.0f; //Volatile to prevent compiler optimizations
+    float result=f1;
+    for(int i=0;i<10000;i++) result=(result+f1/result)/2.0f;
+    return result;
+}
+
+/**
+ * Approximates the square root of 2 by iterating x=(x+2/x)/2 10000 times
+ * \return the approximated square root
+ */
+static float t21_f2()
+{
+    static volatile float f2=2.0f; //Volatile to prevent compiler optimizations
+    float result=f2;
+    for(int i=0;i<10000;i++) result=(result+f2/result)/2.0f;
+    return result;
+}
+
+/**
+ * Body of the thread spawned by test_21(), checks five times that t21_f1()
+ * computes the square root of 3 within tolerance
+ * \return always 0
+ */
+void *t21_t1(void*)
+{
+    for(int i=0;i<5;i++)
+    {
+        volatile float value=t21_f1();
+        if(fabsf(value-sqrtf(3.0f))>0.00001f) fail("thread1");
+    }
+    return 0; //Flowing off the end of a non-void function is undefined
+}
+
+static void test_21()
+{
+    test_name("Floating point");
+    pthread_t t;
+    if(pthread_create(&t,0,t21_t1,0)!=0) fail("pthread_create");
+    //Compute in this thread too, so that both threads use floating point
+    //at the same time
+    for(int i=0;i<5;i++)
+    {
+        volatile float value=t21_f2();
+        if(fabsf(value-sqrtf(2.0f))>0.00001f) fail("main");
+    }
+    if(pthread_join(t,0)!=0) fail("pthread_join");
+    pass();
+}
+
+//
+// Test 22
+//
+/*
+tests:
+__atomic_add()
+__exchange_and_add()
+These are not actually in the kernel but in the patches to gcc
+*/
+
+int t22_v1; //Incremented by t22_t1() and decremented by test_22()
+int t22_v2; //Decremented by t22_t1() and incremented by test_22()
+
+/**
+ * Body of the thread spawned by test_22(), applies 100000 atomic updates to
+ * t22_v1 and t22_v2, opposite in sign to the ones done by test_22()
+ * \return always 0
+ */
+void *t22_t1(void*)
+{
+    for(int i=0;i<100000;i++)
+    {
+        __gnu_cxx::__atomic_add(&t22_v1,1);
+        __gnu_cxx::__exchange_and_add(&t22_v2,-1);
+    }
+    return 0;
+}
+
+static void test_22()
+{
+    test_name("Atomics in gcc");
+    t22_v1=0;
+    t22_v2=0;
+    pthread_t t;
+    if(pthread_create(&t,0,t22_t1,0)!=0) fail("pthread_create");
+    for(int i=0;i<100000;i++)
+    {
+        __gnu_cxx::__atomic_add(&t22_v1,-1);
+        __gnu_cxx::__exchange_and_add(&t22_v2,1);
+    }
+    if(pthread_join(t,0)!=0) fail("pthread_join");
+    //Both threads applied the same number of opposite updates, so any nonzero
+    //value means that an update was lost
+    if(t22_v1!=0 || t22_v2!=0) fail("not thread safe");
+    pass();
+}
+
 #ifdef WITH_FILESYSTEM
 //
 // Filesystem test 1