diff --git a/dlls/d3dx9_36/d3dx9_36.spec b/dlls/d3dx9_36/d3dx9_36.spec
index f91f962..28259bc 100644
--- a/dlls/d3dx9_36/d3dx9_36.spec
+++ b/dlls/d3dx9_36/d3dx9_36.spec
@@ -130,8 +130,8 @@
 @ stub D3DXFillVolumeTextureTX
 @ stdcall D3DXFilterTexture(ptr ptr long long)
 @ stdcall D3DXFindShaderComment(ptr long ptr ptr)
-@ stub D3DXFloat16To32Array
-@ stub D3DXFloat32To16Array
+@ stdcall D3DXFloat16To32Array(ptr ptr long)
+@ stdcall D3DXFloat32To16Array(ptr ptr long)
 @ stub D3DXFrameAppendChild
 @ stub D3DXFrameCalculateBoundingSphere
 @ stub D3DXFrameDestroy
diff --git a/dlls/d3dx9_36/math.c b/dlls/d3dx9_36/math.c
index fdb5f92..6f8a71d 100644
--- a/dlls/d3dx9_36/math.c
+++ b/dlls/d3dx9_36/math.c
@@ -1769,3 +1769,123 @@ D3DXVECTOR4* WINAPI D3DXVec4TransformArray(D3DXVECTOR4* out, UINT outstride, CON
     }
     return out;
 }
+
+/* Convert a 32-bit float to a 16-bit half (1 sign, 5 exponent and 10 mantissa bits).
+ * Inf and NaN both yield 0x7fff (0xffff when the sign bit is set); finite values whose
+ * encoded exponent would exceed 31 saturate to the same pattern. */
+static inline unsigned short float_32_to_16(const float in)
+{
+    /* [2^10, 2^11) is the range of an 11-bit significand with its leading one explicit. */
+    const float mant_min = 1024.0f, mant_max = 2048.0f;
+    int exp = 0, origexp;
+    float tmp = fabsf(in);
+    int sign = signbit(in);
+    unsigned int mantissa;
+    unsigned short ret;
+
+    /* Deal with special numbers */
+    if (isinf(in) || isnan(in)) return (sign ? 0xffff : 0x7fff);
+    if (in == 0.0f) return (sign ? 0x8000 : 0x0000);
+
+    /* Scale tmp into [2^10, 2^11), tracking the applied power of two in exp. */
+    while (tmp < mant_min)
+    {
+        tmp *= 2.0f;
+        exp--;
+    }
+    while (tmp >= mant_max)
+    {
+        tmp /= 2.0f;
+        exp++;
+    }
+
+    exp += 10;  /* Normalize the mantissa */
+    exp += 15;  /* Exponent is encoded with excess 15 */
+
+    origexp = exp;
+
+    /* Truncate to an integer significand, then round on the discarded fraction. */
+    mantissa = (unsigned int) tmp;
+    if ((tmp - mantissa == 0.5f && mantissa % 2 == 1) || /* round half to even */
+        (tmp - mantissa > 0.5f)) mantissa++; /* round to nearest, away from zero */
+    if (mantissa == 2048)
+    {
+        mantissa = 1024; /* rounding carried out of the significand */
+        exp++;
+    }
+
+    if (exp > 31)
+    {
+        /* too big */
+        ret = 0x7fff; /* all exponent and mantissa bits set */
+    }
+    else if (exp <= 0)
+    {
+        unsigned int rounding = 0;
+
+        exp = origexp;
+
+        /* exp == 0: Non-normalized mantissa. Returns 0x0000 (=0.0) for too small numbers */
+        mantissa = (unsigned int) tmp;
+        mantissa &= 0x3ff;
+        mantissa |= 0x400; /* explicit the first bit */
+        while (exp <= 0)
+        {
+            rounding = mantissa & 1; /* keep only the last bit shifted out */
+            mantissa >>= 1;
+            exp++;
+        }
+        ret = mantissa + rounding;
+    }
+    else
+    {
+        ret = (exp << 10) | (mantissa & 0x3ff);
+    }
+
+    ret |= ((sign ? 1 : 0) << 15); /* Add the sign */
+    return ret;
+}
+
+/* Converts n floats from pin into half-precision values stored in pout; returns pout. */
+D3DXFLOAT16 *WINAPI D3DXFloat32To16Array(D3DXFLOAT16 *pout, CONST FLOAT *pin, UINT n)
+{
+    D3DXFLOAT16 *dst = pout;
+    CONST FLOAT *src = pin;
+    UINT left;
+
+    for (left = n; left; --left) (dst++)->value = float_32_to_16(*src++);
+
+    return pout;
+}
+
+/* Native d3dx9's D3DXFloat16To32Array lacks support for NaN and Inf. Specifically, e = 31 (2^16) is treated
+ * as a regular number - e.g., 0x7fff is converted to 131008.0 and 0xffff to -131008.0. */
+static inline float float_16_to_32(const unsigned short in)
+{
+    const unsigned short exponent = (in >> 10) & 0x1f; /* 5-bit biased exponent */
+    const unsigned short fraction = in & 0x3ff;         /* 10-bit mantissa field */
+    const float sign = (in & 0x8000) ? -1.0f : 1.0f;
+
+    /* Any non-zero exponent (31 included) is decoded with an implicit leading one. */
+    if (exponent != 0)
+        return sign * powf(2, (float)exponent - 15.0f) * (1.0f + ((float)fraction / 1024.0f));
+
+    /* Zero exponent, non-zero mantissa: fixed 2^-14 scale without the leading one. */
+    if (fraction != 0)
+        return sign * powf(2, -14.0f) * ((float)fraction / 1024.0f);
+
+    /* +0.0 or -0.0 */
+    return sign * 0.0f;
+}
+
+/* Expands n half-precision values from pin into floats stored in pout; returns pout. */
+FLOAT *WINAPI D3DXFloat16To32Array(FLOAT *pout, CONST D3DXFLOAT16 *pin, UINT n)
+{
+    FLOAT *dst = pout;
+    CONST D3DXFLOAT16 *src = pin;
+    UINT left;
+
+    for (left = n; left; --left) *dst++ = float_16_to_32((src++)->value);
+
+    return pout;
+}
diff --git a/dlls/d3dx9_36/tests/math.c b/dlls/d3dx9_36/tests/math.c
index 5ad1924..8a79893 100644
--- a/dlls/d3dx9_36/tests/math.c
+++ b/dlls/d3dx9_36/tests/math.c
@@ -21,6 +21,7 @@
 
 #include "wine/test.h"
 #include "d3dx9.h"
+#include <math.h>
 
 #define ARRAY_SIZE 5
 
@@ -2215,6 +2216,337 @@ static void test_D3DXVec_Array(void)
     compare_planes(exp_plane, out_plane);
 }
 
+#define  INT16_TYPE          short
+#define UINT16_TYPE unsigned short
+#define  INT32_TYPE          int   /* not long: long is 64 bits wide on LP64 targets */
+#define UINT32_TYPE unsigned int   /* has to be exactly as wide as the float it aliases */
+
+int singles2halfp(void *target, void *source, int numel)
+{
+    UINT16_TYPE *hp = (UINT16_TYPE *) target; // Type pun output as an unsigned 16-bit int
+    UINT32_TYPE *xp = (UINT32_TYPE *) source; // Type pun input as an unsigned 32-bit int
+    UINT16_TYPE    hs, he, hm;
+    UINT32_TYPE x, xs, xe, xm;
+    int hes;
+    static int next;  // Little Endian adjustment; only assigned in the check below, never read in this function
+    static int checkieee = 0;  // Flag to check for IEEE754, Endian, and word size; starts at 0, so the check below is skipped
+    double one = 1.0; // Used for checking IEEE754 floating point format
+    UINT32_TYPE *ip; // Used for checking IEEE754 floating point format
+    // Converts numel singles read from source into halves written to target; returns 0, or 1 if the format check fails
+    if( checkieee ) { // 1st call, so check for IEEE754, Endian, and word size
+        ip = (UINT32_TYPE *) &one;
+        if( *ip ) { // If Big Endian, then no adjustment
+            next = 0;
+        } else { // If Little Endian, then adjustment will be necessary
+            next = 1;
+            ip++;
+        }
+        if( *ip != 0x3FF00000u ) { // Check for exact IEEE 754 bit pattern of 1.0
+            return 1;  // Floating point bit pattern is not IEEE 754
+        }
+        if( sizeof(INT16_TYPE) != 2 || sizeof(INT32_TYPE) != 4 ) {
+            return 1;  // short is not 16-bits, or long is not 32-bits.
+        }
+        checkieee = 0; // Everything checks out OK
+    }
+    
+    if( source == NULL || target == NULL ) { // Nothing to convert (e.g., imag part of pure real)
+        return 0;
+    }
+    
+    while( numel-- ) {
+        x = *xp++;
+        if( (x & 0x7FFFFFFFu) == 0 ) {  // Signed zero
+            *hp++ = (UINT16_TYPE) (x >> 16);  // Return the signed zero
+        } else { // Not zero
+            xs = x & 0x80000000u;  // Pick off sign bit
+            xe = x & 0x7F800000u;  // Pick off exponent bits
+            xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+            if( xe == 0 ) {  // Denormal will underflow, return a signed zero
+                *hp++ = (UINT16_TYPE) (xs >> 16);
+            } else if( xe == 0x7F800000u ) {  // Inf or NaN (all the exponent bits are set)
+                if( xm == 0 ) { // If mantissa is zero ...
+                    *hp++ = (UINT16_TYPE) ((xs >> 16) | 0x7fffu); // Inf: sign bit plus 0x7fff (every exponent and mantissa bit set)
+                } else {
+                    *hp++ = (UINT16_TYPE) 0xFfffu; // NaN: always 0xffff, whatever the input sign
+                }
+            } else { // Normalized number
+                hs = (UINT16_TYPE) (xs >> 16); // Sign bit
+                hes = ((int)(xe >> 23)) - 127 + 15; // Exponent unbias the single, then bias the halfp
+                if( hes >= 0x1F ) {  // Overflow
+                    *hp++ = (UINT16_TYPE) ((xs >> 16) | 0x7fffu); // Same pattern as the Inf case: sign bit plus 0x7fff
+                } else if( hes <= 0 ) {  // Underflow
+                    if( (14 - hes) > 24 ) {  // Mantissa shifted all the way off & no rounding possibility
+                        hm = (UINT16_TYPE) 0u;  // Set mantissa to zero
+                    } else {
+                        xm |= 0x00800000u;  // Add the hidden leading bit
+                        hm = (UINT16_TYPE) (xm >> (14 - hes)); // Mantissa
+                        if( (xm >> (13 - hes)) & 0x00000001u ) // Check for rounding
+                            hm += (UINT16_TYPE) 1u; // Round, might overflow into exp bit, but this is OK
+                    }
+                    *hp++ = (hs | hm); // Combine sign bit and mantissa bits, biased exponent is zero
+                } else {
+                    he = (UINT16_TYPE) (hes << 10); // Exponent
+                    hm = (UINT16_TYPE) (xm >> 13); // Mantissa
+                    if( xm & 0x00001000u ) // Check for rounding
+                        *hp++ = (hs | he | hm) + (UINT16_TYPE) 1u; // Round, might overflow to inf, this is OK
+                    else
+                        *hp++ = (hs | he | hm);  // No rounding
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/* Formats the low 16 bits of x as a string of '0'/'1' characters, most significant bit first.
+ * The result lives in a static buffer that is overwritten by the next call. */
+const char *int_to_binary(int x)
+{
+    static char b[100];
+    char *out = b; /* write cursor; avoids rescanning the buffer as strcat() does */
+    int z;         /* declared ahead of the statements, as C89 requires */
+
+    /* One character per bit, from bit 15 down to bit 0. */
+    for (z = 32768; z > 0; z >>= 1)
+    {
+        *out++ = (x & z) ? '1' : '0';
+    }
+    *out = '\0';
+
+    return b;
+}
+
+/* Debug copy of the float -> half conversion: same arithmetic, plus printf tracing of the
+ * intermediate values on the exp <= 0 (denormal) path. Inf and NaN both yield 0x7fff
+ * (0xffff when the sign bit is set). */
+static inline unsigned short float_32_to_16(const float in)
+{
+    /* [2^10, 2^11) is the range of an 11-bit significand with its leading one explicit. */
+    const float mant_min = 1024.0f, mant_max = 2048.0f;
+    int exp = 0, origexp;
+    float tmp = fabsf(in);
+    int sign = signbit(in);
+    unsigned int mantissa;
+    unsigned short ret;
+
+    /* Deal with special numbers */
+    if (isinf(in) || isnan(in)) return (sign ? 0xffff : 0x7fff);
+    if (in == 0.0f) return (sign ? 0x8000 : 0x0000);
+
+    /* Scale tmp into [2^10, 2^11), tracking the applied power of two in exp. */
+    while (tmp < mant_min)
+    {
+        tmp *= 2.0f;
+        exp--;
+    }
+    while (tmp >= mant_max)
+    {
+        tmp /= 2.0f;
+        exp++;
+    }
+
+    exp += 10;  /* Normalize the mantissa */
+    exp += 15;  /* Exponent is encoded with excess 15 */
+
+    origexp = exp;
+
+    /* Truncate to an integer significand, then round on the discarded fraction. */
+    mantissa = (unsigned int) tmp;
+    if ((tmp - mantissa == 0.5f && mantissa % 2 == 1) || /* round half to even */
+        (tmp - mantissa > 0.5f)) mantissa++; /* round to nearest, away from zero */
+    if (mantissa == 2048)
+    {
+        mantissa = 1024; /* rounding carried out of the significand */
+        exp++;
+    }
+
+    if (exp > 31)
+    {
+        /* too big */
+        ret = 0x7fff; /* all exponent and mantissa bits set */
+    }
+    else if (exp <= 0)
+    {
+        unsigned int rounding = 0;
+
+        exp = origexp;
+
+        /* exp == 0: Non-normalized mantissa. Returns 0x0000 (=0.0) for too small numbers */
+        printf("\ttmp = %f\n", tmp);
+        mantissa = (unsigned int) tmp;
+        printf("\tmantissa = %u %s\n", mantissa, int_to_binary(mantissa));
+        mantissa &= 0x3ff;
+        printf("\tmantissa & 0x3ff = %u %s\n", mantissa, int_to_binary(mantissa));
+        mantissa |= 0x400; /* explicit the first bit */
+        printf("\tmantissa | 0x400 = %u %s\n", mantissa, int_to_binary(mantissa));
+        while (exp <= 0)
+        {
+            rounding = mantissa & 1; /* keep only the last bit shifted out */
+            mantissa >>= 1;
+            exp++;
+        }
+        printf("\trounding = %d\n", rounding);
+        ret = mantissa + rounding;
+    }
+    else
+    {
+        ret = (exp << 10) | (mantissa & 0x3ff);
+    }
+
+    ret |= ((sign ? 1 : 0) << 15); /* Add the sign */
+    return ret;
+}
+
+/* Formats the 32 bits of x as "s eeeeeeee mmm..." (sign, exponent and mantissa fields of a
+ * single, separated by spaces). Returns a static buffer that every call overwrites. */
+const char *single_to_binary(unsigned int x)
+{
+    static char b[100];
+    char *out = b;
+    unsigned int z; /* mask starts as an unsigned literal: 32768*65536 overflows a signed int */
+    int i = 0;
+
+    for (z = 0x80000000u; z > 0; z >>= 1, i++)
+    {
+        *out++ = (x & z) ? '1' : '0';
+        if (i == 0 || i == 8) *out++ = ' '; /* field separators */
+    }
+    *out = '\0';
+
+    return b;
+}
+
+/* Formats the low 16 bits of x as "s eeeee mmmmmmmmmm" (sign, exponent and mantissa fields
+ * of a half, separated by spaces). Returns a static buffer that every call overwrites. */
+const char *half_to_binary(int x)
+{
+    static char b[100];
+    char *out = b;
+    int z; /* declared ahead of the statements, as C89 requires */
+    int i = 0;
+
+    for (z = 32768; z > 0; z >>= 1, i++)
+    {
+        *out++ = (x & z) ? '1' : '0';
+        if (i == 0 || i == 5) *out++ = ' '; /* field separators */
+    }
+    *out = '\0';
+
+    return b;
+}
+
+/* Checks D3DXFloat32To16Array and D3DXFloat16To32Array against a table of expected values.
+ * Then, if d3dx9_36_2.dll can be loaded, compares its D3DXFloat32To16Array with the linked
+ * one over a sweep of float bit patterns; differences are printed, not reported as failures. */
+static void test_D3DXFloat_Array(void)
+{
+    unsigned int i;
+    void *out = NULL;
+    D3DXFLOAT16 half;
+    FLOAT single;
+    struct
+    {
+        FLOAT single_in;
+
+        /* half_ver2 occurs on WXPPROSP3 (32 bit math), WVISTAADM (32 bit math), W7PRO (32 bit math) */
+        WORD half_ver1, half_ver2;
+
+        /* single_out_ver2 confirms that half -> single conversion is consistent across platforms */
+        FLOAT single_out_ver1, single_out_ver2;
+    } testdata[] = {
+        { 80000.0f, 0x7c00, 0x7ce2, 65536.0f, 80000.0f },
+        { 65503.0f, 0x7bff, 0x7bff, 65504.0f, 65504.0f },
+        { 65504.0f, 0x7bff, 0x7bff, 65504.0f, 65504.0f },
+        { 65520.0f, 0x7bff, 0x7c00, 65504.0f, 65536.0f },
+        { 65521.0f, 0x7c00, 0x7c00, 65536.0f, 65536.0f },
+        { 65534.0f, 0x7c00, 0x7c00, 65536.0f, 65536.0f },
+        { 65535.0f, 0x7c00, 0x7c00, 65535.0f, 65536.0f },
+        { 65536.0f, 0x7c00, 0x7c00, 65536.0f, 65536.0f },
+        { -80000.0f, 0xfc00, 0xfce2, -65536.0f, -80000.0f },
+        { -65503.0f, 0xfbff, 0xfbff, -65504.0f, -65504.0f },
+        { -65504.0f, 0xfbff, 0xfbff, -65504.0f, -65504.0f },
+        { -65520.0f, 0xfbff, 0xfc00, -65504.0f, -65536.0f },
+        { -65521.0f, 0xfc00, 0xfc00, -65536.0f, -65536.0f },
+        { -65534.0f, 0xfc00, 0xfc00, -65536.0f, -65536.0f },
+        { -65535.0f, 0xfc00, 0xfc00, -65535.0f, -65536.0f },
+        { -65536.0f, 0xfc00, 0xfc00, -65536.0f, -65536.0f },
+        { INFINITY, 0x7c00, 0x7fff, 65536.0f, 131008.0f },
+        { -INFINITY, 0xffff, 0xffff, -131008.0f, -131008.0f },
+        { NAN, 0x7fff, 0x7fff, 131008.0f, 131008.0f },
+        { -NAN, 0xffff, 0xffff, -131008.0f, -131008.0f },
+        { 0.0f, 0x0, 0x0, 0.0f, 0.0f },
+        { -0.0f, 0x8000, 0x8000, 0.0f, 0.0f }
+    };
+
+    /* exception on NULL out or in parameter */
+    out = D3DXFloat32To16Array(&half, &single, 0);
+    ok(out == &half, "Got %p, expected %p.\n", out, &half);
+
+    out = D3DXFloat16To32Array(&single, (D3DXFLOAT16 *)&half, 0);
+    ok(out == &single, "Got %p, expected %p.\n", out, &single);
+
+    for (i = 0; i < sizeof(testdata)/sizeof(testdata[0]); i++)
+    {
+        out = D3DXFloat32To16Array(&half, &testdata[i].single_in, 1);
+        ok(out == &half, "Got %p, expected %p.\n", out, &half);
+        ok(half.value == testdata[i].half_ver1 || half.value == testdata[i].half_ver2,
+           "Got %x, expected %x or %x for index %u.\n", half.value, testdata[i].half_ver1,
+           testdata[i].half_ver2, i);
+
+        out = D3DXFloat16To32Array(&single, (D3DXFLOAT16 *)&testdata[i].half_ver1, 1);
+        ok(out == &single, "Got %p, expected %p.\n", out, &single);
+        ok(relative_error(single, testdata[i].single_out_ver1) < admitted_error,
+           "Got %f, expected %f for index %u.\n", single, testdata[i].single_out_ver1, i);
+
+        out = D3DXFloat16To32Array(&single, (D3DXFLOAT16 *)&testdata[i].half_ver2, 1);
+        ok(out == &single, "Got %p, expected %p.\n", out, &single);
+        ok(relative_error(single, testdata[i].single_out_ver2) < admitted_error,
+           "Got %f, expected %f for index %u.\n", single, testdata[i].single_out_ver2, i);
+    }
+
+    {
+        HMODULE dll_handle = NULL;
+        D3DXFLOAT16 res, res2;
+        D3DXFLOAT16* (WINAPI * float32to16)(D3DXFLOAT16 *pout, CONST FLOAT *pin, UINT n);
+        union
+        {
+            float f;
+            DWORD d;
+        } x;
+
+        dll_handle = LoadLibraryA("d3dx9_36_2.dll");
+        if (!dll_handle)
+        {
+            skip("init: Could not load d3dx9_36_2.dll.\n");
+            return; /* without the library there is nothing to compare against */
+        }
+
+        float32to16 = (void *)GetProcAddress(dll_handle, "D3DXFloat32To16Array");
+        if (!float32to16)
+        {
+            FreeLibrary(dll_handle);
+            skip("init: Could not get function pointer (D3DXFloat32To16Array).\n");
+            return; /* continuing would call through a NULL function pointer */
+        }
+
+        /* Walk float bit patterns in steps of 0xff and print every input where the two differ. */
+        for (i = 0; i < 0xfffff000; i += 0xff)
+        {
+            x.d = i;
+            float32to16(&res, &x.f, 1);
+            D3DXFloat32To16Array(&res2, &x.f, 1);
+
+            if (res.value != res2.value)
+            {
+                printf("%s: res2 - res = %d\n", single_to_binary(x.d), res2.value - res.value);
+                float_32_to_16(x.f); /* local traced copy: prints its denormal-path steps */
+            }
+        }
+        FreeLibrary(dll_handle);
+    }
+}
+
 START_TEST(math)
 {
     D3DXColorTest();
@@ -2230,4 +2562,5 @@ START_TEST(math)
     test_Matrix_Decompose();
     test_Matrix_Transformation2D();
     test_D3DXVec_Array();
+    test_D3DXFloat_Array();
 }
diff --git a/include/d3dx9math.h b/include/d3dx9math.h
index f842e3e..cdb1deb 100644
--- a/include/d3dx9math.h
+++ b/include/d3dx9math.h
@@ -261,6 +261,21 @@ typedef struct D3DXCOLOR
     FLOAT r, g, b, a;
 } D3DXCOLOR, *LPD3DXCOLOR;
 
+typedef struct D3DXFLOAT16
+{
+#ifdef __cplusplus
+    D3DXFLOAT16();                      /* empty body in d3dx9math.inl: value is not set */
+    D3DXFLOAT16(FLOAT f);               /* converts f through D3DXFloat32To16Array */
+    D3DXFLOAT16(CONST D3DXFLOAT16 &f);  /* copies the stored 16-bit value */
+
+    operator FLOAT ();                  /* converts back through D3DXFloat16To32Array */
+
+    BOOL operator == (CONST D3DXFLOAT16 &) const; /* compares the stored 16-bit values */
+    BOOL operator != (CONST D3DXFLOAT16 &) const; /* compares the stored 16-bit values */
+#endif /* __cplusplus */
+    WORD value; /* the half-precision number as a raw 16-bit pattern; the only data member */
+} D3DXFLOAT16, *LPD3DXFLOAT16;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -358,6 +373,9 @@ D3DXVECTOR4* WINAPI D3DXVec4Normalize(D3DXVECTOR4 *pout, CONST D3DXVECTOR4 *pv);
 D3DXVECTOR4* WINAPI D3DXVec4Transform(D3DXVECTOR4 *pout, CONST D3DXVECTOR4 *pv, CONST D3DXMATRIX *pm);
 D3DXVECTOR4* WINAPI D3DXVec4TransformArray(D3DXVECTOR4 *pout, UINT outstride, CONST D3DXVECTOR4 *pv, UINT vstride, CONST D3DXMATRIX *pm, UINT n);
 
+D3DXFLOAT16 *WINAPI D3DXFloat32To16Array(D3DXFLOAT16 *pout, CONST FLOAT *pin, UINT n);
+FLOAT *WINAPI D3DXFloat16To32Array(FLOAT *pout, CONST D3DXFLOAT16 *pin, UINT n);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/d3dx9math.inl b/include/d3dx9math.inl
index 3cd078a..3f55aef 100644
--- a/include/d3dx9math.inl
+++ b/include/d3dx9math.inl
@@ -851,6 +851,37 @@ inline BOOL D3DXCOLOR::operator != (CONST D3DXCOLOR& col) const
     return r != col.r || g != col.g || b != col.b || a != col.a;
 }
 
+/* Default constructor: performs no initialisation of value. */
+inline D3DXFLOAT16::D3DXFLOAT16() {}
+
+/* Builds the half from a float through D3DXFloat32To16Array. */
+inline D3DXFLOAT16::D3DXFLOAT16(FLOAT f)
+{
+    D3DXFloat32To16Array(this, &f, 1);
+}
+
+/* Copy constructor: duplicates the stored 16-bit pattern. */
+inline D3DXFLOAT16::D3DXFLOAT16(CONST D3DXFLOAT16 &f) : value(f.value) {}
+
+/* Widens the half back to a float through D3DXFloat16To32Array. */
+inline D3DXFLOAT16::operator FLOAT ()
+{
+    FLOAT widened;
+    D3DXFloat16To32Array(&widened, this, 1);
+    return widened;
+}
+
+/* Comparisons act on the stored 16-bit patterns, so 0x0000 and 0x8000 are unequal. */
+inline BOOL D3DXFLOAT16::operator == (CONST D3DXFLOAT16 &f) const
+{
+    return f.value == value;
+}
+
+inline BOOL D3DXFLOAT16::operator != (CONST D3DXFLOAT16 &f) const
+{
+    return f.value != value;
+}
+
 #endif /* __cplusplus */
 
 /*_______________D3DXCOLOR_____________________*/