diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index d869c04..f5eb034 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -42,6 +42,7 @@
 #endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
+#include "pixman-inlines.h"
 
 #define no_vERBOSE
 
@@ -3506,6 +3507,109 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
     _mm_empty ();
 }
 
+#define BILINEAR_DECLARE_VARIABLES /* needs wt, wb, vx, unit_x in scope */	\
+    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); /* multiplies src_top pixels */ \
+    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); /* multiplies src_bottom pixels */ \
+    const __m64 mm_xorc_hi = _mm_setzero_si64 (); /* xor 0, add 0: weight = mm_x >> 8 */ \
+    const __m64 mm_xorc_lo = _mm_set_pi16 (0xff, 0xff, 0xff, 0xff); /* xor 0xff, add 1: 256 - (mm_x >> 8) */ \
+    const __m64 mm_addc_hi = _mm_set_pi16 (0, 0, 0, 0);				\
+    const __m64 mm_addc_lo = _mm_set_pi16 (1, 1, 1, 1);				\
+    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); /* step added to mm_x per pixel */ \
+    const __m64 mm_zero = _mm_setzero_si64 (); /* zero bytes interleaved when widening 8 -> 16 bits */ \
+    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) /* no ';' here: the invocation site supplies it */
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
+do { /* blend the 2x2 block at vx into pix, then advance vx and mm_x */	\
+    __m64 mm_wh_hi, mm_wh_lo, mm_hi_lo, mm_hi_hi, mm_lo_lo, mm_lo_hi, hi, lo, t, b, t_hi, t_lo, b_hi, b_lo;		\
+    /* fetch 2x2 pixel block into MMX registers: left pixel low, right high */	\
+    uint32_t tl = src_top [pixman_fixed_to_int (vx)];				\
+    uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];			\
+    uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];			\
+    uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];			\
+    t = _mm_set_pi32 (tr, tl);							\
+    b = _mm_set_pi32 (br, bl);							\
+    vx += unit_x;								\
+    /* vertical interpolation: *_lo is the left column, *_hi the right */	\
+    t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
+    t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
+    b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
+    b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
+    hi = _mm_add_pi16 (t_hi, b_hi);						\
+    lo = _mm_add_pi16 (t_lo, b_lo);						\
+    /* horizontal weights: 256 - f for left, f for right (f = mm_x >> 8) */	\
+    mm_wh_hi = _mm_add_pi16 (mm_addc_hi,					\
+			     _mm_xor_si64 (mm_xorc_hi,				\
+					_mm_srli_pi16 (mm_x, 8)));		\
+    mm_wh_lo = _mm_add_pi16 (mm_addc_lo,					\
+			     _mm_xor_si64 (mm_xorc_lo,				\
+					_mm_srli_pi16 (mm_x, 8)));		\
+    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
+    /* horizontal interpolation: per channel, left + right 32-bit products */	\
+    mm_hi_lo = _mm_mullo_pi16 (hi, mm_wh_hi);					\
+    mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);					\
+    mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);					\
+    mm_lo_hi = _mm_mulhi_pu16 (lo, mm_wh_lo);					\
+    hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_lo_hi), /* ch 2, 3 */	\
+		       _mm_unpackhi_pi16 (mm_hi_lo, mm_hi_hi));			\
+    lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_lo_hi), /* ch 0, 1 */	\
+		       _mm_unpacklo_pi16 (mm_hi_lo, mm_hi_hi));			\
+    /* shift and pack the result */						\
+    hi = _mm_srli_pi32 (hi, 16);						\
+    lo = _mm_srli_pi32 (lo, 16);						\
+    hi = _mm_packs_pi32 (lo, hi);						\
+    hi = _mm_packs_pu16 (hi, hi);						\
+    pix = _mm_cvtsi64_si32 (hi);						\
+} while (0)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
+					    const uint32_t * mask,       /* unused */
+					    const uint32_t * src_top,
+					    const uint32_t * src_bottom,
+					    int32_t          w,
+					    int              wt,
+					    int              wb,
+					    pixman_fixed_t   vx,
+					    pixman_fixed_t   unit_x,
+					    pixman_fixed_t   max_vx,     /* unused */
+					    pixman_bool_t    zero_src)   /* unused */
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2;
+    /* Write w pixels to dst, each blended from src_top/src_bottom at vx. */
+    while ((w -= 2) >= 0)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	*dst++ = pix1;
+	*dst++ = pix2;
+    }
+    /* The loop leaves w == -1 if one pixel remains, -2 if none. */
+    if (w & 1)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	*dst = pix1;
+    }
+    _mm_empty (); /* leave MMX state, as the preceding function in this file does */
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, /* all four share one scanline function */
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE) /* NOTE(review): macro not defined in this patch, presumably pixman-inlines.h -- confirm */
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC, /* differs only in name and the PAD argument */
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC, /* differs only in name and the NONE argument */
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC, /* differs only in name and the NORMAL argument */
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
 static uint32_t *
 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
 {
@@ -3761,6 +3865,10 @@ static const pixman_fast_path_t mmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
     PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
 
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
+
     { PIXMAN_OP_NONE },
 };
 
