/* simd/jsimd_mips_dspr2.S - external/github.com/libjpeg-turbo/libjpeg-turbo - Git at Google */

 /*
  * MIPS DSPr2 optimizations for libjpeg-turbo
  *
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * All rights reserved.
  * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
  *           Darko Laus       (darko.laus@imgtec.com)
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
  * arising from the use of this software.
  *
  * Permission is granted to anyone to use this software for any purpose,
  * including commercial applications, and to alter it and redistribute it
  * freely, subject to the following restrictions:
  *
  * 1. The origin of this software must not be misrepresented; you must not
  *    claim that you wrote the original software. If you use this software
  *    in a product, an acknowledgment in the product documentation would be
  *    appreciated but is not required.
  * 2. Altered source versions must be plainly marked as such, and must not be
  *    misrepresented as being the original software.
  * 3. This notice may not be removed or altered from any source distribution.
  */

 #include "jsimd_mips_dspr2_asm.h"

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
 /*
  * a0     - cinfo->image_width
  * a1     - input_buf
  * a2     - output_buf
  * a3     - output_row
  * 16(sp) - num_rows
  * 20(sp) - cinfo->num_components
  *
  * Null conversion for compression
  *
  * Copies samples unchanged while de-interleaving them: every input row is
  * read with a stride of num_components bytes, starting at byte offset ci,
  * and written contiguously to output_buf[ci][output_row].
  *
  * Register usage inside the loops:
  *   t9 - rows still to process        s0 - num_components (input stride)
  *   t0 - image_width & 3              t1 - ci (component index)
  *   t2 - inptr                        t5 - outptr
  *   t6 - outptr end of residual part  s1 - outptr end of the row
  *
  * Two copies of the row loop exist: labels 0-3 (and 8) are used when the
  * width is not a multiple of 4, labels 4-6 when it is.
  * NOTE(review): image_width == 0 is not handled by either path - confirm
  * that callers never pass it.
  */

     SAVE_REGS_ON_STACK 8, s0, s1

     // The stack arguments are read 8 bytes above their documented entry
     // offsets, matching the 8-byte frame requested above.
     lw        t9, 24(sp)   // t9 = num_rows
     lw        s0, 28(sp)   // s0 = cinfo->num_components
     andi      t0, a0, 3    // t0 = cinfo->image_width & 3
     beqz      t0, 4f       // no residual
      nop
 0:                         // ---- row loop, width % 4 != 0 ----
     addiu     t9, t9, -1
     bltz      t9, 7f       // all rows done
      li       t1, 0        // (delay slot) ci = 0
 1:                         // ---- component loop ----
     sll       t3, t1, 2
     lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
     lw        t2, 0(a1)    // t2 = inptr = *input_buf
     sll       t4, a3, 2
     lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
     addu      t2, t2, t1   // inptr += ci
     addu      s1, t5, a0   // s1 = outptr + image_width (end of row)
     addu      t6, t5, t0   // t6 = outptr + residual count
 2:                         // copy the (image_width & 3) leading samples
     lbu       t3, 0(t2)
     addiu     t5, t5, 1
     sb        t3, -1(t5)
     bne       t6, t5, 2b
      addu     t2, t2, s0   // (delay slot) inptr += num_components
     // For image_width < 4 the residual loop has already filled the row.
     // Loop 3 tests its exit condition only after copying four samples, so
     // entering it here would write past the row and never terminate.
     beq       s1, t5, 8f
      nop
 3:                         // copy the remaining samples four at a time
     lbu       t3, 0(t2)
     addu      t4, t2, s0
     addu      t7, t4, s0
     addu      t8, t7, s0
     addu      t2, t8, s0   // inptr += 4 * num_components
     lbu       t4, 0(t4)
     lbu       t7, 0(t7)
     lbu       t8, 0(t8)
     addiu     t5, t5, 4
     sb        t3, -4(t5)
     sb        t4, -3(t5)
     sb        t7, -2(t5)
     bne       s1, t5, 3b
      sb       t8, -1(t5)
 8:
     addiu     t1, t1, 1    // ++ci
     bne       t1, s0, 1b
      nop
     addiu     a1, a1, 4    // ++input_buf
     bgez      t9, 0b       // t9 >= 0 here (checked by the bltz above)
      addiu    a3, a3, 1    // (delay slot) ++output_row
     b         7f
      nop
 4:                         // ---- row loop, width % 4 == 0 ----
     addiu     t9, t9, -1
     bltz      t9, 7f       // all rows done
      li       t1, 0        // (delay slot) ci = 0
 5:                         // ---- component loop ----
     sll       t3, t1, 2
     lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
     lw        t2, 0(a1)    // t2 = inptr = *input_buf
     sll       t4, a3, 2
     lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
     addu      t2, t2, t1   // inptr += ci
     addu      s1, t5, a0   // s1 = outptr + image_width (end of row)
 6:                         // copy four samples per iteration
     lbu       t3, 0(t2)
     addu      t4, t2, s0
     addu      t7, t4, s0
     addu      t8, t7, s0
     addu      t2, t8, s0   // inptr += 4 * num_components
     lbu       t4, 0(t4)
     lbu       t7, 0(t7)
     lbu       t8, 0(t8)
     addiu     t5, t5, 4
     sb        t3, -4(t5)
     sb        t4, -3(t5)
     sb        t7, -2(t5)
     bne       s1, t5, 6b
      sb       t8, -1(t5)
     addiu     t1, t1, 1    // ++ci
     bne       t1, s0, 5b
      nop
     addiu     a1, a1, 4    // ++input_buf
     bgez      t9, 4b       // t9 >= 0 here (checked by the bltz above)
      addiu    a3, a3, 1    // (delay slot) ++output_row
 7:
     RESTORE_REGS_FROM_STACK 8, s0, s1

     j         ra
      nop

 END(jsimd_c_null_convert_mips_dspr2)

 /*****************************************************************************/
 /*
  * jsimd_extrgb_ycc_convert_mips_dspr2
  * jsimd_extbgr_ycc_convert_mips_dspr2
  * jsimd_extrgbx_ycc_convert_mips_dspr2
  * jsimd_extbgrx_ycc_convert_mips_dspr2
  * jsimd_extxbgr_ycc_convert_mips_dspr2
  * jsimd_extxrgb_ycc_convert_mips_dspr2
  *
  * Colorspace conversion RGB -> YCbCr
  */

 /*
  * Generator macro: emits one function jsimd_<colorid>_ycc_convert_mips_dspr2
  * per invocation.
  *   colorid    - name fragment spliced into the function symbol
  *   pixel_size - bytes per input pixel (input pointer step)
  *   r_offs, g_offs, b_offs - byte offsets of R, G and B inside a pixel
  */
 .macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs

 /*
  * Load one pixel: \r, \g, \b receive the zero-extended R, G, B bytes found
  * at \inptr, then \inptr is advanced by pixel_size.
  */
 .macro DO_RGB_TO_YCC r,    \
                      g,    \
                      b,    \
                      inptr
     lbu     \r, \r_offs(\inptr)
     lbu     \g, \g_offs(\inptr)
     lbu     \b, \b_offs(\inptr)
     addiu   \inptr, \pixel_size
 .endm

 LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
 /*
  * a0     - cinfo->image_width
  * a1     - input_buf
  * a2     - output_buf
  * a3     - output_row
  * 16(sp) - num_rows
  *
  * Converts num_rows rows of interleaved RGB pixels into three separate
  * planes (output_buf[0..2], presumably Y, Cb, Cr in that order) using
  * 16.16 fixed-point coefficients held in s0-s7.
  * Both the row loop and the pixel loop test their condition at the bottom,
  * so at least one row and one pixel are always processed.
  */

     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     // num_rows is read 32 bytes above its documented entry offset, matching
     // the 32-byte frame requested above.
     lw      t7, 48(sp)        // t7 = num_rows
     li      s0, 0x4c8b        // FIX(0.29900)
     li      s1, 0x9646        // FIX(0.58700)
     li      s2, 0x1d2f        // FIX(0.11400)
     li      s3, 0xffffd4cd    // -FIX(0.16874)
     li      s4, 0xffffab33    // -FIX(0.33126)
     li      s5, 0x8000        // FIX(0.50000)
     li      s6, 0xffff94d1    // -FIX(0.41869)
     li      s7, 0xffffeb2f    // -FIX(0.08131)
     li      t8, 0x807fff      // CBCR_OFFSET + ONE_HALF-1

 0:                            // ---- row loop ----
     addiu   t7, -1            // --num_rows
     lw      t6, 0(a1)         // t6 = input_buf[0]
     lw      t0, 0(a2)         // t0 = output_buf[0]
     lw      t1, 4(a2)         // t1 = output_buf[1]
     lw      t2, 8(a2)         // t2 = output_buf[2]
     sll     t3, a3, 2         // t3 = output_row * 4 (pointer-array offset)
     lwx     t0, t3(t0)        // t0 = output_buf[0][output_row]
     lwx     t1, t3(t1)        // t1 = output_buf[1][output_row]
     lwx     t2, t3(t2)        // t2 = output_buf[2][output_row]

     addu    t9, t2, a0        // t9 = end address
     addiu   a3, 1             // ++output_row

 1:                            // ---- pixel loop ----
     DO_RGB_TO_YCC t3, t4, t5, t6   // t3 = R, t4 = G, t5 = B; t6 advances

     // Three accumulators are filled in an interleaved order:
     //   ac0 = s5 + s2*B + s0*R + s1*G   (first plane, rounding term s5)
     //   ac1 = t8 + s5*B + s3*R + s4*G   (second plane)
     //   ac2 = t8 + s5*R + s6*G + s7*B   (third plane)
     // Only the LO half is initialised; only the byte at bits 16..23 of each
     // extracted word is stored below.
     mtlo    s5, $ac0
     mtlo    t8, $ac1
     mtlo    t8, $ac2
     maddu   $ac0, s2, t5
     maddu   $ac1, s5, t5
     maddu   $ac2, s5, t3
     maddu   $ac0, s0, t3
     maddu   $ac1, s3, t3
     maddu   $ac2, s6, t4
     maddu   $ac0, s1, t4
     maddu   $ac1, s4, t4
     maddu   $ac2, s7, t5
     extr.w  t3, $ac0, 16      // t3 = ac0 >> 16
     extr.w  t4, $ac1, 16      // t4 = ac1 >> 16
     extr.w  t5, $ac2, 16      // t5 = ac2 >> 16
     sb      t3, 0(t0)
     sb      t4, 0(t1)
     sb      t5, 0(t2)
     addiu   t0, 1
     addiu   t2, 1
     bne     t2, t9, 1b        // until the third plane pointer hits the end
      addiu  t1, 1             // (delay slot) advance second plane pointer
     bgtz    t7, 0b            // more rows?
      addiu  a1, 4             // (delay slot) ++input_buf

     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     j ra
      nop
 END(jsimd_\colorid\()_ycc_convert_mips_dspr2)

 // Drop the helper so the next invocation can redefine it with new offsets.
 .purgem DO_RGB_TO_YCC

 .endm

 /*------------------------------------------id -- pix R  G  B */
 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3

 /*****************************************************************************/
 /*
  * jsimd_ycc_extrgb_convert_mips_dspr2
  * jsimd_ycc_extbgr_convert_mips_dspr2
  * jsimd_ycc_extrgbx_convert_mips_dspr2
  * jsimd_ycc_extbgrx_convert_mips_dspr2
  * jsimd_ycc_extxbgr_convert_mips_dspr2
  * jsimd_ycc_extxrgb_convert_mips_dspr2
  *
  * Colorspace conversion YCbCr -> RGB
  */

 /*
  * Generator macro: emits one function jsimd_ycc_<colorid>_convert_mips_dspr2
  * per invocation.
  *   colorid    - name fragment spliced into the function symbol
  *   pixel_size - bytes per output pixel (output pointer step)
  *   r_offs, g_offs, b_offs - byte offsets of R, G and B inside a pixel
  *   a_offs     - byte offset of the filler byte; used only if pixel_size == 4
  */
 .macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs

 /*
  * Store one pixel: the low bytes of scratch0/1/2 go to the R/G/B offsets at
  * \outptr.  For 4-byte pixels 0xFF is also written at a_offs, using t0 as
  * a temporary (t0 is overwritten after the three colour stores).  \outptr
  * is then advanced by pixel_size.
  */
 .macro STORE_YCC_TO_RGB  scratch0 \
                          scratch1 \
                          scratch2 \
                          outptr
     sb       \scratch0, \r_offs(\outptr)
     sb       \scratch1, \g_offs(\outptr)
     sb       \scratch2, \b_offs(\outptr)
 .if (\pixel_size == 4)
     li       t0, 0xFF
     sb       t0, \a_offs(\outptr)
 .endif
     addiu    \outptr, \pixel_size
 .endm

 LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
 /*
  * a0     - cinfo->image_width
  * a1     - input_buf
  * a2     - input_row
  * a3     - output_buf
  * 16(sp) - num_rows
  *
  * Converts num_rows rows from three separate planes (input_buf[0..2], read
  * as y, cb, cr) into interleaved pixels written through output_buf.
  * Both loops test their condition at the bottom, so at least one row and
  * one pixel are always processed.
  */

     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     // num_rows is read 32 bytes above its documented entry offset, matching
     // the 32-byte frame requested above.
     lw         s1, 48(sp)      // s1 = num_rows
     li         t3, 0x8000      // rounding term added before the >> 16
     li         t4, 0x166e9     // FIX(1.40200)
     li         t5, 0x1c5a2     // FIX(1.77200)
     li         t6, 0xffff492e  // -FIX(0.71414)
     li         t7, 0xffffa7e6  // -FIX(0.34414)
     repl.ph    t8, 128         // t8 = 128 in both halfwords

 0:                             // ---- row loop ----
     lw         s0, 0(a3)       // s0 = outptr = *output_buf
     lw         t0, 0(a1)       // t0 = input_buf[0]
     lw         t1, 4(a1)       // t1 = input_buf[1]
     lw         t2, 8(a1)       // t2 = input_buf[2]
     sll        s5, a2, 2       // s5 = input_row * 4 (pointer-array offset)
     addiu      s1, -1          // --num_rows
     lwx        s2, s5(t0)      // s2 = input_buf[0][input_row]
     lwx        s3, s5(t1)      // s3 = input_buf[1][input_row]
     lwx        s4, s5(t2)      // s4 = input_buf[2][input_row]
     addu       t9, s2, a0      // t9 = end address of the first plane row
     addiu      a2, 1           // ++input_row

 1:                             // ---- pixel loop ----
     lbu        s7, 0(s4)       // cr
     lbu        s6, 0(s3)       // cb
     lbu        s5, 0(s2)       // y
     addiu      s2, 1
     addiu      s4, 1
     addiu      s7, -128        // cr - 128
     addiu      s6, -128        // cb - 128
     mul        t2, t7, s6      // t2 = -FIX(0.34414) * (cb - 128)
     mul        t0, t6, s7      // Crgtab[cr]
     sll        s7, 15          // pre-scale for the fractional multiply
     mulq_rs.w  t1, t4, s7      // Crrtab[cr]
     sll        s6, 15          // pre-scale for the fractional multiply
     addu       t2, t3          // Cbgtab[cb]
     addu       t2, t0          // t2 = green term before the shift

     mulq_rs.w  t0, t5, s6      // Cbbtab[cb]
     sra        t2, 16          // t2 = green offset
     addu       t1, s5          // t1 = y + red offset
     addu       t2, s5          // add y
     ins        t2, t1, 16, 16  // t2 = red (upper half) | green (lower half)
     // Red and green are handled as a halfword pair, blue as a word:
     // subtract 128, saturating shift left, arithmetic shift right, add 128.
     subu.ph    t2, t2, t8
     addu       t0, s5          // t0 = y + blue offset
     shll_s.ph  t2, t2, 8
     subu       t0, 128
     shra.ph    t2, t2, 8
     shll_s.w   t0, t0, 24
     addu.ph    t2, t2, t8      // clip & store
     sra        t0, t0, 24
     sra        t1, t2, 16      // t1 = red; green stays in the low byte of t2
     addiu      t0, 128         // t0 = blue

     STORE_YCC_TO_RGB t1, t2, t0, s0

     bne        s2, t9, 1b      // until the first plane pointer hits the end
      addiu     s3, 1           // (delay slot) advance the cb pointer
     bgtz       s1, 0b          // more rows?
      addiu     a3, 4           // (delay slot) ++output_buf

     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     j ra
      nop
 END(jsimd_ycc_\colorid\()_convert_mips_dspr2)

 // Drop the helper so the next invocation can redefine it with new offsets.
 .purgem STORE_YCC_TO_RGB

 .endm

 /*------------------------------------------id -- pix R  G  B  A */
 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0

 /*****************************************************************************/
 /*
  * jsimd_extrgb_gray_convert_mips_dspr2
  * jsimd_extbgr_gray_convert_mips_dspr2
  * jsimd_extrgbx_gray_convert_mips_dspr2
  * jsimd_extbgrx_gray_convert_mips_dspr2
  * jsimd_extxbgr_gray_convert_mips_dspr2
  * jsimd_extxrgb_gray_convert_mips_dspr2
  *
  * Colorspace conversion RGB -> GRAY
  */

 /*
  * Generator macro: emits one function jsimd_<colorid>_gray_convert_mips_dspr2
  * per invocation.
  *   colorid    - name fragment spliced into the function symbol
  *   pixel_size - bytes per input pixel (input pointer step)
  *   r_offs, g_offs, b_offs - byte offsets of R, G and B inside a pixel
  */
 .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs

 /*
  * Load one pixel: \r, \g, \b receive the zero-extended R, G, B bytes found
  * at \inptr, then \inptr is advanced by pixel_size.
  */
 .macro DO_RGB_TO_GRAY r,    \
                       g,    \
                       b,    \
                       inptr
     lbu     \r, \r_offs(\inptr)
     lbu     \g, \g_offs(\inptr)
     lbu     \b, \b_offs(\inptr)
     addiu   \inptr, \pixel_size
 .endm

 LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
 /*
  * a0     - cinfo->image_width
  * a1     - input_buf
  * a2     - output_buf
  * a3     - output_row
  * 16(sp) - num_rows
  *
  * Converts num_rows rows of interleaved RGB pixels into one plane,
  * output_buf[0], as (s7 + s0*R + s1*G + s2*B) >> 16.
  * Each row is handled as a four-pixel main loop followed by a one-pixel
  * loop for the remaining (image_width & 3) pixels.  The row loop tests its
  * condition at the bottom, so at least one row is always processed.
  */

     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     li      s0, 0x4c8b             // s0 = FIX(0.29900)
     li      s1, 0x9646             // s1 = FIX(0.58700)
     li      s2, 0x1d2f             // s2 = FIX(0.11400)
     li      s7, 0x8000             // s7 = FIX(0.50000)
     // num_rows is read 32 bytes above its documented entry offset, matching
     // the 32-byte frame requested above.
     lw      s6, 48(sp)             // s6 = num_rows
     andi    t7, a0, 3              // t7 = image_width & 3 (residual pixels)

 0:                                 // ---- row loop ----
     addiu   s6, -1                 // s6 = num_rows
     lw      t0, 0(a1)              // t0 = inptr = *input_buf
     lw      t1, 0(a2)              // t1 = output_buf[0]
     sll     t3, a3, 2              // t3 = output_row * 4
     lwx     t1, t3(t1)             // t1 = outptr = output_buf[0][output_row]
     addiu   a3, 1                  // ++output_row
     addu    t9, t1, a0             // t9 = end of the output row
     subu    t8, t9, t7             // t8 = end of the four-pixel part
     beq     t1, t8, 2f             // fewer than four pixels: residual only
      nop

 1:                                 // ---- four pixels per iteration ----
     DO_RGB_TO_GRAY t3, t4, t5, t0  // pixel 0 -> t3, t4, t5
     DO_RGB_TO_GRAY s3, s4, s5, t0  // pixel 1 -> s3, s4, s5

     mtlo    s7, $ac0               // ac0 = rounding term
     maddu   $ac0, s2, t5
     maddu   $ac0, s1, t4
     maddu   $ac0, s0, t3
     mtlo    s7, $ac1               // ac1 = rounding term
     maddu   $ac1, s2, s5
     maddu   $ac1, s1, s4
     maddu   $ac1, s0, s3
     extr.w  t6, $ac0, 16           // t6 = result for pixel 0

     DO_RGB_TO_GRAY t3, t4, t5, t0  // pixel 2 -> t3, t4, t5
     DO_RGB_TO_GRAY s3, s4, s5, t0  // pixel 3 -> s3, s4, s5

     mtlo    s7, $ac0
     maddu   $ac0, s2, t5
     maddu   $ac0, s1, t4
     extr.w  t2, $ac1, 16           // t2 = result for pixel 1 (read before ac1 is reused)
     maddu   $ac0, s0, t3
     mtlo    s7, $ac1
     maddu   $ac1, s2, s5
     maddu   $ac1, s1, s4
     maddu   $ac1, s0, s3
     extr.w  t5, $ac0, 16           // t5 = result for pixel 2
     sb      t6, 0(t1)
     sb      t2, 1(t1)
     extr.w  t3, $ac1, 16           // t3 = result for pixel 3
     addiu   t1, 4
     sb      t5, -2(t1)
     sb      t3, -1(t1)
     bne     t1, t8, 1b
      nop

 2:
     beqz    t7, 4f                 // no residual pixels in this row
      nop

 3:                                 // ---- one pixel per iteration ----
     DO_RGB_TO_GRAY t3, t4, t5, t0

     mtlo    s7, $ac0
     maddu   $ac0, s2, t5
     maddu   $ac0, s1, t4
     maddu   $ac0, s0, t3
     extr.w  t6, $ac0, 16
     sb      t6, 0(t1)
     addiu   t1, 1
     bne     t1, t9, 3b
      nop

 4:
     bgtz    s6, 0b                 // more rows?
      addiu  a1, 4                  // (delay slot) ++input_buf

     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     j ra
      nop
 END(jsimd_\colorid\()_gray_convert_mips_dspr2)

 // Drop the helper so the next invocation can redefine it with new offsets.
 .purgem DO_RGB_TO_GRAY

 .endm

 /*------------------------------------------id --  pix R  G  B */
 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
 /*****************************************************************************/
 /*
  * jsimd_h2v2_merged_upsample_mips_dspr2
  * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
  * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
  * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
  * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
  * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
  * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
  *
  * Merged h2v2 upsample routines
  */
 /*
  * Generator macro: emits one function
  * jsimd_h2v2_<colorid>_merged_upsample_mips_dspr2 per invocation.
  *   colorid    - name fragment spliced into the function symbol
  *   pixel_size - bytes written per PAIR of output pixels (6 or 8)
  *   r1/g1/b1/a1_offs - byte offsets of the first pixel of the pair
  *   r2/g2/b2/a2_offs - byte offsets of the second pixel of the pair
  * The a1/a2 offsets are used only when pixel_size == 8.
  */
 .macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
                                                 pixel_size, \
                                                 r1_offs,    \
                                                 g1_offs,    \
                                                 b1_offs,    \
                                                 a1_offs,    \
                                                 r2_offs,    \
                                                 g2_offs,    \
                                                 b2_offs,    \
                                                 a2_offs

 /*
  * Store two pixels at \outptr (scratch0-2 = first pixel R, G, B;
  * scratch3-5 = second pixel R, G, B) and advance \outptr by pixel_size.
  * With pixel_size == 8, \scratch0 is overwritten with 0xFF and stored at
  * both filler offsets.
  */
 .macro STORE_H2V2_2_PIXELS  scratch0 \
                             scratch1 \
                             scratch2 \
                             scratch3 \
                             scratch4 \
                             scratch5 \
                             outptr
     sb       \scratch0, \r1_offs(\outptr)
     sb       \scratch1, \g1_offs(\outptr)
     sb       \scratch2, \b1_offs(\outptr)
     sb       \scratch3, \r2_offs(\outptr)
     sb       \scratch4, \g2_offs(\outptr)
     sb       \scratch5, \b2_offs(\outptr)
 .if (\pixel_size == 8)
     li       \scratch0, 0xFF
     sb       \scratch0, \a1_offs(\outptr)
     sb       \scratch0, \a2_offs(\outptr)
 .endif
     addiu    \outptr, \pixel_size
 .endm

 /*
  * Store a single pixel at \outptr using the first-pixel offsets.  \outptr
  * is NOT advanced.  With pixel_size == 8, t0 is overwritten with 0xFF and
  * stored at a1_offs.
  */
 .macro STORE_H2V2_1_PIXEL  scratch0 \
                            scratch1 \
                            scratch2 \
                            outptr
     sb    \scratch0, \r1_offs(\outptr)
     sb    \scratch1, \g1_offs(\outptr)
     sb    \scratch2, \b1_offs(\outptr)

 .if (\pixel_size == 8)
     li    t0, 0xFF
     sb    t0, \a1_offs(\outptr)
 .endif
 .endm

 LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
 /*
  * a0     - cinfo->output_width
  * a1     - input_buf
  * a2     - in_row_group_ctr
  * a3     - output_buf
  * 16(sp) - cinfo->sample_range_limit
  *
  * Produces two output rows from two luma rows and one cb/cr row pair: each
  * cb/cr sample is shared by a 2x2 block of luma samples.  Colour values are
  * read from the sample_range_limit table, indexed by y + offset.
  *
  * Register usage in the loops:
  *   t1, t2 - luma row pointers        t5, t6 - cb, cr row pointers
  *   t4, t7 - output row pointers      t9     - range limit table base
  *   s4, s5, s6 - red, green, blue offsets of the current chroma sample
  *   AT and ra are used as temporaries (ra is in the save list).
  */

     SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

     // The stack argument is read 40 bytes above its documented entry
     // offset, matching the 40-byte frame requested above.
     lw           t9, 56(sp)        // cinfo->sample_range_limit
     lw           v0, 0(a1)         // v0 = input_buf[0]
     lw           v1, 4(a1)         // v1 = input_buf[1]
     lw           t0, 8(a1)         // t0 = input_buf[2]
     sll          t1, a2, 3         // t1 = byte offset of entry in_row_group_ctr*2
     addiu        t2, t1, 4         // t2 = byte offset of the following entry
     sll          t3, a2, 2         // t3 = byte offset of entry in_row_group_ctr
     lw           t4, 0(a3)         // t4 = output_buf[0]
     lwx          t1, t1(v0)        // t1 = input_buf[0][in_row_group_ctr*2]
     lwx          t2, t2(v0)        // t2 = input_buf[0][in_row_group_ctr*2 + 1]
     lwx          t5, t3(v1)        // t5 = input_buf[1][in_row_group_ctr]
     lwx          t6, t3(t0)        // t6 = input_buf[2][in_row_group_ctr]
     lw           t7, 4(a3)         // t7 = output_buf[1]
     // NOTE(review): the addiu/xori immediates below exceed the signed
     // 16-bit range; the commented results presumably rely on the assembler
     // emitting the raw 16-bit field - confirm with the toolchain in use.
     li           s1, 0xe6ea
     addiu        t8, s1, 0x7fff    // t8 = 0x166e9 [FIX(1.40200)]
     addiu        s0, t8, 0x5eb9    // s0 = 0x1c5a2 [FIX(1.77200)]
     addiu        s1, zero, 0xa7e6  // s1 = 0xffffa7e6 [-FIX(0.34414)]
     xori         s2, s1, 0xeec8    // s2 = 0xffff492e [-FIX(0.71414)]
     srl          t3, a0, 1         // t3 = output_width / 2 (chroma samples)
     blez         t3, 2f            // width < 2: only the odd-column tail
      addu        t0, t5, t3        // t0 = end address
  1:                                // ---- one chroma sample, 2x2 pixels ----
     lbu          t3, 0(t5)         // cb
     lbu          s3, 0(t6)         // cr
     addiu        t5, t5, 1
     addiu        t3, t3, -128      // (cb - 128)
     addiu        s3, s3, -128      // (cr - 128)
     mult         $ac1, s1, t3      // ac1  = s1 * (cb - 128)
     madd         $ac1, s2, s3      // ac1 += s2 * (cr - 128)
     sll          s3, s3, 15        // pre-scale for the fractional multiply
     sll          t3, t3, 15        // pre-scale for the fractional multiply
     mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
     extr_r.w     s5, $ac1, 16      // s5 = green offset (rounded ac1 >> 16)
     mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
     lbu          v0, 0(t1)         // y: first luma row, even column
     addiu        t6, t6, 1
     addiu        t1, t1, 2
     addu         t3, v0, s4        // y+cred
     addu         s3, v0, s5        // y+cgreen
     addu         v1, v0, s6        // y+cblue
     addu         t3, t9, t3        // table address for y+cred
     addu         s3, t9, s3        // table address for y+cgreen
     addu         v1, t9, v1        // table address for y+cblue
     lbu          AT, 0(t3)         // red
     lbu          s7, 0(s3)         // green
     lbu          ra, 0(v1)         // blue
     lbu          v0, -1(t1)        // y: first luma row, odd column
     addu         t3, v0, s4        // y+cred
     addu         s3, v0, s5        // y+cgreen
     addu         v1, v0, s6        // y+cblue
     addu         t3, t9, t3        // table address for y+cred
     addu         s3, t9, s3        // table address for y+cgreen
     addu         v1, t9, v1        // table address for y+cblue
     lbu          t3, 0(t3)         // red
     lbu          s3, 0(s3)         // green
     lbu          v1, 0(v1)         // blue
     lbu          v0, 0(t2)         // y: second luma row, even column

     STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4

     addu         t3, v0, s4        // y+cred
     addu         s3, v0, s5        // y+cgreen
     addu         v1, v0, s6        // y+cblue
     addu         t3, t9, t3        // table address for y+cred
     addu         s3, t9, s3        // table address for y+cgreen
     addu         v1, t9, v1        // table address for y+cblue
     lbu          AT, 0(t3)         // red
     lbu          s7, 0(s3)         // green
     lbu          ra, 0(v1)         // blue
     lbu          v0, 1(t2)         // y: second luma row, odd column
     addiu        t2, t2, 2
     addu         t3, v0, s4        // y+cred
     addu         s3, v0, s5        // y+cgreen
     addu         v1, v0, s6        // y+cblue
     addu         t3, t9, t3        // table address for y+cred
     addu         s3, t9, s3        // table address for y+cgreen
     addu         v1, t9, v1        // table address for y+cblue
     lbu          t3, 0(t3)         // red
     lbu          s3, 0(s3)         // green
     lbu          v1, 0(v1)         // blue

     STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7

     bne          t0, t5, 1b        // until the cb pointer reaches the end
      nop
 2:                                 // ---- odd width: one last column ----
     andi         t0, a0, 1
     beqz         t0, 4f            // even width: done
      lbu          t3, 0(t5)        // (delay slot, executed either way) cb
     lbu          s3, 0(t6)         // cr
     addiu        t3, t3, -128      // (cb - 128)
     addiu        s3, s3, -128      // (cr - 128)
     mult         $ac1, s1, t3      // ac1  = s1 * (cb - 128)
     madd         $ac1, s2, s3      // ac1 += s2 * (cr - 128)
     sll          s3, s3, 15
     sll          t3, t3, 15
     lbu          v0, 0(t1)         // y: first luma row
     extr_r.w     s5, $ac1, 16      // s5 = green offset (rounded ac1 >> 16)
     mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
     mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
     addu         t3, v0, s4        // y+cred
     addu         s3, v0, s5        // y+cgreen
     addu         v1, v0, s6        // y+cblue
     addu         t3, t9, t3        // table address for y+cred
     addu         s3, t9, s3        // table address for y+cgreen
     addu         v1, t9, v1        // table address for y+cblue
     lbu          t3, 0(t3)         // red
     lbu          s3, 0(s3)         // green
     lbu          v1, 0(v1)         // blue
     lbu          v0, 0(t2)         // y: second luma row

     STORE_H2V2_1_PIXEL t3, s3, v1, t4

     addu         t3, v0, s4        // y+cred
     addu         s3, v0, s5        // y+cgreen
     addu         v1, v0, s6        // y+cblue
     addu         t3, t9, t3        // table address for y+cred
     addu         s3, t9, s3        // table address for y+cgreen
     addu         v1, t9, v1        // table address for y+cblue
     lbu          t3, 0(t3)         // red
     lbu          s3, 0(s3)         // green
     lbu          v1, 0(v1)         // blue

     STORE_H2V2_1_PIXEL t3, s3, v1, t7
 4:
     RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

     j           ra
      nop

 END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)

 // Drop the helpers so the next invocation can redefine them.
 .purgem STORE_H2V2_1_PIXEL
 .purgem STORE_H2V2_2_PIXELS
 .endm

 /*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
 /*****************************************************************************/
 /*
  * jsimd_h2v1_merged_upsample_mips_dspr2
  * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
  * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
  * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
  * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
  * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
  * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
  *
  * Merged h2v1 upsample routines
  */

 /*
  * Generator macro: emits one function
  * jsimd_h2v1_<colorid>_merged_upsample_mips_dspr2 per invocation.
  *   colorid    - name fragment spliced into the function symbol
  *   pixel_size - bytes written per PAIR of output pixels (6 or 8)
  *   r1/g1/b1/a1_offs - byte offsets of the first pixel of the pair
  *   r2/g2/b2/a2_offs - byte offsets of the second pixel of the pair
  * The a1/a2 offsets are used only when pixel_size == 8.
  */
 .macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
                                                 pixel_size, \
                                                 r1_offs,    \
                                                 g1_offs,    \
                                                 b1_offs,    \
                                                 a1_offs,    \
                                                 r2_offs,    \
                                                 g2_offs,    \
                                                 b2_offs,    \
                                                 a2_offs

 /*
  * Store two pixels at \outptr (scratch0-2 = first pixel R, G, B;
  * scratch3-5 = second pixel R, G, B) and advance \outptr by pixel_size.
  * With pixel_size == 8, t0 is overwritten with 0xFF and stored at both
  * filler offsets.
  */
 .macro STORE_H2V1_2_PIXELS  scratch0 \
                             scratch1 \
                             scratch2 \
                             scratch3 \
                             scratch4 \
                             scratch5 \
                             outptr
     sb       \scratch0, \r1_offs(\outptr)
     sb       \scratch1, \g1_offs(\outptr)
     sb       \scratch2, \b1_offs(\outptr)
     sb       \scratch3, \r2_offs(\outptr)
     sb       \scratch4, \g2_offs(\outptr)
     sb       \scratch5, \b2_offs(\outptr)
 .if (\pixel_size == 8)
     li       t0, 0xFF
     sb       t0, \a1_offs(\outptr)
     sb       t0, \a2_offs(\outptr)
 .endif
     addiu    \outptr, \pixel_size
 .endm

 /*
  * Store a single pixel at \outptr using the first-pixel offsets.  \outptr
  * is NOT advanced.  With pixel_size == 8, t0 is overwritten with 0xFF and
  * stored at a1_offs.
  */
 .macro STORE_H2V1_1_PIXEL  scratch0 \
                            scratch1 \
                            scratch2 \
                            outptr
     sb    \scratch0, \r1_offs(\outptr)
     sb    \scratch1, \g1_offs(\outptr)
     sb    \scratch2, \b1_offs(\outptr)
 .if (\pixel_size == 8)
     li    t0, 0xFF
     sb    t0, \a1_offs(\outptr)
 .endif
 .endm

 LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
 /*
  * a0     - cinfo->output_width
  * a1     - input_buf
  * a2     - in_row_group_ctr
  * a3     - output_buf
  * 16(sp) - range_limit
  *
  * Produces one output row from one luma row and one cb/cr row pair: each
  * cb/cr sample is shared by two horizontally adjacent luma samples.
  * Colour values are read from the range_limit table, indexed by y + offset.
  *
  * Register usage in the loops:
  *   s5 - luma pointer     s6, s7 - cb, cr pointers     t7 - output pointer
  *   t8 - range limit table base
  *   t0, t5, t6 - red, green, blue offsets of the current chroma sample
  *   ra is used as a temporary (it is in the save list).
  */

     SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

     // NOTE(review): several addiu/xori immediates below exceed the signed
     // 16-bit range; the commented results presumably rely on the assembler
     // emitting the raw 16-bit field - confirm with the toolchain in use.
     li           t0, 0xe6ea
     lw           t1, 0(a1)         // t1 = input_buf[0]
     lw           t2, 4(a1)         // t2 = input_buf[1]
     lw           t3, 8(a1)         // t3 = input_buf[2]
     // Read 40 bytes above the documented entry offset, matching the
     // 40-byte frame requested above.
     lw           t8, 56(sp)        // t8 = range_limit
     addiu        s1, t0, 0x7fff    // s1 = 0x166e9 [FIX(1.40200)]
     addiu        s2, s1, 0x5eb9    // s2 = 0x1c5a2 [FIX(1.77200)]
     addiu        s0, t0, 0x9916    // s0 = 0x8000
     addiu        s4, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
     xori         s3, s4, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
     srl          t0, a0, 1         // t0 = output_width / 2 (chroma samples)
     sll          t4, a2, 2         // t4 = in_row_group_ctr * 4
     lwx          s5, t4(t1)        // s5 = inptr0
     lwx          s6, t4(t2)        // s6 = inptr1
     lwx          s7, t4(t3)        // s7 = inptr2
     lw           t7, 0(a3)         // t7 = outptr
     blez         t0, 2f            // width < 2: only the odd-column tail
      addu        t9, s6, t0        // t9 = end address
 1:                                 // ---- one chroma sample, two pixels ----
     lbu          t2, 0(s6)         // t2 = cb
     lbu          t0, 0(s7)         // t0 = cr
     lbu          t1, 0(s5)         // t1 = y
     addiu        t2, t2, -128      // t2 = cb - 128
     addiu        t0, t0, -128      // t0 = cr - 128
     mult         $ac1, s4, t2      // ac1  = s4 * (cb - 128)
     madd         $ac1, s3, t0      // ac1 += s3 * (cr - 128)
     sll          t0, t0, 15        // pre-scale for the fractional multiply
     sll          t2, t2, 15        // pre-scale for the fractional multiply
     mulq_rs.w    t0, s1, t0        // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
     extr_r.w     t5, $ac1, 16      // t5 = green offset (rounded ac1 >> 16)
     mulq_rs.w    t6, s2, t2        // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
     addiu        s7, s7, 1
     addiu        s6, s6, 1
     addu         t2, t1, t0        // t2 = y + cred
     addu         t3, t1, t5        // t3 = y + cgreen
     addu         t4, t1, t6        // t4 = y + cblue
     addu         t2, t8, t2        // table address for y + cred
     addu         t3, t8, t3        // table address for y + cgreen
     addu         t4, t8, t4        // table address for y + cblue
     lbu          t1, 1(s5)         // t1 = y of the second pixel
     lbu          v0, 0(t2)         // first pixel red
     lbu          v1, 0(t3)         // first pixel green
     lbu          ra, 0(t4)         // first pixel blue
     addu         t2, t1, t0        // y + cred
     addu         t3, t1, t5        // y + cgreen
     addu         t4, t1, t6        // y + cblue
     addu         t2, t8, t2        // table address for y + cred
     addu         t3, t8, t3        // table address for y + cgreen
     addu         t4, t8, t4        // table address for y + cblue
     lbu          t2, 0(t2)         // second pixel red
     lbu          t3, 0(t3)         // second pixel green
     lbu          t4, 0(t4)         // second pixel blue

     STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7

     bne          t9, s6, 1b        // until the cb pointer reaches the end
      addiu       s5, s5, 2         // (delay slot) advance the luma pointer
 2:                                 // ---- odd width: one last pixel ----
     andi         t0, a0, 1
     beqz         t0, 4f            // even width: done
      nop
 3:
     lbu          t2, 0(s6)         // cb
     lbu          t0, 0(s7)         // cr
     lbu          t1, 0(s5)         // y
     addiu        t2, t2, -128      //(cb - 128)
     addiu        t0, t0, -128      //(cr - 128)
     // The green term is built here with plain multiplies plus the explicit
     // rounding constant in s0, instead of the accumulator used in loop 1.
     mul          t3, s4, t2
     mul          t4, s3, t0
     sll          t0, t0, 15
     sll          t2, t2, 15
     mulq_rs.w    t0, s1, t0       // (C1*cr + ONE_HALF)>> SCALEBITS
     mulq_rs.w    t6, s2, t2       // (C2*cb + ONE_HALF)>> SCALEBITS
     addu         t3, t3, s0
     addu         t3, t4, t3
     sra          t5, t3, 16       // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
     addu         t2, t1, t0       // y + cred
     addu         t3, t1, t5       // y + cgreen
     addu         t4, t1, t6       // y + cblue
     addu         t2, t8, t2       // table address for y + cred
     addu         t3, t8, t3       // table address for y + cgreen
     addu         t4, t8, t4       // table address for y + cblue
     lbu          t2, 0(t2)        // red
     lbu          t3, 0(t3)        // green
     lbu          t4, 0(t4)        // blue

     STORE_H2V1_1_PIXEL t2, t3, t4, t7
 4:
     RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

     j            ra
      nop

 END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)

 // Drop the helpers so the next invocation can redefine them.
 .purgem STORE_H2V1_1_PIXEL
 .purgem STORE_H2V1_2_PIXELS
 .endm

 /*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
 /*****************************************************************************/
 /*
  * jsimd_h2v2_fancy_upsample_mips_dspr2
  *
  * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
  */
 LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
 /*
  * a0     - cinfo->max_v_samp_factor
  * a1     - downsampled_width
  * a2     - input_data
  * a3     - output_data_ptr
  *
  * Each input row yields two output rows: the first blends it with the row
  * above (input_data[-1]), the second with the row below (input_data[+1]).
  * For every column, colsum = 3 * inptr0[col] + inptr1[col]; each column
  * then yields two output samples:
  *     (3 * thiscolsum + lastcolsum + 8) >> 4
  *     (3 * thiscolsum + nextcolsum + 7) >> 4
  * The first and last columns use 4 * thiscolsum in place of the missing
  * neighbour term.
  *
  * Register usage:
  *   s0 - inptr0 (current row)       s1 - inptr1 (neighbouring row)
  *   s2 - *output_data_ptr           s3 - outptr
  *   s4 - byte offset of the current output row pointer within s2
  *   s5 - downsampled_width - 2, then its low bit (single leftover column)
  *   t9 - passes left for this input row (2, then 1)
  *   t7 - lastcolsum                 t6 - thiscolsum
  *   t8 - inptr0 value at which the current column loop stops
  *
  * NOTE(review): downsampled_width < 2 is not handled (the first two columns
  * are always read) - confirm that callers guarantee this.
  * The halfword loads in loop 2 presumably assume little-endian byte order
  * and 2-byte aligned rows - confirm against the build configuration.
  */

     SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

     li             s4, 0
     lw             s2, 0(a3)       // s2 = *output_data_ptr
 0:                                 // ---- input row loop ----
     li             t9, 2           // two output rows per input row
     lw             s1, -4(a2)      // s1 = inptr1

 1:                                 // ---- output row loop ----
     lw             s0, 0(a2)       // s0 = inptr0
     lwx            s3, s4(s2)      // s3 = outptr for this output row
     addiu          s5, a1, -2      // s5 = downsampled_width - 2
     srl            t4, s5, 1
     sll            t4, t4, 1       // t4 = s5 rounded down to an even count
     lbu            t0, 0(s0)
     lbu            t1, 1(s0)
     lbu            t2, 0(s1)
     lbu            t3, 1(s1)
     addiu          s0, 2
     addiu          s1, 2
     addu           t8, s0, t4      // t8 = end address
     andi           s5, s5, 1       // s5 = residual
     sll            t4, t0, 1
     sll            t6, t1, 1
     addu           t0, t0, t4      // t0 = (*inptr0++) * 3
     addu           t1, t1, t6      // t1 = (*inptr0++) * 3
     addu           t7, t0, t2      // t7 = thiscolsum
     addu           t6, t1, t3      // t6 = nextcolsum
     sll            t0, t7, 2       // t0 = thiscolsum * 4
     subu           t1, t0, t7      // t1 = thiscolsum * 3
     shra_r.w       t0, t0, 4       // t0 = (thiscolsum * 4 + 8) >> 4
     addiu          t1, 7
     addu           t1, t1, t6
     srl            t1, t1, 4       // t1 = (thiscolsum * 3 + nextcolsum + 7) >> 4
     sb             t0, 0(s3)
     sb             t1, 1(s3)
     addiu          s3, 2
     // Loop 2 tests its exit condition only after consuming two columns.
     // For downsampled_width 2 or 3 no full pair remains (t8 == s0), so it
     // must be skipped or it reads and writes past both rows and never
     // terminates.
     beq            t8, s0, 5f
      nop
 2:                                 // ---- two columns per iteration ----
     lh             t0, 0(s0)       // t0 = A3|A2
     lh             t2, 0(s1)       // t2 = B3|B2
     addiu          s0, 2
     addiu          s1, 2
     preceu.ph.qbr  t0, t0          // t0 = 0|A3|0|A2
     preceu.ph.qbr  t2, t2          // t2 = 0|B3|0|B2
     shll.ph        t1, t0, 1
     sll            t3, t6, 1
     addu.ph        t0, t1, t0      // t0 = A3*3|A2*3
     addu           t3, t3, t6      // t3 = this * 3
     addu.ph        t0, t0, t2      // t0 = next2|next1
     addu           t1, t3, t7      // t1 = this*3 + last
     andi           t7, t0, 0xFFFF  // t7 = next1
     sll            t2, t7, 1
     addu           t2, t7, t2      // t2 = next1*3
     addu           t4, t2, t6      // t4 = next1*3 + this
     srl            t6, t0, 16      // t6 = next2
     shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
     addu           t0, t3, t7
     addiu          t0, 7
     srl            t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
     shra_r.w       t4, t4, 4       // t4 = (next1*3 + this + 8) >> 4
     addu           t2, t2, t6
     addiu          t2, 7
     srl            t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
     sb             t1, 0(s3)
     sb             t0, 1(s3)
     sb             t4, 2(s3)
     sb             t2, 3(s3)
     bne            t8, s0, 2b
      addiu         s3, 4
 5:
     beqz           s5, 4f          // no leftover column before the last one
      addu          t8, s0, s5      // (delay slot) t8 = end of residual
 3:                                 // ---- one column per iteration ----
     lbu            t0, 0(s0)
     lbu            t2, 0(s1)
     addiu          s0, 1
     addiu          s1, 1
     sll            t3, t6, 1
     sll            t1, t0, 1
     addu           t1, t0, t1      // t1 = inptr0 * 3
     addu           t3, t3, t6      // t3 = thiscolsum * 3
     addu           t5, t1, t2      // t5 = nextcolsum
     addu           t1, t3, t7
     shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
     addu           t0, t3, t5
     addiu          t0, 7
     srl            t0, t0, 4       // t0 = (this*3 + next + 7) >> 4
     sb             t1, 0(s3)
     sb             t0, 1(s3)
     addiu          s3, 2
     move           t7, t6          // lastcolsum = thiscolsum
     bne            t8, s0, 3b
      move          t6, t5          // (delay slot) thiscolsum = nextcolsum
 4:                                 // ---- last column ----
     sll            t0, t6, 2       // t0 = thiscolsum * 4
     subu           t1, t0, t6      // t1 = thiscolsum * 3
     addu           t1, t1, t7
     addiu          s4, 4           // next output row pointer
     shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
     addiu          t0, 7
     srl            t0, t0, 4       // t0 = (this*4 + 7) >> 4
     sb             t1, 0(s3)
     sb             t0, 1(s3)
     addiu          t9, -1
     addiu          s3, 2
     bnez           t9, 1b          // second pass uses the row below
      lw            s1, 4(a2)       // (delay slot) s1 = input_data[+1]
     srl            t0, s4, 2       // t0 = output rows produced so far
     subu           t0, a0, t0
     bgtz           t0, 0b          // until max_v_samp_factor rows are done
      addiu         a2, 4           // (delay slot) ++input_data

     RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

     j ra
      nop
 END(jsimd_h2v2_fancy_upsample_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
 /*
  * a0     - cinfo->max_v_samp_factor
  * a1     - downsampled_width
  * a2     - input_data
  * a3     - output_data_ptr
  *
  * Fancy 2:1 horizontal upsampling.  For each of the a0 rows, every input
  * sample produces two output samples:
  *     left  = (3 * cur + previous + 1) >> 2
  *     right = (3 * cur + next     + 2) >> 2
  * except that the very first output of a row is a plain copy of the first
  * input sample and the very last output is a plain copy of the last one.
  *
  * Register roles inside the row loop:
  *     s0 = end of the output row-pointer array, s1 = current position in it
  *     s2 = outptr, t7 = inptr
  *     t8 = a1 - 2 (number of interior columns), t9 = t8 / 4
  *     s3 = 0x00010001, the "+1" rounding term for both packed halfwords
  *
  * NOTE(review): inptr[1] is read unconditionally and the loop counts are
  * derived from a1 - 2 with an unsigned shift, so this presumably relies on
  * the caller passing downsampled_width >= 2 -- confirm against the C
  * dispatcher.
  */

     SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

     .set at

     beqz           a0, 3f          // no rows: nothing to do
      sll           t0, a0, 2       // (delay slot) t0 = rows * 4 bytes per row pointer
     lw             s1, 0(a3)       // s1 = output_data (array of row pointers)
     li             s3, 0x10001     // s3 = |1|1| rounding term for the "left" outputs
     addu           s0, s1, t0      // s0 = one past the last output row pointer
 0:
     /* Special case for the first column of the row. */
     addiu          t8, a1, -2      // t8 = interior column count
     srl            t9, t8, 2       // t9 = interior columns handled 4 at a time
     lw             t7, 0(a2)       // t7 = inptr = input_data[row]
     lw             s2, 0(s1)       // s2 = outptr = output_data[row]
     lbu            t0, 0(t7)   // t0 = inptr[0]
     lbu            t1, 1(t7)   // t1 = inptr[1]
     sll            t2, t0, 1
     addu           t2, t2, t0  // t2 = invalue*3
     addu           t2, t2, t1
     shra_r.w       t2, t2, 2   // t2 = (invalue*3 + inptr[1] + 2) >> 2
     sb             t0, 0(s2)   // first output is the unfiltered first sample
     sb             t2, 1(s2)
     beqz           t9, 11f     // fewer than 4 interior columns: scalar loop only
      addiu         s2, 2
 1:
     /*
      * Four interior columns (P1..P4) per iteration, two per packed register.
      * t7 still points at P0, the left neighbour of the first column handled.
      */
     ulw            t0, 0(t7)   // t0 = |P3|P2|P1|P0|
     ulw            t1, 1(t7)   // t1 = |P4|P3|P2|P1|
     ulh            t2, 4(t7)   // t2 = |0|0|P5|P4|
     preceu.ph.qbl  t3, t0      // t3 = |0|P3|0|P2|
     preceu.ph.qbr  t0, t0      // t0 = |0|P1|0|P0|
     preceu.ph.qbr  t2, t2      // t2 = |0|P5|0|P4|
     preceu.ph.qbl  t4, t1      // t4 = |0|P4|0|P3|
     preceu.ph.qbr  t1, t1      // t1 = |0|P2|0|P1|
     shll.ph        t5, t4, 1
     shll.ph        t6, t1, 1
     addu.ph        t5, t5, t4  // t5 = |P4*3|P3*3|
     addu.ph        t6, t6, t1  // t6 = |P2*3|P1*3|
     addu.ph        t4, t3, s3  // t4 = |P3+1|P2+1|
     addu.ph        t0, t0, s3  // t0 = |P1+1|P0+1|
     addu.ph        t4, t4, t5  // t4 = |P4*3+P3+1|P3*3+P2+1|
     addu.ph        t0, t0, t6  // t0 = |P2*3+P1+1|P1*3+P0+1|
     shrl.ph        t4, t4, 2   // t4 = left outputs of columns |P4|P3|
     shrl.ph        t0, t0, 2   // t0 = left outputs of columns |P2|P1|
     addu.ph        t2, t2, t5  // t2 = |P4*3+P5|P3*3+P4|
     addu.ph        t3, t3, t6  // t3 = |P2*3+P3|P1*3+P2|
     shra_r.ph      t2, t2, 2   // t2 = right outputs of columns |P4|P3| (+2 rounding)
     shra_r.ph      t3, t3, 2   // t3 = right outputs of columns |P2|P1| (+2 rounding)
     shll.ph        t2, t2, 8   // move right outputs into the upper byte of each halfword
     shll.ph        t3, t3, 8
     or             t2, t4, t2  // t2 = |right4|left4|right3|left3|
     or             t3, t3, t0  // t3 = |right2|left2|right1|left1|
     addiu          t9, -1
     usw            t3, 0(s2)   // outputs for P1, P2
     usw            t2, 4(s2)   // outputs for P3, P4
     addiu          s2, 8
     bgtz           t9, 1b
      addiu         t7, 4
 11:
     andi           t8, 3       // t8 = interior columns left for the scalar loop
     beqz           t8, 22f
      addiu         t7, 1       // (delay slot) t7 now points at the next unprocessed column

 2:
     /* One interior column per iteration. */
     lbu            t0, 0(t7)
     addiu          t7, 1
     sll            t1, t0, 1
     addu           t2, t0, t1  // t2 = invalue*3
     lbu            t3, -2(t7)  // t3 = previous sample (t7 was already advanced)
     lbu            t4, 0(t7)   // t4 = next sample
     addiu          t3, 1
     addiu          t4, 2
     addu           t3, t3, t2
     addu           t4, t4, t2
     srl            t3, 2       // t3 = (invalue*3 + previous + 1) >> 2
     srl            t4, 2       // t4 = (invalue*3 + next + 2) >> 2
     sb             t3, 0(s2)
     sb             t4, 1(s2)
     addiu          t8, -1
     bgtz           t8, 2b
      addiu         s2, 2

 22:
     /* Special case for the last column of the row. */
     lbu            t0, 0(t7)  // t0 = last sample
     lbu            t2, -1(t7) // t2 = its left neighbour
     sll            t1, t0, 1
     addu           t1, t1, t0 // t1 = invalue * 3
     addu           t1, t1, t2
     addiu          t1, 1
     srl            t1, t1, 2  // t1 = (invalue*3 + previous + 1) >> 2
     sb             t1, 0(s2)
     sb             t0, 1(s2)  // last output is the unfiltered last sample
     addiu          s1, 4      // next output row pointer
     bne            s1, s0, 0b
      addiu         a2, 4      // (delay slot) next input row pointer
 3:
     RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

     j              ra
      nop
 END(jsimd_h2v1_fancy_upsample_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
 /*
  * a0     - cinfo->image_width
  * a1     - cinfo->max_v_samp_factor (not read by this routine)
  * a2     - compptr->v_samp_factor
  * a3     - compptr->width_in_blocks
  * 16(sp) - input_data
  * 20(sp) - output_data
  *
  * 2:1 horizontal downsampling.  Each output sample is the average of two
  * adjacent input samples, with a rounding bias that alternates 0, 1, 0, 1...
  * along the row.  Output columns beyond image_width / 2 (up to
  * width_in_blocks * 8) are filled from the last input sample of the row,
  * which takes the place of a separate right-edge expansion pass.
  *
  * Row-invariant registers:
  *     s0/s1 = current input/output row-pointer slots
  *     s2    = number of 2-byte padding stores needed per row
  *     t9    = image_width & 2; nonzero means one extra padding byte
  * Per-row registers:
  *     t4 = outptr, t5 = inptr (reused as padding counter once inptr is dead)
  *     t6 = 0 for odd image_width, -1 for even (offset of the last sample)
  *     s3 = bias, s4 = 4-output group count, t8 = leftover outputs (0..3)
  */
     .set at

     SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4

     beqz        a2, 7f
      lw         s1, 44(sp)  // s1 = output_data
     lw          s0, 40(sp)  // s0 = input_data
     srl         s2, a0, 2
     andi        t9, a0, 2
     srl         t7, t9, 1
     addu        s2, t7, s2  // s2 = outputs already covered, in pairs (rounded up)
     sll         t0, a3, 3   // t0 = width_in_blocks*DCT
     srl         t7, t0, 1
     subu        s2, t7, s2  // s2 = padding pairs per row (must survive the row loop)
 0:
     andi        t6, a0, 1   // t6 = temp_index
     addiu       t6, -1
     lw          t4, 0(s1)   // t4 = outptr
     lw          t5, 0(s0)   // t5 = inptr0
     li          s3, 0       // s3 = bias
     srl         t7, a0, 1   // t7 = image_width1
     srl         s4, t7, 2   // s4 = number of 4-output groups
     beqz        s4, 8f      // fewer than 4 real outputs: do not enter the unrolled loop
      andi       t8, t7, 3   // (delay slot) t8 = leftover real outputs
 1:
     /* Four outputs per iteration; shra = bias 0, shra_r = bias 1. */
     ulhu        t0, 0(t5)
     ulhu        t1, 2(t5)
     ulhu        t2, 4(t5)
     ulhu        t3, 6(t5)
     raddu.w.qb  t0, t0      // sum of the two bytes of each pair
     raddu.w.qb  t1, t1
     raddu.w.qb  t2, t2
     raddu.w.qb  t3, t3
     shra.ph     t0, t0, 1
     shra_r.ph   t1, t1, 1
     shra.ph     t2, t2, 1
     shra_r.ph   t3, t3, 1
     sb          t0, 0(t4)
     sb          t1, 1(t4)
     sb          t2, 2(t4)
     sb          t3, 3(t4)
     addiu       s4, -1
     addiu       t4, 4
     bgtz        s4, 1b
      addiu      t5, 8
 8:
     beqz        t8, 3f
      addu       s4, t4, t8  // (delay slot) s4 = end address of the leftover outputs
 2:
     /* Leftover real outputs, one per iteration. */
     ulhu        t0, 0(t5)
     raddu.w.qb  t0, t0
     addqh.w     t0, t0, s3  // (sum + bias) >> 1
     xori        s3, s3, 1
     sb          t0, 0(t4)
     addiu       t4, 1
     bne         t4, s4, 2b
      addiu      t5, 2
 3:
     /* Right-edge padding, built from the last real input sample. */
     lbux        t1, t6(t5)  // t1 = last input sample of the row
     sll         t1, 1
     addqh.w     t2, t1, s3  // t2 = pixval1
     xori        s3, s3, 1
     addqh.w     t3, t1, s3  // t3 = pixval2
     blez        s2, 5f
      append     t3, t2,  8  // (delay slot) t3 = pixval2:pixval1 as one halfword
     move        t5, s2      // count down a copy so s2 stays valid for the next row
 4:
     ush         t3, 0(t4)
     addiu       t5, -1
     bgtz        t5, 4b
      addiu      t4,  2
 5:
     beqz        t9, 6f
      nop
     sb          t2, 0(t4)   // odd number of padding bytes: one more
 6:
     addiu       s1, 4
     addiu       a2, -1
     bnez        a2, 0b
      addiu      s0, 4
 7:
     RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4

     j           ra
      nop
 END(jsimd_h2v1_downsample_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)

 /*
  * a0     - cinfo->image_width
  * a1     - cinfo->max_v_samp_factor (not read by this routine)
  * a2     - compptr->v_samp_factor
  * a3     - compptr->width_in_blocks
  * 16(sp) - input_data
  * 20(sp) - output_data
  *
  * 2:1 horizontal and 2:1 vertical downsampling.  Each output sample is the
  * average of a 2x2 block of input samples, with a rounding bias that
  * alternates 1, 2, 1, 2... along the row.  Output columns beyond
  * image_width / 2 (up to width_in_blocks * 8) are filled from the last input
  * sample of each of the two source rows.
  *
  * Row-invariant registers (never modified inside the row loop):
  *     s0/s1 = current input/output row-pointer slots
  *     s4    = number of 4-output groups, t8 = leftover outputs (0..3)
  *     s2    = number of 2-byte padding stores, t9 = image_width & 2
  *     t6    = 0 for odd image_width, -1 for even (offset of the last sample)
  * Per-row registers:
  *     t4 = outptr, t5 = inptr0, s7 = inptr1, s6 = bias
  *     s3 = working copy of s4, s5 = end address of the leftover outputs
  *     t3 = working copy of s2
  */
     .set at
     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     beqz         a2, 8f
      lw          s1, 52(sp)      // s1 = output_data
     lw           s0, 48(sp)      // s0 = input_data

     andi         t6, a0, 1       // t6 = temp_index
     addiu        t6, -1
     srl          t7, a0, 1       // t7 = image_width1
     srl          s4, t7, 2       // s4 = number of 4-output groups per row
     andi         t8, t7, 3       // t8 = leftover real outputs per row
     andi         t9, a0, 2
     srl          s2, a0, 2
     srl          t7, t9, 1
     addu         s2, t7, s2
     sll          t0, a3, 3       // t0 = width_in_blocks*DCT
     srl          t7, t0, 1
     subu         s2, t7, s2      // s2 = padding pairs per row
 0:
     lw           t4, 0(s1)       // t4 = outptr
     lw           t5, 0(s0)       // t5 = inptr0
     lw           s7, 4(s0)       // s7 = inptr1
     li           s6, 1           // s6 = bias
     beqz         s4, 9f          // fewer than 4 real outputs: skip the unrolled loop
      move        s3, s4          // (delay slot) s3 = per-row copy of the group count
 2:
     /* Four outputs per iteration; "+1, srl" = bias 1, shra_r = bias 2. */
     ulw          t0, 0(t5)       // t0 = |P3|P2|P1|P0|
     ulw          t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
     ulw          t2, 4(t5)
     ulw          t3, 4(s7)
     precrq.ph.w  t7, t0, t1      // t7 = |P3|P2|Q3|Q2|
     ins          t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
     raddu.w.qb   t1, t7          // t1 = P2 + P3 + Q2 + Q3
     raddu.w.qb   t0, t0          // t0 = P0 + P1 + Q0 + Q1
     shra_r.w     t1, t1, 2
     addiu        t0, 1
     srl          t0, 2
     precrq.ph.w  t7, t2, t3
     ins          t2, t3, 16, 16
     raddu.w.qb   t7, t7
     raddu.w.qb   t2, t2
     shra_r.w     t7, t7, 2
     addiu        t2, 1
     srl          t2, 2
     sb           t0, 0(t4)
     sb           t1, 1(t4)
     sb           t2, 2(t4)
     sb           t7, 3(t4)
     addiu        t4, 4
     addiu        t5, 8
     addiu        s3, s3, -1
     bgtz         s3, 2b
      addiu       s7, 8
 9:
     beqz         t8, 4f
      addu        s5, t4, t8      // (delay slot) s5 = end address of leftover outputs
 3:
     /* Leftover real outputs, one per iteration. */
     ulhu         t0, 0(t5)
     ulhu         t1, 0(s7)
     ins          t0, t1, 16, 16
     raddu.w.qb   t0, t0
     addu         t0, t0, s6
     srl          t0, 2
     xori         s6, s6, 3       // toggle bias between 1 and 2
     sb           t0, 0(t4)
     addiu        t5, 2
     addiu        t4, 1
     bne          s5, t4, 3b
      addiu       s7, 2
 4:
     /* Right-edge padding, built from the last real sample of both rows. */
     lbux         t1, t6(t5)
     sll          t1, 1
     lbux         t0, t6(s7)
     sll          t0, 1
     addu         t1, t1, t0      // t1 = 2 * (last0 + last1)
     addu         t3, t1, s6
     srl          t0, t3, 2       // t0 = pixval1
     xori         s6, s6, 3
     addu         t2, t1, s6
     srl          t1, t2, 2       // t1 = pixval2
     blez         s2, 6f
      append      t1, t0, 8       // (delay slot) t1 = pixval2:pixval1 as one halfword
     move         t3, s2          // count down a copy so s2 stays valid for the next row
 5:
     ush          t1, 0(t4)
     addiu        t3, -1
     bgtz         t3, 5b
      addiu       t4, 2
 6:
     beqz         t9, 7f
      nop
     sb           t0, 0(t4)       // odd number of padding bytes: one more
 7:
     addiu        s1, 4           // next output row
     addiu        a2, -1
     bnez         a2, 0b
      addiu       s0, 8           // (delay slot) skip the two input rows just consumed
 8:
     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     j            ra
      nop
 END(jsimd_h2v2_downsample_mips_dspr2)
 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
 /*
  * a0     - input_data
  * a1     - output_data
  * a2     - compptr->v_samp_factor
  * a3     - cinfo->max_v_samp_factor
  * 16(sp) - cinfo->smoothing_factor
  * 20(sp) - compptr->width_in_blocks
  * 24(sp) - cinfo->image_width
  *
  * 2:1 horizontal and vertical downsampling with smoothing.  Every output
  * sample is
  *     (membersum * memberscale + neighsum * neighscale + 32768) >> 16
  * where membersum is the sum of the 2x2 input block, neighsum is the sum of
  * the 8 edge-adjacent samples counted twice plus the 4 corner samples counted
  * once, memberscale = 16384 - 80 * smoothing_factor (t6) and
  * neighscale = 16 * smoothing_factor (t7).
  *
  * Before filtering, rows input_data[-1] .. input_data[max_v_samp_factor]
  * are extended on the right from image_width to 2 * output_cols by
  * replicating their last sample.
  *
  * Per output row: t8 = outptr, s2/t9 = the two member rows,
  * s0 = row above, s1 = row below, t4 = outrow, t5 = inrow.
  *
  * NOTE(review): loops 4 and 5 are do-while loops whose trip counts come from
  * output_cols - 2; they presumably rely on output_cols being a nonzero
  * multiple of 8 and on v_samp_factor >= 1 -- confirm against the caller.
  */

     .set at

     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     lw          s7, 52(sp)      // compptr->width_in_blocks
     lw          s0, 56(sp)      // cinfo->image_width
     lw          s6, 48(sp)      // cinfo->smoothing_factor
     sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
     sll         v0, s7, 1
     subu        v0, v0, s0      // v0 = columns to add on the right edge
     blez        v0, 2f
      move       v1, zero        // (delay slot) v1 = row counter
     addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
 0:
     /* Right-edge expansion of row input_data[v1 - 1]. */
     addiu       t1, a0, -4
     sll         t2, v1, 2
     lwx         t1, t2(t1)      // t1 = input_data[v1 - 1]
     move        t3, v0
     addu        t1, t1, s0      // t1 = first column past image_width
     lbu         t2, -1(t1)      // t2 = last real sample of the row
 1:
     addiu       t3, t3, -1
     sb          t2, 0(t1)
     bgtz        t3, 1b
      addiu      t1, t1, 1
     addiu       v1, v1, 1
     bne         v1, t0, 0b
      nop
 2:
     li          v0, 80
     mul         v0, s6, v0
     li          v1, 16384
     move        t4, zero        // t4 = outrow
     move        t5, zero        // t5 = inrow
     subu        t6, v1, v0      // t6 = 16384 - tmp_smoot_f * 80
     sll         t7, s6, 4       // t7 = tmp_smoot_f * 16
 3:
 /* Special case for first column: pretend column -1 is same as column 0 */
     sll         v0, t4, 2
     lwx         t8, v0(a1)      //  outptr = output_data[outrow]
     sll         v1, t5, 2
     addiu       t9, v1, 4
     addiu       s0, v1, -4
     addiu       s1, v1, 8
     lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
     lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
     lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
     lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
     lh          v0, 0(s2)
     lh          v1, 0(t9)
     lh          t0, 0(s0)
     lh          t1, 0(s1)
     ins         v0, v1, 16, 16
     ins         t0, t1, 16, 16
     raddu.w.qb  t2, v0          // t2 = membersum
     raddu.w.qb  s3, t0          // s3 = above[0..1] + below[0..1]
     lbu         v0, 0(s2)
     lbu         v1, 2(s2)
     lbu         t0, 0(t9)
     lbu         t1, 2(t9)
     addu        v0, v0, v1
     mult        $ac1,t2, t6     // ac1 = membersum * memberscale
     addu        t0, t0, t1
     lbu         t2, 2(s0)
     addu        t0, t0, v0
     lbu         t3, 2(s1)
     addu        s3, t0, s3
     lbu         v0, 0(s0)
     lbu         t0, 0(s1)
     sll         s3, s3, 1       // edge neighbours count twice
     addu        v0, v0, t2
     addu        t0, t0, t3
     addu        t0, t0, v0
     addu        s3, t0, s3      // s3 = neighsum (corners count once)
     madd        $ac1,s3, t7     // ac1 += neighsum * neighscale
     extr_r.w    v0, $ac1, 16    // rounded >> 16
     addiu       t8, t8, 1
     addiu       s2, s2, 2
     addiu       t9, t9, 2
     addiu       s0, s0, 2
     addiu       s1, s1, 2
     sb          v0, -1(t8)
     addiu       s4, s7, -2
     and         s4, s4, 3       // s4 = (output_cols - 2) & 3 columns for loop 4
     addu        s5, s4, t8      // end address
 4:
     /* Interior columns, one per iteration, until the rest is a multiple of 4. */
     lh          v0, 0(s2)
     lh          v1, 0(t9)
     lh          t0, 0(s0)
     lh          t1, 0(s1)
     ins         v0, v1, 16, 16
     ins         t0, t1, 16, 16
     raddu.w.qb  t2, v0
     raddu.w.qb  s3, t0
     lbu         v0, -1(s2)
     lbu         v1, 2(s2)
     lbu         t0, -1(t9)
     lbu         t1, 2(t9)
     addu        v0, v0, v1
     mult        $ac1, t2, t6
     addu        t0, t0, t1
     lbu         t2, 2(s0)
     addu        t0, t0, v0
     lbu         t3, 2(s1)
     addu        s3, t0, s3
     lbu         v0, -1(s0)
     lbu         t0, -1(s1)
     sll         s3, s3, 1
     addu        v0, v0, t2
     addu        t0, t0, t3
     addu        t0, t0, v0
     addu        s3, t0, s3
     madd        $ac1, s3, t7
     extr_r.w    t2, $ac1, 16
     addiu       t8, t8, 1
     addiu       s2, s2, 2
     addiu       t9, t9, 2
     addiu       s0, s0, 2
     sb          t2, -1(t8)
     bne         s5, t8, 4b
      addiu      s1, s1, 2
     addiu       s5, s7, -2
     subu        s5, s5, s4
     addu        s5, s5, t8      // end address
 5:
     /* Interior columns, four per iteration (input offsets 0, 2, 4, 6). */
     lh          v0, 0(s2)
     lh          v1, 0(t9)
     lh          t0, 0(s0)
     lh          t1, 0(s1)
     ins         v0, v1, 16, 16
     ins         t0, t1, 16, 16
     raddu.w.qb  t2, v0
     raddu.w.qb  s3, t0
     lbu         v0, -1(s2)
     lbu         v1, 2(s2)
     lbu         t0, -1(t9)
     lbu         t1, 2(t9)
     addu        v0, v0, v1
     mult        $ac1, t2, t6
     addu        t0, t0, t1
     lbu         t2, 2(s0)
     addu        t0, t0, v0
     lbu         t3, 2(s1)
     addu        s3, t0, s3
     lbu         v0, -1(s0)
     lbu         t0, -1(s1)
     sll         s3, s3, 1
     addu        v0, v0, t2
     addu        t0, t0, t3
     lh          v1, 2(t9)
     addu        t0, t0, v0
     lh          v0, 2(s2)
     addu        s3, t0, s3
     lh          t0, 2(s0)
     lh          t1, 2(s1)
     madd        $ac1, s3, t7
     extr_r.w    t2, $ac1, 16
     ins         t0, t1, 16, 16
     ins         v0, v1, 16, 16
     raddu.w.qb  s3, t0
     lbu         v1, 4(s2)
     lbu         t0, 1(t9)
     lbu         t1, 4(t9)
     sb          t2, 0(t8)       // first of the four outputs
     raddu.w.qb  t3, v0
     lbu         v0, 1(s2)
     addu        t0, t0, t1
     mult        $ac1, t3, t6
     addu        v0, v0, v1
     lbu         t2, 4(s0)
     addu        t0, t0, v0
     lbu         v0, 1(s0)
     addu        s3, t0, s3
     lbu         t0, 1(s1)
     lbu         t3, 4(s1)
     addu        v0, v0, t2
     sll         s3, s3, 1
     addu        t0, t0, t3
     lh          v1, 4(t9)
     addu        t0, t0, v0
     lh          v0, 4(s2)
     addu        s3, t0, s3
     lh          t0, 4(s0)
     lh          t1, 4(s1)
     madd        $ac1, s3, t7
     extr_r.w    t2, $ac1, 16
     ins         t0, t1, 16, 16
     ins         v0, v1, 16, 16
     raddu.w.qb  s3, t0
     lbu         v1, 6(s2)
     lbu         t0, 3(t9)
     lbu         t1, 6(t9)
     sb          t2, 1(t8)       // second output
     raddu.w.qb  t3, v0
     lbu         v0, 3(s2)
     addu        t0, t0,t1
     mult        $ac1, t3, t6
     addu        v0, v0, v1
     lbu         t2, 6(s0)
     addu        t0, t0, v0
     lbu         v0, 3(s0)
     addu        s3, t0, s3
     lbu         t0, 3(s1)
     lbu         t3, 6(s1)
     addu        v0, v0, t2
     sll         s3, s3, 1
     addu        t0, t0, t3
     lh          v1, 6(t9)
     addu        t0, t0, v0
     lh          v0, 6(s2)
     addu        s3, t0, s3
     lh          t0, 6(s0)
     lh          t1, 6(s1)
     madd        $ac1, s3, t7
     extr_r.w    t3, $ac1, 16
     ins         t0, t1, 16, 16
     ins         v0, v1, 16, 16
     raddu.w.qb  s3, t0
     lbu         v1, 8(s2)
     lbu         t0, 5(t9)
     lbu         t1, 8(t9)
     sb          t3, 2(t8)       // third output
     raddu.w.qb  t2, v0
     lbu         v0, 5(s2)
     addu        t0, t0, t1
     mult        $ac1, t2, t6
     addu        v0, v0, v1
     lbu         t2, 8(s0)
     addu        t0, t0, v0
     lbu         v0, 5(s0)
     addu        s3, t0, s3
     lbu         t0, 5(s1)
     lbu         t3, 8(s1)
     addu        v0, v0, t2
     sll         s3, s3, 1
     addu        t0, t0, t3
     addiu       t8, t8, 4
     addu        t0, t0, v0
     addiu       s2, s2, 8
     addu        s3, t0, s3
     addiu       t9, t9, 8
     madd        $ac1, s3, t7
     extr_r.w    t1, $ac1, 16
     addiu       s0, s0, 8
     addiu       s1, s1, 8
     bne         s5, t8, 5b
      sb         t1, -1(t8)      // (delay slot) fourth output
 /* Special case for last column: pretend column +1 is same as the last one */
     lh          v0, 0(s2)
     lh          v1, 0(t9)
     lh          t0, 0(s0)
     lh          t1, 0(s1)
     ins         v0, v1, 16, 16
     ins         t0, t1, 16, 16
     raddu.w.qb  t2, v0
     raddu.w.qb  s3, t0
     lbu         v0, -1(s2)
     lbu         v1, 1(s2)
     lbu         t0, -1(t9)
     lbu         t1, 1(t9)
     addu        v0, v0, v1
     mult        $ac1, t2, t6
     addu        t0, t0, t1
     lbu         t2, 1(s0)
     addu        t0, t0, v0
     lbu         t3, 1(s1)
     addu        s3, t0, s3
     lbu         v0, -1(s0)
     lbu         t0, -1(s1)
     sll         s3, s3, 1
     addu        v0, v0, t2
     addu        t0, t0, t3
     addu        t0, t0, v0
     addu        s3, t0, s3
     madd        $ac1, s3, t7
     extr_r.w    t0, $ac1, 16
     sb          t0, 0(t8)
     addiu       t4, t4, 1       // outrow++
     bne         t4, a2, 3b
      addiu      t5, t5, 2       // (delay slot) inrow += 2: each output row consumes
                                 // exactly two input rows

     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     j           ra
      nop

 END(jsimd_h2v2_smooth_downsample_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
 /*
  * a0     - upsample->h_expand[compptr->component_index]
  * a1     - upsample->v_expand[compptr->component_index]
  * a2     - input_data
  * a3     - output_data_ptr
  * 16(sp) - cinfo->output_width
  * 20(sp) - cinfo->max_v_samp_factor
  *
  * Integer-factor upsampling by pixel replication.  For every input row,
  * each input sample is written h_expand times to one output row, and that
  * output row is then duplicated into the following v_expand - 1 output rows.
  * Processing stops once max_v_samp_factor output rows have been produced.
  *
  * Register roles:
  *     s0 = address of the current slot in the output row-pointer array
  *          (advances by one pointer per output row produced)
  *     t6 = byte offset of the current slot in the input row-pointer array
  *     s3 = outrow (count of output rows produced so far)
  *     s1 = output_width, s2 = max_v_samp_factor
  *
  * NOTE(review): as in a straightforward C loop, h_expand == 0 or
  * v_expand == 0 would never make progress; callers are presumed to pass
  * factors >= 1.
  */
     .set at

     SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

     lw      s0, 0(a3)    // s0 = &output_data[0]
     lw      s1, 32(sp)   // s1 = cinfo->output_width
     lw      s2, 36(sp)   // s2 = cinfo->max_v_samp_factor
     li      t6, 0        // t6 = inrow * 4 (byte offset into input_data)
     beqz    s2, 10f
      li     s3, 0        // s3 = outrow
 0:
     addu    t0, a2, t6
     lw      t3, 0(t0)    // t3 = inptr = input_data[inrow]
     lw      t8, 0(s0)    // t8 = outptr = output_data[outrow]
     beqz    s1, 9f       // zero-width row: nothing to expand or duplicate
      addu   t5, t8, s1   // t5 = outend
 1:
     lbu     t2, 0(t3)    // t2 = invalue = *inptr++
     addiu   t3, 1
     beqz    a0, 3f
      move   t0, a0       // t0 = h_expand
 2:
     sb      t2, 0(t8)    // replicate invalue h_expand times
     addiu   t0, -1
     bgtz    t0, 2b
      addiu  t8, 1
 3:
     bgt     t5, t8, 1b
      nop
     addiu   t9, a1, -1   // t9 = v_expand - 1 rows still to duplicate
     blez    t9, 9f
      nop
 5:
     /* Copy the row at s0[0] into the row at s0[1]. */
     lw      t3, 0(s0)    // t3 = source row
     lw      t4, 4(s0)    // t4 = destination row
     subu    t0, s1, 0xF
     blez    t0, 7f       // fewer than 16 bytes: byte loop only
      addu   t5, t3, s1   // t5 = end address
     andi    t7, s1, 0xF  // t7 = residual
     subu    t8, t5, t7   // t8 = end of the 16-byte chunks
 6:
     ulw     t0, 0(t3)
     ulw     t1, 4(t3)
     ulw     t2, 8(t3)
     usw     t0, 0(t4)
     ulw     t0, 12(t3)
     usw     t1, 4(t4)
     usw     t2, 8(t4)
     usw     t0, 12(t4)
     addiu   t3, 16
     bne     t3, t8, 6b
      addiu  t4, 16
     beqz    t7, 8f
      nop
 7:
     lbu     t0, 0(t3)
     sb      t0, 0(t4)
     addiu   t3, 1
     bne     t3, t5, 7b
      addiu  t4, 1
 8:
     addiu   t9, -1
     bgtz    t9, 5b
      addiu  s0, 4        // (delay slot) the row just written is the next source
 9:
     addiu   s0, 4        // step past the last output row of this input row
     addu    s3, s3, a1   // outrow += v_expand
     blt     s3, s2, 0b   // while (outrow < max_v_samp_factor)
      addiu  t6, 4        // (delay slot) inrow++: advance one row pointer
 10:
     RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

     j       ra
      nop
 END(jsimd_int_upsample_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
 /*
  * a0     - cinfo->max_v_samp_factor
  * a1     - cinfo->output_width
  * a2     - input_data
  * a3     - output_data_ptr
  *
  * 2:1 horizontal upsampling by pixel doubling: every input sample is
  * written twice to the output row.  a0 rows are processed.  Within a row,
  * output is produced 16 bytes (8 input samples) at a time, and the remaining
  * output_width & 15 bytes are produced two at a time.
  *
  * The residual loop always stores a pair, so an odd residual writes one
  * byte past output_width.
  * NOTE(review): presumably the output rows are padded enough for that --
  * confirm against the buffer allocation.
  *
  * Only t registers are used, so nothing is saved on the stack.
  */
     lw      t7, 0(a3)       // t7 = output_data
     andi    t8, a1, 0xf     // t8 = residual
     sll     t0, a0, 2       // t0 = rows * 4 bytes per row pointer
     blez    a0, 4f
      addu   t9, t7, t0      // t9 = output_data end address
 0:
     lw      t5, 0(t7)       // t5 = outptr
     lw      t6, 0(a2)       // t6 = inptr
     addu    t3, t5, a1      // t3 = outptr + output_width (end address)
     subu    t3, t8          // t3 = end address - residual
     beq     t5, t3, 2f      // output_width < 16: residual loop only
      move   t4, t8          // (delay slot) t4 = residual byte counter
 1:
     /* 8 input bytes -> 16 output bytes. */
     ulw     t0, 0(t6)       // t0 = |P3|P2|P1|P0|
     ulw     t2, 4(t6)       // t2 = |P7|P6|P5|P4|
     srl     t1, t0, 16      // t1 = |X|X|P3|P2|
     ins     t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
     ins     t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
     ins     t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
     ins     t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
     usw     t0, 0(t5)
     usw     t1, 4(t5)
     srl     t0, t2, 16      // t0 = |X|X|P7|P6|
     ins     t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
     ins     t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
     ins     t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
     ins     t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
     usw     t2, 8(t5)
     usw     t0, 12(t5)
     addiu   t5, 16
     bne     t5, t3, 1b
      addiu  t6, 8
     beqz    t8, 3f          // no residual: row finished
      move   t4, t8          // (delay slot) t4 = residual byte counter
 2:
     /* Residual: one input byte -> two output bytes. */
     lbu     t1, 0(t6)
     sb      t1, 0(t5)
     sb      t1, 1(t5)
     addiu   t4, -2
     addiu   t6, 1
     bgtz    t4, 2b
      addiu  t5, 2
 3:
     addiu   t7, 4           // next output row pointer
     bne     t9, t7, 0b
      addiu  a2, 4           // (delay slot) next input row pointer
 4:
     j       ra
      nop
 END(jsimd_h2v1_upsample_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
 /*
  * a0     - cinfo->max_v_samp_factor
  * a1     - cinfo->output_width
  * a2     - input_data
  * a3     - output_data_ptr
  *
  * 2:1 horizontal and 2:1 vertical upsampling by pixel replication.  Each
  * input row is pixel-doubled into one output row, which is then copied into
  * the next output row; a0 is decremented by 2 per input row until it is no
  * longer positive.
  *
  * Register roles:
  *     t7 = address of the current pair of output row pointers
  *     t9 = output_width & 15 (bytes handled outside the 16-byte loops)
  *     t5/t6 = destination/source pointers, t8 = end of the 16-byte chunks
  *
  * The doubling residual loop always stores a pair, so an odd residual writes
  * one byte past output_width.
  * NOTE(review): presumably the output rows are padded enough for that --
  * confirm against the buffer allocation.
  *
  * Only t registers are used, so nothing is saved on the stack.
  */
     lw      t7, 0(a3)       // t7 = output_data
     blez    a0, 7f
      andi   t9, a1, 0xf     // t9 = residual
 0:
     lw      t6, 0(a2)       // t6 = inptr
     lw      t5, 0(t7)       // t5 = outptr
     addu    t8, t5, a1      // t8 = outptr end address
     subu    t8, t9          // t8 = end address - residual
     beq     t5, t8, 2f      // output_width < 16: residual loop only
      move   t4, t9          // (delay slot) t4 = residual byte counter
 1:
     /* 8 input bytes -> 16 output bytes. */
     ulw     t0, 0(t6)       // t0 = |P3|P2|P1|P0|
     srl     t1, t0, 16      // t1 = |X|X|P3|P2|
     ins     t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
     ins     t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
     ins     t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
     ins     t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
     ulw     t2, 4(t6)       // t2 = |P7|P6|P5|P4|
     usw     t0, 0(t5)
     usw     t1, 4(t5)
     srl     t3, t2, 16
     ins     t2, t2, 16, 16
     ins     t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
     ins     t3, t3, 16, 16
     ins     t3, t3, 8, 16   // t3 = |P7|P7|P6|P6|
     usw     t2, 8(t5)
     usw     t3, 12(t5)
     addiu   t5, 16
     bne     t5, t8, 1b
      addiu  t6, 8
     beqz    t9, 3f
      move   t4, t9          // (delay slot) t4 = residual byte counter
 2:
     /* Residual: one input byte -> two output bytes. */
     lbu     t0, 0(t6)
     sb      t0, 0(t5)
     sb      t0, 1(t5)
     addiu   t4, -2
     addiu   t6, 1
     bgtz    t4, 2b
      addiu  t5, 2
 3:
     /* Duplicate the freshly written row into the following output row. */
     ulw     t6, 0(t7)       // t6 = outptr
     ulw     t5, 4(t7)       // t5 = outptr[1]
     addu    t4, t6, a1      // t4 = new end address
     beq     a1, t9, 5f      // output_width < 16: no full 16-byte chunk, so the
                             // chunk loop must not be entered at all
      subu   t8, t4, t9      // (delay slot) t8 = end of the 16-byte chunks
 4:
     ulw     t0, 0(t6)
     ulw     t1, 4(t6)
     ulw     t2, 8(t6)
     usw     t0, 0(t5)
     ulw     t0, 12(t6)
     usw     t1, 4(t5)
     usw     t2, 8(t5)
     usw     t0, 12(t5)
     addiu   t6, 16
     bne     t6, t8, 4b
      addiu  t5, 16
     beqz    t9, 6f
      nop
 5:
     lbu     t0, 0(t6)
     sb      t0, 0(t5)
     addiu   t6, 1
     bne     t6, t4, 5b
      addiu  t5, 1
 6:
     addiu   t7, 8           // skip the two output rows just written
     addiu   a0, -2
     bgtz    a0, 0b
      addiu  a2, 4           // (delay slot) next input row pointer
 7:
     j       ra
      nop
 END(jsimd_h2v2_upsample_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
 /*
  * a0     - coef_block
  * a1     - compptr->dcttable
  * a2     - output
  * a3     - range_limit
  */

     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     addiu     sp, sp, -256
     move      v0, sp
     addiu     v1, zero, 8      // v1 = DCTSIZE = 8
 1:
     lh        s4, 32(a0)       // s4 = inptr[16]
     lh        s5, 64(a0)       // s5 = inptr[32]
     lh        s6, 96(a0)       // s6 = inptr[48]
     lh        t1, 112(a0)      // t1 = inptr[56]
     lh        t7, 16(a0)       // t7 = inptr[8]
     lh        t5, 80(a0)       // t5 = inptr[40]
     lh        t3, 48(a0)       // t3 = inptr[24]
     or        s4, s4, t1
     or        s4, s4, t3
     or        s4, s4, t5
     or        s4, s4, t7
     or        s4, s4, s5
     or        s4, s4, s6
     bnez      s4, 2f
      addiu    v1, v1, -1
     lh        s5, 0(a1)        // quantptr[DCTSIZE*0]
     lh        s6, 0(a0)        // inptr[DCTSIZE*0]
     mul       s5, s5, s6       // DEQUANTIZE(inptr[0], quantptr[0])
     sll       s5, s5, 2
     sw        s5, 0(v0)
     sw        s5, 32(v0)
     sw        s5, 64(v0)
     sw        s5, 96(v0)
     sw        s5, 128(v0)
     sw        s5, 160(v0)
     sw        s5, 192(v0)
     b         3f
      sw       s5, 224(v0)
 2:
     lh        t0, 112(a1)
     lh        t2, 48(a1)
     lh        t4, 80(a1)
     lh        t6, 16(a1)
     mul       t0, t0, t1       // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
     mul       t1, t2, t3       // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
     mul       t2, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
     mul       t3, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
     lh        t4, 32(a1)
     lh        t5, 32(a0)
     lh        t6, 96(a1)
     lh        t7, 96(a0)
     addu      s0, t0, t1       // z3 = tmp0 + tmp2
     addu      s1, t1, t2       // z2 = tmp1 + tmp2
     addu      s2, t2, t3       // z4 = tmp1 + tmp3
     addu      s3, s0, s2       // z3 + z4
     addiu     t9, zero, 9633   // FIX_1_175875602
     mul       s3, s3, t9       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
     addu      t8, t0, t3       // z1 = tmp0 + tmp3
     addiu     t9, zero, 2446   // FIX_0_298631336
     mul       t0, t0, t9       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
     addiu     t9, zero, 16819  // FIX_2_053119869
     mul       t2, t2, t9       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
     addiu     t9, zero, 25172  // FIX_3_072711026
     mul       t1, t1, t9       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
     addiu     t9, zero, 12299  // FIX_1_501321110
     mul       t3, t3, t9       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
     addiu     t9, zero, 16069  // FIX_1_961570560
     mul       s0, s0, t9       // -z3 = MULTIPLY(z3, FIX_1_961570560)
     addiu     t9, zero, 3196   // FIX_0_390180644
     mul       s2, s2, t9       // -z4 = MULTIPLY(z4, FIX_0_390180644)
     addiu     t9, zero, 7373   // FIX_0_899976223
     mul       t8, t8, t9       // -z1 = MULTIPLY(z1, FIX_0_899976223)
     addiu     t9, zero, 20995  // FIX_2_562915447
     mul       s1, s1, t9       // -z2 = MULTIPLY(z2, FIX_2_562915447)
     subu      s0, s3, s0       // z3 += z5
     addu      t0, t0, s0       // tmp0 += z3
     addu      t1, t1, s0       // tmp2 += z3
     subu      s2, s3, s2       // z4 += z5
     addu      t2, t2, s2       // tmp1 += z4
     addu      t3, t3, s2       // tmp3 += z4
     subu      t0, t0, t8       // tmp0 += z1
     subu      t1, t1, s1       // tmp2 += z2
     subu      t2, t2, s1       // tmp1 += z2
     subu      t3, t3, t8       // tmp3 += z1
     mul       s0, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
     addiu     t9, zero, 6270   // FIX_0_765366865
     mul       s1, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
     lh        t4, 0(a1)
     lh        t5, 0(a0)
     lh        t6, 64(a1)
     lh        t7, 64(a0)
     mul       s2, t9, s0       // MULTIPLY(z2, FIX_0_765366865)
     mul       t5, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
     mul       t6, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
     addiu     t9, zero, 4433   // FIX_0_541196100
     addu      s3, s0, s1       // z2 + z3
     mul       s3, s3, t9       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
     addiu     t9, zero, 15137  // FIX_1_847759065
     mul       t8, s1, t9       // MULTIPLY(z3, FIX_1_847759065)
     addu      t4, t5, t6
     subu      t5, t5, t6
     sll       t4, t4, 13       // tmp0 = (z2 + z3) << CONST_BITS
     sll       t5, t5, 13       // tmp1 = (z2 - z3) << CONST_BITS
     addu      t7, s3, s2       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
     subu      t6, s3, t8       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
     addu      s0, t4, t7
     subu      s1, t4, t7
     addu      s2, t5, t6
     subu      s3, t5, t6
     addu      t4, s0, t3
     subu      s0, s0, t3
     addu      t3, s2, t1
     subu      s2, s2, t1
     addu      t1, s3, t2
     subu      s3, s3, t2
     addu      t2, s1, t0
     subu      s1, s1, t0
     shra_r.w  t4, t4, 11
     shra_r.w  t3, t3, 11
     shra_r.w  t1, t1, 11
     shra_r.w  t2, t2, 11
     shra_r.w  s1, s1, 11
     shra_r.w  s3, s3, 11
     shra_r.w  s2, s2, 11
     shra_r.w  s0, s0, 11
     sw        t4, 0(v0)
     sw        t3, 32(v0)
     sw        t1, 64(v0)
     sw        t2, 96(v0)
     sw        s1, 128(v0)
     sw        s3, 160(v0)
     sw        s2, 192(v0)
     sw        s0, 224(v0)
 3:
     addiu     a1, a1, 2
     addiu     a0, a0, 2
     bgtz      v1, 1b
      addiu    v0, v0, 4
     move      v0, sp
     addiu     v1, zero, 8
 4:
     lw        t0, 8(v0)        // z2 = (INT32) wsptr[2]
     lw        t1, 24(v0)       // z3 = (INT32) wsptr[6]
     lw        t2, 0(v0)        // (INT32) wsptr[0]
     lw        t3, 16(v0)       // (INT32) wsptr[4]
     lw        s4, 4(v0)        // (INT32) wsptr[1]
     lw        s5, 12(v0)       // (INT32) wsptr[3]
     lw        s6, 20(v0)       // (INT32) wsptr[5]
     lw        s7, 28(v0)       // (INT32) wsptr[7]
     or        s4, s4, t0
     or        s4, s4, t1
     or        s4, s4, t3
     or        s4, s4, s7
     or        s4, s4, s5
     or        s4, s4, s6
     bnez      s4, 5f
      addiu    v1, v1, -1
     shra_r.w  s5, t2, 5
     andi      s5, s5, 0x3ff
     lbux      s5, s5(a3)
     lw        s1, 0(a2)
     replv.qb  s5, s5
     usw       s5, 0(s1)
     usw       s5, 4(s1)
     b         6f
      nop
 5:
     addu      t4, t0, t1       // z2 + z3
     addiu     t8, zero, 4433   // FIX_0_541196100
     mul       t5, t4, t8       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
     addiu     t8, zero, 15137  // FIX_1_847759065
     mul       t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
     addiu     t8, zero, 6270   // FIX_0_765366865
     mul       t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
     addu      t4, t2, t3       // (INT32) wsptr[0] + (INT32) wsptr[4]
     subu      t2, t2, t3       // (INT32) wsptr[0] - (INT32) wsptr[4]
     sll       t4, t4, 13       // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
     sll       t2, t2, 13       // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
     subu      t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
     subu      t3, t2, t1       // tmp12 = tmp1 - tmp2
     addu      t2, t2, t1       // tmp11 = tmp1 + tmp2
     addu      t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
     subu      t1, t4, t5       // tmp13 = tmp0 - tmp3
     addu      t0, t4, t5       // tmp10 = tmp0 + tmp3
     lw        t4, 28(v0)       // tmp0 = (INT32) wsptr[7]
     lw        t6, 12(v0)       // tmp2 = (INT32) wsptr[3]
     lw        t5, 20(v0)       // tmp1 = (INT32) wsptr[5]
     lw        t7, 4(v0)        // tmp3 = (INT32) wsptr[1]
     addu      s0, t4, t6       // z3 = tmp0 + tmp2
     addiu     t8, zero, 9633   // FIX_1_175875602
     addu      s1, t5, t7       // z4 = tmp1 + tmp3
     addu      s2, s0, s1       // z3 + z4
     mul       s2, s2, t8       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
     addu      s3, t4, t7       // z1 = tmp0 + tmp3
     addu      t9, t5, t6       // z2 = tmp1 + tmp2
     addiu     t8, zero, 16069  // FIX_1_961570560
     mul       s0, s0, t8       // -z3 = MULTIPLY(z3, FIX_1_961570560)
     addiu     t8, zero, 3196   // FIX_0_390180644
     mul       s1, s1, t8       // -z4 = MULTIPLY(z4, FIX_0_390180644)
     addiu     t8, zero, 2446   // FIX_0_298631336
     mul       t4, t4, t8       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
     addiu     t8, zero, 7373   // FIX_0_899976223
     mul       s3, s3, t8       // -z1 = MULTIPLY(z1, FIX_0_899976223)
     addiu     t8, zero, 16819  // FIX_2_053119869
     mul       t5, t5, t8       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
     addiu     t8, zero, 20995  // FIX_2_562915447
     mul       t9, t9, t8       // -z2 = MULTIPLY(z2, FIX_2_562915447)
     addiu     t8, zero, 25172  // FIX_3_072711026
     mul       t6, t6, t8       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
     addiu     t8, zero, 12299  // FIX_1_501321110
     mul       t7, t7, t8       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
     subu      s0, s2, s0       // z3 += z5
     subu      s1, s2, s1       // z4 += z5
     addu      t4, t4, s0
     subu      t4, t4, s3       // tmp0
     addu      t5, t5, s1
     subu      t5, t5, t9       // tmp1
     addu      t6, t6, s0
     subu      t6, t6, t9       // tmp2
     addu      t7, t7, s1
     subu      t7, t7, s3       // tmp3
     addu      s0, t0, t7
     subu      t0, t0, t7
     addu      t7, t2, t6
     subu      t2, t2, t6
     addu      t6, t3, t5
     subu      t3, t3, t5
     addu      t5, t1, t4
     subu      t1, t1, t4
     shra_r.w  s0, s0, 18
     shra_r.w  t7, t7, 18
     shra_r.w  t6, t6, 18
     shra_r.w  t5, t5, 18
     shra_r.w  t1, t1, 18
     shra_r.w  t3, t3, 18
     shra_r.w  t2, t2, 18
     shra_r.w  t0, t0, 18
     andi      s0, s0, 0x3ff
     andi      t7, t7, 0x3ff
     andi      t6, t6, 0x3ff
     andi      t5, t5, 0x3ff
     andi      t1, t1, 0x3ff
     andi      t3, t3, 0x3ff
     andi      t2, t2, 0x3ff
     andi      t0, t0, 0x3ff
     lw        s1, 0(a2)
     lbux      s0, s0(a3)
     lbux      t7, t7(a3)
     lbux      t6, t6(a3)
     lbux      t5, t5(a3)
     lbux      t1, t1(a3)
     lbux      t3, t3(a3)
     lbux      t2, t2(a3)
     lbux      t0, t0(a3)
     sb        s0, 0(s1)
     sb        t7, 1(s1)
     sb        t6, 2(s1)
     sb        t5, 3(s1)
     sb        t1, 4(s1)
     sb        t3, 5(s1)
     sb        t2, 6(s1)
     sb        t0, 7(s1)
 6:
     addiu     v0, v0, 32
     bgtz      v1, 4b
      addiu    a2, a2, 4
     addiu     sp, sp, 256

     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     j         ra
      nop

 END(jsimd_idct_islow_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
 /*
  * a0     - inptr
  * a1     - quantptr
  * a2     - wsptr
  * a3     - mips_idct_ifast_coefs
  *
  * Column pass of the fast IDCT.
  *
  * Every loop iteration loads one 32-bit word (two packed 16-bit values,
  * i.e. two adjacent columns) from each of the eight rows of inptr and
  * quantptr, multiplies them pairwise, runs the 1-D transform on both
  * halfword lanes in parallel with the .ph instructions and stores one word
  * per row into wsptr.  Rows are 16 bytes apart in all three arrays.  The
  * loop stops once a0 has advanced by 16 bytes, i.e. after four iterations.
  *
  * The multiplier constants are read from the table at a3 (copied to AT) at
  * byte offsets 0, 4, 8 and 12.
  *
  * s0-s7 are saved and restored; t0-t9, v0, v1 and AT are overwritten.
  */

     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     addiu          t9, a0, 16            // end address
     or             AT, a3, zero          // AT = mips_idct_ifast_coefs (table base)

 0:
     // Load row 0 of the quantization table and all eight input rows.
     // muleq_s.w.phl/.phr produce the 32-bit product of the upper/lower
     // halfword pair; the following "ins" repacks the low 16 bits of the
     // upper-pair product above the lower-pair product.
     lw             s0, 0(a1)             // quantptr[DCTSIZE*0]
     lw             t0, 0(a0)             // inptr[DCTSIZE*0]
     lw             t1, 16(a0)            // inptr[DCTSIZE*1]
     muleq_s.w.phl  v0, t0, s0            // tmp0 ...
     lw             t2, 32(a0)            // inptr[DCTSIZE*2]
     lw             t3, 48(a0)            // inptr[DCTSIZE*3]
     lw             t4, 64(a0)            // inptr[DCTSIZE*4]
     lw             t5, 80(a0)            // inptr[DCTSIZE*5]
     muleq_s.w.phr  t0, t0, s0            // ... tmp0 ...
     lw             t6, 96(a0)            // inptr[DCTSIZE*6]
     lw             t7, 112(a0)           // inptr[DCTSIZE*7]
     // If rows 1..7 are zero in both columns, only tmp0 contributes and it
     // is simply replicated to every row of the workspace.
     or             s4, t1, t2
     or             s5, t3, t4
     bnez           s4, 1f
      ins           t0, v0, 16, 16        // ... tmp0 (delay slot, needed on both paths)
     bnez           s5, 1f
      or            s6, t5, t6            // (delay slot)
     or             s6, s6, t7
     bnez           s6, 1f
      sw            t0, 0(a2)             // wsptr[DCTSIZE*0] (delay slot; rewritten at 1: if taken)
     sw             t0, 16(a2)            // wsptr[DCTSIZE*1]
     sw             t0, 32(a2)            // wsptr[DCTSIZE*2]
     sw             t0, 48(a2)            // wsptr[DCTSIZE*3]
     sw             t0, 64(a2)            // wsptr[DCTSIZE*4]
     sw             t0, 80(a2)            // wsptr[DCTSIZE*5]
     sw             t0, 96(a2)            // wsptr[DCTSIZE*6]
     sw             t0, 112(a2)           // wsptr[DCTSIZE*7]
     addiu          a0, a0, 4             // next pair of columns
     b              2f
      addiu         a1, a1, 4             // (delay slot)

 1:
     // General case.  Even part: rows 0, 2, 4, 6.
     lw             s1, 32(a1)            // quantptr[DCTSIZE*2]
     lw             s2, 64(a1)            // quantptr[DCTSIZE*4]
     muleq_s.w.phl  v0, t2, s1            // tmp1 ...
     muleq_s.w.phr  t2, t2, s1            // ... tmp1 ...
     lw             s0, 16(a1)            // quantptr[DCTSIZE*1]
     lw             s1, 48(a1)            // quantptr[DCTSIZE*3]
     lw             s3, 96(a1)            // quantptr[DCTSIZE*6]
     muleq_s.w.phl  v1, t4, s2            // tmp2 ...
     muleq_s.w.phr  t4, t4, s2            // ... tmp2 ...
     lw             s2, 80(a1)            // quantptr[DCTSIZE*5]
     lw             t8, 4(AT)             // FIX(1.414213562)
     ins            t2, v0, 16, 16        // ... tmp1
     muleq_s.w.phl  v0, t6, s3            // tmp3 ...
     muleq_s.w.phr  t6, t6, s3            // ... tmp3 ...
     ins            t4, v1, 16, 16        // ... tmp2
     addq.ph        s4, t0, t4            // tmp10
     subq.ph        s5, t0, t4            // tmp11
     ins            t6, v0, 16, 16        // ... tmp3
     subq.ph        s6, t2, t6            // tmp12 ...
     addq.ph        s7, t2, t6            // tmp13
     mulq_s.ph      s6, s6, t8            // ... tmp12 ...
     addq.ph        t0, s4, s7            // tmp0
     subq.ph        t6, s4, s7            // tmp3
     // Odd part: rows 1, 3, 5, 7 (dequantization interleaved with the
     // tail of the even part).
     muleq_s.w.phl  v0, t1, s0            // tmp4 ...
     muleq_s.w.phr  t1, t1, s0            // ... tmp4 ...
     shll_s.ph      s6, s6, 1             // x2
     lw             s3, 112(a1)           // quantptr[DCTSIZE*7]
     subq.ph        s6, s6, s7            // ... tmp12
     muleq_s.w.phl  v1, t7, s3            // tmp7 ...
     muleq_s.w.phr  t7, t7, s3            // ... tmp7 ...
     ins            t1, v0, 16, 16        // ... tmp4
     addq.ph        t2, s5, s6            // tmp1
     subq.ph        t4, s5, s6            // tmp2
     muleq_s.w.phl  v0, t5, s2            // tmp6 ...
     muleq_s.w.phr  t5, t5, s2            // ... tmp6 ...
     ins            t7, v1, 16, 16        // ... tmp7
     addq.ph        s5, t1, t7            // z11
     subq.ph        s6, t1, t7            // z12
     muleq_s.w.phl  v1, t3, s1            // tmp5 ...
     muleq_s.w.phr  t3, t3, s1            // ... tmp5 ...
     ins            t5, v0, 16, 16        // ... tmp6
     ins            t3, v1, 16, 16        // ... tmp5
     addq.ph        s7, t5, t3            // z13
     subq.ph        v0, t5, t3            // z10
     addq.ph        t7, s5, s7            // tmp7
     subq.ph        s5, s5, s7            // tmp11 ...
     addq.ph        v1, v0, s6            // z5 ...
     mulq_s.ph      s5, s5, t8            // ... tmp11
     lw             t8, 8(AT)             // FIX(1.847759065)
     lw             s4, 0(AT)             // FIX(1.082392200)
     addq.ph        s0, t0, t7            // tmp0 + tmp7 -> row 0
     subq.ph        s1, t0, t7            // tmp0 - tmp7 -> row 7
     mulq_s.ph      v1, v1, t8            // ... z5
     shll_s.ph      s5, s5, 1             // x2
     lw             t8, 12(AT)            // FIX(-2.613125930)
     sw             s0, 0(a2)             // wsptr[DCTSIZE*0]
     shll_s.ph      v0, v0, 1             // x4
     mulq_s.ph      v0, v0, t8            // tmp12 ...
     mulq_s.ph      s4, s6, s4            // tmp10 ...
     shll_s.ph      v1, v1, 1             // x2
     addiu          a0, a0, 4             // next pair of columns
     addiu          a1, a1, 4
     sw             s1, 112(a2)           // wsptr[DCTSIZE*7]
     shll_s.ph      s6, v0, 1             // x4
     shll_s.ph      s4, s4, 1             // x2
     addq.ph        s6, s6, v1            // ... tmp12
     subq.ph        t5, s6, t7            // tmp6
     subq.ph        s4, s4, v1            // ... tmp10
     subq.ph        t3, s5, t5            // tmp5
     addq.ph        s2, t2, t5            // tmp1 + tmp6 -> row 1
     addq.ph        t1, s4, t3            // tmp4
     subq.ph        s3, t2, t5            // tmp1 - tmp6 -> row 6
     sw             s2, 16(a2)            // wsptr[DCTSIZE*1]
     sw             s3, 96(a2)            // wsptr[DCTSIZE*6]
     addq.ph        v0, t4, t3            // tmp2 + tmp5 -> row 2
     subq.ph        v1, t4, t3            // tmp2 - tmp5 -> row 5
     sw             v0, 32(a2)            // wsptr[DCTSIZE*2]
     sw             v1, 80(a2)            // wsptr[DCTSIZE*5]
     addq.ph        v0, t6, t1            // tmp3 + tmp4 -> row 4
     subq.ph        v1, t6, t1            // tmp3 - tmp4 -> row 3
     sw             v0, 64(a2)            // wsptr[DCTSIZE*4]
     sw             v1, 48(a2)            // wsptr[DCTSIZE*3]

 2:
     // Both paths arrive here with a0/a1 already advanced.
     bne            a0, t9, 0b
      addiu         a2, a2, 4             // (delay slot) next pair of workspace columns

     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     j              ra
      nop

 END(jsimd_idct_ifast_cols_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
 /*
  * a0     - wsptr
  * a1     - output_buf
  * a2     - output_col
  * a3     - mips_idct_ifast_coefs
  *
  * Row pass of the fast IDCT.
  *
  * Every loop iteration reads two consecutive 16-byte rows of wsptr
  * (a0 advances by 32 bytes), regroups them so that each register holds the
  * same element of both rows (first row in the lower halfword, second row in
  * the upper halfword), transforms both rows in parallel, shifts the results
  * left by 2 with saturation, keeps the upper byte of every halfword, adds
  * 0x80 to every byte and stores 8 bytes to each of the two output rows
  * output_buf[i] + output_col (a1 advances by two pointers).  The loop runs
  * until a0 reaches wsptr + 128, i.e. four iterations.
  *
  * a3 is part of the register list passed to SAVE_REGS_ON_STACK because the
  * loop reuses a3 as an output pointer and reloads the table address from
  * 36(sp) at the top of every iteration.
  */

     SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

     addiu         t9, a0, 128        // end address
     lui           s8, 0x8080
     ori           s8, s8, 0x8080     // s8 = 0x80808080, added to every output byte

 0:
     lw            AT, 36(sp)         // restore $a3 (mips_idct_ifast_coefs)
     lw            t0, 0(a0)          // wsptr[DCTSIZE*0+0/1]  b a
     lw            s0, 16(a0)         // wsptr[DCTSIZE*1+0/1]  B A
     lw            t2, 4(a0)          // wsptr[DCTSIZE*0+2/3]  d c
     lw            s2, 20(a0)         // wsptr[DCTSIZE*1+2/3]  D C
     lw            t4, 8(a0)          // wsptr[DCTSIZE*0+4/5]  f e
     lw            s4, 24(a0)         // wsptr[DCTSIZE*1+4/5]  F E
     lw            t6, 12(a0)         // wsptr[DCTSIZE*0+6/7]  h g
     lw            s6, 28(a0)         // wsptr[DCTSIZE*1+6/7]  H G
     precrq.ph.w   t1, s0, t0         // B b
     ins           t0, s0, 16, 16     // A a
     // If elements 1..7 of both rows are zero, take the short path that
     // replicates element 0 across the whole output row.
     bnez          t1, 1f
      or           s0, t2, s2         // (delay slot)
     bnez          s0, 1f
      or           s0, t4, s4         // (delay slot)
     bnez          s0, 1f
      or           s0, t6, s6         // (delay slot)
     bnez          s0, 1f
      shll_s.ph    s0, t0, 2          // A a (delay slot; s0 is recomputed at 1: if taken)
     lw            a3, 0(a1)          // a3 = output_buf[0]
     lw            AT, 4(a1)          // AT = output_buf[1]
     precrq.ph.w   t0, s0, s0         // A A
     ins           s0, s0, 16, 16     // a a
     addu          a3, a3, a2         // + output_col
     addu          AT, AT, a2         // + output_col
     precrq.qb.ph  t0, t0, t0         // A A A A
     precrq.qb.ph  s0, s0, s0         // a a a a
     addu.qb       s0, s0, s8         // add 0x80 to each byte
     addu.qb       t0, t0, s8
     sw            s0, 0(a3)          // first row, bytes 0..3
     sw            s0, 4(a3)          // first row, bytes 4..7
     sw            t0, 0(AT)          // second row, bytes 0..3
     sw            t0, 4(AT)          // second row, bytes 4..7
     addiu         a0, a0, 32         // next two workspace rows
     bne           a0, t9, 0b
      addiu        a1, a1, 8          // (delay slot) next two output row pointers
     b             2f
      nop

 1:
     // General case: finish regrouping the two rows into per-element pairs.
     precrq.ph.w   t3, s2, t2         // D d
     ins           t2, s2, 16, 16     // C c
     precrq.ph.w   t5, s4, t4         // F f
     ins           t4, s4, 16, 16     // E e
     precrq.ph.w   t7, s6, t6         // H h
     ins           t6, s6, 16, 16     // G g
     lw            t8, 4(AT)          // FIX(1.414213562)
     // Even part: elements 0, 2, 4, 6.
     addq.ph       s4, t0, t4         // tmp10
     subq.ph       s5, t0, t4         // tmp11
     subq.ph       s6, t2, t6         // tmp12 ...
     addq.ph       s7, t2, t6         // tmp13
     mulq_s.ph     s6, s6, t8         // ... tmp12 ...
     addq.ph       t0, s4, s7         // tmp0
     subq.ph       t6, s4, s7         // tmp3
     shll_s.ph     s6, s6, 1          // x2
     subq.ph       s6, s6, s7         // ... tmp12
     addq.ph       t2, s5, s6         // tmp1
     subq.ph       t4, s5, s6         // tmp2
     // Odd part: elements 1, 3, 5, 7.
     addq.ph       s5, t1, t7         // z11
     subq.ph       s6, t1, t7         // z12
     addq.ph       s7, t5, t3         // z13
     subq.ph       v0, t5, t3         // z10
     addq.ph       t7, s5, s7         // tmp7
     subq.ph       s5, s5, s7         // tmp11 ...
     addq.ph       v1, v0, s6         // z5 ...
     mulq_s.ph     s5, s5, t8         // ... tmp11
     lw            t8, 8(AT)          // FIX(1.847759065)
     lw            s4, 0(AT)          // FIX(1.082392200)
     addq.ph       s0, t0, t7         // tmp0 + tmp7
     subq.ph       s7, t0, t7         // tmp0 - tmp7
     mulq_s.ph     v1, v1, t8         // ... z5
     lw            a3, 0(a1)          // a3 = output_buf[0]
     lw            t8, 12(AT)         // FIX(-2.613125930)
     shll_s.ph     s5, s5, 1          // x2
     addu          a3, a3, a2         // + output_col
     shll_s.ph     v0, v0, 1          // x4
     mulq_s.ph     v0, v0, t8         // tmp12 ...
     mulq_s.ph     s4, s6, s4         // tmp10 ...
     shll_s.ph     v1, v1, 1          // x2
     addiu         a0, a0, 32         // next two workspace rows
     addiu         a1, a1, 8          // next two output row pointers
     shll_s.ph     s6, v0, 1          // x4
     shll_s.ph     s4, s4, 1          // x2
     addq.ph       s6, s6, v1         // ... tmp12
     shll_s.ph     s0, s0, 2          // scale output element 0
     subq.ph       t5, s6, t7         // tmp6
     subq.ph       s4, s4, v1         // ... tmp10
     subq.ph       t3, s5, t5         // tmp5
     shll_s.ph     s7, s7, 2          // scale output element 7
     addq.ph       t1, s4, t3         // tmp4
     addq.ph       s1, t2, t5         // tmp1 + tmp6
     subq.ph       s6, t2, t5         // tmp1 - tmp6
     addq.ph       s2, t4, t3         // tmp2 + tmp5
     subq.ph       s5, t4, t3         // tmp2 - tmp5
     addq.ph       s4, t6, t1         // tmp3 + tmp4
     subq.ph       s3, t6, t1         // tmp3 - tmp4
     shll_s.ph     s1, s1, 2          // scale output elements 1..6
     shll_s.ph     s2, s2, 2
     shll_s.ph     s3, s3, 2
     shll_s.ph     s4, s4, 2
     shll_s.ph     s5, s5, 2
     shll_s.ph     s6, s6, 2
     // Split the lanes back into one register pair per row, then keep the
     // upper byte of every halfword.
     precrq.ph.w   t0, s1, s0         // B A
     ins           s0, s1, 16, 16     // b a
     precrq.ph.w   t2, s3, s2         // D C
     ins           s2, s3, 16, 16     // d c
     precrq.ph.w   t4, s5, s4         // F E
     ins           s4, s5, 16, 16     // f e
     precrq.ph.w   t6, s7, s6         // H G
     ins           s6, s7, 16, 16     // h g
     precrq.qb.ph  t0, t2, t0         // D C B A
     precrq.qb.ph  s0, s2, s0         // d c b a
     precrq.qb.ph  t4, t6, t4         // H G F E
     precrq.qb.ph  s4, s6, s4         // h g f e
     addu.qb       s0, s0, s8         // add 0x80 to each byte
     addu.qb       s4, s4, s8
     sw            s0, 0(a3)          // outptr[0/1/2/3]       d c b a
     sw            s4, 4(a3)          // outptr[4/5/6/7]       h g f e
     lw            a3, -4(a1)         // second output row pointer (a1 was already advanced by 8)
     addu.qb       t0, t0, s8
     addu          a3, a3, a2         // + output_col
     addu.qb       t4, t4, s8
     sw            t0, 0(a3)          // outptr[0/1/2/3]       D C B A
     bne           a0, t9, 0b
      sw           t4, 4(a3)          // outptr[4/5/6/7]       H G F E (delay slot)

 2:

     RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3

     j             ra
      nop

 END(jsimd_idct_ifast_rows_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
 /*
  * a0     - data
  *
  * In-place forward DCT (slow/accurate integer variant) on the 64 halfwords
  * at a0, done in two passes of eight iterations each.
  *
  * Pass 1 (label 1) walks the block 16 bytes at a time through a1 and
  * processes the eight halfwords of one row per iteration.  The multiplier
  * constants are packed two per register so that dpa.w.ph can accumulate two
  * products at once; accumulator results are extracted with a rounding right
  * shift by 11.
  *
  * Pass 2 (label 2) walks the block 2 bytes at a time through a0 and
  * processes the eight halfwords that are 16 bytes apart (one column) per
  * iteration, using scalar mult/madd/msub; accumulator results are extracted
  * with a rounding right shift by 15.
  *
  * s0-s8 are saved and restored; t0-t9, v0, v1, a0-a3 and the four DSP
  * accumulators are overwritten.
  */

     SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

     // Packed constant pairs (upper halfword | lower halfword) for pass 1.
     lui       t0, 6437
     ori       t0, 2260      // t0 =   6437 |   2260
     lui       t1, 9633
     ori       t1, 11363     // t1 =   9633 |  11363
     lui       t2, 0xd39e
     ori       t2, 0xe6dc    // t2 = -11362 |  -6436
     lui       t3, 0xf72d
     ori       t3, 9633      // t3 =  -2259 |   9633
     lui       t4, 2261
     ori       t4, 9633      // t4 =   2261 |   9633
     lui       t5, 0xd39e
     ori       t5, 6437      // t5 = -11362 |   6437
     lui       t6, 9633
     ori       t6, 0xd39d    // t6 =   9633 | -11363
     lui       t7, 0xe6dc
     ori       t7, 2260      // t7 =  -6436 |   2260
     lui       t8, 4433
     ori       t8, 10703     // t8 =   4433 |  10703
     lui       t9, 0xd630
     ori       t9, 4433      // t9 = -10704 |   4433
     li        s8, 8         // pass 1 iteration count
     move      a1, a0        // a1 = row pointer for pass 1
 1:
     lw        s0, 0(a1)     // tmp0 = 1|0
     lw        s1, 4(a1)     // tmp1 = 3|2
     lw        s2, 8(a1)     // tmp2 = 5|4
     lw        s3, 12(a1)    // tmp3 = 7|6
     packrl.ph s1, s1, s1    // tmp1 = 2|3
     packrl.ph s3, s3, s3    // tmp3 = 6|7
     subq.ph   s7, s1, s2    // tmp7 = 2-5|3-4 = t5|t4
     subq.ph   s5, s0, s3    // tmp5 = 1-6|0-7 = t6|t7
     // Odd outputs: four dot products of (t4, t5, t6, t7) with the packed
     // constants, one per accumulator.
     mult      $0, $0        // ac0  = 0
     dpa.w.ph  $ac0, s7, t0  // ac0 += t5*  6437 + t4*  2260
     dpa.w.ph  $ac0, s5, t1  // ac0 += t6*  9633 + t7* 11363
     mult      $ac1, $0, $0  // ac1  = 0
     dpa.w.ph  $ac1, s7, t2  // ac1 += t5*-11362 + t4* -6436
     dpa.w.ph  $ac1, s5, t3  // ac1 += t6* -2259 + t7*  9633
     mult      $ac2, $0, $0  // ac2  = 0
     dpa.w.ph  $ac2, s7, t4  // ac2 += t5*  2261 + t4*  9633
     dpa.w.ph  $ac2, s5, t5  // ac2 += t6*-11362 + t7*  6437
     mult      $ac3, $0, $0  // ac3  = 0
     dpa.w.ph  $ac3, s7, t6  // ac3 += t5*  9633 + t4*-11363
     dpa.w.ph  $ac3, s5, t7  // ac3 += t6* -6436 + t7*  2260
     addq.ph   s6, s1, s2    // tmp6 = 2+5|3+4 = t2|t3
     addq.ph   s4, s0, s3    // tmp4 = 1+6|0+7 = t1|t0
     extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
     extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
     extr_r.w  s2, $ac2, 11  // tmp2 = (ac2 + 1024) >> 11
     extr_r.w  s3, $ac3, 11  // tmp3 = (ac3 + 1024) >> 11
     addq.ph   s5, s4, s6    // tmp5 = t1+t2|t0+t3 = t11|t10
     subq.ph   s7, s4, s6    // tmp7 = t1-t2|t0-t3 = t12|t13
     sh        s0, 2(a1)     // row element 1
     sh        s1, 6(a1)     // row element 3
     sh        s2, 10(a1)    // row element 5
     sh        s3, 14(a1)    // row element 7
     // Even outputs.
     mult      $0, $0        // ac0  = 0
     dpa.w.ph  $ac0, s7, t8  // ac0 += t12*  4433 + t13* 10703
     mult      $ac1, $0, $0  // ac1  = 0
     dpa.w.ph  $ac1, s7, t9  // ac1 += t12*-10704 + t13*  4433
     sra       s4, s5, 16    // tmp4 = t11
     addiu     a1, a1, 16    // next row; the stores below use negative offsets
     addiu     s8, s8, -1
     extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
     extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
     addu      s2, s5, s4    // tmp2 = t10 + t11 (valid in the low halfword, which is all sh stores)
     subu      s3, s5, s4    // tmp3 = t10 - t11 (valid in the low halfword)
     sll       s2, s2, 2     // tmp2 = (t10 + t11) << 2
     sll       s3, s3, 2     // tmp3 = (t10 - t11) << 2
     sh        s2, -16(a1)   // row element 0
     sh        s3, -8(a1)    // row element 4
     sh        s0, -12(a1)   // row element 2
     bgtz      s8, 1b
      sh       s1, -4(a1)    // row element 6 (delay slot)
     // Scalar constants c0..c10 for pass 2 (names as used in the comments below).
     li        t0, 2260      // c0
     li        t1, 11363     // c1
     li        t2, 9633      // c2
     li        t3, 6436      // c3
     li        t4, 6437      // c4
     li        t5, 2261      // c5
     li        t6, 11362     // c6
     li        t7, 2259      // c7
     li        t8, 4433      // c8
     li        t9, 10703     // c9
     li        a1, 10704     // c10
     li        s8, 8         // pass 2 iteration count

 2:
     lh        a2, 0(a0)     // 0
     lh        a3, 16(a0)    // 8
     lh        v0, 32(a0)    // 16
     lh        v1, 48(a0)    // 24
     lh        s4, 64(a0)    // 32
     lh        s5, 80(a0)    // 40
     lh        s6, 96(a0)    // 48
     lh        s7, 112(a0)   // 56
     addu      s2, v0, s5    // tmp2 = 16 + 40
     subu      s5, v0, s5    // tmp5 = 16 - 40
     addu      s3, v1, s4    // tmp3 = 24 + 32
     subu      s4, v1, s4    // tmp4 = 24 - 32
     addu      s0, a2, s7    // tmp0 =  0 + 56
     subu      s7, a2, s7    // tmp7 =  0 - 56
     addu      s1, a3, s6    // tmp1 =  8 + 48
     subu      s6, a3, s6    // tmp6 =  8 - 48
     addu      a2, s0, s3    // tmp10 = tmp0 + tmp3
     subu      v1, s0, s3    // tmp13 = tmp0 - tmp3
     addu      a3, s1, s2    // tmp11 = tmp1 + tmp2
     subu      v0, s1, s2    // tmp12 = tmp1 - tmp2
     // Odd outputs: four dot products of (tmp4..tmp7) with c0..c7.
     mult      s7, t1        // ac0  = tmp7 * c1
     madd      s4, t0        // ac0 += tmp4 * c0
     madd      s5, t4        // ac0 += tmp5 * c4
     madd      s6, t2        // ac0 += tmp6 * c2
     mult      $ac1, s7, t2  // ac1  = tmp7 * c2
     msub      $ac1, s4, t3  // ac1 -= tmp4 * c3
     msub      $ac1, s5, t6  // ac1 -= tmp5 * c6
     msub      $ac1, s6, t7  // ac1 -= tmp6 * c7
     mult      $ac2, s7, t4  // ac2  = tmp7 * c4
     madd      $ac2, s4, t2  // ac2 += tmp4 * c2
     madd      $ac2, s5, t5  // ac2 += tmp5 * c5
     msub      $ac2, s6, t6  // ac2 -= tmp6 * c6
     mult      $ac3, s7, t0  // ac3  = tmp7 * c0
     msub      $ac3, s4, t1  // ac3 -= tmp4 * c1
     madd      $ac3, s5, t2  // ac3 += tmp5 * c2
     msub      $ac3, s6, t3  // ac3 -= tmp6 * c3
     extr_r.w  s0, $ac0, 15  // tmp0 = (ac0 + 16384) >> 15
     extr_r.w  s1, $ac1, 15  // tmp1 = (ac1 + 16384) >> 15
     extr_r.w  s2, $ac2, 15  // tmp2 = (ac2 + 16384) >> 15
     extr_r.w  s3, $ac3, 15  // tmp3 = (ac3 + 16384) >> 15
     addiu     s8, s8, -1
     addu      s4, a2, a3    // tmp4 = tmp10 + tmp11
     subu      s5, a2, a3    // tmp5 = tmp10 - tmp11
     sh        s0, 16(a0)    // column element 1
     sh        s1, 48(a0)    // column element 3
     sh        s2, 80(a0)    // column element 5
     sh        s3, 112(a0)   // column element 7
     // Even outputs.
     mult      v0, t8        // ac0  = tmp12 * c8
     madd      v1, t9        // ac0 += tmp13 * c9
     mult      $ac1, v1, t8  // ac1  = tmp13 * c8
     msub      $ac1, v0, a1  // ac1 -= tmp12 * c10
     addiu     a0, a0, 2     // next column; the stores below are relative to the new a0
     extr_r.w  s6, $ac0, 15  // tmp6 = (ac0 + 16384) >> 15
     extr_r.w  s7, $ac1, 15  // tmp7 = (ac1 + 16384) >> 15
     shra_r.w  s4, s4, 2     // tmp4 = (tmp4 + 2) >> 2
     shra_r.w  s5, s5, 2     // tmp5 = (tmp5 + 2) >> 2
     sh        s4, -2(a0)    // column element 0
     sh        s5, 62(a0)    // column element 4
     sh        s6, 30(a0)    // column element 2
     bgtz      s8, 2b
      sh       s7, 94(a0)    // column element 6 (delay slot)

     RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

     jr       ra
      nop

 END(jsimd_fdct_islow_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
 /*
  * a0     - data
  *
  * In-place forward DCT (fast integer variant) on the 64 halfwords at a0,
  * done in two passes.
  *
  * Pass 1 (label 0) processes one 16-byte row per iteration until v0 reaches
  * data + 128.  Pass 2 (label 1) processes one column (halfwords 16 bytes
  * apart) per iteration until v0 reaches data + 16.
  *
  * The four multiplier constants are kept replicated in both halfwords of
  * a1, a2, a3 and s1 so that the paired dpa/dpsx/mulsa instructions can
  * apply the same factor to both halves of a register; products are scaled
  * back with a right shift by 8.
  *
  * s0 and s1 are saved and restored; t0-t9, v0, v1, a1-a3 and the four DSP
  * accumulators are overwritten.
  */
     .set at
     SAVE_REGS_ON_STACK 8, s0, s1
     li           a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
     li           a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
     li           a3, 0x00620062  // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
     li           s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)

     move         v0, a0
     addiu        v1, v0, 128     // end address

 0:
     // ---- Pass 1: one row per iteration ----
     lw           t0, 0(v0)       // tmp0 = 1|0
     lw           t1, 4(v0)       // tmp1 = 3|2
     lw           t2, 8(v0)       // tmp2 = 5|4
     lw           t3, 12(v0)      // tmp3 = 7|6
     packrl.ph    t1, t1, t1      // tmp1 = 2|3
     packrl.ph    t3, t3, t3      // tmp3 = 6|7
     subq.ph      t7, t1, t2      // tmp7 = 2-5|3-4 = t5|t4
     subq.ph      t5, t0, t3      // tmp5 = 1-6|0-7 = t6|t7
     addq.ph      t6, t1, t2      // tmp6 = 2+5|3+4 = t2|t3
     addq.ph      t4, t0, t3      // tmp4 = 1+6|0+7 = t1|t0
     addq.ph      t8, t4, t6      // tmp5 = t1+t2|t0+t3 = t11|t10
     subq.ph      t9, t4, t6      // tmp7 = t1-t2|t0-t3 = t12|t13
     sra          t4, t8, 16      // tmp4 = t11
     mult         $0, $0          // ac0  = 0
     dpa.w.ph     $ac0, t9, s1    // ac0 += t12*181 + t13*181
     mult         $ac1, $0, $0    // ac1  = 0
     dpa.w.ph     $ac1, t7, a3    // ac1 += t4*98 + t5*98
     dpsx.w.ph    $ac1, t5, a3    // ac1 -= t6*98 + t7*98
     mult         $ac2, $0, $0    // ac2  = 0
     dpa.w.ph     $ac2, t7, a2    // ac2 += t4*139 + t5*139
     mult         $ac3, $0, $0    // ac3  = 0
     dpa.w.ph     $ac3, t5, a1    // ac3 += t6*334 + t7*334
     precrq.ph.w  t0, t5, t7      // t0 = t6|t5 (upper halves of t5 and t7)
     addq.ph      t2, t8, t4      // tmp2 = t10 + t11 (low halfword)
     subq.ph      t3, t8, t4      // tmp3 = t10 - t11 (low halfword)
     extr.w       t4, $ac0, 8     // t4 = z1 = (t12 + t13)*181 >> 8
     mult         $0, $0          // ac0  = 0
     dpa.w.ph     $ac0, t0, s1    // ac0 += t5*181 + t6*181
     extr.w       t0, $ac1, 8     // t0 = z5
     extr.w       t1, $ac2, 8     // t1 = MULTIPLY(tmp10, 139)
     extr.w       t7, $ac3, 8     // t7 = MULTIPLY(tmp12, 334)
     extr.w       t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp11, 181)
     add          t6, t1, t0      // t6 = z2
     add          t7, t7, t0      // t7 = z4
     // From here on only the low halfword of each result is meaningful;
     // it is the only part stored by sh.
     subq.ph      t0, t5, t8      // t0 = z13 = tmp7 - z3
     addq.ph      t8, t5, t8      // t8 = z11 = tmp7 + z3
     addq.ph      t1, t0, t6      // t1 = z13 + z2
     subq.ph      t6, t0, t6      // t6 = z13 - z2
     addq.ph      t0, t8, t7      // t0 = z11 + z4
     subq.ph      t7, t8, t7      // t7 = z11 - z4
     addq.ph      t5, t4, t9      // t5 = t13 + z1
     subq.ph      t4, t9, t4      // t4 = t13 - z1
     sh           t2, 0(v0)       // row element 0
     sh           t5, 4(v0)       // row element 2
     sh           t3, 8(v0)       // row element 4
     sh           t4, 12(v0)      // row element 6
     sh           t1, 10(v0)      // row element 5
     sh           t6, 6(v0)       // row element 3
     sh           t0, 2(v0)       // row element 1
     sh           t7, 14(v0)      // row element 7
     addiu        v0, 16          // next row
     bne          v1, v0, 0b
      nop
     move         v0, a0
     addiu        v1, v0, 16      // end address for pass 2

 1:
     // ---- Pass 2: one column per iteration ----
     lh           t0, 0(v0)       // 0
     lh           t1, 16(v0)      // 8
     lh           t2, 32(v0)      // 16
     lh           t3, 48(v0)      // 24
     lh           t4, 64(v0)      // 32
     lh           t5, 80(v0)      // 40
     lh           t6, 96(v0)      // 48
     lh           t7, 112(v0)     // 56
     add          t8, t0, t7      // t8 = tmp0
     sub          t7, t0, t7      // t7 = tmp7
     add          t0, t1, t6      // t0 = tmp1
     sub          t1, t1, t6      // t1 = tmp6
     add          t6, t2, t5      // t6 = tmp2
     sub          t5, t2, t5      // t5 = tmp5
     add          t2, t3, t4      // t2 = tmp3
     sub          t3, t3, t4      // t3 = tmp4
     add          t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
     sub          t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
     sub          s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
     ins          t8, s0, 16, 16  // t8 = tmp12|tmp13
     add          t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
     mult         $0, $0          // ac0  = 0
     dpa.w.ph     $ac0, t8, s1    // ac0 += t12*181 + t13*181
     add          s0, t4, t2      // s0 = tmp10+tmp11
     sub          t4, t4, t2      // t4 = tmp10-tmp11
     sh           s0, 0(v0)       // column element 0
     sh           t4, 64(v0)      // column element 4
     extr.w       t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
     addq.ph      t4, t8, t2      // t4 = tmp13 + z1 (low halfword)
     subq.ph      t8, t8, t2      // t8 = tmp13 - z1 (low halfword)
     sh           t4, 32(v0)      // column element 2
     sh           t8, 96(v0)      // column element 6
     add          t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
     add          t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
     add          t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
     andi         t4, a1, 0xffff  // t4 = 334
     mul          s0, t1, t4
     sra          s0, s0, 8       // s0 = MULTIPLY(tmp12, FIX_1_306562965) (z4 without the z5 term)
     ins          t1, t3, 16, 16  // t1 = tmp10|tmp12
     mult         $0, $0          // ac0  = 0
     mulsa.w.ph   $ac0, t1, a3    // ac0 += t10*98 - t12*98
     extr.w       t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
     // z5 is folded into tmp7 here instead of into z2/z4.
     add          t2, t7, t8      // t2 = tmp7 + z5
     sub          t7, t7, t8      // t7 = tmp7 - z5
     andi         t4, a2, 0xffff  // t4 = 139
     mul          t8, t3, t4
     sra          t8, t8, 8       // t8 = MULTIPLY(tmp10, FIX_0_541196100) (z2 without the z5 term)
     andi         t4, s1, 0xffff  // t4 = 181
     mul          t6, t0, t4
     sra          t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
     add          t0, t6, t8      // t0 = z3 + z2
     sub          t1, t6, t8      // t1 = z3 - z2
     add          t3, t6, s0      // t3 = z3 + z4
     sub          t4, t6, s0      // t4 = z3 - z4
     sub          t5, t2, t1      // t5 = dataptr[5]
     sub          t6, t7, t0      // t6 = dataptr[3]
     add          t3, t2, t3      // t3 = dataptr[1]
     add          t4, t7, t4      // t4 = dataptr[7]
     sh           t5, 80(v0)      // column element 5
     sh           t6, 48(v0)      // column element 3
     sh           t3, 16(v0)      // column element 1
     sh           t4, 112(v0)     // column element 7
     addiu        v0, 2           // next column
     bne          v0, v1, 1b
      nop

     RESTORE_REGS_FROM_STACK 8, s0, s1

     j            ra
      nop
 END(jsimd_fdct_ifast_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
 /*
  * a0     - coef_block
  * a1     - divisors
  * a2     - workspace
  *
  * Quantizes the 64 halfwords of workspace into coef_block.
  *
  * For the element at byte offset i, with x = workspace[i] and the three
  * per-element halfwords m = divisors[i], c = divisors[i + 128] and
  * s = divisors[i + 384] (presumably the reciprocal, correction and shift
  * tables -- confirm against the code that builds "divisors"):
  *
  *     sign = ((x >> 15) << 1) + 1            // -1 for negative x, else +1
  *     mag  = x * sign                        // |x|
  *     q    = (((mag + c) & 0xffff) * (m & 0xffff)) >> (s + 16)
  *     coef_block[i] = q * sign
  *
  * The loop is software-pipelined and handles two elements per iteration:
  * the operands for the next pair are loaded while the current pair is
  * being finished.  31 iterations run in the loop (until a2 reaches
  * workspace + 124) and the last pair is completed after the loop.
  *
  * Register roles inside the loop:
  *   even element: t0 = |x|, t3 = sign, t1 = m, t2 = c, t4 = s
  *   odd element:  t6 = x,   s0 = sign, t7 = m, t5 = c, t8 = s
  *
  * s0-s2 are saved and restored; t0-t9, v0, v1, a0-a2 are overwritten.
  */

     .set at

     SAVE_REGS_ON_STACK 16, s0, s1, s2

     // Prologue: load the operands of the first pair.
     addiu   v0, a2, 124  // v0 = workspace_end
     lh      t0, 0(a2)    // x (even)
     lh      t1, 0(a1)    // m (even)
     lh      t2, 128(a1)  // c (even)
     sra     t3, t0, 15
     sll     t3, t3, 1
     addiu   t3, t3, 1    // t3 = sign (even)
     mul     t0, t0, t3   // t0 = |x| (even)
     lh      t4, 384(a1)  // s (even)
     lh      t5, 130(a1)  // c (odd)
     lh      t6, 2(a2)    // x (odd)
     lh      t7, 2(a1)    // m (odd)
     lh      t8, 386(a1)  // s (odd)

 1:
     // Finish the even element of the current pair.
     andi    t1, 0xffff
     add     t9, t0, t2
     andi    t9, 0xffff
     mul     v1, t9, t1
     sra     s0, t6, 15
     sll     s0, s0, 1
     addiu   s0, s0, 1    // s0 = sign (odd)
     addiu   t9, t4, 16
     srav    v1, v1, t9
     mul     v1, v1, t3   // restore the sign
     mul     t6, t6, s0   // t6 = |x| (odd)
     andi    t7, 0xffff
     addiu   a2, a2, 4    // advance to the next pair
     addiu   a1, a1, 4
     add     s1, t6, t5
     andi    s1, 0xffff
     sh      v1, 0(a0)

     // Finish the odd element and load the operands of the next pair.
     // t0/t3 for the next pair are computed exactly once, after the store
     // to 2(a0) (the original code computed them a second time here and
     // discarded the first result).
     mul     s2, s1, t7
     addiu   s1, t8, 16
     srav    s2, s2, s1
     mul     s2, s2, s0   // restore the sign
     lh      t1, 0(a1)    // m (even)
     lh      t2, 128(a1)  // c (even)
     lh      t4, 384(a1)  // s (even)
     lh      t5, 130(a1)  // c (odd)
     lh      t8, 386(a1)  // s (odd)
     lh      t6, 2(a2)    // x (odd)
     lh      t7, 2(a1)    // m (odd)
     sh      s2, 2(a0)
     lh      t0, 0(a2)    // x (even)
     sra     t3, t0, 15
     sll     t3, t3, 1
     addiu   t3, t3, 1    // t3 = sign (even)
     mul     t0, t0, t3   // t0 = |x| (even)
     bne     a2, v0, 1b
      addiu  a0, a0, 4    // (delay slot) next output pair

     // Epilogue: finish the last pair, whose operands are already loaded.
     andi    t1, 0xffff
     add     t9, t0, t2
     andi    t9, 0xffff
     mul     v1, t9, t1
     sra     s0, t6, 15
     sll     s0, s0, 1
     addiu   s0, s0, 1
     addiu   t9, t4, 16
     srav    v1, v1, t9
     mul     v1, v1, t3
     mul     t6, t6, s0
     andi    t7, 0xffff
     sh      v1, 0(a0)
     add     s1, t6, t5
     andi    s1, 0xffff
     mul     s2, s1, t7
     addiu   s1, t8, 16
     addiu   a2, a2, 4
     addiu   a1, a1, 4
     srav    s2, s2, s1
     mul     s2, s2, s0
     sh      s2, 2(a0)

     RESTORE_REGS_FROM_STACK 16, s0, s1, s2

     j       ra
      nop

 END(jsimd_quantize_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
 /*
  * a0     - coef_block
  * a1     - divisors
  * a2     - workspace
  *
  * Floating-point quantization of 64 values.
  *
  * For each element, the single-precision values workspace[i] and
  * divisors[i] are multiplied and 16384.5 is added (madd.s), the sum is
  * truncated to an integer (trunc.w.s), 16384 is subtracted again and the
  * low halfword is stored to coef_block[i].  Adding the bias before the
  * truncation and removing its integer part afterwards turns the truncation
  * into rounding for sums that stay positive.
  *
  * Eight elements are handled per iteration in two groups of four; the
  * divisors of the second group are loaded while the first group is still
  * being converted.  t0 counts 63, 55, ..., 7, -1, so the loop body runs
  * eight times.  Input pointers advance by 32 bytes and the output pointer
  * by 16 bytes per iteration.
  *
  * No registers are saved; t0-t4, f0-f8 and a0-a2 are overwritten.
  */

     .set at

     li         t1, 0x46800100     //integer representation 16384.5
     mtc1       t1, f0             // f0 = 16384.5f (rounding bias)
     li         t0, 63             // loop counter, decremented by 8 per iteration
 0:
     // First group of four: load workspace and divisor values.
     lwc1       f1, 0(a2)
     lwc1       f5, 0(a1)
     lwc1       f2, 4(a2)
     lwc1       f6, 4(a1)
     lwc1       f3, 8(a2)
     lwc1       f7, 8(a1)
     lwc1       f4, 12(a2)
     lwc1       f8, 12(a1)
     madd.s     f1, f0, f1, f5     // f1 = f1 * f5 + 16384.5
     madd.s     f2, f0, f2, f6
     madd.s     f3, f0, f3, f7
     madd.s     f4, f0, f4, f8
     lwc1       f5, 16(a1)         // divisors for the second group
     lwc1       f6, 20(a1)
     trunc.w.s  f1, f1             // convert to integer, truncating
     trunc.w.s  f2, f2
     trunc.w.s  f3, f3
     trunc.w.s  f4, f4
     lwc1       f7, 24(a1)
     lwc1       f8, 28(a1)
     mfc1       t1, f1
     mfc1       t2, f2
     mfc1       t3, f3
     mfc1       t4, f4
     // Second group of four: load workspace values and start the multiply.
     lwc1       f1, 16(a2)
     lwc1       f2, 20(a2)
     lwc1       f3, 24(a2)
     lwc1       f4, 28(a2)
     madd.s     f1, f0, f1, f5
     madd.s     f2, f0, f2, f6
     madd.s     f3, f0, f3, f7
     madd.s     f4, f0, f4, f8
     addiu      t1, t1, -16384     // remove the integer part of the bias (first group)
     addiu      t2, t2, -16384
     addiu      t3, t3, -16384
     addiu      t4, t4, -16384
     trunc.w.s  f1, f1
     trunc.w.s  f2, f2
     trunc.w.s  f3, f3
     trunc.w.s  f4, f4
     sh         t1, 0(a0)          // store first group
     sh         t2, 2(a0)
     sh         t3, 4(a0)
     sh         t4, 6(a0)
     mfc1       t1, f1
     mfc1       t2, f2
     mfc1       t3, f3
     mfc1       t4, f4
     addiu      t0, t0, -8
     addiu      a2, a2, 32         // next eight workspace values
     addiu      a1, a1, 32         // next eight divisors
     addiu      t1, t1, -16384     // remove the integer part of the bias (second group)
     addiu      t2, t2, -16384
     addiu      t3, t3, -16384
     addiu      t4, t4, -16384
     sh         t1, 8(a0)          // store second group
     sh         t2, 10(a0)
     sh         t3, 12(a0)
     sh         t4, 14(a0)
     bgez       t0, 0b
      addiu     a0, a0, 16         // (delay slot) next eight output halfwords

     j          ra
      nop

 END(jsimd_quantize_float_mips_dspr2)
 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
 /*
  * a0     - compptr->dct_table
  * a1     - coef_block
  * a2     - output_buf
  * a3     - output_col
  *
  * Reduced-size inverse DCT producing a 2x2 block of output samples.
  *
  * Pass 1 is fully unrolled over input columns 0, 1, 3, 5 and 7 (byte
  * offsets 0, 2, 6, 10, 14); columns 2, 4 and 6 are never read.  For each
  * column it dequantizes rows 0, 1, 3, 5 and 7, forms
  *     tmp10 = row0 << 15
  *     tmp0  = row1*29692 - row3*10426 + row5*6967 - row7*5906
  * and stores (tmp10 + tmp0) and (tmp10 - tmp0), each rounded and shifted
  * right by 13, into a 40-byte scratch area on the stack:
  *     bytes  0..19 : workspace row 0, one word per processed column
  *     bytes 20..39 : workspace row 1, one word per processed column
  *
  * Pass 2 applies the same weights across each workspace row, rounds and
  * shifts right by 20, saturates to a signed 8-bit range, adds 128 and stores
  * two bytes to output_buf[0] + output_col and output_buf[1] + output_col.
  *
  * The odd-row products are packed as two 16-bit halves per register so
  * that one dpa.w.ph performs two multiply-accumulates; only the low 16 bits
  * of each dequantized value take part in those products.
  */
     .set at

     SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

     addiu     sp, sp, -40       // reserve the 2x5-word scratch workspace
     move      v0, sp            // v0 = workspace base
     // scalar copies of the four weights, used by mult/madd in pass 2
     addiu     s2, zero, 29692
     addiu     s3, zero, -10426
     addiu     s4, zero, 6967
     addiu     s5, zero, -5906
     // ---- pass 1, column 0 ----
     lh        t0, 0(a1)         // t0 = inptr[DCTSIZE*0]
     lh        t5, 0(a0)         // t5 = quantptr[DCTSIZE*0]
     lh        t1, 48(a1)        // t1 = inptr[DCTSIZE*3]
     lh        t6, 48(a0)        // t6 = quantptr[DCTSIZE*3]
     mul       t4, t5, t0        // t4 = dequantized row 0
     lh        t0, 16(a1)        // t0 = inptr[DCTSIZE*1]
     lh        t5, 16(a0)        // t5 = quantptr[DCTSIZE*1]
     mul       t6, t6, t1        // t6 = dequantized row 3
     mul       t5, t5, t0        // t5 = dequantized row 1
     lh        t2, 80(a1)        // t2 = inptr[DCTSIZE*5]
     lh        t7, 80(a0)        // t7 = quantptr[DCTSIZE*5]
     lh        t3, 112(a1)       // t3 = inptr[DCTSIZE*7]
     lh        t8, 112(a0)       // t8 = quantptr[DCTSIZE*7]
     mul       t7, t7, t2        // t7 = dequantized row 5
     mult      zero, zero        // clear the default accumulator ($ac0)
     mul       t8, t8, t3        // t8 = dequantized row 7
     li        s0, 0x73FCD746    // s0 = (29692 << 16) | (-10426 & 0xffff)
     li        s1, 0x1B37E8EE    // s1 = (6967 << 16) | (-5906 & 0xffff)
     ins       t6, t5, 16, 16    // t6 = (row1 << 16) | (row3 & 0xffff)
     sll       t4, t4, 15        // tmp10 = row0 << 15
     dpa.w.ph  $ac0, t6, s0      // ac0 += row1*29692 + row3*(-10426)
     lh        t1, 2(a1)         // prefetch column 1, row 0 coefficient
     lh        t6, 2(a0)         // ... and its quantizer
     ins       t8, t7, 16, 16    // t8 = (row5 << 16) | (row7 & 0xffff)
     dpa.w.ph  $ac0, t8, s1      // ac0 += row5*6967 + row7*(-5906)
     mflo      t0, $ac0          // t0 = tmp0 for column 0
     // ---- pass 1, column 1 (loads interleaved with column 0 output) ----
     mul       t5, t6, t1        // t5 = dequantized row 0 of column 1
     lh        t1, 18(a1)
     lh        t6, 18(a0)
     lh        t2, 50(a1)
     lh        t7, 50(a0)
     mul       t6, t6, t1        // row 1
     subu      t8, t4, t0        // column 0: tmp10 - tmp0
     mul       t7, t7, t2        // row 3
     addu      t0, t4, t0        // column 0: tmp10 + tmp0
     shra_r.w  t0, t0, 13        // rounding shift right by 13
     lh        t1, 82(a1)
     lh        t2, 82(a0)
     lh        t3, 114(a1)
     lh        t4, 114(a0)
     shra_r.w  t8, t8, 13
     mul       t1, t1, t2        // row 5
     mul       t3, t3, t4        // row 7
     sw        t0, 0(v0)         // workspace row 0, slot 0
     sw        t8, 20(v0)        // workspace row 1, slot 0
     sll       t4, t5, 15        // tmp10 for column 1
     ins       t7, t6, 16, 16    // (row1 << 16) | row3
     mult      zero, zero        // clear $ac0
     dpa.w.ph  $ac0, t7, s0
     ins       t3, t1, 16, 16    // (row5 << 16) | row7
     lh        t1, 6(a1)         // prefetch column 3, row 0
     lh        t6, 6(a0)
     dpa.w.ph  $ac0, t3, s1
     mflo      t0, $ac0          // t0 = tmp0 for column 1
     // ---- pass 1, column 3 ----
     mul       t5, t6, t1
     lh        t1, 22(a1)
     lh        t6, 22(a0)
     lh        t2, 54(a1)
     lh        t7, 54(a0)
     mul       t6, t6, t1
     subu      t8, t4, t0        // column 1: tmp10 - tmp0
     mul       t7, t7, t2
     addu      t0, t4, t0        // column 1: tmp10 + tmp0
     shra_r.w  t0, t0, 13
     lh        t1, 86(a1)
     lh        t2, 86(a0)
     lh        t3, 118(a1)
     lh        t4, 118(a0)
     shra_r.w  t8, t8, 13
     mul       t1, t1, t2
     mul       t3, t3, t4
     sw        t0, 4(v0)         // workspace row 0, slot 1
     sw        t8, 24(v0)        // workspace row 1, slot 1
     sll       t4, t5, 15        // tmp10 for column 3
     ins       t7, t6, 16, 16
     mult      zero, zero        // clear $ac0
     dpa.w.ph  $ac0, t7, s0
     ins       t3, t1, 16, 16
     lh        t1, 10(a1)        // prefetch column 5, row 0
     lh        t6, 10(a0)
     dpa.w.ph  $ac0, t3, s1
     mflo      t0, $ac0          // t0 = tmp0 for column 3
     // ---- pass 1, column 5 ----
     mul       t5, t6, t1
     lh        t1, 26(a1)
     lh        t6, 26(a0)
     lh        t2, 58(a1)
     lh        t7, 58(a0)
     mul       t6, t6, t1
     subu      t8, t4, t0        // column 3: tmp10 - tmp0
     mul       t7, t7, t2
     addu      t0, t4, t0        // column 3: tmp10 + tmp0
     shra_r.w  t0, t0, 13
     lh        t1, 90(a1)
     lh        t2, 90(a0)
     lh        t3, 122(a1)
     lh        t4, 122(a0)
     shra_r.w  t8, t8, 13
     mul       t1, t1, t2
     mul       t3, t3, t4
     sw        t0, 8(v0)         // workspace row 0, slot 2
     sw        t8, 28(v0)        // workspace row 1, slot 2
     sll       t4, t5, 15        // tmp10 for column 5
     ins       t7, t6, 16, 16
     mult      zero, zero        // clear $ac0
     dpa.w.ph  $ac0, t7, s0
     ins       t3, t1, 16, 16
     lh        t1, 14(a1)        // prefetch column 7, row 0
     lh        t6, 14(a0)
     dpa.w.ph  $ac0, t3, s1
     mflo      t0, $ac0          // t0 = tmp0 for column 5
     // ---- pass 1, column 7 ----
     mul       t5, t6, t1
     lh        t1, 30(a1)
     lh        t6, 30(a0)
     lh        t2, 62(a1)
     lh        t7, 62(a0)
     mul       t6, t6, t1
     subu      t8, t4, t0        // column 5: tmp10 - tmp0
     mul       t7, t7, t2
     addu      t0, t4, t0        // column 5: tmp10 + tmp0
     shra_r.w  t0, t0, 13
     lh        t1, 94(a1)
     lh        t2, 94(a0)
     lh        t3, 126(a1)
     lh        t4, 126(a0)
     shra_r.w  t8, t8, 13
     mul       t1, t1, t2
     mul       t3, t3, t4
     sw        t0, 12(v0)        // workspace row 0, slot 3
     sw        t8, 32(v0)        // workspace row 1, slot 3
     sll       t4, t5, 15        // tmp10 for column 7
     ins       t7, t6, 16, 16
     mult      zero, zero        // clear $ac0
     dpa.w.ph  $ac0, t7, s0
     ins       t3, t1, 16, 16
     dpa.w.ph  $ac0, t3, s1
     mflo      t0, $ac0          // t0 = tmp0 for column 7
     // ---- pass 2, row 0 (loads overlap the column 7 write-back) ----
     lw        t9, 0(a2)         // t9 = output_buf[0]
     lw        t3, 0(v0)         // workspace row 0, slot 0 (column 0)
     lw        t7, 4(v0)         // slot 1 (column 1)
     lw        t1, 8(v0)         // slot 2 (column 3)
     addu      t9, t9, a3        // outptr = output_buf[0] + output_col
     sll       t3, t3, 15        // tmp10 = slot0 << 15
     subu      t8, t4, t0        // column 7: tmp10 - tmp0
     addu      t0, t4, t0        // column 7: tmp10 + tmp0
     shra_r.w  t0, t0, 13
     shra_r.w  t8, t8, 13
     sw        t0, 16(v0)        // workspace row 0, slot 4
     sw        t8, 36(v0)        // workspace row 1, slot 4
     lw        t5, 12(v0)        // slot 3 (column 5)
     lw        t6, 16(v0)        // slot 4 (column 7)
     mult      t7, s2            // ac0  = slot1 * 29692
     madd      t1, s3            // ac0 += slot2 * (-10426)
     madd      t5, s4            // ac0 += slot3 * 6967
     madd      t6, s5            // ac0 += slot4 * (-5906)
     lw        t5, 24(v0)        // workspace row 1, slot 1
     lw        t7, 28(v0)        // row 1, slot 2
     mflo      t0, $ac0          // t0 = tmp0 for output row 0
     lw        t8, 32(v0)        // row 1, slot 3
     lw        t2, 36(v0)        // row 1, slot 4
     mult      $ac1, t5, s2      // ac1 accumulates tmp0 for output row 1
     madd      $ac1, t7, s3
     madd      $ac1, t8, s4
     madd      $ac1, t2, s5
     addu      t1, t3, t0        // row 0: tmp10 + tmp0
     subu      t6, t3, t0        // row 0: tmp10 - tmp0
     shra_r.w  t1, t1, 20        // rounding shift right by 20
     shra_r.w  t6, t6, 20
     mflo      t4, $ac1          // t4 = tmp0 for output row 1
     shll_s.w  t1, t1, 24        // saturating shift left then arithmetic
     shll_s.w  t6, t6, 24        // shift right clamps to signed 8 bits
     sra       t1, t1, 24
     sra       t6, t6, 24
     addiu     t1, t1, 128       // re-centre to the unsigned sample range
     addiu     t6, t6, 128
     lw        t0, 20(v0)        // workspace row 1, slot 0
     sb        t1, 0(t9)         // output row 0, sample 0
     sb        t6, 1(t9)         // output row 0, sample 1
     // ---- pass 2, row 1 ----
     sll       t0, t0, 15        // tmp10 for output row 1
     lw        t9, 4(a2)         // t9 = output_buf[1]
     addu      t1, t0, t4        // tmp10 + tmp0
     subu      t6, t0, t4        // tmp10 - tmp0
     addu      t9, t9, a3        // outptr = output_buf[1] + output_col
     shra_r.w  t1, t1, 20
     shra_r.w  t6, t6, 20
     shll_s.w  t1, t1, 24
     shll_s.w  t6, t6, 24
     sra       t1, t1, 24
     sra       t6, t6, 24
     addiu     t1, t1, 128
     addiu     t6, t6, 128
     sb        t1, 0(t9)         // output row 1, sample 0
     sb        t6, 1(t9)         // output row 1, sample 1
     addiu     sp, sp, 40        // release the scratch workspace

     RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

     j         ra
      nop

 END(jsimd_idct_2x2_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
 /*
  * a0     - compptr->dct_table
  * a1     - coef_block
  * a2     - output_buf
  * a3     - output_col
  * 16(sp) - workspace[DCTSIZE*4];  // buffers data between passes
  *
  * Reduced-size inverse DCT producing a 4x4 block of output samples.
  *
  * Pass 1 walks the input columns in two loops: loop 0 handles columns
  * 0..3 and loop 1 handles columns 5..7 (column 4 is never read).  Each
  * column yields four 32-bit results that are stored in the workspace at
  * a row stride of 32 bytes (8 words per row, 4 rows).
  *
  * Pass 2 is unrolled four times, once per workspace row, and writes four
  * samples to output_buf[row] + output_col.
  *
  * s0..s3 hold pairs of 16-bit multipliers for dpa.w.ph:
  *     s0 = ( 11893 << 16) | (-1730  & 0xffff)   applied to (row5, row7)
  *     s1 = (  8697 << 16) | (-17799 & 0xffff)   applied to (row1, row3)
  *     s2 = ( -4926 << 16) | (-4176  & 0xffff)   applied to (row5, row7)
  *     s3 = ( 20995 << 16) | ( 7373  & 0xffff)   applied to (row1, row3)
  * $ac0 (s0, s1) gives the odd-part term called temp1 below and $ac1
  * (s2, s3) the term called temp2.
  *
  * NOTE(review): pass 2 reads the odd workspace entries with lh at the word
  * address, which yields the low 16 bits only on a little-endian target --
  * confirm this file is never built big-endian.
  */

     .set at
     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     lw        v1, 48(sp)        // v1 = workspace (caller's 16(sp); presumably
                                 // sp was lowered by 32 in the macro above)
     move      t0, a1            // t0 = inptr
     move      t1, v1            // t1 = wsptr
     li        t9, 4             // loop 0 processes 4 columns
     li        s0, 0x2e75f93e
     li        s1, 0x21f9ba79
     li        s2, 0xecc2efb0
     li        s3, 0x52031ccd

 0:
     lh        s6, 32(t0)        // inptr[DCTSIZE*2]
     lh        t6, 32(a0)        // quantptr[DCTSIZE*2]
     lh        s7, 96(t0)        // inptr[DCTSIZE*6]
     lh        t7, 96(a0)        // quantptr[DCTSIZE*6]
     mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
     lh        s4, 0(t0)         // inptr[DCTSIZE*0]
     mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
     lh        s5, 0(a0)         // quantptr[0]
     li        s6, 15137         // FIX_1_847759065
     li        s7, 6270          // FIX_0_765366865
     mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
     mul       t6, s6, t6        // MULTIPLY(z2, FIX_1_847759065)
     lh        t5, 112(t0)       // inptr[DCTSIZE*7]
     mul       t7, s7, t7        // MULTIPLY(z3, FIX_0_765366865)
     lh        s4, 112(a0)       // quantptr[DCTSIZE*7]
     lh        v0, 80(t0)        // inptr[DCTSIZE*5]
     lh        s5, 80(a0)        // quantptr[DCTSIZE*5]
     lh        s6, 48(a0)        // quantptr[DCTSIZE*3]
     sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
     lh        s7, 16(a0)        // quantptr[DCTSIZE*1]
     lh        t8, 16(t0)        // inptr[DCTSIZE*1]
     subu      t6, t6, t7        // tmp2 = z2 * FIX_1_847759065 - z3 * FIX_0_765366865
     lh        t7, 48(t0)        // inptr[DCTSIZE*3]
     mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
     mul       v0, s5, v0        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
     mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
     mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
     addu      t3, t2, t6        // tmp10 = tmp0 + tmp2
     subu      t4, t2, t6        // tmp12 = tmp0 - tmp2
     mult      $ac0, zero, zero  // clear both accumulators
     mult      $ac1, zero, zero
     ins       t5, v0, 16, 16    // t5 = (z2 << 16) | (z1 & 0xffff)
     ins       t7, t8, 16, 16    // t7 = (z4 << 16) | (z3 & 0xffff)
     addiu     t9, t9, -1
     dpa.w.ph  $ac0, t5, s0      // ac0 = odd term temp1
     dpa.w.ph  $ac0, t7, s1
     dpa.w.ph  $ac1, t5, s2      // ac1 = odd term temp2
     dpa.w.ph  $ac1, t7, s3
     mflo      s4, $ac0          // s4 = temp1
     mflo      s5, $ac1          // s5 = temp2
     addiu     a0, a0, 2         // next quantptr column
     addiu     t1, t1, 4         // next wsptr column (stores below compensate)
     addiu     t0, t0, 2         // next inptr column
     addu      t6, t4, s4
     subu      t5, t4, s4
     addu      s6, t3, s5
     subu      s7, t3, s5
     shra_r.w  t6, t6, 12        // DESCALE(tmp12 + temp1, 12)
     shra_r.w  t5, t5, 12        // DESCALE(tmp12 - temp1, 12)
     shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
     shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
     sw        t6, 28(t1)        // workspace row 1 (t1 already advanced by 4)
     sw        t5, 60(t1)        // workspace row 2
     sw        s6, -4(t1)        // workspace row 0
     bgtz      t9, 0b
      sw       s7, 92(t1)        // (delay slot) workspace row 3
     // Second loop: three passes for columns 5, 6 and 7.  The pointers now
     // address column 4, so every offset below is 2 bytes (one column)
     // larger than in the first loop, which skips column 4.
     li        t9, 3
 1:
     lh        s6, 34(t0)        // inptr[DCTSIZE*2]
     lh        t6, 34(a0)        // quantptr[DCTSIZE*2]
     lh        s7, 98(t0)        // inptr[DCTSIZE*6]
     lh        t7, 98(a0)        // quantptr[DCTSIZE*6]
     mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
     lh        s4, 2(t0)         // inptr[DCTSIZE*0]
     mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
     lh        s5, 2(a0)         // quantptr[DCTSIZE*0]
     li        s6, 15137         // FIX_1_847759065
     li        s7, 6270          // FIX_0_765366865
     mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
     mul       v0, s6, t6        // MULTIPLY(z2, FIX_1_847759065)
     lh        t5, 114(t0)       // inptr[DCTSIZE*7]
     mul       t7, s7, t7        // MULTIPLY(z3, FIX_0_765366865)
     lh        s4, 114(a0)       // quantptr[DCTSIZE*7]
     lh        s5, 82(a0)        // quantptr[DCTSIZE*5]
     lh        t6, 82(t0)        // inptr[DCTSIZE*5]
     sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
     lh        s6, 50(a0)        // quantptr[DCTSIZE*3]
     lh        t8, 18(t0)        // inptr[DCTSIZE*1]
     subu      v0, v0, t7        // tmp2 = z2 * FIX_1_847759065 - z3 * FIX_0_765366865
     lh        t7, 50(t0)        // inptr[DCTSIZE*3]
     lh        s7, 18(a0)        // quantptr[DCTSIZE*1]
     mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
     mul       t6, s5, t6        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
     mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
     mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
     addu      t3, t2, v0        // tmp10 = tmp0 + tmp2
     subu      t4, t2, v0        // tmp12 = tmp0 - tmp2
     mult      $ac0, zero, zero  // clear both accumulators
     mult      $ac1, zero, zero
     ins       t5, t6, 16, 16    // t5 = (z2 << 16) | (z1 & 0xffff)
     ins       t7, t8, 16, 16    // t7 = (z4 << 16) | (z3 & 0xffff)
     dpa.w.ph  $ac0, t5, s0      // ac0 = odd term temp1
     dpa.w.ph  $ac0, t7, s1
     dpa.w.ph  $ac1, t5, s2      // ac1 = odd term temp2
     dpa.w.ph  $ac1, t7, s3
     mflo      t5, $ac0          // t5 = temp1
     mflo      t6, $ac1          // t6 = temp2
     addiu     t9, t9, -1
     addiu     t0, t0, 2
     addiu     a0, a0, 2
     addiu     t1, t1, 4
     addu      s5, t4, t5
     subu      s4, t4, t5
     addu      s6, t3, t6
     subu      s7, t3, t6
     shra_r.w  s5, s5, 12        // DESCALE(tmp12 + temp1, 12)
     shra_r.w  s4, s4, 12        // DESCALE(tmp12 - temp1, 12)
     shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
     shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
     sw        s5, 32(t1)        // workspace row 1
     sw        s4, 64(t1)        // workspace row 2
     sw        s6, 0(t1)         // workspace row 0
     bgtz      t9, 1b
      sw       s7, 96(t1)        // (delay slot) workspace row 3
     // Pass 2, output row 0 (workspace bytes 0..31)
     move      t1, v1            // rewind wsptr to the workspace base
     li        s4, 15137
     lw        s6, 8(t1)         // wsptr[2]
     li        s5, 6270
     lw        s7, 24(t1)        // wsptr[6]
     mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
     lw        t2, 0(t1)         // wsptr[0]
     mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], FIX_0_765366865), subtracted below
     lh        t5, 28(t1)        // wsptr[7]
     lh        t6, 20(t1)        // wsptr[5]
     lh        t7, 12(t1)        // wsptr[3]
     lh        t8, 4(t1)         // wsptr[1]
     ins       t5, t6, 16, 16    // t5 = (wsptr[5] << 16) | (wsptr[7] & 0xffff)
     ins       t7, t8, 16, 16    // t7 = (wsptr[1] << 16) | (wsptr[3] & 0xffff)
     mult      $ac0, zero, zero
     dpa.w.ph  $ac0, t5, s0
     dpa.w.ph  $ac0, t7, s1
     mult      $ac1, zero, zero
     dpa.w.ph  $ac1, t5, s2
     dpa.w.ph  $ac1, t7, s3
     sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
     mflo      s6, $ac0          // s6 = temp1
     // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
     subu      s4, s4, s5
     addu      t3, t2, s4        // tmp10 = tmp0 + tmp2
     mflo      s7, $ac1          // s7 = temp2
     subu      t4, t2, s4        // tmp12 = tmp0 - tmp2
     addu      t7, t4, s6
     subu      t8, t4, s6
     addu      t5, t3, s7
     subu      t6, t3, s7
     shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
     shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
     shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
     shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
     sll       s4, t9, 2         // result unused: s4 is reloaded before it is read
     lw        v0, 0(a2)         // output_buf[ctr]
     shll_s.w  t5, t5, 24        // saturating shift left + arithmetic shift
     shll_s.w  t6, t6, 24        // right clamps each value to signed 8 bits
     shll_s.w  t7, t7, 24
     shll_s.w  t8, t8, 24
     sra       t5, t5, 24
     sra       t6, t6, 24
     sra       t7, t7, 24
     sra       t8, t8, 24
     addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
     addiu     t5, t5, 128       // re-centre to the unsigned sample range
     addiu     t6, t6, 128
     addiu     t7, t7, 128
     addiu     t8, t8, 128
     sb        t5, 0(v0)         // outptr[0] = tmp10 + temp2
     sb        t7, 1(v0)         // outptr[1] = tmp12 + temp1
     sb        t8, 2(v0)         // outptr[2] = tmp12 - temp1
     sb        t6, 3(v0)         // outptr[3] = tmp10 - temp2
     // 2: output row 1 (workspace bytes 32..63)
     li        s4, 15137
     lw        s6, 40(t1)        // wsptr[2]
     li        s5, 6270
     lw        s7, 56(t1)        // wsptr[6]
     mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
     lw        t2, 32(t1)        // wsptr[0]
     mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], FIX_0_765366865), subtracted below
     lh        t5, 60(t1)        // wsptr[7]
     lh        t6, 52(t1)        // wsptr[5]
     lh        t7, 44(t1)        // wsptr[3]
     lh        t8, 36(t1)        // wsptr[1]
     ins       t5, t6, 16, 16
     ins       t7, t8, 16, 16
     mult      $ac0, zero, zero
     dpa.w.ph  $ac0, t5, s0
     dpa.w.ph  $ac0, t7, s1
     mult      $ac1, zero, zero
     dpa.w.ph  $ac1, t5, s2
     dpa.w.ph  $ac1, t7, s3
     sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
     mflo      s6, $ac0          // s6 = temp1
     // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
     subu      s4, s4, s5
     addu      t3, t2, s4        // tmp10 = tmp0 + tmp2
     mflo      s7, $ac1          // s7 = temp2
     subu      t4, t2, s4        // tmp12 = tmp0 - tmp2
     addu      t7, t4, s6
     subu      t8, t4, s6
     addu      t5, t3, s7
     subu      t6, t3, s7
     shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
     shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
     shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
     shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
     sll       s4, t9, 2         // result unused: s4 is reloaded before it is read
     lw        v0, 4(a2)         // output_buf[ctr]
     shll_s.w  t5, t5, 24
     shll_s.w  t6, t6, 24
     shll_s.w  t7, t7, 24
     shll_s.w  t8, t8, 24
     sra       t5, t5, 24
     sra       t6, t6, 24
     sra       t7, t7, 24
     sra       t8, t8, 24
     addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
     addiu     t5, t5, 128
     addiu     t6, t6, 128
     addiu     t7, t7, 128
     addiu     t8, t8, 128
     sb        t5, 0(v0)
     sb        t7, 1(v0)
     sb        t8, 2(v0)
     sb        t6, 3(v0)
     // 3: output row 2 (workspace bytes 64..95)
     li        s4, 15137
     lw        s6, 72(t1)        // wsptr[2]
     li        s5, 6270
     lw        s7, 88(t1)        // wsptr[6]
     mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
     lw        t2, 64(t1)        // wsptr[0]
     mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], FIX_0_765366865), subtracted below
     lh        t5, 92(t1)        // wsptr[7]
     lh        t6, 84(t1)        // wsptr[5]
     lh        t7, 76(t1)        // wsptr[3]
     lh        t8, 68(t1)        // wsptr[1]
     ins       t5, t6, 16, 16
     ins       t7, t8, 16, 16
     mult      $ac0, zero, zero
     dpa.w.ph  $ac0, t5, s0
     dpa.w.ph  $ac0, t7, s1
     mult      $ac1, zero, zero
     dpa.w.ph  $ac1, t5, s2
     dpa.w.ph  $ac1, t7, s3
     sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
     mflo      s6, $ac0          // s6 = temp1
     // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
     subu      s4, s4, s5
     addu      t3, t2, s4        // tmp10 = tmp0 + tmp2
     mflo      s7, $ac1          // s7 = temp2
     subu      t4, t2, s4        // tmp12 = tmp0 - tmp2
     addu      t7, t4, s6
     subu      t8, t4, s6
     addu      t5, t3, s7
     subu      t6, t3, s7
     shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
     shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
     shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
     shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
     sll       s4, t9, 2         // result unused: s4 is reloaded before it is read
     lw        v0, 8(a2)         // output_buf[ctr]
     shll_s.w  t5, t5, 24
     shll_s.w  t6, t6, 24
     shll_s.w  t7, t7, 24
     shll_s.w  t8, t8, 24
     sra       t5, t5, 24
     sra       t6, t6, 24
     sra       t7, t7, 24
     sra       t8, t8, 24
     addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
     addiu     t5, t5, 128
     addiu     t6, t6, 128
     addiu     t7, t7, 128
     addiu     t8, t8, 128
     sb        t5, 0(v0)
     sb        t7, 1(v0)
     sb        t8, 2(v0)
     sb        t6, 3(v0)
     // 4: output row 3 (workspace bytes 96..127)
     li        s4, 15137
     lw        s6, 104(t1)       // wsptr[2]
     li        s5, 6270
     lw        s7, 120(t1)       // wsptr[6]
     mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
     lw        t2, 96(t1)        // wsptr[0]
     mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], FIX_0_765366865), subtracted below
     lh        t5, 124(t1)       // wsptr[7]
     lh        t6, 116(t1)       // wsptr[5]
     lh        t7, 108(t1)       // wsptr[3]
     lh        t8, 100(t1)       // wsptr[1]
     ins       t5, t6, 16, 16
     ins       t7, t8, 16, 16
     mult      $ac0, zero, zero
     dpa.w.ph  $ac0, t5, s0
     dpa.w.ph  $ac0, t7, s1
     mult      $ac1, zero, zero
     dpa.w.ph  $ac1, t5, s2
     dpa.w.ph  $ac1, t7, s3
     sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
     mflo      s6, $ac0          // s6 = temp1
     // tmp2 = MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
     subu      s4, s4, s5
     addu      t3, t2, s4        // tmp10 = tmp0 + tmp2
     mflo      s7, $ac1          // s7 = temp2
     subu      t4, t2, s4        // tmp12 = tmp0 - tmp2
     addu      t7, t4, s6
     subu      t8, t4, s6
     addu      t5, t3, s7
     subu      t6, t3, s7
     shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
     shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
     shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
     shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
     sll       s4, t9, 2         // result unused: s4 is not read again below
     lw        v0, 12(a2)        // output_buf[ctr]
     shll_s.w  t5, t5, 24
     shll_s.w  t6, t6, 24
     shll_s.w  t7, t7, 24
     shll_s.w  t8, t8, 24
     sra       t5, t5, 24
     sra       t6, t6, 24
     sra       t7, t7, 24
     sra       t8, t8, 24
     addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
     addiu     t5, t5, 128
     addiu     t6, t6, 128
     addiu     t7, t7, 128
     addiu     t8, t8, 128
     sb        t5, 0(v0)
     sb        t7, 1(v0)
     sb        t8, 2(v0)
     sb        t6, 3(v0)

     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     j         ra
      nop
 END(jsimd_idct_4x4_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
 /*
  * a0     - compptr->dct_table
  * a1     - coef_block
  * a2     - output_buf
  * a3     - output_col
  *
  * Inverse DCT producing a 6x6 block of output samples.
  *
  * Pass 1 (label 1) loops over 6 input columns, reading rows 0..5 of each
  * (16 bytes per input row), and stores six 32-bit results per column into a
  * 144-byte scratch workspace on the stack laid out as 6 rows of 6 words
  * (row stride 24 bytes).
  *
  * Pass 2 (label 2) loops over the 6 workspace rows and writes six samples
  * to output_buf[row] + output_col, saturated to signed 8 bits and offset by
  * 128.
  *
  * Loop-invariant multipliers: t9 = 5793, s0 = 10033, s1 = 2998.
  */
     .set at

     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     addiu     sp, sp, -144      // reserve the 6x6-word workspace
     move      v0, sp            // v0 = wsptr (advances one column per pass)
     addiu     v1, v0, 24        // v1 = end marker: 6 columns * 4 bytes
     addiu     t9, zero, 5793
     addiu     s0, zero, 10033
     addiu     s1, zero, 2998

 1:
     lh        s2, 0(a0)   // q0 = quantptr[ 0]
     lh        s3, 32(a0)  // q1 = quantptr[16]
     lh        s4, 64(a0)  // q2 = quantptr[32]
     lh        t2, 64(a1)  // tmp2 = inptr[32]
     lh        t1, 32(a1)  // tmp1 = inptr[16]
     lh        t0, 0(a1)   // tmp0 = inptr[ 0]
     mul       t2, t2, s4  // tmp2 = tmp2 * q2
     mul       t1, t1, s3  // tmp1 = tmp1 * q1
     mul       t0, t0, s2  // tmp0 = tmp0 * q0
     lh        t6, 16(a1)  // z1 = inptr[ 8]
     lh        t8, 80(a1)  // z3 = inptr[40]
     lh        t7, 48(a1)  // z2 = inptr[24]
     lh        s2, 16(a0)  // q0 = quantptr[ 8]
     lh        s4, 80(a0)  // q2 = quantptr[40]
     lh        s3, 48(a0)  // q1 = quantptr[24]
     mul       t2, t2, t9  // tmp2 = tmp2 * 5793
     mul       t1, t1, s0  // tmp1 = tmp1 * 10033
     sll       t0, t0, 13  // tmp0 = tmp0 << 13
     mul       t6, t6, s2  // z1 = z1 * q0
     mul       t8, t8, s4  // z3 = z3 * q2
     mul       t7, t7, s3  // z2 = z2 * q1
     addu      t3, t0, t2  // tmp10 = tmp0 + tmp2
     sll       t2, t2, 1   // tmp2 = tmp2 << 1
     subu      t4, t0, t2  // tmp11 = tmp0 - tmp2;
     subu      t5, t3, t1  // tmp12 = tmp10 - tmp1
     addu      t3, t3, t1  // tmp10 = tmp10 + tmp1
     addu      t1, t6, t8  // tmp1 = z1 + z3
     mul       t1, t1, s1  // tmp1 = tmp1 * 2998
     shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
     subu      t2, t6, t8  // tmp2 = z1 - z3
     subu      t2, t2, t7  // tmp2 = tmp2 - z2
     sll       t2, t2, 2   // tmp2 = tmp2 << 2
     addu      t0, t6, t7  // tmp0 = z1 + z2
     sll       t0, t0, 13  // tmp0 = tmp0 << 13
     subu      s2, t8, t7  // q0 = z3 - z2
     sll       s2, s2, 13  // q0 = q0 << 13
     addu      t0, t0, t1  // tmp0 = tmp0 + tmp1
     addu      t1, s2, t1  // tmp1 = q0 + tmp1
     addu      s2, t4, t2  // q0 = tmp11 + tmp2
     subu      s3, t4, t2  // q1 = tmp11 - tmp2
     addu      t6, t3, t0  // z1 = tmp10 + tmp0
     subu      t7, t3, t0  // z2 = tmp10 - tmp0
     addu      t4, t5, t1  // tmp11 = tmp12 + tmp1
     subu      t5, t5, t1  // tmp12 = tmp12 - tmp1
     shra_r.w  t6, t6, 11  // z1 = (z1 + 1024) >> 11
     shra_r.w  t7, t7, 11  // z2 = (z2 + 1024) >> 11
     shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
     shra_r.w  t5, t5, 11  // tmp12 = (tmp12 + 1024) >> 11
     sw        s2, 24(v0)  // workspace row 1
     sw        s3, 96(v0)  // workspace row 4
     sw        t6, 0(v0)   // workspace row 0
     sw        t7, 120(v0) // workspace row 5
     sw        t4, 48(v0)  // workspace row 2
     sw        t5, 72(v0)  // workspace row 3
     addiu     v0, v0, 4   // next workspace column
     addiu     a1, a1, 2   // next input column
     bne       v0, v1, 1b
      addiu    a0, a0, 2   // (delay slot) next quantizer column

     /* Pass 2: process 6 rows from work array, store into output array. */
     move      v0, sp            // v0 = wsptr, rewound to row 0
     addiu     v1, v0, 144       // v1 = end of workspace (6 rows * 24 bytes)

 2:
     lw        t0, 0(v0)         // wsptr[0]
     lw        t2, 16(v0)        // wsptr[4]
     lw        s5, 0(a2)         // s5 = output_buf[row]
     addiu     t0, t0, 16        // rounding term; becomes 1 << 17 after the shift
     sll       t0, t0, 13        // tmp0 = (wsptr[0] + 16) << 13
     mul       t3, t2, t9        // tmp2 = wsptr[4] * 5793
     lw        t6, 4(v0)         // z1 = wsptr[1]
     lw        t8, 20(v0)        // z3 = wsptr[5]
     lw        t7, 12(v0)        // z2 = wsptr[3]
     addu      s5, s5, a3        // outptr = output_buf[row] + output_col
     addu      s6, t6, t8        // z1 + z3
     mul       s6, s6, s1        // s6 = (z1 + z3) * 2998
     addu      t1, t0, t3        // t1 = tmp0 + tmp2
     subu      t4, t0, t3
     subu      t4, t4, t3        // t4 = tmp0 - 2 * tmp2
     lw        t3, 8(v0)         // wsptr[2]
     mul       t0, t3, s0        // t0 = wsptr[2] * 10033
     addu      s7, t6, t7
     sll       s7, s7, 13
     addu      s7, s6, s7        // s7 = ((z1 + z2) << 13) + s6
     subu      t2, t8, t7
     sll       t2, t2, 13
     addu      t2, s6, t2        // t2 = ((z3 - z2) << 13) + s6
     subu      s6, t6, t7
     subu      s6, s6, t8
     sll       s6, s6, 13        // s6 = (z1 - z2 - z3) << 13
     addu      t3, t1, t0        // even term for outputs 0 and 5
     subu      t5, t1, t0        // even term for outputs 2 and 3
     addu      t6, t3, s7        // output 0
     subu      t3, t3, s7        // output 5
     addu      t7, t4, s6        // output 1
     subu      t4, t4, s6        // output 4
     addu      t8, t5, t2        // output 2
     subu      t5, t5, t2        // output 3
     // Saturating shift left by 6 followed by arithmetic shift right by 24:
     // a net right shift of 18 with the result clamped to signed 8 bits.
     shll_s.w  t6, t6, 6
     shll_s.w  t3, t3, 6
     shll_s.w  t7, t7, 6
     shll_s.w  t4, t4, 6
     shll_s.w  t8, t8, 6
     shll_s.w  t5, t5, 6
     sra       t6, t6, 24
     addiu     t6, t6, 128       // re-centre to the unsigned sample range
     sra       t3, t3, 24
     addiu     t3, t3, 128
     sb        t6, 0(s5)
     sra       t7, t7, 24
     addiu     t7, t7, 128
     sb        t3, 5(s5)
     sra       t4, t4, 24
     addiu     t4, t4, 128
     sb        t7, 1(s5)
     sra       t8, t8, 24
     addiu     t8, t8, 128
     sb        t4, 4(s5)
     addiu     v0, v0, 24        // next workspace row
     sra       t5, t5, 24
     addiu     t5, t5, 128
     sb        t8, 2(s5)
     addiu     a2, a2,  4        // next output_buf row pointer
     bne       v0, v1, 2b
      sb       t5, 3(s5)         // (delay slot) last sample of the row

     addiu     sp, sp, 144       // release the workspace

     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

     j         ra
      nop

 END(jsimd_idct_6x6_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
 /*
  * a0     - compptr->dct_table
  * a1     - coef_block
  * a2     - workspace
  *
  * First (column) pass of the 12x12 inverse DCT.  The loop runs 8 times,
  * once per input column.  Each iteration dequantizes the 8 coefficients of
  * the column (rows are 16 bytes apart), computes 12 results, shifts each
  * right by 11 and stores them as 32-bit words into the workspace at a row
  * stride of 32 bytes (12 rows of 8 words).
  *
  * Rounding is done by adding 1024 (1 << 10) to the DC term before the
  * plain arithmetic shifts, so sra rather than shra_r.w is used at the end.
  */

     SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

     li         a3, 8         // column counter

 1:
     // odd part
     lh         t0, 48(a1)    // inptr[DCTSIZE*3]
     lh         t1, 48(a0)    // quantptr[DCTSIZE*3]
     lh         t2, 16(a1)    // inptr[DCTSIZE*1]
     lh         t3, 16(a0)    // quantptr[DCTSIZE*1]
     lh         t4, 80(a1)    // inptr[DCTSIZE*5]
     lh         t5, 80(a0)    // quantptr[DCTSIZE*5]
     lh         t6, 112(a1)   // inptr[DCTSIZE*7]
     lh         t7, 112(a0)   // quantptr[DCTSIZE*7]
     mul        t0, t0, t1    // z2
     mul        t1, t2, t3    // z1
     mul        t2, t4, t5    // z3
     mul        t3, t6, t7    // z4
     li         t4, 10703     // FIX(1.306562965)
     li         t5, 4433      // FIX_0_541196100
     li         t6, 7053      // FIX(0.860918669)
     mul        t4, t0,t4     // tmp11
     mul        t5, t0,t5     // -tmp14
     addu       t7, t1,t2     // tmp10
     addu       t8, t7,t3     // tmp10 + z4
     mul        t6, t6, t8    // tmp15
     li         t8, 2139      // FIX(0.261052384)
     mul        t8, t7, t8    // MULTIPLY(tmp10, FIX(0.261052384))
     li         t7, 2295      // FIX(0.280143716)
     mul        t7, t1, t7    // MULTIPLY(z1, FIX(0.280143716))
     addu       t9, t2, t3    // z3 + z4
     li         s0, 8565      // FIX(1.045510580)
     mul        t9, t9, s0    // -tmp13
     li         s0, 12112     // FIX(1.478575242)
     mul        s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
     li         s1, 12998     // FIX(1.586706681)
     mul        s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
     li         s2, 5540      // FIX(0.676326758)
     mul        s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
     li         s3, 16244     // FIX(1.982889723)
     mul        s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
     subu       t1, t1, t3    // z1-=z4
     subu       t0, t0, t2    // z2-=z3
     addu       t2, t0, t1    // z1+z2
     li         t3, 4433      // FIX_0_541196100
     mul        t2, t2, t3    // z3
     li         t3, 6270      // FIX_0_765366865
     mul        t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
     li         t3, 15137     // FIX_1_847759065
     mul        t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
     addu       t8, t6, t8    // tmp12
     addu       t3, t8, t4    // tmp12 + tmp11
     addu       t3, t3, t7    // tmp10
     subu       t8, t8, t9    // tmp12 + tmp13
     addu       s0, t5, s0    // (-tmp14) + MULTIPLY(z3, FIX(1.478575242))
     subu       t8, t8, s0    // tmp12
     subu       t9, t6, t9    // tmp15 + tmp13
     subu       s1, s1, t4    // MULTIPLY(z4, FIX(1.586706681)) - tmp11
     addu       t9, t9, s1    // tmp13
     subu       t6, t6, t5    // tmp15 + tmp14
     subu       t6, t6, s2
     subu       t6, t6, s3    // tmp15
     // even part start
     lh         t4, 64(a1)
     lh         t5, 64(a0)
     lh         t7, 32(a1)
     lh         s0, 32(a0)
     lh         s1, 0(a1)
     lh         s2, 0(a0)
     lh         s3, 96(a1)
     lh         v0, 96(a0)
     mul        t4, t4, t5    // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
     mul        t5, t7, s0    // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
     mul        t7, s1, s2    // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
     mul        s0, s3, v0    // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
     // odd part end
     addu       t1, t2, t1    // tmp11
     subu       t0, t2, t0    // tmp14
     // update counter and pointers
     addiu      a3, a3, -1
     addiu      a0, a0, 2
     addiu      a1, a1, 2
     // even part rest
     li         s1, 10033     // FIX(1.224744871)
     li         s2, 11190     // FIX(1.366025404)
     mul        t4, t4, s1    // z4 = MULTIPLY(z4, FIX(1.224744871))
     mul        s1, t5, s2    // z4 = MULTIPLY(z1, FIX(1.366025404))
     sll        t5, t5, 13    // z1
     sll        t7, t7, 13
     addiu      t7, t7, 1024  // z3
     sll        s0, s0, 13    // z2
     addu       s2, t7, t4    // tmp10
     subu       t4, t7, t4    // tmp11
     subu       s3, t5, s0    // tmp12
     addu       t2, t7, s3    // tmp21
     subu       s3, t7, s3    // tmp24
     addu       t7, s1, s0    // tmp12
     addu       v0, s2, t7    // tmp20
     subu       s2, s2, t7    // tmp25
     subu       s1, s1, t5    // z4 - z1
     subu       s1, s1, s0    // tmp12
     addu       s0, t4, s1    // tmp22
     subu       t4, t4, s1    // tmp23
     // final output stage
     addu       t5, v0, t3    // tmp20 + tmp10
     subu       v0, v0, t3    // tmp20 - tmp10
     addu       t3, t2, t1    // tmp21 + tmp11
     subu       t2, t2, t1    // tmp21 - tmp11
     addu       t1, s0, t8    // tmp22 + tmp12
     subu       s0, s0, t8    // tmp22 - tmp12
     addu       t8, t4, t9    // tmp23 + tmp13
     subu       t4, t4, t9    // tmp23 - tmp13
     addu       t9, s3, t0    // tmp24 + tmp14
     subu       s3, s3, t0    // tmp24 - tmp14
     addu       t0, s2, t6    // tmp25 + tmp15
     subu       s2, s2, t6    // tmp25 - tmp15
     sra        t5, t5, 11
     sra        t3, t3, 11
     sra        t1, t1, 11
     sra        t8, t8, 11
     sra        t9, t9, 11
     sra        t0, t0, 11
     sra        s2, s2, 11
     sra        s3, s3, 11
     sra        t4, t4, 11
     sra        s0, s0, 11
     sra        t2, t2, 11
     sra        v0, v0, 11
     sw         t5, 0(a2)     // workspace row 0
     sw         t3, 32(a2)    // row 1
     sw         t1, 64(a2)    // row 2
     sw         t8, 96(a2)    // row 3
     sw         t9, 128(a2)   // row 4
     sw         t0, 160(a2)   // row 5
     sw         s2, 192(a2)   // row 6
     sw         s3, 224(a2)   // row 7
     sw         t4, 256(a2)   // row 8
     sw         s0, 288(a2)   // row 9
     sw         t2, 320(a2)   // row 10
     sw         v0, 352(a2)   // row 11
     bgtz       a3, 1b
      addiu     a2, a2, 4     // (delay slot) next workspace column

     RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

     j          ra
      nop

 END(jsimd_idct_12x12_pass1_mips_dspr2)

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
 /*
  * a0     - workspace
  * a1     - output
  *
  * Second pass of the 12x12 inverse DCT.
  *
  * The loop runs 12 times.  Each iteration
  *   - reads eight 32-bit words at 0..28(a0) and then advances a0 by 32,
  *   - combines them into 12 results using multipliers FIX(x), which are
  *     x scaled by 2^13 (the unmultiplied terms are shifted left by 13 to
  *     match),
  *   - descales each result by 18 bits with saturation, adds 0x80, and
  *     stores the low byte,
  *   - writes the 12 bytes through the pointer loaded from 0(a1) and then
  *     advances a1 by 4, so a1 is walked as an array of 12 32-bit pointers.
  *
  * Registers written: v0, a0, a1, a3, t0-t9, s0-s3.  s0-s3 are handed to
  * SAVE_REGS_ON_STACK / RESTORE_REGS_FROM_STACK, which are defined outside
  * this section (presumably they spill and reload them in a 16-byte frame).
  */

     SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

     li        a3, 12        // a3 = iterations remaining

 1:
     // Odd part
     lw        t0, 12(a0)    // z2 = workspace word 3
     lw        t1, 4(a0)     // z1 = workspace word 1
     lw        t2, 20(a0)    // z3 = workspace word 5
     lw        t3, 28(a0)    // z4 = workspace word 7
     li        t4, 10703     // FIX(1.306562965)
     li        t5, 4433      // FIX_0_541196100
     mul       t4, t0, t4    // tmp11
     mul       t5, t0, t5    // -tmp14
     addu      t6, t1, t2    // tmp10
     li        t7, 2139      // FIX(0.261052384)
     mul       t7, t6, t7    // MULTIPLY(tmp10, FIX(0.261052384))
     addu      t6, t6, t3    // tmp10 + z4
     li        t8, 7053      // FIX(0.860918669)
     mul       t6, t6, t8    // tmp15
     li        t8, 2295      // FIX(0.280143716)
     mul       t8, t1, t8    // MULTIPLY(z1, FIX(0.280143716))
     addu      t9, t2, t3    // z3 + z4
     li        s0, 8565      // FIX(1.045510580)
     mul       t9, t9, s0    // -tmp13
     li        s0, 12112     // FIX(1.478575242)
     mul       s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
     li        s1, 12998     // FIX(1.586706681)
     mul       s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
     li        s2, 5540      // FIX(0.676326758)
     mul       s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
     li        s3, 16244     // FIX(1.982889723)
     mul       s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
     subu      t1, t1, t3    // z1 -= z4
     subu      t0, t0, t2    // z2 -= z3
     addu      t2, t1, t0    // z1 + z2
     li        t3, 4433      // FIX_0_541196100
     mul       t2, t2, t3    // z3
     li        t3, 6270      // FIX_0_765366865
     mul       t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
     li        t3, 15137     // FIX_1_847759065
     mul       t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
     addu      t3, t6, t7    // tmp12
     addu      t7, t3, t4
     addu      t7, t7, t8    // tmp10
     subu      t3, t3, t9    // t9 and t5 hold negated terms, hence subtract
     subu      t3, t3, t5
     subu      t3, t3, s0    // tmp12
     subu      t9, t6, t9    // t9 is overwritten only after tmp12 consumed it
     subu      t9, t9, t4
     addu      t9, t9, s1    // tmp13
     subu      t6, t6, t5
     subu      t6, t6, s2
     subu      t6, t6, s3    // tmp15
     addu      t1, t2, t1    // tmp11
     subu      t0, t2, t0    // tmp14
     // Odd part results: t7 = tmp10, t1 = tmp11, t3 = tmp12,
     //                   t9 = tmp13, t0 = tmp14, t6 = tmp15
     // even part
     lw        t2, 16(a0)    // z4
     lw        t4, 8(a0)     // z1
     lw        t5, 0(a0)     // z3
     lw        t8, 24(a0)    // z2
     li        s0, 10033     // FIX(1.224744871)
     li        s1, 11190     // FIX(1.366025404)
     mul       t2, t2, s0    // z4 = MULTIPLY(z4, FIX(1.224744871))
     mul       s0, t4, s1    // s0 = MULTIPLY(z1, FIX(1.366025404))
     addiu     t5, t5, 0x10  // rounding: 0x10 << 13 = 2^17, half of 2^18
     sll       t5, t5, 13    // z3
     sll       t4, t4, 13    // z1
     sll       t8, t8, 13    // z2
     subu      s1, t4, t8    // tmp12
     addu      s2, t5, t2    // tmp10
     subu      t2, t5, t2    // tmp11
     addu      s3, t5, s1    // tmp21
     subu      s1, t5, s1    // tmp24
     addu      t5, s0, t8    // tmp12
     addu      v0, s2, t5    // tmp20
     subu      t5, s2, t5    // tmp25
     subu      t4, s0, t4
     subu      t4, t4, t8    // tmp12
     addu      t8, t2, t4    // tmp22
     subu      t2, t2, t4    // tmp23
     // increment counter and pointers
     // (done here, after the last workspace load of this iteration)
     addiu     a3, a3, -1
     addiu     a0, a0, 32
     // Final stage
     // Each even-part term is paired with one odd-part term; the sum and
     // the difference go to mirrored byte positions k and 11 - k.
     addu      t4, v0, t7    // tmp20 + tmp10 -> byte 0
     subu      v0, v0, t7    // tmp20 - tmp10 -> byte 11
     addu      t7, s3, t1    // tmp21 + tmp11 -> byte 1
     subu      s3, s3, t1    // tmp21 - tmp11 -> byte 10
     addu      t1, t8, t3    // tmp22 + tmp12 -> byte 2
     subu      t8, t8, t3    // tmp22 - tmp12 -> byte 9
     addu      t3, t2, t9    // tmp23 + tmp13 -> byte 3
     subu      t2, t2, t9    // tmp23 - tmp13 -> byte 8
     addu      t9, s1, t0    // tmp24 + tmp14 -> byte 4
     subu      s1, s1, t0    // tmp24 - tmp14 -> byte 7
     addu      t0, t5, t6    // tmp25 + tmp15 -> byte 5
     subu      t5, t5, t6    // tmp25 - tmp15 -> byte 6
     // Descale: shift left by 4 (plain) and by 2 more (saturating), then
     // take bits 31..24.  The net effect is a right shift by 18; the
     // saturating step clamps the top byte to the signed 8-bit range
     // (assuming the plain shift by 4 did not already overflow).
     sll       t4, t4, 4
     sll       t7, t7, 4
     sll       t1, t1, 4
     sll       t3, t3, 4
     sll       t9, t9, 4
     sll       t0, t0, 4
     sll       t5, t5, 4
     sll       s1, s1, 4
     sll       t2, t2, 4
     sll       t8, t8, 4
     sll       s3, s3, 4
     sll       v0, v0, 4
     shll_s.w  t4, t4, 2
     shll_s.w  t7, t7, 2
     shll_s.w  t1, t1, 2
     shll_s.w  t3, t3, 2
     shll_s.w  t9, t9, 2
     shll_s.w  t0, t0, 2
     shll_s.w  t5, t5, 2
     shll_s.w  s1, s1, 2
     shll_s.w  t2, t2, 2
     shll_s.w  t8, t8, 2
     shll_s.w  s3, s3, 2
     shll_s.w  v0, v0, 2
     srl       t4, t4, 24
     srl       t7, t7, 24
     srl       t1, t1, 24
     srl       t3, t3, 24
     srl       t9, t9, 24
     srl       t0, t0, 24
     srl       t5, t5, 24
     srl       s1, s1, 24
     srl       t2, t2, 24
     srl       t8, t8, 24
     srl       s3, s3, 24
     srl       v0, v0, 24
     lw        t6, 0(a1)     // t6 = destination pointer for this iteration
     // Add 0x80; only the low byte is stored below, so a clamped signed
     // value s ends up as s + 128 in the range 0..255.
     addiu     t4, t4, 0x80
     addiu     t7, t7, 0x80
     addiu     t1, t1, 0x80
     addiu     t3, t3, 0x80
     addiu     t9, t9, 0x80
     addiu     t0, t0, 0x80
     addiu     t5, t5, 0x80
     addiu     s1, s1, 0x80
     addiu     t2, t2, 0x80
     addiu     t8, t8, 0x80
     addiu     s3, s3, 0x80
     addiu     v0, v0, 0x80
     sb        t4, 0(t6)
     sb        t7, 1(t6)
     sb        t1, 2(t6)
     sb        t3, 3(t6)
     sb        t9, 4(t6)
     sb        t0, 5(t6)
     sb        t5, 6(t6)
     sb        s1, 7(t6)
     sb        t2, 8(t6)
     sb        t8, 9(t6)
     sb        s3, 10(t6)
     sb        v0, 11(t6)
     bgtz      a3, 1b        // loop while iterations remain
      addiu    a1, a1, 4     // (slot after the branch) next output pointer

     RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

     jr        ra
      nop

 END(jsimd_idct_12x12_pass2_mips_dspr2)

 /*****************************************************************************/
 /*
  * CONVSAMP_ROW_STEP next_row, out
  *
  * One stage of the software-pipelined sample conversion used by
  * jsimd_convsamp_mips_dspr2 below.
  *
  *   next_row - byte offset, relative to a0, of the pointer to the row that
  *              is fetched during this stage
  *   out      - byte offset, relative to a2, at which the row fetched by
  *              the previous stage is stored (8 halfwords, 16 bytes)
  *
  * On entry:  t2     = second word of the pending row (still packed bytes)
  *            t3, t4 = first word of the pending row, already widened to
  *                     four halfwords but not yet biased
  *            t7     = 0xff80ff80 (-128 in each halfword)
  * On exit:   the pending row has been written to \out(a2) .. \out+15(a2)
  *            and t2, t3, t4 describe the newly fetched row the same way.
  * Scratch:   t0, t1, t5, t6
  */
 .macro CONVSAMP_ROW_STEP next_row, out
     lw             t0, \next_row(a0)   // pointer to the next sample row
     preceu.ph.qbr  t5, t2              // pending row: low two bytes of word 1
     preceu.ph.qbl  t6, t2              // pending row: high two bytes of word 1
     addu           t0, t0, a1          // ... + start_col
     addu.ph        t3, t3, t7          // bias word 0 halfwords by -128
     addu.ph        t4, t4, t7
     ulw            t1, 0(t0)           // fetch 8 samples of the next row
     ulw            t2, 4(t0)
     addu.ph        t5, t5, t7          // bias word 1 halfwords by -128
     addu.ph        t6, t6, t7
     usw            t3, \out+0(a2)
     usw            t4, \out+4(a2)
     preceu.ph.qbr  t3, t1              // start widening the next row
     preceu.ph.qbl  t4, t1
     usw            t5, \out+8(a2)
     usw            t6, \out+12(a2)
 .endm

 LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
 /*
  * a0     - sample_data
  * a1     - start_col
  * a2     - workspace
  *
  * For each of the eight row pointers at 0..28(a0), reads eight bytes
  * starting start_col bytes into the row, zero-extends every byte to a
  * halfword, adds -128, and stores the eight halfwords.  Row r is written
  * to 16*r(a2), so 128 bytes of workspace are filled in total.
  *
  * Loads and stores of sample data and workspace use ulw/usw, so neither
  * needs to be word aligned.
  * NOTE(review): the qbr-before-qbl store order matches sample order for
  * little-endian byte numbering -- confirm before use on big-endian.
  *
  * Registers written: t0-t7.
  */

     // Prologue: fetch row 0 and start widening it.
     lw             t0, 0(a0)
     li             t7, 0xff80ff80      // -128 in each halfword lane
     addu           t0, t0, a1
     ulw            t1, 0(t0)
     ulw            t2, 4(t0)
     preceu.ph.qbr  t3, t1
     preceu.ph.qbl  t4, t1

     // Steady state: finish and store row r while fetching row r + 1.
     //                 next row   store offset
     CONVSAMP_ROW_STEP   4,          0          // store row 0, fetch row 1
     CONVSAMP_ROW_STEP   8,         16          // store row 1, fetch row 2
     CONVSAMP_ROW_STEP  12,         32          // store row 2, fetch row 3
     CONVSAMP_ROW_STEP  16,         48          // store row 3, fetch row 4
     CONVSAMP_ROW_STEP  20,         64          // store row 4, fetch row 5
     CONVSAMP_ROW_STEP  24,         80          // store row 5, fetch row 6
     CONVSAMP_ROW_STEP  28,         96          // store row 6, fetch row 7

     // Epilogue: finish and store row 7; nothing is left to fetch.
     preceu.ph.qbr  t5, t2
     preceu.ph.qbl  t6, t2
     addu.ph        t3, t3, t7
     addu.ph        t4, t4, t7
     addu.ph        t5, t5, t7
     addu.ph        t6, t6, t7
     usw            t3, 112(a2)
     usw            t4, 116(a2)
     usw            t5, 120(a2)
     usw            t6, 124(a2)

     j              ra
      nop

 END(jsimd_convsamp_mips_dspr2)

 .purgem CONVSAMP_ROW_STEP

 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
 /*
  * a0     - sample_data
  * a1     - start_col
  * a2     - workspace
  *
  * For each of the eight row pointers at 0..28(a0), reads eight unsigned
  * bytes starting start_col bytes into the row, subtracts 128 from each,
  * converts the result to single-precision floating point, and stores the
  * eight floats.  Row r is written to 32*r(a2), so 256 bytes of workspace
  * are filled in total.
  *
  * The pointer to row r + 1 is fetched between the conversions and the
  * stores of row r so that the load overlaps with the FPU work.
  *
  * Only even-numbered FP registers (f2, f4, ..., f16) are used.  Odd-
  * numbered single-precision registers are not available when O32 code is
  * built for the FPXX / no-odd-spreg FP model, and f0-f19 do not have to be
  * preserved across calls under O32, so nothing needs to be saved.
  *
  * Registers written: t0-t8, f2, f4, f6, f8, f10, f12, f14, f16.
  */

     // NOTE(review): enables assembler use of $at for the rest of this
     // function; kept from the original code.
     .set at

     lw       t0, 0(a0)
     addu     t0, t0, a1      // t0 = sample_data[0] + start_col
     // row 0
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
     lbu      t3, 2(t0)
     lbu      t4, 3(t0)
     lbu      t5, 4(t0)
     lbu      t6, 5(t0)
     lbu      t7, 6(t0)
     lbu      t8, 7(t0)
     addiu    t1, t1, -128    // centre samples around zero
     addiu    t2, t2, -128
     addiu    t3, t3, -128
     addiu    t4, t4, -128
     addiu    t5, t5, -128
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
     mtc1     t1, f2
     mtc1     t2, f4
     mtc1     t3, f6
     mtc1     t4, f8
     mtc1     t5, f10
     mtc1     t6, f12
     mtc1     t7, f14
     mtc1     t8, f16
     cvt.s.w  f2, f2          // int32 -> float, in place
     cvt.s.w  f4, f4
     cvt.s.w  f6, f6
     cvt.s.w  f8, f8
     cvt.s.w  f10, f10
     cvt.s.w  f12, f12
     cvt.s.w  f14, f14
     cvt.s.w  f16, f16
     lw       t0, 4(a0)       // next row pointer
     swc1     f2, 0(a2)
     swc1     f4, 4(a2)
     swc1     f6, 8(a2)
     addu     t0, t0, a1
     swc1     f8, 12(a2)
     swc1     f10, 16(a2)
     swc1     f12, 20(a2)
     swc1     f14, 24(a2)
     swc1     f16, 28(a2)
     // row 1
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
     lbu      t3, 2(t0)
     lbu      t4, 3(t0)
     lbu      t5, 4(t0)
     lbu      t6, 5(t0)
     lbu      t7, 6(t0)
     lbu      t8, 7(t0)
     addiu    t1, t1, -128
     addiu    t2, t2, -128
     addiu    t3, t3, -128
     addiu    t4, t4, -128
     addiu    t5, t5, -128
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
     mtc1     t1, f2
     mtc1     t2, f4
     mtc1     t3, f6
     mtc1     t4, f8
     mtc1     t5, f10
     mtc1     t6, f12
     mtc1     t7, f14
     mtc1     t8, f16
     cvt.s.w  f2, f2
     cvt.s.w  f4, f4
     cvt.s.w  f6, f6
     cvt.s.w  f8, f8
     cvt.s.w  f10, f10
     cvt.s.w  f12, f12
     cvt.s.w  f14, f14
     cvt.s.w  f16, f16
     lw       t0, 8(a0)
     swc1     f2, 32(a2)
     swc1     f4, 36(a2)
     swc1     f6, 40(a2)
     addu     t0, t0, a1
     swc1     f8, 44(a2)
     swc1     f10, 48(a2)
     swc1     f12, 52(a2)
     swc1     f14, 56(a2)
     swc1     f16, 60(a2)
     // row 2
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
     lbu      t3, 2(t0)
     lbu      t4, 3(t0)
     lbu      t5, 4(t0)
     lbu      t6, 5(t0)
     lbu      t7, 6(t0)
     lbu      t8, 7(t0)
     addiu    t1, t1, -128
     addiu    t2, t2, -128
     addiu    t3, t3, -128
     addiu    t4, t4, -128
     addiu    t5, t5, -128
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
     mtc1     t1, f2
     mtc1     t2, f4
     mtc1     t3, f6
     mtc1     t4, f8
     mtc1     t5, f10
     mtc1     t6, f12
     mtc1     t7, f14
     mtc1     t8, f16
     cvt.s.w  f2, f2
     cvt.s.w  f4, f4
     cvt.s.w  f6, f6
     cvt.s.w  f8, f8
     cvt.s.w  f10, f10
     cvt.s.w  f12, f12
     cvt.s.w  f14, f14
     cvt.s.w  f16, f16
     lw       t0, 12(a0)
     swc1     f2, 64(a2)
     swc1     f4, 68(a2)
     swc1     f6, 72(a2)
     addu     t0, t0, a1
     swc1     f8, 76(a2)
     swc1     f10, 80(a2)
     swc1     f12, 84(a2)
     swc1     f14, 88(a2)
     swc1     f16, 92(a2)
     // row 3
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
     lbu      t3, 2(t0)
     lbu      t4, 3(t0)
     lbu      t5, 4(t0)
     lbu      t6, 5(t0)
     lbu      t7, 6(t0)
     lbu      t8, 7(t0)
     addiu    t1, t1, -128
     addiu    t2, t2, -128
     addiu    t3, t3, -128
     addiu    t4, t4, -128
     addiu    t5, t5, -128
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
     mtc1     t1, f2
     mtc1     t2, f4
     mtc1     t3, f6
     mtc1     t4, f8
     mtc1     t5, f10
     mtc1     t6, f12
     mtc1     t7, f14
     mtc1     t8, f16
     cvt.s.w  f2, f2
     cvt.s.w  f4, f4
     cvt.s.w  f6, f6
     cvt.s.w  f8, f8
     cvt.s.w  f10, f10
     cvt.s.w  f12, f12
     cvt.s.w  f14, f14
     cvt.s.w  f16, f16
     lw       t0, 16(a0)
     swc1     f2, 96(a2)
     swc1     f4, 100(a2)
     swc1     f6, 104(a2)
     addu     t0, t0, a1
     swc1     f8, 108(a2)
     swc1     f10, 112(a2)
     swc1     f12, 116(a2)
     swc1     f14, 120(a2)
     swc1     f16, 124(a2)
     // row 4
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
     lbu      t3, 2(t0)
     lbu      t4, 3(t0)
     lbu      t5, 4(t0)
     lbu      t6, 5(t0)
     lbu      t7, 6(t0)
     lbu      t8, 7(t0)
     addiu    t1, t1, -128
     addiu    t2, t2, -128
     addiu    t3, t3, -128
     addiu    t4, t4, -128
     addiu    t5, t5, -128
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
     mtc1     t1, f2
     mtc1     t2, f4
     mtc1     t3, f6
     mtc1     t4, f8
     mtc1     t5, f10
     mtc1     t6, f12
     mtc1     t7, f14
     mtc1     t8, f16
     cvt.s.w  f2, f2
     cvt.s.w  f4, f4
     cvt.s.w  f6, f6
     cvt.s.w  f8, f8
     cvt.s.w  f10, f10
     cvt.s.w  f12, f12
     cvt.s.w  f14, f14
     cvt.s.w  f16, f16
     lw       t0, 20(a0)
     swc1     f2, 128(a2)
     swc1     f4, 132(a2)
     swc1     f6, 136(a2)
     addu     t0, t0, a1
     swc1     f8, 140(a2)
     swc1     f10, 144(a2)
     swc1     f12, 148(a2)
     swc1     f14, 152(a2)
     swc1     f16, 156(a2)
     // row 5
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
     lbu      t3, 2(t0)
     lbu      t4, 3(t0)
     lbu      t5, 4(t0)
     lbu      t6, 5(t0)
     lbu      t7, 6(t0)
     lbu      t8, 7(t0)
     addiu    t1, t1, -128
     addiu    t2, t2, -128
     addiu    t3, t3, -128
     addiu    t4, t4, -128
     addiu    t5, t5, -128
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
     mtc1     t1, f2
     mtc1     t2, f4
     mtc1     t3, f6
     mtc1     t4, f8
     mtc1     t5, f10
     mtc1     t6, f12
     mtc1     t7, f14
     mtc1     t8, f16
     cvt.s.w  f2, f2
     cvt.s.w  f4, f4
     cvt.s.w  f6, f6
     cvt.s.w  f8, f8
     cvt.s.w  f10, f10
     cvt.s.w  f12, f12
     cvt.s.w  f14, f14
     cvt.s.w  f16, f16
     lw       t0, 24(a0)
     swc1     f2, 160(a2)
     swc1     f4, 164(a2)
     swc1     f6, 168(a2)
     addu     t0, t0, a1
     swc1     f8, 172(a2)
     swc1     f10, 176(a2)
     swc1     f12, 180(a2)
     swc1     f14, 184(a2)
     swc1     f16, 188(a2)
     // row 6
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
     lbu      t3, 2(t0)
     lbu      t4, 3(t0)
     lbu      t5, 4(t0)
     lbu      t6, 5(t0)
     lbu      t7, 6(t0)
     lbu      t8, 7(t0)
     addiu    t1, t1, -128
     addiu    t2, t2, -128
     addiu    t3, t3, -128
     addiu    t4, t4, -128
     addiu    t5, t5, -128
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
     mtc1     t1, f2
     mtc1     t2, f4
     mtc1     t3, f6
     mtc1     t4, f8
     mtc1     t5, f10
     mtc1     t6, f12
     mtc1     t7, f14
     mtc1     t8, f16
     cvt.s.w  f2, f2
     cvt.s.w  f4, f4
     cvt.s.w  f6, f6
     cvt.s.w  f8, f8
     cvt.s.w  f10, f10
     cvt.s.w  f12, f12
     cvt.s.w  f14, f14
     cvt.s.w  f16, f16
     lw       t0, 28(a0)
     swc1     f2, 192(a2)
     swc1     f4, 196(a2)
     swc1     f6, 200(a2)
     addu     t0, t0, a1
     swc1     f8, 204(a2)
     swc1     f10, 208(a2)
     swc1     f12, 212(a2)
     swc1     f14, 216(a2)
     swc1     f16, 220(a2)
     // row 7 (last row: no further row pointer to fetch)
     lbu      t1, 0(t0)
     lbu      t2, 1(t0)
     lbu      t3, 2(t0)
     lbu      t4, 3(t0)
     lbu      t5, 4(t0)
     lbu      t6, 5(t0)
     lbu      t7, 6(t0)
     lbu      t8, 7(t0)
     addiu    t1, t1, -128
     addiu    t2, t2, -128
     addiu    t3, t3, -128
     addiu    t4, t4, -128
     addiu    t5, t5, -128
     addiu    t6, t6, -128
     addiu    t7, t7, -128
     addiu    t8, t8, -128
     mtc1     t1, f2
     mtc1     t2, f4
     mtc1     t3, f6
     mtc1     t4, f8
     mtc1     t5, f10
     mtc1     t6, f12
     mtc1     t7, f14
     mtc1     t8, f16
     cvt.s.w  f2, f2
     cvt.s.w  f4, f4
     cvt.s.w  f6, f6
     cvt.s.w  f8, f8
     cvt.s.w  f10, f10
     cvt.s.w  f12, f12
     cvt.s.w  f14, f14
     cvt.s.w  f16, f16
     swc1     f2, 224(a2)
     swc1     f4, 228(a2)
     swc1     f6, 232(a2)
     swc1     f8, 236(a2)
     swc1     f10, 240(a2)
     swc1     f12, 244(a2)
     swc1     f14, 248(a2)
     swc1     f16, 252(a2)

     j        ra
      nop

 END(jsimd_convsamp_float_mips_dspr2)

 /*****************************************************************************/