blob: 0eed1ce2ea657df4814b3f19a71fc15c790436c8 [file] [log] [blame]
/*
* MIPS DSPr2 optimizations for libjpeg-turbo
*
* Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
* All rights reserved.
* Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
* Darko Laus (darko.laus@imgtec.com)
* Copyright (C) 2015, D. R. Commander. All Rights Reserved.
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
#include "jsimd_mips_dspr2_asm.h"
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
/*
 * a0 - cinfo->image_width
 * a1 - input_buf
 * a2 - output_buf
 * a3 - output_row
 * 16(sp) - num_rows
 * 20(sp) - cinfo->num_components
 *
 * Null conversion for compression
 *
 * For each of num_rows input rows, byte ci of every interleaved input pixel
 * is copied to output_buf[ci][output_row], for ci = 0 .. num_components-1.
 * input_buf advances by one row pointer and output_row by one after each row.
 *
 * The two stack arguments are read at 24(sp) and 28(sp) because the loads
 * come after SAVE_REGS_ON_STACK 8.
 *
 * Register use:
 * t9 - rows still to do
 * s0 - num_components (distance between samples of one component)
 * t0 - image_width & 3 (pixels copied one at a time)
 * t1 - component index ci
 * t2 - inptr, t5 - outptr
 * t6 - outptr value at the end of the one-at-a-time part
 * s1 - outptr value at the end of the row
 *
 * Labels 0-3 handle widths that are not a multiple of 4, labels 4-6 handle
 * widths that are a multiple of 4.  The instruction written after each branch
 * is its delay-slot instruction (the code is laid out for noreorder mode, as
 * the explicit nop padding shows).
 */
SAVE_REGS_ON_STACK 8, s0, s1
lw t9, 24(sp) // t9 = num_rows
lw s0, 28(sp) // s0 = cinfo->num_components
beqz a0, 7f // zero-width rows: nothing to copy (the copy loops test at the bottom)
nop
andi t0, a0, 3 // t0 = cinfo->image_width & 3
beqz t0, 4f // no residual
nop
/* ---- rows whose width is not a multiple of 4 ---- */
0:
addiu t9, t9, -1
bltz t9, 7f // all rows done
li t1, 0 // (delay slot) ci = 0
1:
sll t3, t1, 2
lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
lw t2, 0(a1) // t2 = inptr = *input_buf
sll t4, a3, 2
lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
addu t2, t2, t1 // inptr += ci
addu s1, t5, a0 // s1 = outptr + image_width (end of row)
addu t6, t5, t0 // t6 = outptr + residual count
2:
/* copy the (width & 3) leading samples one at a time */
lbu t3, 0(t2)
addiu t5, t5, 1
sb t3, -1(t5)
bne t6, t5, 2b
addu t2, t2, s0 // (delay slot) inptr += num_components
/*
 * For widths 1..3 the row is already complete here.  The loop below tests
 * its exit condition only at the bottom, so entering it with outptr equal
 * to the row end would write past the row and never meet the end address.
 */
beq s1, t5, 33f
nop
3:
/* copy four samples per iteration */
lbu t3, 0(t2)
addu t4, t2, s0
addu t7, t4, s0
addu t8, t7, s0
addu t2, t8, s0 // inptr += 4 * num_components
lbu t4, 0(t4)
lbu t7, 0(t7)
lbu t8, 0(t8)
addiu t5, t5, 4
sb t3, -4(t5)
sb t4, -3(t5)
sb t7, -2(t5)
bne s1, t5, 3b
sb t8, -1(t5) // (delay slot)
33:
addiu t1, t1, 1 // ++ci
bne t1, s0, 1b
nop
addiu a1, a1, 4 // ++input_buf
bgez t9, 0b
addiu a3, a3, 1 // (delay slot) ++output_row
b 7f
nop
/* ---- rows whose width is a multiple of 4 ---- */
4:
addiu t9, t9, -1
bltz t9, 7f // all rows done
li t1, 0 // (delay slot) ci = 0
5:
sll t3, t1, 2
lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
lw t2, 0(a1) // t2 = inptr = *input_buf
sll t4, a3, 2
lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
addu t2, t2, t1 // inptr += ci
addu s1, t5, a0 // s1 = outptr + image_width (end of row)
6:
/* copy four samples per iteration */
lbu t3, 0(t2)
addu t4, t2, s0
addu t7, t4, s0
addu t8, t7, s0
addu t2, t8, s0 // inptr += 4 * num_components
lbu t4, 0(t4)
lbu t7, 0(t7)
lbu t8, 0(t8)
addiu t5, t5, 4
sb t3, -4(t5)
sb t4, -3(t5)
sb t7, -2(t5)
bne s1, t5, 6b
sb t8, -1(t5) // (delay slot)
addiu t1, t1, 1 // ++ci
bne t1, s0, 5b
nop
addiu a1, a1, 4 // ++input_buf
bgez t9, 4b
addiu a3, a3, 1 // (delay slot) ++output_row
7:
RESTORE_REGS_FROM_STACK 8, s0, s1
j ra
nop
END(jsimd_c_null_convert_mips_dspr2)
/*****************************************************************************/
/*
* jsimd_extrgb_ycc_convert_mips_dspr2
* jsimd_extbgr_ycc_convert_mips_dspr2
* jsimd_extrgbx_ycc_convert_mips_dspr2
* jsimd_extbgrx_ycc_convert_mips_dspr2
* jsimd_extxbgr_ycc_convert_mips_dspr2
* jsimd_extxrgb_ycc_convert_mips_dspr2
*
* Colorspace conversion RGB -> YCbCr
*/
/*
 * Generates jsimd_<colorid>_ycc_convert_mips_dspr2 for one input pixel layout.
 * pixel_size is the byte distance between consecutive input pixels; r_offs,
 * g_offs and b_offs are the byte offsets of R, G and B within a pixel.
 */
.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
/* DO_RGB_TO_YCC: load the R, G and B bytes of the pixel at inptr, then advance inptr by pixel_size. */
.macro DO_RGB_TO_YCC r, \
g, \
b, \
inptr
lbu \r, \r_offs(\inptr)
lbu \g, \g_offs(\inptr)
lbu \b, \b_offs(\inptr)
addiu \inptr, \pixel_size
.endm
LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
/*
 * a0 - cinfo->image_width
 * a1 - input_buf
 * a2 - output_buf
 * a3 - output_row
 * 16(sp) - num_rows
 *
 * For every row, each input pixel is converted to one Y, one Cb and one Cr
 * byte, written to output_buf[0], output_buf[1] and output_buf[2] at row
 * output_row.  Three DSP accumulators ($ac0 = Y, $ac1 = Cb, $ac2 = Cr) are
 * filled in an interleaved order; the byte that is stored is bits 23:16 of
 * each sum.
 *
 * Both loops test their exit condition at the bottom, so one row and one
 * pixel are always processed.
 * NOTE(review): presumably callers never pass num_rows < 1 or width 0 - confirm.
 */
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
lw t7, 48(sp) // t7 = num_rows (documented 16(sp) slot, read after the 32-byte save)
li s0, 0x4c8b // FIX(0.29900)
li s1, 0x9646 // FIX(0.58700)
li s2, 0x1d2f // FIX(0.11400)
li s3, 0xffffd4cd // -FIX(0.16874)
li s4, 0xffffab33 // -FIX(0.33126)
li s5, 0x8000 // FIX(0.50000)
li s6, 0xffff94d1 // -FIX(0.41869)
li s7, 0xffffeb2f // -FIX(0.08131)
li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
0:
/* per-row setup */
addiu t7, -1 // --num_rows
lw t6, 0(a1) // t6 = input_buf[0]
lw t0, 0(a2) // t0 = output_buf[0]
lw t1, 4(a2) // t1 = output_buf[1]
lw t2, 8(a2) // t2 = output_buf[2]
sll t3, a3, 2 // t3 = output_row * 4 (byte index into a row-pointer array)
lwx t0, t3(t0) // t0 = output_buf[0][output_row]
lwx t1, t3(t1) // t1 = output_buf[1][output_row]
lwx t2, t3(t2) // t2 = output_buf[2][output_row]
addu t9, t2, a0 // t9 = end address
addiu a3, 1 // ++output_row
1:
/* per-pixel: t3 = r, t4 = g, t5 = b */
DO_RGB_TO_YCC t3, t4, t5, t6
mtlo s5, $ac0 // Y sum starts at 0x8000 (rounding term)
mtlo t8, $ac1 // Cb sum starts at CBCR_OFFSET + ONE_HALF-1
mtlo t8, $ac2 // Cr sum starts at CBCR_OFFSET + ONE_HALF-1
maddu $ac0, s2, t5 // Y += FIX(0.11400) * b
maddu $ac1, s5, t5 // Cb += FIX(0.50000) * b
maddu $ac2, s5, t3 // Cr += FIX(0.50000) * r
maddu $ac0, s0, t3 // Y += FIX(0.29900) * r
maddu $ac1, s3, t3 // Cb += -FIX(0.16874) * r (unsigned multiply; low 32 bits of the sum are what matter)
maddu $ac2, s6, t4 // Cr += -FIX(0.41869) * g
maddu $ac0, s1, t4 // Y += FIX(0.58700) * g
maddu $ac1, s4, t4 // Cb += -FIX(0.33126) * g
maddu $ac2, s7, t5 // Cr += -FIX(0.08131) * b
extr.w t3, $ac0, 16 // t3 = Y sum >> 16
extr.w t4, $ac1, 16 // t4 = Cb sum >> 16
extr.w t5, $ac2, 16 // t5 = Cr sum >> 16
sb t3, 0(t0) // store low byte as Y
sb t4, 0(t1) // store low byte as Cb
sb t5, 0(t2) // store low byte as Cr
addiu t0, 1
addiu t2, 1
bne t2, t9, 1b // more pixels in this row
addiu t1, 1 // (delay slot) advance Cb pointer
bgtz t7, 0b // more rows
addiu a1, 4 // (delay slot) ++input_buf
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
j ra
nop
END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
.purgem DO_RGB_TO_YCC
.endm
/*------------------------------------------id -- pix R G B */
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
/*****************************************************************************/
/*
* jsimd_ycc_extrgb_convert_mips_dspr2
* jsimd_ycc_extbgr_convert_mips_dspr2
* jsimd_ycc_extrgbx_convert_mips_dspr2
* jsimd_ycc_extbgrx_convert_mips_dspr2
* jsimd_ycc_extxbgr_convert_mips_dspr2
* jsimd_ycc_extxrgb_convert_mips_dspr2
*
* Colorspace conversion YCbCr -> RGB
*/
/*
 * Generates jsimd_ycc_<colorid>_convert_mips_dspr2 for one output pixel layout.
 * pixel_size is the byte size of an output pixel; r_offs, g_offs, b_offs and
 * a_offs are the byte offsets of R, G, B and the filler byte within a pixel
 * (a_offs is only used when pixel_size is 4).
 */
.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
/*
 * STORE_YCC_TO_RGB: store three bytes (R, G, B) at outptr, store 0xFF in the
 * filler byte for 4-byte pixels, then advance outptr by pixel_size.
 * Clobbers t0 when pixel_size is 4 (after scratch2 has been stored).
 */
.macro STORE_YCC_TO_RGB scratch0 \
scratch1 \
scratch2 \
outptr
sb \scratch0, \r_offs(\outptr)
sb \scratch1, \g_offs(\outptr)
sb \scratch2, \b_offs(\outptr)
.if (\pixel_size == 4)
li t0, 0xFF
sb t0, \a_offs(\outptr)
.endif
addiu \outptr, \pixel_size
.endm
LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
/*
 * a0 - cinfo->image_width
 * a1 - input_buf
 * a2 - input_row
 * a3 - output_buf
 * 16(sp) - num_rows
 *
 * For every row, reads one Y, Cb and Cr byte per pixel from input_buf[0..2]
 * at row input_row and writes one RGB pixel to the current output row.
 * R and G are clamped together as two halfwords in t2; B is clamped
 * separately in t0.
 *
 * Both loops test their exit condition at the bottom, so one row and one
 * pixel are always processed.
 */
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
lw s1, 48(sp) // s1 = num_rows
li t3, 0x8000 // rounding term added before the >> 16 of the green sum
li t4, 0x166e9 // FIX(1.40200)
li t5, 0x1c5a2 // FIX(1.77200)
li t6, 0xffff492e // -FIX(0.71414)
li t7, 0xffffa7e6 // -FIX(0.34414)
repl.ph t8, 128 // t8 = 128 in both halfwords
0:
/* per-row setup */
lw s0, 0(a3) // s0 = outptr = *output_buf
lw t0, 0(a1) // t0 = input_buf[0]
lw t1, 4(a1) // t1 = input_buf[1]
lw t2, 8(a1) // t2 = input_buf[2]
sll s5, a2, 2 // s5 = input_row * 4
addiu s1, -1 // --num_rows
lwx s2, s5(t0) // s2 = input_buf[0][input_row] (y)
lwx s3, s5(t1) // s3 = input_buf[1][input_row] (cb)
lwx s4, s5(t2) // s4 = input_buf[2][input_row] (cr)
addu t9, s2, a0 // t9 = end address of the y row
addiu a2, 1 // ++input_row
1:
lbu s7, 0(s4) // cr
lbu s6, 0(s3) // cb
lbu s5, 0(s2) // y
addiu s2, 1
addiu s4, 1
addiu s7, -128 // cr - 128
addiu s6, -128 // cb - 128
mul t2, t7, s6 // t2 = -FIX(0.34414) * (cb - 128)
mul t0, t6, s7 // Crgtab[cr]
sll s7, 15 // pre-scale for the fractional multiply below
mulq_rs.w t1, t4, s7 // Crrtab[cr]
sll s6, 15 // pre-scale for the fractional multiply below
addu t2, t3 // Cbgtab[cb]
addu t2, t0 // green sum = Cbgtab[cb] + Crgtab[cr]
mulq_rs.w t0, t5, s6 // Cbbtab[cb]
sra t2, 16 // green offset
addu t1, s5 // t1 = y + red offset
addu t2, s5 // add y
ins t2, t1, 16, 16 // t2 = red (upper halfword) | green (lower halfword)
subu.ph t2, t2, t8 // bias both by -128 so the clamp below is to the signed byte range
addu t0, s5 // t0 = y + blue offset
shll_s.ph t2, t2, 8 // saturating shift left ...
subu t0, 128 // same -128 bias for blue
shra.ph t2, t2, 8 // ... and arithmetic shift back: red and green clamped
shll_s.w t0, t0, 24 // saturating shift left for blue ...
addu.ph t2, t2, t8 // clip & store
sra t0, t0, 24 // ... and arithmetic shift back: blue clamped
sra t1, t2, 16 // t1 = red
addiu t0, 128 // undo the bias on blue
STORE_YCC_TO_RGB t1, t2, t0, s0
bne s2, t9, 1b // more pixels in this row
addiu s3, 1 // (delay slot) advance cb pointer
bgtz s1, 0b // more rows
addiu a3, 4 // (delay slot) ++output_buf
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
j ra
nop
END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
.purgem STORE_YCC_TO_RGB
.endm
/*------------------------------------------id -- pix R G B A */
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
/*****************************************************************************/
/*
* jsimd_extrgb_gray_convert_mips_dspr2
* jsimd_extbgr_gray_convert_mips_dspr2
* jsimd_extrgbx_gray_convert_mips_dspr2
* jsimd_extbgrx_gray_convert_mips_dspr2
* jsimd_extxbgr_gray_convert_mips_dspr2
* jsimd_extxrgb_gray_convert_mips_dspr2
*
* Colorspace conversion RGB -> GRAY
*/
/*
 * Generates jsimd_<colorid>_gray_convert_mips_dspr2 for one input pixel layout.
 * pixel_size is the byte distance between consecutive input pixels; r_offs,
 * g_offs and b_offs are the byte offsets of R, G and B within a pixel.
 */
.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
/* DO_RGB_TO_GRAY: load the R, G and B bytes of the pixel at inptr, then advance inptr by pixel_size. */
.macro DO_RGB_TO_GRAY r, \
g, \
b, \
inptr
lbu \r, \r_offs(\inptr)
lbu \g, \g_offs(\inptr)
lbu \b, \b_offs(\inptr)
addiu \inptr, \pixel_size
.endm
LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
/*
 * a0 - cinfo->image_width
 * a1 - input_buf
 * a2 - output_buf
 * a3 - output_row
 * 16(sp) - num_rows
 *
 * For every row, writes one gray byte per input pixel to
 * output_buf[0][output_row].  Each gray value is
 * (FIX(0.29900)*r + FIX(0.58700)*g + FIX(0.11400)*b + 0x8000) >> 16.
 * Loop 1 handles four pixels per iteration using $ac0 and $ac1 alternately;
 * loop 3 handles the remaining (image_width & 3) pixels one at a time.
 *
 * The row loop tests its exit condition at the bottom, so one row is always
 * processed.
 */
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
li s0, 0x4c8b // s0 = FIX(0.29900)
li s1, 0x9646 // s1 = FIX(0.58700)
li s2, 0x1d2f // s2 = FIX(0.11400)
li s7, 0x8000 // s7 = FIX(0.50000)
lw s6, 48(sp) // s6 = num_rows
andi t7, a0, 3 // t7 = image_width & 3 (pixels left for loop 3)
0:
addiu s6, -1 // --num_rows (s6 = rows left after this one)
lw t0, 0(a1) // t0 = inptr = input_buf[0]
lw t1, 0(a2) // t1 = output_buf[0]
sll t3, a3, 2 // t3 = output_row * 4
lwx t1, t3(t1) // t1 = outptr = output_buf[0][output_row]
addiu a3, 1 // ++output_row
addu t9, t1, a0 // t9 = outptr at end of row
subu t8, t9, t7 // t8 = outptr at end of the four-at-a-time part
beq t1, t8, 2f // fewer than four pixels in the row: skip loop 1
nop
1:
/* pixels 0 and 1 of the group */
DO_RGB_TO_GRAY t3, t4, t5, t0
DO_RGB_TO_GRAY s3, s4, s5, t0
mtlo s7, $ac0 // start each sum at 0x8000 (rounding term)
maddu $ac0, s2, t5
maddu $ac0, s1, t4
maddu $ac0, s0, t3
mtlo s7, $ac1
maddu $ac1, s2, s5
maddu $ac1, s1, s4
maddu $ac1, s0, s3
extr.w t6, $ac0, 16 // t6 = gray for pixel 0
/* pixels 2 and 3 of the group; $ac1 still holds the pixel 1 sum */
DO_RGB_TO_GRAY t3, t4, t5, t0
DO_RGB_TO_GRAY s3, s4, s5, t0
mtlo s7, $ac0
maddu $ac0, s2, t5
maddu $ac0, s1, t4
extr.w t2, $ac1, 16 // t2 = gray for pixel 1 (read before $ac1 is reused)
maddu $ac0, s0, t3
mtlo s7, $ac1
maddu $ac1, s2, s5
maddu $ac1, s1, s4
maddu $ac1, s0, s3
extr.w t5, $ac0, 16 // t5 = gray for pixel 2
sb t6, 0(t1)
sb t2, 1(t1)
extr.w t3, $ac1, 16 // t3 = gray for pixel 3
addiu t1, 4
sb t5, -2(t1)
sb t3, -1(t1)
bne t1, t8, 1b
nop
2:
beqz t7, 4f // width is a multiple of 4: no residual pixels
nop
3:
/* residual pixels, one per iteration */
DO_RGB_TO_GRAY t3, t4, t5, t0
mtlo s7, $ac0
maddu $ac0, s2, t5
maddu $ac0, s1, t4
maddu $ac0, s0, t3
extr.w t6, $ac0, 16
sb t6, 0(t1)
addiu t1, 1
bne t1, t9, 3b
nop
4:
bgtz s6, 0b // more rows
addiu a1, 4 // (delay slot) ++input_buf
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
j ra
nop
END(jsimd_\colorid\()_gray_convert_mips_dspr2)
.purgem DO_RGB_TO_GRAY
.endm
/*------------------------------------------id -- pix R G B */
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
/*****************************************************************************/
/*
* jsimd_h2v2_merged_upsample_mips_dspr2
* jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
* jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
* jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
* jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
* jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
* jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
*
* Merged h2v2 upsample routines
*/
/*
 * Generates jsimd_h2v2_<colorid>_merged_upsample_mips_dspr2 for one output
 * pixel layout.  Here pixel_size is the byte size of TWO adjacent output
 * pixels (6 or 8); the *1_offs arguments locate R, G, B and the filler byte of
 * the first pixel of a pair and the *2_offs arguments those of the second.
 */
.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \
pixel_size, \
r1_offs, \
g1_offs, \
b1_offs, \
a1_offs, \
r2_offs, \
g2_offs, \
b2_offs, \
a2_offs
/*
 * STORE_H2V2_2_PIXELS: store two pixels at outptr (0xFF in both filler bytes
 * when pixel_size is 8) and advance outptr by pixel_size.  Overwrites
 * scratch0 with 0xFF when pixel_size is 8.
 */
.macro STORE_H2V2_2_PIXELS scratch0 \
scratch1 \
scratch2 \
scratch3 \
scratch4 \
scratch5 \
outptr
sb \scratch0, \r1_offs(\outptr)
sb \scratch1, \g1_offs(\outptr)
sb \scratch2, \b1_offs(\outptr)
sb \scratch3, \r2_offs(\outptr)
sb \scratch4, \g2_offs(\outptr)
sb \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
li \scratch0, 0xFF
sb \scratch0, \a1_offs(\outptr)
sb \scratch0, \a2_offs(\outptr)
.endif
addiu \outptr, \pixel_size
.endm
/*
 * STORE_H2V2_1_PIXEL: store one pixel at outptr (0xFF in the filler byte when
 * pixel_size is 8).  Does not advance outptr.  Clobbers t0 when pixel_size
 * is 8.
 */
.macro STORE_H2V2_1_PIXEL scratch0 \
scratch1 \
scratch2 \
outptr
sb \scratch0, \r1_offs(\outptr)
sb \scratch1, \g1_offs(\outptr)
sb \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
li t0, 0xFF
sb t0, \a1_offs(\outptr)
.endif
.endm
LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
/*
 * a0 - cinfo->output_width
 * a1 - input_buf
 * a2 - in_row_group_ctr
 * a3 - output_buf
 * 16(sp) - cinfo->sample_range_limit
 *
 * Each (cb, cr) sample pair is combined with four y samples - two from
 * input row in_row_group_ctr*2 and two from the row after it - producing two
 * pixels in output_buf[0] and two in output_buf[1].  The final byte of every
 * channel is looked up in the sample_range_limit table.
 *
 * AT and ra are used as ordinary scratch registers inside the loop; ra is
 * part of the saved register list for that reason.
 */
SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
lw t9, 56(sp) // cinfo->sample_range_limit
lw v0, 0(a1) // v0 = input_buf[0]
lw v1, 4(a1) // v1 = input_buf[1]
lw t0, 8(a1) // t0 = input_buf[2]
sll t1, a2, 3 // t1 = in_row_group_ctr * 2 * 4 (byte index of row 2*ctr)
addiu t2, t1, 4 // t2 = byte index of row 2*ctr + 1
sll t3, a2, 2 // t3 = in_row_group_ctr * 4
lw t4, 0(a3) // t4 = output_buf[0]
lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2]
lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1]
lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr]
lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr]
lw t7, 4(a3) // t7 = output_buf[1]
li s1, 0xe6ea
addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)]
addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)]
addiu s1, zero, 0xa7e6 // s1 = 0xffffa7e6 [-FIX(0.34414)]
xori s2, s1, 0xeec8 // s2 = 0xffff492e [-FIX(0.71414)]
srl t3, a0, 1 // t3 = number of complete pixel pairs
blez t3, 2f // width < 2: only the odd-column tail can apply
addu t0, t5, t3 // t0 = end address
1:
lbu t3, 0(t5) // t3 = cb
lbu s3, 0(t6) // s3 = cr
addiu t5, t5, 1
addiu t3, t3, -128 // (cb - 128)
addiu s3, s3, -128 // (cr - 128)
mult $ac1, s1, t3 // green sum = -FIX(0.34414) * (cb - 128)
madd $ac1, s2, s3 // green sum += -FIX(0.71414) * (cr - 128)
sll s3, s3, 15 // pre-scale for the fractional multiply below
sll t3, t3, 15 // pre-scale for the fractional multiply below
mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
extr_r.w s5, $ac1, 16 // s5 = green offset (rounded >> 16 of the green sum)
mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
lbu v0, 0(t1) // v0 = y, first row, first pixel of the pair
addiu t6, t6, 1
addiu t1, t1, 2
addu t3, v0, s4 // y+cred
addu s3, v0, s5 // y+cgreen
addu v1, v0, s6 // y+cblue
addu t3, t9, t3 // y+cred
addu s3, t9, s3 // y+cgreen
addu v1, t9, v1 // y+cblue
lbu AT, 0(t3) // range-limited red
lbu s7, 0(s3) // range-limited green
lbu ra, 0(v1) // range-limited blue
lbu v0, -1(t1) // v0 = y, first row, second pixel of the pair
addu t3, v0, s4 // y+cred
addu s3, v0, s5 // y+cgreen
addu v1, v0, s6 // y+cblue
addu t3, t9, t3 // y+cred
addu s3, t9, s3 // y+cgreen
addu v1, t9, v1 // y+cblue
lbu t3, 0(t3)
lbu s3, 0(s3)
lbu v1, 0(v1)
lbu v0, 0(t2) // v0 = y, second row, first pixel of the pair
STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
addu t3, v0, s4 // y+cred
addu s3, v0, s5 // y+cgreen
addu v1, v0, s6 // y+cblue
addu t3, t9, t3 // y+cred
addu s3, t9, s3 // y+cgreen
addu v1, t9, v1 // y+cblue
lbu AT, 0(t3)
lbu s7, 0(s3)
lbu ra, 0(v1)
lbu v0, 1(t2) // v0 = y, second row, second pixel of the pair
addiu t2, t2, 2
addu t3, v0, s4 // y+cred
addu s3, v0, s5 // y+cgreen
addu v1, v0, s6 // y+cblue
addu t3, t9, t3 // y+cred
addu s3, t9, s3 // y+cgreen
addu v1, t9, v1 // y+cblue
lbu t3, 0(t3)
lbu s3, 0(s3)
lbu v1, 0(v1)
STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
bne t0, t5, 1b // more pixel pairs
nop
2:
/* odd output_width: one more column, one pixel in each output row */
andi t0, a0, 1
beqz t0, 4f
lbu t3, 0(t5) // (delay slot) t3 = cb; this load runs even when the branch is taken
lbu s3, 0(t6) // s3 = cr
addiu t3, t3, -128 // (cb - 128)
addiu s3, s3, -128 // (cr - 128)
mult $ac1, s1, t3
madd $ac1, s2, s3
sll s3, s3, 15
sll t3, t3, 15
lbu v0, 0(t1) // v0 = y, first row
extr_r.w s5, $ac1, 16 // s5 = green offset
mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
addu t3, v0, s4 // y+cred
addu s3, v0, s5 // y+cgreen
addu v1, v0, s6 // y+cblue
addu t3, t9, t3 // y+cred
addu s3, t9, s3 // y+cgreen
addu v1, t9, v1 // y+cblue
lbu t3, 0(t3)
lbu s3, 0(s3)
lbu v1, 0(v1)
lbu v0, 0(t2) // v0 = y, second row
STORE_H2V2_1_PIXEL t3, s3, v1, t4
addu t3, v0, s4 // y+cred
addu s3, v0, s5 // y+cgreen
addu v1, v0, s6 // y+cblue
addu t3, t9, t3 // y+cred
addu s3, t9, s3 // y+cgreen
addu v1, t9, v1 // y+cblue
lbu t3, 0(t3)
lbu s3, 0(s3)
lbu v1, 0(v1)
STORE_H2V2_1_PIXEL t3, s3, v1, t7
4:
RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
j ra
nop
END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
.purgem STORE_H2V2_1_PIXEL
.purgem STORE_H2V2_2_PIXELS
.endm
/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
/*****************************************************************************/
/*
* jsimd_h2v1_merged_upsample_mips_dspr2
* jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
* jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
* jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
* jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
* jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
* jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
*
* Merged h2v1 upsample routines
*/
/*
 * Generates jsimd_h2v1_<colorid>_merged_upsample_mips_dspr2 for one output
 * pixel layout.  Here pixel_size is the byte size of TWO adjacent output
 * pixels (6 or 8); the *1_offs arguments locate R, G, B and the filler byte of
 * the first pixel of a pair and the *2_offs arguments those of the second.
 */
.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \
pixel_size, \
r1_offs, \
g1_offs, \
b1_offs, \
a1_offs, \
r2_offs, \
g2_offs, \
b2_offs, \
a2_offs
/*
 * STORE_H2V1_2_PIXELS: store two pixels at outptr (0xFF in both filler bytes
 * when pixel_size is 8) and advance outptr by pixel_size.  Clobbers t0 when
 * pixel_size is 8.
 */
.macro STORE_H2V1_2_PIXELS scratch0 \
scratch1 \
scratch2 \
scratch3 \
scratch4 \
scratch5 \
outptr
sb \scratch0, \r1_offs(\outptr)
sb \scratch1, \g1_offs(\outptr)
sb \scratch2, \b1_offs(\outptr)
sb \scratch3, \r2_offs(\outptr)
sb \scratch4, \g2_offs(\outptr)
sb \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
li t0, 0xFF
sb t0, \a1_offs(\outptr)
sb t0, \a2_offs(\outptr)
.endif
addiu \outptr, \pixel_size
.endm
/*
 * STORE_H2V1_1_PIXEL: store one pixel at outptr (0xFF in the filler byte when
 * pixel_size is 8).  Does not advance outptr.  Clobbers t0 when pixel_size
 * is 8.
 */
.macro STORE_H2V1_1_PIXEL scratch0 \
scratch1 \
scratch2 \
outptr
sb \scratch0, \r1_offs(\outptr)
sb \scratch1, \g1_offs(\outptr)
sb \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
li t0, 0xFF
sb t0, \a1_offs(\outptr)
.endif
.endm
LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
/*
 * a0 - cinfo->output_width
 * a1 - input_buf
 * a2 - in_row_group_ctr
 * a3 - output_buf
 * 16(sp) - range_limit
 *
 * Each (cb, cr) sample pair is combined with two adjacent y samples of row
 * in_row_group_ctr, producing two pixels in output_buf[0].  The final byte of
 * every channel is looked up in the range_limit table.
 *
 * ra is used as an ordinary scratch register inside the loop and is part of
 * the saved register list for that reason.
 */
SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
li t0, 0xe6ea
lw t1, 0(a1) // t1 = input_buf[0]
lw t2, 4(a1) // t2 = input_buf[1]
lw t3, 8(a1) // t3 = input_buf[2]
lw t8, 56(sp) // t8 = range_limit
addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)]
addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)]
addiu s0, t0, 0x9916 // s0 = 0x8000
addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)]
xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)]
srl t0, a0, 1 // t0 = number of complete pixel pairs
sll t4, a2, 2 // t4 = in_row_group_ctr * 4
lwx s5, t4(t1) // s5 = inptr0
lwx s6, t4(t2) // s6 = inptr1
lwx s7, t4(t3) // s7 = inptr2
lw t7, 0(a3) // t7 = outptr
blez t0, 2f // width < 2: only the odd-column tail can apply
addu t9, s6, t0 // t9 = end address
1:
lbu t2, 0(s6) // t2 = cb
lbu t0, 0(s7) // t0 = cr
lbu t1, 0(s5) // t1 = y
addiu t2, t2, -128 // t2 = cb - 128
addiu t0, t0, -128 // t0 = cr - 128
mult $ac1, s4, t2 // green sum = -FIX(0.34414) * (cb - 128)
madd $ac1, s3, t0 // green sum += -FIX(0.71414) * (cr - 128)
sll t0, t0, 15 // pre-scale for the fractional multiply below
sll t2, t2, 15 // pre-scale for the fractional multiply below
mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
extr_r.w t5, $ac1, 16 // t5 = green offset (rounded >> 16 of the green sum)
mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
addiu s7, s7, 1
addiu s6, s6, 1
addu t2, t1, t0 // t2 = y + cred
addu t3, t1, t5 // t3 = y + cgreen
addu t4, t1, t6 // t4 = y + cblue
addu t2, t8, t2 // addresses into the range_limit table
addu t3, t8, t3
addu t4, t8, t4
lbu t1, 1(s5) // t1 = y of the second pixel of the pair
lbu v0, 0(t2) // first pixel: range-limited red
lbu v1, 0(t3) // first pixel: range-limited green
lbu ra, 0(t4) // first pixel: range-limited blue
addu t2, t1, t0
addu t3, t1, t5
addu t4, t1, t6
addu t2, t8, t2
addu t3, t8, t3
addu t4, t8, t4
lbu t2, 0(t2) // second pixel: range-limited red
lbu t3, 0(t3) // second pixel: range-limited green
lbu t4, 0(t4) // second pixel: range-limited blue
STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
bne t9, s6, 1b // more pixel pairs
addiu s5, s5, 2 // (delay slot) advance y pointer by two samples
2:
/* odd output_width: one more single pixel */
andi t0, a0, 1
beqz t0, 4f
nop
3:
lbu t2, 0(s6) // t2 = cb
lbu t0, 0(s7) // t0 = cr
lbu t1, 0(s5) // t1 = y
addiu t2, t2, -128 //(cb - 128)
addiu t0, t0, -128 //(cr - 128)
mul t3, s4, t2 // t3 = -FIX(0.34414) * (cb - 128)
mul t4, s3, t0 // t4 = -FIX(0.71414) * (cr - 128)
sll t0, t0, 15
sll t2, t2, 15
mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS
mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS
addu t3, t3, s0 // add the 0x8000 rounding term
addu t3, t4, t3
sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
addu t2, t1, t0 // y + cred
addu t3, t1, t5 // y + cgreen
addu t4, t1, t6 // y + cblue
addu t2, t8, t2
addu t3, t8, t3
addu t4, t8, t4
lbu t2, 0(t2)
lbu t3, 0(t3)
lbu t4, 0(t4)
STORE_H2V1_1_PIXEL t2, t3, t4, t7
4:
RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
j ra
nop
END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
.purgem STORE_H2V1_1_PIXEL
.purgem STORE_H2V1_2_PIXELS
.endm
/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
/*****************************************************************************/
/*
* jsimd_h2v2_fancy_upsample_mips_dspr2
*
* Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
*/
LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - downsampled_width
 * a2 - input_data
 * a3 - output_data_ptr
 *
 * Every input row produces two output rows, each twice as wide.  For an
 * output row, a column sum is formed as 3 * (sample in the current input row)
 * + (sample in the neighbouring input row); the neighbour is the row above
 * for the first output row and the row below for the second.  Each column
 * then yields two output bytes:
 *   (3 * thiscolsum + lastcolsum + 8) >> 4
 *   (3 * thiscolsum + nextcolsum + 7) >> 4
 * The first and last columns use 4 * thiscolsum instead of a missing
 * neighbour column.
 *
 * Register use: s0 = inptr0 (current row), s1 = inptr1 (neighbour row),
 * s2 = output row-pointer array, s3 = outptr, s4 = byte offset of the current
 * output row pointer in s2, t9 = output rows left for this input row.
 * Inside the column loops t6 holds the column sum of the column being
 * emitted and t7 the sum of the column before it.
 *
 * NOTE(review): loop 2 reads the inputs with lh at inptr + 2, + 4, ... which
 * presumably relies on the row buffers being at least 2-byte aligned - confirm.
 */
SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
li s4, 0 // s4 = byte offset of the first output row pointer
lw s2, 0(a3) // s2 = *output_data_ptr
0:
li t9, 2 // two output rows per input row
lw s1, -4(a2) // s1 = inptr1
1:
lw s0, 0(a2) // s0 = inptr0
lwx s3, s4(s2) // s3 = outptr = output row selected by s4
addiu s5, a1, -2 // s5 = downsampled_width - 2
srl t4, s5, 1
sll t4, t4, 1 // t4 = (downsampled_width - 2) rounded down to even
lbu t0, 0(s0)
lbu t1, 1(s0)
lbu t2, 0(s1)
lbu t3, 1(s1)
addiu s0, 2
addiu s1, 2
addu t8, s0, t4 // t8 = end address
andi s5, s5, 1 // s5 = residual
sll t4, t0, 1
sll t6, t1, 1
addu t0, t0, t4 // t0 = (*inptr0++) * 3
addu t1, t1, t6 // t1 = (*inptr0++) * 3
addu t7, t0, t2 // t7 = thiscolsum
addu t6, t1, t3 // t6 = nextcolsum
sll t0, t7, 2 // t0 = thiscolsum * 4
subu t1, t0, t7 // t1 = thiscolsum * 3
shra_r.w t0, t0, 4 // first column, first byte: (thiscolsum * 4 + 8) >> 4
addiu t1, 7
addu t1, t1, t6
srl t1, t1, 4 // first column, second byte: (thiscolsum * 3 + nextcolsum + 7) >> 4
sb t0, 0(s3)
sb t1, 1(s3)
beq t8, s0, 22f // no column pairs to do (downsampled_width is 2 or 3)
addiu s3, 2 // (delay slot)
2:
/* two columns per iteration; on entry t6 = this column sum, t7 = previous */
lh t0, 0(s0) // t0 = A3|A2
lh t2, 0(s1) // t2 = B3|B2
addiu s0, 2
addiu s1, 2
preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2
preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2
shll.ph t1, t0, 1
sll t3, t6, 1
addu.ph t0, t1, t0 // t0 = A3*3|A2*3
addu t3, t3, t6 // t3 = this * 3
addu.ph t0, t0, t2 // t0 = next2|next1
addu t1, t3, t7 // t1 = this*3 + last
andi t7, t0, 0xFFFF // t7 = next1
sll t2, t7, 1
addu t2, t7, t2 // t2 = next1*3
addu t4, t2, t6 // t4 = next1*3 + this
srl t6, t0, 16 // t6 = next2
shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4
addu t0, t3, t7
addiu t0, 7
srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4
shra_r.w t4, t4, 4 // t4 = (next1*3 + this + 8) >> 4
addu t2, t2, t6
addiu t2, 7
srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4
sb t1, 0(s3)
sb t0, 1(s3)
sb t4, 2(s3)
sb t2, 3(s3)
bne t8, s0, 2b
addiu s3, 4 // (delay slot)
22:
beqz s5, 4f // no single residual column
addu t8, s0, s5 // (delay slot) t8 = end address for loop 3
3:
/* one residual column */
lbu t0, 0(s0)
lbu t2, 0(s1)
addiu s0, 1
addiu s1, 1
sll t3, t6, 1
sll t1, t0, 1
addu t1, t0, t1 // t1 = inptr0 * 3
addu t3, t3, t6 // t3 = thiscolsum * 3
addu t5, t1, t2 // t5 = nextcolsum
addu t1, t3, t7
shra_r.w t1, t1, 4 // (thiscolsum * 3 + lastcolsum + 8) >> 4
addu t0, t3, t5
addiu t0, 7
srl t0, t0, 4 // (thiscolsum * 3 + nextcolsum + 7) >> 4
sb t1, 0(s3)
sb t0, 1(s3)
addiu s3, 2
move t7, t6 // lastcolsum = thiscolsum
bne t8, s0, 3b
move t6, t5 // (delay slot) thiscolsum = nextcolsum
4:
/* last column */
sll t0, t6, 2 // t0 = thiscolsum * 4
subu t1, t0, t6 // t1 = thiscolsum * 3
addu t1, t1, t7
addiu s4, 4 // next output row pointer
shra_r.w t1, t1, 4 // (thiscolsum * 3 + lastcolsum + 8) >> 4
addiu t0, 7
srl t0, t0, 4 // (thiscolsum * 4 + 7) >> 4
sb t1, 0(s3)
sb t0, 1(s3)
addiu t9, -1
addiu s3, 2
bnez t9, 1b // second output row for this input row
lw s1, 4(a2) // (delay slot) s1 = inptr1 = row below the current input row
srl t0, s4, 2 // t0 = number of output rows written so far
subu t0, a0, t0
bgtz t0, 0b // more output rows to produce
addiu a2, 4 // (delay slot) next input row
RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
j ra
nop
END(jsimd_h2v2_fancy_upsample_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - downsampled_width
 * a2 - input_data
 * a3 - output_data_ptr
 *
 * Doubles the width of max_v_samp_factor rows.  Every input sample produces
 * two output bytes:
 *   (3 * sample + left neighbour + 1) >> 2
 *   (3 * sample + right neighbour + 2) >> 2
 * The first output byte of a row is a copy of the first input sample and the
 * last output byte a copy of the last input sample.
 *
 * Register use: s1 = current entry of the output row-pointer array,
 * s0 = end of that array, s2 = outptr, t7 = inptr, s3 = 0x10001 (1 in both
 * halfwords), t8 = downsampled_width - 2, t9 = iterations left in loop 1.
 *
 * NOTE(review): the way loop 1 packs two halfword results into byte pairs
 * before usw presumably assumes a little-endian target - confirm.
 */
SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
.set at
beqz a0, 3f // no rows
sll t0, a0, 2 // (delay slot) t0 = max_v_samp_factor * 4
lw s1, 0(a3) // s1 = *output_data_ptr (output row-pointer array)
li s3, 0x10001
addu s0, s1, t0 // s0 = end of the output row-pointer array
0:
addiu t8, a1, -2
srl t9, t8, 2 // t9 = number of four-column groups
lw t7, 0(a2) // t7 = inptr
lw s2, 0(s1) // s2 = outptr
lbu t0, 0(t7)
lbu t1, 1(t7) // t1 = inptr[1]
sll t2, t0, 1
addu t2, t2, t0 // t2 = invalue*3
addu t2, t2, t1
shra_r.w t2, t2, 2 // (invalue*3 + inptr[1] + 2) >> 2
sb t0, 0(s2) // first output byte is the first sample itself
sb t2, 1(s2)
beqz t9, 11f
addiu s2, 2 // (delay slot)
1:
/* four input columns (P1..P4 relative to t7) -> eight output bytes */
ulw t0, 0(t7) // t0 = |P3|P2|P1|P0|
ulw t1, 1(t7) // t1 = |P4|P3|P2|P1|
ulh t2, 4(t7) // t2 = |0|0|P5|P4|
preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2|
preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0|
preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4|
preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3|
preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1|
shll.ph t5, t4, 1
shll.ph t6, t1, 1
addu.ph t5, t5, t4 // t5 = |P4*3|P3*3|
addu.ph t6, t6, t1 // t6 = |P2*3|P1*3|
addu.ph t4, t3, s3 // left neighbours + 1
addu.ph t0, t0, s3 // left neighbours + 1
addu.ph t4, t4, t5
addu.ph t0, t0, t6
shrl.ph t4, t4, 2 // t4 = first output bytes for the P4 and P3 columns
shrl.ph t0, t0, 2 // t0 = first output bytes for the P2 and P1 columns
addu.ph t2, t2, t5
addu.ph t3, t3, t6
shra_r.ph t2, t2, 2 // t2 = second output bytes for the P4 and P3 columns
shra_r.ph t3, t3, 2 // t3 = second output bytes for the P2 and P1 columns
shll.ph t2, t2, 8
shll.ph t3, t3, 8
or t2, t4, t2
or t3, t3, t0
addiu t9, -1
usw t3, 0(s2)
usw t2, 4(s2)
addiu s2, 8
bgtz t9, 1b
addiu t7, 4 // (delay slot)
11:
andi t8, 3 // t8 = columns left for loop 2
beqz t8, 22f
addiu t7, 1 // (delay slot) step to the next unprocessed column
2:
/* one column per iteration */
lbu t0, 0(t7)
addiu t7, 1
sll t1, t0, 1
addu t2, t0, t1 // t2 = invalue*3
lbu t3, -2(t7) // left neighbour
lbu t4, 0(t7) // right neighbour
addiu t3, 1
addiu t4, 2
addu t3, t3, t2
addu t4, t4, t2
srl t3, 2
srl t4, 2
sb t3, 0(s2)
sb t4, 1(s2)
addiu t8, -1
bgtz t8, 2b
addiu s2, 2 // (delay slot)
22:
/* last column */
lbu t0, 0(t7)
lbu t2, -1(t7)
sll t1, t0, 1
addu t1, t1, t0 // t1 = invalue * 3
addu t1, t1, t2
addiu t1, 1
srl t1, t1, 2 // (invalue*3 + left neighbour + 1) >> 2
sb t1, 0(s2)
sb t0, 1(s2) // last output byte is the last sample itself
addiu s1, 4 // next output row pointer
bne s1, s0, 0b
addiu a2, 4 // (delay slot) next input row
3:
RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
j ra
nop
END(jsimd_h2v1_fancy_upsample_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
/*
 * a0 - cinfo->image_width
 * a1 - cinfo->max_v_samp_factor
 * a2 - compptr->v_samp_factor
 * a3 - compptr->width_in_blocks
 * 16(sp) - input_data
 * 20(sp) - output_data
 *
 * Averages horizontally adjacent input sample pairs into one output sample
 * for each of v_samp_factor rows, alternating the rounding bias 0, 1, 0, 1 ...
 * along the row, and then fills the rest of the output row (up to
 * width_in_blocks * 8 samples) from the last real input sample.
 *
 * Register use:
 * s0/s1 - current input / output row-pointer entries
 * s2 - number of two-byte pad stores per row (loop-invariant)
 * t9 - image_width & 2; non-zero means one extra single pad byte
 * t4 - outptr, t5 - inptr, s3 - bias
 * s4 - four-sample groups left, t8 - single output samples after the groups
 * t6 - (image_width & 1) - 1, index of the last real sample relative to inptr
 *
 * The pad loop counts down a per-row copy of s2 (in t5, which is no longer
 * needed as inptr at that point).  Counting s2 itself down left it at zero
 * after the first row, so later rows were never padded.
 *
 * Loop 1 tests its exit condition at the bottom, so one group of four
 * output samples is always produced.
 * NOTE(review): that means image widths below 8 are handled as if a full
 * group were present - confirm callers never rely on this path.
 */
.set at
SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
beqz a2, 7f // no rows
lw s1, 44(sp) // (delay slot) s1 = output_data
lw s0, 40(sp) // s0 = input_data
srl s2, a0, 2
andi t9, a0, 2
srl t7, t9, 1
addu s2, t7, s2 // s2 = image_width / 4 + ((image_width & 2) >> 1)
sll t0, a3, 3 // t0 = width_in_blocks*DCT
srl t7, t0, 1
subu s2, t7, s2 // s2 = pad stores per row
0:
andi t6, a0, 1 // t6 = temp_index
addiu t6, -1
lw t4, 0(s1) // t4 = outptr
lw t5, 0(s0) // t5 = inptr0
li s3, 0 // s3 = bias
srl t7, a0, 1 // t7 = image_width1
srl s4, t7, 2 // s4 = groups of four output samples
andi t8, t7, 3 // t8 = output samples left after the groups
1:
/* four output samples per iteration, bias 0, 1, 0, 1 */
ulhu t0, 0(t5)
ulhu t1, 2(t5)
ulhu t2, 4(t5)
ulhu t3, 6(t5)
raddu.w.qb t0, t0 // sum of the two samples of each pair
raddu.w.qb t1, t1
raddu.w.qb t2, t2
raddu.w.qb t3, t3
shra.ph t0, t0, 1 // sum >> 1
shra_r.ph t1, t1, 1 // (sum + 1) >> 1
shra.ph t2, t2, 1
shra_r.ph t3, t3, 1
sb t0, 0(t4)
sb t1, 1(t4)
sb t2, 2(t4)
sb t3, 3(t4)
addiu s4, -1
addiu t4, 4
bgtz s4, 1b
addiu t5, 8 // (delay slot)
beqz t8, 3f
addu s4, t4, t8 // (delay slot) s4 = outptr end address for loop 2
2:
/* one output sample per iteration */
ulhu t0, 0(t5)
raddu.w.qb t0, t0
addqh.w t0, t0, s3 // (sum + bias) >> 1
xori s3, s3, 1 // toggle bias
sb t0, 0(t4)
addiu t4, 1
bne t4, s4, 2b
addiu t5, 2 // (delay slot)
3:
/* build the two pad values from the last real input sample */
lbux t1, t6(t5)
sll t1, 1
addqh.w t2, t1, s3 // t2 = pixval1
xori s3, s3, 1
addqh.w t3, t1, s3 // t3 = pixval2
blez s2, 5f // no two-byte pad stores needed
append t3, t2, 8 // (delay slot) t3 = pixval2 << 8 | pixval1
move t5, s2 // t5 = pad stores left in this row
4:
ush t3, 0(t4)
addiu t5, -1
bgtz t5, 4b
addiu t4, 2 // (delay slot)
5:
beqz t9, 6f
nop
sb t2, 0(t4) // one extra pad byte when image_width & 2 is set
6:
addiu s1, 4 // next output row
addiu a2, -1
bnez a2, 0b
addiu s0, 4 // (delay slot) next input row
7:
RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
j ra
nop
END(jsimd_h2v1_downsample_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
/*
 * a0 - cinfo->image_width
 * a1 - cinfo->max_v_samp_factor
 * a2 - compptr->v_samp_factor
 * a3 - compptr->width_in_blocks
 * 16(sp) - input_data
 * 20(sp) - output_data
 *
 * Averages each 2x2 block of input samples (two adjacent samples from each
 * of two consecutive input rows) into one output sample for each of
 * v_samp_factor output rows, alternating the rounding bias 1, 2, 1, 2 ...
 * along the row, and then fills the rest of the output row (up to
 * width_in_blocks * 8 samples) from the last real input column.
 *
 * Register use:
 * s0/s1 - current input / output row-pointer entries
 * s2 - number of two-byte pad stores per row (loop-invariant)
 * t9 - image_width & 2; non-zero means one extra single pad byte
 * t6 - (image_width & 1) - 1, index of the last real column relative to inptr
 * t4 - outptr, t5 - inptr0, s7 - inptr1, s6 - bias
 * s4 - four-sample groups left, t8 - single output samples after the groups
 * s5 - pad stores left in the current row
 *
 * s4 and t8 are recomputed at the top of every row and the pad loop counts
 * down s5, a copy of s2.  All three used to be set once before the row loop
 * although the row body consumes them (s4 is counted down, t8 is replaced by
 * an end address, s2 was counted down), so only the first row was processed
 * correctly.
 *
 * Loop 2 tests its exit condition at the bottom, so one group of four
 * output samples is always produced.
 * NOTE(review): that means image widths below 8 are handled as if a full
 * group were present - confirm callers never rely on this path.
 */
.set at
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
beqz a2, 8f // no rows
lw s1, 52(sp) // (delay slot) s1 = output_data
lw s0, 48(sp) // s0 = input_data
andi t6, a0, 1 // t6 = temp_index
addiu t6, -1
andi t9, a0, 2
srl s2, a0, 2
srl t7, t9, 1
addu s2, t7, s2 // s2 = image_width / 4 + ((image_width & 2) >> 1)
sll t0, a3, 3 // t0 = width_in_blocks*DCT
srl t7, t0, 1
subu s2, t7, s2 // s2 = pad stores per row
0:
lw t4, 0(s1) // t4 = outptr
lw t5, 0(s0) // t5 = inptr0
lw s7, 4(s0) // s7 = inptr1
li s6, 1 // s6 = bias
srl t7, a0, 1 // t7 = image_width1
srl s4, t7, 2 // s4 = groups of four output samples
andi t8, t7, 3 // t8 = output samples left after the groups
2:
/* four output samples per iteration, bias 1, 2, 1, 2 */
ulw t0, 0(t5) // t0 = |P3|P2|P1|P0|
ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0|
ulw t2, 4(t5)
ulw t3, 4(s7)
precrq.ph.w t7, t0, t1 // t7 = |P3|P2|Q3|Q2|
ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0|
raddu.w.qb t1, t7 // t1 = P2 + P3 + Q2 + Q3
raddu.w.qb t0, t0 // t0 = P0 + P1 + Q0 + Q1
shra_r.w t1, t1, 2 // (sum + 2) >> 2
addiu t0, 1
srl t0, 2 // (sum + 1) >> 2
precrq.ph.w t7, t2, t3
ins t2, t3, 16, 16
raddu.w.qb t7, t7
raddu.w.qb t2, t2
shra_r.w t7, t7, 2 // (sum + 2) >> 2
addiu t2, 1
srl t2, 2 // (sum + 1) >> 2
sb t0, 0(t4)
sb t1, 1(t4)
sb t2, 2(t4)
sb t7, 3(t4)
addiu t4, 4
addiu t5, 8
addiu s4, s4, -1
bgtz s4, 2b
addiu s7, 8 // (delay slot)
beqz t8, 4f
addu t8, t4, t8 // (delay slot) t8 = outptr end address for loop 3
3:
/* one output sample per iteration */
ulhu t0, 0(t5)
ulhu t1, 0(s7)
ins t0, t1, 16, 16
raddu.w.qb t0, t0
addu t0, t0, s6
srl t0, 2 // (sum + bias) >> 2
xori s6, s6, 3 // toggle bias between 1 and 2
sb t0, 0(t4)
addiu t5, 2
addiu t4, 1
bne t8, t4, 3b
addiu s7, 2 // (delay slot)
4:
/* build the two pad values from the last real input column */
lbux t1, t6(t5)
sll t1, 1
lbux t0, t6(s7)
sll t0, 1
addu t1, t1, t0
addu t3, t1, s6
srl t0, t3, 2 // t0 = pixval1
xori s6, s6, 3
addu t2, t1, s6
srl t1, t2, 2 // t1 = pixval2
blez s2, 6f // no two-byte pad stores needed
append t1, t0, 8 // (delay slot) t1 = pixval2 << 8 | pixval1
move s5, s2 // s5 = pad stores left in this row
5:
ush t1, 0(t4)
addiu s5, -1
bgtz s5, 5b
addiu t4, 2 // (delay slot)
6:
beqz t9, 7f
nop
sb t0, 0(t4) // one extra pad byte when image_width & 2 is set
7:
addiu s1, 4 // next output row
addiu a2, -1
bnez a2, 0b
addiu s0, 8 // (delay slot) skip two input rows
8:
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
j ra
nop
END(jsimd_h2v2_downsample_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
/*
 * a0 - input_data
 * a1 - output_data
 * a2 - compptr->v_samp_factor
 * a3 - cinfo->max_v_samp_factor
 * 16(sp) - cinfo->smoothing_factor
 * 20(sp) - compptr->width_in_blocks
 * 24(sp) - cinfo->image_width
 *
 * 2:1 horizontal and vertical downsampling with smoothing.  Each output
 * sample is
 *   (membersum * (16384 - 80 * smoothing_factor)
 *    + neighsum * (16 * smoothing_factor) + 32768) >> 16
 * where membersum is the sum of the 2x2 input block and neighsum is a
 * weighted sum of the samples surrounding it (edge neighbours counted twice,
 * corner neighbours once).
 *
 * Steps:
 * 0-1: extend every input row from input_data[-1] to
 *      input_data[max_v_samp_factor] on the right, replicating the last real
 *      sample up to 2 * output_cols samples.
 * 3:   per output row: special-cased first column, loop 4 (one column per
 *      iteration), loop 5 (four columns per iteration), special-cased last
 *      column.
 *
 * Register use in the row loop:
 * t4 - outrow, t5 - inrow, t6 - member scale, t7 - neighbour scale
 * t8 - outptr, s2 - inptr0, t9 - inptr1, s0 - above_ptr, s1 - below_ptr
 * s7 - output_cols, s4 - column count for loop 4, s5 - outptr end address
 *
 * inrow advances by exactly two input rows per output row.  It used to be
 * incremented by 2 both in the row body and in the delay slot of the row
 * loop branch, which skipped two extra input rows whenever v_samp_factor was
 * greater than 1.
 */
.set at
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
lw s7, 52(sp) // compptr->width_in_blocks
lw s0, 56(sp) // cinfo->image_width
lw s6, 48(sp) // cinfo->smoothing_factor
sll s7, 3 // output_cols = width_in_blocks * DCTSIZE
sll v0, s7, 1
subu v0, v0, s0 // v0 = 2 * output_cols - image_width (pad samples per row)
blez v0, 2f // nothing to pad
move v1, zero // (delay slot) v1 = row counter
addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2
0:
addiu t1, a0, -4 // start from the row pointer before input_data[0]
sll t2, v1, 2
lwx t1, t2(t1) // t1 = input_data[v1 - 1]
move t3, v0 // t3 = pad samples left
addu t1, t1, s0 // t1 = first position past the real samples
lbu t2, -1(t1) // t2 = last real sample
1:
addiu t3, t3, -1
sb t2, 0(t1)
bgtz t3, 1b
addiu t1, t1, 1 // (delay slot)
addiu v1, v1, 1
bne v1, t0, 0b
nop
2:
li v0, 80
mul v0, s6, v0
li v1, 16384
move t4, zero // t4 = outrow
move t5, zero // t5 = inrow
subu t6, v1, v0 // t6 = 16384 - tmp_smoot_f * 80
sll t7, s6, 4 // t7 = tmp_smoot_f * 16
3:
/* Special case for first column: pretend column -1 is same as column 0 */
sll v0, t4, 2
lwx t8, v0(a1) // outptr = output_data[outrow]
sll v1, t5, 2
addiu t9, v1, 4
addiu s0, v1, -4
addiu s1, v1, 8
lwx s2, v1(a0) // inptr0 = input_data[inrow]
lwx t9, t9(a0) // inptr1 = input_data[inrow+1]
lwx s0, s0(a0) // above_ptr = input_data[inrow-1]
lwx s1, s1(a0) // below_ptr = input_data[inrow+2]
lh v0, 0(s2)
lh v1, 0(t9)
lh t0, 0(s0)
lh t1, 0(s1)
ins v0, v1, 16, 16
ins t0, t1, 16, 16
raddu.w.qb t2, v0 // t2 = membersum (2x2 block)
raddu.w.qb s3, t0 // s3 = two samples above + two samples below
lbu v0, 0(s2) // column 0 stands in for column -1
lbu v1, 2(s2)
lbu t0, 0(t9)
lbu t1, 2(t9)
addu v0, v0, v1
mult $ac1,t2, t6 // acc = membersum * member scale
addu t0, t0, t1
lbu t2, 2(s0)
addu t0, t0, v0
lbu t3, 2(s1)
addu s3, t0, s3
lbu v0, 0(s0)
lbu t0, 0(s1)
sll s3, s3, 1 // edge neighbours count twice
addu v0, v0, t2
addu t0, t0, t3
addu t0, t0, v0
addu s3, t0, s3 // s3 = neighsum (corner neighbours added once)
madd $ac1,s3, t7 // acc += neighsum * neighbour scale
extr_r.w v0, $ac1, 16 // rounded >> 16
addiu t8, t8, 1
addiu s2, s2, 2
addiu t9, t9, 2
addiu s0, s0, 2
addiu s1, s1, 2
sb v0, -1(t8)
addiu s4, s7, -2
and s4, s4, 3 // s4 = (output_cols - 2) & 3 columns for loop 4
addu s5, s4, t8 //end address
4:
/* one column per iteration */
lh v0, 0(s2)
lh v1, 0(t9)
lh t0, 0(s0)
lh t1, 0(s1)
ins v0, v1, 16, 16
ins t0, t1, 16, 16
raddu.w.qb t2, v0
raddu.w.qb s3, t0
lbu v0, -1(s2)
lbu v1, 2(s2)
lbu t0, -1(t9)
lbu t1, 2(t9)
addu v0, v0, v1
mult $ac1, t2, t6
addu t0, t0, t1
lbu t2, 2(s0)
addu t0, t0, v0
lbu t3, 2(s1)
addu s3, t0, s3
lbu v0, -1(s0)
lbu t0, -1(s1)
sll s3, s3, 1
addu v0, v0, t2
addu t0, t0, t3
addu t0, t0, v0
addu s3, t0, s3
madd $ac1, s3, t7
extr_r.w t2, $ac1, 16
addiu t8, t8, 1
addiu s2, s2, 2
addiu t9, t9, 2
addiu s0, s0, 2
sb t2, -1(t8)
bne s5, t8, 4b
addiu s1, s1, 2 // (delay slot)
addiu s5, s7, -2
subu s5, s5, s4
addu s5, s5, t8 //end address
5:
/* four columns per iteration; loads for the next column overlap the
 * multiply-accumulate of the current one */
lh v0, 0(s2)
lh v1, 0(t9)
lh t0, 0(s0)
lh t1, 0(s1)
ins v0, v1, 16, 16
ins t0, t1, 16, 16
raddu.w.qb t2, v0
raddu.w.qb s3, t0
lbu v0, -1(s2)
lbu v1, 2(s2)
lbu t0, -1(t9)
lbu t1, 2(t9)
addu v0, v0, v1
mult $ac1, t2, t6
addu t0, t0, t1
lbu t2, 2(s0)
addu t0, t0, v0
lbu t3, 2(s1)
addu s3, t0, s3
lbu v0, -1(s0)
lbu t0, -1(s1)
sll s3, s3, 1
addu v0, v0, t2
addu t0, t0, t3
lh v1, 2(t9)
addu t0, t0, v0
lh v0, 2(s2)
addu s3, t0, s3
lh t0, 2(s0)
lh t1, 2(s1)
madd $ac1, s3, t7
extr_r.w t2, $ac1, 16 // t2 = output sample for column 0 of the group
ins t0, t1, 16, 16
ins v0, v1, 16, 16
raddu.w.qb s3, t0
lbu v1, 4(s2)
lbu t0, 1(t9)
lbu t1, 4(t9)
sb t2, 0(t8)
raddu.w.qb t3, v0
lbu v0, 1(s2)
addu t0, t0, t1
mult $ac1, t3, t6
addu v0, v0, v1
lbu t2, 4(s0)
addu t0, t0, v0
lbu v0, 1(s0)
addu s3, t0, s3
lbu t0, 1(s1)
lbu t3, 4(s1)
addu v0, v0, t2
sll s3, s3, 1
addu t0, t0, t3
lh v1, 4(t9)
addu t0, t0, v0
lh v0, 4(s2)
addu s3, t0, s3
lh t0, 4(s0)
lh t1, 4(s1)
madd $ac1, s3, t7
extr_r.w t2, $ac1, 16 // t2 = output sample for column 1 of the group
ins t0, t1, 16, 16
ins v0, v1, 16, 16
raddu.w.qb s3, t0
lbu v1, 6(s2)
lbu t0, 3(t9)
lbu t1, 6(t9)
sb t2, 1(t8)
raddu.w.qb t3, v0
lbu v0, 3(s2)
addu t0, t0,t1
mult $ac1, t3, t6
addu v0, v0, v1
lbu t2, 6(s0)
addu t0, t0, v0
lbu v0, 3(s0)
addu s3, t0, s3
lbu t0, 3(s1)
lbu t3, 6(s1)
addu v0, v0, t2
sll s3, s3, 1
addu t0, t0, t3
lh v1, 6(t9)
addu t0, t0, v0
lh v0, 6(s2)
addu s3, t0, s3
lh t0, 6(s0)
lh t1, 6(s1)
madd $ac1, s3, t7
extr_r.w t3, $ac1, 16 // t3 = output sample for column 2 of the group
ins t0, t1, 16, 16
ins v0, v1, 16, 16
raddu.w.qb s3, t0
lbu v1, 8(s2)
lbu t0, 5(t9)
lbu t1, 8(t9)
sb t3, 2(t8)
raddu.w.qb t2, v0
lbu v0, 5(s2)
addu t0, t0, t1
mult $ac1, t2, t6
addu v0, v0, v1
lbu t2, 8(s0)
addu t0, t0, v0
lbu v0, 5(s0)
addu s3, t0, s3
lbu t0, 5(s1)
lbu t3, 8(s1)
addu v0, v0, t2
sll s3, s3, 1
addu t0, t0, t3
addiu t8, t8, 4
addu t0, t0, v0
addiu s2, s2, 8
addu s3, t0, s3
addiu t9, t9, 8
madd $ac1, s3, t7
extr_r.w t1, $ac1, 16 // t1 = output sample for column 3 of the group
addiu s0, s0, 8
addiu s1, s1, 8
bne s5, t8, 5b
sb t1, -1(t8) // (delay slot)
/* Special case for last column */
lh v0, 0(s2)
lh v1, 0(t9)
lh t0, 0(s0)
lh t1, 0(s1)
ins v0, v1, 16, 16
ins t0, t1, 16, 16
raddu.w.qb t2, v0
raddu.w.qb s3, t0
lbu v0, -1(s2)
lbu v1, 1(s2) // last column stands in for the column after it
lbu t0, -1(t9)
lbu t1, 1(t9)
addu v0, v0, v1
mult $ac1, t2, t6
addu t0, t0, t1
lbu t2, 1(s0)
addu t0, t0, v0
lbu t3, 1(s1)
addu s3, t0, s3
lbu v0, -1(s0)
lbu t0, -1(s1)
sll s3, s3, 1
addu v0, v0, t2
addu t0, t0, t3
addu t0, t0, v0
addu s3, t0, s3
madd $ac1, s3, t7
extr_r.w t0, $ac1, 16
sb t0, 0(t8)
addiu t4, t4, 1 // ++outrow
bne t4, a2, 3b
addiu t5, t5, 2 // (delay slot) inrow += 2
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
j ra
nop
END(jsimd_h2v2_smooth_downsample_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
/*
 * a0 - upsample->h_expand[compptr->component_index]
 * a1 - upsample->v_expand[compptr->component_index]
 * a2 - input_data
 * a3 - output_data_ptr
 * 16(sp) - cinfo->output_width
 * 20(sp) - cinfo->max_v_samp_factor
 *
 * Integer-factor upsampling.  For every input row, one output row is built
 * by repeating each input sample h_expand times; that row is then copied
 * into the following (v_expand - 1) output rows.
 *
 * Register roles:
 *   s0 = output_data (never modified)      s1 = output_width
 *   s2 = max_v_samp_factor                 s3 = outrow (row index)
 *   t6 = inrow * 4 (byte offset into input_data)
 *   v0 = &output_data[outrow]              v1 = &output_data[row being filled]
 *
 * Row indices are scaled by 4 (the size of a row pointer, as in the other
 * upsamplers of this file) before they are added to the pointer arrays, the
 * duplicate rows are always copied from output_data[outrow], and the outer
 * loop runs while outrow < max_v_samp_factor.  Rows of width 0 are skipped.
 */
    .set at
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
    lw          s0, 0(a3)               // s0 = output_data
    lw          s1, 32(sp)              // s1 = cinfo->output_width (16 + frame)
    lw          s2, 36(sp)              // s2 = cinfo->max_v_samp_factor
    li          t6, 0                   // t6 = inrow * 4
    blez        s2, 10f                 // nothing to do without output rows
     li         s3, 0                   // s3 = outrow
0:
    sll         t7, s3, 2               // t7 = outrow * 4
    addu        t0, a2, t6              // t0 = &input_data[inrow]
    addu        v0, s0, t7              // v0 = &output_data[outrow]
    lw          t3, 0(t0)               // t3 = inptr
    lw          t8, 0(v0)               // t8 = outptr
    beqz        s1, 9f                  // empty row: no expansion, no copies
     addu       t5, t8, s1              // t5 = outend
1:
    lbu         t2, 0(t3)               // t2 = invalue = *inptr++
    addiu       t3, 1
    beqz        a0, 3f
     move       t0, a0                  // t0 = h = h_expand
2:
    sb          t2, 0(t8)               // *outptr++ = invalue, h_expand times
    addiu       t0, -1
    bgtz        t0, 2b
     addiu      t8, 1
3:
    bgt         t5, t8, 1b              // while (outptr < outend)
     nop
4:
    addiu       t9, a1, -1              // t9 = v_expand - 1 rows left to copy
    blez        t9, 9f
     move       v1, v0                  // v1 = &output_data[outrow]
5:
    addiu       v1, 4                   // v1 = slot of the next row to fill
    lw          t3, 0(v0)               // t3 = source = output_data[outrow]
    lw          t4, 0(v1)               // t4 = destination row
    subu        t0, s1, 0xF
    blez        t0, 7f                  // width < 16: byte copy only
     addu       t5, t3, s1              // t5 = source end address
    andi        t7, s1, 0xF             // t7 = residual
    subu        t8, t5, t7              // t8 = end of the 16-byte chunks
6:
    ulw         t0, 0(t3)
    ulw         t1, 4(t3)
    ulw         t2, 8(t3)
    usw         t0, 0(t4)
    ulw         t0, 12(t3)
    usw         t1, 4(t4)
    usw         t2, 8(t4)
    usw         t0, 12(t4)
    addiu       t3, 16
    bne         t3, t8, 6b
     addiu      t4, 16
    beqz        t7, 8f                  // no residual bytes
     nop
7:
    lbu         t0, 0(t3)
    sb          t0, 0(t4)
    addiu       t3, 1
    bne         t3, t5, 7b
     addiu      t4, 1
8:
    addiu       t9, -1
    bgtz        t9, 5b
     nop
9:
    addu        s3, s3, a1              // outrow += v_expand
    slt         t0, s3, s2              // t0 = (outrow < max_v_samp_factor)
    bnez        t0, 0b
     addiu      t6, 4                   // inrow++ (one row pointer)
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
    j           ra
     nop
END(jsimd_int_upsample_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - cinfo->output_width
 * a2 - input_data
 * a3 - output_data_ptr
 *
 * 2:1 horizontal upsampling: every input sample is written twice to the
 * output row.  The first (output_width & ~0xf) output bytes of each row are
 * produced 16 at a time from 8 input bytes; the remaining
 * (output_width & 0xf) bytes are produced by a loop that writes two bytes
 * per pass.
 *
 * Register roles:
 *   v0 = cursor in the output row-pointer array, v1 = its end address
 *   t9 = output_width & 0xf
 *   t6 = inptr, t7 = outptr, t8 = outptr at the end of the 16-byte chunks
 *   t5 = output bytes left for the pairwise loop
 */
    lw          v0, 0(a3)               // v0 = output_data
    sll         v1, a0, 2               // v1 = max_v_samp_factor * 4
    andi        t9, a1, 0xf             // t9 = residual
    blez        a0, 4f
     addu       v1, v0, v1              // v1 = output_data end address
0:
    lw          t6, 0(a2)               // t6 = inptr
    lw          t7, 0(v0)               // t7 = outptr
    subu        t8, a1, t9              // t8 = output_width - residual
    addu        t8, t7, t8              // t8 = outptr + that many bytes
    beq         t7, t8, 2f              // row shorter than 16 bytes
     move       t5, t9
1:
    ulw         t0, 0(t6)               // t0 = |P3|P2|P1|P0|
    ulw         t2, 4(t6)               // t2 = |P7|P6|P5|P4|
    srl         t1, t0, 16              // t1 = |X|X|P3|P2|
    srl         t3, t2, 16              // t3 = |X|X|P7|P6|
    ins         t0, t0, 16, 16          // t0 = |P1|P0|P1|P0|
    ins         t1, t1, 16, 16          // t1 = |P3|P2|P3|P2|
    ins         t2, t2, 16, 16          // t2 = |P5|P4|P5|P4|
    ins         t3, t3, 16, 16          // t3 = |P7|P6|P7|P6|
    ins         t0, t0, 8, 16           // t0 = |P1|P1|P0|P0|
    ins         t1, t1, 8, 16           // t1 = |P3|P3|P2|P2|
    ins         t2, t2, 8, 16           // t2 = |P5|P5|P4|P4|
    ins         t3, t3, 8, 16           // t3 = |P7|P7|P6|P6|
    usw         t0, 0(t7)
    usw         t1, 4(t7)
    usw         t2, 8(t7)
    usw         t3, 12(t7)
    addiu       t7, 16
    bne         t7, t8, 1b
     addiu      t6, 8
    beqz        t9, 3f                  // width is a multiple of 16
     move       t5, t9
2:
    lbu         t0, 0(t6)               // one input sample ...
    addiu       t6, 1
    addiu       t5, -2
    sb          t0, 0(t7)               // ... stored twice
    sb          t0, 1(t7)
    bgtz        t5, 2b
     addiu      t7, 2
3:
    addiu       v0, 4                   // next output row pointer
    bne         v1, v0, 0b
     addiu      a2, 4                   // next input row pointer
4:
    j           ra
     nop
END(jsimd_h2v1_upsample_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
/*
 * a0 - cinfo->max_v_samp_factor
 * a1 - cinfo->output_width
 * a2 - input_data
 * a3 - output_data_ptr
 *
 * 2:1 horizontal and 2:1 vertical upsampling.  For each input row, the first
 * output row of a pair is built by writing every input sample twice; that
 * row is then copied (output_width bytes) into the second row of the pair.
 *
 * Register roles:
 *   v0 = cursor in the output row-pointer array (two pointers per pass)
 *   v1 = output_width & 0xf
 *   t6 = read pointer, t7 = write pointer
 *   t8 = end of the part handled 16 bytes at a time
 *   t9 = byte counter (expansion tail) / source end address (row copy)
 */
    lw          v0, 0(a3)               // v0 = output_data
    blez        a0, 7f
     andi       v1, a1, 0xf             // v1 = residual
0:
    lw          t7, 0(v0)               // t7 = outptr
    lw          t6, 0(a2)               // t6 = inptr
    subu        t8, a1, v1              // t8 = output_width - residual
    addu        t8, t7, t8              // t8 = outptr + that many bytes
    beq         t7, t8, 2f              // row shorter than 16 bytes
     move       t9, v1
1:
    ulw         t0, 0(t6)               // t0 = |P3|P2|P1|P0|
    ulw         t2, 4(t6)               // t2 = |P7|P6|P5|P4|
    srl         t1, t0, 16              // t1 = |X|X|P3|P2|
    srl         t3, t2, 16              // t3 = |X|X|P7|P6|
    ins         t0, t0, 16, 16          // t0 = |P1|P0|P1|P0|
    ins         t1, t1, 16, 16          // t1 = |P3|P2|P3|P2|
    ins         t2, t2, 16, 16          // t2 = |P5|P4|P5|P4|
    ins         t3, t3, 16, 16          // t3 = |P7|P6|P7|P6|
    ins         t0, t0, 8, 16           // t0 = |P1|P1|P0|P0|
    ins         t1, t1, 8, 16           // t1 = |P3|P3|P2|P2|
    ins         t2, t2, 8, 16           // t2 = |P5|P5|P4|P4|
    ins         t3, t3, 8, 16           // t3 = |P7|P7|P6|P6|
    usw         t0, 0(t7)
    usw         t1, 4(t7)
    usw         t2, 8(t7)
    usw         t3, 12(t7)
    addiu       t7, 16
    bne         t7, t8, 1b
     addiu      t6, 8
    beqz        v1, 3f                  // width is a multiple of 16
     move       t9, v1
2:
    lbu         t0, 0(t6)               // one input sample ...
    addiu       t6, 1
    addiu       t9, -2
    sb          t0, 0(t7)               // ... stored twice
    sb          t0, 1(t7)
    bgtz        t9, 2b
     addiu      t7, 2
3:
    lw          t6, 0(v0)               // t6 = source: the row just written
    lw          t7, 4(v0)               // t7 = destination: the next row
    addu        t9, t6, a1              // t9 = source end address
    beq         a1, v1, 5f              // width < 16: byte copy only
     subu       t8, t9, v1              // t8 = end of the 16-byte chunks
4:
    ulw         t0, 0(t6)
    ulw         t1, 4(t6)
    ulw         t2, 8(t6)
    ulw         t3, 12(t6)
    addiu       t6, 16
    usw         t0, 0(t7)
    usw         t1, 4(t7)
    usw         t2, 8(t7)
    usw         t3, 12(t7)
    bne         t6, t8, 4b
     addiu      t7, 16
    beqz        v1, 6f                  // no residual bytes
     nop
5:
    lbu         t0, 0(t6)
    addiu       t6, 1
    sb          t0, 0(t7)
    bne         t6, t9, 5b
     addiu      t7, 1
6:
    addiu       a0, -2                  // two output rows done
    addiu       v0, 8                   // skip both output row pointers
    bgtz        a0, 0b
     addiu      a2, 4                   // next input row pointer
7:
    j           ra
     nop
END(jsimd_h2v2_upsample_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
/*
* a0 - coef_block
* a1 - compptr->dcttable
* a2 - output
* a3 - range_limit
*
* Two-pass 8x8 inverse DCT.
*
* Pass 1 walks the 8 columns of coef_block (16-bit entries, a0/a1 advance by
* 2 bytes per column), multiplies each coefficient by the matching 16-bit
* entry of the table at a1 and stores 32-bit intermediate results into an
* 8x8 word workspace reserved on the stack (256 bytes, v0 advances by 4).
*
* Pass 2 walks the 8 rows of the workspace (v0 advances by 32), looks each
* result up in the byte table at a3 using a 10-bit index and writes 8 bytes
* through the row pointer loaded from 0(a2); a2 advances by 4 per row, i.e.
* a2 is read as an array of 8 row pointers.
*
* In both passes a column/row whose terms 1..7 are all zero takes a short
* path that only uses term 0.
*/
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
addiu sp, sp, -256 // reserve the 8x8 word workspace
move v0, sp // v0 = wsptr
addiu v1, zero, 8 // v1 = DCTSIZE = 8
// ---- Pass 1: columns of coef_block -> workspace ----
1:
lh s4, 32(a0) // s4 = inptr[16]
lh s5, 64(a0) // s5 = inptr[32]
lh s6, 96(a0) // s6 = inptr[48]
lh t1, 112(a0) // t1 = inptr[56]
lh t7, 16(a0) // t7 = inptr[8]
lh t5, 80(a0) // t5 = inptr[40]
lh t3, 48(a0) // t3 = inptr[24]
// OR the seven AC terms of this column together
or s4, s4, t1
or s4, s4, t3
or s4, s4, t5
or s4, s4, t7
or s4, s4, s5
or s4, s4, s6
bnez s4, 2f // any AC term non-zero: full computation
addiu v1, v1, -1 // (delay slot) one column less to go, both paths
// AC terms are all zero: replicate the scaled DC term down the column
lh s5, 0(a1) // quantptr[DCTSIZE*0]
lh s6, 0(a0) // inptr[DCTSIZE*0]
mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
sll s5, s5, 2 // dcval << 2 (presumably PASS1_BITS)
sw s5, 0(v0)
sw s5, 32(v0)
sw s5, 64(v0)
sw s5, 96(v0)
sw s5, 128(v0)
sw s5, 160(v0)
sw s5, 192(v0)
b 3f
sw s5, 224(v0) // (delay slot)
2:
// Odd part: t0 = tmp0 (row 7), t1 = tmp2 (row 3), t2 = tmp1 (row 5),
// t3 = tmp3 (row 1)
lh t0, 112(a1)
lh t2, 48(a1)
lh t4, 80(a1)
lh t6, 16(a1)
mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
// Prefetch the operands of rows 2 and 6 for the even part below
lh t4, 32(a1)
lh t5, 32(a0)
lh t6, 96(a1)
lh t7, 96(a0)
addu s0, t0, t1 // z3 = tmp0 + tmp2
addu s1, t1, t2 // z2 = tmp1 + tmp2
addu s2, t2, t3 // z4 = tmp1 + tmp3
addu s3, s0, s2 // z3 + z4
addiu t9, zero, 9633 // FIX_1_175875602
mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
addu t8, t0, t3 // z1 = tmp0 + tmp3
addiu t9, zero, 2446 // FIX_0_298631336
mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
addiu t9, zero, 16819 // FIX_2_053119869
mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
addiu t9, zero, 25172 // FIX_3_072711026
mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
addiu t9, zero, 12299 // FIX_1_501321110
mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
// z1..z4 are multiplied by positive constants here; the sign is applied
// by the subu instructions that follow
addiu t9, zero, 16069 // FIX_1_961570560
mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
addiu t9, zero, 3196 // FIX_0_390180644
mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
addiu t9, zero, 7373 // FIX_0_899976223
mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
addiu t9, zero, 20995 // FIX_2_562915447
mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
subu s0, s3, s0 // z3 += z5
addu t0, t0, s0 // tmp0 += z3
addu t1, t1, s0 // tmp2 += z3
subu s2, s3, s2 // z4 += z5
addu t2, t2, s2 // tmp1 += z4
addu t3, t3, s2 // tmp3 += z4
subu t0, t0, t8 // tmp0 += z1
subu t1, t1, s1 // tmp2 += z2
subu t2, t2, s1 // tmp1 += z2
subu t3, t3, t8 // tmp3 += z1
// Even part
mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
addiu t9, zero, 6270 // FIX_0_765366865
mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
lh t4, 0(a1)
lh t5, 0(a0)
lh t6, 64(a1)
lh t7, 64(a0)
mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
addiu t9, zero, 4433 // FIX_0_541196100
addu s3, s0, s1 // z2 + z3
mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
addiu t9, zero, 15137 // FIX_1_847759065
mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
addu t4, t5, t6
subu t5, t5, t6
sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
addu s0, t4, t7 // tmp10 = tmp0 + tmp3
subu s1, t4, t7 // tmp13 = tmp0 - tmp3
addu s2, t5, t6 // tmp11 = tmp1 + tmp2
subu s3, t5, t6 // tmp12 = tmp1 - tmp2
// Butterfly of the even and odd parts; each pair yields rows k and 7-k
addu t4, s0, t3 // row 0 = tmp10 + tmp3
subu s0, s0, t3 // row 7 = tmp10 - tmp3
addu t3, s2, t1 // row 1 = tmp11 + tmp2
subu s2, s2, t1 // row 6 = tmp11 - tmp2
addu t1, s3, t2 // row 2 = tmp12 + tmp1
subu s3, s3, t2 // row 5 = tmp12 - tmp1
addu t2, s1, t0 // row 3 = tmp13 + tmp0
subu s1, s1, t0 // row 4 = tmp13 - tmp0
// Rounding right shift by 11 (presumably CONST_BITS - PASS1_BITS)
shra_r.w t4, t4, 11
shra_r.w t3, t3, 11
shra_r.w t1, t1, 11
shra_r.w t2, t2, 11
shra_r.w s1, s1, 11
shra_r.w s3, s3, 11
shra_r.w s2, s2, 11
shra_r.w s0, s0, 11
sw t4, 0(v0)
sw t3, 32(v0)
sw t1, 64(v0)
sw t2, 96(v0)
sw s1, 128(v0)
sw s3, 160(v0)
sw s2, 192(v0)
sw s0, 224(v0)
3:
addiu a1, a1, 2 // next column of the multiplier table
addiu a0, a0, 2 // next column of the coefficient block
bgtz v1, 1b
addiu v0, v0, 4 // (delay slot) next column of the workspace
// ---- Pass 2: rows of the workspace -> output samples ----
move v0, sp // v0 = wsptr, back at the first row
addiu v1, zero, 8
4:
lw t0, 8(v0) // z2 = (JLONG) wsptr[2]
lw t1, 24(v0) // z3 = (JLONG) wsptr[6]
lw t2, 0(v0) // (JLONG) wsptr[0]
lw t3, 16(v0) // (JLONG) wsptr[4]
lw s4, 4(v0) // (JLONG) wsptr[1]
lw s5, 12(v0) // (JLONG) wsptr[3]
lw s6, 20(v0) // (JLONG) wsptr[5]
lw s7, 28(v0) // (JLONG) wsptr[7]
// OR wsptr[1..7] together
or s4, s4, t0
or s4, s4, t1
or s4, s4, t3
or s4, s4, s7
or s4, s4, s5
or s4, s4, s6
bnez s4, 5f // any of them non-zero: full computation
addiu v1, v1, -1 // (delay slot) one row less to go, both paths
// wsptr[1..7] are all zero: the whole output row is one replicated sample
shra_r.w s5, t2, 5 // round and shift wsptr[0] right by 5
andi s5, s5, 0x3ff // keep a 10-bit table index
lbux s5, s5(a3) // sample = range_limit[index]
lw s1, 0(a2) // s1 = output row pointer
replv.qb s5, s5 // replicate the byte into all four lanes
usw s5, 0(s1)
usw s5, 4(s1)
b 6f
nop
5:
// Even part
addu t4, t0, t1 // z2 + z3
addiu t8, zero, 4433 // FIX_0_541196100
mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
addiu t8, zero, 15137 // FIX_1_847759065
mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
addiu t8, zero, 6270 // FIX_0_765366865
mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
addu t4, t2, t3 // (JLONG) wsptr[0] + (JLONG) wsptr[4]
subu t2, t2, t3 // (JLONG) wsptr[0] - (JLONG) wsptr[4]
sll t4, t4, 13 // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS
sll t2, t2, 13 // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS
subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
subu t3, t2, t1 // tmp12 = tmp1 - tmp2
addu t2, t2, t1 // tmp11 = tmp1 + tmp2
addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
subu t1, t4, t5 // tmp13 = tmp0 - tmp3
addu t0, t4, t5 // tmp10 = tmp0 + tmp3
// Odd part: the odd terms are loaded again because s4 was overwritten by
// the zero test above
lw t4, 28(v0) // tmp0 = (JLONG) wsptr[7]
lw t6, 12(v0) // tmp2 = (JLONG) wsptr[3]
lw t5, 20(v0) // tmp1 = (JLONG) wsptr[5]
lw t7, 4(v0) // tmp3 = (JLONG) wsptr[1]
addu s0, t4, t6 // z3 = tmp0 + tmp2
addiu t8, zero, 9633 // FIX_1_175875602
addu s1, t5, t7 // z4 = tmp1 + tmp3
addu s2, s0, s1 // z3 + z4
mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
addu s3, t4, t7 // z1 = tmp0 + tmp3
addu t9, t5, t6 // z2 = tmp1 + tmp2
addiu t8, zero, 16069 // FIX_1_961570560
mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
addiu t8, zero, 3196 // FIX_0_390180644
mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
addiu t8, zero, 2446 // FIX_0_298631336
mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
addiu t8, zero, 7373 // FIX_0_899976223
mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
addiu t8, zero, 16819 // FIX_2_053119869
mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
addiu t8, zero, 20995 // FIX_2_562915447
mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
addiu t8, zero, 25172 // FIX_3_072711026
mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
addiu t8, zero, 12299 // FIX_1_501321110
mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
subu s0, s2, s0 // z3 += z5
subu s1, s2, s1 // z4 += z5
addu t4, t4, s0
subu t4, t4, s3 // tmp0
addu t5, t5, s1
subu t5, t5, t9 // tmp1
addu t6, t6, s0
subu t6, t6, t9 // tmp2
addu t7, t7, s1
subu t7, t7, s3 // tmp3
// Butterfly of the even and odd parts; each pair yields samples k and 7-k
addu s0, t0, t7 // sample 0 = tmp10 + tmp3
subu t0, t0, t7 // sample 7 = tmp10 - tmp3
addu t7, t2, t6 // sample 1 = tmp11 + tmp2
subu t2, t2, t6 // sample 6 = tmp11 - tmp2
addu t6, t3, t5 // sample 2 = tmp12 + tmp1
subu t3, t3, t5 // sample 5 = tmp12 - tmp1
addu t5, t1, t4 // sample 3 = tmp13 + tmp0
subu t1, t1, t4 // sample 4 = tmp13 - tmp0
// Rounding right shift by 18, then keep a 10-bit table index
shra_r.w s0, s0, 18
shra_r.w t7, t7, 18
shra_r.w t6, t6, 18
shra_r.w t5, t5, 18
shra_r.w t1, t1, 18
shra_r.w t3, t3, 18
shra_r.w t2, t2, 18
shra_r.w t0, t0, 18
andi s0, s0, 0x3ff
andi t7, t7, 0x3ff
andi t6, t6, 0x3ff
andi t5, t5, 0x3ff
andi t1, t1, 0x3ff
andi t3, t3, 0x3ff
andi t2, t2, 0x3ff
andi t0, t0, 0x3ff
lw s1, 0(a2) // s1 = output row pointer
// sample = range_limit[index]
lbux s0, s0(a3)
lbux t7, t7(a3)
lbux t6, t6(a3)
lbux t5, t5(a3)
lbux t1, t1(a3)
lbux t3, t3(a3)
lbux t2, t2(a3)
lbux t0, t0(a3)
sb s0, 0(s1)
sb t7, 1(s1)
sb t6, 2(s1)
sb t5, 3(s1)
sb t1, 4(s1)
sb t3, 5(s1)
sb t2, 6(s1)
sb t0, 7(s1)
6:
addiu v0, v0, 32 // next workspace row
bgtz v1, 4b
addiu a2, a2, 4 // (delay slot) next output row pointer
addiu sp, sp, 256 // release the workspace
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
j ra
nop
END(jsimd_idct_islow_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
/*
* a0 - inptr
* a1 - quantptr
* a2 - wsptr
* a3 - mips_idct_ifast_coefs
*
* Column pass of the fast inverse DCT.  Every 32-bit load fetches the 16-bit
* entries of two adjacent columns, and the packed-halfword (.ph) DSP
* instructions process both at once, so the loop runs until a0 has advanced
* by 16 bytes in steps of 4 (four iterations).  Rows of inptr, quantptr and
* wsptr are 16 bytes apart.
*
* The coefficient table pointer is kept in AT for the whole function; the
* words at offsets 0, 4, 8 and 12 are used as packed multipliers.
*
* If rows 1..7 of a column pair are all zero, the dequantized row-0 value is
* stored to all eight workspace rows.
*/
SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
addiu t9, a0, 16 // end address
or AT, a3, zero // AT = mips_idct_ifast_coefs
0:
lw s0, 0(a1) // quantptr[DCTSIZE*0]
lw t0, 0(a0) // inptr[DCTSIZE*0]
lw t1, 16(a0) // inptr[DCTSIZE*1]
muleq_s.w.phl v0, t0, s0 // tmp0 ...
lw t2, 32(a0) // inptr[DCTSIZE*2]
lw t3, 48(a0) // inptr[DCTSIZE*3]
lw t4, 64(a0) // inptr[DCTSIZE*4]
lw t5, 80(a0) // inptr[DCTSIZE*5]
muleq_s.w.phr t0, t0, s0 // ... tmp0 ...
lw t6, 96(a0) // inptr[DCTSIZE*6]
lw t7, 112(a0) // inptr[DCTSIZE*7]
// Zero test over rows 1..7, split across three branches so that useful
// work sits in every delay slot
or s4, t1, t2
or s5, t3, t4
bnez s4, 1f
ins t0, v0, 16, 16 // ... tmp0
bnez s5, 1f
or s6, t5, t6 // (delay slot)
or s6, s6, t7
bnez s6, 1f
sw t0, 0(a2) // wsptr[DCTSIZE*0]
// (the store above sits in the delay slot; on the full path the same slot
// is written again at the end of the computation)
sw t0, 16(a2) // wsptr[DCTSIZE*1]
sw t0, 32(a2) // wsptr[DCTSIZE*2]
sw t0, 48(a2) // wsptr[DCTSIZE*3]
sw t0, 64(a2) // wsptr[DCTSIZE*4]
sw t0, 80(a2) // wsptr[DCTSIZE*5]
sw t0, 96(a2) // wsptr[DCTSIZE*6]
sw t0, 112(a2) // wsptr[DCTSIZE*7]
addiu a0, a0, 4
b 2f
addiu a1, a1, 4 // (delay slot)
1:
// Full computation.  Each dequantization is a phl/phr multiply pair (left
// and right halfword) whose results are packed back into one register
// with ins.
// Even part
lw s1, 32(a1) // quantptr[DCTSIZE*2]
lw s2, 64(a1) // quantptr[DCTSIZE*4]
muleq_s.w.phl v0, t2, s1 // tmp1 ...
muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
lw s0, 16(a1) // quantptr[DCTSIZE*1]
lw s1, 48(a1) // quantptr[DCTSIZE*3]
lw s3, 96(a1) // quantptr[DCTSIZE*6]
muleq_s.w.phl v1, t4, s2 // tmp2 ...
muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
lw s2, 80(a1) // quantptr[DCTSIZE*5]
lw t8, 4(AT) // FIX(1.414213562)
ins t2, v0, 16, 16 // ... tmp1
muleq_s.w.phl v0, t6, s3 // tmp3 ...
muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
ins t4, v1, 16, 16 // ... tmp2
addq.ph s4, t0, t4 // tmp10
subq.ph s5, t0, t4 // tmp11
ins t6, v0, 16, 16 // ... tmp3
subq.ph s6, t2, t6 // tmp12 ...
addq.ph s7, t2, t6 // tmp13
mulq_s.ph s6, s6, t8 // ... tmp12 ...
addq.ph t0, s4, s7 // tmp0
subq.ph t6, s4, s7 // tmp3
// Odd part (dequantization interleaved with the end of the even part)
muleq_s.w.phl v0, t1, s0 // tmp4 ...
muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
shll_s.ph s6, s6, 1 // x2
lw s3, 112(a1) // quantptr[DCTSIZE*7]
subq.ph s6, s6, s7 // ... tmp12
muleq_s.w.phl v1, t7, s3 // tmp7 ...
muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
ins t1, v0, 16, 16 // ... tmp4
addq.ph t2, s5, s6 // tmp1
subq.ph t4, s5, s6 // tmp2
muleq_s.w.phl v0, t5, s2 // tmp6 ...
muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
ins t7, v1, 16, 16 // ... tmp7
addq.ph s5, t1, t7 // z11
subq.ph s6, t1, t7 // z12
muleq_s.w.phl v1, t3, s1 // tmp5 ...
muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
ins t5, v0, 16, 16 // ... tmp6
ins t3, v1, 16, 16 // ... tmp5
addq.ph s7, t5, t3 // z13
subq.ph v0, t5, t3 // z10
addq.ph t7, s5, s7 // tmp7
subq.ph s5, s5, s7 // tmp11 ...
addq.ph v1, v0, s6 // z5 ...
mulq_s.ph s5, s5, t8 // ... tmp11
lw t8, 8(AT) // FIX(1.847759065)
lw s4, 0(AT) // FIX(1.082392200)
addq.ph s0, t0, t7 // tmp0 + tmp7
subq.ph s1, t0, t7 // tmp0 - tmp7
mulq_s.ph v1, v1, t8 // ... z5
shll_s.ph s5, s5, 1 // x2
lw t8, 12(AT) // FIX(-2.613125930)
sw s0, 0(a2) // wsptr[DCTSIZE*0]
shll_s.ph v0, v0, 1 // x4
mulq_s.ph v0, v0, t8 // tmp12 ...
mulq_s.ph s4, s6, s4 // tmp10 ...
shll_s.ph v1, v1, 1 // x2
// The input pointers are advanced here, ahead of the remaining stores,
// which only use a2
addiu a0, a0, 4
addiu a1, a1, 4
sw s1, 112(a2) // wsptr[DCTSIZE*7]
shll_s.ph s6, v0, 1 // x4
shll_s.ph s4, s4, 1 // x2
addq.ph s6, s6, v1 // ... tmp12
subq.ph t5, s6, t7 // tmp6
subq.ph s4, s4, v1 // ... tmp10
subq.ph t3, s5, t5 // tmp5
addq.ph s2, t2, t5 // tmp1 + tmp6
addq.ph t1, s4, t3 // tmp4
subq.ph s3, t2, t5 // tmp1 - tmp6
sw s2, 16(a2) // wsptr[DCTSIZE*1]
sw s3, 96(a2) // wsptr[DCTSIZE*6]
addq.ph v0, t4, t3 // tmp2 + tmp5
subq.ph v1, t4, t3 // tmp2 - tmp5
sw v0, 32(a2) // wsptr[DCTSIZE*2]
sw v1, 80(a2) // wsptr[DCTSIZE*5]
addq.ph v0, t6, t1 // tmp3 + tmp4
subq.ph v1, t6, t1 // tmp3 - tmp4
sw v0, 64(a2) // wsptr[DCTSIZE*4]
sw v1, 48(a2) // wsptr[DCTSIZE*3]
2:
bne a0, t9, 0b
addiu a2, a2, 4 // (delay slot) next column pair of the workspace
RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
j ra
nop
END(jsimd_idct_ifast_cols_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
/*
* a0 - wsptr
* a1 - output_buf
* a2 - output_col
* a3 - mips_idct_ifast_coefs
*
* Row pass of the fast inverse DCT.  Two workspace rows (16 bytes each) are
* processed per iteration: their 16-bit entries are regrouped so that every
* register holds the same element of both rows, and the packed-halfword
* (.ph) DSP instructions then work on both rows at once.  Per iteration a0
* advances by 32 bytes and a1 by two row pointers; the loop ends when a0
* reaches wsptr + 128.
*
* a3 is reused as an output pointer inside the loop, so its incoming value
* is saved on the stack by SAVE_REGS_ON_STACK and reloaded into AT from
* 36(sp) at the top of every iteration.
*
* s8 holds 0x80808080 and is added to each group of four output bytes with
* addu.qb before they are stored (presumably the sample re-centering
* offset).
*/
SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
addiu t9, a0, 128 // end address
lui s8, 0x8080
ori s8, s8, 0x8080 // s8 = 0x80808080
0:
lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
precrq.ph.w t1, s0, t0 // B b
ins t0, s0, 16, 16 // A a
// Zero test over elements 1..7 of both rows; the next OR is computed in
// each branch delay slot
bnez t1, 1f
or s0, t2, s2
bnez s0, 1f
or s0, t4, s4
bnez s0, 1f
or s0, t6, s6
bnez s0, 1f
shll_s.ph s0, t0, 2 // A a
// Elements 1..7 are all zero: each output row is eight copies of one
// byte derived from its element 0
lw a3, 0(a1) // a3 = output_buf[0]
lw AT, 4(a1) // AT = output_buf[1]
precrq.ph.w t0, s0, s0 // A A
ins s0, s0, 16, 16 // a a
addu a3, a3, a2 // + output_col
addu AT, AT, a2 // + output_col
precrq.qb.ph t0, t0, t0 // A A A A
precrq.qb.ph s0, s0, s0 // a a a a
addu.qb s0, s0, s8
addu.qb t0, t0, s8
sw s0, 0(a3)
sw s0, 4(a3)
sw t0, 0(AT)
sw t0, 4(AT)
addiu a0, a0, 32
bne a0, t9, 0b
addiu a1, a1, 8 // (delay slot) next pair of row pointers
b 2f
nop
1:
// Full computation: finish regrouping the remaining elements the same
// way as elements 0 and 1 above
precrq.ph.w t3, s2, t2
ins t2, s2, 16, 16
precrq.ph.w t5, s4, t4
ins t4, s4, 16, 16
precrq.ph.w t7, s6, t6
ins t6, s6, 16, 16
// Even part
lw t8, 4(AT) // FIX(1.414213562)
addq.ph s4, t0, t4 // tmp10
subq.ph s5, t0, t4 // tmp11
subq.ph s6, t2, t6 // tmp12 ...
addq.ph s7, t2, t6 // tmp13
mulq_s.ph s6, s6, t8 // ... tmp12 ...
addq.ph t0, s4, s7 // tmp0
subq.ph t6, s4, s7 // tmp3
shll_s.ph s6, s6, 1 // x2
subq.ph s6, s6, s7 // ... tmp12
addq.ph t2, s5, s6 // tmp1
subq.ph t4, s5, s6 // tmp2
// Odd part
addq.ph s5, t1, t7 // z11
subq.ph s6, t1, t7 // z12
addq.ph s7, t5, t3 // z13
subq.ph v0, t5, t3 // z10
addq.ph t7, s5, s7 // tmp7
subq.ph s5, s5, s7 // tmp11 ...
addq.ph v1, v0, s6 // z5 ...
mulq_s.ph s5, s5, t8 // ... tmp11
lw t8, 8(AT) // FIX(1.847759065)
lw s4, 0(AT) // FIX(1.082392200)
addq.ph s0, t0, t7 // tmp0 + tmp7
subq.ph s7, t0, t7 // tmp0 - tmp7
mulq_s.ph v1, v1, t8 // ... z5
lw a3, 0(a1) // a3 = output_buf[0]
lw t8, 12(AT) // FIX(-2.613125930)
shll_s.ph s5, s5, 1 // x2
addu a3, a3, a2 // + output_col
shll_s.ph v0, v0, 1 // x4
mulq_s.ph v0, v0, t8 // tmp12 ...
mulq_s.ph s4, s6, s4 // tmp10 ...
shll_s.ph v1, v1, 1 // x2
// a0 and a1 are advanced here; the second row pointer is therefore read
// from -4(a1) further down
addiu a0, a0, 32
addiu a1, a1, 8
shll_s.ph s6, v0, 1 // x4
shll_s.ph s4, s4, 1 // x2
addq.ph s6, s6, v1 // ... tmp12
shll_s.ph s0, s0, 2
subq.ph t5, s6, t7 // tmp6
subq.ph s4, s4, v1 // ... tmp10
subq.ph t3, s5, t5 // tmp5
shll_s.ph s7, s7, 2
addq.ph t1, s4, t3 // tmp4
addq.ph s1, t2, t5 // tmp1 + tmp6
subq.ph s6, t2, t5 // tmp1 - tmp6
addq.ph s2, t4, t3 // tmp2 + tmp5
subq.ph s5, t4, t3 // tmp2 - tmp5
addq.ph s4, t6, t1 // tmp3 + tmp4
subq.ph s3, t6, t1 // tmp3 - tmp4
// Saturating left shift by 2 so that precrq.qb.ph below, which keeps the
// upper byte of every halfword, yields the output byte
shll_s.ph s1, s1, 2
shll_s.ph s2, s2, 2
shll_s.ph s3, s3, 2
shll_s.ph s4, s4, 2
shll_s.ph s5, s5, 2
shll_s.ph s6, s6, 2
// Separate the two rows again: lower-case letters belong to the first
// row, upper-case letters to the second
precrq.ph.w t0, s1, s0 // B A
ins s0, s1, 16, 16 // b a
precrq.ph.w t2, s3, s2 // D C
ins s2, s3, 16, 16 // d c
precrq.ph.w t4, s5, s4 // F E
ins s4, s5, 16, 16 // f e
precrq.ph.w t6, s7, s6 // H G
ins s6, s7, 16, 16 // h g
precrq.qb.ph t0, t2, t0 // D C B A
precrq.qb.ph s0, s2, s0 // d c b a
precrq.qb.ph t4, t6, t4 // H G F E
precrq.qb.ph s4, s6, s4 // h g f e
addu.qb s0, s0, s8
addu.qb s4, s4, s8
sw s0, 0(a3) // outptr[0/1/2/3] d c b a
sw s4, 4(a3) // outptr[4/5/6/7] h g f e
lw a3, -4(a1) // a3 = second row pointer of this pair
addu.qb t0, t0, s8
addu a3, a3, a2 // + output_col
addu.qb t4, t4, s8
sw t0, 0(a3) // outptr[0/1/2/3] D C B A
bne a0, t9, 0b
sw t4, 4(a3) // outptr[4/5/6/7] H G F E
2:
RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
j ra
nop
END(jsimd_idct_ifast_rows_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
/*
* a0 - data
*
* In-place forward DCT on an 8x8 block of 16-bit values (rows are 16 bytes
* apart).
*
* Pass 1 processes the 8 rows.  Two elements are held per register and the
* multiply-accumulate work is done with dpa.w.ph, which adds the two
* halfword products of its operands to an accumulator; the multipliers are
* therefore prepared as pairs of 16-bit constants in t0-t9.  Results are
* rounded and shifted right by 11.
*
* Pass 2 processes the 8 columns with scalar mult/madd/msub on the four
* accumulators; results are rounded and shifted right by 15 (2 for the two
* terms that need no multiplication).
*/
SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
// Packed multiplier pairs for pass 1: (upper halfword | lower halfword)
lui t0, 6437
ori t0, 2260 // t0 = 6437 | 2260
lui t1, 9633
ori t1, 11363 // t1 = 9633 | 11363
lui t2, 0xd39e
ori t2, 0xe6dc // t2 = -11362 | -6436
lui t3, 0xf72d
ori t3, 9633 // t3 = -2259 | 9633
lui t4, 2261
ori t4, 9633 // t4 = 2261 | 9633
lui t5, 0xd39e
ori t5, 6437 // t5 = -11362 | 6437
lui t6, 9633
ori t6, 0xd39d // t6 = 9633 | -11363
lui t7, 0xe6dc
ori t7, 2260 // t7 = -6436 | 2260
lui t8, 4433
ori t8, 10703 // t8 = 4433 | 10703
lui t9, 0xd630
ori t9, 4433 // t9 = -10704 | 4433
li s8, 8 // s8 = row counter
move a1, a0 // a1 = row pointer
// ---- Pass 1: rows ----
1:
lw s0, 0(a1) // tmp0 = 1|0
lw s1, 4(a1) // tmp1 = 3|2
lw s2, 8(a1) // tmp2 = 5|4
lw s3, 12(a1) // tmp3 = 7|6
packrl.ph s1, s1, s1 // tmp1 = 2|3
packrl.ph s3, s3, s3 // tmp3 = 6|7
subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
// Odd outputs (elements 1, 3, 5, 7), one accumulator each
mult $0, $0 // ac0 = 0
dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
mult $ac1, $0, $0 // ac1 = 0
dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
mult $ac2, $0, $0 // ac2 = 0
dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
mult $ac3, $0, $0 // ac3 = 0
dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
sh s0, 2(a1) // element 1
sh s1, 6(a1) // element 3
sh s2, 10(a1) // element 5
sh s3, 14(a1) // element 7
// Even outputs (elements 2 and 6 via accumulators, 0 and 4 directly)
mult $0, $0 // ac0 = 0
dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
mult $ac1, $0, $0 // ac1 = 0
dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
sra s4, s5, 16 // tmp4 = t11
addiu a1, a1, 16 // next row; the stores below use negative offsets
addiu s8, s8, -1
extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
// s5 still carries t11 in its upper halfword; only the lower halfword of
// the sum and difference is stored (sh)
addu s2, s5, s4 // tmp2 = t10 + t11
subu s3, s5, s4 // tmp3 = t10 - t11
sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
sh s2, -16(a1) // element 0
sh s3, -8(a1) // element 4
sh s0, -12(a1) // element 2
bgtz s8, 1b
sh s1, -4(a1) // (delay slot) element 6
// Scalar multipliers for pass 2; the c0..c10 names used in the comments
// below refer to t0..t9 and a1 in this order
li t0, 2260 // c0
li t1, 11363 // c1
li t2, 9633 // c2
li t3, 6436 // c3
li t4, 6437 // c4
li t5, 2261 // c5
li t6, 11362 // c6
li t7, 2259 // c7
li t8, 4433 // c8
li t9, 10703 // c9
li a1, 10704 // c10
li s8, 8 // s8 = column counter
// ---- Pass 2: columns ----
2:
lh a2, 0(a0) // 0
lh a3, 16(a0) // 8
lh v0, 32(a0) // 16
lh v1, 48(a0) // 24
lh s4, 64(a0) // 32
lh s5, 80(a0) // 40
lh s6, 96(a0) // 48
lh s7, 112(a0) // 56
addu s2, v0, s5 // tmp2 = 16 + 40
subu s5, v0, s5 // tmp5 = 16 - 40
addu s3, v1, s4 // tmp3 = 24 + 32
subu s4, v1, s4 // tmp4 = 24 - 32
addu s0, a2, s7 // tmp0 = 0 + 56
subu s7, a2, s7 // tmp7 = 0 - 56
addu s1, a3, s6 // tmp1 = 8 + 48
subu s6, a3, s6 // tmp6 = 8 - 48
addu a2, s0, s3 // tmp10 = tmp0 + tmp3
subu v1, s0, s3 // tmp13 = tmp0 - tmp3
addu a3, s1, s2 // tmp11 = tmp1 + tmp2
subu v0, s1, s2 // tmp12 = tmp1 - tmp2
// Odd outputs (rows 1, 3, 5, 7), one accumulator each
mult s7, t1 // ac0 = tmp7 * c1
madd s4, t0 // ac0 += tmp4 * c0
madd s5, t4 // ac0 += tmp5 * c4
madd s6, t2 // ac0 += tmp6 * c2
mult $ac1, s7, t2 // ac1 = tmp7 * c2
msub $ac1, s4, t3 // ac1 -= tmp4 * c3
msub $ac1, s5, t6 // ac1 -= tmp5 * c6
msub $ac1, s6, t7 // ac1 -= tmp6 * c7
mult $ac2, s7, t4 // ac2 = tmp7 * c4
madd $ac2, s4, t2 // ac2 += tmp4 * c2
madd $ac2, s5, t5 // ac2 += tmp5 * c5
msub $ac2, s6, t6 // ac2 -= tmp6 * c6
mult $ac3, s7, t0 // ac3 = tmp7 * c0
msub $ac3, s4, t1 // ac3 -= tmp4 * c1
madd $ac3, s5, t2 // ac3 += tmp5 * c2
msub $ac3, s6, t3 // ac3 -= tmp6 * c3
extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
addiu s8, s8, -1
addu s4, a2, a3 // tmp4 = tmp10 + tmp11
subu s5, a2, a3 // tmp5 = tmp10 - tmp11
sh s0, 16(a0) // row 1
sh s1, 48(a0) // row 3
sh s2, 80(a0) // row 5
sh s3, 112(a0) // row 7
// Even outputs (rows 2 and 6 via accumulators, 0 and 4 directly)
mult v0, t8 // ac0 = tmp12 * c8
madd v1, t9 // ac0 += tmp13 * c9
mult $ac1, v1, t8 // ac1 = tmp13 * c8
msub $ac1, v0, a1 // ac1 -= tmp12 * c10
addiu a0, a0, 2 // next column; the stores below compensate with -2
extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
sh s4, -2(a0) // row 0
sh s5, 62(a0) // row 4
sh s6, 30(a0) // row 2
bgtz s8, 2b
sh s7, 94(a0) // (delay slot) row 6
RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
jr ra
nop
END(jsimd_fdct_islow_mips_dspr2)
/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
/*
* a0 - data
*/
.set at
SAVE_REGS_ON_STACK 8, s0, s1
li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
move v0, a0
addiu v1, v0, 128 // end address
0:
lw t0, 0(v0) // tmp0 = 1|0
lw t1, 4(v0) // tmp1 = 3|2
lw t2, 8(v0) // tmp2 = 5|4
lw t3, 12(v0) // tmp3 = 7|6
packrl.ph t1, t1, t1 // tmp1 = 2|3
packrl.ph t3, t3, t3 // tmp3 = 6|7
subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4
subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7
addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3
addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0
addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10
subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13
sra t4, t8, 16 // tmp4 = t11
mult $0, $0 // ac0 = 0
dpa.w.ph $ac0, t9, s1
mult $ac1, $0, $0 // ac1 = 0
dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98
dpsx.w.ph $ac1, t5, a3 // ac1 += t6*98 + t7*98
mult $ac2, $0, $0 // ac2 = 0
dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139
mult $ac3, $0, $0 // ac3 = 0
dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334
precrq.ph.w t0, t5, t7 // t0 = t5|t6
addq.ph t2, t8, t4 // tmp2 = t10 + t11
subq.ph t3, t8, t4 // tmp3 = t10 - t11
extr.w t4, $ac0, 8
mult $0, $0 // ac0 = 0
dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181
extr.w t0, $ac1, 8 // t0 = z5
extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139)
extr.w t7, $ac3, 8 // t2 = MULTIPLY(tmp12, 334)
extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11,