src/lib/vampimg/JpegLib/imjidctasm.pas

   1 unit imjidctasm;
   2
   3 { This file contains a slow-but-accurate integer implementation of the
   4   inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
   5   must also perform dequantization of the input coefficients.
   6
   7   A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
   8   on each row (or vice versa, but it's more convenient to emit a row at
   9   a time).  Direct algorithms are also available, but they are much more
  10   complex and seem not to be any faster when reduced to code.
  11
  12   This implementation is based on an algorithm described in
  13     C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  14     Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  15     Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  16   The primary algorithm described there uses 11 multiplies and 29 adds.
  17   We use their alternate method with 12 multiplies and 32 adds.
  18   The advantage of this method is that no data path contains more than one
  19   multiplication; this allows a very simple and accurate implementation in
  20   scaled fixed-point arithmetic, with a minimal number of shifts. }
  21
  22 { Original : jidctint.c ;  Copyright (C) 1991-1996, Thomas G. Lane. }
  23 { ;-------------------------------------------------------------------------
  24   ; JIDCTINT.ASM
  25   ; 80386 protected mode assembly translation of JIDCTINT.C
  26   ; **** Optimized to all hell by Jason M. Felice (jasonf@apk.net) ****
  27   ; **** E-mail welcome                      ****
  28   ;
  29   ; ** This code does not make O/S calls -- use it for OS/2, Win95, WinNT,
  30   ; ** DOS prot. mode., Linux, whatever... have fun.
  31   ;
  32   ; ** Note, this code is dependent on the structure member order in the .h
  33   ; ** files for the following structures:
  34   ; -- amazingly NOT j_decompress_struct... cool.
  35   ; -- jpeg_component_info (dependent on position of dct_table element)
  36   ;
  37   ; Originally created with the /Fa option of MSVC 4.0 (why work when you
  38   ; don't have to?)
  39   ;
  40   ; (this code, when compiled is 1K bytes smaller than the optimized MSVC
  41   ; release build, not to mention 120-130 ms faster in my profile test with 1
  42   ; small color and 1 medium black-and-white jpeg: stats using TASM 4.0
  43   ; and MSVC 4.0 to create a non-console app; jpeg_idct_islow accumulated
  44   ; 5,760 hits on all trials)
  45   ;
  46   ; TASM -t -ml -os jidctint.asm, jidctint.obj
  47   ;-------------------------------------------------------------------------
  48    Converted to Delphi 2.0 BASM for PasJPEG
  49    by Jacques NOMSSI NZALI  <nomssi@physik.tu-chemnitz.de>
  50    October 13th 1996
  51     * assumes Delphi "register" calling convention
  52         first 3 parameter are in EAX,EDX,ECX
  53     * register allocation revised
  54 }
  55
  56 interface
  57
  58 {$I imjconfig.inc}
  59
  60 uses
  61   imjmorecfg,
  62   imjinclude,
  63   imjpeglib,
  64   imjdct;         { Private declarations for DCT subsystem }
  65
  66 { Perform dequantization and inverse DCT on one block of coefficients. }
  67 { Declaration only; the body is in the implementation section of this unit. }
  68 {GLOBAL}
  69 procedure jpeg_idct_islow (cinfo : j_decompress_ptr;
  70                           compptr : jpeg_component_info_ptr;
  71               coef_block : JCOEFPTR;
  72               output_buf : JSAMPARRAY;
  73                           output_col : JDIMENSION);
  74
  75 implementation
  76
  77 { This module is specialized to the case DCTSIZE = 8. }
  78 { Unless DCTSIZE_IS_8 is defined, the plain-text line below is compiled as code and stops the build. }
  79 {$ifndef DCTSIZE_IS_8}
  80   Sorry, this code only copes with 8x8 DCTs. { deliberate syntax err }
  81 {$endif}
  82
  83 { The poop on this scaling stuff is as follows:
  84
  85   Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
  86   larger than the true IDCT outputs.  The final outputs are therefore
  87   a factor of N larger than desired; since N=8 this can be cured by
  88   a simple right shift at the end of the algorithm.  The advantage of
  89   this arrangement is that we save two multiplications per 1-D IDCT,
  90   because the y0 and y4 inputs need not be divided by sqrt(N).
  91
  92   We have to do addition and subtraction of the integer inputs, which
  93   is no problem, and multiplication by fractional constants, which is
  94   a problem to do in integer arithmetic.  We multiply all the constants
  95   by CONST_SCALE and convert them to integer constants (thus retaining
  96   CONST_BITS bits of precision in the constants).  After doing a
  97   multiplication we have to divide the product by CONST_SCALE, with proper
  98   rounding, to produce the correct output.  This division can be done
  99   cheaply as a right shift of CONST_BITS bits.  We postpone shifting
 100   as long as possible so that partial sums can be added together with
 101   full fractional precision.
 102
 103   The outputs of the first pass are scaled up by PASS1_BITS bits so that
 104   they are represented to better-than-integral precision.  These outputs
 105   require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
 106   with the recommended scaling.  (To scale up 12-bit sample data further, an
 107   intermediate INT32 array would be needed.)
 108
 109   To avoid overflow of the 32-bit intermediate results in pass 2, we must
 110   have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
 111   shows that the values given below are the most effective. }
 112
 113 const
 114   CONST_BITS = 13;  { number of fractional bits carried by the FIX_ multipliers below }
 115 { PASS1_BITS is the extra left shift applied to the pass 1 results; its value depends on the sample width switch. }
 116 {$ifdef BITS_IN_JSAMPLE_IS_8}
 117 const
 118   PASS1_BITS = 2;
 119 {$else}
 120 const
 121   PASS1_BITS = 1; { lose a little precision to avoid overflow }
 122 {$endif}
 123
 124 const
 125   CONST_SCALE = (INT32(1) shl CONST_BITS);  { 2**CONST_BITS, the scale factor of the FIX_ constants }
 126 { Each FIX_ constant is the named fraction times CONST_SCALE, rounded; the resulting integer is given in the trailing comment. }
 127 const
 128   FIX_0_298631336 = INT32(Round(CONST_SCALE * 0.298631336));  {2446}
 129   FIX_0_390180644 = INT32(Round(CONST_SCALE * 0.390180644));  {3196}
 130   FIX_0_541196100 = INT32(Round(CONST_SCALE * 0.541196100));  {4433}
 131   FIX_0_765366865 = INT32(Round(CONST_SCALE * 0.765366865));  {6270}
 132   FIX_0_899976223 = INT32(Round(CONST_SCALE * 0.899976223));  {7373}
 133   FIX_1_175875602 = INT32(Round(CONST_SCALE * 1.175875602));  {9633}
 134   FIX_1_501321110 = INT32(Round(CONST_SCALE * 1.501321110));  {12299}
 135   FIX_1_847759065 = INT32(Round(CONST_SCALE * 1.847759065));  {15137}
 136   FIX_1_961570560 = INT32(Round(CONST_SCALE * 1.961570560));  {16069}
 137   FIX_2_053119869 = INT32(Round(CONST_SCALE * 2.053119869));  {16819}
 138   FIX_2_562915447 = INT32(Round(CONST_SCALE * 2.562915447));  {20995}
 139   FIX_3_072711026 = INT32(Round(CONST_SCALE * 3.072711026));  {25172}
 140
 141 { Rounding terms: each is half of the divisor implied by the right shift it precedes. }
 142 { for DESCALE }
 143 const
 144   ROUND_CONST = (INT32(1) shl (CONST_BITS-PASS1_BITS-1));  { added before the pass 1 shift by CONST_BITS-PASS1_BITS }
 145 const
 146   ROUND_CONST_2 = (INT32(1) shl (CONST_BITS+PASS1_BITS+3-1));  { added before the pass 2 shift by CONST_BITS+PASS1_BITS+3 }
 147
 148 { Perform dequantization and inverse DCT on one block of coefficients: pass 1 transforms the columns of coef_block into a local workspace, pass 2 transforms the workspace rows and writes 8 range-limited samples per row to output_buf^[row] + output_col. }
 149 { Register use on entry, as the body reads it: EAX = cinfo, EDX = compptr, ECX = coef_block; output_buf and output_col are accessed by name. EDI, ESI and EBX are pushed on entry and popped before return. }
 150 {GLOBAL}
 151 procedure jpeg_idct_islow (cinfo : j_decompress_ptr;
 152                            compptr : jpeg_component_info_ptr;
 153                coef_block : JCOEFPTR;
 154                output_buf : JSAMPARRAY;
 155                            output_col : JDIMENSION);
 156 type
 157   PWorkspace = ^TWorkspace;
 158   TWorkspace = coef_bits_field; { buffers data between passes }
 159 const
 160   coefDCTSIZE = DCTSIZE*SizeOf(JCOEF);  { byte distance between vertically adjacent coefficients of the input block }
 161   wrkDCTSIZE = DCTSIZE*SizeOf(int);  { byte distance between workspace rows; pass 1 also uses it to step through the quantization table }
 162 var
 163   tmp0, tmp1, tmp2, tmp3 : INT32;
 164   tmp10, tmp11, tmp12, tmp13 : INT32;
 165   z1, z2, z3, z4, z5 : INT32;  { NOTE(review): z5 is never referenced by the asm body, it only lives in a register }
 166 var
 167   inptr : JCOEFPTR;  { not referenced by the asm body; the pointer is kept in ESI during pass 1 }
 168   quantptr : ISLOW_MULT_TYPE_FIELD_PTR;  { not referenced by the asm body; kept in EDI during pass 1 }
 169   wsptr : PWorkspace;  { not referenced by the asm body; kept in ECX during pass 1 and in ESI during pass 2 }
 170   outptr : JSAMPROW;  { not referenced by the asm body; kept in EDI during pass 2 }
 171 var
 172   range_limit : JSAMPROW;
 173   ctr : int;
 174   workspace : TWorkspace;
 175 var
 176   dcval : int;  { not referenced by the asm body }
 177 var
 178   dcval_ : JSAMPLE;  { not referenced by the asm body }
 179 asm
 180   push  edi
 181   push  esi
 182   push  ebx
 183   { EDI, ESI and EBX are popped again, in reverse order, just before the final end. }
 184   cld { The only direction we use, might as well set it now, as opposed }
 185         { to inside 2 loops. }
 186
 187 { Each IDCT routine is responsible for range-limiting its results and
 188   converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
 189   be quite far out of range if the input data is corrupt, so a bulletproof
 190   range-limiting step is required.  We use a mask-and-table-lookup method
 191   to do the combined operations quickly.  See the comments with
 192   prepare_range_limit_table (in jdmaster.c) for more info. }
 193
 194   {range_limit := JSAMPROW(@(cinfo^.sample_range_limit^[CENTERJSAMPLE]));}
 195   mov eax, [eax].jpeg_decompress_struct.sample_range_limit {eax=cinfo}
 196   add eax, (MAXJSAMPLE+1 + CENTERJSAMPLE)*(Type JSAMPLE)
 197   mov range_limit, eax
 198   { NOTE(review): the offset above presumably assumes the table type starts at index -(MAXJSAMPLE+1); verify against its declaration. }
 199   { Pass 1: process columns from input, store into work array. }
 200   { Note results are scaled up by sqrt(8) compared to a true IDCT; }
 201   { furthermore, we scale the results by 2**PASS1_BITS. }
 202
 203   {inptr := coef_block;}
 204   mov esi, ecx     { ecx=coef_block }
 205   {quantptr := ISLOW_MULT_TYPE_FIELD_PTR (compptr^.dct_table);}
 206   mov edi, [edx].jpeg_component_info.dct_table  { edx=compptr }
 207
 208   {wsptr := PWorkspace(@workspace);}
 209   lea ecx, workspace  { ecx = wsptr until the end of pass 1 }
 210
 211   {for ctr := pred(DCTSIZE) downto 0 do
 212   begin}
 213   mov ctr, DCTSIZE  { ctr holds the number of columns still to do; the loop leaves when it reaches 0 }
 214 @loop518:
 215     { Due to quantization, we will usually find that many of the input
 216       coefficients are zero, especially the AC terms.  We can exploit this
 217       by short-circuiting the IDCT calculation for any column in which all
 218       the AC terms are zero.  In that case each output is equal to the
 219       DC coefficient (with scale factor as needed).
 220       With typical images and quantization tables, half or more of the
 221       column DCT calculations can be simplified this way. }
 222     { Register roles inside this loop: ESI = inptr, EDI = quantptr, ECX = wsptr. }
 223     {if ((inptr^[DCTSIZE*1]) or (inptr^[DCTSIZE*2]) or (inptr^[DCTSIZE*3]) or
 224   (inptr^[DCTSIZE*4]) or (inptr^[DCTSIZE*5]) or (inptr^[DCTSIZE*6]) or
 225   (inptr^[DCTSIZE*7]) = 0) then
 226     begin}
 227   mov eax, DWORD PTR [esi+coefDCTSIZE*1]
 228   or  eax, DWORD PTR [esi+coefDCTSIZE*2]
 229   or  eax, DWORD PTR [esi+coefDCTSIZE*3]
 230   mov edx, DWORD PTR [esi+coefDCTSIZE*4]
 231   or    eax, edx
 232   or  eax, DWORD PTR [esi+coefDCTSIZE*5]
 233   or  eax, DWORD PTR [esi+coefDCTSIZE*6]
 234   or  eax, DWORD PTR [esi+coefDCTSIZE*7]
 235   jne @loop520  { some AC term of this column is nonzero: take the full computation }
 236   { Fall-through: the OR of the seven DWORD loads above is zero. }
 237       { AC terms all zero }
 238       {dcval := ISLOW_MULT_TYPE(inptr^[DCTSIZE*0]) *
 239                (quantptr^[DCTSIZE*0]) shl PASS1_BITS;}
 240   mov eax, DWORD PTR [esi+coefDCTSIZE*0]
 241   imul  eax, DWORD PTR [edi+wrkDCTSIZE*0]
 242   shl eax, PASS1_BITS
 243
 244   {wsptr^[DCTSIZE*0] := dcval;
 245   wsptr^[DCTSIZE*1] := dcval;
 246   wsptr^[DCTSIZE*2] := dcval;
 247   wsptr^[DCTSIZE*3] := dcval;
 248   wsptr^[DCTSIZE*4] := dcval;
 249   wsptr^[DCTSIZE*5] := dcval;
 250   wsptr^[DCTSIZE*6] := dcval;
 251   wsptr^[DCTSIZE*7] := dcval;}
 252
 253   mov DWORD PTR [ecx+ wrkDCTSIZE*0], eax
 254   mov DWORD PTR [ecx+ wrkDCTSIZE*1], eax
 255   mov DWORD PTR [ecx+ wrkDCTSIZE*2], eax
 256   mov DWORD PTR [ecx+ wrkDCTSIZE*3], eax
 257   mov DWORD PTR [ecx+ wrkDCTSIZE*4], eax
 258   mov DWORD PTR [ecx+ wrkDCTSIZE*5], eax
 259   mov DWORD PTR [ecx+ wrkDCTSIZE*6], eax
 260   mov DWORD PTR [ecx+ wrkDCTSIZE*7], eax
 261
 262       {Inc(JCOEF_PTR(inptr));   { advance pointers to next column }
 263       {Inc(ISLOW_MULT_TYPE_PTR(quantptr));
 264       Inc(int_ptr(wsptr));
 265       continue;}
 266   dec ctr
 267   je  @loop519  { that was the last column: go to pass 2 }
 268   { More columns remain: step all three pointers by one element and loop. }
 269   add   esi, Type JCOEF
 270   add edi, Type ISLOW_MULT_TYPE
 271   add ecx, Type int  { int_ptr }
 272   jmp @loop518
 273
 274 @loop520:
 275
 276     {end;}
 277
 278     { Even part: reverse the even part of the forward DCT. }
 279     { The rotator is sqrt(2)*c(-6). }
 280
 281     {z2 := ISLOW_MULT_TYPE(inptr^[DCTSIZE*2]) * quantptr^[DCTSIZE*2];
 282     z3 := ISLOW_MULT_TYPE(inptr^[DCTSIZE*6]) * quantptr^[DCTSIZE*6];
 283
 284     z1 := (z2 + z3) * INT32(FIX_0_541196100);
 285     tmp2 := z1 + INT32(z3) * INT32(- FIX_1_847759065);
 286     tmp3 := z1 + INT32(z2) * INT32(FIX_0_765366865);}
 287   { In the code below edx = z2, eax = z3 and ebx = z1. }
 288   mov edx, DWORD PTR [esi+coefDCTSIZE*2]
 289   imul  edx, DWORD PTR [edi+wrkDCTSIZE*2]  {z2}
 290
 291   mov eax, DWORD PTR [esi+coefDCTSIZE*6]
 292   imul  eax, DWORD PTR [edi+wrkDCTSIZE*6]  {z3}
 293
 294   lea   ebx, [eax+edx]
 295   imul  ebx, FIX_0_541196100               {z1}
 296
 297   imul  eax, (-FIX_1_847759065)
 298   add   eax, ebx
 299   mov   tmp2, eax
 300
 301   imul  edx, FIX_0_765366865
 302   add   edx, ebx
 303   mov   tmp3, edx
 304
 305     {z2 := ISLOW_MULT_TYPE(inptr^[DCTSIZE*0]) * quantptr^[DCTSIZE*0];
 306     z3 := ISLOW_MULT_TYPE(inptr^[DCTSIZE*4]) * quantptr^[DCTSIZE*4];}
 307
 308   mov edx, DWORD PTR [esi+coefDCTSIZE*4]
 309   imul  edx, DWORD PTR [edi+wrkDCTSIZE*4]      { z3 = edx }
 310
 311   mov eax, DWORD PTR [esi+coefDCTSIZE*0]
 312   imul  eax, DWORD PTR [edi+wrkDCTSIZE*0]      { z2 = eax }
 313
 314     {tmp0 := (z2 + z3) shl CONST_BITS;
 315     tmp1 := (z2 - z3) shl CONST_BITS;}
 316   lea ebx,[eax+edx]
 317   sub eax, edx
 318   shl ebx, CONST_BITS                          { tmp0 = ebx }
 319   shl eax, CONST_BITS                          { tmp1 = eax }
 320
 321     {tmp10 := tmp0 + tmp3;
 322     tmp13 := tmp0 - tmp3;}
 323   mov edx, tmp3
 324   sub ebx, edx  { ebx = tmp0 - tmp3 = tmp13 }
 325   mov tmp13, ebx
 326   add edx, edx  { edx = 2*tmp3, so the next add turns tmp0-tmp3 into tmp0+tmp3 }
 327   add ebx, edx
 328   mov tmp10, ebx
 329
 330     {tmp11 := tmp1 + tmp2;
 331     tmp12 := tmp1 - tmp2;}
 332   mov   ebx, tmp2
 333   sub   eax, ebx  { eax = tmp1 - tmp2 = tmp12 }
 334   mov   tmp12, eax
 335   add   ebx, ebx  { ebx = 2*tmp2, so the next add yields tmp1 + tmp2 }
 336   add   eax, ebx
 337   mov tmp11, eax
 338
 339     { Odd part per figure 8; the matrix is unitary and hence its
 340       transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively. }
 341     { The variables tmp0..tmp3 first receive values derived from the dequantized inputs; the unscaled tmp0 and tmp1 stay in edx and ebx. }
 342     {tmp0 := ISLOW_MULT_TYPE(inptr^[DCTSIZE*7]) * quantptr^[DCTSIZE*7];}
 343   mov eax, DWORD PTR [esi+coefDCTSIZE*7]
 344   imul  eax, DWORD PTR [edi+wrkDCTSIZE*7]
 345   mov   edx, eax                            { edx = tmp0 }
 346     {tmp0 := (tmp0) * INT32(FIX_0_298631336); { sqrt(2) * (-c1+c3+c5-c7) }
 347   imul  eax, FIX_0_298631336
 348   mov tmp0, eax
 349
 350     {tmp3 := ISLOW_MULT_TYPE(inptr^[DCTSIZE*1]) * quantptr^[DCTSIZE*1];}
 351   mov eax, DWORD PTR [esi+coefDCTSIZE*1]
 352   imul  eax, DWORD PTR [edi+wrkDCTSIZE*1]
 353   mov tmp3, eax
 354
 355     {z1 := tmp0 + tmp3;}
 356     {z1 := (z1) * INT32(- FIX_0_899976223); { sqrt(2) * (c7-c3) }
 357   add eax, edx
 358   imul eax, (-FIX_0_899976223)
 359   mov  z1, eax
 360
 361     {tmp1 := ISLOW_MULT_TYPE(inptr^[DCTSIZE*5]) * quantptr^[DCTSIZE*5];}
 362   mov eax, DWORD PTR [esi+coefDCTSIZE*5]
 363   imul  eax, DWORD PTR [edi+wrkDCTSIZE*5]
 364   mov ebx, eax                            { ebx = tmp1 }
 365     {tmp1 := (tmp1) * INT32(FIX_2_053119869); { sqrt(2) * ( c1+c3-c5+c7) }
 366   imul  eax, FIX_2_053119869
 367   mov tmp1, eax
 368
 369     {tmp2 := ISLOW_MULT_TYPE(inptr^[DCTSIZE*3]) * quantptr^[DCTSIZE*3];}
 370   mov eax, DWORD PTR [esi+coefDCTSIZE*3]
 371   imul  eax, DWORD PTR [edi+wrkDCTSIZE*3]
 372   mov tmp2, eax
 373
 374     {z3 := tmp0 + tmp2;}
 375   add edx, eax                              { edx = z3 }
 376
 377     {z2 := tmp1 + tmp2;}
 378     {z2 := (z2) * INT32(- FIX_2_562915447); { sqrt(2) * (-c1-c3) }
 379   add eax, ebx
 380   imul  eax, (-FIX_2_562915447)
 381   mov z2, eax
 382
 383     {z4 := tmp1 + tmp3;}
 384   add ebx, tmp3                             { ebx = z4 }
 385
 386     {z5 := INT32(z3 + z4) * INT32(FIX_1_175875602); { sqrt(2) * c3 }
 387   lea   eax, [edx+ebx]
 388   imul eax, FIX_1_175875602                   { eax = z5 }
 389
 390     {z4 := (z4) * INT32(- FIX_0_390180644); { sqrt(2) * (c5-c3) }
 391     {Inc(z4, z5);}
 392   imul   ebx, (-FIX_0_390180644)
 393   add    ebx, eax
 394   mov    z4, ebx
 395
 396     {z3 := (z3) * INT32(- FIX_1_961570560); { sqrt(2) * (-c3-c5) }
 397     {Inc(z3, z5);}
 398   imul edx, (-FIX_1_961570560)
 399   add  eax, edx                        { z3 = eax }
 400
 401     {Inc(tmp0, z1 + z3);}
 402   mov   ebx, z1
 403   add ebx, eax
 404   add tmp0, ebx
 405
 406     {tmp2 := (tmp2) * INT32(FIX_3_072711026); { sqrt(2) * ( c1+c3+c5-c7) }
 407     {Inc(tmp2, z2 + z3);}
 408   mov   ebx, tmp2
 409   imul  ebx, FIX_3_072711026
 410   mov edx, z2                        { z2 = edx }
 411   add   ebx, edx
 412   add   eax, ebx
 413   mov tmp2, eax
 414
 415     {Inc(tmp1, z2 + z4);}
 416   mov   eax, z4                        { z4 = eax }
 417   add   edx, eax
 418   add   tmp1, edx
 419
 420     {tmp3 := (tmp3) * INT32(FIX_1_501321110); { sqrt(2) * ( c1+c3-c5-c7) }
 421     {Inc(tmp3, z1 + z4);}
 422   mov edx, tmp3
 423   imul  edx, FIX_1_501321110
 424
 425   add edx, eax
 426   add   edx, z1                        { tmp3 = edx }
 427
 428     { Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 }
 429     { In each pair below ROUND_CONST is added once to the tmp1x term and then shared by the sum and the difference. }
 430     {wsptr^[DCTSIZE*0] := int (DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS));}
 431     {wsptr^[DCTSIZE*7] := int (DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS));}
 432   mov eax, tmp10
 433   add   eax, ROUND_CONST
 434   lea   ebx, [eax+edx]
 435   sar ebx, CONST_BITS-PASS1_BITS
 436   mov DWORD PTR [ecx+wrkDCTSIZE*0], ebx
 437
 438   sub eax, edx
 439   sar eax, CONST_BITS-PASS1_BITS
 440   mov DWORD PTR [ecx+wrkDCTSIZE*7], eax
 441
 442     {wsptr^[DCTSIZE*1] := int (DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS));}
 443     {wsptr^[DCTSIZE*6] := int (DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS));}
 444   mov eax, tmp11
 445   add   eax, ROUND_CONST
 446   mov   edx, tmp2
 447   lea ebx, [eax+edx]
 448   sar ebx, CONST_BITS-PASS1_BITS
 449   mov DWORD PTR [ecx+wrkDCTSIZE*1], ebx
 450
 451   sub eax, edx
 452   sar eax, CONST_BITS-PASS1_BITS
 453   mov DWORD PTR [ecx+wrkDCTSIZE*6], eax
 454
 455     {wsptr^[DCTSIZE*2] := int (DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS));}
 456     {wsptr^[DCTSIZE*5] := int (DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS));}
 457   mov eax, tmp12
 458   add   eax, ROUND_CONST
 459   mov   edx, tmp1
 460   lea ebx, [eax+edx]
 461   sar ebx, CONST_BITS-PASS1_BITS
 462   mov DWORD PTR [ecx+wrkDCTSIZE*2], ebx
 463
 464   sub eax, edx
 465   sar eax, CONST_BITS-PASS1_BITS
 466   mov DWORD PTR [ecx+wrkDCTSIZE*5], eax
 467
 468     {wsptr^[DCTSIZE*3] := int (DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS));}
 469     {wsptr^[DCTSIZE*4] := int (DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS));}
 470   mov eax, tmp13
 471   add   eax, ROUND_CONST
 472   mov   edx, tmp0
 473   lea   ebx, [eax+edx]
 474   sar ebx, CONST_BITS-PASS1_BITS
 475   mov DWORD PTR [ecx+wrkDCTSIZE*3], ebx
 476
 477   sub eax, edx
 478   sar eax, CONST_BITS-PASS1_BITS
 479   mov DWORD PTR [ecx+wrkDCTSIZE*4], eax
 480
 481     {Inc(JCOEF_PTR(inptr));   { advance pointers to next column }
 482     {Inc(ISLOW_MULT_TYPE_PTR(quantptr));
 483     Inc(int_ptr(wsptr));}
 484   dec ctr
 485   je  @loop519  { that was the last column: go to pass 2 }
 486
 487   add   esi, Type JCOEF
 488   add edi, Type ISLOW_MULT_TYPE
 489   add ecx, Type int  { int_ptr }
 490   {end;}
 491   jmp @loop518
 492 @loop519:
 493   { Save to memory what we've registerized for the preceding loop. }
 494   { NOTE(review): no store follows the comment above; the next instruction simply reloads ESI for pass 2. }
 495   { Pass 2: process rows from work array, store into output array. }
 496   { Note that we must descale the results by a factor of 8 == 2**3, }
 497   { and also undo the PASS1_BITS scaling. }
 498
 499   {wsptr := @workspace;}
 500   lea esi, workspace  { esi = wsptr for all of pass 2 }
 501
 502   {for ctr := 0 to pred(DCTSIZE) do
 503   begin}
 504   mov ctr, 0  { ctr = index of the row being produced; the loop runs while ctr < DCTSIZE }
 505 @loop523:
 506
 507     {outptr := output_buf^[ctr];}
 508   mov eax, ctr
 509   mov ebx, output_buf
 510   mov edi, DWORD PTR [ebx+eax*4]           { 4 = SizeOf(pointer) }
 511
 512     {Inc(JSAMPLE_PTR(outptr), output_col);}
 513   add edi, LongWord(output_col)  { edi = outptr; output_col is added as a byte offset }
 514
 515     { Rows of zeroes can be exploited in the same way as we did with columns.
 516       However, the column calculation has created many nonzero AC terms, so
 517       the simplification applies less often (typically 5% to 10% of the time).
 518       On machines with very fast multiplication, it's possible that the
 519       test takes more time than it's worth.  In that case this section
 520       may be commented out. }
 521
 522 {$ifndef NO_ZERO_ROW_TEST}
 523     {if ((wsptr^[1]) or (wsptr^[2]) or (wsptr^[3]) or (wsptr^[4]) or
 524         (wsptr^[5]) or (wsptr^[6]) or (wsptr^[7]) = 0) then
 525     begin}
 526   mov eax, DWORD PTR [esi+4*1]
 527   or  eax, DWORD PTR [esi+4*2]
 528   or  eax, DWORD PTR [esi+4*3]
 529         jne     @loop525            { Nomssi: early exit path may help }
 530   or  eax, DWORD PTR [esi+4*4]
 531   or  eax, DWORD PTR [esi+4*5]
 532   or  eax, DWORD PTR [esi+4*6]
 533   or  eax, DWORD PTR [esi+4*7]
 534   jne @loop525  { some AC term of this row is nonzero: take the full computation }
 535
 536       { AC terms all zero }
 537       {JSAMPLE(dcval_) := range_limit^[int(DESCALE(INT32(wsptr^[0]),
 538                           PASS1_BITS+3)) and RANGE_MASK];}
 539   mov eax, DWORD PTR [esi+4*0]
 540   add eax, (INT32(1) shl (PASS1_BITS+3-1))  { rounding term for the shift on the next line }
 541   sar eax, PASS1_BITS+3
 542   and eax, RANGE_MASK
 543         mov     ebx, range_limit
 544   mov al, BYTE PTR [ebx+eax]
 545         mov     ah, al  { duplicate the sample so that each stosw below writes it twice }
 546
 547       {outptr^[0] := dcval_;
 548       outptr^[1] := dcval_;
 549       outptr^[2] := dcval_;
 550       outptr^[3] := dcval_;
 551       outptr^[4] := dcval_;
 552       outptr^[5] := dcval_;
 553       outptr^[6] := dcval_;
 554       outptr^[7] := dcval_;}
 555   { Four word stores write 8 bytes starting at edi; NOTE(review): this and the byte moves further down rely on one byte per sample. }
 556   stosw
 557   stosw
 558   stosw
 559   stosw
 560
 561       {Inc(int_ptr(wsptr), DCTSIZE);  { advance pointer to next row }
 562       {continue;}
 563   add esi, wrkDCTSIZE
 564   inc ctr
 565   cmp ctr, DCTSIZE
 566   jl  @loop523
 567   jmp @loop524  { all rows written: leave pass 2 }
 568     {end;}
 569 @loop525:
 570 {$endif}
 571
 572
 573     { Even part: reverse the even part of the forward DCT. }
 574     { The rotator is sqrt(2)*c(-6). }
 575
 576     {z2 := INT32 (wsptr^[2]);}
 577   mov edx, DWORD PTR [esi+4*2]                   { z2 = edx }
 578
 579     {z3 := INT32 (wsptr^[6]);}
 580   mov ecx, DWORD PTR [esi+4*6]                   { z3 = ecx }
 581
 582     {z1 := (z2 + z3) * INT32(FIX_0_541196100);}
 583   lea   eax, [edx+ecx]
 584   imul  eax, FIX_0_541196100
 585   mov ebx, eax                                   { z1 = ebx }
 586
 587     {tmp2 := z1 + (z3) * INT32(- FIX_1_847759065);}
 588   imul  ecx, (-FIX_1_847759065)
 589   add ecx, ebx                                   { tmp2 = ecx }
 590
 591     {tmp3 := z1 + (z2) * INT32(FIX_0_765366865);}
 592   imul  edx, FIX_0_765366865
 593   add ebx, edx                                   { tmp3 = ebx }
 594
 595     {tmp0 := (INT32(wsptr^[0]) + INT32(wsptr^[4])) shl CONST_BITS;}
 596     {tmp1 := (INT32(wsptr^[0]) - INT32(wsptr^[4])) shl CONST_BITS;}
 597   mov edx, DWORD PTR [esi+4*4]
 598   mov   eax, DWORD PTR [esi+4*0]
 599   sub   eax, edx  { eax = wsptr^[0] - wsptr^[4] }
 600   add   edx, edx  { edx = 2 * wsptr^[4] }
 601   add   edx, eax  { edx = wsptr^[0] + wsptr^[4] }
 602   shl edx, CONST_BITS              { tmp0 = edx }
 603   shl eax, CONST_BITS              { tmp1 = eax }
 604
 605     {tmp10 := tmp0 + tmp3;}
 606     {tmp13 := tmp0 - tmp3;}
 607   sub   edx, ebx  { edx = tmp0 - tmp3 = tmp13 }
 608   mov tmp13, edx
 609   add   ebx, ebx  { ebx = 2*tmp3, so the next add yields tmp0 + tmp3 }
 610   add   edx, ebx
 611   mov tmp10, edx
 612
 613     {tmp11 := tmp1 + tmp2;}
 614     {tmp12 := tmp1 - tmp2;}
 615   lea   ebx, [ecx+eax]
 616   mov tmp11, ebx
 617   sub eax, ecx
 618   mov tmp12, eax
 619
 620     { Odd part per figure 8; the matrix is unitary and hence its
 621       transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively. }
 622
 623 { The following lines no longer produce code, since wsptr has been
 624   optimized to esi, it is more efficient to access these values
 625   directly.
 626     tmp0 := INT32(wsptr^[7]);
 627     tmp1 := INT32(wsptr^[5]);
 628     tmp2 := INT32(wsptr^[3]);
 629     tmp3 := INT32(wsptr^[1]); }
 630
 631     {z2 := tmp1 + tmp2;}
 632     {z2 := (z2) * INT32(- FIX_2_562915447); { sqrt(2) * (-c1-c3) }
 633   mov ebx, DWORD PTR [esi+4*3]              { tmp2 }
 634   mov   ecx, DWORD PTR [esi+4*5]              { tmp1 }
 635   lea   eax, [ebx+ecx]
 636   imul  eax, (-FIX_2_562915447)
 637   mov z2, eax
 638
 639     {z3 := tmp0 + tmp2;}
 640   mov edx, DWORD PTR [esi+4*7]              { tmp0 }
 641   add   ebx, edx                              { old z3 = ebx }
 642   mov eax, ebx
 643     {z3 := (z3) * INT32(- FIX_1_961570560); { sqrt(2) * (-c3-c5) }
 644   imul eax, (-FIX_1_961570560)
 645   mov z3, eax
 646
 647     {z1 := tmp0 + tmp3;}
 648     {z1 := (z1) * INT32(- FIX_0_899976223); { sqrt(2) * (c7-c3) }
 649   mov eax, DWORD PTR [esi+4*1]               { tmp3 }
 650   add edx, eax
 651   imul  edx, (-FIX_0_899976223)                { z1 = edx }
 652
 653     {z4 := tmp1 + tmp3;}
 654   add eax, ecx                              { +tmp1 }
 655   add ebx, eax                              { z3 + z4 = ebx }
 656     {z4 := (z4) * INT32(- FIX_0_390180644); { sqrt(2) * (c5-c3) }
 657   imul eax, (-FIX_0_390180644)                { z4 = eax }
 658
 659     {z5 := (z3 + z4) * INT32(FIX_1_175875602); { sqrt(2) * c3 }
 660     {Inc(z3, z5);}
 661   imul ebx, FIX_1_175875602  { ebx = z5 }
 662   mov  ecx, z3
 663   add  ecx, ebx                                { ecx = z3 }
 664
 665     {Inc(z4, z5);}
 666   add ebx, eax                                 { z4 = ebx }
 667
 668     {tmp0 := (tmp0) * INT32(FIX_0_298631336); { sqrt(2) * (-c1+c3+c5-c7) }
 669     {Inc(tmp0, z1 + z3);}
 670   mov   eax, DWORD PTR [esi+4*7]
 671   imul  eax, FIX_0_298631336
 672   add   eax, edx
 673   add   eax, ecx
 674   mov tmp0, eax
 675
 676     {tmp1 := (tmp1) * INT32(FIX_2_053119869); { sqrt(2) * ( c1+c3-c5+c7) }
 677     {Inc(tmp1, z2 + z4);}
 678   mov  eax, DWORD PTR [esi+4*5]
 679   imul eax, FIX_2_053119869
 680   add  eax, z2
 681   add  eax, ebx
 682   mov  tmp1, eax
 683
 684     {tmp2 := (tmp2) * INT32(FIX_3_072711026); { sqrt(2) * ( c1+c3+c5-c7) }
 685     {Inc(tmp2, z2 + z3);}
 686   mov eax, DWORD PTR [esi+4*3]
 687   imul  eax, FIX_3_072711026
 688   add   eax, z2
 689   add   ecx, eax                      { ecx = tmp2 }
 690
 691     {tmp3 := (tmp3) * INT32(FIX_1_501321110); { sqrt(2) * ( c1+c3-c5-c7) }
 692     {Inc(tmp3, z1 + z4);}
 693   mov eax, DWORD PTR [esi+4*1]
 694   imul  eax, FIX_1_501321110
 695   add   eax, edx
 696   add   ebx, eax                   { ebx = tmp3 }
 697
 698     { Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 }
 699
 700     {outptr^[0] := range_limit^[ int(DESCALE(tmp10 + tmp3,
 701                       CONST_BITS+PASS1_BITS+3)) and RANGE_MASK]; }
 702     {outptr^[7] := range_limit^[ int(DESCALE(tmp10 - tmp3,
 703                         CONST_BITS+PASS1_BITS+3)) and RANGE_MASK];}
 704   { NOTE(review): this stage shifts with shr where the DC-only path uses sar; the results agree only if RANGE_MASK fits in the bits left after the shift - confirm. }
 705   mov edx, tmp10
 706   add   edx, ROUND_CONST_2
 707   lea eax, [ebx+edx]
 708   sub   edx, ebx
 709
 710   shr eax, CONST_BITS+PASS1_BITS+3
 711   and eax, RANGE_MASK
 712   mov   ebx, range_limit           { once for all }
 713   mov al, BYTE PTR [ebx+eax]
 714   mov   [edi+0], al
 715
 716   shr edx, CONST_BITS+PASS1_BITS+3
 717   and edx, RANGE_MASK
 718   mov al, BYTE PTR [ebx+edx]
 719   mov   [edi+7], al
 720
 721     {outptr^[1] := range_limit^[ int(DESCALE(tmp11 + tmp2,
 722                         CONST_BITS+PASS1_BITS+3)) and RANGE_MASK];}
 723   mov eax, tmp11
 724   add   eax, ROUND_CONST_2
 725   lea edx, [eax+ecx]
 726   shr edx, CONST_BITS+PASS1_BITS+3
 727   and edx, RANGE_MASK
 728   mov dl, BYTE PTR [ebx+edx]
 729   mov   [edi+1], dl
 730
 731     {outptr^[6] := range_limit^[ int(DESCALE(tmp11 - tmp2,
 732       CONST_BITS+PASS1_BITS+3)) and RANGE_MASK];}
 733   sub eax, ecx
 734   shr eax, CONST_BITS+PASS1_BITS+3
 735   and eax, RANGE_MASK
 736   mov al, BYTE PTR [ebx+eax]
 737   mov   [edi+6], al
 738
 739     {outptr^[2] := range_limit^[ int(DESCALE(tmp12 + tmp1,
 740       CONST_BITS+PASS1_BITS+3)) and RANGE_MASK];}
 741   mov eax, tmp12
 742   add   eax, ROUND_CONST_2
 743   mov   ecx, tmp1
 744   lea edx, [eax+ecx]
 745   shr edx, CONST_BITS+PASS1_BITS+3
 746   and edx, RANGE_MASK
 747   mov dl, BYTE PTR [ebx+edx]
 748   mov   [edi+2], dl
 749
 750     {outptr^[5] := range_limit^[ int(DESCALE(tmp12 - tmp1,
 751       CONST_BITS+PASS1_BITS+3)) and RANGE_MASK];}
 752   sub eax, ecx
 753   shr eax, CONST_BITS+PASS1_BITS+3
 754   and eax, RANGE_MASK
 755   mov al, BYTE PTR [ebx+eax]
 756   mov   [edi+5], al
 757
 758     {outptr^[3] := range_limit^[ int(DESCALE(tmp13 + tmp0,
 759       CONST_BITS+PASS1_BITS+3)) and RANGE_MASK];}
 760   mov eax, tmp13
 761   add   eax, ROUND_CONST_2
 762   mov   ecx, tmp0
 763   lea   edx, [eax+ecx]
 764   shr edx, CONST_BITS+PASS1_BITS+3
 765   and edx, RANGE_MASK
 766   mov dl, BYTE PTR [ebx+edx]
 767   mov   [edi+3], dl
 768
 769     {outptr^[4] := range_limit^[ int(DESCALE(tmp13 - tmp0,
 770       CONST_BITS+PASS1_BITS+3)) and RANGE_MASK];}
 771   sub eax, ecx
 772   shr eax, CONST_BITS+PASS1_BITS+3
 773   and eax, RANGE_MASK
 774   mov al, BYTE PTR [ebx+eax]
 775   mov   [edi+4], al
 776
 777     {Inc(int_ptr(wsptr), DCTSIZE);  { advance pointer to next row }
 778   add esi, wrkDCTSIZE
 779   add edi, DCTSIZE  { NOTE(review): edi is reloaded from output_buf at @loop523 and popped at exit, so this increment is never observed }
 780
 781   {end;}
 782   inc ctr
 783   cmp ctr, DCTSIZE
 784   jl  @loop523
 785
 786 @loop524:
 787 @loop496:  { NOTE(review): no jump in this routine targets @loop496 }
 788   pop   ebx
 789   pop   esi
 790   pop   edi
 791 end;
 792
 793 end.