src/lib/vampimg/JpegLib/imjidctint.pas

   1 unit imjidctint;
   2 {$Q+}
   3
   4 { This file contains a slow-but-accurate integer implementation of the
   5   inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
   6   must also perform dequantization of the input coefficients.
   7
   8   A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
   9   on each row (or vice versa, but it's more convenient to emit a row at
  10   a time).  Direct algorithms are also available, but they are much more
  11   complex and seem not to be any faster when reduced to code.
  12
  13   This implementation is based on an algorithm described in
  14     C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
  15     Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
  16     Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
  17   The primary algorithm described there uses 11 multiplies and 29 adds.
  18   We use their alternate method with 12 multiplies and 32 adds.
  19   The advantage of this method is that no data path contains more than one
  20   multiplication; this allows a very simple and accurate implementation in
  21   scaled fixed-point arithmetic, with a minimal number of shifts. }
  22
  23 { Original : jidctint.c ;  Copyright (C) 1991-1998, Thomas G. Lane. }
  24
  25
  26 interface
  27
  28 {$I imjconfig.inc}
  29
  30 uses
  31   imjmorecfg,
  32   imjinclude,
  33   imjpeglib,
  34   imjdct;         { Private declarations for DCT subsystem }
  35
  36 { Perform dequantization and inverse DCT on one block of coefficients. }
  37
  38 {GLOBAL}
  39 procedure jpeg_idct_islow (cinfo : j_decompress_ptr;
  40                            compptr : jpeg_component_info_ptr;
  41                coef_block : JCOEFPTR;
  42                output_buf : JSAMPARRAY;
  43                            output_col : JDIMENSION);
  44
  45 implementation
  46
  47 { This module is specialized to the case DCTSIZE = 8. }
  48
  49 {$ifndef DCTSIZE_IS_8}
  50   Sorry, this code only copes with 8x8 DCTs. { deliberate syntax err }
  51 {$endif}
  52
  53 { The poop on this scaling stuff is as follows:
  54
  55   Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
  56   larger than the true IDCT outputs.  The final outputs are therefore
  57   a factor of N larger than desired; since N=8 this can be cured by
  58   a simple right shift at the end of the algorithm.  The advantage of
  59   this arrangement is that we save two multiplications per 1-D IDCT,
  60   because the y0 and y4 inputs need not be divided by sqrt(N).
  61
  62   We have to do addition and subtraction of the integer inputs, which
  63   is no problem, and multiplication by fractional constants, which is
  64   a problem to do in integer arithmetic.  We multiply all the constants
  65   by CONST_SCALE and convert them to integer constants (thus retaining
  66   CONST_BITS bits of precision in the constants).  After doing a
  67   multiplication we have to divide the product by CONST_SCALE, with proper
  68   rounding, to produce the correct output.  This division can be done
  69   cheaply as a right shift of CONST_BITS bits.  We postpone shifting
  70   as long as possible so that partial sums can be added together with
  71   full fractional precision.
  72
  73   The outputs of the first pass are scaled up by PASS1_BITS bits so that
  74   they are represented to better-than-integral precision.  These outputs
  75   require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
  76   with the recommended scaling.  (To scale up 12-bit sample data further, an
  77   intermediate INT32 array would be needed.)
  78
  79   To avoid overflow of the 32-bit intermediate results in pass 2, we must
  80   have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
  81   shows that the values given below are the most effective. }
  82
  83 {$ifdef BITS_IN_JSAMPLE_IS_8}
  84 const
  85   CONST_BITS = 13;
  86   PASS1_BITS = 2;
  87 {$else}
  88 const
  89   CONST_BITS = 13;
  90   PASS1_BITS = 1; { lose a little precision to avoid overflow }
  91 {$endif}
  92
  93 const
  94   CONST_SCALE = (INT32(1) shl CONST_BITS);
  95
  96 const
  97   FIX_0_298631336 = INT32(Round(CONST_SCALE * 0.298631336));  {2446}
  98   FIX_0_390180644 = INT32(Round(CONST_SCALE * 0.390180644));  {3196}
  99   FIX_0_541196100 = INT32(Round(CONST_SCALE * 0.541196100));  {4433}
 100   FIX_0_765366865 = INT32(Round(CONST_SCALE * 0.765366865));  {6270}
 101   FIX_0_899976223 = INT32(Round(CONST_SCALE * 0.899976223));  {7373}
 102   FIX_1_175875602 = INT32(Round(CONST_SCALE * 1.175875602));  {9633}
 103   FIX_1_501321110 = INT32(Round(CONST_SCALE * 1.501321110));  {12299}
 104   FIX_1_847759065 = INT32(Round(CONST_SCALE * 1.847759065));  {15137}
 105   FIX_1_961570560 = INT32(Round(CONST_SCALE * 1.961570560));  {16069}
 106   FIX_2_053119869 = INT32(Round(CONST_SCALE * 2.053119869));  {16819}
 107   FIX_2_562915447 = INT32(Round(CONST_SCALE * 2.562915447));  {20995}
 108   FIX_3_072711026 = INT32(Round(CONST_SCALE * 3.072711026));  {25172}
 109
 110
 111
 112 { Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
 113   For 8-bit samples with the recommended scaling, all the variable
 114   and constant values involved are no more than 16 bits wide, so a
 115   16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 116   For 12-bit samples, a full 32-bit multiplication will be needed. }
 117
 118 {$ifdef BITS_IN_JSAMPLE_IS_8}
 119
 120    {$IFDEF BASM16}
 121      {$IFNDEF WIN32}
 122    {MULTIPLY16C16(var,const)}
 123    function Multiply(X, Y: Integer): integer; assembler;
 124    asm
 125      mov ax, X
 126      imul Y
 127      mov al, ah
 128      mov ah, dl
 129    end;
 130      {$ENDIF}
 131    {$ENDIF}
 132
 133    function Multiply(X, Y: INT32): INT32;
 134    begin
 135      Multiply := INT32(X) * INT32(Y);
 136    end;
 137
 138
 139 {$else}
 140   {#define MULTIPLY(var,const)  ((var) * (const))}
 141    function Multiply(X, Y: INT32): INT32;
 142    begin
 143      Multiply := INT32(X) * INT32(Y);
 144    end;
 145 {$endif}
 146
 147
 148 { Dequantize a coefficient by multiplying it by the multiplier-table
 149   entry; produce an int result.  In this module, both inputs and result
 150   are 16 bits or less, so either int or short multiply will work. }
 151
 152 function DEQUANTIZE(coef,quantval : int) : int;
 153 begin
 154   Dequantize := ( ISLOW_MULT_TYPE(coef) * quantval);
 155 end;
 156
 157 { Descale and correctly round an INT32 value that's scaled by N bits.
 158   We assume RIGHT_SHIFT rounds towards minus infinity, so adding
 159   the fudge factor is correct for either sign of X. }
 160
 161 function DESCALE(x : INT32; n : int) : INT32;
 162 var
 163   shift_temp : INT32;
 164 begin
 165 {$ifdef RIGHT_SHIFT_IS_UNSIGNED}
 166   shift_temp := x + (INT32(1) shl (n-1));
 167   if shift_temp < 0 then
 168     Descale :=  (shift_temp shr n) or ((not INT32(0)) shl (32-n))
 169   else
 170     Descale :=  (shift_temp shr n);
 171 {$else}
 172   Descale := (x + (INT32(1) shl (n-1)) shr n;
 173 {$endif}
 174 end;
 175
 176 { Perform dequantization and inverse DCT on one block of coefficients. }
 177
 178 {GLOBAL}
 179 procedure jpeg_idct_islow (cinfo : j_decompress_ptr;
 180                            compptr : jpeg_component_info_ptr;
 181                coef_block : JCOEFPTR;
 182                output_buf : JSAMPARRAY;
 183                            output_col : JDIMENSION);
 184 type
 185   PWorkspace = ^TWorkspace;
 186   TWorkspace = coef_bits_field; { buffers data between passes }
 187 var
 188   tmp0, tmp1, tmp2, tmp3 : INT32;
 189   tmp10, tmp11, tmp12, tmp13 : INT32;
 190   z1, z2, z3, z4, z5 : INT32;
 191   inptr : JCOEFPTR;
 192   quantptr : ISLOW_MULT_TYPE_FIELD_PTR;
 193   wsptr : PWorkspace;
 194   outptr : JSAMPROW;
 195   range_limit : JSAMPROW;
 196   ctr : int;
 197   workspace : TWorkspace;
 198   {SHIFT_TEMPS}
 199 var
 200   dcval : int;
 201 var
 202   dcval_ : JSAMPLE;
 203 begin
 204 { Each IDCT routine is responsible for range-limiting its results and
 205   converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
 206   be quite far out of range if the input data is corrupt, so a bulletproof
 207   range-limiting step is required.  We use a mask-and-table-lookup method
 208   to do the combined operations quickly.  See the comments with
 209   prepare_range_limit_table (in jdmaster.c) for more info. }
 210
 211   range_limit := JSAMPROW(@(cinfo^.sample_range_limit^[CENTERJSAMPLE]));
 212
 213
 214   { Pass 1: process columns from input, store into work array. }
 215   { Note results are scaled up by sqrt(8) compared to a true IDCT; }
 216   { furthermore, we scale the results by 2**PASS1_BITS. }
 217
 218   inptr := coef_block;
 219   quantptr := ISLOW_MULT_TYPE_FIELD_PTR (compptr^.dct_table);
 220   wsptr := PWorkspace(@workspace);
 221   for ctr := pred(DCTSIZE) downto 0 do
 222   begin
 223     { Due to quantization, we will usually find that many of the input
 224       coefficients are zero, especially the AC terms.  We can exploit this
 225       by short-circuiting the IDCT calculation for any column in which all
 226       the AC terms are zero.  In that case each output is equal to the
 227       DC coefficient (with scale factor as needed).
 228       With typical images and quantization tables, half or more of the
 229       column DCT calculations can be simplified this way. }
 230
 231     if ((inptr^[DCTSIZE*1]=0) and (inptr^[DCTSIZE*2]=0) and
 232         (inptr^[DCTSIZE*3]=0) and (inptr^[DCTSIZE*4]=0) and
 233         (inptr^[DCTSIZE*5]=0) and (inptr^[DCTSIZE*6]=0) and
 234   (inptr^[DCTSIZE*7]=0)) then
 235     begin
 236       { AC terms all zero }
 237       dcval := DEQUANTIZE(inptr^[DCTSIZE*0], quantptr^[DCTSIZE*0]) shl PASS1_BITS;
 238
 239       wsptr^[DCTSIZE*0] := dcval;
 240       wsptr^[DCTSIZE*1] := dcval;
 241       wsptr^[DCTSIZE*2] := dcval;
 242       wsptr^[DCTSIZE*3] := dcval;
 243       wsptr^[DCTSIZE*4] := dcval;
 244       wsptr^[DCTSIZE*5] := dcval;
 245       wsptr^[DCTSIZE*6] := dcval;
 246       wsptr^[DCTSIZE*7] := dcval;
 247
 248       Inc(JCOEF_PTR(inptr));    { advance pointers to next column }
 249       Inc(ISLOW_MULT_TYPE_PTR(quantptr));
 250       Inc(int_ptr(wsptr));
 251       continue;
 252     end;
 253
 254     { Even part: reverse the even part of the forward DCT. }
 255     { The rotator is sqrt(2)*c(-6). }
 256
 257     z2 := DEQUANTIZE(inptr^[DCTSIZE*2], quantptr^[DCTSIZE*2]);
 258     z3 := DEQUANTIZE(inptr^[DCTSIZE*6], quantptr^[DCTSIZE*6]);
 259
 260     z1 := MULTIPLY(z2 + z3, FIX_0_541196100);
 261     tmp2 := z1 + MULTIPLY(z3, - FIX_1_847759065);
 262     tmp3 := z1 + MULTIPLY(z2, FIX_0_765366865);
 263
 264     z2 := DEQUANTIZE(inptr^[DCTSIZE*0], quantptr^[DCTSIZE*0]);
 265     z3 := DEQUANTIZE(inptr^[DCTSIZE*4], quantptr^[DCTSIZE*4]);
 266
 267     tmp0 := (z2 + z3) shl CONST_BITS;
 268     tmp1 := (z2 - z3) shl CONST_BITS;
 269
 270     tmp10 := tmp0 + tmp3;
 271     tmp13 := tmp0 - tmp3;
 272     tmp11 := tmp1 + tmp2;
 273     tmp12 := tmp1 - tmp2;
 274
 275     { Odd part per figure 8; the matrix is unitary and hence its
 276       transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively. }
 277
 278     tmp0 := DEQUANTIZE(inptr^[DCTSIZE*7], quantptr^[DCTSIZE*7]);
 279     tmp1 := DEQUANTIZE(inptr^[DCTSIZE*5], quantptr^[DCTSIZE*5]);
 280     tmp2 := DEQUANTIZE(inptr^[DCTSIZE*3], quantptr^[DCTSIZE*3]);
 281     tmp3 := DEQUANTIZE(inptr^[DCTSIZE*1], quantptr^[DCTSIZE*1]);
 282
 283     z1 := tmp0 + tmp3;
 284     z2 := tmp1 + tmp2;
 285     z3 := tmp0 + tmp2;
 286     z4 := tmp1 + tmp3;
 287     z5 := MULTIPLY(z3 + z4, FIX_1_175875602); { sqrt(2) * c3 }
 288
 289     tmp0 := MULTIPLY(tmp0, FIX_0_298631336); { sqrt(2) * (-c1+c3+c5-c7) }
 290     tmp1 := MULTIPLY(tmp1, FIX_2_053119869); { sqrt(2) * ( c1+c3-c5+c7) }
 291     tmp2 := MULTIPLY(tmp2, FIX_3_072711026); { sqrt(2) * ( c1+c3+c5-c7) }
 292     tmp3 := MULTIPLY(tmp3, FIX_1_501321110); { sqrt(2) * ( c1+c3-c5-c7) }
 293     z1 := MULTIPLY(z1, - FIX_0_899976223); { sqrt(2) * (c7-c3) }
 294     z2 := MULTIPLY(z2, - FIX_2_562915447); { sqrt(2) * (-c1-c3) }
 295     z3 := MULTIPLY(z3, - FIX_1_961570560); { sqrt(2) * (-c3-c5) }
 296     z4 := MULTIPLY(z4, - FIX_0_390180644); { sqrt(2) * (c5-c3) }
 297
 298     Inc(z3, z5);
 299     Inc(z4, z5);
 300
 301     Inc(tmp0, z1 + z3);
 302     Inc(tmp1, z2 + z4);
 303     Inc(tmp2, z2 + z3);
 304     Inc(tmp3, z1 + z4);
 305
 306     { Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 }
 307
 308     wsptr^[DCTSIZE*0] := int (DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS));
 309     wsptr^[DCTSIZE*7] := int (DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS));
 310     wsptr^[DCTSIZE*1] := int (DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS));
 311     wsptr^[DCTSIZE*6] := int (DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS));
 312     wsptr^[DCTSIZE*2] := int (DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS));
 313     wsptr^[DCTSIZE*5] := int (DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS));
 314     wsptr^[DCTSIZE*3] := int (DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS));
 315     wsptr^[DCTSIZE*4] := int (DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS));
 316
 317     Inc(JCOEF_PTR(inptr));    { advance pointers to next column }
 318     Inc(ISLOW_MULT_TYPE_PTR(quantptr));
 319     Inc(int_ptr(wsptr));
 320   end;
 321
 322   { Pass 2: process rows from work array, store into output array. }
 323   { Note that we must descale the results by a factor of 8 == 2**3, }
 324   { and also undo the PASS1_BITS scaling. }
 325
 326   wsptr := @workspace;
 327   for ctr := 0 to pred(DCTSIZE) do
 328   begin
 329     outptr := output_buf^[ctr];
 330     Inc(JSAMPLE_PTR(outptr), output_col);
 331     { Rows of zeroes can be exploited in the same way as we did with columns.
 332       However, the column calculation has created many nonzero AC terms, so
 333       the simplification applies less often (typically 5% to 10% of the time).
 334       On machines with very fast multiplication, it's possible that the
 335       test takes more time than it's worth.  In that case this section
 336       may be commented out. }
 337
 338 {$ifndef NO_ZERO_ROW_TEST}
 339     if ((wsptr^[1]=0) and (wsptr^[2]=0) and (wsptr^[3]=0) and (wsptr^[4]=0)
 340        and (wsptr^[5]=0) and (wsptr^[6]=0) and (wsptr^[7]=0)) then
 341     begin
 342       { AC terms all zero }
 343       JSAMPLE(dcval_) := range_limit^[int(DESCALE(INT32(wsptr^[0]),
 344                           PASS1_BITS+3)) and RANGE_MASK];
 345
 346       outptr^[0] := dcval_;
 347       outptr^[1] := dcval_;
 348       outptr^[2] := dcval_;
 349       outptr^[3] := dcval_;
 350       outptr^[4] := dcval_;
 351       outptr^[5] := dcval_;
 352       outptr^[6] := dcval_;
 353       outptr^[7] := dcval_;
 354
 355       Inc(int_ptr(wsptr), DCTSIZE); { advance pointer to next row }
 356       continue;
 357     end;
 358 {$endif}
 359
 360     { Even part: reverse the even part of the forward DCT. }
 361     { The rotator is sqrt(2)*c(-6). }
 362
 363     z2 := INT32 (wsptr^[2]);
 364     z3 := INT32 (wsptr^[6]);
 365
 366     z1 := MULTIPLY(z2 + z3, FIX_0_541196100);
 367     tmp2 := z1 + MULTIPLY(z3, - FIX_1_847759065);
 368     tmp3 := z1 + MULTIPLY(z2, FIX_0_765366865);
 369
 370     tmp0 := (INT32(wsptr^[0]) + INT32(wsptr^[4])) shl CONST_BITS;
 371     tmp1 := (INT32(wsptr^[0]) - INT32(wsptr^[4])) shl CONST_BITS;
 372
 373     tmp10 := tmp0 + tmp3;
 374     tmp13 := tmp0 - tmp3;
 375     tmp11 := tmp1 + tmp2;
 376     tmp12 := tmp1 - tmp2;
 377
 378     { Odd part per figure 8; the matrix is unitary and hence its
 379       transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively. }
 380
 381     tmp0 := INT32(wsptr^[7]);
 382     tmp1 := INT32(wsptr^[5]);
 383     tmp2 := INT32(wsptr^[3]);
 384     tmp3 := INT32(wsptr^[1]);
 385
 386     z1 := tmp0 + tmp3;
 387     z2 := tmp1 + tmp2;
 388     z3 := tmp0 + tmp2;
 389     z4 := tmp1 + tmp3;
 390     z5 := MULTIPLY(z3 + z4, FIX_1_175875602); { sqrt(2) * c3 }
 391
 392     tmp0 := MULTIPLY(tmp0, FIX_0_298631336); { sqrt(2) * (-c1+c3+c5-c7) }
 393     tmp1 := MULTIPLY(tmp1, FIX_2_053119869); { sqrt(2) * ( c1+c3-c5+c7) }
 394     tmp2 := MULTIPLY(tmp2, FIX_3_072711026); { sqrt(2) * ( c1+c3+c5-c7) }
 395     tmp3 := MULTIPLY(tmp3, FIX_1_501321110); { sqrt(2) * ( c1+c3-c5-c7) }
 396     z1 := MULTIPLY(z1, - FIX_0_899976223); { sqrt(2) * (c7-c3) }
 397     z2 := MULTIPLY(z2, - FIX_2_562915447); { sqrt(2) * (-c1-c3) }
 398     z3 := MULTIPLY(z3, - FIX_1_961570560); { sqrt(2) * (-c3-c5) }
 399     z4 := MULTIPLY(z4, - FIX_0_390180644); { sqrt(2) * (c5-c3) }
 400
 401     Inc(z3, z5);
 402     Inc(z4, z5);
 403
 404     Inc(tmp0, z1 + z3);
 405     Inc(tmp1, z2 + z4);
 406     Inc(tmp2, z2 + z3);
 407     Inc(tmp3, z1 + z4);
 408
 409     { Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 }
 410
 411     outptr^[0] := range_limit^[ int(DESCALE(tmp10 + tmp3,
 412             CONST_BITS+PASS1_BITS+3))
 413           and RANGE_MASK];
 414     outptr^[7] := range_limit^[ int(DESCALE(tmp10 - tmp3,
 415             CONST_BITS+PASS1_BITS+3))
 416           and RANGE_MASK];
 417     outptr^[1] := range_limit^[ int(DESCALE(tmp11 + tmp2,
 418             CONST_BITS+PASS1_BITS+3))
 419           and RANGE_MASK];
 420     outptr^[6] := range_limit^[ int(DESCALE(tmp11 - tmp2,
 421                                           CONST_BITS+PASS1_BITS+3))
 422                             and RANGE_MASK];
 423     outptr^[2] := range_limit^[ int(DESCALE(tmp12 + tmp1,
 424                                           CONST_BITS+PASS1_BITS+3))
 425                             and RANGE_MASK];
 426     outptr^[5] := range_limit^[ int(DESCALE(tmp12 - tmp1,
 427                                           CONST_BITS+PASS1_BITS+3))
 428                             and RANGE_MASK];
 429     outptr^[3] := range_limit^[ int(DESCALE(tmp13 + tmp0,
 430                                           CONST_BITS+PASS1_BITS+3))
 431                             and RANGE_MASK];
 432     outptr^[4] := range_limit^[ int(DESCALE(tmp13 - tmp0,
 433                                           CONST_BITS+PASS1_BITS+3))
 434                             and RANGE_MASK];
 435
 436     Inc(int_ptr(wsptr), DCTSIZE); { advance pointer to next row }
 437   end;
 438 end;
 439
 440 end.