src/lib/vampimg/JpegLib/imjidctflt.pas

   1 unit imjidctflt;
   2
   3 {$N+}
   4 { This file contains a floating-point implementation of the
   5   inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
   6   must also perform dequantization of the input coefficients.
   7
   8   This implementation should be more accurate than either of the integer
   9   IDCT implementations.  However, it may not give the same results on all
  10   machines because of differences in roundoff behavior.  Speed will depend
  11   on the hardware's floating point capacity.
  12
  13   A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
  14   on each row (or vice versa, but it's more convenient to emit a row at
  15   a time).  Direct algorithms are also available, but they are much more
  16   complex and seem not to be any faster when reduced to code.
  17
  18   This implementation is based on Arai, Agui, and Nakajima's algorithm for
  19   scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
  20   Japanese, but the algorithm is described in the Pennebaker & Mitchell
  21   JPEG textbook (see REFERENCES section in file README).  The following code
  22   is based directly on figure 4-8 in P&M.
  23   While an 8-point DCT cannot be done in less than 11 multiplies, it is
  24   possible to arrange the computation so that many of the multiplies are
  25   simple scalings of the final outputs.  These multiplies can then be
  26   folded into the multiplications or divisions by the JPEG quantization
  27   table entries.  The AA&N method leaves only 5 multiplies and 29 adds
  28   to be done in the DCT itself.
  29   The primary disadvantage of this method is that with a fixed-point
  30   implementation, accuracy is lost due to imprecise representation of the
  31   scaled quantization values.  However, that problem does not arise if
  32   we use floating point arithmetic. }
  33
  34 { Original: jidctflt.c ; Copyright (C) 1994-1996, Thomas G. Lane. }
  35
  36 interface
  37
  38 {$I imjconfig.inc}
  39
  40 uses
  41   imjmorecfg,
  42   imjinclude,
  43   imjpeglib,
  44   imjdct;           { Private declarations for DCT subsystem }
  45
  46 { Perform dequantization and inverse DCT on one block of coefficients. }
  47
  48 {GLOBAL}
  49 procedure jpeg_idct_float (cinfo : j_decompress_ptr;
  50                            compptr : jpeg_component_info_ptr;
  51                coef_block : JCOEFPTR;
  52                output_buf : JSAMPARRAY;
  53                            output_col : JDIMENSION);
  54
  55 implementation
  56
  57 { This module is specialized to the case DCTSIZE = 8. }
  58
  59 {$ifndef DCTSIZE_IS_8}
  60   Sorry, this code only copes with 8x8 DCTs. { deliberate syntax err }
  61 {$endif}
  62
  63
  64 { Dequantize a coefficient by multiplying it by the multiplier-table
  65   entry; produce a float result. }
  66
  67 function DEQUANTIZE(coef : int; quantval : FAST_FLOAT) : FAST_FLOAT;
  68 begin
  69   Dequantize := ( (coef) * quantval);
  70 end;
  71
  72 { Descale and correctly round an INT32 value that's scaled by N bits.
  73   We assume RIGHT_SHIFT rounds towards minus infinity, so adding
  74   the fudge factor is correct for either sign of X. }
  75
  76 function DESCALE(x : INT32; n : int) : INT32;
  77 var
  78   shift_temp : INT32;
  79 begin
  80 {$ifdef RIGHT_SHIFT_IS_UNSIGNED}
  81   shift_temp := x + (INT32(1) shl (n-1));
  82   if shift_temp < 0 then
  83     Descale :=  (shift_temp shr n) or ((not INT32(0)) shl (32-n))
  84   else
  85     Descale :=  (shift_temp shr n);
  86 {$else}
  87   Descale := (x + (INT32(1) shl (n-1)) shr n;
  88 {$endif}
  89 end;
  90
  91
  92 { Perform dequantization and inverse DCT on one block of coefficients. }
  93
  94 {GLOBAL}
  95 procedure jpeg_idct_float (cinfo : j_decompress_ptr;
  96                            compptr : jpeg_component_info_ptr;
  97                coef_block : JCOEFPTR;
  98                output_buf : JSAMPARRAY;
  99                            output_col : JDIMENSION);
 100 type
 101   PWorkspace = ^TWorkspace;
 102   TWorkspace = array[0..DCTSIZE2-1] of FAST_FLOAT;
 103 var
 104   tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 : FAST_FLOAT;
 105   tmp10, tmp11, tmp12, tmp13 : FAST_FLOAT;
 106   z5, z10, z11, z12, z13 : FAST_FLOAT;
 107   inptr : JCOEFPTR;
 108   quantptr : FLOAT_MULT_TYPE_FIELD_PTR;
 109   wsptr : PWorkSpace;
 110   outptr : JSAMPROW;
 111   range_limit : JSAMPROW;
 112   ctr : int;
 113   workspace : TWorkspace; { buffers data between passes }
 114   {SHIFT_TEMPS}
 115 var
 116   dcval : FAST_FLOAT;
 117 begin
 118 { Each IDCT routine is responsible for range-limiting its results and
 119   converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
 120   be quite far out of range if the input data is corrupt, so a bulletproof
 121   range-limiting step is required.  We use a mask-and-table-lookup method
 122   to do the combined operations quickly.  See the comments with
 123   prepare_range_limit_table (in jdmaster.c) for more info. }
 124
 125   range_limit := JSAMPROW(@(cinfo^.sample_range_limit^[CENTERJSAMPLE]));
 126
 127   { Pass 1: process columns from input, store into work array. }
 128
 129   inptr := coef_block;
 130   quantptr := FLOAT_MULT_TYPE_FIELD_PTR (compptr^.dct_table);
 131   wsptr := @workspace;
 132   for ctr := pred(DCTSIZE) downto 0 do
 133   begin
 134     { Due to quantization, we will usually find that many of the input
 135       coefficients are zero, especially the AC terms.  We can exploit this
 136       by short-circuiting the IDCT calculation for any column in which all
 137       the AC terms are zero.  In that case each output is equal to the
 138       DC coefficient (with scale factor as needed).
 139       With typical images and quantization tables, half or more of the
 140       column DCT calculations can be simplified this way. }
 141
 142     if (inptr^[DCTSIZE*1]=0) and (inptr^[DCTSIZE*2]=0) and
 143        (inptr^[DCTSIZE*3]=0) and (inptr^[DCTSIZE*4]=0) and
 144        (inptr^[DCTSIZE*5]=0) and (inptr^[DCTSIZE*6]=0) and
 145        (inptr^[DCTSIZE*7]=0) then
 146     begin
 147       { AC terms all zero }
 148       FAST_FLOAT(dcval) := DEQUANTIZE(inptr^[DCTSIZE*0], quantptr^[DCTSIZE*0]);
 149
 150       wsptr^[DCTSIZE*0] := dcval;
 151       wsptr^[DCTSIZE*1] := dcval;
 152       wsptr^[DCTSIZE*2] := dcval;
 153       wsptr^[DCTSIZE*3] := dcval;
 154       wsptr^[DCTSIZE*4] := dcval;
 155       wsptr^[DCTSIZE*5] := dcval;
 156       wsptr^[DCTSIZE*6] := dcval;
 157       wsptr^[DCTSIZE*7] := dcval;
 158
 159       Inc(JCOEF_PTR(inptr));    { advance pointers to next column }
 160       Inc(FLOAT_MULT_TYPE_PTR(quantptr));
 161       Inc(FAST_FLOAT_PTR(wsptr));
 162       continue;
 163     end;
 164
 165     { Even part }
 166
 167     tmp0 := DEQUANTIZE(inptr^[DCTSIZE*0], quantptr^[DCTSIZE*0]);
 168     tmp1 := DEQUANTIZE(inptr^[DCTSIZE*2], quantptr^[DCTSIZE*2]);
 169     tmp2 := DEQUANTIZE(inptr^[DCTSIZE*4], quantptr^[DCTSIZE*4]);
 170     tmp3 := DEQUANTIZE(inptr^[DCTSIZE*6], quantptr^[DCTSIZE*6]);
 171
 172     tmp10 := tmp0 + tmp2; { phase 3 }
 173     tmp11 := tmp0 - tmp2;
 174
 175     tmp13 := tmp1 + tmp3; { phases 5-3 }
 176     tmp12 := (tmp1 - tmp3) * ({FAST_FLOAT}(1.414213562)) - tmp13; { 2*c4 }
 177
 178     tmp0 := tmp10 + tmp13;  { phase 2 }
 179     tmp3 := tmp10 - tmp13;
 180     tmp1 := tmp11 + tmp12;
 181     tmp2 := tmp11 - tmp12;
 182
 183     { Odd part }
 184
 185     tmp4 := DEQUANTIZE(inptr^[DCTSIZE*1], quantptr^[DCTSIZE*1]);
 186     tmp5 := DEQUANTIZE(inptr^[DCTSIZE*3], quantptr^[DCTSIZE*3]);
 187     tmp6 := DEQUANTIZE(inptr^[DCTSIZE*5], quantptr^[DCTSIZE*5]);
 188     tmp7 := DEQUANTIZE(inptr^[DCTSIZE*7], quantptr^[DCTSIZE*7]);
 189
 190     z13 := tmp6 + tmp5;   { phase 6 }
 191     z10 := tmp6 - tmp5;
 192     z11 := tmp4 + tmp7;
 193     z12 := tmp4 - tmp7;
 194
 195     tmp7 := z11 + z13;    { phase 5 }
 196     tmp11 := (z11 - z13) * ({FAST_FLOAT}(1.414213562)); { 2*c4 }
 197
 198     z5 := (z10 + z12) * ({FAST_FLOAT}(1.847759065)); { 2*c2 }
 199     tmp10 := ({FAST_FLOAT}(1.082392200)) * z12 - z5; { 2*(c2-c6) }
 200     tmp12 := ({FAST_FLOAT}(-2.613125930)) * z10 + z5; { -2*(c2+c6) }
 201
 202     tmp6 := tmp12 - tmp7; { phase 2 }
 203     tmp5 := tmp11 - tmp6;
 204     tmp4 := tmp10 + tmp5;
 205
 206     wsptr^[DCTSIZE*0] := tmp0 + tmp7;
 207     wsptr^[DCTSIZE*7] := tmp0 - tmp7;
 208     wsptr^[DCTSIZE*1] := tmp1 + tmp6;
 209     wsptr^[DCTSIZE*6] := tmp1 - tmp6;
 210     wsptr^[DCTSIZE*2] := tmp2 + tmp5;
 211     wsptr^[DCTSIZE*5] := tmp2 - tmp5;
 212     wsptr^[DCTSIZE*4] := tmp3 + tmp4;
 213     wsptr^[DCTSIZE*3] := tmp3 - tmp4;
 214
 215     Inc(JCOEF_PTR(inptr));    { advance pointers to next column }
 216     Inc(FLOAT_MULT_TYPE_PTR(quantptr));
 217     Inc(FAST_FLOAT_PTR(wsptr));
 218   end;
 219
 220   { Pass 2: process rows from work array, store into output array. }
 221   { Note that we must descale the results by a factor of 8 = 2**3. }
 222
 223   wsptr := @workspace;
 224   for ctr := 0 to pred(DCTSIZE) do
 225   begin
 226     outptr := JSAMPROW(@(output_buf^[ctr]^[output_col]));
 227     { Rows of zeroes can be exploited in the same way as we did with columns.
 228       However, the column calculation has created many nonzero AC terms, so
 229       the simplification applies less often (typically 5% to 10% of the time).
 230       And testing floats for zero is relatively expensive, so we don't bother. }
 231
 232     { Even part }
 233
 234     tmp10 := wsptr^[0] + wsptr^[4];
 235     tmp11 := wsptr^[0] - wsptr^[4];
 236
 237     tmp13 := wsptr^[2] + wsptr^[6];
 238     tmp12 := (wsptr^[2] - wsptr^[6]) * ({FAST_FLOAT}(1.414213562)) - tmp13;
 239
 240     tmp0 := tmp10 + tmp13;
 241     tmp3 := tmp10 - tmp13;
 242     tmp1 := tmp11 + tmp12;
 243     tmp2 := tmp11 - tmp12;
 244
 245     { Odd part }
 246
 247     z13 := wsptr^[5] + wsptr^[3];
 248     z10 := wsptr^[5] - wsptr^[3];
 249     z11 := wsptr^[1] + wsptr^[7];
 250     z12 := wsptr^[1] - wsptr^[7];
 251
 252     tmp7 := z11 + z13;
 253     tmp11 := (z11 - z13) * ({FAST_FLOAT}(1.414213562));
 254
 255     z5 := (z10 + z12) * ({FAST_FLOAT}(1.847759065)); { 2*c2 }
 256     tmp10 := ({FAST_FLOAT}(1.082392200)) * z12 - z5; { 2*(c2-c6) }
 257     tmp12 := ({FAST_FLOAT}(-2.613125930)) * z10 + z5; { -2*(c2+c6) }
 258
 259     tmp6 := tmp12 - tmp7;
 260     tmp5 := tmp11 - tmp6;
 261     tmp4 := tmp10 + tmp5;
 262
 263     { Final output stage: scale down by a factor of 8 and range-limit }
 264
 265     outptr^[0] := range_limit^[ int(DESCALE( INT32(Round((tmp0 + tmp7))), 3))
 266           and RANGE_MASK];
 267     outptr^[7] := range_limit^[ int(DESCALE( INT32(Round((tmp0 - tmp7))), 3))
 268           and RANGE_MASK];
 269     outptr^[1] := range_limit^[ int(DESCALE( INT32(Round((tmp1 + tmp6))), 3))
 270           and RANGE_MASK];
 271     outptr^[6] := range_limit^[ int(DESCALE( INT32(Round((tmp1 - tmp6))), 3))
 272           and RANGE_MASK];
 273     outptr^[2] := range_limit^[ int(DESCALE( INT32(Round((tmp2 + tmp5))), 3))
 274           and RANGE_MASK];
 275     outptr^[5] := range_limit^[ int(DESCALE( INT32(Round((tmp2 - tmp5))), 3))
 276           and RANGE_MASK];
 277     outptr^[4] := range_limit^[ int(DESCALE( INT32(Round((tmp3 + tmp4))), 3))
 278           and RANGE_MASK];
 279     outptr^[3] := range_limit^[ int(DESCALE( INT32(Round((tmp3 - tmp4))), 3))
 280           and RANGE_MASK];
 281
 282     Inc(FAST_FLOAT_PTR(wsptr), DCTSIZE);  { advance pointer to next row }
 283   end;
 284 end;
 285
 286 end.