DEADSOFTWARE

hopefully no more windows
[d2df-editor.git] / src / lib / vampimg / JpegLib / imjidctint.pas
1 unit imjidctint;
2 {$Q+}
4 { This file contains a slow-but-accurate integer implementation of the
5 inverse DCT (Discrete Cosine Transform). In the IJG code, this routine
6 must also perform dequantization of the input coefficients.
8 A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
9 on each row (or vice versa, but it's more convenient to emit a row at
10 a time). Direct algorithms are also available, but they are much more
11 complex and seem not to be any faster when reduced to code.
13 This implementation is based on an algorithm described in
14 C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
15 Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
16 Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
17 The primary algorithm described there uses 11 multiplies and 29 adds.
18 We use their alternate method with 12 multiplies and 32 adds.
19 The advantage of this method is that no data path contains more than one
20 multiplication; this allows a very simple and accurate implementation in
21 scaled fixed-point arithmetic, with a minimal number of shifts. }
23 { Original : jidctint.c ; Copyright (C) 1991-1998, Thomas G. Lane. }
26 interface
28 {$I imjconfig.inc}
30 uses
31 imjmorecfg,
32 imjinclude,
33 imjpeglib,
34 imjdct; { Private declarations for DCT subsystem }
36 { Perform dequantization and inverse DCT on one block of coefficients. }
38 {GLOBAL}
39 procedure jpeg_idct_islow (cinfo : j_decompress_ptr;
40 compptr : jpeg_component_info_ptr;
41 coef_block : JCOEFPTR;
42 output_buf : JSAMPARRAY;
43 output_col : JDIMENSION);
45 implementation
47 { This module is specialized to the case DCTSIZE = 8. }
49 {$ifndef DCTSIZE_IS_8}
50 Sorry, this code only copes with 8x8 DCTs. { deliberate syntax err }
51 {$endif}
53 { The poop on this scaling stuff is as follows:
55 Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
56 larger than the true IDCT outputs. The final outputs are therefore
57 a factor of N larger than desired; since N=8 this can be cured by
58 a simple right shift at the end of the algorithm. The advantage of
59 this arrangement is that we save two multiplications per 1-D IDCT,
60 because the y0 and y4 inputs need not be divided by sqrt(N).
62 We have to do addition and subtraction of the integer inputs, which
63 is no problem, and multiplication by fractional constants, which is
64 a problem to do in integer arithmetic. We multiply all the constants
65 by CONST_SCALE and convert them to integer constants (thus retaining
66 CONST_BITS bits of precision in the constants). After doing a
67 multiplication we have to divide the product by CONST_SCALE, with proper
68 rounding, to produce the correct output. This division can be done
69 cheaply as a right shift of CONST_BITS bits. We postpone shifting
70 as long as possible so that partial sums can be added together with
71 full fractional precision.
73 The outputs of the first pass are scaled up by PASS1_BITS bits so that
74 they are represented to better-than-integral precision. These outputs
75 require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
76 with the recommended scaling. (To scale up 12-bit sample data further, an
77 intermediate INT32 array would be needed.)
79 To avoid overflow of the 32-bit intermediate results in pass 2, we must
80 have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
81 shows that the values given below are the most effective. }
83 {$ifdef BITS_IN_JSAMPLE_IS_8}
84 const
85 CONST_BITS = 13;
86 PASS1_BITS = 2;
87 {$else}
88 const
89 CONST_BITS = 13;
90 PASS1_BITS = 1; { lose a little precision to avoid overflow }
91 {$endif}
93 const
94 CONST_SCALE = (INT32(1) shl CONST_BITS);
96 const
97 FIX_0_298631336 = INT32(Round(CONST_SCALE * 0.298631336)); {2446}
98 FIX_0_390180644 = INT32(Round(CONST_SCALE * 0.390180644)); {3196}
99 FIX_0_541196100 = INT32(Round(CONST_SCALE * 0.541196100)); {4433}
100 FIX_0_765366865 = INT32(Round(CONST_SCALE * 0.765366865)); {6270}
101 FIX_0_899976223 = INT32(Round(CONST_SCALE * 0.899976223)); {7373}
102 FIX_1_175875602 = INT32(Round(CONST_SCALE * 1.175875602)); {9633}
103 FIX_1_501321110 = INT32(Round(CONST_SCALE * 1.501321110)); {12299}
104 FIX_1_847759065 = INT32(Round(CONST_SCALE * 1.847759065)); {15137}
105 FIX_1_961570560 = INT32(Round(CONST_SCALE * 1.961570560)); {16069}
106 FIX_2_053119869 = INT32(Round(CONST_SCALE * 2.053119869)); {16819}
107 FIX_2_562915447 = INT32(Round(CONST_SCALE * 2.562915447)); {20995}
108 FIX_3_072711026 = INT32(Round(CONST_SCALE * 3.072711026)); {25172}
112 { Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
113 For 8-bit samples with the recommended scaling, all the variable
114 and constant values involved are no more than 16 bits wide, so a
115 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
116 For 12-bit samples, a full 32-bit multiplication will be needed. }
118 {$ifdef BITS_IN_JSAMPLE_IS_8}
120 {$IFDEF BASM16}
121 {$IFNDEF WIN32}
122 {MULTIPLY16C16(var,const)}
123 function Multiply(X, Y: Integer): integer; assembler;
124 asm
125 mov ax, X
126 imul Y
127 mov al, ah
128 mov ah, dl
129 end;
130 {$ENDIF}
131 {$ENDIF}
133 function Multiply(X, Y: INT32): INT32;
134 begin
135 Multiply := INT32(X) * INT32(Y);
136 end;
139 {$else}
140 {#define MULTIPLY(var,const) ((var) * (const))}
141 function Multiply(X, Y: INT32): INT32;
142 begin
143 Multiply := INT32(X) * INT32(Y);
144 end;
145 {$endif}
148 { Dequantize a coefficient by multiplying it by the multiplier-table
149 entry; produce an int result. In this module, both inputs and result
150 are 16 bits or less, so either int or short multiply will work. }
152 function DEQUANTIZE(coef,quantval : int) : int;
153 begin
154 Dequantize := ( ISLOW_MULT_TYPE(coef) * quantval);
155 end;
157 { Descale and correctly round an INT32 value that's scaled by N bits.
158 We assume RIGHT_SHIFT rounds towards minus infinity, so adding
159 the fudge factor is correct for either sign of X. }
161 function DESCALE(x : INT32; n : int) : INT32;
162 var
163 shift_temp : INT32;
164 begin
165 {$ifdef RIGHT_SHIFT_IS_UNSIGNED}
166 shift_temp := x + (INT32(1) shl (n-1));
167 if shift_temp < 0 then
168 Descale := (shift_temp shr n) or ((not INT32(0)) shl (32-n))
169 else
170 Descale := (shift_temp shr n);
171 {$else}
172 Descale := (x + (INT32(1) shl (n-1)) shr n;
173 {$endif}
174 end;
176 { Perform dequantization and inverse DCT on one block of coefficients. }
178 {GLOBAL}
179 procedure jpeg_idct_islow (cinfo : j_decompress_ptr;
180 compptr : jpeg_component_info_ptr;
181 coef_block : JCOEFPTR;
182 output_buf : JSAMPARRAY;
183 output_col : JDIMENSION);
184 type
185 PWorkspace = ^TWorkspace;
186 TWorkspace = coef_bits_field; { buffers data between passes }
187 var
188 tmp0, tmp1, tmp2, tmp3 : INT32;
189 tmp10, tmp11, tmp12, tmp13 : INT32;
190 z1, z2, z3, z4, z5 : INT32;
191 inptr : JCOEFPTR;
192 quantptr : ISLOW_MULT_TYPE_FIELD_PTR;
193 wsptr : PWorkspace;
194 outptr : JSAMPROW;
195 range_limit : JSAMPROW;
196 ctr : int;
197 workspace : TWorkspace;
198 {SHIFT_TEMPS}
199 var
200 dcval : int;
201 var
202 dcval_ : JSAMPLE;
203 begin
204 { Each IDCT routine is responsible for range-limiting its results and
205 converting them to unsigned form (0..MAXJSAMPLE). The raw outputs could
206 be quite far out of range if the input data is corrupt, so a bulletproof
207 range-limiting step is required. We use a mask-and-table-lookup method
208 to do the combined operations quickly. See the comments with
209 prepare_range_limit_table (in jdmaster.c) for more info. }
211 range_limit := JSAMPROW(@(cinfo^.sample_range_limit^[CENTERJSAMPLE]));
214 { Pass 1: process columns from input, store into work array. }
215 { Note results are scaled up by sqrt(8) compared to a true IDCT; }
216 { furthermore, we scale the results by 2**PASS1_BITS. }
218 inptr := coef_block;
219 quantptr := ISLOW_MULT_TYPE_FIELD_PTR (compptr^.dct_table);
220 wsptr := PWorkspace(@workspace);
221 for ctr := pred(DCTSIZE) downto 0 do
222 begin
223 { Due to quantization, we will usually find that many of the input
224 coefficients are zero, especially the AC terms. We can exploit this
225 by short-circuiting the IDCT calculation for any column in which all
226 the AC terms are zero. In that case each output is equal to the
227 DC coefficient (with scale factor as needed).
228 With typical images and quantization tables, half or more of the
229 column DCT calculations can be simplified this way. }
231 if ((inptr^[DCTSIZE*1]=0) and (inptr^[DCTSIZE*2]=0) and
232 (inptr^[DCTSIZE*3]=0) and (inptr^[DCTSIZE*4]=0) and
233 (inptr^[DCTSIZE*5]=0) and (inptr^[DCTSIZE*6]=0) and
234 (inptr^[DCTSIZE*7]=0)) then
235 begin
236 { AC terms all zero }
237 dcval := DEQUANTIZE(inptr^[DCTSIZE*0], quantptr^[DCTSIZE*0]) shl PASS1_BITS;
239 wsptr^[DCTSIZE*0] := dcval;
240 wsptr^[DCTSIZE*1] := dcval;
241 wsptr^[DCTSIZE*2] := dcval;
242 wsptr^[DCTSIZE*3] := dcval;
243 wsptr^[DCTSIZE*4] := dcval;
244 wsptr^[DCTSIZE*5] := dcval;
245 wsptr^[DCTSIZE*6] := dcval;
246 wsptr^[DCTSIZE*7] := dcval;
248 Inc(JCOEF_PTR(inptr)); { advance pointers to next column }
249 Inc(ISLOW_MULT_TYPE_PTR(quantptr));
250 Inc(int_ptr(wsptr));
251 continue;
252 end;
254 { Even part: reverse the even part of the forward DCT. }
255 { The rotator is sqrt(2)*c(-6). }
257 z2 := DEQUANTIZE(inptr^[DCTSIZE*2], quantptr^[DCTSIZE*2]);
258 z3 := DEQUANTIZE(inptr^[DCTSIZE*6], quantptr^[DCTSIZE*6]);
260 z1 := MULTIPLY(z2 + z3, FIX_0_541196100);
261 tmp2 := z1 + MULTIPLY(z3, - FIX_1_847759065);
262 tmp3 := z1 + MULTIPLY(z2, FIX_0_765366865);
264 z2 := DEQUANTIZE(inptr^[DCTSIZE*0], quantptr^[DCTSIZE*0]);
265 z3 := DEQUANTIZE(inptr^[DCTSIZE*4], quantptr^[DCTSIZE*4]);
267 tmp0 := (z2 + z3) shl CONST_BITS;
268 tmp1 := (z2 - z3) shl CONST_BITS;
270 tmp10 := tmp0 + tmp3;
271 tmp13 := tmp0 - tmp3;
272 tmp11 := tmp1 + tmp2;
273 tmp12 := tmp1 - tmp2;
275 { Odd part per figure 8; the matrix is unitary and hence its
276 transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. }
278 tmp0 := DEQUANTIZE(inptr^[DCTSIZE*7], quantptr^[DCTSIZE*7]);
279 tmp1 := DEQUANTIZE(inptr^[DCTSIZE*5], quantptr^[DCTSIZE*5]);
280 tmp2 := DEQUANTIZE(inptr^[DCTSIZE*3], quantptr^[DCTSIZE*3]);
281 tmp3 := DEQUANTIZE(inptr^[DCTSIZE*1], quantptr^[DCTSIZE*1]);
283 z1 := tmp0 + tmp3;
284 z2 := tmp1 + tmp2;
285 z3 := tmp0 + tmp2;
286 z4 := tmp1 + tmp3;
287 z5 := MULTIPLY(z3 + z4, FIX_1_175875602); { sqrt(2) * c3 }
289 tmp0 := MULTIPLY(tmp0, FIX_0_298631336); { sqrt(2) * (-c1+c3+c5-c7) }
290 tmp1 := MULTIPLY(tmp1, FIX_2_053119869); { sqrt(2) * ( c1+c3-c5+c7) }
291 tmp2 := MULTIPLY(tmp2, FIX_3_072711026); { sqrt(2) * ( c1+c3+c5-c7) }
292 tmp3 := MULTIPLY(tmp3, FIX_1_501321110); { sqrt(2) * ( c1+c3-c5-c7) }
293 z1 := MULTIPLY(z1, - FIX_0_899976223); { sqrt(2) * (c7-c3) }
294 z2 := MULTIPLY(z2, - FIX_2_562915447); { sqrt(2) * (-c1-c3) }
295 z3 := MULTIPLY(z3, - FIX_1_961570560); { sqrt(2) * (-c3-c5) }
296 z4 := MULTIPLY(z4, - FIX_0_390180644); { sqrt(2) * (c5-c3) }
298 Inc(z3, z5);
299 Inc(z4, z5);
301 Inc(tmp0, z1 + z3);
302 Inc(tmp1, z2 + z4);
303 Inc(tmp2, z2 + z3);
304 Inc(tmp3, z1 + z4);
306 { Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 }
308 wsptr^[DCTSIZE*0] := int (DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS));
309 wsptr^[DCTSIZE*7] := int (DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS));
310 wsptr^[DCTSIZE*1] := int (DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS));
311 wsptr^[DCTSIZE*6] := int (DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS));
312 wsptr^[DCTSIZE*2] := int (DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS));
313 wsptr^[DCTSIZE*5] := int (DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS));
314 wsptr^[DCTSIZE*3] := int (DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS));
315 wsptr^[DCTSIZE*4] := int (DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS));
317 Inc(JCOEF_PTR(inptr)); { advance pointers to next column }
318 Inc(ISLOW_MULT_TYPE_PTR(quantptr));
319 Inc(int_ptr(wsptr));
320 end;
322 { Pass 2: process rows from work array, store into output array. }
323 { Note that we must descale the results by a factor of 8 == 2**3, }
324 { and also undo the PASS1_BITS scaling. }
326 wsptr := @workspace;
327 for ctr := 0 to pred(DCTSIZE) do
328 begin
329 outptr := output_buf^[ctr];
330 Inc(JSAMPLE_PTR(outptr), output_col);
331 { Rows of zeroes can be exploited in the same way as we did with columns.
332 However, the column calculation has created many nonzero AC terms, so
333 the simplification applies less often (typically 5% to 10% of the time).
334 On machines with very fast multiplication, it's possible that the
335 test takes more time than it's worth. In that case this section
336 may be commented out. }
338 {$ifndef NO_ZERO_ROW_TEST}
339 if ((wsptr^[1]=0) and (wsptr^[2]=0) and (wsptr^[3]=0) and (wsptr^[4]=0)
340 and (wsptr^[5]=0) and (wsptr^[6]=0) and (wsptr^[7]=0)) then
341 begin
342 { AC terms all zero }
343 JSAMPLE(dcval_) := range_limit^[int(DESCALE(INT32(wsptr^[0]),
344 PASS1_BITS+3)) and RANGE_MASK];
346 outptr^[0] := dcval_;
347 outptr^[1] := dcval_;
348 outptr^[2] := dcval_;
349 outptr^[3] := dcval_;
350 outptr^[4] := dcval_;
351 outptr^[5] := dcval_;
352 outptr^[6] := dcval_;
353 outptr^[7] := dcval_;
355 Inc(int_ptr(wsptr), DCTSIZE); { advance pointer to next row }
356 continue;
357 end;
358 {$endif}
360 { Even part: reverse the even part of the forward DCT. }
361 { The rotator is sqrt(2)*c(-6). }
363 z2 := INT32 (wsptr^[2]);
364 z3 := INT32 (wsptr^[6]);
366 z1 := MULTIPLY(z2 + z3, FIX_0_541196100);
367 tmp2 := z1 + MULTIPLY(z3, - FIX_1_847759065);
368 tmp3 := z1 + MULTIPLY(z2, FIX_0_765366865);
370 tmp0 := (INT32(wsptr^[0]) + INT32(wsptr^[4])) shl CONST_BITS;
371 tmp1 := (INT32(wsptr^[0]) - INT32(wsptr^[4])) shl CONST_BITS;
373 tmp10 := tmp0 + tmp3;
374 tmp13 := tmp0 - tmp3;
375 tmp11 := tmp1 + tmp2;
376 tmp12 := tmp1 - tmp2;
378 { Odd part per figure 8; the matrix is unitary and hence its
379 transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. }
381 tmp0 := INT32(wsptr^[7]);
382 tmp1 := INT32(wsptr^[5]);
383 tmp2 := INT32(wsptr^[3]);
384 tmp3 := INT32(wsptr^[1]);
386 z1 := tmp0 + tmp3;
387 z2 := tmp1 + tmp2;
388 z3 := tmp0 + tmp2;
389 z4 := tmp1 + tmp3;
390 z5 := MULTIPLY(z3 + z4, FIX_1_175875602); { sqrt(2) * c3 }
392 tmp0 := MULTIPLY(tmp0, FIX_0_298631336); { sqrt(2) * (-c1+c3+c5-c7) }
393 tmp1 := MULTIPLY(tmp1, FIX_2_053119869); { sqrt(2) * ( c1+c3-c5+c7) }
394 tmp2 := MULTIPLY(tmp2, FIX_3_072711026); { sqrt(2) * ( c1+c3+c5-c7) }
395 tmp3 := MULTIPLY(tmp3, FIX_1_501321110); { sqrt(2) * ( c1+c3-c5-c7) }
396 z1 := MULTIPLY(z1, - FIX_0_899976223); { sqrt(2) * (c7-c3) }
397 z2 := MULTIPLY(z2, - FIX_2_562915447); { sqrt(2) * (-c1-c3) }
398 z3 := MULTIPLY(z3, - FIX_1_961570560); { sqrt(2) * (-c3-c5) }
399 z4 := MULTIPLY(z4, - FIX_0_390180644); { sqrt(2) * (c5-c3) }
401 Inc(z3, z5);
402 Inc(z4, z5);
404 Inc(tmp0, z1 + z3);
405 Inc(tmp1, z2 + z4);
406 Inc(tmp2, z2 + z3);
407 Inc(tmp3, z1 + z4);
409 { Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 }
411 outptr^[0] := range_limit^[ int(DESCALE(tmp10 + tmp3,
412 CONST_BITS+PASS1_BITS+3))
413 and RANGE_MASK];
414 outptr^[7] := range_limit^[ int(DESCALE(tmp10 - tmp3,
415 CONST_BITS+PASS1_BITS+3))
416 and RANGE_MASK];
417 outptr^[1] := range_limit^[ int(DESCALE(tmp11 + tmp2,
418 CONST_BITS+PASS1_BITS+3))
419 and RANGE_MASK];
420 outptr^[6] := range_limit^[ int(DESCALE(tmp11 - tmp2,
421 CONST_BITS+PASS1_BITS+3))
422 and RANGE_MASK];
423 outptr^[2] := range_limit^[ int(DESCALE(tmp12 + tmp1,
424 CONST_BITS+PASS1_BITS+3))
425 and RANGE_MASK];
426 outptr^[5] := range_limit^[ int(DESCALE(tmp12 - tmp1,
427 CONST_BITS+PASS1_BITS+3))
428 and RANGE_MASK];
429 outptr^[3] := range_limit^[ int(DESCALE(tmp13 + tmp0,
430 CONST_BITS+PASS1_BITS+3))
431 and RANGE_MASK];
432 outptr^[4] := range_limit^[ int(DESCALE(tmp13 - tmp0,
433 CONST_BITS+PASS1_BITS+3))
434 and RANGE_MASK];
436 Inc(int_ptr(wsptr), DCTSIZE); { advance pointer to next row }
437 end;
438 end;
440 end.