source: Vago/zlib-1.2.8/contrib/inflate86/inffast.S@ 1049

Last change on this file since 1049 was 1049, checked in by s10k, 8 years ago
File size: 41.8 KB
Line 
1/*
2 * inffast.S is a hand tuned assembler version of:
3 *
4 * inffast.c -- fast decoding
5 * Copyright (C) 1995-2003 Mark Adler
6 * For conditions of distribution and use, see copyright notice in zlib.h
7 *
8 * Copyright (C) 2003 Chris Anderson <christop@charm.net>
9 * Please use the copyright conditions above.
10 *
11 * This version (Jan-23-2003) of inflate_fast was coded and tested under
12 * GNU/Linux on a pentium 3, using the gcc-3.2 compiler distribution. On that
13 * machine, I found that gzip style archives decompressed about 20% faster than
14 * the gcc-3.2 -O3 -fomit-frame-pointer compiled version. Your results will
15 * depend on how large of a buffer is used for z_stream.next_in & next_out
16 * (8K-32K worked best for my 256K cpu cache) and how much overhead there is in
17 * stream processing I/O and crc32/adler32. In my case, this routine used
18 * 70% of the cpu time and crc32 used 20%.
19 *
20 * I am confident that this version will work in the general case, but I have
21 * not tested a wide variety of datasets or a wide variety of platforms.
22 *
23 * Jan-24-2003 -- Added -DUSE_MMX define for slightly faster inflating.
24 * It should be a runtime flag instead of compile time flag...
25 *
26 * Jan-26-2003 -- Added runtime check for MMX support with cpuid instruction.
27 * With -DUSE_MMX, only MMX code is compiled. With -DNO_MMX, only non-MMX code
28 * is compiled. Without either option, runtime detection is enabled. Runtime
29 * detection should work on all modern cpus and the recommended algorithm (flip
30 * ID bit on eflags and then use the cpuid instruction) is used in many
31 * multimedia applications. Tested under win2k with gcc-2.95 and gas-2.12
32 * distributed with cygwin3. Compiling with gcc-2.95 -c inffast.S -o
33 * inffast.obj generates a COFF object which can then be linked with MSVC++
34 * compiled code. Tested under FreeBSD 4.7 with gcc-2.95.
35 *
36 * Jan-28-2003 -- Tested Athlon XP... MMX mode is slower than no MMX (and
37 * slower than compiler generated code). Adjusted cpuid check to use the MMX
38 * code only for Pentiums < P4 until I have more data on the P4. Speed
39 * improvement is only about 15% on the Athlon when compared with code generated
40 * with MSVC++. Not sure yet, but I think the P4 will also be slower using the
41 * MMX mode because many of its x86 ALU instructions execute in .5 cycles and
42 * have less latency than MMX ops. Added code to buffer the last 11 bytes of
43 * the input stream since the MMX code grabs bits in chunks of 32, which
44 * differs from the inffast.c algorithm. I don't think there would have been
45 * read overruns where a page boundary was crossed (a segfault), but there
46 * could have been overruns when next_in ends on unaligned memory (uninitialized
47 * memory read).
48 *
49 * Mar-13-2003 -- P4 MMX is slightly slower than P4 NO_MMX. I created a C
50 * version of the non-MMX code so that it doesn't depend on zstrm and zstate
51 * structure offsets which are hard coded in this file. This was last tested
52 * with zlib-1.2.0 which is currently in beta testing, newer versions of this
53 * and inffas86.c can be found at http://www.eetbeetee.com/zlib/ and
54 * http://www.charm.net/~christop/zlib/
55 */
56
57
58/*
59 * if you have underscore linking problems (_inflate_fast undefined), try
60 * using -DGAS_COFF
61 */
62#if ! defined( GAS_COFF ) && ! defined( GAS_ELF )
63
64#if defined( WIN32 ) || defined( __CYGWIN__ )
65#define GAS_COFF /* windows object format */
66#else
67#define GAS_ELF
68#endif
69
70#endif /* ! GAS_COFF && ! GAS_ELF */
71
72
73#if defined( GAS_COFF )
74
75/* coff externals have underscores */
76#define inflate_fast _inflate_fast
77#define inflate_fast_use_mmx _inflate_fast_use_mmx
78
79#endif /* GAS_COFF */
80
81
82.file "inffast.S"
83
84.globl inflate_fast
85
86.text
87.align 4,0
88.L_invalid_literal_length_code_msg:
89.string "invalid literal/length code"
90
91.align 4,0
92.L_invalid_distance_code_msg:
93.string "invalid distance code"
94
95.align 4,0
96.L_invalid_distance_too_far_msg:
97.string "invalid distance too far back"
98
99#if ! defined( NO_MMX )
100.align 4,0
101.L_mask: /* mask[N] = ( 1 << N ) - 1 */
102.long 0
103.long 1
104.long 3
105.long 7
106.long 15
107.long 31
108.long 63
109.long 127
110.long 255
111.long 511
112.long 1023
113.long 2047
114.long 4095
115.long 8191
116.long 16383
117.long 32767
118.long 65535
119.long 131071
120.long 262143
121.long 524287
122.long 1048575
123.long 2097151
124.long 4194303
125.long 8388607
126.long 16777215
127.long 33554431
128.long 67108863
129.long 134217727
130.long 268435455
131.long 536870911
132.long 1073741823
133.long 2147483647
134.long 4294967295
135#endif /* NO_MMX */
136
137.text
138
139/*
140 * struct z_stream offsets, in zlib.h
141 */
142#define next_in_strm 0 /* strm->next_in */
143#define avail_in_strm 4 /* strm->avail_in */
144#define next_out_strm 12 /* strm->next_out */
145#define avail_out_strm 16 /* strm->avail_out */
146#define msg_strm 24 /* strm->msg */
147#define state_strm 28 /* strm->state */
148
149/*
150 * struct inflate_state offsets, in inflate.h
151 */
152#define mode_state 0 /* state->mode */
153#define wsize_state 32 /* state->wsize */
154#define write_state 40 /* state->write */
155#define window_state 44 /* state->window */
156#define hold_state 48 /* state->hold */
157#define bits_state 52 /* state->bits */
158#define lencode_state 68 /* state->lencode */
159#define distcode_state 72 /* state->distcode */
160#define lenbits_state 76 /* state->lenbits */
161#define distbits_state 80 /* state->distbits */
162
163/*
164 * inflate_fast's activation record (offsets from %esp after the prologue)
165 */
166#define local_var_size 64 /* bytes the prologue reserves for the local vars below */
167#define strm_sp 88 /* first arg: z_stream * (local_var_size + 24; 24 = return address + 4 pushed regs + eflags) */
168#define start_sp 92 /* second arg: unsigned int (local_var_size + 28) */
169
170/*
171 * offsets for local vars on stack
172 */
173#define out 60 /* unsigned char* */
174#define window 56 /* unsigned char* */
175#define wsize 52 /* unsigned int */
176#define write 48 /* unsigned int */
177#define in 44 /* unsigned char* */
178#define beg 40 /* unsigned char* */
179#define buf 28 /* char[ 12 ] */
180#define len 24 /* unsigned int */
181#define last 20 /* unsigned char* */
182#define end 16 /* unsigned char* */
183#define dcode 12 /* code* */
184#define lcode 8 /* code* */
185#define dmask 4 /* unsigned int */
186#define lmask 0 /* unsigned int */
187
188/*
189 * typedef enum inflate_mode consts, in inflate.h
190 */
191#define INFLATE_MODE_TYPE 11 /* state->mode flags enum-ed in inflate.h */
192#define INFLATE_MODE_BAD 26
193
194
195#if ! defined( USE_MMX ) && ! defined( NO_MMX )
196
197#define RUN_TIME_MMX
198
199#define CHECK_MMX 1
200#define DO_USE_MMX 2
201#define DONT_USE_MMX 3
202
203.globl inflate_fast_use_mmx
204
205.data
206
207.align 4,0
208inflate_fast_use_mmx: /* integer flag for run time control 1=check,2=mmx,3=no */
209.long CHECK_MMX
210
211#if defined( GAS_ELF )
212/* elf info */
213.type inflate_fast_use_mmx,@object
214.size inflate_fast_use_mmx,4
215#endif
216
217#endif /* RUN_TIME_MMX */
218
219#if defined( GAS_COFF )
220/* coff info: scl 2 = extern, type 32 = function */
221.def inflate_fast; .scl 2; .type 32; .endef
222#endif
223
224.text
225
226.align 32,0x90
227inflate_fast:
228 pushl %edi
229 pushl %esi
230 pushl %ebp
231 pushl %ebx
232 pushf /* save eflags (strm_sp, state_sp assumes this is 32 bits) */
233 subl $local_var_size, %esp
234 cld
235
236#define strm_r %esi
237#define state_r %edi
238
239 movl strm_sp(%esp), strm_r
240 movl state_strm(strm_r), state_r
241
242 /* in = strm->next_in;
243 * out = strm->next_out;
244 * last = in + strm->avail_in - 11;
245 * beg = out - (start - strm->avail_out);
246 * end = out + (strm->avail_out - 257);
247 */
248 movl avail_in_strm(strm_r), %edx
249 movl next_in_strm(strm_r), %eax
250
251 addl %eax, %edx /* avail_in += next_in */
252 subl $11, %edx /* avail_in -= 11 */
253
254 movl %eax, in(%esp)
255 movl %edx, last(%esp)
256
257 movl start_sp(%esp), %ebp
258 movl avail_out_strm(strm_r), %ecx
259 movl next_out_strm(strm_r), %ebx
260
261 subl %ecx, %ebp /* start -= avail_out */
262 negl %ebp /* start = -start */
263 addl %ebx, %ebp /* start += next_out */
264
265 subl $257, %ecx /* avail_out -= 257 */
266 addl %ebx, %ecx /* avail_out += out */
267
268 movl %ebx, out(%esp)
269 movl %ebp, beg(%esp)
270 movl %ecx, end(%esp)
271
272 /* wsize = state->wsize;
273 * write = state->write;
274 * window = state->window;
275 * hold = state->hold;
276 * bits = state->bits;
277 * lcode = state->lencode;
278 * dcode = state->distcode;
279 * lmask = ( 1 << state->lenbits ) - 1;
280 * dmask = ( 1 << state->distbits ) - 1;
281 */
282
283 movl lencode_state(state_r), %eax
284 movl distcode_state(state_r), %ecx
285
286 movl %eax, lcode(%esp)
287 movl %ecx, dcode(%esp)
288
289 movl $1, %eax
290 movl lenbits_state(state_r), %ecx
291 shll %cl, %eax
292 decl %eax
293 movl %eax, lmask(%esp)
294
295 movl $1, %eax
296 movl distbits_state(state_r), %ecx
297 shll %cl, %eax
298 decl %eax
299 movl %eax, dmask(%esp)
300
301 movl wsize_state(state_r), %eax
302 movl write_state(state_r), %ecx
303 movl window_state(state_r), %edx
304
305 movl %eax, wsize(%esp)
306 movl %ecx, write(%esp)
307 movl %edx, window(%esp)
308
309 movl hold_state(state_r), %ebp
310 movl bits_state(state_r), %ebx
311
312#undef strm_r
313#undef state_r
314
315#define in_r %esi
316#define from_r %esi
317#define out_r %edi
318
319 movl in(%esp), in_r
320 movl last(%esp), %ecx
321 cmpl in_r, %ecx
322 ja .L_align_long /* if in < last */
323
324 addl $11, %ecx /* ecx = &in[ avail_in ] */
325 subl in_r, %ecx /* ecx = avail_in */
326 movl $12, %eax
327 subl %ecx, %eax /* eax = 12 - avail_in */
328 leal buf(%esp), %edi
329 rep movsb /* memcpy( buf, in, avail_in ) */
330 movl %eax, %ecx
331 xorl %eax, %eax
332 rep stosb /* memset( &buf[ avail_in ], 0, 12 - avail_in ) */
333 leal buf(%esp), in_r /* in = buf */
334 movl in_r, last(%esp) /* last = in, do just one iteration */
335 jmp .L_is_aligned
336
337 /* align in_r on long boundary */
338.L_align_long:
339 testl $3, in_r
340 jz .L_is_aligned
341 xorl %eax, %eax
342 movb (in_r), %al
343 incl in_r
344 movl %ebx, %ecx
345 addl $8, %ebx
346 shll %cl, %eax
347 orl %eax, %ebp
348 jmp .L_align_long
349
350.L_is_aligned:
351 movl out(%esp), out_r
352
353#if defined( NO_MMX )
354 jmp .L_do_loop
355#endif
356
357#if defined( USE_MMX )
358 jmp .L_init_mmx
359#endif
360
361/*** Runtime MMX check ***/
362
363#if defined( RUN_TIME_MMX )
364.L_check_mmx:
365 cmpl $DO_USE_MMX, inflate_fast_use_mmx
366 je .L_init_mmx
367 ja .L_do_loop /* > 2 */
368
369 pushl %eax
370 pushl %ebx
371 pushl %ecx
372 pushl %edx
373 pushf
374 movl (%esp), %eax /* copy eflags to eax */
375 xorl $0x200000, (%esp) /* try toggling ID bit of eflags (bit 21)
376 * to see if cpu supports cpuid...
377 * ID bit method not supported by NexGen but
378 * bios may load a cpuid instruction and
379 * cpuid may be disabled on Cyrix 5-6x86 */
380 popf
381 pushf
382 popl %edx /* copy new eflags to edx */
383 xorl %eax, %edx /* test if ID bit is flipped */
384 jz .L_dont_use_mmx /* not flipped if zero */
385 xorl %eax, %eax
386 cpuid
387 cmpl $0x756e6547, %ebx /* check for GenuineIntel in ebx,ecx,edx */
388 jne .L_dont_use_mmx
389 cmpl $0x6c65746e, %ecx
390 jne .L_dont_use_mmx
391 cmpl $0x49656e69, %edx
392 jne .L_dont_use_mmx
393 movl $1, %eax
394 cpuid /* get cpu features */
395 shrl $8, %eax
396 andl $15, %eax
397 cmpl $6, %eax /* check for Pentium family, is 0xf for P4 */
398 jne .L_dont_use_mmx
399 testl $0x800000, %edx /* test if MMX feature is set (bit 23) */
400 jnz .L_use_mmx
401 jmp .L_dont_use_mmx
402.L_use_mmx:
403 movl $DO_USE_MMX, inflate_fast_use_mmx
404 jmp .L_check_mmx_pop
405.L_dont_use_mmx:
406 movl $DONT_USE_MMX, inflate_fast_use_mmx
407.L_check_mmx_pop:
408 popl %edx
409 popl %ecx
410 popl %ebx
411 popl %eax
412 jmp .L_check_mmx
413#endif
414
415
416/*** Non-MMX code ***/
417
418#if defined ( NO_MMX ) || defined( RUN_TIME_MMX )
419
420#define hold_r %ebp
421#define bits_r %bl
422#define bitslong_r %ebx
423
424.align 32,0x90
425.L_while_test:
426 /* while (in < last && out < end)
427 */
428 cmpl out_r, end(%esp)
429 jbe .L_break_loop /* if (out >= end) */
430
431 cmpl in_r, last(%esp)
432 jbe .L_break_loop
433
434.L_do_loop:
435 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out
436 *
437 * do {
438 * if (bits <= 15) {
439 * hold |= *((unsigned short *)in)++ << bits;
440 * bits += 16
441 * }
442 * this = lcode[hold & lmask]
443 */
444 cmpb $15, bits_r
445 ja .L_get_length_code /* skip the refill if (15 < bits) */
446
447 xorl %eax, %eax /* clear eax so the 16-bit load below is zero-extended */
448 lodsw /* ax = *(ushort *)in, in += 2 */
449 movb bits_r, %cl /* cl = bits, needs it for shifting */
450 addb $16, bits_r /* bits += 16 */
451 shll %cl, %eax /* line the new bits up above those already in hold */
452 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
453
454.L_get_length_code:
455 movl lmask(%esp), %edx /* edx = lmask */
456 movl lcode(%esp), %ecx /* ecx = lcode */
457 andl hold_r, %edx /* edx &= hold */
458 movl (%ecx,%edx,4), %eax /* eax = lcode[hold & lmask] */
459
460.L_dolen:
461 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out
462 *
463 * dolen:
464 * bits -= this.bits;
465 * hold >>= this.bits
466 */
467 movb %ah, %cl /* cl = this.bits */
468 subb %ah, bits_r /* bits -= this.bits */
469 shrl %cl, hold_r /* hold >>= this.bits */
470
471 /* check if op is a literal
472 * if (op == 0) {
473 * PUP(out) = this.val;
474 * }
475 */
476 testb %al, %al
477 jnz .L_test_for_length_base /* if (op != 0) 45.7% */
478
479 shrl $16, %eax /* output this.val char */
480 stosb
481 jmp .L_while_test
482
483.L_test_for_length_base:
484 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = len
485 *
486 * else if (op & 16) {
487 * len = this.val
488 * op &= 15
489 * if (op) {
490 * if (op > bits) {
491 * hold |= *((unsigned short *)in)++ << bits;
492 * bits += 16
493 * }
494 * len += hold & mask[op];
495 * bits -= op;
496 * hold >>= op;
497 * }
498 */
499#define len_r %edx
500 movl %eax, len_r /* len = this */
501 shrl $16, len_r /* len = this.val */
502 movb %al, %cl /* cl = this.op */
503
504 testb $16, %al
505 jz .L_test_for_second_level_length /* if ((op & 16) == 0) 8% */
506 andb $15, %cl /* op &= 15 */
507 jz .L_save_len /* if (!op) */
508 cmpb %cl, bits_r
509 jae .L_add_bits_to_len /* if (op <= bits) */
510
511 movb %cl, %ch /* stash op in ch, freeing cl */
512 xorl %eax, %eax /* clear eax so the 16-bit load below is zero-extended */
513 lodsw /* ax = *(ushort *)in, in += 2 */
514 movb bits_r, %cl /* cl = bits, needs it for shifting */
515 addb $16, bits_r /* bits += 16 */
516 shll %cl, %eax
517 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
518 movb %ch, %cl /* move op back to cl */
519
520.L_add_bits_to_len:
521 movl $1, %eax
522 shll %cl, %eax
523 decl %eax
524 subb %cl, bits_r
525 andl hold_r, %eax /* eax &= hold */
526 shrl %cl, hold_r
527 addl %eax, len_r /* len += hold & mask[op] */
528
529.L_save_len:
530 movl len_r, len(%esp) /* save len */
531#undef len_r
532
533.L_decode_distance:
534 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = dist
535 *
536 * if (bits <= 15) {
537 * hold |= *((unsigned short *)in)++ << bits;
538 * bits += 16
539 * }
540 * this = dcode[hold & dmask];
541 * dodist:
542 * bits -= this.bits;
543 * hold >>= this.bits;
544 * op = this.op;
545 */
546
547 cmpb $15, bits_r
548 ja .L_get_distance_code /* skip the refill if (15 < bits) */
549
550 xorl %eax, %eax /* clear eax so the 16-bit load below is zero-extended */
551 lodsw /* ax = *(ushort *)in, in += 2 */
552 movb bits_r, %cl /* cl = bits, needs it for shifting */
553 addb $16, bits_r /* bits += 16 */
554 shll %cl, %eax /* line the new bits up above those already in hold */
555 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
556
557.L_get_distance_code:
558 movl dmask(%esp), %edx /* edx = dmask */
559 movl dcode(%esp), %ecx /* ecx = dcode */
560 andl hold_r, %edx /* edx &= hold */
561 movl (%ecx,%edx,4), %eax /* eax = dcode[hold & dmask] */
562
563#define dist_r %edx
564.L_dodist:
565 movl %eax, dist_r /* dist = this */
566 shrl $16, dist_r /* dist = this.val */
567 movb %ah, %cl /* cl = this.bits */
568 subb %ah, bits_r /* bits -= this.bits */
569 shrl %cl, hold_r /* hold >>= this.bits */
570
571 /* if (op & 16) {
572 * dist = this.val
573 * op &= 15
574 * if (op > bits) {
575 * hold |= *((unsigned short *)in)++ << bits;
576 * bits += 16
577 * }
578 * dist += hold & mask[op];
579 * bits -= op;
580 * hold >>= op;
581 */
582 movb %al, %cl /* cl = this.op */
583
584 testb $16, %al /* if ((op & 16) == 0) */
585 jz .L_test_for_second_level_dist
586 andb $15, %cl /* op &= 15 */
587 jz .L_check_dist_one /* no extra bits: dist is just this.val */
588 cmpb %cl, bits_r
589 jae .L_add_bits_to_dist /* if (op <= bits) 97.6% */
590
591 movb %cl, %ch /* stash op in ch, freeing cl */
592 xorl %eax, %eax /* clear eax so the 16-bit load below is zero-extended */
593 lodsw /* ax = *(ushort *)in, in += 2 */
594 movb bits_r, %cl /* cl = bits, needs it for shifting */
595 addb $16, bits_r /* bits += 16 */
596 shll %cl, %eax
597 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
598 movb %ch, %cl /* move op back to cl */
599
600.L_add_bits_to_dist:
601 movl $1, %eax
602 shll %cl, %eax
603 decl %eax /* (1 << op) - 1 */
604 subb %cl, bits_r /* bits -= op */
605 andl hold_r, %eax /* eax &= hold */
606 shrl %cl, hold_r /* hold >>= op */
607 addl %eax, dist_r /* dist += hold & ((1 << op) - 1) */
608 /* fall through to .L_check_window, the next label (no jmp needed) */
609
610.L_check_window:
611 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
612 * %eax = nbytes, %ecx = len
613 *
614 * nbytes = out - beg;
615 * if (dist <= nbytes) {
616 * from = out - dist;
617 * do {
618 * PUP(out) = PUP(from);
619 * } while (--len > 0);
620 * }
621 */
622
623 movl in_r, in(%esp) /* save in so from can use its reg */
624 movl out_r, %eax
625 subl beg(%esp), %eax /* nbytes = out - beg */
626
627 cmpl dist_r, %eax
628 jb .L_clip_window /* if (dist > nbytes) 4.2% */
629
630 movl len(%esp), %ecx
631 movl out_r, from_r
632 subl dist_r, from_r /* from = out - dist */
633
634 subl $3, %ecx /* first 3 bytes are copied by hand below (presumably len >= 3 always -- confirm) */
635 movb (from_r), %al
636 movb %al, (out_r)
637 movb 1(from_r), %al
638 movb 2(from_r), %dl /* overwrites dist (%edx), which is not read again in this block */
639 addl $3, from_r
640 movb %al, 1(out_r)
641 movb %dl, 2(out_r)
642 addl $3, out_r
643 rep movsb /* copy the remaining len - 3 bytes */
644
645 movl in(%esp), in_r /* move in back to %esi, toss from */
646 jmp .L_while_test
647
648.align 16,0x90
649.L_check_dist_one:
650 cmpl $1, dist_r
651 jne .L_check_window
652 cmpl out_r, beg(%esp)
653 je .L_check_window
654
655 decl out_r
656 movl len(%esp), %ecx
657 movb (out_r), %al
658 subl $3, %ecx
659
660 movb %al, 1(out_r)
661 movb %al, 2(out_r)
662 movb %al, 3(out_r)
663 addl $4, out_r
664 rep stosb
665
666 jmp .L_while_test
667
668.align 16,0x90
669.L_test_for_second_level_length:
670 /* else if ((op & 64) == 0) {
671 * this = lcode[this.val + (hold & mask[op])];
672 * }
673 */
674 testb $64, %al
675 jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */
676
677 movl $1, %eax
678 shll %cl, %eax
679 decl %eax
680 andl hold_r, %eax /* eax &= hold */
681 addl %edx, %eax /* eax += this.val */
682 movl lcode(%esp), %edx /* edx = lcode */
683 movl (%edx,%eax,4), %eax /* eax = lcode[val + (hold&mask[op])] */
684 jmp .L_dolen
685
686.align 16,0x90
687.L_test_for_second_level_dist:
688 /* else if ((op & 64) == 0) {
689 * this = dcode[this.val + (hold & mask[op])];
690 * }
691 */
692 testb $64, %al
693 jnz .L_invalid_distance_code /* if ((op & 64) != 0) */
694
695 movl $1, %eax
696 shll %cl, %eax
697 decl %eax
698 andl hold_r, %eax /* eax &= hold */
699 addl %edx, %eax /* eax += this.val */
700 movl dcode(%esp), %edx /* edx = dcode */
701 movl (%edx,%eax,4), %eax /* eax = dcode[val + (hold&mask[op])] */
702 jmp .L_dodist
703
704.align 16,0x90
705.L_clip_window:
706 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
707 * %ecx = nbytes
708 *
709 * else {
710 * if (dist > wsize) {
711 * invalid distance
712 * }
713 * from = window;
714 * nbytes = dist - nbytes;
715 * if (write == 0) {
716 * from += wsize - nbytes;
717 */
718#define nbytes_r %ecx
719 movl %eax, nbytes_r
720 movl wsize(%esp), %eax /* prepare for dist compare */
721 negl nbytes_r /* nbytes = -nbytes */
722 movl window(%esp), from_r /* from = window */
723
724 cmpl dist_r, %eax
725 jb .L_invalid_distance_too_far /* if (dist > wsize) */
726
727 addl dist_r, nbytes_r /* nbytes = dist - nbytes */
728 cmpl $0, write(%esp)
729 jne .L_wrap_around_window /* if (write != 0) */
730
731 subl nbytes_r, %eax
732 addl %eax, from_r /* from += wsize - nbytes */
733
734 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
735 * %ecx = nbytes, %eax = len
736 *
737 * if (nbytes < len) {
738 * len -= nbytes;
739 * do {
740 * PUP(out) = PUP(from);
741 * } while (--nbytes);
742 * from = out - dist;
743 * }
744 * }
745 */
746#define len_r %eax
747 movl len(%esp), len_r
748 cmpl nbytes_r, len_r
749 jbe .L_do_copy1 /* if (nbytes >= len) */
750
751 subl nbytes_r, len_r /* len -= nbytes */
752 rep movsb /* copy nbytes bytes out of the window */
753 movl out_r, from_r
754 subl dist_r, from_r /* from = out - dist */
755 jmp .L_do_copy1 /* copy the remaining len bytes */
765
766.L_wrap_around_window:
767 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
768 * %ecx = nbytes, %eax = write, %eax = len
769 *
770 * else if (write < nbytes) {
771 * from += wsize + write - nbytes;
772 * nbytes -= write;
773 * if (nbytes < len) {
774 * len -= nbytes;
775 * do {
776 * PUP(out) = PUP(from);
777 * } while (--nbytes);
778 * from = window;
779 * nbytes = write;
780 * if (nbytes < len) {
781 * len -= nbytes;
782 * do {
783 * PUP(out) = PUP(from);
784 * } while(--nbytes);
785 * from = out - dist;
786 * }
787 * }
788 * }
789 */
790#define write_r %eax
791 movl write(%esp), write_r
792 cmpl write_r, nbytes_r
793 jbe .L_contiguous_in_window /* if (write >= nbytes) */
794
795 addl wsize(%esp), from_r
796 addl write_r, from_r
797 subl nbytes_r, from_r /* from += wsize + write - nbytes */
798 subl write_r, nbytes_r /* nbytes -= write */
799#undef write_r
800
801 movl len(%esp), len_r
802 cmpl nbytes_r, len_r
803 jbe .L_do_copy1 /* if (nbytes >= len) */
804
805 subl nbytes_r, len_r /* len -= nbytes */
806 rep movsb
807 movl window(%esp), from_r /* from = window */
808 movl write(%esp), nbytes_r /* nbytes = write */
809 cmpl nbytes_r, len_r
810 jbe .L_do_copy1 /* if (nbytes >= len) */
811
812 subl nbytes_r, len_r /* len -= nbytes */
813 rep movsb
814 movl out_r, from_r
815 subl dist_r, from_r /* from = out - dist */
816 jmp .L_do_copy1
817
818.L_contiguous_in_window:
819 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
820 * %ecx = nbytes, %eax = write, %eax = len
821 *
822 * else {
823 * from += write - nbytes;
824 * if (nbytes < len) {
825 * len -= nbytes;
826 * do {
827 * PUP(out) = PUP(from);
828 * } while (--nbytes);
829 * from = out - dist;
830 * }
831 * }
832 */
833#define write_r %eax
834 addl write_r, from_r
835 subl nbytes_r, from_r /* from += write - nbytes */
836#undef write_r
837
838 movl len(%esp), len_r
839 cmpl nbytes_r, len_r
840 jbe .L_do_copy1 /* if (nbytes >= len) */
841
842 subl nbytes_r, len_r /* len -= nbytes */
843 rep movsb
844 movl out_r, from_r
845 subl dist_r, from_r /* from = out - dist */
846
847.L_do_copy1:
848 /* regs: %esi = from, %esi = in, %ebp = hold, %bl = bits, %edi = out
849 * %eax = len
850 *
851 * while (len > 0) {
852 * PUP(out) = PUP(from);
853 * len--;
854 * }
855 * }
856 * } while (in < last && out < end);
857 */
858#undef nbytes_r
859#define in_r %esi
860 movl len_r, %ecx
861 rep movsb
862
863 movl in(%esp), in_r /* move in back to %esi, toss from */
864 jmp .L_while_test
865
866#undef len_r
867#undef dist_r
868
869#endif /* NO_MMX || RUN_TIME_MMX */
870
871
872/*** MMX code ***/
873
874#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
875
876.align 32,0x90
877.L_init_mmx:
878 emms
879
880#undef bits_r
881#undef bitslong_r
882#define bitslong_r %ebp
883#define hold_mm %mm0
884 movd %ebp, hold_mm
885 movl %ebx, bitslong_r
886
887#define used_mm %mm1
888#define dmask2_mm %mm2
889#define lmask2_mm %mm3
890#define lmask_mm %mm4
891#define dmask_mm %mm5
892#define tmp_mm %mm6
893
894 movd lmask(%esp), lmask_mm
895 movq lmask_mm, lmask2_mm
896 movd dmask(%esp), dmask_mm
897 movq dmask_mm, dmask2_mm
898 pxor used_mm, used_mm
899 movl lcode(%esp), %ebx /* ebx = lcode */
900 jmp .L_do_loop_mmx
901
902.align 32,0x90
903.L_while_test_mmx:
904 /* while (in < last && out < end)
905 */
906 cmpl out_r, end(%esp)
907 jbe .L_break_loop /* if (out >= end) */
908
909 cmpl in_r, last(%esp)
910 jbe .L_break_loop
911
912.L_do_loop_mmx:
913 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
914
915 cmpl $32, bitslong_r
916 ja .L_get_length_code_mmx /* if (32 < bits) */
917
918 movd bitslong_r, tmp_mm
919 movd (in_r), %mm7
920 addl $4, in_r
921 psllq tmp_mm, %mm7
922 addl $32, bitslong_r
923 por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits */
924
925.L_get_length_code_mmx:
926 pand hold_mm, lmask_mm
927 movd lmask_mm, %eax
928 movq lmask2_mm, lmask_mm
929 movl (%ebx,%eax,4), %eax /* eax = lcode[hold & lmask] */
930
931.L_dolen_mmx:
932 movzbl %ah, %ecx /* ecx = this.bits */
933 movd %ecx, used_mm
934 subl %ecx, bitslong_r /* bits -= this.bits */
935
936 testb %al, %al
937 jnz .L_test_for_length_base_mmx /* if (op != 0) 45.7% */
938
939 shrl $16, %eax /* output this.val char */
940 stosb
941 jmp .L_while_test_mmx
942
943.L_test_for_length_base_mmx:
944#define len_r %edx
945 movl %eax, len_r /* len = this */
946 shrl $16, len_r /* len = this.val */
947
948 testb $16, %al
949 jz .L_test_for_second_level_length_mmx /* if ((op & 16) == 0) 8% */
950 andl $15, %eax /* op &= 15 */
951 jz .L_decode_distance_mmx /* if (!op) */
952
953 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
954 movd %eax, used_mm
955 movd hold_mm, %ecx
956 subl %eax, bitslong_r
957 andl .L_mask(,%eax,4), %ecx
958 addl %ecx, len_r /* len += hold & mask[op] */
959
960.L_decode_distance_mmx:
961 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
962
963 cmpl $32, bitslong_r
964 ja .L_get_dist_code_mmx /* if (32 < bits) */
965
966 movd bitslong_r, tmp_mm
967 movd (in_r), %mm7
968 addl $4, in_r
969 psllq tmp_mm, %mm7
970 addl $32, bitslong_r
971 por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits */
972
973.L_get_dist_code_mmx:
974 movl dcode(%esp), %ebx /* ebx = dcode */
975 pand hold_mm, dmask_mm /* dmask_mm = hold & dmask */
976 movd dmask_mm, %eax /* eax = table index */
977 movq dmask2_mm, dmask_mm /* restore dmask_mm from its copy */
978 movl (%ebx,%eax,4), %eax /* eax = dcode[hold & dmask] */
979
980.L_dodist_mmx:
981#define dist_r %ebx
982 movzbl %ah, %ecx /* ecx = this.bits */
983 movl %eax, dist_r
984 shrl $16, dist_r /* dist = this.val */
985 subl %ecx, bitslong_r /* bits -= this.bits */
986 movd %ecx, used_mm
987
988 testb $16, %al /* if ((op & 16) == 0) */
989 jz .L_test_for_second_level_dist_mmx
990 andl $15, %eax /* op &= 15 */
991 jz .L_check_dist_one_mmx
992
993.L_add_bits_to_dist_mmx:
994 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
995 movd %eax, used_mm /* save bit length of current op */
996 movd hold_mm, %ecx /* get the next bits on input stream */
997 subl %eax, bitslong_r /* bits -= op bits */
998 andl .L_mask(,%eax,4), %ecx /* ecx = hold & mask[op] */
999 addl %ecx, dist_r /* dist += hold & mask[op] */
1000
1001.L_check_window_mmx:
1002 movl in_r, in(%esp) /* save in so from can use it's reg */
1003 movl out_r, %eax
1004 subl beg(%esp), %eax /* nbytes = out - beg */
1005
1006 cmpl dist_r, %eax
1007 jb .L_clip_window_mmx /* if (dist > nbytes) 4.2% */
1008
1009 movl len_r, %ecx
1010 movl out_r, from_r
1011 subl dist_r, from_r /* from = out - dist */
1012
1013 subl $3, %ecx
1014 movb (from_r), %al
1015 movb %al, (out_r)
1016 movb 1(from_r), %al
1017 movb 2(from_r), %dl
1018 addl $3, from_r
1019 movb %al, 1(out_r)
1020 movb %dl, 2(out_r)
1021 addl $3, out_r
1022 rep movsb
1023
1024 movl in(%esp), in_r /* move in back to %esi, toss from */
1025 movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */
1026 jmp .L_while_test_mmx
1027
1028.align 16,0x90
1029.L_check_dist_one_mmx:
1030 cmpl $1, dist_r
1031 jne .L_check_window_mmx
1032 cmpl out_r, beg(%esp)
1033 je .L_check_window_mmx
1034
1035 decl out_r
1036 movl len_r, %ecx
1037 movb (out_r), %al
1038 subl $3, %ecx
1039
1040 movb %al, 1(out_r)
1041 movb %al, 2(out_r)
1042 movb %al, 3(out_r)
1043 addl $4, out_r
1044 rep stosb
1045
1046 movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */
1047 jmp .L_while_test_mmx
1048
1049.align 16,0x90
1050.L_test_for_second_level_length_mmx:
1051 testb $64, %al
1052 jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */
1053
1054 andl $15, %eax /* op &= 15 */
1055 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
1056 movd hold_mm, %ecx /* ecx = low 32 bits of hold */
1057 andl .L_mask(,%eax,4), %ecx /* ecx = hold & mask[op] */
1058 addl len_r, %ecx /* ecx += this.val */
1059 movl (%ebx,%ecx,4), %eax /* eax = lcode[val + (hold&mask[op])] */
1060 jmp .L_dolen_mmx
1061
1062.align 16,0x90
1063.L_test_for_second_level_dist_mmx:
1064 testb $64, %al
1065 jnz .L_invalid_distance_code /* if ((op & 64) != 0) */
1066
1067 andl $15, %eax /* op &= 15 */
1068 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
1069 movd hold_mm, %ecx /* ecx = low 32 bits of hold */
1070 andl .L_mask(,%eax,4), %ecx /* ecx = hold & mask[op] */
1071 movl dcode(%esp), %eax /* eax = dcode */
1072 addl dist_r, %ecx /* ecx += this.val */
1073 movl (%eax,%ecx,4), %eax /* eax = dcode[val + (hold&mask[op])] */
1074 jmp .L_dodist_mmx
1075
1076.align 16,0x90
1077.L_clip_window_mmx:
1078#define nbytes_r %ecx
1079 movl %eax, nbytes_r
1080 movl wsize(%esp), %eax /* prepare for dist compare */
1081 negl nbytes_r /* nbytes = -nbytes */
1082 movl window(%esp), from_r /* from = window */
1083
1084 cmpl dist_r, %eax
1085 jb .L_invalid_distance_too_far /* if (dist > wsize) */
1086
1087 addl dist_r, nbytes_r /* nbytes = dist - nbytes */
1088 cmpl $0, write(%esp)
1089 jne .L_wrap_around_window_mmx /* if (write != 0) */
1090
1091 subl nbytes_r, %eax
1092 addl %eax, from_r /* from += wsize - nbytes */
1093
1094 cmpl nbytes_r, len_r
1095 jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1096
1097 subl nbytes_r, len_r /* len -= nbytes */
1098 rep movsb /* copy nbytes bytes out of the window */
1099 movl out_r, from_r
1100 subl dist_r, from_r /* from = out - dist */
1101 jmp .L_do_copy1_mmx /* copy the remaining len bytes */
1111
1112.L_wrap_around_window_mmx:
1113#define write_r %eax
1114 movl write(%esp), write_r
1115 cmpl write_r, nbytes_r
1116 jbe .L_contiguous_in_window_mmx /* if (write >= nbytes) */
1117
1118 addl wsize(%esp), from_r
1119 addl write_r, from_r
1120 subl nbytes_r, from_r /* from += wsize + write - nbytes */
1121 subl write_r, nbytes_r /* nbytes -= write */
1122#undef write_r
1123
1124 cmpl nbytes_r, len_r
1125 jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1126
1127 subl nbytes_r, len_r /* len -= nbytes */
1128 rep movsb
1129 movl window(%esp), from_r /* from = window */
1130 movl write(%esp), nbytes_r /* nbytes = write */
1131 cmpl nbytes_r, len_r
1132 jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1133
1134 subl nbytes_r, len_r /* len -= nbytes */
1135 rep movsb
1136 movl out_r, from_r
1137 subl dist_r, from_r /* from = out - dist */
1138 jmp .L_do_copy1_mmx
1139
1140.L_contiguous_in_window_mmx:
1141#define write_r %eax
1142 addl write_r, from_r
1143 subl nbytes_r, from_r /* from += write - nbytes */
1144#undef write_r
1145
1146 cmpl nbytes_r, len_r
1147 jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1148
1149 subl nbytes_r, len_r /* len -= nbytes */
1150 rep movsb
1151 movl out_r, from_r
1152 subl dist_r, from_r /* from = out - dist */
1153
1154.L_do_copy1_mmx:
1155#undef nbytes_r
1156#define in_r %esi
1157 movl len_r, %ecx
1158 rep movsb
1159
1160 movl in(%esp), in_r /* move in back to %esi, toss from */
1161 movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */
1162 jmp .L_while_test_mmx
1163
1164#undef hold_r
1165#undef bitslong_r
1166
1167#endif /* USE_MMX || RUN_TIME_MMX */
1168
1169
1170/*** USE_MMX, NO_MMX, and RUNTIME_MMX from here on ***/
1171
/*
 * Error exit for an invalid distance code.  Loads the address of the
 * message string into %ecx and INFLATE_MODE_BAD into %edx, the two
 * registers .L_update_stream_state stores into strm->msg and state->mode.
 */
1172.L_invalid_distance_code:
1173 /* else {
1174 * strm->msg = "invalid distance code";
1175 * state->mode = BAD;
1176 * }
1177 */
1178 movl $.L_invalid_distance_code_msg, %ecx /* %ecx = msg */
1179 movl $INFLATE_MODE_BAD, %edx /* %edx = new mode */
1180 jmp .L_update_stream_state
1181
/*
 * Distinguish end-of-block from an invalid literal/length code by testing
 * the 32 bit of op in %al.  For end-of-block, pass .L_update_stream_state a
 * NULL message (%ecx == 0, so strm->msg is left untouched) and
 * INFLATE_MODE_TYPE in %edx.
 */
1182.L_test_for_end_of_block:
1183 /* else if (op & 32) {
1184 * state->mode = TYPE;
1185 * break;
1186 * }
1187 */
1188 testb $32, %al
1189 jz .L_invalid_literal_length_code /* if ((op & 32) == 0) */
1190
1191 xorl %ecx, %ecx /* msg = NULL; flags are dead here, %ecx is re-tested at .L_update_stream_state */
1192 movl $INFLATE_MODE_TYPE, %edx /* %edx = new mode */
1193 jmp .L_update_stream_state
1194
/*
 * Error exit for an invalid literal/length code; in this excerpt it is
 * reached from .L_test_for_end_of_block when (op & 32) == 0.  Loads the
 * message address into %ecx and INFLATE_MODE_BAD into %edx for
 * .L_update_stream_state.
 */
1195.L_invalid_literal_length_code:
1196 /* else {
1197 * strm->msg = "invalid literal/length code";
1198 * state->mode = BAD;
1199 * }
1200 */
1201 movl $.L_invalid_literal_length_code_msg, %ecx /* %ecx = msg */
1202 movl $INFLATE_MODE_BAD, %edx /* %edx = new mode */
1203 jmp .L_update_stream_state
1204
/*
 * Error exit for a distance that reaches too far back.  At this point
 * in's register is in use as from_r, so `in` is first reloaded from its
 * stack slot; the exit code that follows .L_update_stream_state reads in_r.
 * Then %ecx = message address and %edx = INFLATE_MODE_BAD.
 */
1205.L_invalid_distance_too_far:
1206 /* strm->msg = "invalid distance too far back";
1207 * state->mode = BAD;
1208 */
1209 movl in(%esp), in_r /* from_r has in's reg, put in back */
1210 movl $.L_invalid_distance_too_far_msg, %ecx /* %ecx = msg */
1211 movl $INFLATE_MODE_BAD, %edx /* %edx = new mode */
1212 jmp .L_update_stream_state /* target is the next label; the jump is redundant but harmless */
1213
/*
 * Shared exit for the error and end-of-block paths.
 * In:  %ecx = message pointer, or 0 to leave strm->msg unchanged
 *      %edx = value to store in state->mode
 * Clobbers %eax (strm, then state) and the flags, then jumps to
 * .L_break_loop.
 */
1214.L_update_stream_state:
1215 /* set strm->msg = %ecx, strm->state->mode = %edx */
1216 movl strm_sp(%esp), %eax /* %eax = strm, from its stack slot */
1217 testl %ecx, %ecx /* if (msg != NULL) */
1218 jz .L_skip_msg
1219 movl %ecx, msg_strm(%eax) /* strm->msg = msg */
1220.L_skip_msg:
1221 movl state_strm(%eax), %eax /* state = strm->state */
1222 movl %edx, mode_state(%eax) /* state->mode = edx (BAD | TYPE) */
1223 jmp .L_break_loop /* jumps over any 0x90 fill emitted by the .align that follows */
1224
/*
 * Start of the common exit sequence.  The label is aligned, with 0x90
 * (NOP) as the fill byte.  The MMX and non-MMX paths arrive with `bits` in
 * different registers (see the table below); in the MMX case bits is
 * copied from %ebp to %ebx so the code after .L_update_next_in can read
 * bits from %ebx on either path.
 */
1225.align 32,0x90
1226.L_break_loop:
1227
1228/*
1229 * Regs:
1230 *
1231 * bits = %ebp when mmx, and in %ebx when non-mmx
1232 * hold = %hold_mm when mmx, and in %ebp when non-mmx
1233 * in = %esi
1234 * out = %edi
1235 */
1236
1237#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
1238
1239#if defined( RUN_TIME_MMX )
1240
1241 cmpl $DO_USE_MMX, inflate_fast_use_mmx /* run-time selection: was the MMX path used? */
1242 jne .L_update_next_in /* no: bits is already in %ebx */
1243
1244#endif /* RUN_TIME_MMX */
1245
1246 movl %ebp, %ebx /* MMX path: move bits to %ebx */
1247
1248.L_update_next_in:
1249
1250#endif
1251
/*
 * Write back bits, next_out and next_in.
 * %ecx = bits >> 3 is subtracted from in; bits is then reduced by
 * 8 * (bits >> 3), which leaves bits & 7, and stored in state->bits.  A
 * copy of the reduced bits stays in %ecx, whose low byte is the %cl shift
 * count of the mask built right after this block.
 * If last(%esp) equals the address of the stack buffer `buf`, `in` is
 * rebased from buf onto strm->next_in and last is recomputed from
 * strm->next_in and strm->avail_in.
 * NOTE(review): presumably that case means input was being read from the
 * stack copy `buf` rather than from the caller's buffer -- confirm against
 * the prologue.
 * strm_r and state_r stay #defined past the end of this block.
 */
1252#define strm_r %eax
1253#define state_r %edx
1254
1255 /* len = bits >> 3;
1256 * in -= len;
1257 * bits -= len << 3;
1258 * hold &= (1U << bits) - 1;
1259 * state->hold = hold;
1260 * state->bits = bits;
1261 * strm->next_in = in;
1262 * strm->next_out = out;
1263 */
1264 movl strm_sp(%esp), strm_r
1265 movl %ebx, %ecx /* %ecx = bits */
1266 movl state_strm(strm_r), state_r
1267 shrl $3, %ecx /* len = bits >> 3 */
1268 subl %ecx, in_r /* in -= len */
1269 shll $3, %ecx
1270 subl %ecx, %ebx /* bits -= len << 3 */
1271 movl out_r, next_out_strm(strm_r) /* strm->next_out = out */
1272 movl %ebx, bits_state(state_r) /* state->bits = bits */
1273 movl %ebx, %ecx /* keep bits in %ecx; %ebx is reused as scratch below */
1274
1275 leal buf(%esp), %ebx /* %ebx = address of buf */
1276 cmpl %ebx, last(%esp)
1277 jne .L_buf_not_used /* if buf != last */
1278
1279 subl %ebx, in_r /* in -= buf */
1280 movl next_in_strm(strm_r), %ebx
1281 movl %ebx, last(%esp) /* last = strm->next_in */
1282 addl %ebx, in_r /* in += strm->next_in */
1283 movl avail_in_strm(strm_r), %ebx
1284 subl $11, %ebx
1285 addl %ebx, last(%esp) /* last = &strm->next_in[ avail_in - 11 ] */
1286
1287.L_buf_not_used:
1288 movl in_r, next_in_strm(strm_r) /* strm->next_in = in */
1289
/*
 * Build the mask (1 << bits) - 1 in %ebx, using the bit count left in %cl
 * by the code just above, and store hold & mask in state->hold.
 * On the MMX path, hold is first shifted right by used_mm and moved from
 * hold_mm into %ebp, and emms resets the MMX state before returning to
 * non-MMX code.  On the non-MMX path, hold is already in %ebp.
 */
1290 movl $1, %ebx
1291 shll %cl, %ebx
1292 decl %ebx /* %ebx = (1U << bits) - 1 */
1293
1294#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
1295
1296#if defined( RUN_TIME_MMX )
1297
1298 cmpl $DO_USE_MMX, inflate_fast_use_mmx /* run-time selection: was the MMX path used? */
1299 jne .L_update_hold /* no: hold is already in %ebp */
1300
1301#endif /* RUN_TIME_MMX */
1302
1303 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
1304 movd hold_mm, %ebp /* %ebp = low 32 bits of hold_mm */
1305
1306 emms /* clear the MMX state */
1307
1308.L_update_hold:
1309
1310#endif /* USE_MMX || RUN_TIME_MMX */
1311
1312 andl %ebx, %ebp /* hold &= (1U << bits) - 1 */
1313 movl %ebp, hold_state(state_r) /* state->hold = hold */
1314
/*
 * Recompute strm->avail_in from in and last.  The comparison is unsigned
 * (jbe), and each branch subtracts the smaller pointer from the larger
 * before applying the constant 11.  The second branch overwrites in_r,
 * which is not read again before the return.
 */
1315#define last_r %ebx
1316
1317 /* strm->avail_in = in < last ? 11 + (last - in) : 11 - (in - last) */
1318 movl last(%esp), last_r
1319 cmpl in_r, last_r
1320 jbe .L_last_is_smaller /* if (in >= last) */
1321
1322 subl in_r, last_r /* last -= in */
1323 addl $11, last_r /* last += 11 */
1324 movl last_r, avail_in_strm(strm_r) /* avail_in = 11 + (last - in) */
1325 jmp .L_fixup_out
1326.L_last_is_smaller:
1327 subl last_r, in_r /* in -= last */
1328 negl in_r /* in = -in */
1329 addl $11, in_r /* in += 11 */
1330 movl in_r, avail_in_strm(strm_r) /* avail_in = 11 - (in - last) */
1331
/*
 * Recompute strm->avail_out from out and end, using the constant 257.
 * %ebx is renamed from last_r to end_r for this step.  The comparison is
 * unsigned (jbe).  The second branch overwrites out_r, which is not read
 * again before the return.  The strm_r/state_r/end_r register aliases are
 * retired at the end.
 */
1332#undef last_r
1333#define end_r %ebx
1334
1335.L_fixup_out:
1336 /* strm->avail_out = out < end ? 257 + (end - out) : 257 - (out - end) */
1337 movl end(%esp), end_r
1338 cmpl out_r, end_r
1339 jbe .L_end_is_smaller /* if (out >= end) */
1340
1341 subl out_r, end_r /* end -= out */
1342 addl $257, end_r /* end += 257 */
1343 movl end_r, avail_out_strm(strm_r) /* avail_out = 257 + (end - out) */
1344 jmp .L_done
1345.L_end_is_smaller:
1346 subl end_r, out_r /* out -= end */
1347 negl out_r /* out = -out */
1348 addl $257, out_r /* out += 257 */
1349 movl out_r, avail_out_strm(strm_r) /* avail_out = 257 - (out - end) */
1350
1351#undef end_r
1352#undef strm_r
1353#undef state_r
1354
/*
 * Function epilogue: release local_var_size bytes of stack locals, restore
 * the flags, pop %ebx, %ebp, %esi and %edi, and return.
 * NOTE(review): presumably this mirrors the pushes in the prologue, which
 * is outside this excerpt -- confirm the order matches.
 */
1355.L_done:
1356 addl $local_var_size, %esp /* drop the local variable area */
1357 popf /* restore saved flags */
1358 popl %ebx
1359 popl %ebp
1360 popl %esi
1361 popl %edi
1362 ret
1363
1364#if defined( GAS_ELF )
1365/* elf info */
1366.type inflate_fast,@function
1367.size inflate_fast,.-inflate_fast
1368#endif
Note: See TracBrowser for help on using the repository browser.