1 |
/* gzappend -- command to append to a gzip file |
2 |
|
3 |
Copyright (C) 2003, 2012 Mark Adler, all rights reserved |
4 |
version 1.2, 11 Oct 2012 |
5 |
|
6 |
This software is provided 'as-is', without any express or implied |
7 |
warranty. In no event will the author be held liable for any damages |
8 |
arising from the use of this software. |
9 |
|
10 |
Permission is granted to anyone to use this software for any purpose, |
11 |
including commercial applications, and to alter it and redistribute it |
12 |
freely, subject to the following restrictions: |
13 |
|
14 |
1. The origin of this software must not be misrepresented; you must not |
15 |
claim that you wrote the original software. If you use this software |
16 |
in a product, an acknowledgment in the product documentation would be |
17 |
appreciated but is not required. |
18 |
2. Altered source versions must be plainly marked as such, and must not be |
19 |
misrepresented as being the original software. |
20 |
3. This notice may not be removed or altered from any source distribution. |
21 |
|
22 |
Mark Adler madler@alumni.caltech.edu |
23 |
*/ |
24 |
|
25 |
/* |
26 |
* Change history: |
27 |
* |
28 |
* 1.0 19 Oct 2003 - First version |
29 |
* 1.1 4 Nov 2003 - Expand and clarify some comments and notes |
30 |
* - Add version and copyright to help |
31 |
* - Send help to stdout instead of stderr |
32 |
* - Add some preemptive typecasts |
33 |
* - Add L to constants in lseek() calls |
34 |
* - Remove some debugging information in error messages |
35 |
* - Use new data_type definition for zlib 1.2.1 |
36 |
* - Simplify and unify file operations |
37 |
* - Finish off gzip file in gztack() |
38 |
* - Use deflatePrime() instead of adding empty blocks |
39 |
* - Keep gzip file clean on appended file read errors |
40 |
* - Use in-place rotate instead of auxiliary buffer |
41 |
* (Why you ask? Because it was fun to write!) |
42 |
* 1.2 11 Oct 2012 - Fix for proper z_const usage |
43 |
* - Check for input buffer malloc failure |
44 |
*/ |
45 |
|
46 |
/* |
47 |
gzappend takes a gzip file and appends to it, compressing files from the |
48 |
command line or data from stdin. The gzip file is written to directly, to |
49 |
avoid copying that file, in case it's large. Note that this results in the |
50 |
unfriendly behavior that if gzappend fails, the gzip file is corrupted. |
51 |
|
52 |
This program was written to illustrate the use of the new Z_BLOCK option of |
53 |
zlib 1.2.x's inflate() function. This option returns from inflate() at each |
54 |
block boundary to facilitate locating and modifying the last block bit at |
55 |
the start of the final deflate block. Also whether using Z_BLOCK or not, |
56 |
another required feature of zlib 1.2.x is that inflate() now provides the |
57 |
number of unused bits in the last input byte used. gzappend will not work |
58 |
with versions of zlib earlier than 1.2.1. |
59 |
|
60 |
gzappend first decompresses the gzip file internally, discarding all but |
61 |
the last 32K of uncompressed data, and noting the location of the last block |
62 |
bit and the number of unused bits in the last byte of the compressed data. |
63 |
The gzip trailer containing the CRC-32 and length of the uncompressed data |
64 |
is verified. This trailer will be later overwritten. |
65 |
|
66 |
Then the last block bit is cleared by seeking back in the file and rewriting |
67 |
the byte that contains it. Seeking forward, the last byte of the compressed |
68 |
data is saved along with the number of unused bits to initialize deflate. |
69 |
|
70 |
A deflate process is initialized, using the last 32K of the uncompressed |
71 |
data from the gzip file to initialize the dictionary. If the total |
72 |
uncompressed data was less than 32K, then all of it is used to initialize |
73 |
the dictionary. The deflate output bit buffer is also initialized with the |
74 |
last bits from the original deflate stream. From here on, the data to |
75 |
append is simply compressed using deflate, and written to the gzip file. |
76 |
When that is complete, the new CRC-32 and uncompressed length are written |
77 |
as the trailer of the gzip file. |
78 |
*/ |
79 |
|
80 |
#include <stdio.h> |
81 |
#include <stdlib.h> |
82 |
#include <string.h> |
83 |
#include <fcntl.h> |
84 |
#include <unistd.h> |
85 |
#include "zlib.h" |
86 |
|
87 |
/* zlib-style shorthand: helpers private to this file are declared "local" */
#define local static

/* log base 2 of the I/O buffer size, and the buffer size itself in bytes */
#define LGCHUNK 14
#define CHUNK (1U << LGCHUNK)

/* size in bytes of the window kept from the end of the uncompressed data,
   which is later handed to deflate as its dictionary */
#define DSIZE (1U << 15)
91 |
|
92 |
/* print "gzappend error: " followed by msg1 and msg2 to stderr, then exit
   with status 1 -- this function does not return; the messages are only
   read, so they are taken as pointers to const, which lets callers pass
   string literals without discarding a qualifier */
local void bye(const char *msg1, const char *msg2)
{
    fprintf(stderr, "gzappend error: %s%s\n", msg1, msg2);
    exit(1);
}
98 |
|
99 |
/* return the greatest common divisor of a and b using Euclid's algorithm,
   modified to be fast when one argument is much greater than the other, and
   coded to avoid unnecessary swapping -- gcd(a, 0) and gcd(0, a) return a */
local unsigned gcd(unsigned a, unsigned b)
{
    unsigned c;

    while (a && b) {
        if (a > b) {
            /* subtract the largest b * 2^k that leaves a non-negative */
            c = b;
            while (a - c >= c)
                c <<= 1;
            a -= c;
        }
        else {
            c = a;
            while (b - c >= c)
                c <<= 1;
            b -= c;
        }
    }
    return a + b;                       /* one of the two is zero here */
}

/* rotate list[0..len-1] left by rot positions, in place -- rot is reduced
   modulo len, and lists of fewer than two entries are left untouched */
local void rotate(unsigned char *list, unsigned len, unsigned rot)
{
    unsigned char tmp;
    unsigned cycles;
    unsigned start, to, from;           /* indices into list */

    /* normalize rot and handle degenerate cases */
    if (len < 2) return;
    if (rot >= len) rot %= len;
    if (rot == 0) return;

    /* do simple left shift by one -- source and destination overlap, so
       memmove() is required (memcpy() on overlapping objects is undefined) */
    if (rot == 1) {
        tmp = list[0];
        memmove(list, list + 1, len - 1);
        list[len - 1] = tmp;
        return;
    }

    /* do simple right shift by one */
    if (rot == len - 1) {
        tmp = list[len - 1];
        memmove(list + 1, list, len - 1);
        list[0] = tmp;
        return;
    }

    /* otherwise do rotate as a set of cycles in place -- indices are used
       instead of pointers so that no pointer beyond the end of the list is
       ever formed, and the step is computed without unsigned overflow */
    cycles = gcd(len, rot);             /* number of cycles */
    do {
        start = from = cycles;          /* start index is arbitrary */
        tmp = list[from];               /* save entry to be overwritten */
        for (;;) {
            to = from;                  /* next step in cycle */
            if (from >= len - rot)      /* go right rot positions, mod len */
                from -= len - rot;
            else
                from += rot;
            if (from == start) break;   /* all but one shifted */
            list[to] = list[from];      /* shift left */
        }
        list[to] = tmp;                 /* complete the circle */
    } while (--cycles);
}
168 |
|
169 |
/* structure for gzip file read operations */ |
170 |
typedef struct { |
171 |
int fd; /* file descriptor */ |
172 |
int size; /* 1 << size is bytes in buf */ |
173 |
unsigned left; /* bytes available at next */ |
174 |
unsigned char *buf; /* buffer */ |
175 |
z_const unsigned char *next; /* next byte in buffer */ |
176 |
char *name; /* file name for error messages */ |
177 |
} file; |
178 |
|
179 |
/* reload buffer */ |
180 |
local int readin(file *in) |
181 |
{ |
182 |
int len; |
183 |
|
184 |
len = read(in->fd, in->buf, 1 << in->size); |
185 |
if (len == -1) bye("error reading ", in->name); |
186 |
in->left = (unsigned)len; |
187 |
in->next = in->buf; |
188 |
return len; |
189 |
} |
190 |
|
191 |
/* read from file in, exit if end-of-file */ |
192 |
local int readmore(file *in) |
193 |
{ |
194 |
if (readin(in) == 0) bye("unexpected end of ", in->name); |
195 |
return 0; |
196 |
} |
197 |
|
198 |
/* fetch the next byte from in, refilling the buffer first when it is empty
   (readmore() terminates the program at end of file) -- the argument is
   parenthesized so that any pointer expression, such as &gz, can be passed,
   but it is evaluated more than once and so must have no side effects */
#define read1(in) ((in)->left == 0 ? readmore(in) : 0, \
                   (in)->left--, *((in)->next)++)
200 |
|
201 |
/* skip over n bytes of in */ |
202 |
local void skip(file *in, unsigned n) |
203 |
{ |
204 |
unsigned bypass; |
205 |
|
206 |
if (n > in->left) { |
207 |
n -= in->left; |
208 |
bypass = n & ~((1U << in->size) - 1); |
209 |
if (bypass) { |
210 |
if (lseek(in->fd, (off_t)bypass, SEEK_CUR) == -1) |
211 |
bye("seeking ", in->name); |
212 |
n -= bypass; |
213 |
} |
214 |
readmore(in); |
215 |
if (n > in->left) |
216 |
bye("unexpected end of ", in->name); |
217 |
} |
218 |
in->left -= n; |
219 |
in->next += n; |
220 |
} |
221 |
|
222 |
/* read a four-byte unsigned integer, little-endian, from in */ |
223 |
unsigned long read4(file *in) |
224 |
{ |
225 |
unsigned long val; |
226 |
|
227 |
val = read1(in); |
228 |
val += (unsigned)read1(in) << 8; |
229 |
val += (unsigned long)read1(in) << 16; |
230 |
val += (unsigned long)read1(in) << 24; |
231 |
return val; |
232 |
} |
233 |
|
234 |
/* skip over gzip header */ |
235 |
local void gzheader(file *in) |
236 |
{ |
237 |
int flags; |
238 |
unsigned n; |
239 |
|
240 |
if (read1(in) != 31 || read1(in) != 139) bye(in->name, " not a gzip file"); |
241 |
if (read1(in) != 8) bye("unknown compression method in", in->name); |
242 |
flags = read1(in); |
243 |
if (flags & 0xe0) bye("unknown header flags set in", in->name); |
244 |
skip(in, 6); |
245 |
if (flags & 4) { |
246 |
n = read1(in); |
247 |
n += (unsigned)(read1(in)) << 8; |
248 |
skip(in, n); |
249 |
} |
250 |
if (flags & 8) while (read1(in) != 0) ; |
251 |
if (flags & 16) while (read1(in) != 0) ; |
252 |
if (flags & 2) skip(in, 2); |
253 |
} |
254 |
|
255 |
/* decompress gzip file "name", return strm with a deflate stream ready to |
256 |
continue compression of the data in the gzip file, and return a file |
257 |
descriptor pointing to where to write the compressed data -- the deflate |
258 |
stream is initialized to compress using level "level" */ |
259 |
local int gzscan(char *name, z_stream *strm, int level) |
260 |
{ |
261 |
int ret, lastbit, left, full; |
262 |
unsigned have; |
263 |
unsigned long crc, tot; |
264 |
unsigned char *window; |
265 |
off_t lastoff, end; |
266 |
file gz; |
267 |
|
268 |
/* open gzip file */ |
269 |
gz.name = name; |
270 |
gz.fd = open(name, O_RDWR, 0); |
271 |
if (gz.fd == -1) bye("cannot open ", name); |
272 |
gz.buf = malloc(CHUNK); |
273 |
if (gz.buf == NULL) bye("out of memory", ""); |
274 |
gz.size = LGCHUNK; |
275 |
gz.left = 0; |
276 |
|
277 |
/* skip gzip header */ |
278 |
gzheader(&gz); |
279 |
|
280 |
/* prepare to decompress */ |
281 |
window = malloc(DSIZE); |
282 |
if (window == NULL) bye("out of memory", ""); |
283 |
strm->zalloc = Z_NULL; |
284 |
strm->zfree = Z_NULL; |
285 |
strm->opaque = Z_NULL; |
286 |
ret = inflateInit2(strm, -15); |
287 |
if (ret != Z_OK) bye("out of memory", " or library mismatch"); |
288 |
|
289 |
/* decompress the deflate stream, saving append information */ |
290 |
lastbit = 0; |
291 |
lastoff = lseek(gz.fd, 0L, SEEK_CUR) - gz.left; |
292 |
left = 0; |
293 |
strm->avail_in = gz.left; |
294 |
strm->next_in = gz.next; |
295 |
crc = crc32(0L, Z_NULL, 0); |
296 |
have = full = 0; |
297 |
do { |
298 |
/* if needed, get more input */ |
299 |
if (strm->avail_in == 0) { |
300 |
readmore(&gz); |
301 |
strm->avail_in = gz.left; |
302 |
strm->next_in = gz.next; |
303 |
} |
304 |
|
305 |
/* set up output to next available section of sliding window */ |
306 |
strm->avail_out = DSIZE - have; |
307 |
strm->next_out = window + have; |
308 |
|
309 |
/* inflate and check for errors */ |
310 |
ret = inflate(strm, Z_BLOCK); |
311 |
if (ret == Z_STREAM_ERROR) bye("internal stream error!", ""); |
312 |
if (ret == Z_MEM_ERROR) bye("out of memory", ""); |
313 |
if (ret == Z_DATA_ERROR) |
314 |
bye("invalid compressed data--format violated in", name); |
315 |
|
316 |
/* update crc and sliding window pointer */ |
317 |
crc = crc32(crc, window + have, DSIZE - have - strm->avail_out); |
318 |
if (strm->avail_out) |
319 |
have = DSIZE - strm->avail_out; |
320 |
else { |
321 |
have = 0; |
322 |
full = 1; |
323 |
} |
324 |
|
325 |
/* process end of block */ |
326 |
if (strm->data_type & 128) { |
327 |
if (strm->data_type & 64) |
328 |
left = strm->data_type & 0x1f; |
329 |
else { |
330 |
lastbit = strm->data_type & 0x1f; |
331 |
lastoff = lseek(gz.fd, 0L, SEEK_CUR) - strm->avail_in; |
332 |
} |
333 |
} |
334 |
} while (ret != Z_STREAM_END); |
335 |
inflateEnd(strm); |
336 |
gz.left = strm->avail_in; |
337 |
gz.next = strm->next_in; |
338 |
|
339 |
/* save the location of the end of the compressed data */ |
340 |
end = lseek(gz.fd, 0L, SEEK_CUR) - gz.left; |
341 |
|
342 |
/* check gzip trailer and save total for deflate */ |
343 |
if (crc != read4(&gz)) |
344 |
bye("invalid compressed data--crc mismatch in ", name); |
345 |
tot = strm->total_out; |
346 |
if ((tot & 0xffffffffUL) != read4(&gz)) |
347 |
bye("invalid compressed data--length mismatch in", name); |
348 |
|
349 |
/* if not at end of file, warn */ |
350 |
if (gz.left || readin(&gz)) |
351 |
fprintf(stderr, |
352 |
"gzappend warning: junk at end of gzip file overwritten\n"); |
353 |
|
354 |
/* clear last block bit */ |
355 |
lseek(gz.fd, lastoff - (lastbit != 0), SEEK_SET); |
356 |
if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name); |
357 |
*gz.buf = (unsigned char)(*gz.buf ^ (1 << ((8 - lastbit) & 7))); |
358 |
lseek(gz.fd, -1L, SEEK_CUR); |
359 |
if (write(gz.fd, gz.buf, 1) != 1) bye("writing after seek to ", name); |
360 |
|
361 |
/* if window wrapped, build dictionary from window by rotating */ |
362 |
if (full) { |
363 |
rotate(window, DSIZE, have); |
364 |
have = DSIZE; |
365 |
} |
366 |
|
367 |
/* set up deflate stream with window, crc, total_in, and leftover bits */ |
368 |
ret = deflateInit2(strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); |
369 |
if (ret != Z_OK) bye("out of memory", ""); |
370 |
deflateSetDictionary(strm, window, have); |
371 |
strm->adler = crc; |
372 |
strm->total_in = tot; |
373 |
if (left) { |
374 |
lseek(gz.fd, --end, SEEK_SET); |
375 |
if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name); |
376 |
deflatePrime(strm, 8 - left, *gz.buf); |
377 |
} |
378 |
lseek(gz.fd, end, SEEK_SET); |
379 |
|
380 |
/* clean up and return */ |
381 |
free(window); |
382 |
free(gz.buf); |
383 |
return gz.fd; |
384 |
} |
385 |
|
386 |
/* append file "name" to gzip file gd using deflate stream strm -- if last |
387 |
is true, then finish off the deflate stream at the end */ |
388 |
local void gztack(char *name, int gd, z_stream *strm, int last) |
389 |
{ |
390 |
int fd, len, ret; |
391 |
unsigned left; |
392 |
unsigned char *in, *out; |
393 |
|
394 |
/* open file to compress and append */ |
395 |
fd = 0; |
396 |
if (name != NULL) { |
397 |
fd = open(name, O_RDONLY, 0); |
398 |
if (fd == -1) |
399 |
fprintf(stderr, "gzappend warning: %s not found, skipping ...\n", |
400 |
name); |
401 |
} |
402 |
|
403 |
/* allocate buffers */ |
404 |
in = malloc(CHUNK); |
405 |
out = malloc(CHUNK); |
406 |
if (in == NULL || out == NULL) bye("out of memory", ""); |
407 |
|
408 |
/* compress input file and append to gzip file */ |
409 |
do { |
410 |
/* get more input */ |
411 |
len = read(fd, in, CHUNK); |
412 |
if (len == -1) { |
413 |
fprintf(stderr, |
414 |
"gzappend warning: error reading %s, skipping rest ...\n", |
415 |
name); |
416 |
len = 0; |
417 |
} |
418 |
strm->avail_in = (unsigned)len; |
419 |
strm->next_in = in; |
420 |
if (len) strm->adler = crc32(strm->adler, in, (unsigned)len); |
421 |
|
422 |
/* compress and write all available output */ |
423 |
do { |
424 |
strm->avail_out = CHUNK; |
425 |
strm->next_out = out; |
426 |
ret = deflate(strm, last && len == 0 ? Z_FINISH : Z_NO_FLUSH); |
427 |
left = CHUNK - strm->avail_out; |
428 |
while (left) { |
429 |
len = write(gd, out + CHUNK - strm->avail_out - left, left); |
430 |
if (len == -1) bye("writing gzip file", ""); |
431 |
left -= (unsigned)len; |
432 |
} |
433 |
} while (strm->avail_out == 0 && ret != Z_STREAM_END); |
434 |
} while (len != 0); |
435 |
|
436 |
/* write trailer after last entry */ |
437 |
if (last) { |
438 |
deflateEnd(strm); |
439 |
out[0] = (unsigned char)(strm->adler); |
440 |
out[1] = (unsigned char)(strm->adler >> 8); |
441 |
out[2] = (unsigned char)(strm->adler >> 16); |
442 |
out[3] = (unsigned char)(strm->adler >> 24); |
443 |
out[4] = (unsigned char)(strm->total_in); |
444 |
out[5] = (unsigned char)(strm->total_in >> 8); |
445 |
out[6] = (unsigned char)(strm->total_in >> 16); |
446 |
out[7] = (unsigned char)(strm->total_in >> 24); |
447 |
len = 8; |
448 |
do { |
449 |
ret = write(gd, out + 8 - len, len); |
450 |
if (ret == -1) bye("writing gzip file", ""); |
451 |
len -= ret; |
452 |
} while (len); |
453 |
close(gd); |
454 |
} |
455 |
|
456 |
/* clean up and return */ |
457 |
free(out); |
458 |
free(in); |
459 |
if (fd > 0) close(fd); |
460 |
} |
461 |
|
462 |
/* process the compression level option if present, scan the gzip file, and |
463 |
append the specified files, or append the data from stdin if no other file |
464 |
names are provided on the command line -- the gzip file must be writable |
465 |
and seekable */ |
466 |
int main(int argc, char **argv) |
467 |
{ |
468 |
int gd, level; |
469 |
z_stream strm; |
470 |
|
471 |
/* ignore command name */ |
472 |
argc--; argv++; |
473 |
|
474 |
/* provide usage if no arguments */ |
475 |
if (*argv == NULL) { |
476 |
printf( |
477 |
"gzappend 1.2 (11 Oct 2012) Copyright (C) 2003, 2012 Mark Adler\n" |
478 |
); |
479 |
printf( |
480 |
"usage: gzappend [-level] file.gz [ addthis [ andthis ... ]]\n"); |
481 |
return 0; |
482 |
} |
483 |
|
484 |
/* set compression level */ |
485 |
level = Z_DEFAULT_COMPRESSION; |
486 |
if (argv[0][0] == '-') { |
487 |
if (argv[0][1] < '0' || argv[0][1] > '9' || argv[0][2] != 0) |
488 |
bye("invalid compression level", ""); |
489 |
level = argv[0][1] - '0'; |
490 |
if (*++argv == NULL) bye("no gzip file name after options", ""); |
491 |
} |
492 |
|
493 |
/* prepare to append to gzip file */ |
494 |
gd = gzscan(*argv++, &strm, level); |
495 |
|
496 |
/* append files on command line, or from stdin if none */ |
497 |
if (*argv == NULL) |
498 |
gztack(NULL, gd, &strm, 1); |
499 |
else |
500 |
do { |
501 |
gztack(*argv, gd, &strm, argv[1] == NULL); |
502 |
} while (*++argv != NULL); |
503 |
return 0; |
504 |
} |