C++爬虫原理(三):解压Gzip网页数据
HTTP 1.1 支持 Gzip 压缩,可以极大地节约带宽。Gzip 解压不同于 zip 解压,不要使用 uncompress。一开始我搞错了,用了开源的 zip 库,闭门造车了……
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
#ifndef GZIP_H #define GZIP_H #include "zlib/zlib.h" /* Compress gzip data */ /* data 原数据 ndata 原数据长度 zdata 压缩后数据 nzdata 压缩后长度 */ int gzcompress(Bytef *data, uLong ndata, Bytef *zdata, uLong *nzdata) { z_stream c_stream; int err = 0; if(data && ndata > 0) { c_stream.zalloc = NULL; c_stream.zfree = NULL; c_stream.opaque = NULL; //只有设置为MAX_WBITS + 16才能在在压缩文本中带header和trailer if(deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, MAX_WBITS + 16, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; c_stream.next_in = data; c_stream.avail_in = ndata; c_stream.next_out = zdata; c_stream.avail_out = *nzdata; while(c_stream.avail_in != 0 && c_stream.total_out < *nzdata) { if(deflate(&c_stream, Z_NO_FLUSH) != Z_OK) return -1; } if(c_stream.avail_in != 0) return c_stream.avail_in; for(;;) { if((err = deflate(&c_stream, Z_FINISH)) == Z_STREAM_END) break; if(err != Z_OK) return -1; } if(deflateEnd(&c_stream) != Z_OK) return -1; *nzdata = c_stream.total_out; return 0; } return -1; } /* Uncompress gzip data */ /* zdata 数据 nzdata 原数据长度 data 解压后数据 ndata 解压后长度 */ int gzdecompress(Byte *zdata, uLong nzdata, Byte *data, uLong *ndata) { int err = 0; z_stream d_stream = {0}; /* decompression stream */ static char dummy_head[2] = { 0x8 + 0x7 * 0x10, (((0x8 + 0x7 * 0x10) * 0x100 + 30) / 31 * 31) & 0xFF, }; d_stream.zalloc = NULL; d_stream.zfree = NULL; d_stream.opaque = NULL; d_stream.next_in = zdata; d_stream.avail_in = 0; d_stream.next_out = data; //只有设置为MAX_WBITS + 16才能在解压带header和trailer的文本 if(inflateInit2(&d_stream, MAX_WBITS + 16) != Z_OK) return -1; //if(inflateInit2(&d_stream, 47) != Z_OK) return -1; while(d_stream.total_out < *ndata && d_stream.total_in < nzdata) { d_stream.avail_in = d_stream.avail_out = 1; /* force small buffers */ if((err = inflate(&d_stream, Z_NO_FLUSH)) == Z_STREAM_END) break; if(err != Z_OK) { if(err == Z_DATA_ERROR) { d_stream.next_in = (Bytef*) dummy_head; d_stream.avail_in = sizeof(dummy_head); if((err = inflate(&d_stream, Z_NO_FLUSH)) != Z_OK) { return -1; 
} } else return -1; } } if(inflateEnd(&d_stream) != Z_OK) return -1; *ndata = d_stream.total_out; return 0; } #endif // GZIP_H |
原代码地址:http://www.oschina.net/code/piece_full?code=22542