/// (C) 2010, Andrew Polar under GPL ver. 3. // LICENSE // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License as // published by the Free Software Foundation; either version 3 of // the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details at // Visit . // // This is skeleton of iROLZ algorithm #include #include #include #include #include #include #define PREFIX_SIZE 2 //must be 1, 2 or 3 #define HISTORY_BUFFER_BITLEN 16 //trade between time and compression ratio #define LONGEST_COUNT 255 //must be one byte #define SUFFICIENT_MATCH 63 //does not make big difference #define MAX_STEPS 127 //must be some limit #define MINIMUM_MATCH 2 //must be 1 or larger, the larger the limit the more literals unsigned short* g_buffer = 0; int current_byte; int number_of_literals, number_of_couples; double estimated_compression_ratio; ///////auxiliary functions/////////////////////////////// template double calculate_entropy(T* data, int data_size) { int min, max, counter, buffer_size; int* buffer; double entropy; double log2 = log(2.0); double prob; min = data[0]; max = data[0]; for (counter=0; counter max) max = data[counter]; } buffer_size = max - min + 1; buffer = (int*)malloc(buffer_size * sizeof(int)); memset(buffer, 0x00, buffer_size * sizeof(int)); for (counter=0; counter 0) { prob = (double)buffer[counter]; prob /= (double)data_size; entropy += log(prob)/log2*prob; } } entropy *= -1.0; if (buffer) free(buffer); return entropy; } int getFileSize(char* fileName) { FILE* f; if ((f = fopen(fileName, "rb")) == NULL) return -1; fseek(f, 0, SEEK_END); int bytes = ftell(f); fclose(f); return bytes; } //////end auxiliary functions list////////////////// //this is reversible reducer of entropy for executables of Ilya Muraviev //source: http://balz.sourceforge.net/ void exetransform(int mode, unsigned char* data, int size) { const int end = size - 8; int i = 0; while ((reinterpret_cast(data[i]) != 0x4550) && (++i < end)); // perform call/jmp address translation while (i < end) { if ((data[i++] & 254) == 0xe8) { int &addr = reinterpret_cast(data[i]); if (mode) { if ((addr >= -i) && (addr<(size - i))) { addr += i; } else { if ((addr > 0) && (addr < size)) addr -= size; } } else { if (addr < 0) { if ((addr + i) >= 0) addr += size; } else { if (addr < size) addr -= i; } } i += 4; } } } class Dictionary { public: Dictionary(int PrefixSize, int OffsetLen); ~Dictionary(); void updateDictionary(unsigned char byte); int getNextPosition(int position); private: int* m_last_position_lookup; int* m_self_addressed_dictionary; int m_prefix_mask, m_buffer_mask, m_context, m_index; }; Dictionary::Dictionary(int PrefixSize, int OffsetLen) { if (PrefixSize == 1) m_prefix_mask = 0xff; else if (PrefixSize == 2) m_prefix_mask = 0xffff; else m_prefix_mask = 0xffffff; m_last_position_lookup = (int*)malloc((m_prefix_mask + 1) * sizeof(int)); memset(m_last_position_lookup, 0x00, (m_prefix_mask + 1) * sizeof(int)); m_buffer_mask = (1<= data_size) return 0; int max_count, max_step, step, position, count; position = index; max_count = max_step = step = 0; while (true) { int new_position = dictionary->getNextPosition(position); if (new_position >= position) break; if (data[index + MINIMUM_MATCH] == data[new_position + MINIMUM_MATCH]) { //quick test count = 0; while (true) { if (index + 1 + count >= data_size) break; if (data[index + 1 + count] == data[new_position + 1 + count]) { ++count; } else break; if (count >= LONGEST_COUNT) break; } if (count > max_count) { max_count = count; max_step = step; } if (max_count > SUFFICIENT_MATCH) break; } if (step++ >= MAX_STEPS) break; position = new_position; } nsteps = max_step; return max_count; } bool irolz_compress(const unsigned char* data, const int data_size) { Dictionary* dictionary = new Dictionary(PREFIX_SIZE, HISTORY_BUFFER_BITLEN); int max_count, max_step; int index = 0; do { dictionary->updateDictionary(data[index]); //save literal g_buffer[current_byte++] = data[index]; while (true) { max_count = getLongestMatch(dictionary, data, index, data_size, max_step); if (max_count >= MINIMUM_MATCH ) { //save g_buffer[current_byte++] = 256 + max_count; g_buffer[current_byte++] = max_step; for (int k=0; kupdateDictionary(data[index]); } } else { break; } } ++index; } while (index < data_size); delete dictionary; return true; } void irolz_decompress(unsigned char* data, int& size, int data_size) { //this is for statistics only int* literals_and_leng = (int*)malloc(data_size * sizeof(int)); int* index = (int*)malloc(data_size * sizeof(int)); int index_cnt = 0; int literals_and_leng_cnt = 0; /////////////////////////////////////////////////////////////// number_of_literals = number_of_couples = 0; Dictionary* dictionary = new Dictionary(PREFIX_SIZE, HISTORY_BUFFER_BITLEN); int counter = 0; for (int i=0; iupdateDictionary(data[counter]); ++counter; ++number_of_literals; } else { literals_and_leng[literals_and_leng_cnt++] = g_buffer[i]; //statistics int length = g_buffer[i] - 256; ++i; int step = g_buffer[i]; index[index_cnt++] = g_buffer[i]; int offset = 0; int position = counter - 1; for (int k=0; k<=step; ++k) { position = dictionary->getNextPosition(position); if (position < 0) { printf("Error: misformatted data\n"); return; } } offset = counter - 1 - position; for (int k=0; kupdateDictionary(data[counter]); ++counter; } ++number_of_couples; } } size = counter; delete dictionary; //rough approximate estimation of compression ratio, this estimation is for lowest compression, //it must be better when context modelling is applied. double entropy_literals_and_leng = calculate_entropy(literals_and_leng, literals_and_leng_cnt); double entropy_index = calculate_entropy(index, index_cnt); double bits = entropy_literals_and_leng * double(literals_and_leng_cnt); bits += entropy_index * double(index_cnt); estimated_compression_ratio = bits/8.0/double(data_size); if (index) free(index); if (literals_and_leng) free(literals_and_leng); /////////////////////////////////////////////// } int main() { char fileName[] = "C:\\Users\\apolar\\projects\\files\\ohs.doc"; //char fileName[] = "C:\\\Users\\apolar\\projects\\files\\calgary\\bib"; int data_size = getFileSize(fileName); if (data_size < 0) { printf("File not found\n"); return -1; } unsigned char* data = (unsigned char*)malloc(data_size * sizeof(*data)); FILE* f = fopen(fileName, "rb"); fread(data, 1, data_size, f); fclose(f); g_buffer = (unsigned short*)malloc((data_size + data_size/2)*sizeof(unsigned short)); //preprocessing to provide better compression of exe and dll //if applied should be reversed by exetransform(0, data, data_size) //to restore original. exetransform(1, data, data_size); printf("Andrew Polar under GPL ver. 3. LICENSE\n"); clock_t start = clock(); current_byte = 0; irolz_compress(data, data_size); clock_t end = clock(); printf("Encoding time %2.3f s.\n", (double)(end - start)/CLOCKS_PER_SEC); int compressed_array_size = current_byte; start = clock(); unsigned char* test = (unsigned char*)malloc((data_size + data_size/2)); int size = 0; irolz_decompress(test, size, data_size); end = clock(); printf("Decoding time %2.3f s.\n", (double)(end - start)/CLOCKS_PER_SEC); bool isOK = true; if (size != data_size) { printf("Size mismatch\n"); isOK = false; } else { for (int i=0; i