/// (C) 2010, Andrew Polar under GPL ver. 3. // LICENSE // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License as // published by the Free Software Foundation; either version 3 of // the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details at // Visit . // // This is DEMO of iROLZ algorithm with streaming dictionary. #include #include #include #include #include #include #define PREFIX_SIZE 1 //must be 1, 2 or 3 #define HISTORY_BUFFER_BITLEN 17 //trade between time and compression ratio #define LONGEST_COUNT 255 //must be one byte #define SUFFICIENT_MATCH 63 //does not make big difference #define MINIMUM_MATCH 2 //must be 1 or larger, the larger the limit the more literals #define BIT_LEN_FOR_STEPS 6 //these are offset indexes, typically 5 bits is enough int TERMINATION_FLAG = (1< double calculate_entropy(T* data, int data_size) { int min, max, counter, buffer_size; int* buffer; double entropy; double log2 = log(2.0); double prob; min = data[0]; max = data[0]; for (counter=0; counter max) max = data[counter]; } buffer_size = max - min + 1; buffer = (int*)malloc(buffer_size * sizeof(int)); memset(buffer, 0x00, buffer_size * sizeof(int)); for (counter=0; counter 0) { prob = (double)buffer[counter]; prob /= (double)data_size; entropy += log(prob)/log2*prob; } } entropy *= -1.0; if (buffer) free(buffer); return entropy; } int getFileSize(char* fileName) { FILE* f; if ((f = fopen(fileName, "rb")) == NULL) return -1; fseek(f, 0, SEEK_END); int bytes = ftell(f); fclose(f); return bytes; } //////end auxiliary functions list////////////////// //this is reversible reducer of entropy for executables of Ilya Muraviev //source: http://balz.sourceforge.net/ void exetransform(int mode, unsigned char* data, int size) { const int end = size - 8; int i = 0; while ((reinterpret_cast(data[i]) != 0x4550) && (++i < end)); // perform call/jmp address translation while (i < end) { if ((data[i++] & 254) == 0xe8) { int &addr = reinterpret_cast(data[i]); if (mode) { if ((addr >= -i) && (addr<(size - i))) { addr += i; } else { if ((addr > 0) && (addr < size)) addr -= size; } } else { if (addr < 0) { if ((addr + i) >= 0) addr += size; } else { if (addr < size) addr -= i; } } i += 4; } } } class DictionaryStream { public: DictionaryStream(int PrefixSize, int OffsetLen); ~DictionaryStream(); void updateDictionary(unsigned char byte); int getLongestMatch(const unsigned char* data, const int current_pos, const int data_size, int& offset_index); void setPosition(int step, int position); unsigned char getNextValue(int k); private: int* m_last_position_lookup; int* m_self_addressed_dictionary; unsigned char* m_search_buffer; int m_prefix_mask, m_buffer_mask, m_context, m_index, m_position; }; DictionaryStream::DictionaryStream(int PrefixSize, int OffsetLen) { if (PrefixSize == 1) m_prefix_mask = 0xff; else if (PrefixSize == 2) m_prefix_mask = 0xffff; else m_prefix_mask = 0xffffff; m_buffer_mask = (1< position) { if (jumpedOver) break; else jumpedOver = true; } count = 0; while (current_pos + 1 + count < data_size && count < LONGEST_COUNT && data[current_pos + 1 + count] == m_search_buffer[(new_position + 1 + count) & m_buffer_mask]) { ++count; } if (count > max_len) { max_len = count; offset_index = step; } if (max_len > SUFFICIENT_MATCH) break; if (step++ >= MAX_STEPS) break; position = new_position; } return max_len; } void DictionaryStream::setPosition(int step, int position) { m_position = position & m_buffer_mask; for (int k=0; k<=step; ++k) { m_position = m_self_addressed_dictionary[m_position]; } } unsigned char DictionaryStream::getNextValue(int k) { return m_search_buffer[(m_position + k + 1) & m_buffer_mask]; } void irolz_compress(const unsigned char* data, const int size) { DictionaryStream* dictionary = new DictionaryStream(PREFIX_SIZE, HISTORY_BUFFER_BITLEN); int index = 0; do { dictionary->updateDictionary(data[index]); g_buffer[current_byte++] = data[index]; //literal int len, offset; while (true) { len = dictionary->getLongestMatch(data, index, size, offset); if (len >= MINIMUM_MATCH ) { g_buffer[current_byte++] = 256 + offset; //couple offset g_buffer[current_byte++] = len; //couple len for (int k=0; kupdateDictionary(data[index]); } } else { break; } } ++index; } while (index < size); g_buffer[current_byte++] = TERMINATION_FLAG + 256; delete dictionary; } void irolz_decompress(unsigned char* test, int& test_size) { DictionaryStream* dict = new DictionaryStream(PREFIX_SIZE, HISTORY_BUFFER_BITLEN); int counter = 0; int i = 0; while (true) { if (g_buffer[i] < 256) { test[counter] = (unsigned char)g_buffer[i]; dict->updateDictionary(test[counter]); ++i; ++counter; } else { int step = g_buffer[i] - 256; if (step == TERMINATION_FLAG) break; ++i; int length = g_buffer[i]; int position = counter - 1; dict->setPosition(step, position); for (int k=0; kgetNextValue(k); ++counter; } for (int k=counter-length; kupdateDictionary(test[k]); } ++i; } } delete dict; test_size = counter; } int main() { char fileName[] = "C:\\Users\\apolar\\projects\\files\\acrord32.exe"; //char fileName[] = "C:\\\Users\\apolar\\projects\\files\\calgary\\bib"; int size = getFileSize(fileName); if (size < 0) { printf("File not found\n"); return -1; } unsigned char* data = (unsigned char*)malloc(size * sizeof(*data)); unsigned char* test = (unsigned char*)malloc(size); g_buffer = (unsigned short*)malloc((size + size/2)*sizeof(unsigned short)); FILE* f = fopen(fileName, "rb"); fread(data, 1, size, f); fclose(f); //preprocessing to provide better compression of exe and dll //if applied should be reversed by exetransform(0, data, data_size) //to restore original. exetransform(1, data, size); //encoding printf("Andrew Polar under GPL ver. 3. LICENSE\n"); clock_t start = clock(); irolz_compress(data, size); clock_t end = clock(); printf("Encoding time %2.3f s.\n", (double)(end - start)/CLOCKS_PER_SEC); int compressed_array_size = current_byte; //end encoding //decoding start = clock(); int test_size = 0; irolz_decompress(test, test_size); end = clock(); printf("Decoding time %2.3f s.\n", (double)(end - start)/CLOCKS_PER_SEC); //end decoding bool isOK = true; if (size != test_size) { printf("Size mismatch\n"); isOK = false; } else { for (int i=0; i