/// (C) 2010, Andrew Polar under GPL ver. 3. // LICENSE // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License as // published by the Free Software Foundation; either version 3 of // the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, but // WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // General Public License for more details at // Visit . // // This is skeleton of iROLZ algorithm with two dictionaries #include #include #include #include #include #include #define BIT_LENGTH_FOR_MATCH 9 //must be 9 bits or larger #define BIT_LENGTH_FOR_INDEX 5 //can be 1 or more #define PREFIX_SIZE_MAIN 3 //can be 1, 2 or 3 #define PREFIX_SIZE_AUXILIARY 1 //can be 1, 2 or 3 #define HISTORY_BUFFER_BITLEN_MAIN 16 //trade between time and compression ratio #define HISTORY_BUFFER_BITLEN_AUX 19 //trade between time and compression ratio #define SUFFICIENT_MATCH 63 //does not make big difference #define MINIMUM_MATCH 2 //must be 1 or larger, the larger the limit the more literals int END_LABEL = (1< double calculate_entropy(T* data, int data_size) { int min, max, counter, buffer_size; int* buffer; double entropy; double log2 = log(2.0); double prob; min = data[0]; max = data[0]; for (counter=0; counter max) max = data[counter]; } buffer_size = max - min + 1; buffer = (int*)malloc(buffer_size * sizeof(int)); memset(buffer, 0x00, buffer_size * sizeof(int)); for (counter=0; counter 0) { prob = (double)buffer[counter]; prob /= (double)data_size; entropy += log(prob)/log2*prob; } } entropy *= -1.0; if (buffer) free(buffer); return entropy; } int getFileSize(char* fileName) { FILE* f; if ((f = fopen(fileName, "rb")) == NULL) return -1; fseek(f, 0, SEEK_END); int bytes = ftell(f); fclose(f); return bytes; } //////end auxiliary functions list////////////////// //this is reversible reducer of entropy for executables of Ilya Muraviev //source: http://balz.sourceforge.net/ void exetransform(int mode, unsigned char* data, int size) { const int end = size - 8; int i = 0; while ((reinterpret_cast(data[i]) != 0x4550) && (++i < end)); // perform call/jmp address translation while (i < end) { if ((data[i++] & 254) == 0xe8) { int &addr = reinterpret_cast(data[i]); if (mode) { if ((addr >= -i) && (addr<(size - i))) { addr += i; } else { if ((addr > 0) && (addr < size)) addr -= size; } } else { if (addr < 0) { if ((addr + i) >= 0) addr += size; } else { if (addr < size) addr -= i; } } i += 4; } } } class Dictionary { public: Dictionary(int PrefixSize, int OffsetLen); ~Dictionary(); void updateDictionary(unsigned char byte); int getNextPosition(int position); private: int* m_last_position_lookup; int* m_self_addressed_dictionary; int m_prefix_mask, m_buffer_mask, m_context, m_index; }; Dictionary::Dictionary(int PrefixSize, int OffsetLen) { if (PrefixSize == 1) m_prefix_mask = 0xff; else if (PrefixSize == 2) m_prefix_mask = 0xffff; else m_prefix_mask = 0xffffff; m_last_position_lookup = (int*)malloc((m_prefix_mask + 1) * sizeof(int)); memset(m_last_position_lookup, 0x00, (m_prefix_mask + 1) * sizeof(int)); m_buffer_mask = (1<getNextPosition(position); if (new_position >= position) break; count = 0; while (true) { if (index + 1 + count >= data_size) break; if (data[index + 1 + count] == data[new_position + 1 + count]) { ++count; } else break; if (count >= LONGEST_MATCH) break; } if (count > max_count) { max_count = count; max_step = step; } if (max_count > SUFFICIENT_MATCH) break; if (step++ >= MAX_INDEX) break; position = new_position; } nsteps = max_step; return max_count; } bool irolz_compress(const unsigned char* data, const int data_size) { Dictionary* main = new Dictionary(PREFIX_SIZE_MAIN, HISTORY_BUFFER_BITLEN_MAIN); Dictionary* auxiliary = new Dictionary(PREFIX_SIZE_AUXILIARY, HISTORY_BUFFER_BITLEN_AUX); int max_count_main, max_step_main, max_count_aux, max_step_aux, max_count, max_step; int index = 0; do { main->updateDictionary(data[index]); auxiliary->updateDictionary(data[index]); //save literal g_buffer[current_byte++] = data[index]; while (true) { max_count_main = getLongestMatch(main, data, index, data_size, max_step_main); max_count_aux = getLongestMatch(auxiliary, data, index, data_size, max_step_aux); // bool isMain = true; if (max_count_main >= max_count_aux) { max_count = max_count_main; max_step = max_step_main; } else { max_count = max_count_aux; max_step = max_step_aux; isMain = false; } // if (max_count >= MINIMUM_MATCH ) { //save g_buffer[current_byte++] = 256 + max_count; g_buffer[current_byte++] = max_step; if (!isMain) { g_buffer[current_byte-1] |= (1<updateDictionary(data[index]); auxiliary->updateDictionary(data[index]); } } else { break; } } ++index; } while (index < data_size); g_buffer[current_byte++] = END_LABEL; delete auxiliary; delete main; return true; } void irolz_decompress(unsigned char* data, int& size, int data_size) { //this is for statistics only short* literals_and_leng = (short*)malloc(data_size * sizeof(short)); short* index_main = (short*)malloc(data_size * sizeof(short)); short* index_aux = (short*)malloc(data_size * sizeof(short)); short* flag = (short*)malloc(data_size * sizeof(short)); int flag_cnt = 0; int index_main_cnt = 0; int index_aux_cnt = 0; int literals_and_leng_cnt = 0; /////////////////////////////////////////////////////////////// number_of_literals = number_of_couples = 0; Dictionary* main = new Dictionary(PREFIX_SIZE_MAIN, HISTORY_BUFFER_BITLEN_MAIN); Dictionary* auxiliary = new Dictionary(PREFIX_SIZE_AUXILIARY, HISTORY_BUFFER_BITLEN_AUX); int counter = 0; int i = 0; int STEP_MASK = (1<updateDictionary(data[counter]); auxiliary->updateDictionary(data[counter]); ++counter; ++number_of_literals; } else { literals_and_leng[literals_and_leng_cnt++] = g_buffer[i]; //statistics int length = g_buffer[i] - 256; ++i; bool isAuxiliary = false; if (g_buffer[i] > STEP_MASK) { isAuxiliary = true; } int step = g_buffer[i] & STEP_MASK; if (!isAuxiliary) { index_main[index_main_cnt++] = step; flag[flag_cnt++] = 1; } else { index_aux[index_aux_cnt++] = step; flag[flag_cnt++] = 0; } int offset = 0; int position = counter - 1; for (int k=0; k<=step; ++k) { if (isAuxiliary) { position = auxiliary->getNextPosition(position); } else { position = main->getNextPosition(position); } if (position < 0) { printf("Error: misformatted data\n"); return; } } offset = counter - 1 - position; for (int k=0; kupdateDictionary(data[counter]); auxiliary->updateDictionary(data[counter]); ++counter; } ++number_of_couples; } ++i; } size = counter; delete auxiliary; delete main; //rough approximate estimation of compression ratio, this estimation is for lowest compression, //it must be better when context modelling is applied. double entropy_literals_and_leng = calculate_entropy(literals_and_leng, literals_and_leng_cnt); double entropy_index_main = calculate_entropy(index_main, index_main_cnt); double entropy_index_aux = calculate_entropy(index_aux, index_aux_cnt); double entropy_flag = calculate_entropy(flag, flag_cnt); double bits = entropy_literals_and_leng * double(literals_and_leng_cnt); bits += entropy_index_main * double(index_main_cnt); bits += entropy_index_aux * double(index_aux_cnt); bits += entropy_flag * double(flag_cnt); estimated_compression_ratio = bits/8.0/double(data_size); if (flag) free(flag); if (index_aux) free(index_aux); if (index_main) free(index_main); if (literals_and_leng) free(literals_and_leng); /////////////////////////////////////////////// } int main() { char fileName[] = "C:\\Users\\apolar\\projects\\files\\acrord32.exe"; //char fileName[] = "C:\\\Users\\apolar\\projects\\files\\calgary\\bib"; int data_size = getFileSize(fileName); if (data_size < 0) { printf("File not found\n"); return -1; } unsigned char* data = (unsigned char*)malloc(data_size * sizeof(*data)); FILE* f = fopen(fileName, "rb"); fread(data, 1, data_size, f); fclose(f); g_buffer = (unsigned short*)malloc((data_size + data_size/2)*sizeof(unsigned short)); //preprocessing to provide better compression of exe and dll //if applied should be reversed by exetransform(0, data, data_size) //to restore original. exetransform(1, data, data_size); printf("Andrew Polar under GPL ver. 3. LICENSE\n"); clock_t start = clock(); current_byte = 0; irolz_compress(data, data_size); clock_t end = clock(); printf("Encoding time %2.3f s.\n", (double)(end - start)/CLOCKS_PER_SEC); int compressed_array_size = current_byte; start = clock(); unsigned char* test = (unsigned char*)malloc((data_size + data_size/2)); int size = 0; irolz_decompress(test, size, data_size); end = clock(); printf("Decoding time %2.3f s.\n", (double)(end - start)/CLOCKS_PER_SEC); bool isOK = true; if (size != data_size) { printf("Size mismatch\n"); isOK = false; } else { for (int i=0; i