//MSufSort suffix sorting algorithm. //Version 3.1.1 BETA //Authors: Michael A. Maniscalco and Simon J. Puglisi //Date: May 10, 2007 //web: http://www.michael-maniscalco.com //email: michael@michael-maniscalco.com //This file is slight modification of original program //by Andrew Polar: //1. Only BWT is left. //2. Tandem repeat sort removed. //3. Insertion sort removed. //4. 7 way quick sort removed. //This modification is only a reserch about contribution of removed //options to time of code execution. I don't recommend to use this //version for production without exhaustive testing. #include #include #include #include #include "time.h" #include "stdio.h" #include "conio.h" using namespace std; #define DONT_CHECK_DURING_TWO_STAGE 0x80000000 #define DONT_CHECK_DURING_RIGHT_LEFT_TWO_STAGE 0x40000000 __forceinline unsigned short GetU16(void * address){return *(unsigned short *)address;} class MSufSort { public: MSufSort(); virtual ~MSufSort(); bool BWT(unsigned char * source, unsigned int nBytes, unsigned int & bwtIndex); void UnBWT(unsigned char * source, unsigned int nBytes, unsigned int index); private: class PartitionSize { public: unsigned int m_size; unsigned short m_suffix; bool operator < (const PartitionSize & p) const{return m_size < p.m_size;} }; // stack containing sub partition sizes of the suffix array during // suffix sort. typedef struct PartitionInfo { PartitionInfo(unsigned int size, unsigned int matchLength, unsigned int firstSortedRank = 0xffffffff, unsigned int partitionOffset = 0xffffffff): m_size(size), m_matchLength(matchLength), m_firstSortedRank(firstSortedRank), m_partitionOffset(partitionOffset) { } unsigned int m_size; unsigned int m_matchLength; unsigned int m_firstSortedRank; unsigned int m_partitionOffset; } PartitionInfo; stack m_partitionSize; void Initialize(unsigned char * source, unsigned int nBytes); void MultiKeyQuickSort(unsigned int ** array, unsigned int count); unsigned int SelectPivot(unsigned int * indexA, unsigned int * indexB, unsigned int * indexC); void Swap(unsigned int ** ptrA, unsigned int ** ptrB); unsigned int GetValue(unsigned int * suffixIndex); void RestoreOriginalString(); void CompleteImprovedTwoStageAsBWT(); unsigned int ** m_SA; //Suffix Array unsigned int * m_ISA; //Inverse Suffix Array (ISA[i] = k, if SA[k] = i). unsigned char * m_source; unsigned int m_firstRank[0x10000]; //65536 unsigned int m_lastRank[0x10000]; unsigned int m_suffixCount[0x10000]; unsigned int m_bStarSuffixCount[0x10000]; unsigned int m_nextSortedRank; unsigned int m_bStarCount; unsigned int m_sourceLength; unsigned int m_bwtIndex; unsigned int m_tandemRepeatDepth; unsigned int m_currentMatchLength; unsigned char m_tandemRepeatSymbol; }; __forceinline unsigned int MSufSort::GetValue(unsigned int * suffixIndex) { if (*suffixIndex < 0x80000000) //binary 1000 0000 0000 0000 0000 0000 0000 0000 return (suffixIndex[-1] & 0x3fffffff); //binary 11 1111 1111 1111 1111 1111 1111 1111 //if value is 31 bit long it returns least 30 bits of the previous value, otherwise returns value as is return *suffixIndex; } __forceinline void MSufSort::Swap(unsigned int ** ptrA, unsigned int ** ptrB) { unsigned int * temp = *ptrA; *ptrA = *ptrB; *ptrB = temp; } __forceinline unsigned int MSufSort::SelectPivot(unsigned int * index1, unsigned int * index2, unsigned int * index3) { // middle of three method. unsigned int value1 = GetValue(index1); unsigned int value2 = GetValue(index2); unsigned int value3 = GetValue(index3); //returns value in the middle between min and max if (value1 < value2) return ((value2 < value3) ? value2 : (value1 < value3) ? value3 : value1); return ((value1 < value3) ? value1 : (value2 < value3) ? value3 : value2); } MSufSort::MSufSort() { m_sourceLength = 0; m_tandemRepeatDepth = 0; } MSufSort::~MSufSort() {} void MSufSort::Initialize(unsigned char * source, unsigned int nBytes) { // initializes pointers to work space and initializes counts memset(m_suffixCount, 0, sizeof(unsigned int) * 0x10000); memset(m_bStarSuffixCount, 0, sizeof(unsigned int) * 0x10000); m_source = source; m_ISA = (unsigned int *)source; m_sourceLength = nBytes; // count the suffixes and B* suffixes. unsigned char * endPtr = (unsigned char *)source; unsigned char * ptr = endPtr + nBytes - 1; unsigned char * startPtr = ptr; int last = -1; m_suffixCount[ptr[0]]++; while (ptr >= endPtr) { while ((ptr >= endPtr) && (ptr[0] >= last)) { if (ptr != startPtr) m_suffixCount[GetU16(ptr)]++; last = *ptr--; } if (ptr >= endPtr) { m_suffixCount[GetU16(ptr)]++; m_bStarSuffixCount[GetU16(ptr)]++; last = *ptr--; while ((ptr >= endPtr) && (ptr[0] <= last)) { m_suffixCount[GetU16(ptr)]++; last = *ptr--; } } } // determine the partially sorted ranks of the suffixes for the ISA based // on the counts of each suffix using the first two symbols of each suffix. // Also, calculate the partition boundaries for the SA based on the first // two symbols of each B* suffix. unsigned int * firstSaPartitionIndex = new unsigned int[0x10000]; unsigned int * partitionOffset = new unsigned int[0x10000]; unsigned int rank = 0x80000300; unsigned int saPartitionIndex = 0; PartitionSize * pSize = new PartitionSize[0x10000]; for (unsigned int i = 0; i < 0x10000; i++) { unsigned short suffix = (i >> 8) | ((i & 0xff) << 8); if (m_suffixCount[suffix]) { pSize[suffix].m_size = m_suffixCount[suffix]; pSize[suffix].m_suffix = suffix; partitionOffset[suffix] = saPartitionIndex; if (m_bStarSuffixCount[suffix]) { firstSaPartitionIndex[suffix] = saPartitionIndex; saPartitionIndex += m_bStarSuffixCount[suffix]; } m_firstRank[suffix] = rank; rank += m_suffixCount[suffix]; rank += (0x200 - (rank & 0xff)); m_lastRank[suffix] = rank; rank += 0x200; } } // push the SA partition lengths for B* suffixes sorted for first two symbols // onto stack of partially sorted SA partitions. sort(pSize, pSize + 0x10000); m_bStarCount = 0; for (int i = 0; i < 0x10000; i++) { unsigned short suffix = pSize[i].m_suffix; if (m_bStarSuffixCount[suffix]) { m_bStarCount += m_bStarSuffixCount[suffix]; m_partitionSize.push(PartitionInfo(m_bStarSuffixCount[suffix], 2, m_firstRank[suffix], partitionOffset[suffix])); } } // Initialize the SA to reference each B* suffix m_SA = (unsigned int **)(source + (((m_sourceLength + 3) * 6) - (m_bStarCount * 4))); endPtr = (unsigned char *)source; ptr = endPtr + nBytes - 1; startPtr = ptr; last = -1; while (ptr >= endPtr) { while ((ptr >= endPtr) && (ptr[0] >= last)) last = *ptr--; if (ptr >= endPtr) { // B* suffix unsigned short suffix = GetU16(ptr); m_SA[firstSaPartitionIndex[suffix]++] = m_ISA + ((unsigned int)(ptr - endPtr) + 2); last = *ptr--; while ((ptr >= endPtr) && (ptr[0] <= last)) last = *ptr--; } } // initialize the partial ISA with the partially sorted ranks // and initialize the SA for B* suffixes with partitions sorted // by first two symbols. m_ISA[nBytes + 1] = 0x80000100 | source[1]; m_ISA[nBytes] = 0x80000200 | source[0]; m_ISA[nBytes - 1] = m_lastRank[source[nBytes - 1]]; unsigned int c2 = 0x00; unsigned int c1 = source[nBytes - 1]; for (int i = (int)nBytes - 2; i >= 0; i--) { unsigned int t = source[i]; m_ISA[i] = m_lastRank[GetU16(source + i)] | c2; c2 = c1; c1 = t; } // delete [] firstSaPartitionIndex; delete [] partitionOffset; delete [] pSize; } void MSufSort::MultiKeyQuickSort(unsigned int ** data, unsigned int count) { unsigned int matchLength = 2; unsigned int ** startOfArray = data; unsigned int ** endOfArray = data + count; stack localPartitionStack; while (true) { if (count < 2) { data[0] -= matchLength; data += count; if (localPartitionStack.empty()) return; PartitionInfo partitionStruct = localPartitionStack.top(); localPartitionStack.pop(); count = partitionStruct.m_size; m_currentMatchLength = matchLength = partitionStruct.m_matchLength; } else { // do three way partitioning. unsigned int ** ptr = data; unsigned int ** left = data; unsigned int ** right = data + count - 1; // select the pivot value. unsigned int pivotA = SelectPivot(data[0], data[count >> 1], *right); unsigned int temp; while (ptr <= right) if ((temp = GetValue(*ptr)) == pivotA) *(ptr++) += 2; else if (temp < pivotA) Swap(left++, ptr++); else { while ((right > ptr) && (GetValue(*right) > pivotA)) right--; Swap(ptr, right--); } // Calculate new partition sizes ... unsigned int leftSize = (unsigned int)(left - data); unsigned int middleSize = (unsigned int)(ptr - left); unsigned int rightSize = count - (leftSize + middleSize); if (rightSize) localPartitionStack.push(PartitionInfo(rightSize, matchLength)); if (middleSize) localPartitionStack.push(PartitionInfo(middleSize, matchLength + 2)); if (!(count = leftSize)) { PartitionInfo partitionStruct = localPartitionStack.top(); localPartitionStack.pop(); count = partitionStruct.m_size; m_currentMatchLength = matchLength = partitionStruct.m_matchLength; } } } } void MSufSort::RestoreOriginalString() { // reverts ISA to original string in same space int start = clock(); unsigned char * ptr = (unsigned char *)m_ISA; unsigned char t[2]; t[1] = m_ISA[m_sourceLength + 1] & 0xff; t[0] = m_ISA[m_sourceLength + 0] & 0xff; ptr += 2; for (unsigned int i = 0; i < m_sourceLength; ) if (m_ISA[i + 1] < 0x80000000) { ptr[i] = (m_ISA[i + 1] >> 8) & 0xff; ptr[i + 1] = (m_ISA[i + 1] & 0xff); i += 2; } else { ptr[i] = (m_ISA[i] & 0xff); i++; } ptr -= 2; ptr[0] = t[0]; ptr[1] = t[1]; int finish = clock(); } void MSufSort::CompleteImprovedTwoStageAsBWT() { // completes the second stage of the improved two stage sort but computes the BWT // directly rather than the suffix array. int start = clock(); // begin by expanding the partial SA to the complete SA space. unsigned int * SA = (unsigned int *)((unsigned char *)m_ISA + ((m_sourceLength + 3) & ~3)); unsigned int * ptr = SA; *(ptr++) = 0xc0000000; unsigned int total = 1; for (unsigned int i = 0; i < 0x10000; i++) { unsigned short s = (unsigned short)((i << 8) | (i >> 8)); if (m_suffixCount[s]) { m_firstRank[s] = total + m_bStarSuffixCount[s]; total += m_suffixCount[s]; m_lastRank[s] = total; for (unsigned int j = 0; j < m_bStarSuffixCount[s]; j++) *(ptr++) = (unsigned int)(*(m_SA++) - m_ISA); for (unsigned int j = m_bStarSuffixCount[s]; j < m_suffixCount[s]; j++) *(ptr++) = 0xc0000000; } } // now do right to left unsigned int lastRank = 0; int lastS = -1; unsigned int * lastSa = 0; unsigned int * saPtr = SA + m_sourceLength + 1; unsigned int * endSaPtr = SA; while (--saPtr > endSaPtr) { unsigned int n = *saPtr; unsigned int k; if (n < DONT_CHECK_DURING_RIGHT_LEFT_TWO_STAGE) { if (n) { if (m_source[n - 1] <= m_source[n]) { unsigned short s = GetU16(m_source + n - 1); if (s == lastS) { *(--lastSa) = n - 1; k = --lastRank; } else { if (lastS != -1) m_lastRank[lastS] = lastRank; lastS = s; lastRank = k = --m_lastRank[s]; lastSa = SA + k; *lastSa = n - 1; } *saPtr = m_source[n - 1] | DONT_CHECK_DURING_TWO_STAGE; if ((n > 1) && (m_source[n - 2] > m_source[n - 1])) SA[k] |= DONT_CHECK_DURING_RIGHT_LEFT_TWO_STAGE; // don't check this index on the remainder of the right to left. } } } } // now do left to right SA[0] = DONT_CHECK_DURING_RIGHT_LEFT_TWO_STAGE | m_source[m_sourceLength - 1];//m_sourceLength; SA[m_firstRank[m_source[m_sourceLength - 1]]++] = m_sourceLength - 1; lastRank = 0; lastS = -1; saPtr = SA; lastSa = 0; endSaPtr = SA + m_sourceLength + 1; unsigned int * startSaPtr = SA; while (++saPtr < endSaPtr) { unsigned int k; unsigned int n = *saPtr; if (n < DONT_CHECK_DURING_TWO_STAGE) { n &= 0x3fffffff; if ((n) && (m_source[n - 1] >= m_source[n])) { unsigned short s = GetU16(m_source + n - 1); if (s == lastS) { k = lastRank++; *(++lastSa) = n - 1; *saPtr = (s & 0xff) | DONT_CHECK_DURING_TWO_STAGE; } else { if (lastS != -1) m_firstRank[lastS] = lastRank; lastS = s; k = m_firstRank[s]++; lastSa = SA + k; *lastSa = n - 1; lastRank = k + 1; *saPtr = (s & 0xff) | DONT_CHECK_DURING_TWO_STAGE; } } else { if (n == 0) m_bwtIndex = (unsigned int)(saPtr - startSaPtr); else *saPtr = m_source[n - 1] | DONT_CHECK_DURING_TWO_STAGE; } } } unsigned char * bwtPtr = (unsigned char *)m_source; for (unsigned int i = 0; i <= m_sourceLength; i++) { if (i != m_bwtIndex) *(bwtPtr++) = SA[i] & 0xff; } int finish = clock(); } bool MSufSort::BWT(unsigned char * source, unsigned int nBytes, unsigned int & bwtIndex) { // does BWT rather than suffix sort on source of length nBytes int start = clock(); Initialize(source, nBytes); int finish1 = clock(); unsigned int ** ptr = m_SA; while (!m_partitionSize.empty()) { PartitionInfo partitionInfo = m_partitionSize.top(); m_partitionSize.pop(); m_nextSortedRank = partitionInfo.m_firstSortedRank; ptr = m_SA + partitionInfo.m_partitionOffset; MultiKeyQuickSort(ptr, partitionInfo.m_size); ptr += partitionInfo.m_size; } int finish3 = clock(); // now restore original string from ISA RestoreOriginalString(); // now complete the BWT using the improved two stage sort CompleteImprovedTwoStageAsBWT(); int finish = clock(); bwtIndex = m_bwtIndex; return true; } void MSufSort::UnBWT(unsigned char * source, unsigned int nBytes, unsigned int bwtIndex) { // does the reverse BWT on the source provided. // reverses the blocksort transform int start = clock(); unsigned int * index = new unsigned int[nBytes + 1]; unsigned int symbolRange[0x102]; for (unsigned int i = 0; i < 0x102; i++) symbolRange[i] = 0; unsigned char * ptr; ptr = source; symbolRange[0] = 1; // our -1 symbol for (unsigned int i = 0; i < nBytes; i++, ptr++) symbolRange[(*ptr) + 1]++; unsigned int n = 0; for (int i = 0; i < 0x101; i++) { unsigned int temp = symbolRange[i]; symbolRange[i] = n; n += temp; } n = 0; index[0] = bwtIndex; for (unsigned int i = 0; i < nBytes; i++, n++) { if (i == bwtIndex) n++; unsigned char symbol = source[i]; index[symbolRange[symbol + 1]++] = n; } n = bwtIndex; unsigned char * reversedBuffer = new unsigned char[nBytes]; for (unsigned int i = 0; i < nBytes; i++) { n = index[n]; if (n >= bwtIndex) reversedBuffer[i] = source[n - 1]; else reversedBuffer[i] = source[n]; } memcpy(source, reversedBuffer, nBytes * sizeof(unsigned char)); int finish = clock(); delete [] index; delete [] reversedBuffer; } //////////////////////////////////////////////////////////////////////////////////////////////// // Next is demo for using of the classes shown above. //////////////////////////////////////////////////////////////////////////////////////////////// int getFileSize(char* fileName) { FILE* f; if ((f = fopen(fileName, "rb")) == NULL) return -1; fseek(f, 0, SEEK_END); int bytes = ftell(f); fclose(f); return bytes; } int main() { char fileName[] = "C:\\Users\\apolar\\projects\\files\\rafale.bmp"; int nSize = getFileSize(fileName); if (nSize < 0) { printf("File not foung\n"); return 0; } unsigned char* data = (unsigned char*)malloc((nSize + 4) * 6); unsigned char* copy = (unsigned char*)malloc(nSize); FILE* in = fopen(fileName, "rb"); fread(data, 1, nSize, in); fclose(in); memcpy(copy, data, nSize); //Encoding MSufSort * coder = new MSufSort(); unsigned int index; int start = clock(); coder->BWT(data, nSize, index); int finish = clock(); cout << "Elapsed time: " << (double(finish - start) / CLOCKS_PER_SEC) << " seconds\n"; delete coder; //end encoding //Decoding MSufSort * decoder = new MSufSort(); start = clock(); decoder->UnBWT(data, nSize, index); finish = clock(); cout << "Elapsed time: " << (double(finish - start) / CLOCKS_PER_SEC) << " seconds\n"; delete decoder; //end decoidng bool isOK = true; for (int i=0; i