Huffman压缩是一种经典的压缩算法,其核心是根据字符出现的频率建立Huffman树,然后根据Huffman树的构建结果为每个字符编码。Huffman编码是一种前缀编码,即任何一个字符的编码都不是另一个字符编码的前缀。学过C语言版《数据结构》的读者会知道,书中给出的算法时间复杂度是O(N^2),效率比较低。这里首先贴上这个版本算法的代码:
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
using namespace std;

/*
 * One slot of the array-based Huffman tree.
 * Slots are addressed from 1; index 0 means "no node", so a parent of 0
 * marks a node that has not yet been merged into a larger subtree.
 */
typedef struct huffman_node_s {
    int weight;
    int parent;
    int lchild;
    int rchild;
} huffman_node_t, *HuffmanTree;

/* Table of NUL-terminated code strings, addressed from 1 like the tree. */
typedef char** HuffmanCode;

/*
 * Find the two lightest parentless nodes among ht[1..n].
 *
 * On return *s1 is the index of the lightest such node and *s2 the index of
 * the second lightest (weight[*s1] <= weight[*s2]). An index stays 0 when
 * fewer than one / two parentless nodes exist.
 */
void select(HuffmanTree ht, int n, int* s1, int* s2)
{
    *s1 = 0;
    *s2 = 0;

    for (int i = 1; i <= n; i++) {
        if (ht[i].parent != 0)
            continue; /* already merged into a subtree */

        if (*s1 == 0 || ht[i].weight < ht[*s1].weight) {
            /* New minimum: the previous minimum becomes the runner-up. */
            *s2 = *s1;
            *s1 = i;
        } else if (*s2 == 0 || ht[i].weight < ht[*s2].weight) {
            /* Not lighter than the minimum (ties included) but lighter
             * than the current runner-up. */
            *s2 = i;
        }
    }
}

/*
 * Build a Huffman tree and the code table for n symbols.
 *
 * ht      receives a malloc'ed array of 2n slots (index 0 unused); the
 *         leaves are slots 1..n and the root is slot 2n-1.
 * hc      receives a malloc'ed array of n+1 pointers; (*hc)[i] for
 *         i in 1..n is a malloc'ed string of '0'/'1' characters.
 * weight  weight[i-1] is the weight of symbol i.
 * n       number of symbols.
 *
 * The caller owns both results and must free every (*hc)[i], then *hc and
 * *ht. If n <= 0, weight is NULL, or an allocation fails, both *ht and *hc
 * are set to NULL and nothing is left allocated. With n == 1 the single
 * symbol gets an empty code string.
 */
void HuffmanEncode(HuffmanTree* ht, HuffmanCode* hc, int* weight, int n)
{
    *ht = NULL;
    *hc = NULL;
    if (weight == NULL || n <= 0)
        return;

    const int m = 2 * n - 1; /* total node count of a full binary tree */

    huffman_node_t* tree = (huffman_node_t*)malloc((m + 1) * sizeof(huffman_node_t));
    char** codes = (char**)calloc(n + 1, sizeof(char*));
    /* Scratch buffer: a code has at most n-1 symbols plus the terminator. */
    char* temp = (char*)malloc(n * sizeof(char));
    if (tree == NULL || codes == NULL || temp == NULL) {
        free(tree);
        free(codes);
        free(temp);
        return;
    }

    for (int i = 0; i <= m; i++) {
        tree[i].weight = (i >= 1 && i <= n) ? weight[i - 1] : 0;
        tree[i].parent = 0;
        tree[i].lchild = 0;
        tree[i].rchild = 0;
    }

    /* Repeatedly merge the two lightest roots into a new internal node. */
    for (int i = n + 1; i <= m; i++) {
        int s1, s2;
        select(tree, i - 1, &s1, &s2);
        tree[i].lchild = s1;
        tree[i].rchild = s2;
        tree[i].weight = tree[s1].weight + tree[s2].weight;
        tree[s1].parent = i;
        tree[s2].parent = i;
    }

    /* Codes are assembled right-to-left while walking from leaf to root,
     * so the terminator sits permanently in the last cell. */
    temp[n - 1] = '\0';
    for (int i = 1; i <= n; i++) {
        int start = n - 1;
        for (int c = i, f = tree[i].parent; f != 0; c = f, f = tree[f].parent)
            temp[--start] = (c == tree[f].lchild) ? '0' : '1';

        codes[i] = (char*)malloc(n - start); /* code length + terminator */
        if (codes[i] == NULL) {
            for (int k = 1; k < i; k++)
                free(codes[k]);
            free(codes);
            free(tree);
            free(temp);
            return;
        }
        strcpy(codes[i], temp + start);
    }

    free(temp);
    *ht = tree;
    *hc = codes;
}

/* Demo: encode eight sample weights and print one code per line. */
int main(int argc, char* argv[])
{
    int weight[] = {5, 29, 7, 8, 14, 23, 3, 11};
    int length = sizeof(weight) / sizeof(weight[0]);
    HuffmanTree ht = NULL;
    HuffmanCode hc = NULL;

    HuffmanEncode(&ht, &hc, weight, length);
    if (hc == NULL) {
        cerr << "HuffmanEncode failed" << endl;
        return 1;
    }

    for (int i = 1; i <= length; i++)
        cout << hc[i] << endl;

    for (int i = 1; i <= length; i++)
        free(hc[i]);
    free(hc);
    free(ht);

    cin.get(); /* keep the console window open until Enter is pressed */
    return 0;
}
还有另外一种算法,就是使用双队列的形式,可以把时间复杂度降到O(N*logN)(开销主要来自事先按频率排序,建树本身只需线性时间),算法的核心思想是:
1, 建立两个空的队列
2,为每一个字符建立一个节点,并按照字符出现的频率以非递减的方式放入第一个队列
3,每步要找出出现频率最小的两个节点,每个节点按以下方法取出(共执行两次):
a,如果第二个队列为空,那么使第一个队列的头结点出列
b,如果第一个队列为空,那么使第二个队列的头结点出列
c,如果两个队列都不为空,那比较两个队列头结点字符出现的频率,使出现频率较小的头结点出列
4,创建一个新的临时节点,它的频率是第三步中出列的两个节点的频率之和,然后将临时节点加入第二个队列。重复第三、四步,当第一个队列为空且第二个队列中只剩一个节点时,算法结束,该节点就是Huffman树的根。下面给出代码:
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
using namespace std;

/* Huffman tree node; it is also the element type stored in the queues. */
typedef struct queue_node_s {
    char data;      /* symbol for a leaf, '$' for an internal node */
    int frequent;   /* frequency of the symbol / sum over the subtree */
    struct queue_node_s* lchild;
    struct queue_node_s* rchild;
} queue_node_t;

/*
 * Fixed-capacity linear (non-circular) queue of node pointers.
 * front == rear == -1 means empty; indices reset to -1 whenever the last
 * element is removed.
 */
typedef struct queue_s {
    int front, rear;
    int capcity;
    queue_node_t** arr;
} queue_t;

/* Allocate a childless node. Returns NULL if the allocation fails. */
queue_node_t* createNode(char data, int frequent)
{
    queue_node_t* node = (queue_node_t*)malloc(sizeof(queue_node_t));
    if (node == NULL)
        return NULL;
    node->data = data;
    node->frequent = frequent;
    node->lchild = NULL;
    node->rchild = NULL;
    return node;
}

/* Allocate an empty queue holding up to `size` pointers; NULL on failure. */
queue_t* createQueue(int size)
{
    queue_t* queue = (queue_t*)malloc(sizeof(queue_t));
    if (queue == NULL)
        return NULL;
    queue->capcity = size;
    queue->front = queue->rear = -1;
    /* The array stores pointers, so size it by the pointer type. */
    queue->arr = (queue_node_t**)malloc(size * sizeof(queue_node_t*));
    if (queue->arr == NULL) {
        free(queue);
        return NULL;
    }
    return queue;
}

/* True when the queue holds no element. */
bool isQueueEmpty(queue_t* queue)
{
    return queue->front == -1 && queue->rear == -1;
}

/* True when the queue holds exactly one element. */
bool isContainOne(queue_t* queue)
{
    return queue->front != -1 && queue->front == queue->rear;
}

/* True when the last slot is occupied and no further enQueue can succeed. */
bool isQueueFull(queue_t* queue)
{
    return queue->rear == queue->capcity - 1;
}

/* Append item at the rear; the item is silently dropped if the queue is full. */
void enQueue(queue_t* queue, queue_node_t* item)
{
    if (isQueueFull(queue))
        return;
    queue->arr[++queue->rear] = item;
    if (queue->front == -1)
        queue->front = 0;
}

/* Remove and return the front element, or NULL if the queue is empty. */
queue_node_t* deQueue(queue_t* queue)
{
    if (isQueueEmpty(queue))
        return NULL;
    queue_node_t* head = queue->arr[queue->front];
    if (queue->front == queue->rear)
        queue->front = queue->rear = -1; /* became empty: reset indices */
    else
        queue->front++;
    return head;
}

/* Return the front element without removing it, or NULL if empty. */
queue_node_t* getFront(queue_t* queue)
{
    if (isQueueEmpty(queue))
        return NULL;
    return queue->arr[queue->front];
}

/*
 * Dequeue the node with the smaller frequency among the two queue fronts.
 * On a tie the node from queueTwo is taken. Returns NULL if both are empty.
 */
queue_node_t* findMin(queue_t* queueOne, queue_t* queueTwo)
{
    if (isQueueEmpty(queueOne))
        return deQueue(queueTwo);
    if (isQueueEmpty(queueTwo))
        return deQueue(queueOne);
    if (getFront(queueOne)->frequent < getFront(queueTwo)->frequent)
        return deQueue(queueOne);
    return deQueue(queueTwo);
}

/* Print the first n characters of arr followed by a newline. */
void printArr(char* arr, int n)
{
    for (int i = 0; i < n; i++)
        printf("%c", arr[i]);
    cout << endl;
}

/* True when the node has no children. */
bool isLeaf(queue_node_t* node)
{
    return node->lchild == NULL && node->rchild == NULL;
}

/* Recursively release a (sub)tree; accepts NULL. */
static void freeHuffmanTree(queue_node_t* node)
{
    if (node == NULL)
        return;
    freeHuffmanTree(node->lchild);
    freeHuffmanTree(node->rchild);
    free(node);
}

/* Release a queue together with any subtrees still stored in it; accepts NULL. */
static void destroyQueue(queue_t* queue)
{
    if (queue == NULL)
        return;
    queue_node_t* node;
    while ((node = deQueue(queue)) != NULL)
        freeHuffmanTree(node);
    free(queue->arr);
    free(queue);
}

/*
 * Build a Huffman tree with the two-queue method.
 *
 * data       symbols; data[i] has frequency frequents[i].
 * frequents  frequencies, which must already be in non-decreasing order
 *            (the algorithm relies on it and does not sort).
 * size       number of symbols.
 *
 * Returns the root of a malloc'ed tree owned by the caller, or NULL when
 * size <= 0, an argument is NULL, or an allocation fails. With size == 1
 * the root is the single leaf.
 */
queue_node_t* buildHuffmanTree(char* data, int* frequents, int size)
{
    if (data == NULL || frequents == NULL || size <= 0)
        return NULL;

    queue_t* queueOne = createQueue(size); /* leaves, in input order */
    queue_t* queueTwo = createQueue(size); /* merged internal nodes */
    queue_node_t* root = NULL;
    bool ok = (queueOne != NULL && queueTwo != NULL);

    for (int i = 0; ok && i < size; i++) {
        queue_node_t* leaf = createNode(data[i], frequents[i]);
        if (leaf == NULL)
            ok = false;
        else
            enQueue(queueOne, leaf);
    }

    if (ok && size == 1) {
        /* Nothing to merge: the merge loop below needs at least two nodes. */
        root = deQueue(queueOne);
    } else if (ok) {
        while (!(isQueueEmpty(queueOne) && isContainOne(queueTwo))) {
            queue_node_t* lchild = findMin(queueOne, queueTwo);
            queue_node_t* rchild = findMin(queueOne, queueTwo);
            queue_node_t* top = createNode('$', lchild->frequent + rchild->frequent);
            if (top == NULL) {
                freeHuffmanTree(lchild);
                freeHuffmanTree(rchild);
                ok = false;
                break;
            }
            top->lchild = lchild;
            top->rchild = rchild;
            enQueue(queueTwo, top);
        }
        if (ok)
            root = deQueue(queueTwo);
    }

    /* On success both queues are empty here; on failure this also frees
     * whatever nodes were still waiting in them. */
    destroyQueue(queueOne);
    destroyQueue(queueTwo);
    return root;
}

/*
 * Print "<symbol>:<code>" for every leaf below node.
 * arr is scratch space for the path from the root and top is the current
 * depth; a left edge contributes '0' and a right edge '1'.
 */
void printCodes(queue_node_t* node, char* arr, int top)
{
    if (node == NULL)
        return;
    if (node->lchild) {
        arr[top] = '0';
        printCodes(node->lchild, arr, top + 1);
    }
    if (node->rchild) {
        arr[top] = '1';
        printCodes(node->rchild, arr, top + 1);
    }
    if (isLeaf(node)) {
        printf("%c:", node->data);
        printArr(arr, top);
    }
}

/*
 * Build the Huffman tree for the given symbols and print every code.
 * frequents must be in non-decreasing order. Prints nothing when the tree
 * cannot be built. All memory is released before returning.
 */
void HuffmanCodes(char* data, int* frequents, int size)
{
    queue_node_t* root = buildHuffmanTree(data, frequents, size);
    if (root == NULL)
        return;

    /* The deepest leaf sits at depth size-1, so size cells are enough. */
    char* arr = (char*)malloc(size * sizeof(char));
    if (arr != NULL)
        printCodes(root, arr, 0);

    free(arr);
    freeHuffmanTree(root);
}

/* Demo: six symbols whose frequencies are already sorted ascending. */
int main(int argc, char* argv[])
{
    char data[] = {'a', 'b', 'c', 'd', 'e', 'f'};
    int freq[] = {5, 9, 12, 13, 16, 45};
    int size = sizeof(data) / sizeof(data[0]);

    HuffmanCodes(data, freq, size);

    cin.get(); /* keep the console window open until Enter is pressed */
    return 0;
}