很難想象考完研已經大半個月了,挺懷念那段時光的:雖然一切都是未知數,但總有一個目標指引着自己前進。這段時間閒下來,就把以前學過的一些數據結構與算法知識回顧了一下,於是有了這篇文章,相關內容會在接下來幾天陸續發佈出來。
正文開始
哈夫曼編碼的原理我就不介紹了,很多博客都有,這也是數據結構裏很基礎的樹的應用。可以參考這篇博客:《huffman編碼——原理與實現》,我就不重複造輪子了。
整個程序的執行流程是:統計文本中各字符的頻次,構造哈夫曼樹,生成哈夫曼編碼,根據哈夫曼編碼對文本進行壓縮,最後根據壓縮文件和內存中的哈夫曼樹對壓縮後的二進制文件進行解壓還原。該程序只支持純ASCII字符文件。
廢話不多說,直接上代碼。代碼在Ubuntu 18.04下用gcc按C99標準編譯。
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#define ASCII_LEN 128 // Total number of ASCII codes; sizes the per-character table
const char *SOURCE_FILE = "source.txt"; // Source text file to be encoded
const char *HUFFMAN_RESULT_FILE = "Huffman.txt"; // Statistics table, including each character's binary code string
const char *CODE_FILE = "code.dat"; // Compressed output file
const char *RECODE_FILE = "recode.txt"; // File decoded back from the compressed output
// Per-character statistics together with the character's encoding result.
typedef struct {
    // Number of occurrences of the character (its frequency).
    int n;
    // Number of bits the character occupies after compression.
    int bits;
    // The character this entry corresponds to.
    char c;
    // The character's code, stored as a string of binary digits.
    char *data;
} Letter;
Letter word[ASCII_LEN]; // Usage statistics for each character, indexed by its ASCII code
// A minimal character stack.
typedef struct {
    // Recorded size of the stack.
    int n;
    // Storage for the stacked characters.
    char *data;
    // Index of the slot just above the top element; equals the current size.
    int top;
} Stack;
// One node of the Huffman tree; the tree is stored as an array of these nodes
// and links between nodes are array indices.
typedef struct Node {
    // Weight of the node.
    int weight;
    // Index of the parent node.
    int parent;
    // Index of the left child.
    int lChild;
    // Index of the right child.
    int rChild;
    // Character stored in the node.
    char c;
} HTNode, *HuffmanTree;
// Holds the code generated for one node of the tree.
typedef struct {
    // Number of bits the node's code occupies (one unsigned char holds 8 bits).
    int n;
    // The code digits.
    char *data;
    // Character the code belongs to.
    char c;
} HuffmanCodeNode, *HuffmanCode;
// Selects, among the first k elements of HT, the node with the smallest weight
// whose parent is still -1. On a tie the node with the lowest index wins.
//
// The chosen node's parent is set to 1 (any value other than -1) so that the
// next call skips it; the caller is expected to overwrite it with the real
// parent index afterwards.
//
// Returns the index of the chosen node, or -1 (leaving HT untouched) when no
// node among the first k has parent == -1.
int min(HuffmanTree HT, int k) {
    int best = -1; // index of the lightest candidate seen so far, -1 = none yet
    for (int i = 0; i < k; ++i) {
        if (HT[i].parent != -1) {
            continue;
        }
        // Strict comparison keeps the earliest node when weights are equal.
        if (best == -1 || HT[i].weight < HT[best].weight) {
            best = i;
        }
    }
    if (best == -1) {
        return -1; // nothing selectable: do not index HT with a bogus value
    }
    HT[best].parent = 1;
    return best;
}
// Picks the two lightest parentless nodes among the first k elements of HT.
// *min1 receives the result of the first min() call and *min2 the result of the
// second; min() marks the node it returns, so the second call yields a
// different node.
void selectMinium(HuffmanTree HT, int k, int *min1, int *min2) {
    int *slots[2] = {min1, min2};
    for (int which = 0; which < 2; ++which) {
        *slots[which] = min(HT, k);
    }
}
// Builds a Huffman tree from the character frequency table.
//
// word: table of ASCII_LEN entries; entries with n != 0 become leaves, in
//       ascending character order.
// n:    number of leaves, i.e. the number of entries of word with n != 0.
//
// The tree is returned as a malloc'ed array of 2 * n - 1 nodes that the caller
// must free(): indices 0..n-1 are the leaves, n..2n-2 the merged nodes, and the
// last element is the root. A link value of -1 means "none".
//
// Returns NULL when word is NULL, n <= 0, word holds fewer than n used
// characters, or the allocation fails. If word holds more than n used
// characters only the first n become leaves.
HuffmanTree createHuffmanTree(Letter *word, int n) {
    if (word == NULL || n <= 0) {
        return NULL;
    }
    int total = 2 * n - 1;
    HuffmanTree HT = malloc(sizeof *HT * (size_t) total);
    if (HT == NULL) {
        return NULL;
    }
    // Start every node detached, with zero weight and no character.
    for (int i = 0; i < total; ++i) {
        HT[i].parent = HT[i].lChild = HT[i].rChild = -1;
        HT[i].weight = 0;
        HT[i].c = '\0';
    }
    // Fill the leaves; the bound on `leaves` keeps a too-small n from
    // spilling into the slots reserved for merged nodes.
    int leaves = 0;
    for (int j = 0; j < ASCII_LEN && leaves < n; ++j) {
        if (!word[j].n) {
            continue;
        }
        HT[leaves].weight = word[j].n;
        HT[leaves].c = (char) j;
        ++leaves;
    }
    if (leaves < n) {
        // n claims more characters than the table contains.
        free(HT);
        return NULL;
    }
    // Repeatedly merge the two lightest parentless nodes into node i.
    int min1 = 0, min2 = 0;
    for (int i = n; i < total; ++i) {
        selectMinium(HT, i, &min1, &min2);
        HT[min1].parent = HT[min2].parent = i;
        HT[i].lChild = min1;
        HT[i].rChild = min2;
        HT[i].weight = HT[min1].weight + HT[min2].weight;
    }
    return HT;
}
// Generates the code of every leaf by walking from the leaf up to the root and
// then reversing the collected digits: '0' for a left branch, '1' for a right
// branch.
//
// HT: Huffman tree whose first n elements are the leaves.
// n:  number of leaves.
//
// Returns a malloc'ed array of n entries. Entry i describes leaf i: `n` is the
// code length in bits, `c` the leaf's character and `data` a malloc'ed,
// NUL-terminated string of '0'/'1' digits. The caller frees every `data` and
// then the array itself. Returns NULL when HT is NULL, n <= 0, or an
// allocation fails (nothing is leaked in that case).
HuffmanCode huffmanCoding(HuffmanTree HT, int n) {
    if (HT == NULL || n <= 0) {
        return NULL;
    }
    HuffmanCode HC = malloc(sizeof *HC * (size_t) n);
    // Scratch buffer for the leaf-to-root path; a tree with n leaves has
    // codes of at most n - 1 digits.
    char *path = malloc((size_t) n);
    if (HC == NULL || path == NULL) {
        free(HC);
        free(path);
        return NULL;
    }
    for (int i = 0; i < n; ++i) {
        int len = 0;
        int current = i;
        for (int father = HT[i].parent; father != -1; father = HT[father].parent) {
            path[len++] = (HT[father].lChild == current) ? '0' : '1';
            current = father;
        }
        HC[i].n = len;
        HC[i].c = HT[i].c;
        // One extra byte for the terminator so the code can be used as a C string.
        HC[i].data = malloc((size_t) len + 1);
        if (HC[i].data == NULL) {
            while (i-- > 0) {
                free(HC[i].data);
            }
            free(HC);
            free(path);
            return NULL;
        }
        // The path was collected leaf-to-root; store it root-to-leaf.
        for (int j = 0; j < len; ++j) {
            HC[i].data[j] = path[len - 1 - j];
        }
        HC[i].data[len] = '\0';
    }
    free(path);
    return HC;
}
//獲取文本中字符的頻次只支持ASCII字符
void getLetterFrequency() {
FILE *file = fopen(SOURCE_FILE, "r"); //只讀方式打開文本文件
memset(word, 0, sizeof(word));
while (!feof(file)) {
++word[fgetc(file)].n; //按照其ascii碼值存放頻次
}
fclose(file);
}
// Writes one line per used character ("<char>:<frequency> <code>") to
// HUFFMAN_RESULT_FILE. Returns 0 on success, -1 on an I/O failure.
static int writeCodeTable(void) {
    FILE *resultFile = fopen(HUFFMAN_RESULT_FILE, "w");
    if (resultFile == NULL) {
        perror(HUFFMAN_RESULT_FILE);
        return -1;
    }
    for (int i = 0; i < ASCII_LEN; ++i) {
        if (word[i].n) {
            // The precision limits the read to `bits` digits, so the output
            // does not depend on the code string being NUL-terminated.
            fprintf(resultFile, "%c:%d %.*s\n", (char) i, word[i].n,
                    word[i].bits, word[i].data ? word[i].data : "");
        }
    }
    if (fclose(resultFile) != 0) {
        perror(HUFFMAN_RESULT_FILE);
        return -1;
    }
    return 0;
}

// Encodes SOURCE_FILE into encodeFile using the codes stored in word[]. Bits
// are packed most-significant-bit first; the last byte is padded with zero
// bits. *symbols receives the number of characters that were encoded, which
// the decoder needs in order to stop before the padding.
// Returns 0 on success, -1 on an I/O failure.
static int packSourceFile(FILE *encodeFile, long *symbols) {
    *symbols = 0;
    FILE *sourceFile = fopen(SOURCE_FILE, "r");
    if (sourceFile == NULL) {
        perror(SOURCE_FILE);
        return -1;
    }
    unsigned char buf = 0x00;
    int used = 0; // number of bits currently held in buf
    int rc = 0;
    int ch;
    while (rc == 0 && (ch = fgetc(sourceFile)) != EOF) {
        if (ch >= ASCII_LEN || word[ch].n == 0) {
            continue; // no code exists for this byte
        }
        for (int i = 0; i < word[ch].bits && rc == 0; ++i) {
            buf = (unsigned char) (buf << 1);
            if (word[ch].data[i] == '1') { // a '0' digit only needs the shift
                buf |= 1;
            }
            if (++used == 8) {
                if (fwrite(&buf, sizeof buf, 1, encodeFile) != 1) {
                    rc = -1;
                }
                buf = 0x00;
                used = 0;
            }
        }
        ++*symbols;
    }
    if (rc == 0 && used > 0) {
        // Move the leftover bits to the top of the final byte.
        buf = (unsigned char) (buf << (8 - used));
        if (fwrite(&buf, sizeof buf, 1, encodeFile) != 1) {
            rc = -1;
        }
    }
    if (rc != 0) {
        perror(CODE_FILE);
    }
    fclose(sourceFile);
    return rc;
}

// Decodes exactly `symbols` characters from encodeFile into RECODE_FILE by
// walking the tree from `root`: a 1 bit goes to the right child, a 0 bit to the
// left child, and reaching a leaf emits its character.
// Returns 0 on success, -1 on an I/O failure or truncated/corrupt input.
static int unpackCodeFile(FILE *encodeFile, HuffmanTree ht, int root, long symbols) {
    FILE *recodeFile = fopen(RECODE_FILE, "w");
    if (recodeFile == NULL) {
        perror(RECODE_FILE);
        return -1;
    }
    unsigned char buf = 0x00;
    int left = 0; // number of unread bits remaining in buf
    int rc = 0;
    for (long s = 0; s < symbols && rc == 0; ++s) {
        int node = root;
        // A tree consisting of a single leaf yields zero-length codes: the
        // loop body is skipped and no bits are consumed.
        while (ht[node].lChild != -1 || ht[node].rChild != -1) {
            if (left == 0) {
                if (fread(&buf, sizeof buf, 1, encodeFile) != 1) {
                    rc = -1;
                    break;
                }
                left = 8;
            }
            node = (buf & 0x80) ? ht[node].rChild : ht[node].lChild;
            buf = (unsigned char) (buf << 1);
            --left;
            if (node < 0) {
                rc = -1;
                break;
            }
        }
        if (rc == 0 && fputc(ht[node].c, recodeFile) == EOF) {
            rc = -1;
        }
    }
    if (rc != 0) {
        fprintf(stderr, "%s: decoding failed\n", CODE_FILE);
    }
    if (fclose(recodeFile) != 0) {
        perror(RECODE_FILE);
        rc = -1;
    }
    return rc;
}

// Program entry point: counts the characters of SOURCE_FILE, builds the Huffman
// tree and codes, writes the statistics table to HUFFMAN_RESULT_FILE, compresses
// the source into CODE_FILE and decodes it again into RECODE_FILE using the
// in-memory tree. Returns EXIT_SUCCESS, or EXIT_FAILURE if any step failed.
int main() {
    getLetterFrequency();
    int noNull = 0;
    for (int i = 0; i < ASCII_LEN; ++i) { // count the distinct characters in use
        if (word[i].n) {
            ++noNull;
        }
    }
    if (noNull == 0) {
        fprintf(stderr, "%s: nothing to encode (file missing, empty, or without ASCII characters)\n",
                SOURCE_FILE);
        return EXIT_FAILURE;
    }

    HuffmanTree ht = createHuffmanTree(word, noNull);
    HuffmanCode hc = (ht != NULL) ? huffmanCoding(ht, noNull) : NULL;
    if (ht == NULL || hc == NULL) {
        fprintf(stderr, "failed to build the Huffman tree or code table\n");
        free(hc);
        free(ht);
        return EXIT_FAILURE;
    }

    // Copy the codes back into word[]. Leaves were created in ascending
    // character order, so the j-th code belongs to the j-th used character.
    for (int i = 0, j = 0; i < ASCII_LEN && j < noNull; ++i) {
        if (word[i].n) {
            word[i].data = hc[j].data;
            word[i].bits = hc[j].n;
            ++j;
        }
    }

    int status = EXIT_SUCCESS;
    if (writeCodeTable() != 0) {
        status = EXIT_FAILURE;
    }

    FILE *encodeFile = fopen(CODE_FILE, "wb+"); // binary, written then read back
    if (encodeFile == NULL) {
        perror(CODE_FILE);
        status = EXIT_FAILURE;
    } else {
        long symbols = 0;
        if (packSourceFile(encodeFile, &symbols) != 0) {
            status = EXIT_FAILURE;
        } else if (fseek(encodeFile, 0, SEEK_SET) != 0) {
            // Repositioning is required before switching from writing to reading.
            perror(CODE_FILE);
            status = EXIT_FAILURE;
        } else if (unpackCodeFile(encodeFile, ht, 2 * noNull - 2, symbols) != 0) {
            status = EXIT_FAILURE;
        }
        if (fclose(encodeFile) != 0) {
            perror(CODE_FILE);
            status = EXIT_FAILURE;
        }
    }

    // word[] only borrowed the code strings; clear the pointers before freeing.
    for (int i = 0; i < ASCII_LEN; ++i) {
        word[i].data = NULL;
    }
    for (int k = 0; k < noNull; ++k) {
        free(hc[k].data);
    }
    free(hc);
    free(ht);
    return status;
}
程序正常運行時沒有終端輸出,結果都記錄在文件中。
code.dat相比源文件壓縮了約一半的空間,這取決於文本字符的統計分佈。