《算法》第五章部分程序 part 8

▶ 书中第五章部分程序，再加上自己补充的代码，包括：适用于基因序列的 2-Bit 压缩算法、行程长压缩算法、Huffman 压缩算法、LZW 压缩算法

● 适用于基因序列的 2-Bit 压缩算法

 1 package package01;
 2 
 3 import edu.princeton.cs.algs4.BinaryStdIn;
 4 import edu.princeton.cs.algs4.BinaryStdOut
 5 import edu.princeton.cs.algs4.Alphabet;
 6 
 7 public class class01
 8 {
 9     private class01() {}
10 
11     public static void compress()   // 压缩，ATGC 每个字母只用 00、01、10、11 之一来表示
12     {
13         Alphabet DNA = Alphabet.DNA;
14         String s = BinaryStdIn.readString();
15         int n = s.length();
16         BinaryStdOut.write(n);
17         for (int i = 0; i < n; i++)
18             BinaryStdOut.write(DNA.toIndex(s.charAt(i)), 2);            // 找到字母在字母表 DNA 中的索引（0 ~ 3），转化为两位二进制字符
19         BinaryStdOut.close();                                           // 关闭二进制流，补齐字符字节，防止出错
20     }
21 
22     public static void expand()     // 扩展
23     {
24         Alphabet DNA = Alphabet.DNA;
25         int n = BinaryStdIn.readInt();
26         for (int i = 0; i < n; i++)
27             BinaryStdOut.write(DNA.toChar(BinaryStdIn.readChar(2)), 8); // 读入两位二进制字符，作为索引得到对应的字母，转为 Byte 写出
28         BinaryStdOut.close();
29     }
30 
31     public static void main(String[] args)
32     {
33         if (args[0].equals("-"))    // '-' 表压缩，'+' 表扩展
34             compress();
35         else if (args[0].equals("+"))
36             expand();
37         else
38             throw new IllegalArgumentException("
<main> Illegal command.
");
39     }
40 }

● 行程长压缩算法，注意输入和输出都是二进制串

 1 package package01;
 2 
 3 import edu.princeton.cs.algs4.BinaryStdIn;
 4 import edu.princeton.cs.algs4.BinaryStdOut;
 5 
 6 public class class01
 7 {
 8     private static final int R = 256;   // 最大行程长
 9     private static final int LG_R = 8;  // log(R)，为了表示最大行程长需要的 Bit 数
10 
11     private class01() {}
12 
13     public static void compress()
14     {
15         char run = 0;                                   // 累计扫描相同的字符数，初始化为 0
16         for (boolean b = false; !BinaryStdIn.isEmpty();)// 默认第 1 个字符为 false
17         {
18             if (BinaryStdIn.readBoolean() != b)         // 与上一个字符不同
19             {
20                 BinaryStdOut.write(run, LG_R);          // 以规定的 Bit 宽度写入累计行程长
21                 run = 1;                                // 累计扫描数设定为 1
22                 b = !b;                                 // 翻转字符 b
23             }
24             else                                        // 与上一个字符相同
25             {
26                 if (run == R - 1)                       // 累计扫描述已经最大，需要切成两截分别记录
27                 {
28                     BinaryStdOut.write(run, LG_R);      // 先写入长为 R-1 的行程
29                     run = 0;
30                     BinaryStdOut.write(run, LG_R);      // 再插入一个长为 0 的行程
31                 }
32                 run++;                                  // 计数器自加 1
33             }
34         }
35         BinaryStdOut.write(run, LG_R);
36         BinaryStdOut.close();
37     }
38     
39     public static void expand()
40     {
41         for (boolean b = false; !BinaryStdIn.isEmpty();)// 默认第 1 个字符为 false
42         {
43             int run = BinaryStdIn.readInt(LG_R);        // 按规定的 Bit 宽度读入一个行程长
44             for (int i = 0; i < run; i++)               // 向输出中写行程长个数的字符 b
45                 BinaryStdOut.write(b);
46             b = !b;                                     // 翻转字符 b
47         }
48         BinaryStdOut.close();
49     }    
50 
51     public static void main(String[] args)
52     {
53         if (args[0].equals("-"))
54             compress();
55         else if (args[0].equals("+"))
56             expand();
57         else
58             throw new IllegalArgumentException("
<main> Illegal command.
");
59     }
60 }

● Huffman 压缩算法

  1 package package01;
  2 
  3 import edu.princeton.cs.algs4.BinaryStdIn;
  4 import edu.princeton.cs.algs4.BinaryStdOut;
  5 import edu.princeton.cs.algs4.MinPQ;
  6 
  7 public class class01
  8 {
  9     private static final int R = 256;                       // 字符表基数
 10 
 11     private class01() { }
 12 
 13     private static class Node implements Comparable<Node>   // Trie 树节点
 14     {
 15         private final char ch;                              // 字符，频率和左右子树
 16         private final int freq;
 17         private final Node left, right;
 18 
 19         Node(char inputCh, int inputFreq, Node inputLeft, Node inputRight)
 20         {
 21             ch = inputCh;
 22             freq = inputFreq;
 23             left = inputLeft;
 24             right = inputRight;
 25         }
 26 
 27         private boolean isLeaf()                            // 是否为叶节点
 28         {
 29             return (left == null) && (right == null);
 30         }
 31 
 32         public int compareTo(Node that)                     // 两节点相比较
 33         {
 34             return this.freq - that.freq;
 35         }
 36     }
 37 
 38     public static void compress()
 39     {
 40         char[] input = BinaryStdIn.readString().toCharArray();
 41         int[] freq = new int[R];                            // 统计字母频率
 42         for (int i = 0; i < input.length; i++)
 43             freq[input[i]]++;
 44         Node root = buildTrie(freq);                        // 用频率表建立 Trie 树
 45         String[] st = new String[R];                        // 建立查找表，将字符映射到 01 串
 46         buildCode(st, root, "");
 47         writeTrie(root);                                    // 首先写入 Trie 树用于解压
 48         BinaryStdOut.write(input.length);                   // 写输入串的长度
 49         for (int i = 0; i < input.length; i++)              // 压缩过程，循环每次处理一个字符
 50         {
 51             String code = st[input[i]];
 52             for (int j = 0; j < code.length(); j++)
 53                 BinaryStdOut.write(code.charAt(j) != '0');  // code[j] == '0' 则条件不成立，写个 0，反之写个 1
 54         }
 55         BinaryStdOut.close();
 56     }
 57 
 58     public static void expand()
 59     {
 60         Node root = readTrie();                             // 首先读取 Trie 树
 61         int length = BinaryStdIn.readInt();                 // 读取输入串的长度
 62         for (int i = 0; i < length; i++)                    // 解压过程，循环每次处理一个字符
 63         {
 64             Node x = root;
 65             for (; !x.isLeaf(); x = (BinaryStdIn.readBoolean() ? x.right : x.left));    // 顺着Trie 树往下直到叶节点
 66             BinaryStdOut.write(x.ch, 8);                                                // 输出叶节点对应的字符
 67         }
 68         BinaryStdOut.close();
 69     }
 70 
 71     private static Node buildTrie(int[] freq)               // 用频率表建立 Trie 树，返回根节点
 72     {
 73         MinPQ<Node> pq = new MinPQ<Node>();                 // 初始化所有字母节点，入队
 74         for (char i = 0; i < R; i++)
 75         {
 76             if (freq[i] > 0)
 77                 pq.insert(new Node(i, freq[i], null, null));
 78         }
 79         if (pq.size() == 1)                                 // 只有一个字母的情况，原文不是 '' 则插入 ''，不然插入 '1'
 80             pq.insert(new Node(freq[''] == 0 ? '' : '1', 0, null, null));
 81 
 82         for (; pq.size() > 1;)                              // 每次合并两个频率最小的节点并入队，直到队列中只剩一个元素
 83         {
 84             Node left = pq.delMin(), right = pq.delMin();
 85             Node parent = new Node('', left.freq + right.freq, left, right);
 86             pq.insert(parent);
 87         }
 88         return pq.delMin();                                 // 最后剩下的节点就是根节点，其 freq 即为总字符数
 89     }
 90 
 91     private static void buildCode(String[] st, Node x, String s)    // 遍历以 x 为根节点的树，每到达叶节点就对查找表 st 赋字符串值，当前字符串前缀为 s
 92     {
 93         if (x.isLeaf())
 94             st[x.ch] = s;                                   // x.ch 是叶节点对应的字母（如 a = 97），将查找表中该索引位置的值设为遍历进来时得到的 01 串
 95         else
 96         {
 97             buildCode(st, x.left, s + '0');                 // 往左遍历，串上加一个 '0'
 98             buildCode(st, x.right, s + '1');                // 往左遍历，串上加一个 '1'
 99         }
100     }
101 
102     private static void writeTrie(Node x)   // 将 x 为根节点的 Trie 树写成字符串
103     {
104         if (x.isLeaf())                     // x 是叶节点，写个 1，后跟字符的 8 Bit 二进制表示，然后递归回退
105         {
106             BinaryStdOut.write(true);
107             BinaryStdOut.write(x.ch, 8);
108             return;
109         }
110         BinaryStdOut.write(false);          // x 不是叶节点，写个 0，然后分别遍历 x 的左右子节点
111         writeTrie(x.left);
112         writeTrie(x.right);
113     }
114 
115     private static Node readTrie()          // 读取 Trie 树，注意频率没有用，全都设为 -1
116     {
117         if (BinaryStdIn.readBoolean())      // 当前位是 1 则说明接下来是叶节点，一次性读入 8 Bit，转成 char
118             return new Node(BinaryStdIn.readChar(), -1, null, null);
119         else                                // 当前位是 0 则说明还没有到叶节点，左右子节点分别向下递归
120             return new Node('', -1, readTrie(), readTrie());
121     }
122 
123     public static void main(String[] args)
124     {
125         if (args[0].equals("-"))
126             compress();
127         else if (args[0].equals("+"))
128             expand();
129         else
130             throw new IllegalArgumentException("
<main> Illegal command.
");
131     }
132 }

● LZW 压缩算法

 1 package package01;
 2 
 3 import edu.princeton.cs.algs4.BinaryStdIn;
 4 import edu.princeton.cs.algs4.BinaryStdOut;
 5 import edu.princeton.cs.algs4.TST;
 6 
 7 public class class01
 8 {
 9     private static final int R = 256;   // 字符表基数
10     private static final int W = 12;    // 编码宽度（Bit）
11     private static final int L = 4096;  // 编码表大小， L = 2^W
12 
13     private class01() {}
14 
15     public static void compress()
16     {
17         String input = BinaryStdIn.readString();
18         TST<Integer> st = new TST<Integer>();               // 编译表
19         for (int i = 0; i < R; i++)                         // 先初始化所有的单字符
20             st.put("" + (char)i, i);
21         for (int code = R + 1; input.length() > 0;)         // st[R] 标记 EOF，st[R+1] ~ st[L-1] 记录新的编码
22         {
23             String s = st.longestPrefixOf(input);           // 在输入中寻找目前掌握的最长前缀
24             BinaryStdOut.write(st.get(s), W);               // 把当前最长前缀的编码写到输出中，指定宽度 W
25             int t = s.length();
26             if (t < input.length() && code < L)             // 刚才找到的前缀短于原文剩余长度，且编译表还没填满
27                 st.put(input.substring(0, t + 1), code++);  // 连着刚才找到的前缀以及 1 个前瞻字符一起编码为编译表的新一项
28             input = input.substring(t);                     // 砍掉原文前 t 字符，用于下一次扫描
29         }
30         BinaryStdOut.write(R, W);                           // 最后写上字符表基数
31         BinaryStdOut.close();
32     }
33 
34     public static void expand()
35     {
36         String[] st = new String[L];
37         for (int i = 0; i < R; i++)             // 初始化编译表的前 R 项
38             st[i] = "" + (char)i;
39         st[R] = "";                             // st[R] 标记 EOF（这里没用）
40 
41         int codeword = BinaryStdIn.readInt(W);  // 读入定宽字符，如果读入的字符就是字符集基数，说明正文部分是空的，不用解压
42         if (codeword == R)
43             return;
44         String val = st[codeword];              // 从编译表中找到原文，第一个定宽字符的原文肯定是单字符
45         for (int i = R + 1;;)                        // i 指向当前编译表中的首个空白项
46         {
47             BinaryStdOut.write(val);            // 找到的原文写到输出流
48             codeword = BinaryStdIn.readInt(W);  // 读取下一个定宽字符，同 for 之前的部分
49             if (codeword == R)
50                 break;
51             String s = st[codeword];
52             if (i == codeword)                  // 特殊情况，欲查找的编码欲当前段字符串的编码相同
53                 s = val + val.charAt(0);        // 此时原文的最后一个字符与首字符相同，补上
54             if (i < L)                          // 编译表还没填满
55                 st[i++] = val + s.charAt(0);    // 连着之前的原文以及当前原文的第 1 个字符一起编码为编译表的新一项
56             val = s;                            // 当前原文保存到 val 中，用于下次编码
57         }
58         BinaryStdOut.close();
59     }
60 
61     public static void main(String[] args)
62     {
63         if (args[0].equals("-"))
64             compress();
65         else if (args[0].equals("+"))
66             expand();
67         else
68             throw new IllegalArgumentException("
<main> Illegal command.
");
69     }
70 }

相关阅读:
二维数组和最大字数组求取 2
spring冲刺第七天
 spring冲刺第六天
 寻找水王
 spring冲刺第五天
 spring冲刺第四天
 spring冲刺第三天
 spring冲刺第二天
 大道至简读书笔记3
spring冲刺第一天
原文地址：https://www.cnblogs.com/cuancuancuanhao/p/9869433.html