UTF-8、UTF-16、UTF-32编码的相互转换（不使用现成的函数）

UTF-8、UTF-16、UTF-32编码的相互转换（不使用现成的函数）
最近在考虑写一个可以跨平台的通用字符串类，首先需要搞定的就是编码转换问题。

VS（Visual Studio）默认使用本地代码页（code page，中文系统即GBK，日文系统即Shift-JIS）保存代码文件，也可以使用带BOM的UTF-8。
gcc则是UTF-8，有无BOM均可（源代码的字符集可以由参数-finput-charset指定）。
那么源代码可以采用带BOM的UTF-8来保存。而Windows下的Unicode字符串使用UTF-16编码；Linux则使用UTF-8或UTF-32。因此不论在哪种系统里，程序在处理字符串时都需要考虑UTF编码之间的相互转换。

下面直接贴出算法代码。算法上我借鉴了秦建辉（http://blog.csdn.net/jhqin）的UnicodeConverter，只是在外面增加了一些泛型处理，让使用相对简单。

核心算法（来自UnicodeConverter）：
[cpp] view plain copy
1. namespace transform
2. {
3. /*
4. UTF-32 to UTF-8
5. */
7. inline static size_t utf(uint32 src, uint8* des)
8. {
9. if (src == 0) return 0;
11. static const byte PREFIX[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
12. static const uint32 CODE_UP[] =
13. {
14. 0x80, // U+00000000 - U+0000007F
15. 0x800, // U+00000080 - U+000007FF
16. 0x10000, // U+00000800 - U+0000FFFF
17. 0x200000, // U+00010000 - U+001FFFFF
18. 0x4000000, // U+00200000 - U+03FFFFFF
19. 0x80000000 // U+04000000 - U+7FFFFFFF
20. };
22. size_t i, len = sizeof(CODE_UP) / sizeof(uint32);
23. for(i = 0; i < len; ++i)
24. if (src < CODE_UP[i]) break;
26. if (i == len) return 0; // the src is invalid
28. len = i + 1;
29. if (des)
30. {
31. for(; i > 0; --i)
32. {
33. des[i] = static_cast<uint8>((src & 0x3F) | 0x80);
34. src >>= 6;
35. }
36. des[0] = static_cast<uint8>(src | PREFIX[len - 1]);
37. }
38. return len;
39. }
41. /*
42. UTF-8 to UTF-32
43. */
45. inline static size_t utf(const uint8* src, uint32& des)
46. {
47. if (!src || (*src) == 0) return 0;
49. uint8 b = *(src++);
51. if (b < 0x80)
52. {
53. des = b;
54. return 1;
55. }
57. if (b < 0xC0 || b > 0xFD) return 0; // the src is invalid
59. size_t len;
61. if (b < 0xE0)
62. {
63. des = b & 0x1F;
64. len = 2;
65. }
66. else
67. if (b < 0xF0)
68. {
69. des = b & 0x0F;
70. len = 3;
71. }
72. else
73. if (b < 0xF8)
74. {
75. des = b & 0x07;
76. len = 4;
77. }
78. else
79. if (b < 0xFC)
80. {
81. des = b & 0x03;
82. len = 5;
83. }
84. else
85. {
86. des = b & 0x01;
87. len = 6;
88. }
90. size_t i = 1;
91. for (; i < len; ++i)
92. {
93. b = *(src++);
94. if (b < 0x80 || b > 0xBF) return 0; // the src is invalid
95. des = (des << 6) + (b & 0x3F);
96. }
97. return len;
98. }
100. /*
101. UTF-32 to UTF-16
102. */
104. inline static size_t utf(uint32 src, uint16* des)
105. {
106. if (src == 0) return 0;
108. if (src <= 0xFFFF)
109. {
110. if (des) (*des) = static_cast<uint16>(src);
111. return 1;
112. }
113. else
114. if (src <= 0xEFFFF)
115. {
116. if (des)
117. {
118. des[0] = static_cast<uint16>(0xD800 + (src >> 10) - 0x40); // high
119. des[1] = static_cast<uint16>(0xDC00 + (src & 0x03FF)); // low
120. }
121. return 2;
122. }
123. return 0;
124. }
126. /*
127. UTF-16 to UTF-32
128. */
130. inline static size_t utf(const uint16* src, uint32& des)
131. {
132. if (!src || (*src) == 0) return 0;
134. uint16 w1 = src[0];
135. if (w1 >= 0xD800 && w1 <= 0xDFFF)
136. {
137. if (w1 < 0xDC00)
138. {
139. uint16 w2 = src[1];
140. if (w2 >= 0xDC00 && w2 <= 0xDFFF)
141. {
142. des = (w2 & 0x03FF) + (((w1 & 0x03FF) + 0x40) << 10);
143. return 2;
144. }
145. }
146. return 0; // the src is invalid
147. }
148. else
149. {
150. des = w1;
151. return 1;
152. }
153. }
154. }
上面这些算法都是针对单个字符的，并且是UTF-32和UTF-16/8之间的互转。
通过上面的算法，可以得到UTF-16和UTF-8之间的单字符转换算法：
[cpp] view plain copy
1. namespace transform
2. {
3. /*
4. UTF-16 to UTF-8
5. */
7. inline static size_t utf(uint16 src, uint8* des)
8. {
9. // make utf-16 to utf-32
10. uint32 tmp;
11. if (utf(&src, tmp) != 1) return 0;
12. // make utf-32 to utf-8
13. return utf(tmp, des);
14. }
16. /*
17. UTF-8 to UTF-16
18. */
20. inline static size_t utf(const uint8* src, uint16& des)
21. {
22. // make utf-8 to utf-32
23. uint32 tmp;
24. size_t len = utf(src, tmp);
25. if (len == 0) return 0;
26. // make utf-32 to utf-16
27. if (utf(tmp, &des) != 1) return 0;
28. return len;
29. }
30. }
同样，通过上面的单字符转换算法，可以得到整个字符串的转换算法：
[cpp] view plain copy
1. namespace transform
2. {
3. /*
4. UTF-X: string to string
5. */
7. template <typename T>
8. size_t utf(const uint32* src, T* des) // UTF-32 to UTF-X(8/16)
9. {
10. if (!src || (*src) == 0) return 0;
12. size_t num = 0;
13. for(; *src; ++src)
14. {
15. size_t len = utf(*src, des);
16. if (len == 0) break;
17. if (des) des += len;
18. num += len;
19. }
20. if (des) (*des) = 0;
21. return num;
22. }
24. template <typename T>
25. size_t utf(const T* src, uint32* des) // UTF-X(8/16) to UTF-32
26. {
27. if (!src || (*src) == 0) return 0;
29. size_t num = 0;
30. while(*src)
31. {
32. uint32 tmp;
33. size_t len = utf(src, tmp);
34. if (len == 0) break;
35. if (des)
36. {
37. (*des) = tmp;
38. ++des;
39. }
40. src += len;
41. num += 1;
42. }
43. if (des) (*des) = 0;
44. return num;
45. }
47. template <typename T, typename U>
48. size_t utf(const T* src, U* des) // UTF-X(8/16) to UTF-Y(16/8)
49. {
50. if (!src || (*src) == 0) return 0;
52. size_t num = 0;
53. while(*src)
54. {
55. // make utf-x to ucs4
56. uint32 tmp;
57. size_t len = utf(src, tmp);
58. if (len == 0) break;
59. src += len;
60. // make ucs4 to utf-y
61. len = utf(tmp, des);
62. if (len == 0) break;
63. if (des) des += len;
64. num += len;
65. }
66. if (des) (*des) = 0;
67. return num;
68. }
69. }
有了这些之后，我们已经可以完整的做UTF-8/16/32之间的相互转换了，但是这些函数的使用仍然不是很方便。
比如我现在想把一个UTF-8字符串转换成一个wchar_t*字符串，我得这样写：
[cpp] view plain copy
1. const uint8* c = (uint8*)"こんにちわ、世界";
2. size_t n = (sizeof(wchar_t) == 2) ?
3. transform::utf(c, (uint16*)0) :
4. transform::utf(c, (uint32*)0);
5. wchar_t* s = new wchar_t[n];
6. if (sizeof(wchar_t) == 2)
7. transform::utf(c, (uint16*)s);
8. else
9. transform::utf(c, (uint32*)s);
这显然是一件很抽搐的事情，因为wchar_t在不同的操作系统（windows/linux）里有不同的sizeof长度。
上面的类型强制转换只是为了去适配合适的函数重载，当然我们也可以通过函数名来区分这些函数：比如分别叫utf8_to_utf32之类的。但是这改变不了写if-else来适配长度的问题。

显然这里可以通过泛型来让算法更好用。
首先，需要被抽离出来的就是参数的类型大小和类型本身的依赖关系：
[cpp] view plain copy
1. template <size_t X> struct utf_type;
2. template <> struct utf_type<1> { typedef uint8 type_t; };
3. template <> struct utf_type<2> { typedef uint16 type_t; };
4. template <> struct utf_type<4> { typedef uint32 type_t; };
然后，实现一个简单的check算法，这样后面就可以利用SFINAE的技巧筛选出合适的算法函数：
[cpp] view plain copy
1. template <size_t X, typename T>
2. struct check
3. {
4. static const bool value =
5. ((sizeof(T) == sizeof(typename utf_type<X>::type_t)) && !is_pointer<T>::value);
6. };
下面我们需要一个detail，即泛型适配的细节。从上面的算法函数参数中，我们可以很容易的观察出一些规律：
只要是由大向小转换（比如32->16，或16->8）的，其对外接口可以抽象成这两种形式：
[cpp] view plain copy
1. type_t utf(T src, U* des)
2. type_t utf(const T* src, U* des)
而由小向大的转换，则是下面这两种形式：
[cpp] view plain copy
1. type_t utf(const T* src, U& des)
2. type_t utf(const T* src, U* des)
再加上第二个指针参数是可以给一个默认值（空指针）的，因此适配的泛型类就可以写成这样：
[cpp] view plain copy
1. template <size_t X, size_t Y, bool = (X > Y), bool = (X != Y)>
2. struct detail;
4. /*
5. UTF-X(32/16) to UTF-Y(16/8)
6. */
8. template <size_t X, size_t Y>
9. struct detail<X, Y, true, true>
10. {
11. typedef typename utf_type<X>::type_t src_t;
12. typedef typename utf_type<Y>::type_t des_t;
14. template <typename T, typename U>
15. static typename enable_if<check<X, T>::value && check<Y, U>::value,
16. size_t>::type_t utf(T src, U* des)
17. {
18. return transform::utf((src_t)(src), (des_t*)(des));
19. }
21. template <typename T>
22. static typename enable_if<check<X, T>::value,
23. size_t>::type_t utf(T src)
24. {
25. return transform::utf((src_t)(src), (des_t*)(0));
26. }
28. template <typename T, typename U>
29. static typename enable_if<check<X, T>::value && check<Y, U>::value,
30. size_t>::type_t utf(const T* src, U* des)
31. {
32. return transform::utf((const src_t*)(src), (des_t*)(des));
33. }
35. template <typename T>
36. static typename enable_if<check<X, T>::value,
37. size_t>::type_t utf(const T* src)
38. {
39. return transform::utf((src_t)(src), (des_t*)(0));
40. }
41. };
43. /*
44. UTF-X(16/8) to UTF-Y(32/16)
45. */
47. template <size_t X, size_t Y>
48. struct detail<X, Y, false, true>
49. {
50. typedef typename utf_type<X>::type_t src_t;
51. typedef typename utf_type<Y>::type_t des_t;
53. template <typename T, typename U>
54. static typename enable_if<check<X, T>::value && check<Y, U>::value,
55. size_t>::type_t utf(const T* src, U& des)
56. {
57. des_t tmp; // for disable the warning strict-aliasing from gcc 4.4
58. size_t ret = transform::utf((const src_t*)(src), tmp);
59. des = tmp;
60. return ret;
61. }
63. template <typename T, typename U>
64. static typename enable_if<check<X, T>::value && check<Y, U>::value,
65. size_t>::type_t utf(const T* src, U* des)
66. {
67. return transform::utf((const src_t*)(src), (des_t*)(des));
68. }
70. template <typename T>
71. static typename enable_if<check<X, T>::value,
72. size_t>::type_t utf(const T* src)
73. {
74. return transform::utf((const src_t*)(src), (des_t*)(0));
75. }
76. };
最后，用一个外覆类（wrapper）来收尾就相当简单了：
[cpp] view plain copy
1. template <typename T, typename U>
2. struct converter
3. : detail<sizeof(T), sizeof(U)>
4. {};
通过上面的detail，我们也可以很轻松地写出一个通过指定8、16这些数字来控制选择哪些转换算法的外覆模板。
有了converter，同类型的需求（指UTF-8转wchar_t）就可以变得轻松愉快很多：
[cpp] view plain copy
1. const char* c = "こんにちわ、世界";
2. wstring s;
3. size_t n; wchar_t w;
4. while (!!(n = converter<char, wchar_t>::utf(c, w))) // 这里的!!是为了屏蔽gcc的警告
5. {
6. s.push_back(w);
7. c += n;
8. }
9. FILE* fp = fopen("test_converter.txt", "wb");
10. fwrite(s.c_str(), sizeof(wchar_t), s.length(), fp);
11. fclose(fp);
上面这一小段代码是将一段UTF-8的文字逐字符转换为wchar_t，并一个个push_back到wstring里，最后把转换完毕的字符串输出到test_converter.txt里。

其实上面的泛型还是显得累赘了。为什么不直接在transform::utf上使用泛型参数呢？
一开始只想到上面那个方法，自然是由于惯性的想要手动指定如何转换编码的缘故，比如最开始的想法，是想做成类似这样的模板：utf<8, 32>(s1, s2)，指定两个数字，来决定输入和输出的格式。

后来发现，直接指定字符串/字符的类型或许更加直接些。
现在回头再看看，其实转换所需要的字长（8、16、32）已经在参数的类型中指定了：8 bits的char或byte类型肯定不会是用来存放UTF-32的嘛。
所以只需要把上面核心算法的参数泛型化就可以了。这时代码就会写成下面这个样子：
[cpp] view plain copy
1. namespace transform
2. {
3. namespace private_
4. {
5. template <size_t X> struct utf_type;
6. template <> struct utf_type<1> { typedef uint8 type_t; };
7. template <> struct utf_type<2> { typedef uint16 type_t; };
8. template <> struct utf_type<4> { typedef uint32 type_t; };
10. template <typename T, size_t X>
11. struct check
12. {
13. static const bool value =
14. ((sizeof(T) == sizeof(typename utf_type<X>::type_t)) && !is_pointer<T>::value);
15. }
16. }
18. using namespace transform::private_;
20. /*
21. UTF-32 to UTF-8
22. */
24. template <typename T, typename U>
25. typename enable_if<check<T, 4>::value && check<U, 1>::value,
26. size_t>::type_t utf(T src, U* des)
27. {
28. if (src == 0) return 0;
30. static const byte PREFIX[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
31. static const uint32 CODE_UP[] =
32. {
33. 0x80, // U+00000000 - U+0000007F
34. 0x800, // U+00000080 - U+000007FF
35. 0x10000, // U+00000800 - U+0000FFFF
36. 0x200000, // U+00010000 - U+001FFFFF
37. 0x4000000, // U+00200000 - U+03FFFFFF
38. 0x80000000 // U+04000000 - U+7FFFFFFF
39. };
41. size_t i, len = sizeof(CODE_UP) / sizeof(uint32);
42. for(i = 0; i < len; ++i)
43. if (src < CODE_UP[i]) break;
45. if (i == len) return 0; // the src is invalid
47. len = i + 1;
48. if (des)
49. {
50. for(; i > 0; --i)
51. {
52. des[i] = static_cast((src & 0x3F) | 0x80);
53. src >>= 6;
54. }
55. des[0] = static_cast(src | PREFIX[len - 1]);
56. }
57. return len;
58. }
60. /*
61. UTF-8 to UTF-32
62. */
64. template <typename T, typename U>
65. typename enable_if<check<T, 1>::value && check<U, 4>::value,
66. size_t>::type_t utf(const T* src, U& des)
67. {
68. if (!src || (*src) == 0) return 0;
70. uint8 b = *(src++);
72. if (b < 0x80)
73. {
74. des = b;
75. return 1;
76. }
78. if (b < 0xC0 || b > 0xFD) return 0; // the src is invalid
80. size_t len;
82. if (b < 0xE0)
83. {
84. des = b & 0x1F;
85. len = 2;
86. }
87. else
88. if (b < 0xF0)
89. {
90. des = b & 0x0F;
91. len = 3;
92. }
93. else
94. if (b < 0xF8)
95. {
96. des = b & 0x07;
97. len = 4;
98. }
99. else
100. if (b < 0xFC)
101. {
102. des = b & 0x03;
103. len = 5;
104. }
105. else
106. {
107. des = b & 0x01;
108. len = 6;
109. }
111. size_t i = 1;
112. for (; i < len; ++i)
113. {
114. b = *(src++);
115. if (b < 0x80 || b > 0xBF) return 0; // the src is invalid
116. des = (des << 6) + (b & 0x3F);
117. }
118. return len;
119. }
121. /*
122. UTF-32 to UTF-16
123. */
125. template <typename T, typename U>
126. typename enable_if<check<T, 4>::value && check<U, 2>::value,
127. size_t>::type_t utf(T src, U* des)
128. {
129. if (src == 0) return 0;
131. if (src <= 0xFFFF)
132. {
133. if (des) (*des) = static_cast(src);
134. return 1;
135. }
136. else
137. if (src <= 0xEFFFF)
138. {
139. if (des)
140. {
141. des[0] = static_cast(0xD800 + (src >> 10) - 0x40); // high
142. des[1] = static_cast(0xDC00 + (src & 0x03FF)); // low
143. }
144. return 2;
145. }
146. return 0;
147. }
149. /*
150. UTF-16 to UTF-32
151. */
153. template <typename T, typename U>
154. typename enable_if<check<T, 2>::value && check<U, 4>::value,
155. size_t>::type_t utf(const T* src, U& des)
156. {
157. if (!src || (*src) == 0) return 0;
159. uint16 w1 = src[0];
160. if (w1 >= 0xD800 && w1 <= 0xDFFF)
161. {
162. if (w1 < 0xDC00)
163. {
164. uint16 w2 = src[1];
165. if (w2 >= 0xDC00 && w2 <= 0xDFFF)
166. {
167. des = (w2 & 0x03FF) + (((w1 & 0x03FF) + 0x40) << 10);
168. return 2;
169. }
170. }
171. return 0; // the src is invalid
172. }
173. else
174. {
175. des = w1;
176. return 1;
177. }
178. }
180. /*
181. UTF-16 to UTF-8
182. */
184. template <typename T, typename U>
185. typename enable_if<check<T, 2>::value && check<U, 1>::value,
186. size_t>::type_t utf(T src, U* des)
187. {
188. // make utf-16 to utf-32
189. uint32 tmp;
190. if (utf(&src, tmp) != 1) return 0;
191. // make utf-32 to utf-8
192. return utf(tmp, des);
193. }
195. /*
196. UTF-8 to UTF-16
197. */
199. template <typename T, typename U>
200. typename enable_if<check<T, 1>::value && check<U, 2>::value,
201. size_t>::type_t utf(const T* src, U& des)
202. {
203. // make utf-8 to utf-32
204. uint32 tmp;
205. size_t len = utf(src, tmp);
206. if (len == 0) return 0;
207. // make utf-32 to utf-16
208. if (utf(tmp, &des) != 1) return 0;
209. return len;
210. }
212. /*
213. UTF-X: string to string
214. */
216. template <typename T, typename U>
217. typename enable_if<check<T, 4>::value && (check<U, 1>::value || check<U, 2>::value),
218. size_t>::type_t utf(const T* src, U* des) // UTF-32 to UTF-X(8/16)
219. {
220. if (!src || (*src) == 0) return 0;
222. size_t num = 0;
223. for(; *src; ++src)
224. {
225. size_t len = utf(*src, des);
226. if (len == 0) break;
227. if (des) des += len;
228. num += len;
229. }
230. if (des) (*des) = 0;
231. return num;
232. }
234. template <typename T, typename U>
235. typename enable_if<(check<T, 1>::value || check<T, 2>::value) && check<U, 4>::value,
236. size_t>::type_t utf(const T* src, U* des) // UTF-X(8/16) to UTF-32
237. {
238. if (!src || (*src) == 0) return 0;
240. size_t num = 0;
241. while(*src)
242. {
243. uint32 tmp;
244. size_t len = utf(src, tmp);
245. if (len == 0) break;
246. if (des)
247. {
248. (*des) = tmp;
249. ++des;
250. }
251. src += len;
252. num += 1;
253. }
254. if (des) (*des) = 0;
255. return num;
256. }
258. template <typename T, typename U>
259. typename enable_if<(check<T, 1>::value && check<U, 2>::value) ||
260. (check<T, 2>::value && check<U, 1>::value),
261. size_t>::type_t utf(const T* src, U* des) // UTF-X(8/16) to UTF-Y(16/8)
262. {
263. if (!src || (*src) == 0) return 0;
265. size_t num = 0;
266. while(*src)
267. {
268. // make utf-x to utf-32
269. uint32 tmp;
270. size_t len = utf(src, tmp);
271. if (len == 0) break;
272. src += len;
273. // make utf-32 to utf-y
274. len = utf(tmp, des);
275. if (len == 0) break;
276. if (des) des += len;
277. num += len;
278. }
279. if (des) (*des) = 0;
280. return num;
281. }
282. }
这样用起来就更加简单了：
[cpp] view plain copy
1. const char* c = "你好世界";
2. size_t n = nx::transform::utf(c, (wchar_t*)0);
完整代码请参考：
https://code.google.com/p/nixy/source/browse/trunk/nixycore/string/transform.h

更多内容请访问：http://darkc.at

http://blog.csdn.net/markl22222/article/details/19770505
相关阅读:
Kafka2.0服务端写入和读取流程
 Kafka2.0服务端启动源码
 Kafka2.0消费者协调器源码
 Kafka2.0消费者客户端源码分析
 [六省联考2017]分手是祝愿题解(期望dp)
[信息学奥赛一本通oj1741]电子速度题解
 [CSP-S模拟测试53]题解
 [CSP-S模拟测试52]题解
 [CSP-S模拟测试ex]题解
 [CSP-S模拟测试51]题解
原文地址：https://www.cnblogs.com/findumars/p/6376034.html