• C#将汉字转换为拼音首字母


    关于这个话题, 我以前曾经长期使用过一个简便的算法, 代码如下:
           private string ToPinyinSingle(string str)
            {
                if (str.CompareTo("") < 0)
                    return str;
                if (str.CompareTo("") < 0)
                    return "a";
                if (str.CompareTo("") < 0)
                    return "b";
                if (str.CompareTo("") < 0)
                    return "c";
                if (str.CompareTo("") < 0)
                    return "d";
                if (str.CompareTo("") < 0)
                    return "e";
                if (str.CompareTo("") < 0)
                    return "f";
                if (str.CompareTo("") < 0)
                    return "g";
                if (str.CompareTo("") < 0)
                    return "h";
                if (str.CompareTo("") < 0)
                    return "j";
                if (str.CompareTo("") < 0)
                    return "k";
                if (str.CompareTo("") < 0)
                    return "l";
                if (str.CompareTo("") < 0)
                    return "m";
                if (str.CompareTo("") < 0)
                    return "n";
                if (str.CompareTo("") < 0)
                    return "o";
                if (str.CompareTo("") < 0)
                    return "p";
                if (str.CompareTo("") < 0)
                    return "q";
                if (str.CompareTo("") < 0)
                    return "r";
                if (str.CompareTo("") < 0)
                    return "s";
                if (str.CompareTo("") < 0)
                    return "t";
                if (str.CompareTo("") < 0)
                    return "w";
                if (str.CompareTo("") < 0)
                    return "x";
                if (str.CompareTo("") < 0)
                    return "y";
                if (str.CompareTo("") < 0)
                    return "z";
                return str;

            } 

    这个函数只处理单个汉字, 简单地加个循环就可以让它处理文字串了.

    在.net 3.5下, 它一直工作得很好, 虽然偶尔也有出错的时候, 但是概率极低, 基本上可以忽略不计.

    然而后来我把项目升级到.net 4.0以后, 发现出错的几率直线上升, 已经高到无法容忍的程度了(例如, "梅" 会返回"L"), 简单查了一下, 没找到微软关于String.CompareTo函数有什么变化的说明, 束手无策, 于是换用另一个也很简单的算法(http://topic.csdn.net/u/20090219/12/61745e3a-a39e-4f4d-8985-67d124236694.html):

    /// <summary>
    /// Returns the upper-case pinyin initial of a GB2312 level-1 Chinese character.
    /// </summary>
    /// <param name="cn">The character to convert, passed as a string.</param>
    /// <returns>
    /// "A".."Z" when the first two GB2312 bytes of <paramref name="cn"/> fall inside
    /// the level-1 table; "?" when the input encodes to more than one byte but is
    /// outside that table (for example level-2 characters); otherwise
    /// <paramref name="cn"/> unchanged (null, empty, or a single-byte character).
    /// </returns>
    /// <remarks>
    /// The GB2312 encoding is requested explicitly: the lookup table only makes sense
    /// for GB2312 bytes, while <c>Encoding.Default</c> varies by machine and runtime.
    /// On runtimes where code-page encodings are not built in, the code-pages
    /// encoding provider must be registered before calling this method.
    /// </remarks>
    public static string getSpell(string cn)
    {
        if (string.IsNullOrEmpty(cn))
            return cn;

        byte[] arrCN = System.Text.Encoding.GetEncoding("GB2312").GetBytes(cn);
        if (arrCN.Length <= 1)
            return cn;

        // Combine lead and trail byte into one 16-bit code.
        int code = (arrCN[0] << 8) + arrCN[1];

        // First code of each letter A..Z, followed by the exclusive upper bound of
        // the table. Letters whose start equals the next entry (I, U, V) have an
        // empty range and are therefore never returned.
        int[] areacode =
        {
            45217, 45253, 45761, 46318, 46826, 47010, 47297, 47614, 48119,
            48119, 49062, 49324, 49896, 50371, 50614, 50622, 50906, 51387,
            51446, 52218, 52698, 52698, 52698, 52980, 53689, 54481, 55290
        };

        for (int i = 0; i < 26; i++)
        {
            if (areacode[i] <= code && code < areacode[i + 1])
                return ((char)('A' + i)).ToString();
        }

        return "?";
    } 

    但是这个函数出错的概率也很高, 例如"闫""窦""圳" 等都无法识别, 追查了一下原因, 发现原来对GB2312编码来说, 存放规定是这样的:

    01-09区为特殊符号。 

    16-55区为一级汉字,按拼音排序。 
    56-87区为二级汉字,按部首/笔画排序。
    每个汉字及符号以两个字节来表示。第一个字节称为“高位字节”,第二个字节称为“低位字节”。
    “高位字节”使用了0xA1-0xF7(把01-87区的区号加上0xA0),“低位字节”使用了0xA1-0xFE(把01-94加上0xA0)。
    例如“啊”字在大多数程序中,会以0xB0A1储存。(与区位码对比:0xB0=0xA0+16,0xA1=0xA0+1)


    上述几个字位置码都大于55290, 显然是二级汉字, 这个算法就处理不了了, 换言之, 这种写法只能用于处理一级汉字. 这当然是不可接受的. 

    后来翻查良久, 终于找到一个用C++写的算法, 可以同时处理一级汉字和二级汉字(http://download.csdn.net/detail/ronjay/1955072), 我把它改写成了C#, 代码如下: 

            public class ChineseToPinYin
            {
                #region " 全局变量 "

                private static string[] _regionChar = new string[32]
                {
                    "CJWGNSPGCGNESYPBTYYZDXYKYGTDJNNJQMBSGZSCYJSYYQPGKBZGYCYWJKGKLJSWKPJQHYTWDDZLSGMRYPYWWCCKZNKYDG",
                    "TTNGJEYKKZYTCJNMCYLQLYPYQFQRPZSLWBTGKJFYXJWZLTBNCXJJJJZXDTTSQZYCDXXHGCKBPHFFSSWYBGMXLPBYLLLHLX",
                    "SPZMYJHSOJNGHDZQYKLGJHSGQZHXQGKEZZWYSCSCJXYEYXADZPMDSSMZJZQJYZCDJZWQJBDZBXGZNZCPWHKXHQKMWFBPBY",
                    "DTJZZKQHYLYGXFPTYJYYZPSZLFCHMQSHGMXXSXJJSDCSBBQBEFSJYHWWGZKPYLQBGLDLCCTNMAYDDKSSNGYCSGXLYZAYBN",
                    "PTSDKDYLHGYMYLCXPYCJNDQJWXQXFYYFJLEJBZRXCCQWQQSBNKYMGPLBMJRQCFLNYMYQMSQTRBCJTHZTQFRXQHXMJJCJLX",
                    "QGJMSHZKBSWYEMYLTXFSYDSGLYCJQXSJNQBSCTYHBFTDCYZDJWYGHQFRXWCKQKXEBPTLPXJZSRMEBWHJLBJSLYYSMDXLCL",
                    "QKXLHXJRZJMFQHXHWYWSBHTRXXGLHQHFNMNYKLDYXZPWLGGTMTCFPAJJZYLJTYANJGBJPLQGDZYQYAXBKYSECJSZNSLYZH",
                    "ZXLZCGHPXZHZNYTDSBCJKDLZAYFMYDLEBBGQYZKXGLDNDNYSKJSHDLYXBCGHXYPKDJMMZNGMMCLGWZSZXZJFZNMLZZTHCS",
                    "YDBDLLSCDDNLKJYKJSYCJLKOHQASDKNHCSGANHDAASHTCPLCPQYBSDMPJLPCJOQLCDHJJYSPRCHNWJNLHLYYQYYWZPTCZG",
                    "WWMZFFJQQQQYXACLBHKDJXDGMMYDJXZLLSYGXGKJRYWZWYCLZMSSJZLDBYDCFCXYHLXCHYZJQSFQAGMNYXPFRKSSBJLYXY",
                    "SYGLNSCMHCWWMNZJJLXXHCHSYDSTTXRYCYXBYHCSMXJSZNPWGPXXTAYBGAJCXLYSDCCWZOCWKCCSBNHCPDYZNFCYYTYCKX",
                    "KYBSQKKYTQQXFCWCHCYKELZQBSQYJQCCLMTHSYWHMKTLKJLYCXWHEQQHTQHZPQSQSCFYMMDMGBWHWLGSSLYSDLMLXPTHMJ",
                    "HWLJZYHZJXHTXJLHXRSWLWZJCBXMHZQXSDZPMGFCSGLSXYMJSHXPJXWMYQKSMYPLRTHBXFTPMHYXLCHLHLZYLXGSSSSTCL",
                    "SLDCLRPBHZHXYYFHBBGDMYCNQQWLQHJJZYWJZYEJJDHPBLQXTQKWHLCHQXAGTLXLJXMSLXHTZKZJECXJCJNMFBYCSFYWYB",
                    "JZGNYSDZSQYRSLJPCLPWXSDWEJBJCBCNAYTWGMPAPCLYQPCLZXSBNMSGGFNZJJBZSFZYNDXHPLQKZCZWALSBCCJXJYZGWK",
                    "YPSGXFZFCDKHJGXDLQFSGDSLQWZKXTMHSBGZMJZRGLYJBPMLMSXLZJQQHZYJCZYDJWBMJKLDDPMJEGXYHYLXHLQYQHKYCW",
                    "CJMYYXNATJHYCCXZPCQLBZWWYTWBQCMLPMYRJCCCXFPZNZZLJPLXXYZTZLGDLDCKLYRZZGQTGJHHHJLJAXFGFJZSLCFDQZ",
                    "LCLGJDJCSNCLLJPJQDCCLCJXMYZFTSXGCGSBRZXJQQCTZHGYQTJQQLZXJYLYLBCYAMCSTYLPDJBYREGKLZYZHLYSZQLZNW",
                    "CZCLLWJQJJJKDGJZOLBBZPPGLGHTGZXYGHZMYCNQSYCYHBHGXKAMTXYXNBSKYZZGJZLQJDFCJXDYGJQJJPMGWGJJJPKQSB",
                    "GBMMCJSSCLPQPDXCDYYKYFCJDDYYGYWRHJRTGZNYQLDKLJSZZGZQZJGDYKSHPZMTLCPWNJAFYZDJCNMWESCYGLBTZCGMSS",
                    "LLYXQSXSBSJSBBSGGHFJLWPMZJNLYYWDQSHZXTYYWHMCYHYWDBXBTLMSYYYFSXJCSDXXLHJHFSSXZQHFZMZCZTQCXZXRTT",
                    "DJHNNYZQQMNQDMMGYYDXMJGDHCDYZBFFALLZTDLTFXMXQZDNGWQDBDCZJDXBZGSQQDDJCMBKZFFXMKDMDSYYSZCMLJDSYN",
                    "SPRSKMKMPCKLGDBQTFZSWTFGGLYPLLJZHGJJGYPZLTCSMCNBTJBQFKTHBYZGKPBBYMTTSSXTBNPDKLEYCJNYCDYKZDDHQH",
                    "SDZSCTARLLTKZLGECLLKJLQJAQNBDKKGHPJTZQKSECSHALQFMMGJNLYJBBTMLYZXDCJPLDLPCQDHZYCBZSCZBZMSLJFLKR",
                    "ZJSNFRGJHXPDHYJYBZGDLQCSEZGXLBLGYXTWMABCHECMWYJYZLLJJYHLGBDJLSLYGKDZPZXJYYZLWCXSZFGWYYDLYHCLJS",
                    "CMBJHBLYZLYCBLYDPDQYSXQZBYTDKYXJYYCNRJMPDJGKLCLJBCTBJDDBBLBLCZQRPPXJCGLZCSHLTOLJNMDDDLNGKAQHQH",
                    "JGYKHEZNMSHRPHQQJCHGMFPRXHJGDYCHGHLYRZQLCYQJNZSQTKQJYMSZSWLCFQQQXYFGGYPTQWLMCRNFKKFSYYLQBMQAMM",
                    "MYXCTPSHCPTXXZZSMPHPSHMCLMLDQFYQXSZYJDJJZZHQPDSZGLSTJBCKBXYQZJSGPSXQZQZRQTBDKYXZKHHGFLBCSMDLDG",
                    "DZDBLZYYCXNNCSYBZBFGLZZXSWMSCCMQNJQSBDQSJTXXMBLTXZCLZSHZCXRQJGJYLXZFJPHYMZQQYDFQJJLZZNZJCDGZYG",
                    "CTXMZYSCTLKPHTXHTLBJXJLXSCDQXCBBTJFQZFSLTJBTKQBXXJJLJCHCZDBZJDCZJDCPRNPQCJPFCZLCLZXZDMXMPHJSGZ",
                    "GSZZQJYLWTJPFSYASMCJBTZKYCWMYTCSJJLJCQLWZMALBXYFBPNLSFHTGJWEJJXXGLLJSTGSHJQLZFKCGNNDSZFDEQFHBS",
                    "AQTGLLBXMMYGSZLDYDQMJJRGBJTKGDHGKBLQKBDMBYLXWCXYTTYBKMRTJZXQJBHLMHMJJZMQASLDCYXYQDLQCAFYWYXQHZ"
                };
                private static System.Text.Encoding _encoding = System.Text.Encoding.GetEncoding("GB2312");

                #endregion

                private static bool In(int lp, int hp, int value)
                {
                    return ((value <= hp) && (value >= lp));
                }
                public static char GetFirstChar(string chineseChar)
                {
                    var bytes = _encoding.GetBytes(chineseChar);
                    if (bytes.Length != 2)
                        return chineseChar[0];
                    return GetChar(bytes[0], bytes[1], chineseChar);
                }
                private static char GetChar(byte c1, byte c2, string originChar)
                {
                    var Hi = c1 << 8;
                    var Lo = c2;
                    int n = Hi + Lo;
                    if (n <= 0xD7F9)
                    {
                        if (In(0xB0A10xB0C4, n)) return 'A';
                        if (In(0XB0C50XB2C0, n)) return 'B';
                        if (In(0xB2C10xB4ED, n)) return 'C';
                        if (In(0xB4EE0xB6E9, n)) return 'D';
                        if (In(0xB6EA0xB7A1, n)) return 'E';
                        if (In(0xB7A20xB8C0, n)) return 'F';
                        if (In(0xB8C10xB9FD, n)) return 'G';
                        if (In(0xB9FE0xBBF6, n)) return 'H';
                        if (In(0xBBF70xBFA5, n)) return 'J';
                        if (In(0xBFA60xC0AB, n)) return 'K';
                        if (In(0xC0AC0xC2E7, n)) return 'L';
                        if (In(0xC2E80xC4C2, n)) return 'M';
                        if (In(0xC4C30xC5B5, n)) return 'N';
                        if (In(0xC5B60xC5BD, n)) return 'O';
                        if (In(0xC5BE0xC6D9, n)) return 'P';
                        if (In(0xC6D10xC8BA, n)) return 'Q';
                        if (In(0xC8BB0xC8F5, n)) return 'R';
                        if (In(0xC8F60xCBF9, n)) return 'S';
                        if (In(0xCBFA0xCDD9, n)) return 'T';
                        if (In(0xCDDA0xCEF3, n)) return 'W';
                        if (In(0xCEF40xD1B8, n)) return 'X';
                        if (In(0xD1B90xD4D0, n)) return 'Y';
                        if (In(0xD4D10xD7F9, n)) return 'Z';
                        return originChar[0];
                    }
                    else
                    {
                        var b1 = (c1 & 0x7F) - 0x20 - 56;
                        var b2 = (c2 & 0x7F) - 0x20 - 1;
                        if (b1 >= 0 && b1 <= 31 && b2 >= 0 && b2 <= 93)
                        {
                            return _regionChar[b1][b2];
                        }
                        return originChar[0];
                    }
                }

            } 

     这个算法目前还没有发现哪个汉字会出错. 

    ---------------------------------------------

    作者:夏狼哉
    博客:http://www.cnblogs.com/Moosdau

    如需引用,敬请保留作者信息,谢谢

  • 相关阅读:
    testng失败截图,注解方式调用。
    HttpURLConnection和HttpClient
    JDK中的URLConnection参数详解
    如何做好Web接口测试
    selenium webdriver定位不到元素的五种原因及解决办法
    Selenium(Webdriver)自动化测试常问到的问题解答(转自:潜龙0318)
    常用网址记录
    python set
    python 浅拷贝和深拷贝
    python 元组
  • 原文地址:https://www.cnblogs.com/Moosdau/p/2277727.html
Copyright © 2020-2023  润新知