• C#多线程词频统计


    此乃软件工程课的个人项目,小弟入门水平请轻拍。。

    多谢anran大神指点@wanganran http://www.cnblogs.com/wanganran/archive/2012/09/25/2703179.html 

    以及参考了一个C#的多线程入门的文章 http://www.cnblogs.com/icuit/archive/2012/12/05/2800941.html

    各位看官想要最终代码可直接跳转到最后。。

    10.08:初步考虑用多线程(双线程,一个IO,一个compute),先快速写出单线程最最最简单的版本

     // Single-threaded word-frequency counter (body of Main).
     // args[0]: directory to scan, args[1]: how many of the most frequent words to print,
     // args[2]: file-name search pattern handed to Directory.GetFiles.
     string rootdir = args[0];
     int N = Convert.ToInt32(args[1]);
     string filePattern = args[2];

     string[] files = Directory.GetFiles(rootdir, filePattern);

     // tablet[c] == 1 exactly when c is an ASCII letter; every other character splits words.
     int[] tablet = new int[128];
     for (int i = 'a'; i <= 'z'; i++)
     {
         tablet[i] = 1;
     }
     for (int i = 'A'; i <= 'Z'; i++)
     {
         tablet[i] = 1;
     }

     // Scratch buffer for the word currently being scanned.
     StringBuilder sb = new StringBuilder(100);
     // Lower-cased word -> number of occurrences.
     Dictionary<string, int> result = new Dictionary<string, int>(50000);

     foreach (string file in files)
     {
         string text;
         // Open the file for reading only; the using block closes it even if ReadToEnd throws.
         using (StreamReader sr = new StreamReader(file))
         {
             text = sr.ReadToEnd();
         }

         sb.Clear();
         // Run one position past the end so that the end of the file acts as a separator
         // and a word that ends the file is still counted.
         for (int i = 0; i <= text.Length; i++)
         {
             if (i < text.Length && text[i] < 128 && tablet[text[i]] != 0)
             {
                 sb.Append(text[i]);
                 continue;
             }
             if (sb.Length == 0)
             {
                 continue;
             }

             // Culture-invariant lowering keeps the keys independent of the OS locale.
             string strKey = sb.ToString().ToLowerInvariant();
             int value;
             result.TryGetValue(strKey, out value); // value stays 0 when the word is new
             result[strKey] = value + 1;
             sb.Clear();
         }
     }

     // Most frequent first; Take(N) prints nothing when N <= 0.
     var outputResult = (from KVP in result
                         orderby KVP.Value descending
                         select KVP).Take(N);
     foreach (var pair in outputResult)
     {
         Console.WriteLine(pair.Key + " " + pair.Value);
     }

    考虑了简单的优化包括:

    1, StringBuilder.Append

    2, Dictionary 索引复杂度O(1)

    3, 用Dictionary.TryGetValue()代替Dictionary.ContainsKey()

    4,估算单词个数提前开辟内存

    性能分析:

     

    考虑优化String.ToLower(),两种思路:

    1, 自己写tolower函数

    2, 先不tolower,最后再merge

    自己写ToLower()函数(因为前面已经根据字母表对tolower的输入进行了限制,故此处可省略很多判断步骤):

            /// <summary>
            /// Lower-cases the ASCII upper-case letters of <paramref name="str"/> in place.
            /// </summary>
            /// <param name="str">Buffer to convert; it is modified, not copied.</param>
            /// <returns>The same <see cref="StringBuilder"/> instance, to allow chaining.</returns>
            public static StringBuilder ToLower(StringBuilder str)
            {
                for (int i = 0; i < str.Length; i++)
                {
                    char c = str[i];
                    // Only 'A'..'Z' may be shifted; a bare "<= 'Z'" test would also
                    // corrupt digits, punctuation and control characters.
                    if (c >= 'A' && c <= 'Z')
                    {
                        str[i] = (char)(c + 32); // 'a' - 'A' == 32
                    }
                }
                return str;
            }
     

    只需原来1/6的时间

    10.09. 第一项任务是把tolower merge到最后一步完成。

    即统计词频时区分大小写建立dictionary,最后再创建新的dictionary,对每一个key执行tolower后再merge。

    2小时后:遇到了一个问题

    Dictionary<StringBuilder,int> 中的stringbuilder是地址形式存在的,每次Add必须新创建而不能单纯改变其值,否则dictionary.tryGetValue总是true

    但是新创建sb类就会浪费时间,先通过string创建StringBuilder再sb.toString()反而多了一次消耗。

    总的回顾发现区分大小写保存dictionary再merge的思想,增加了创建第一个dictionary时每次的索引时间,且每个单词多增加了一次string到StringBuilder的内存开辟时间,增加了一次遍历每个数组并索引的操作(建立新的dictionary),而收益是从每个单词执行一次tolower到每种单词(区分大小写)tolower一次。

    遂决定暂时放弃这个思路。

    下一步:决定设时间点统计一下io操作和计算操作所用时间比(此为用cmd跑程序,要比在ide中跑时间快,后面有些图是在ide中跑的程序)

    测试数据集为原数据集复制粘贴若干次后,大小约100M的若干txt文件。

    可以看到IO时间只占总时间的约1/8~1/10

    所以目前程序开双线程意义不大,可以先考虑继续优化单线程,或者开多线程处理。

    2小时后:

    感觉单线程的瓶颈在于索引,这点取决于dictionary数据结构,暂时先不考虑优化。

    尝试动手写多线程。

    决定先把单线程代码整理一下。

      // Scratch buffer for the word currently being scanned. It is shared by every
      // Compute call, so this version must stay single-threaded.
      static StringBuilder sb = new StringBuilder(100);
      // Lower-cased word -> number of occurrences.
      static Dictionary<string, int> result = new Dictionary<string, int>(50000);
      // tablet[c] == 1 exactly when c is an ASCII letter; built once at type initialisation
      // so Compute does not depend on Main having run first.
      static int[] tablet = BuildLetterTable();

      /// <summary>
      /// Counts word frequencies in all files matching a pattern and prints the N most
      /// frequent words followed by the time spent on I/O, computation and output.
      /// </summary>
      /// <param name="args">
      /// args[0]: directory, args[1]: number of words to print, args[2]: file search pattern.
      /// </param>
      static void Main(string[] args)
      {
          // Stopwatch measures elapsed time directly; combining DateTime.Now.Second and
          // .Millisecond goes wrong whenever a minute (or hour) boundary is crossed.
          System.Diagnostics.Stopwatch total = System.Diagnostics.Stopwatch.StartNew();
          string rootdir = args[0];
          int N = Convert.ToInt32(args[1]);
          string filePattern = args[2];

          string[] files = Directory.GetFiles(rootdir, filePattern);

          long COMtime = 0;
          long IOtime = 0;
          long OTHtime = 0;
          System.Diagnostics.Stopwatch phase = new System.Diagnostics.Stopwatch();

          foreach (string file in files)
          {
              phase.Restart();
              string readLine = ReadFile(file);
              IOtime += phase.ElapsedMilliseconds;

              phase.Restart();
              Compute(readLine);
              COMtime += phase.ElapsedMilliseconds;
          }

          phase.Restart();
          // Most frequent first; Take(N) prints nothing when N <= 0.
          var outputResult = (from KVP in result
                              orderby KVP.Value descending
                              select KVP).Take(N);
          foreach (var pair in outputResult)
          {
              Console.WriteLine(pair.Key + " " + pair.Value);
          }
          OTHtime = phase.ElapsedMilliseconds;

          Console.WriteLine("IO time:" + IOtime + "ms. Compute time:" + COMtime + "ms. Other time: " + OTHtime + "ms.");
          Console.WriteLine("Time: " + total.ElapsedMilliseconds + "ms");
          Console.ReadKey();
      }

      /// <summary>Builds the 128-entry table that marks ASCII letters with 1.</summary>
      private static int[] BuildLetterTable()
      {
          int[] table = new int[128];
          for (int i = 'a'; i <= 'z'; i++)
          {
              table[i] = 1;
          }
          for (int i = 'A'; i <= 'Z'; i++)
          {
              table[i] = 1;
          }
          return table;
      }

      /// <summary>Reads the whole file as text.</summary>
      /// <param name="file">Path of the file to read.</param>
      /// <returns>The complete content of the file.</returns>
      public static string ReadFile(string file)
      {
          // Opened for reading only; the using block closes the file even if ReadToEnd throws.
          using (StreamReader sr = new StreamReader(file))
          {
              return sr.ReadToEnd();
          }
      }

      /// <summary>
      /// Splits <paramref name="readLine"/> into words (maximal runs of ASCII letters) and
      /// adds each lower-cased word to <c>result</c>.
      /// </summary>
      /// <param name="readLine">Text to scan; every non-letter character is a separator.</param>
      public static void Compute(string readLine)
      {
          sb.Clear();
          // Run one position past the end so that the end of the text acts as a separator
          // and a word that ends the text is still counted.
          for (int i = 0; i <= readLine.Length; i++)
          {
              if (i < readLine.Length && readLine[i] <= 'z' && tablet[readLine[i]] != 0)
              {
                  sb.Append(readLine[i]);
                  continue;
              }
              if (sb.Length == 0)
              {
                  continue;
              }

              string strKey = ToLower(sb).ToString();
              int value;
              result.TryGetValue(strKey, out value); // value stays 0 when the word is new
              result[strKey] = value + 1;
              sb.Clear();
          }
      }

      /// <summary>
      /// Lower-cases the ASCII upper-case letters of <paramref name="str"/> in place.
      /// </summary>
      /// <param name="str">Buffer to convert; it is modified, not copied.</param>
      /// <returns>The same <see cref="StringBuilder"/> instance, to allow chaining.</returns>
      public static StringBuilder ToLower(StringBuilder str)
      {
          for (int i = 0; i < str.Length; i++)
          {
              char c = str[i];
              // Only 'A'..'Z' may be shifted; anything else is left untouched.
              if (c >= 'A' && c <= 'Z')
              {
                  str[i] = (char)(c + 32); // 'a' - 'A' == 32
              }
          }
          return str;
      }

    多线程,参考wang anran大神使用BlockingCollection<T>

    先写两个线程(IO与Compute),共享一个static dictionary<string,int>以及一个static BlockingCollection<string>,预测可以比单线程提高1/10左右用时(因为创建线程也耗时间)

    晚9点:

    双线程的已经写完,果然如预测提升了约1/10左右的时间:

    但是发现了一个bug,单个字母的统计结果与另一位同学的程序的结果不符。

    现在准备来检查一下原因。

    终于找到了,原因在于对于字母的分割的歧义上。

    按照qiufeng老师给出的规定:Treat all non-alphabet characters as split characters.

    所有不在字母表中的字符(包括数字)均视为分隔符。例如:a3b应视为a和b两个单词进行统计。

    明天决定进行多个线程协同统计的编写。难点在于如何判别多个线程都已处理完毕。

    10.10 准备写一个线程IO,两个线程compute

    先把双线程的代码贴出来备份(奇怪的是今天的双线程跑不出昨天的速度了。。忘记改了哪里。。)

      /// <summary>
      /// Two-thread word-frequency counter: one thread reads files into a bounded queue,
      /// a second thread counts the words and prints the N most frequent ones.
      /// </summary>
      class Program
      {
          // Lower-cased word -> number of occurrences. Only the worker thread touches it.
          static Dictionary<string, int> result = new Dictionary<string, int>(50000);
          // tablet[c] == 1 exactly when c is an ASCII letter.
          static int[] tablet = BuildLetterTable();
          // File contents handed from the reader thread to the worker; holds at most 100 files.
          static BlockingCollection<string> queue = new BlockingCollection<string>(100);

          /// <summary>Starts the reader and the worker thread.</summary>
          /// <param name="args">
          /// args[0]: directory, args[1]: number of words to print, args[2]: file search pattern.
          /// </param>
          static void Main(string[] args)
          {
              // Stopwatch is immune to the minute/hour wrap-around of DateTime field arithmetic.
              System.Diagnostics.Stopwatch total = System.Diagnostics.Stopwatch.StartNew();
              string rootdir = args[0];
              int N = Convert.ToInt32(args[1]);
              string filePattern = args[2];
              string[] files = Directory.GetFiles(rootdir, filePattern);

              Thread FileIOth = new Thread(delegate()
              {
                  Read(files);
              });
              FileIOth.Start();

              Thread WorkerTh = new Thread(delegate()
              {
                  Process();

                  // Most frequent first; Take(N) prints nothing when N <= 0.
                  var outputResult = (from KVP in result
                                      orderby KVP.Value descending
                                      select KVP).Take(N);
                  foreach (var pair in outputResult)
                  {
                      Console.WriteLine(pair.Key + " " + pair.Value);
                  }
                  Console.WriteLine("Time: " + total.ElapsedMilliseconds + "ms");
                  Console.ReadKey();
              });
              WorkerTh.Start();
          }

          /// <summary>Builds the 128-entry table that marks ASCII letters with 1.</summary>
          private static int[] BuildLetterTable()
          {
              int[] table = new int[128];
              for (int i = 'a'; i <= 'z'; i++)
              {
                  table[i] = 1;
              }
              for (int i = 'A'; i <= 'Z'; i++)
              {
                  table[i] = 1;
              }
              return table;
          }

          /// <summary>
          /// Reads every file and enqueues its content, then marks the queue as complete.
          /// </summary>
          /// <param name="files">Paths of the files to read.</param>
          public static void Read(string[] files)
          {
              try
              {
                  foreach (string file in files)
                  {
                      queue.Add(ReadFile(file)); // blocks while the queue is full
                  }
              }
              finally
              {
                  // Signal completion out-of-band instead of enqueuing an "END" string,
                  // which a file with exactly that content could be mistaken for.
                  queue.CompleteAdding();
              }
          }

          /// <summary>Reads the whole file as text.</summary>
          /// <param name="file">Path of the file to read.</param>
          /// <returns>The complete content of the file.</returns>
          public static string ReadFile(string file)
          {
              // Opened for reading only; the using block closes the file even if ReadToEnd throws.
              using (StreamReader sr = new StreamReader(file))
              {
                  return sr.ReadToEnd();
              }
          }

          /// <summary>
          /// Counts the words of every queued text until the reader has finished and the
          /// queue is empty.
          /// </summary>
          public static void Process()
          {
              foreach (string readLine in queue.GetConsumingEnumerable())
              {
                  Compute(readLine);
              }
          }

          /// <summary>
          /// Splits <paramref name="readLine"/> into words (maximal runs of ASCII letters) and
          /// adds each lower-cased word to <c>result</c>.
          /// </summary>
          /// <param name="readLine">Text to scan; every non-letter character is a separator.</param>
          public static void Compute(string readLine)
          {
              StringBuilder sb = new StringBuilder(100);
              // Run one position past the end so that the end of the text acts as a separator
              // and a word that ends the text is still counted.
              for (int i = 0; i <= readLine.Length; i++)
              {
                  if (i < readLine.Length && readLine[i] <= 'z' && tablet[readLine[i]] != 0)
                  {
                      sb.Append(readLine[i]);
                      continue;
                  }
                  if (sb.Length == 0)
                  {
                      continue;
                  }

                  string strKey = ToLower(sb).ToString();
                  int value;
                  result.TryGetValue(strKey, out value); // value stays 0 when the word is new
                  result[strKey] = value + 1;
                  sb.Clear();
              }
          }

          /// <summary>
          /// Lower-cases the ASCII upper-case letters of <paramref name="str"/> in place.
          /// </summary>
          /// <param name="str">Buffer to convert; it is modified, not copied.</param>
          /// <returns>The same <see cref="StringBuilder"/> instance, to allow chaining.</returns>
          public static StringBuilder ToLower(StringBuilder str)
          {
              for (int i = 0; i < str.Length; i++)
              {
                  char c = str[i];
                  // Only 'A'..'Z' may be shifted; anything else is left untouched.
                  if (c >= 'A' && c <= 'Z')
                  {
                      str[i] = (char)(c + 32); // 'a' - 'A' == 32
                  }
              }
              return str;
          }
      }

    多线程:两个compute一个io

    多线程反而更慢。。。寻找原因。

    请教了anran大神,大神表示lock非常占时间。。建议更换线程安全的dictionary。

    果然。。速度提升了20%。。(先去吃饭,回来再写)

    测试了4线程5线程…8线程

    发现1个主线程1个io线程3个compute线程的速度最快

      /// <summary>
      /// Multi-threaded word-frequency counter: one thread reads files into a bounded queue,
      /// <c>WorkerCount</c> threads count words into a shared concurrent dictionary, and the
      /// main thread prints the N most frequent words once every worker has finished.
      /// </summary>
      class Program
      {
          // Number of counting threads.
          const int WorkerCount = 3;
          // Lower-cased word -> number of occurrences. The concurrency level matches the
          // number of threads that update the dictionary at the same time.
          static ConcurrentDictionary<string, int> result = new ConcurrentDictionary<string, int>(WorkerCount, 50000);
          // tablet[c] == 1 exactly when c is an ASCII letter.
          static int[] tablet = BuildLetterTable();
          // File contents handed from the reader thread to the workers; holds at most 100 files.
          static BlockingCollection<string> queue = new BlockingCollection<string>(100);

          /// <summary>Starts the reader and worker threads, waits for them and prints the result.</summary>
          /// <param name="args">
          /// args[0]: directory, args[1]: number of words to print, args[2]: file search pattern.
          /// </param>
          static void Main(string[] args)
          {
              // Stopwatch is immune to the minute/hour wrap-around of DateTime field arithmetic.
              System.Diagnostics.Stopwatch total = System.Diagnostics.Stopwatch.StartNew();
              string rootdir = args[0];
              int N = Convert.ToInt32(args[1]);
              string filePattern = args[2];

              string[] files = Directory.GetFiles(rootdir, filePattern);

              Thread FileIOth = new Thread(delegate() { Read(files); });
              FileIOth.Start();

              Thread[] workers = new Thread[WorkerCount];
              for (int i = 0; i < workers.Length; i++)
              {
                  workers[i] = new Thread(delegate() { Process(); });
                  workers[i].Start();
              }

              // Wait for every worker: printing as soon as one worker runs out of input
              // would miss the words other workers are still counting.
              foreach (Thread worker in workers)
              {
                  worker.Join();
              }

              // Most frequent first; Take(N) prints nothing when N <= 0.
              var outputResult = (from KVP in result
                                  orderby KVP.Value descending
                                  select KVP).Take(N);
              foreach (var pair in outputResult)
              {
                  Console.WriteLine(pair.Key + " " + pair.Value);
              }
              Console.WriteLine("Time: " + total.ElapsedMilliseconds + "ms");
          }

          /// <summary>Builds the 128-entry table that marks ASCII letters with 1.</summary>
          private static int[] BuildLetterTable()
          {
              int[] table = new int[128];
              for (int i = 'a'; i <= 'z'; i++)
              {
                  table[i] = 1;
              }
              for (int i = 'A'; i <= 'Z'; i++)
              {
                  table[i] = 1;
              }
              return table;
          }

          /// <summary>
          /// Reads every file and enqueues its content, then marks the queue as complete.
          /// </summary>
          /// <param name="files">Paths of the files to read.</param>
          public static void Read(string[] files)
          {
              try
              {
                  foreach (string file in files)
                  {
                      queue.Add(ReadFile(file)); // blocks while the queue is full
                  }
              }
              finally
              {
                  // Signal completion out-of-band instead of enqueuing an "END" string,
                  // which a file with exactly that content could be mistaken for.
                  queue.CompleteAdding();
              }
          }

          /// <summary>Reads the whole file as text.</summary>
          /// <param name="file">Path of the file to read.</param>
          /// <returns>The complete content of the file.</returns>
          public static string ReadFile(string file)
          {
              // Opened for reading only; the using block closes the file even if ReadToEnd throws.
              using (StreamReader sr = new StreamReader(file))
              {
                  return sr.ReadToEnd();
              }
          }

          /// <summary>
          /// Counts the words of queued texts until the reader has finished and the queue is
          /// empty. Several threads may run this method at the same time.
          /// </summary>
          public static void Process()
          {
              foreach (string readLine in queue.GetConsumingEnumerable())
              {
                  Compute(readLine);
              }
          }

          /// <summary>
          /// Splits <paramref name="readLine"/> into words (maximal runs of ASCII letters) and
          /// adds each lower-cased word to <c>result</c>.
          /// </summary>
          /// <param name="readLine">Text to scan; every non-letter character is a separator.</param>
          public static void Compute(string readLine)
          {
              // Local buffer so concurrent Compute calls do not share scanning state.
              StringBuilder sb = new StringBuilder(100);
              // Run one position past the end so that the end of the text acts as a separator
              // and a word that ends the text is still counted.
              for (int i = 0; i <= readLine.Length; i++)
              {
                  if (i < readLine.Length && readLine[i] <= 'z' && tablet[readLine[i]] != 0)
                  {
                      sb.Append(readLine[i]);
                      continue;
                  }
                  if (sb.Length == 0)
                  {
                      continue;
                  }

                  string strKey = ToLower(sb).ToString();
                  result.AddOrUpdate(strKey, 1, (k, v) => v + 1);
                  sb.Clear();
              }
          }

          /// <summary>
          /// Lower-cases the ASCII upper-case letters of <paramref name="str"/> in place.
          /// </summary>
          /// <param name="str">Buffer to convert; it is modified, not copied.</param>
          /// <returns>The same <see cref="StringBuilder"/> instance, to allow chaining.</returns>
          public static StringBuilder ToLower(StringBuilder str)
          {
              for (int i = 0; i < str.Length; i++)
              {
                  char c = str[i];
                  // Only 'A'..'Z' may be shifted; anything else is left untouched.
                  if (c >= 'A' && c <= 'Z')
                  {
                      str[i] = (char)(c + 32); // 'a' - 'A' == 32
                  }
              }
              return str;
          }
      }

    决定等真实数据来了再测试一下效果。

  • 相关阅读:
    UML箭头含义整理
    协变返回类型
    Thymeleaf取出model中的数据
    宝塔面板中的mysql查看密码问题
    宝塔面板如何登录
    mysql查看数据库、表的基本语句
    springboot拦截器实例
    Thymeleaf中的fragments学习
    食物链
    银河英雄传说
  • 原文地址:https://www.cnblogs.com/RheetZ/p/3361129.html
Copyright © 2020-2023  润新知