• C#多线程词频统计修改


    上一篇真是让大家见笑了,我今天才真正明白了“多线程”的意义。

    今天拿到了真正的测试数据,大约380MB左右,用我的双线程大概能跑到16s左右。

    但是惊闻有同学跑到了6s!

    遂不甘心啊,果断继续修改之。

    然后发现了多线程真正的奥义————建多个dictionary分别统计,最后再merge。

    实际上索引是整个程序最耗时的地方,由于单词种类多达200k,所以如此庞大的一个dictionary每次的索引和更新是非常费时间的。

    所以在新的多线程程序中,建了一个线程数组,每个线程负责各自的一个dictionary,

    等每个线程都跑完之后,再进行Merge。

    这样整个程序的速度就上去了。

    380MB大约7s时间。

    代码见下:

      1     class Program
      2     {
      3         const int ThreadNum = 7;
      4         static ConcurrentDictionary<string, int>[] result = new ConcurrentDictionary<string, int>[ThreadNum];
      5         static ConcurrentDictionary<string, int> ResultMerge;
      6         static int[] tablet = new int[128];
      7         static BlockingCollection<string> queue;
      8         static Thread[] WorkerTh = new Thread[ThreadNum];
      9         static Thread FileIOth;
     10         static Semaphore sem = new Semaphore(0, ThreadNum);
     11         static SemaphoreSlim semslim = new SemaphoreSlim(0, ThreadNum);
     12         static void Main(string[] args)
     13         {
     14             if (args.Length != 3)
     15             {
     16                 Console.WriteLine("Command Line format: WFC rootdir N filePattern");
     17                 Console.WriteLine("WFC: executable file name");
     18                 Console.WriteLine("rootdir: the root directory of input files");
     19                 Console.WriteLine("N: the count of output words");
     20                 Console.WriteLine("filepattern: the name pattern of files to be scanned. For example, *.txt");
     21                 return;
     22             }
     23             DateTime dt = DateTime.Now;
     24             string rootdir = args[0];
     25             int N = Convert.ToInt32(args[1]);
     26             string filePattern = args[2];
     27             if (!Directory.Exists(rootdir)) 
     28             {
     29                 Console.WriteLine("The path "+ rootdir+" doesn't exist.");
     30                 return;
     31             }
     32             string[] files = Directory.GetFiles(rootdir, filePattern, SearchOption.AllDirectories);
     33             if(files.Length==0)
     34             {
     35                 Console.WriteLine("Can not find any "+filePattern+ " pattern's file.");
     36                 return;
     37             }
     38 
     39             for (int i = 'a'; i <= 'z'; i++)
     40             {
     41                 tablet[i] = 1;
     42             }
     43             for (int i = 'A'; i <= 'Z'; i++)
     44             {
     45                 tablet[i] = 1;
     46             }
     47 
     48             //ManagementClass m = new ManagementClass(new ManagementPath( "Win32_Processor"));
     49             //ManagementObjectCollection moc = m.GetInstances();
     50             //int CPUNum = 0;
     51 
     52             //string NumOfCore="";
     53             //foreach (ManagementObject mo in moc)
     54             //{
     55             //    PropertyDataCollection properties = mo.Properties;
     56             //    NumOfCore += properties["NumberOfCores"].Value;
     57             //}
     58             //CPUNum = Convert.ToInt32(NumOfCore);
     59 
     60 
     61 
     62             queue = new BlockingCollection<string>(100);
     63             FileIOth = new Thread(delegate() { Read(files); });
     64             FileIOth.Start();
     65 
     66             ResultMerge = new ConcurrentDictionary<string, int>(1, 8 * N);
     67 
     68             for (int i = 0; i < result.Length; i++)
     69             {
     70                 result[i] = new ConcurrentDictionary<string, int>(1, 20000);
     71             }
     72 
     73 
     74             int index = -1;
     75             for (int i = 0; i < result.Length - 1; i++)
     76             {
     77                 WorkerTh[i] = new Thread(delegate()
     78                 {
     79                     index++;
     80                     Process(index);
     81                     semslim.Release(1);
     82                 });
     83                 WorkerTh[i].Start();
     84             }
     85 
     86             WorkerTh[result.Length - 1] = new Thread(delegate()
     87             {
     88                
     89                 Process(result.Length - 1);
     90                 semslim.Release(1);
     91                 while (true) 
     92                 {
     93                     if (semslim.CurrentCount == ThreadNum)
     94                         break;
     95                 }
     96                 int count = 0;
     97                 for (int i = 0; i < result.Length; i++)
     98                 {
     99                     count = 0;
    100                     foreach (var item in result[i].OrderByDescending(k=>k.Value)) 
    101                     {
    102                         ResultMerge.AddOrUpdate(item.Key, item.Value, (k, v) => v + item.Value);
    103                         count++;
    104                         if (count > 5 * N)
    105                             break;
    106                     }
    107                 }
    108                 count = 0;
    109                 var FinaloutputResult = from KVP in ResultMerge
    110                                         orderby KVP.Value descending
    111                                         select new StringBuilder(KVP.Key).Append(" ").Append(KVP.Value);
    112                 foreach (var str in FinaloutputResult)
    113                 {
    114                     Console.WriteLine(str);
    115                     count++;
    116                     if (count > N - 1) break;
    117                 }
    118                 DateTime ot = DateTime.Now;
    119                 Console.WriteLine("Time: " + ((ot.Minute * 60 + ot.Second) * 1000 + ot.Millisecond - (dt.Minute * 60 + dt.Second) * 1000 - dt.Millisecond) + "ms");
    120                 //Console.ReadKey();
    121 
    122             });
    123             WorkerTh[result.Length - 1].Start();
    124 
    125         }
    126 
    127         public static void Read(string[] files)
    128         {
    129             foreach (string file in files)
    130             {
    131                 queue.TryAdd(ReadFile(file), -1);
    132             }
    133             queue.TryAdd("END", -1);
    134         }
    135 
    136         public static string ReadFile(string file)
    137         {
    138             string readLine;
    139             FileStream fs = new FileStream(file, FileMode.Open);
    140             StreamReader sr = new StreamReader(fs);
    141             readLine = sr.ReadToEnd();
    142             sr.Close();
    143             fs.Close();
    144             return readLine;
    145         }
    146 
    147         public static void Process(int index)
    148         {
    149             string readLine;
    150             while (true)
    151             {
    152                 queue.TryTake(out readLine, -1);
    153                 if (readLine == "END")
    154                 {
    155                     queue.TryAdd("END", -1);
    156                     break;
    157                 }
    158                 Compute(readLine, index);
    159             }
    160         }
    161 
    162         public static void Compute(string readLine, int index)
    163         {
    164             StringBuilder sb = new StringBuilder(100);
    165             string strKey = "";
    166             int state = 0;
    167             for (int i = 0; i < readLine.Length; i++)
    168             {
    169                 switch (state)
    170                 {
    171                     case 0:
    172                         if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) != 0)
    173                         {
    174                             sb.Clear();
    175                             sb.Append(readLine[i]);
    176                         }
    177                         break;
    178                     default:
    179                         if ((state = (readLine[i] > 'z') ? 0 : tablet[readLine[i]]) == 0)
    180                         {
    181                             if (sb.Length >= 1)
    182                             {
    183                                 strKey = ToLower(sb).ToString();
    184                                 //if (!result[index].TryGetValue(strKey, out value))
    185                                 //{
    186                                 //    result[index].TryAdd(strKey, 1);
    187                                 //}
    188                                 //else
    189                                 //{
    190                                 //    result[index][strKey]++;
    191                                 //}
    192                                 result[index].AddOrUpdate(strKey, 1, (k, v) => v + 1);
    193                             }
    194                         }
    195                         else
    196                             sb.Append(readLine[i]);
    197                         break;
    198                 }
    199             }
    200         }
    201 
    202         public static StringBuilder ToLower(StringBuilder str)
    203         {
    204             for (int i = 0; i < str.Length; i++)
    205             {
    206                 if (str[i] <= 'Z')
    207                 {
    208                     str[i] = (char)((int)str[i] + 32);
    209                 }
    210             }
    211             return str;
    212         }
    View Code

    我的电脑是i7-2600, 4核8线程。本来还想根据cpu的个数来动态地调整线程池大小,后来发现读取cpu参数的那段代码就要耗时1s,遂果断放弃。

    在多次测试后发现7,8个线程的表现均良好。

    另外wencong大神用的Linq自动并行化神马神马的方法可以跑进6s。

    还有惊闻guojia居然只用了1.2s。。。。。。。。。。

  • 相关阅读:
    lintcode:Flip Bits 将整数A转换为B
    lintcode:strStr 字符串查找
    lintcode:Subtree 子树
    lintcode 容易题:Partition Array by Odd and Even 奇偶分割数组
    lintcode:在二叉查找树中插入节点
    lintcode:在O(1)时间复杂度删除链表节点
    lintcode:哈希函数
    lintcode:合并排序数组 II
    lintcode:合并排序数组
    lintcode:数飞机
  • 原文地址:https://www.cnblogs.com/RheetZ/p/3369311.html
Copyright © 2020-2023  润新知