上一节讲了基数排序(Radix Sort),这一节介绍桶排序(Bucket Sort or Bin Sort)。和基数排序一样,桶排序也是一种分配式排序(distribution sort),即通过把元素分配到不同的桶中来完成排序,而不是通过元素之间的两两比较。
桶排序(Bucket Sort)的基本思想
- 将待排对象序列按照一定hash算法分发到N个桶中
- 对每一个桶的待排对象进行排序
- 遍历N个桶,收集所有非空桶里的有序对象(子序列)组成一个统一的有序对象序列
在每一个桶中,如果采用链式存储的话,前两步(分发和桶内排序)可以合并在一起操作,也就是在分发的过程中保证每一个桶里的对象是桶内有序的。
例如: 设有5个桶, 待排对象序列为 {29, 25, 3, 49, 9, 37, 21, 43}
1. 分发(scatter) (注:图片来源戳这里)
2. 桶内排序(sort)
3. 收集(gather)
从上面的3张图中,我们可以很直观地了解桶排序的过程。在观看了动画Bucket Sort后,我决定采用动画中给出的hash算法和对每一个桶采用单链表存储结构给出C代码实现。动画中给出的hash算法如下:
Linked List Array index = Value * NUMBER_OF_ELEMENTS/(MAXIMUM_ARRAY_VALUE + 1)
e.g. (348 * 30)/1000 = 10
(15 * 30)/1000 = 0
Note that NUMBER_OF_ELEMENTS is the number of buckets, which is 30.
桶排序的C代码实现
1. 基本排序原理介绍
/*
 * Bucket Sort
 *
 * Bucket sort (or bin sort) is a sorting algorithm that works by
 * distributing the elements of an array into a number of buckets.
 * Each bucket is then sorted individually, either using a different
 * sorting algorithm, or by recursively applying the bucket sorting
 * algorithm.
 *
 * Typically, bucket sort works as follows:
 * 1. Set up an array of initially empty "buckets"
 * 2. Scatter: go over the original array, putting each object in
 *             its bucket
 * 3. Sort each non-empty bucket
 * 4. Gather : visit the buckets in order and put all elements back
 *             into the original array
 *
 * Note that step#2 and step#3 are merged into one step since we use
 * a singly linked list per bucket for better performance. Right
 * here we just use the insertion sorting algorithm to initialize each
 * singly linked list.
 *
 * In addition, we define N(=10) buckets, and use the following hash
 * algorithm:
 * a) get max number of a[] as MAX
 * b) get width of the max number (i.e. MAX) as WIDTH
 *    e.g. MAX = 9,   WIDTH = 1;
 *         MAX = 99,  WIDTH = 2;
 *         MAX = 999, WIDTH = 3;
 * c) index = a[i] * N / (10 ** WIDTH)
 * then we can dispatch a[i] to bucket[index]
 */
2. 单链表定义及基本操作
/* One node of a singly linked, ascending-sorted bucket list. */
typedef struct list_s {
	int data;		/* the key stored in this node */
	struct list_s *next;	/* next node, or NULL at the tail */
} list_t;

/*
 * Insert `node` into the ascending-sorted list `*head`.
 *
 * head: address of the list head pointer; *head may be NULL (empty list)
 *       and is updated when the node becomes the new first element.
 * node: the node to link in; its `next` field is overwritten.
 *
 * The node is placed AFTER every existing node whose key is <= its own,
 * so nodes with equal keys keep their insertion order (stable insertion).
 * A NULL `head` or `node` is ignored.
 */
static void
list_init(list_t **head, list_t *node)
{
	if (head == NULL || node == NULL)
		return;

	/*
	 * Walk the chain of `next` links (starting with the head pointer
	 * itself) until we find the link that should point at the new node.
	 * Working on the link instead of the node removes the special cases
	 * for "insert at head" and "append at tail".
	 */
	list_t **link = head;
	while (*link != NULL && (*link)->data <= node->data)
		link = &(*link)->next;

	node->next = *link;	/* NULL when appending at the tail */
	*link = node;
}

/*
 * Print every key of the list on one line, separated by spaces and
 * terminated by a newline. An empty list prints nothing at all.
 */
static void
list_show(list_t *head)
{
	if (head == NULL)
		return;

	for (list_t *p = head; p != NULL; p = p->next)
		printf("%d ", p->data);
	printf("\n");
}

/*
 * Free every node of the list. The nodes must have been obtained from
 * malloc(); the caller's head pointer is not modified and must not be
 * used afterwards. A NULL list is a no-op.
 */
static void
list_fini(list_t *head)
{
	while (head != NULL) {
		list_t *next = head->next;	/* read before the node is freed */
		free(head);
		head = next;
	}
}
3. 核心步骤之一:分发scatter()
1 /* 2 * Get width of a number 3 * e.g. 4 * for i in [ 0 .. 9 ] // width = 1 5 * for i in [ 10 .. 99 ] // width = 2 6 * for i in [100 .. 999] // width = 3 7 * ... 8 */ 9 static int 10 get_width_of_num(int num) 11 { 12 int w = 1; 13 for (int q = num / 10; q != 0; q /= 10) 14 w++; 15 return w; 16 } 17 18 static int 19 get_hash_base(int a[], size_t n) 20 { 21 /* get max one of a[] */ 22 int max = a[0]; 23 for (int i = 0; i < n; i++) { 24 if (max < a[i]) 25 max = a[i]; 26 } 27 28 /* get hash base which is 10**N, N=1, 2, ... */ 29 int base = 1; 30 for (int i = 0; i < get_width_of_num(max); i++) 31 base *= 10; 32 33 return base; 34 } 35 36 static void 37 scatter(list_t **bucket, size_t m, int a[], size_t n) 38 { 39 int base = get_hash_base(a, n); 40 41 for (int i = 0; i < n; i++) { 42 /* 1. new a node for a[i] */ 43 list_t *nodep = NULL; 44 nodep = (list_t *)malloc(sizeof (list_t)); 45 if (nodep == NULL) /* error: failed to malloc */ 46 return; 47 48 nodep->data = a[i]; 49 nodep->next = NULL; 50 51 /* 2. dispatch the new node to bucket[j] */ 52 int j = a[i] * m / base; 53 list_init(&(bucket[j]), nodep); 54 } 55 }
4. 核心步骤之二:收集gather()
1 static void 2 gather(list_t **bucket, size_t m, int a[], size_t n) 3 { 4 int k = 0; 5 for (int i = 0; i < m; i++) { 6 if (bucket[i] == NULL) 7 continue; 8 9 for (list_t *p = bucket[i]; p != NULL; p = p->next) { 10 a[k++] = p->data; 11 12 if (k >= n) /* overflow */ 13 break; 14 } 15 16 list_fini(bucket[i]); 17 } 18 }
5. 桶排序bucketsort()
1 void 2 bucketsort(int a[], size_t n) 3 { 4 /* alloc bucket[] */ 5 #define BUCKET_NUM 10 6 list_t **bucket = (list_t **)malloc(sizeof (list_t *) * BUCKET_NUM); 7 if (bucket == NULL) /* error: failed to malloc */ 8 return; 9 for (int i = 0; i < BUCKET_NUM; i++) 10 bucket[i] = NULL; 11 12 /* scatter elements in a[] to bucket[] */ 13 scatter(bucket, BUCKET_NUM, a, n); 14 15 /* gather a[] by walking bucket[] */ 16 gather(bucket, BUCKET_NUM, a, n); 17 18 free(bucket); 19 }
6. 完整的C代码
o bucketsort.c (或访问这里)
/*
 * Bucket Sort
 *
 * Bucket sort (or bin sort) is a sorting algorithm that works by
 * distributing the elements of an array into a number of buckets.
 * Each bucket is then sorted individually, either using a different
 * sorting algorithm, or by recursively applying the bucket sorting
 * algorithm.
 *
 * Typically, bucket sort works as follows:
 * 1. Set up an array of initially empty "buckets"
 * 2. Scatter: go over the original array, putting each object in
 *             its bucket
 * 3. Sort each non-empty bucket
 * 4. Gather : visit the buckets in order and put all elements back
 *             into the original array
 *
 * Note that step#2 and step#3 are merged into one step since we use
 * a singly linked list per bucket for better performance. Right
 * here we just use the insertion sorting algorithm to initialize each
 * singly linked list.
 *
 * In addition, we define N(=10) buckets, and use the following hash
 * algorithm:
 * a) get max number of a[] as MAX, and let ORIGIN be the smallest
 *    number of a[] if it is negative, otherwise 0
 * b) get width of (MAX - ORIGIN) as WIDTH
 *    e.g. MAX - ORIGIN = 9,   WIDTH = 1;
 *         MAX - ORIGIN = 99,  WIDTH = 2;
 *         MAX - ORIGIN = 999, WIDTH = 3;
 * c) index = (a[i] - ORIGIN) * N / (10 ** WIDTH)
 * then we can dispatch a[i] to bucket[index]
 */

#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef bool bool_t;

/* true: arguments are integers; false: arguments are single characters */
bool_t g_isint = true;

/* One node of a singly linked, ascending-sorted bucket list. */
typedef struct list_s {
	int data;		/* the key stored in this node */
	struct list_s *next;	/* next node, or NULL at the tail */
} list_t;

/*
 * Insert `node` into the ascending-sorted list `*head`.
 *
 * The node is placed AFTER every existing node whose key is <= its own,
 * so nodes with equal keys keep their insertion order (stable insertion).
 * `node->next` is overwritten. A NULL `head` or `node` is ignored.
 */
static void
list_init(list_t **head, list_t *node)
{
	if (head == NULL || node == NULL)
		return;

	/*
	 * Walk the chain of `next` links (starting with the head pointer
	 * itself) until we find the link that should point at the new node.
	 */
	list_t **link = head;
	while (*link != NULL && (*link)->data <= node->data)
		link = &(*link)->next;

	node->next = *link;	/* NULL when appending at the tail */
	*link = node;
}

/*
 * Print every key of the list on one line, separated by spaces and
 * terminated by a newline. An empty list prints nothing at all.
 */
static void
list_show(list_t *head)
{
	if (head == NULL)
		return;

	for (list_t *p = head; p != NULL; p = p->next)
		printf("%d ", p->data);
	printf("\n");
}

/*
 * Free every node of the list; the caller's head pointer is not
 * modified. A NULL list is a no-op.
 */
static void
list_fini(list_t *head)
{
	while (head != NULL) {
		list_t *next = head->next;	/* read before the node is freed */
		free(head);
		head = next;
	}
}

/*
 * Print the n elements of a[] on one line, as integers or as
 * characters depending on g_isint.
 */
static void
show(int a[], size_t n)
{
	if (g_isint) {
		for (size_t i = 0; i < n; i++)
			printf("%-2d ", a[i]);
	} else {
		for (size_t i = 0; i < n; i++)
			printf("%-2c ", a[i]);
	}
	printf("\n");
}

/*
 * Get the decimal width (number of digits) of a non-negative number
 * e.g.
 *     for num in [  0 ..   9]    // width = 1
 *     for num in [ 10 ..  99]    // width = 2
 *     for num in [100 .. 999]    // width = 3
 *     ...
 * The parameter is `long long` so that the span of a full-range int
 * array (up to INT_MAX - INT_MIN) can be measured without overflow.
 */
static int
get_width_of_num(long long num)
{
	int width = 1;
	for (long long q = num / 10; q != 0; q /= 10)
		width++;
	return width;
}

/*
 * Get the hash origin of a[]: the smallest element if it is negative,
 * otherwise 0. Subtracting the origin from every element yields a
 * non-negative key, so negative input can be hashed safely.
 */
static long long
get_hash_origin(const int a[], size_t n)
{
	long long origin = 0;
	for (size_t i = 0; i < n; i++) {
		if (a[i] < origin)
			origin = a[i];
	}
	return origin;
}

/*
 * Get the hash base of a[], i.e. 10 ** WIDTH where WIDTH is the decimal
 * width of (max element - hash origin). Every shifted key is therefore
 * strictly smaller than the base. An empty or NULL array yields 10.
 */
static long long
get_hash_base(int a[], size_t n)
{
	if (a == NULL || n == 0)
		return 10;

	/* get max one of a[] */
	long long max = a[0];
	for (size_t i = 1; i < n; i++) {
		if (max < a[i])
			max = a[i];
	}

	/* the width does not change inside the loop: compute it once */
	int width = get_width_of_num(max - get_hash_origin(a, n));

	/* get hash base which is 10**N, N=1, 2, ... */
	long long base = 1;
	for (int i = 0; i < width; i++)
		base *= 10;

	return base;
}

/*
 * Scatter the n elements of a[] into the m sorted bucket lists.
 *
 * bucket[] is expected to be all NULL on entry, and m is expected to be
 * well below 2**32 so that key * m cannot overflow 64-bit arithmetic.
 *
 * If a node cannot be allocated, every bucket is emptied and reset to
 * NULL before returning, so that a following gather() leaves a[]
 * untouched instead of overwriting it with a partial result.
 */
static void
scatter(list_t **bucket, size_t m, int a[], size_t n)
{
	if (bucket == NULL || a == NULL || m == 0 || n == 0)
		return;

	long long origin = get_hash_origin(a, n);
	unsigned long long base = (unsigned long long)get_hash_base(a, n);

	for (size_t i = 0; i < n; i++) {
		/* 1. new a node for a[i] */
		list_t *nodep = malloc(sizeof *nodep);
		if (nodep == NULL) {
			/* error: failed to malloc, roll back all buckets */
			for (size_t b = 0; b < m; b++) {
				list_fini(bucket[b]);
				bucket[b] = NULL;
			}
			return;
		}

		nodep->data = a[i];
		nodep->next = NULL;

		/*
		 * 2. dispatch the new node to bucket[j]
		 *    0 <= key < base, hence 0 <= j < m
		 */
		unsigned long long key = (unsigned long long)(a[i] - origin);
		size_t j = (size_t)(key * m / base);
		list_init(&bucket[j], nodep);

		/* NOTE: dump bucket[j] just for visual observation */
		printf("%zu:%zu %d bucket[%zu] : ", i, j, a[i], j);
		list_show(bucket[j]);
	}
}

/*
 * Gather the sorted bucket lists back into a[] (at most n values) and
 * release them; every bucket head is reset to NULL.
 */
static void
gather(list_t **bucket, size_t m, int a[], size_t n)
{
	if (bucket == NULL)
		return;

	size_t k = 0;
	for (size_t i = 0; i < m; i++) {
		/* check the capacity BEFORE each store, for every bucket */
		for (list_t *p = bucket[i]; p != NULL && k < n; p = p->next)
			a[k++] = p->data;

		list_fini(bucket[i]);
		bucket[i] = NULL;	/* do not leave a dangling head behind */
	}
}

#define BUCKET_NUM 10

/*
 * Sort the n elements of a[] in ascending order with bucket sort.
 * Arrays with fewer than two elements (or a NULL array) are already
 * sorted and are returned untouched.
 */
void
bucketsort(int a[], size_t n)
{
	if (a == NULL || n < 2)
		return;

	/* small, fixed-size table: keep it on the stack */
	list_t *bucket[BUCKET_NUM] = { NULL };

	/* scatter elements in a[] to bucket[] */
	scatter(bucket, BUCKET_NUM, a, n);

	/* gather a[] by walking bucket[] */
	gather(bucket, BUCKET_NUM, a, n);
}

/*
 * Usage: bucketsort <C1> [C2] ...
 *
 * Sorts the command line arguments and prints the array before and
 * after sorting. With the environment variable ISINT=false the first
 * character of each argument is sorted instead of its integer value.
 * Returns 0 on success and -1 on a usage, parse or allocation error.
 */
int
main(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "Usage: %s <C1> [C2] ...\n", argv[0]);
		return -1;
	}

	argc--;
	argv++;

	size_t n = (size_t)argc;
	int *a = malloc(n * sizeof *a);
	if (a == NULL)
		return -1;

	/* compare the whole value, not just a 4-character prefix */
	const char *s = getenv("ISINT");
	if (s != NULL) {
		if (strcmp(s, "true") == 0)
			g_isint = true;
		else if (strcmp(s, "false") == 0)
			g_isint = false;
	}

	if (g_isint) {
		for (size_t i = 0; i < n; i++) {
			/* strtol() lets us reject garbage and out-of-range input */
			char *end = NULL;
			errno = 0;
			long v = strtol(argv[i], &end, 10);
			if (end == argv[i] || *end != '\0' || errno == ERANGE ||
			    v < INT_MIN || v > INT_MAX) {
				fprintf(stderr, "%s: not a valid integer\n",
				    argv[i]);
				free(a);
				return -1;
			}
			a[i] = (int)v;
		}
	} else {
		for (size_t i = 0; i < n; i++)
			a[i] = argv[i][0];
	}

	/* index header, indented to line up with the "Before sorting: " row */
	printf("%*s", (int)strlen("Before sorting: "), "");
	for (size_t i = 0; i < n; i++)
		printf("%-2zx ", i);
	printf("\n");

	printf("Before sorting: "); show(a, n);
	bucketsort(a, n);
	printf("After sorting: "); show(a, n);

	free(a);
	a = NULL;
	return 0;
}
o 编译并测试
$ gcc -g -Wall -std=gnu99 -m32 -o bucketsort bucketsort.c
$ ./bucketsort 29 25 3 49 9 37 21 43
                0  1  2  3  4  5  6  7
Before sorting: 29 25 3  49 9  37 21 43
0:2 29 bucket[2] : 29
1:2 25 bucket[2] : 25 29
2:0 3 bucket[0] : 3
3:4 49 bucket[4] : 49
4:0 9 bucket[0] : 3 9
5:3 37 bucket[3] : 37
6:2 21 bucket[2] : 21 25 29
7:4 43 bucket[4] : 43 49
After sorting: 3  9  21 25 29 37 43 49
桶排序(Bucket Sort)的排序稳定性取决于每一个桶内排序的稳定性。 如果每一个桶的排序方法是稳定的,则桶排序就是一种稳定的排序算法。特别需要注意的是,桶排序非常耗费存储空间。 就上面的实现而言,我们消耗了n个链表结点和10个桶,也就是说,其空间复杂度为O(n+k) (其中,k为桶的个数)。 从时间复杂度的角度看,我们实现的桶排序算法,最好的时间复杂度是O(n+k), 也就是n个元素在分发阶段均匀地分散在k(=10)个桶中,并且每个桶在分发的时候不需要进行链式插入排序就保持有序;那么在收集阶段,每个桶都有元素被遍历到。 最坏的时间复杂度是O(n**2), 也就是说n个元素在分发阶段被装入了一个桶X中,而且在对桶X进行链式插入排序时的时间复杂度为O(n**2)。 维基百科对桶排序的时间复杂度总结为:
Worst-case performance O(n ** 2)
Best-case performance Ω(n + k)
Average performance Θ(n + k)
虽然桶排序很耗费存储空间,但它并非一无是处。在需要使用并行处理以提高排序速度的时候,桶排序可以很好地予以支持。例如,假设有1000个桶,有100万条数据需要排序。那么,我们完全可以启动1000个线程对这100万条数据进行并行分发并做链式插入排序,然后等1000个线程都结束后,将1000个桶里的数据收集回来就OK了。当然,用32位的程序处理100万条数据(如data域为int),大约需要占用4*2*1000000 + 4*1000 = 8M的额外存储空间。(在32位的程序中,int和pointer都是占4个字节)
参考资料
总结
到此为止,常见的9种排序算法都已经介绍完毕,前后历时一个半月,充满了艰辛,也充满了快乐,尤其是对一个热爱编码的程序员来说,乐趣显然是大大滴,从此再也不怕Intel或者AMD的面试官问我神马是堆排序啦:-)。 所有C代码实现都已经保存在我的GitHub里,如感兴趣,请浏览vCodeHub/xdsa/sorting。 最后,引用一句古诗表达一下我此刻的真切感受,"纸上得来终觉浅,绝知此事要躬行"。学习算法,首先要看大师写的书,国产数据结构的书最好不要看,晦涩难懂不说,而且还很可能被误导。但是,仅仅看书是不够的,只有动手去编码实际体会一下,才能知其然也知其所以然,从而印象深刻,进而融会贯通。