根据日志追踪启动过程:
sheep启动:
1 int main(int argc, char **argv) 2 { 3 int ch, longindex, ret, port = SD_LISTEN_PORT, io_port = SD_LISTEN_PORT; 4 int rc = 1; 5 const char *dirp = DEFAULT_OBJECT_DIR, *short_options; 6 char *dir, *pid_file = NULL, *bindaddr = NULL, log_path[PATH_MAX], 7 *argp = NULL; 8 bool explicit_addr = false; 9 bool daemonize = true; 10 int32_t nr_vnodes = -1; 11 int64_t zone = -1; 12 uint32_t max_dynamic_threads = 0; 13 struct option *long_options; 14 #ifdef HAVE_HTTP 15 const char *http_options = NULL; 16 #endif 17 static struct logger_user_info sheep_info; 18 struct stat logdir_st; 19 enum log_dst_type log_dst_type; 20 21 sys->cinfo.flags |= SD_CLUSTER_FLAG_AUTO_VNODES; // vnode 策略 22 sys->node_status = SD_NODE_STATUS_INITIALIZATION; 23 24 sys->rthrottling.max_exec_count = 0; // 恢复时间间隔和策略 25 sys->rthrottling.queue_work_interval = 0; 26 sys->rthrottling.throttling = false; 27 // 处理各种崩溃异常的注册处理函数 28 install_crash_handler(crash_handler); 29 signal(SIGPIPE, SIG_IGN); 30 // 捕获signal信号处理,信号发生时发开并创建日志文件 31 install_sighandler(SIGHUP, sighup_handler, false); 32 // 解析命令行参数 33 long_options = build_long_options(sheep_options); 34 short_options = build_short_options(sheep_options); 35 while ((ch = getopt_long(argc, argv, short_options, long_options, 36 &longindex)) >= 0) { 37 switch (ch) { 38 case 'p': 39 port = str_to_u16(optarg); 40 if (errno != 0 || port < 1) { 41 sd_err("Invalid port number '%s'", optarg); 42 exit(1); 43 } 44 break; 45 case 'P': 46 pid_file = optarg; 47 break; 48 #ifdef HAVE_HTTP 49 case 'r': 50 http_options = optarg; 51 break; 52 #endif 53 case 'l': 54 if (option_parse(optarg, ",", log_parsers) < 0) 55 exit(1); 56 break; 57 case 'n': 58 sys->nosync = true; 59 break; 60 case 'y': 61 if (!str_to_addr(optarg, sys->this_node.nid.addr)) { 62 sd_err("Invalid address: '%s'", optarg); 63 exit(1); 64 } 65 explicit_addr = true; 66 break; 67 case 'D': 68 sys->backend_dio = true; 69 break; 70 case 'f': 71 daemonize = false; 72 break; 73 case 'g': 74 if (nr_vnodes > 0) { 75 sd_err("Options '-g' and '-V' can not be both specified"); 76 exit(1); 77 } 78 nr_vnodes = 0; 79 break; 80 case 'z': 81 zone = str_to_u32(optarg); 82 if (errno != 0) { 83 sd_err("Invalid zone id '%s': must be " 84 "an integer between 0 and %u", optarg, 85 UINT32_MAX); 86 exit(1); 87 } 88 sys->this_node.zone = zone; 89 break; 90 case 'u': 91 sys->upgrade = true; 92 break; 93 case 'c': 94 sys->cdrv = find_cdrv(optarg); 95 if (!sys->cdrv) { 96 sd_err("Invalid cluster driver '%s'", optarg); 97 show_features(0); 98 exit(1); 99 } 100 101 sys->cdrv_option = get_cdrv_option(sys->cdrv, optarg); 102 break; 103 case 'i': 104 if (option_parse(optarg, ",", ionic_parsers) < 0) 105 exit(1); 106 107 if (!str_to_addr(io_addr, sys->this_node.nid.io_addr)) { 108 sd_err("Bad addr: '%s'", io_addr); 109 exit(1); 110 } 111 112 if (io_pt) 113 if (sscanf(io_pt, "%u", &io_port) != 1) { 114 sd_err("Bad port '%s'", io_pt); 115 exit(1); 116 } 117 sys->this_node.nid.io_port = io_port; 118 #ifdef HAVE_ACCELIO 119 if (!strcmp(io_transport, "tcp")) 120 sys->this_node.nid.io_transport_type = 121 IO_TRANSPORT_TYPE_TCP; 122 else if (!strcmp(io_transport, "rdma")) 123 sys->this_node.nid.io_transport_type = 124 IO_TRANSPORT_TYPE_RDMA; 125 else { 126 sd_err("unknown transport type: %s", 127 io_transport); 128 exit(1); 129 } 130 #endif 131 break; 132 case 'j': 133 uatomic_set_true(&sys->use_journal); 134 if (option_parse(optarg, ",", journal_parsers) < 0) 135 exit(1); 136 if (!jsize) { 137 sd_err("you must specify size for journal"); 138 exit(1); 139 } 140 break; 141 case 'b': 142 if (!inetaddr_is_valid(optarg)) 143 exit(1); 144 bindaddr = optarg; 145 break; 146 case 'h': 147 usage(0); 148 break; 149 case 'R': 150 if (option_parse(optarg, ",", recovery_parsers) < 0) 151 exit(1); 152 sys->rthrottling.max_exec_count = max_exec_count; 153 sys->rthrottling.queue_work_interval 154 = queue_work_interval; 155 if (max_exec_count > 0 && queue_work_interval > 0) 156 sys->rthrottling.throttling = true; 157 break; 158 case 'v': 159 fprintf(stdout, "Sheepdog daemon version %s ", 160 PACKAGE_VERSION); 161 show_features(1); 162 exit(0); 163 break; 164 case 'V': 165 sys->cinfo.flags &= ~SD_CLUSTER_FLAG_AUTO_VNODES; 166 if (nr_vnodes == 0) { 167 sd_err("Options '-g' and '-V' can not be both specified"); 168 exit(1); 169 } 170 nr_vnodes = str_to_u16(optarg); 171 if (errno != 0 || nr_vnodes < 1) { 172 sd_err("Invalid number of vnodes '%s': must be " 173 "an integer between 1 and %u", 174 optarg, UINT16_MAX); 175 exit(1); 176 } 177 break; 178 case 'W': 179 wildcard_recovery = true; 180 break; 181 case 'w': 182 if (option_parse(optarg, ",", wq_parsers) < 0) 183 exit(1); 184 break; 185 case 'x': 186 max_dynamic_threads = str_to_u32(optarg); 187 if (errno != 0 || max_dynamic_threads < 1) { 188 sd_err("Invalid number of threads '%s': " 189 "must be an integer between 1 and %"PRIu32, 190 optarg, UINT32_MAX); 191 exit(1); 192 } 193 set_max_dynamic_threads((size_t)max_dynamic_threads); 194 break; 195 default: 196 usage(1); 197 break; 198 } 199 } 200 201 #ifdef HAVE_DISKVNODES 202 sys->cinfo.flags |= SD_CLUSTER_FLAG_DISKMODE; 203 #endif 204 205 sheep_info.port = port; 206 early_log_init(log_format, &sheep_info); 207 208 if (nr_vnodes == 0) { 209 sys->gateway_only = true; 210 sys->disk_space = 0; 211 } else if (nr_vnodes == -1) 212 nr_vnodes = SD_DEFAULT_VNODES; 213 214 if (optind != argc) { 215 argp = strdup(argv[optind]); 216 dirp = strtok(argv[optind], ","); 217 } 218 219 ret = sd_inode_actor_init(sheep_bnode_writer, sheep_bnode_reader); 220 if (ret) 221 exit(1); 222 223 if (!strcmp(log_dst, "default")) 224 log_dst_type = LOG_DST_DEFAULT; 225 else if (!strcmp(log_dst, "stdout"))init_path_space 226 log_dst_type = LOG_DST_STDOUT; 227 else if (!strcmp(log_dst, "syslog")) 228 log_dst_type = LOG_DST_SYSLOG; 229 else { 230 sd_err("invalid type of log destination: %s", log_dst); 231 exit(1); 232 } 233 234 if (logdir) { 235 if (log_dst_type != LOG_DST_DEFAULT) { 236 sd_err("logdir (%s) is specified but logging" 237 " destination is %s", logdir, 238 log_dst_type == LOG_DST_STDOUT 239 ? "stdout" : "syslog"); 240 exit(1); 241 } 242 243 memset(&logdir_st, 0, sizeof(logdir_st)); 244 ret = stat(logdir, &logdir_st); 245 if (ret < 0) { 246 sd_err("stat() failed on %s, %m", logdir); 247 exit(1); 248 } 249 250 if (!S_ISDIR(logdir_st.st_mode)) { 251 sd_err("log dir: %s is not a directory", logdir); 252 exit(1); 253 } 254 } 255 // 创建基本目录 256 ret = init_base_path(dirp); 257 if (ret) 258 exit(1); 259 260 dir = realpath(dirp, NULL); 261 if (!dir) { 262 sd_err("%m"); 263 exit(1); 264 } 265 266 snprintf(log_path, sizeof(log_path), "%s/" LOG_FILE_NAME, 267 logdir ?: dir); 268 269 free(logdir); 270 271 srandom(port); 272 273 if (daemonize && log_dst_type == LOG_DST_STDOUT) 274 daemonize = false; 275 // 创建sheep进程并对基本目录上锁 276 if (lock_and_daemon(daemonize, dir)) { 277 free(argp); 278 goto cleanup_dir; 279 } 280 281 #ifdef HAVE_ACCELIO 282 sd_xio_init(); 283 xio_init_main_ctx(); 284 #endif 285 // 日志操作 286 ret = log_init(program_name, log_dst_type, log_level, log_path); 287 if (ret) { 288 free(argp); 289 goto cleanup_dir; 290 } 291 // 初始化obj,epoch,config路径 292 ret = init_global_pathnames(dir, argp); 293 free(argp); 294 if (ret) 295 goto cleanup_log; 296 // 创建epoll事件触发机制 297 ret = init_event(EPOLL_SIZE); 298 if (ret) 299 goto cleanup_log; 300 // 初始化节点配置文件 301 ret = init_node_config_file(); 302 if (ret) 303 goto cleanup_log; 304 // 初始化配置文件 305 ret = init_config_file(); 306 if (ret) 307 goto cleanup_log; 308 // 创建网络连接或者io操作的监听端口,监听客户端qemu的连接请求listen_handler 309 // 并于client_handler绑定 310 ret = create_listen_port(bindaddr, port); 311 if (ret) 312 goto cleanup_log; 313 314 #ifndef HAVE_ACCELIO 315 if (io_addr && create_listen_port(io_addr, io_port)) // 端口是7000 316 goto cleanup_log; 317 #else 318 if (io_addr) { 319 bool rdma; 320 321 if (!strcmp(io_transport, "rdma")) 322 rdma = true; 323 else { 324 sd_assert(!strcmp(io_transport, "tcp")); 325 rdma = false; 326 } 327 328 if (xio_create_listen_port(io_addr, io_port, rdma)) 329 goto cleanup_log; 330 } else { 331 sd_err("accelio is enabled but io address (-i) isn't passed, exiting"); 332 goto cleanup_log; 333 } 334 #endif 335 // 同一台主机的进程间通信,创建socket和绑定端口 336 ret = init_unix_domain_socket(dir); 337 if (ret) 338 goto cleanup_log; 339 // 本地请求的初始化,注册local_req_handler事件:local_req_handler 340 local_request_init(); 341 // 信号事件的初始化:signal_handler 342 ret = init_signal(); 343 if (ret) 344 goto cleanup_log; 345 346 /* This function must be called before create_cluster() */ 347 ret = init_disk_space(dir); 348 if (ret) 349 goto cleanup_log; 350 // 创建集群 351 ret = create_cluster(port, zone, nr_vnodes, explicit_addr); 352 if (ret) { 353 sd_err("failed to create sheepdog cluster"); 354 goto cleanup_log; 355 } 356 // 监控节点连接状态 357 ret = start_node_connectivity_monitor(); 358 if (ret) 359 goto cleanup_journal; 360 361 /* We should init trace for work queue before journal init */ 362 ret = wq_trace_init(); 363 if (ret) { 364 sd_err("failed to init trace for work queue"); 365 goto cleanup_log; 366 } 367 368 /* We should init journal file before backend init */ 369 if (uatomic_is_true(&sys->use_journal)) { 370 if (!strlen(jpath)) 371 /* internal journal */ 372 memcpy(jpath, dir, strlen(dir)); 373 sd_debug("%s, %"PRIu64", %d", jpath, jsize, jskip); 374 // 创建journal文件,根据条件判断是否需要恢复 375 ret = journal_file_init(jpath, jsize, jskip); 376 if (ret) 377 goto cleanup_cluster; 378 } 379 380 init_fec(); 381 382 /* 创建工作队列,分为三种:单线程,动态线程,固定个数的线程 383 * After this function, we are multi-threaded. 384 * 385 * Put those init functions that need single threaded environment, for 386 * e.g, signal handling, above this call and those need multi-threaded 387 * environment, for e.g, work queues below. 388 */ 389 ret = create_work_queues(); 390 if (ret) 391 goto cleanup_journal; 392 393 ret = sockfd_init(); 394 if (ret) 395 goto cleanup_journal; 396 // 初始化存储驱动 397 ret = init_store_driver(sys->gateway_only); 398 if (ret) 399 goto cleanup_journal; 400 401 ret = trace_init(); 402 if (ret) 403 goto cleanup_journal; 404 405 #ifdef HAVE_HTTP 406 if (http_options && http_init(http_options) != 0) 407 goto cleanup_journal; 408 #endif 409 410 #ifdef HAVE_NFS 411 ret = nfs_init(NULL); 412 if (ret) 413 goto cleanup_journal; 414 #endif 415 416 if (pid_file && (create_pidfile(pid_file) != 0)) { 417 sd_err("failed to pid file '%s' - %m", pid_file); 418 goto cleanup_journal; 419 } 420 421 if (chdir(dir) < 0) { 422 sd_err("failed to chdir to %s: %m", dir); 423 goto cleanup_pid_file; 424 } 425 426 check_host_env(); 427 sd_info("sheepdog daemon (version %s) started", PACKAGE_VERSION); 428 429 while (sys->nr_outstanding_reqs != 0 || 430 (sys->cinfo.status != SD_STATUS_KILLED && 431 sys->cinfo.status != SD_STATUS_SHUTDOWN)) 432 event_loop(-1); 433 434 rc = 0; 435 sd_info("shutdown"); 436 437 cleanup_pid_file: 438 if (pid_file) 439 unlink(pid_file); 440 441 cleanup_journal: 442 if (uatomic_is_true(&sys->use_journal)) { 443 sd_info("cleaning journal file"); 444 clean_journal_file(jpath); 445 } 446 447 cleanup_cluster: 448 leave_cluster(); 449 450 cleanup_log: 451 log_close(); 452 453 cleanup_dir: 454 free(dir); 455 456 return rc; 457 }
使用zookeeper作为集群管理:
需要提供的接口:
1 static struct cluster_driver cdrv_zookeeper = { 2 .name = "zookeeper", 3 4 .init = zk_init, 5 .join = zk_join, 6 .leave = zk_leave, 7 .notify = zk_notify, 8 .block = zk_block, 9 .unblock = zk_unblock, 10 .lock = zk_lock, 11 .unlock = zk_unlock, 12 .update_node = zk_update_node, 13 .get_local_addr = get_local_addr, 14 };
zk_init:
1 static int zk_init(const char *option) 2 { 3 char *hosts, *to, *p; 4 int ret, timeo; 5 char conn[MAX_NODE_STR_LEN]; 6 7 if (!option) { 8 sd_err("You must specify zookeeper servers."); 9 return -1; 10 } 11 // 入参:timeout 12 hosts = strtok((char *)option, "="); 13 if ((to = strtok(NULL, "="))) { 14 if (sscanf(to, "%u", &zk_timeout) != 1) { 15 sd_err("Invalid parameter for timeout"); 16 return -1; 17 } 18 p = strstr(hosts, "timeout"); 19 *--p = '