One of the building blocks for implementing containers is Linux namespaces. Namespaces control what a process can see: process IDs, mount points, network adapters and more.
To use namespaces we call the clone(2) system call.
Creating a child process – fork vs clone
To create a new process in Linux, we can use the fork(2) or clone(2) system calls. fork(2) creates a child process with its own memory mapping (using copy-on-write), whereas clone(2) lets the caller choose which resources the child shares with its parent. One use of clone(2) is to implement multithreading; another is to create namespaces.
Namespaces with clone(2)
To create a child process in a new namespace with isolated resources, we need to pass one or more of the following flags:
- CLONE_NEWNET – isolate network devices
- CLONE_NEWUTS – host and domain names (UNIX Timesharing System)
- CLONE_NEWIPC – IPC objects
- CLONE_NEWPID – PIDs
- CLONE_NEWNS – mount points (file systems)
- CLONE_NEWUSER – users and groups
Simple Example – NEWPID
To create a child process with PID=1 (the root of a new process tree), call clone(2) with CLONE_NEWPID:
1
|
clone(child_fn, child_stack+5000, CLONE_NEWPID , NULL);
|
In the child process, getpid() returns 1 and getppid() returns 0. If the child creates another child, that process gets its process ID from the new tree.
Full example:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
|
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/types.h>
#include <signal.h>
/*
 * Stack memory for the process created by clone(); main() passes
 * child_stack+5000 (one past the end of this array) as the stack top.
 * NOTE(review): 5000 bytes is little room for a child that calls printf()
 * and fork() -- consider a larger stack; confirm before changing, since the
 * size is also hard-coded in main().
 */
static char child_stack[5000];
/*
 * Body of each grandchild forked by child_fn(): print the PID and parent PID
 * as seen by this process, linger for five seconds, then announce completion.
 *
 * num: index of this grandchild, used only in the printed message.
 */
void grchild(int num)
{
    /* pid_t has no printf conversion of its own, so convert explicitly. */
    printf("child(%d) in ns my PID: %d Parent ID=%d\n",
           num, (int)getpid(), (int)getppid());
    sleep(5); /* stay alive long enough for child_fn() to send its signals */
    puts("end child");
}
/*
 * Entry point of the process that main() starts with clone(CLONE_NEWPID).
 *
 * arg: the parent's PID, packed into clone()'s void* argument. clone() invokes
 *      its callback as int (*)(void *), so the value is unpacked here rather
 *      than declared as an int parameter.
 *
 * Prints its own PID and parent PID, forks three grandchildren that run
 * grchild(), tries to SIGKILL the parent PID it was given, then SIGKILLs
 * PID 2 and sleeps before returning.
 *
 * Returns 0, or 1 if fork() fails.
 */
int child_fn(void *arg)
{
    pid_t ppid = (pid_t)(long)arg;
    int i;

    printf("PID: %ld Parent:%ld\n", (long)getpid(), (long)getppid());
    /* Empty the stdio buffer so forked children do not print this line again
     * when stdout is a pipe or a file. */
    fflush(stdout);

    for (i = 0; i < 3; i++) {
        pid_t forked = fork();

        if (forked == -1) {
            perror("fork");
            return 1;
        }
        if (forked == 0) {
            grchild(i + 1);
            exit(0);
        }
        kill(ppid, SIGKILL); // no effect
    }

    sleep(2);
    kill(2, SIGKILL); // kill the first child
    sleep(10);
    return 0;
}
/*
 * Start child_fn() in a new PID namespace, hand it this process's PID, print
 * the PID that clone() returned, and wait for the child to finish.
 *
 * Returns 0 on success, EXIT_FAILURE if clone() or waitpid() fails.
 */
int main(void)
{
    /* clone() takes the top of the child's stack, i.e. one past the end. */
    char *stack_top = child_stack + sizeof child_stack;

    /*
     * SIGCHLD is the child's termination signal. Without it the child is a
     * "clone child" that a plain waitpid() cannot see (it fails with ECHILD).
     * The PID is packed into the void* argument with an explicit cast.
     */
    pid_t pid = clone(child_fn, stack_top, CLONE_NEWPID | SIGCHLD,
                      (void *)(long)getpid());
    if (pid == -1) {
        perror("clone"); /* typically EPERM when not run as root */
        return EXIT_FAILURE;
    }

    printf("clone() = %d\n", (int)pid);

    if (waitpid(pid, NULL, 0) == -1) {
        perror("waitpid");
        return EXIT_FAILURE;
    }
    return 0;
}
|
The main function creates a child process in a new PID namespace and passes its own PID to the child. The child then creates 3 children.
If the child process tries to kill the parent (which is outside its namespace), nothing happens, but it can kill a process inside its namespace (in this case the first child).
If you build and run this sample (run it with sudo), you get:
1
2
3
4
5
6
7
8
|
# sudo ./simple
clone() = 5439
PID: 1 Parent:0
child(3) in ns my PID: 4 Parent ID=1
child(2) in ns my PID: 3 Parent ID=1
child(1) in ns my PID: 2 Parent ID=1
end child
end child
|
As you can see, the PIDs are 1-4, and the first child never printed "end child" because it was killed with SIGKILL.
Isolating Network Interfaces
To create a child process with different network interfaces use CLONE_NEWNET:
1
|
pid_t pid = clone(child_fn, child_stack+1024*1024, CLONE_NEWNET , NULL);
|
To create a virtual network adapter we can run ip command:
1
2
|
# sudo ip link add name veth0 type veth peer name veth1 netns [child pid]
# sudo ifconfig veth0 10.0.0.3
|
Now the child should run the command:
1
|
# ifconfig veth1 10.0.0.4
|
We could code all of these commands by hand, but for simplicity let's use the system(3) library function.
Full Example:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
/*
 * 1 MiB of stack memory for the process created by clone(); main() passes
 * child_stack+1024*1024 (one past the end of this array) as the stack top.
 */
static char child_stack[1024*1024];
/*
 * Entry point of the process started with clone(CLONE_NEWNET).
 *
 * Waits a second, assigns 10.0.0.4 to veth1, lists all interfaces between two
 * banner lines, waits another second and pings 10.0.0.3 three times.
 *
 * arg: unused; present because clone() calls its callback as int (*)(void *).
 * Returns 0.
 */
static int child_fn(void *arg)
{
    (void)arg;

    sleep(1); /* presumably gives main() time to create veth1 -- it does so right after clone() */
    system("ifconfig veth1 10.0.0.4");

    puts("========= child network interfaces ========");
    /* Flush so the banner precedes ifconfig's output even when stdout is not
     * a terminal (stdio would otherwise hold it back in its buffer). */
    fflush(stdout);
    system("ifconfig -a");
    puts("===========================================");
    fflush(stdout);

    sleep(1);
    system("ping -c 3 10.0.0.3");
    return 0;
}
/*
 * Start child_fn() in a new network namespace, create a veth pair whose
 * veth1 end is placed in the child's namespace, give veth0 the address
 * 10.0.0.3 and wait for the child to finish.
 *
 * Returns 0 on success, EXIT_FAILURE if clone() or waitpid() fails.
 */
int main(void)
{
    char buf[255];

    /*
     * SIGCHLD is the child's termination signal. Without it a plain
     * waitpid() cannot see the child and fails with ECHILD.
     */
    pid_t pid = clone(child_fn, child_stack + sizeof child_stack,
                      CLONE_NEWNET | SIGCHLD, NULL);
    if (pid == -1) {
        /* Bail out instead of running "ip link ... netns -1". */
        perror("clone");
        return EXIT_FAILURE;
    }

    snprintf(buf, sizeof buf,
             "ip link add name veth0 type veth peer name veth1 netns %d",
             (int)pid);
    system(buf);
    system("ifconfig veth0 10.0.0.3");

    if (waitpid(pid, NULL, 0) == -1) {
        perror("waitpid");
        return EXIT_FAILURE;
    }
    return 0;
}
|
Run this test – the output:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
========= child network interfaces ========
lo Link encap:Local Loopback
LOOPBACK MTU:65536 Metric:1
RX packets:0 errors:0 dropped:0 overruns:0 frame:0
TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1
RX bytes:0 (0.0 B) TX bytes:0 (0.0 B)
veth1 Link encap:Ethernet HWaddr 7a:d6:68:fb:c0:04
inet addr:10.0.0.4 Bcast:10.255.255.255 Mask:255.0.0.0
inet6 addr: fe80::78d6:68ff:fefb:c004/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:0 errors:0 dropped:0 overruns:0 frame:0
TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:0 (0.0 B) TX bytes:0 (0.0 B)
===========================================
PING 10.0.0.3 (10.0.0.3) 56(84) bytes of data.
64 bytes from 10.0.0.3: icmp_seq=1 ttl=64 time=0.076 ms
64 bytes from 10.0.0.3: icmp_seq=2 ttl=64 time=0.048 ms
64 bytes from 10.0.0.3: icmp_seq=3 ttl=64 time=0.071 ms
--- 10.0.0.3 ping statistics ---
3 packets transmitted, 3 received, 0% packet loss, time 1999ms
rtt min/avg/max/mdev = 0.048/0.065/0.076/0.012 ms
|
The child sees only the loopback device and the virtual adapter, and it can ping the parent through the virtual adapter.
Mount Points and file system
To implement a container we also need to isolate the file system. This can be done using CLONE_NEWNS. Before coding, let's build a simple file system using BusyBox or Buildroot.
The simplest way is using buildroot – it is based on busybox.
Download and extract the package, use make menuconfig to enter the configuration menu, just exit and save the default selection and run make
1
2
3
4
|
# tar xvf ./buildroot-2017.11.2.tar.bz2
# cd buildroot-2017.11.2
# make menuconfig
# make
|
It will take a few minutes. After the build is finished, you will find a file system in buildroot-2017.11.2/output/target.
Copy the content to another folder (fs in my example) and add some device files to its /dev directory using mknod commands (Buildroot can't do that because it doesn't run with sudo).
Full Example
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
|
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/wait.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <net/if.h>
/* Size in bytes of the stack given to the cloned child (1 MiB). */
#define STACK_SIZE (1024 * 1024)
/* Backing storage for the child's stack; main() passes stack+STACK_SIZE to clone(). */
static char stack[STACK_SIZE];
/*
 * Assign an IPv4 address and netmask to a network interface and bring it up.
 *
 * name:    interface name, e.g. "veth0"; must fit in IFNAMSIZ including the
 *          terminating NUL.
 * addr:    IPv4 address in dotted-decimal form.
 * netmask: IPv4 netmask in dotted-decimal form.
 *
 * All arguments are validated before any system call is made. The control
 * socket is closed on every path.
 *
 * Returns 0 on success, -1 on invalid input or if any system call fails
 * (the failing step is reported with perror()).
 */
int setip(const char *name, const char *addr, const char *netmask)
{
    struct ifreq ifr = {0};
    struct sockaddr_in ip = { .sin_family = AF_INET };
    struct sockaddr_in mask = { .sin_family = AF_INET };
    int rc = -1;
    int fd;

    if (name == NULL || addr == NULL || netmask == NULL)
        return -1;
    if (strlen(name) >= IFNAMSIZ)
        return -1; /* would not fit NUL-terminated in ifr_name */
    if (inet_pton(AF_INET, addr, &ip.sin_addr) != 1 ||
        inet_pton(AF_INET, netmask, &mask.sin_addr) != 1)
        return -1;

    /* Any AF_INET socket works as a handle for the interface ioctls. */
    fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
    if (fd < 0) {
        perror("socket");
        return -1;
    }

    memcpy(ifr.ifr_name, name, strlen(name) + 1);

    memcpy(&ifr.ifr_addr, &ip, sizeof ip);
    if (ioctl(fd, SIOCSIFADDR, &ifr) < 0) {
        perror("ioctl(SIOCSIFADDR)");
        goto out;
    }

    memcpy(&ifr.ifr_netmask, &mask, sizeof mask);
    if (ioctl(fd, SIOCSIFNETMASK, &ifr) < 0) {
        perror("ioctl(SIOCSIFNETMASK)");
        goto out;
    }

    /* Read the current flags first so that only UP/RUNNING are added. */
    if (ioctl(fd, SIOCGIFFLAGS, &ifr) < 0) {
        perror("ioctl(SIOCGIFFLAGS)");
        goto out;
    }
    ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
    if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0) {
        perror("ioctl(SIOCSIFFLAGS)");
        goto out;
    }

    rc = 0;
out:
    close(fd);
    return rc;
}
/*
 * Entry point of the containerised process started by clone() in main().
 *
 * Sets the hostname to "myhost", makes ./fs the root directory, mounts proc
 * on /proc, configures veth1 as 10.0.0.15/255.0.0.0 and finally replaces
 * itself with /bin/sh.
 *
 * arg: unused.
 * Returns 1 if any setup step or the exec fails; does not return on success.
 */
int child(void *arg)
{
    (void)arg;

    sleep(1); /* presumably gives main() time to create veth1 -- it does so right after clone() */

    if (sethostname("myhost", 6) == -1) {
        perror("sethostname");
        return 1;
    }

    /*
     * Make every mount private first. Where "/" is a shared mount, the proc
     * mount below would otherwise propagate back into the parent's mount
     * namespace.
     */
    if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == -1) {
        perror("mount(MS_PRIVATE)");
        return 1;
    }

    /* Stop on failure: continuing would run the shell on the host's root. */
    if (chroot("./fs") == -1) {
        perror("chroot");
        return 1;
    }
    if (chdir("/") == -1) {
        perror("chdir");
        return 1;
    }
    if (mount("proc", "/proc", "proc", 0, NULL) == -1) {
        perror("mount(proc)");
        return 1;
    }

    /* A shell without networking is still usable, so only warn here. */
    if (setip("veth1", "10.0.0.15", "255.0.0.0") != 0)
        fprintf(stderr, "warning: could not configure veth1\n");

    /* The argument list of a variadic exec must end with a null *pointer*. */
    execlp("/bin/sh", "/bin/sh", (char *)NULL);
    perror("execlp"); /* only reached if the exec failed */
    return 1;
}
/*
 * Start child() in new network, UTS, IPC, PID and mount namespaces, create a
 * veth pair whose veth1 end is placed in the child's network namespace,
 * configure veth0 as 10.0.0.13/255.0.0.0 and wait for the child to exit.
 *
 * Returns 0 on success, 1 if clone() or waitpid() fails.
 */
int main(void)
{
    char buf[255];
    const int flags = CLONE_NEWNET | CLONE_NEWUTS | CLONE_NEWIPC |
                      CLONE_NEWPID | CLONE_NEWNS | SIGCHLD;

    /* clone() takes the top of the child's stack, i.e. one past the end. */
    pid_t pid = clone(child, stack + STACK_SIZE, flags, NULL);
    if (pid == -1) {
        /* Bail out instead of running "ip link ... netns -1". */
        perror("clone");
        return 1;
    }

    snprintf(buf, sizeof buf,
             "sudo ip link add name veth0 type veth peer name veth1 netns %d",
             (int)pid);
    system(buf);

    if (setip("veth0", "10.0.0.13", "255.0.0.0") != 0)
        fprintf(stderr, "warning: could not configure veth0\n");

    if (waitpid(pid, NULL, 0) == -1) {
        perror("waitpid");
        return 1;
    }
    return 0;
}
|
We create a child process in new namespaces (PID, network, mount, IPC and UTS). The parent creates the virtual adapters (using ip link) and sets its own IP address.
The child changes the hostname, changes the root folder to our Buildroot output, changes the current directory to '/', mounts proc so that ps and other tools will work, and sets an IP address.
The last step the child does is calling the busybox shell (/bin/sh)
Run this program using sudo – you will get a different shell, file system and network:
That's it!
You can find the code with the full Buildroot image here
This is just a simple example to help understand the concept. To implement a full container you also need to add capabilities, control groups and more.