• 线程共享内存、具有独立堆栈 栈大小 多线程支付 宽松内存模型 处理器也是编译器


    宽松内存模型 (Relaxed/Weak Memory Model)

    宽松内存模型的目的是使单处理器的执行更高效。

    x86 已经是市面上能买到的 “最强” 的内存模型了

    • 这也是 Intel 自己给自己加的包袱
    • 看看 ARM/RISC-V 吧,根本就是个分布式系统

     

    (x86-TSO in Hardware memory models by Russ Cox)

    https://jyywiki.cn/OS/2022/slides/3.slides#/4/3

    多处理器间即时可见性的丧失

    满足单处理器 eventual memory consistency 的执行,在多处理器上可能无法序列化!

    当 x≠y 时,对 x, y 的内存读写可以交换顺序

    • 它们甚至可以在同一个周期里完成 (只要 load/store unit 支持)
    • 如果写 x 发生 cache miss,可以让读 y 先执行
      • 满足 “尽可能执行 μop” 的原则,最大化处理器性能
         # <-----------+
    movl $1, (x)   #   |
    movl (y), %eax # --+
    
    • 在多处理器上的表现
      • 两个处理器分别看到 y=0 和 x=0

    现代处理器:处理器也是 (动态) 编译器!

    单个处理器把汇编代码 (用电路) “编译” 成更小的 μops

    • RF[9] = load(RF[7] + 400)
    • store(RF[12], RF[13])
    • RF[3] = RF[4] + RF[5]
      • 每个 μop 都有 Fetch, Issue, Execute, Commit 四个阶段

    在任何时刻,处理器都维护一个 μop 的 “池子”

    • 每一周期向池子补充尽可能多的 μop
      • “多发射”
    • 每一周期 (在不违反编译正确性的前提下) 执行尽可能多的 μop
      • “乱序执行”、“按序提交”
    • 这就是《计算机体系结构》 (剩下就是木桶效应,哪里短板补哪里)

    画状态机

    // Shared variables for the store/load reordering experiment.
    int x = 0, y = 0;
    
    // Thread 1: stores to x, then loads and prints y.
    void T1() {
      x = 1;
      // Compiler barrier. "memory" must be in the clobber list (after the third
      // colon); it keeps the compiler from moving memory accesses across this
      // point. It emits no instruction, so the processor may still reorder them.
      // __asm__ is GCC's spelling of asm that is also accepted under -std=c11.
      __asm__ volatile("" ::: "memory");
      printf("y = %d\n", y);
    }
    
    // Thread 2: mirror image of T1 -- stores to y, then loads and prints x.
    void T2() {
      y = 1;
      // Compiler barrier (see T1): stops compiler reordering only.
      __asm__ volatile("" ::: "memory");
      printf("x = %d\n", x);
    }
    

     

    让编译器不要跨越它进行优化:compiler barrier

    asm volatile("" ::: "memory");

    0x5f5e101 = 100000001

    实现源代码的按顺序翻译

    在代码中插入 “优化不能穿越” 的 barrier

    • asm volatile ("" ::: "memory");
      • Barrier 的含义是 “可以读写任何内存”
    • 使用 volatile 变量
      • 保持 C 语义和汇编语义一致

    gcc -c -O2 sum.c && objdump -d sum.o

    0000000000000020 <Tsum>:
      20:   f3 0f 1e fa             endbr64
      24:   48 81 05 00 00 00 00    addq   $0x5f5e100,0x0(%rip)        # 2f <Tsum+0xf>
      2b:   00 e1 f5 05
      2f:   c3                      ret
    

     

    gcc -c -O1 sum.c && objdump -d sum.o

    000000000000001a <Tsum>:
      1a:   f3 0f 1e fa             endbr64
      1e:   48 8b 15 00 00 00 00    mov    0x0(%rip),%rdx        # 25 <Tsum+0xb>
      25:   48 8d 42 01             lea    0x1(%rdx),%rax
      29:   48 81 c2 01 e1 f5 05    add    $0x5f5e101,%rdx
      30:   48 89 c1                mov    %rax,%rcx
      33:   48 83 c0 01             add    $0x1,%rax
      37:   48 39 d0                cmp    %rdx,%rax
      3a:   75 f4                   jne    30 <Tsum+0x16>
      3c:   48 89 0d 00 00 00 00    mov    %rcx,0x0(%rip)        # 43 <Tsum+0x29>
      43:   c3                      ret
    

      

     cat sum.c
    #include "thread.h"
    // Number of increments performed by each call to Tsum.
    #define N 100000000
    // Counter shared by all threads; it is updated without any synchronization.
    long sum = 0;
    // Thread body: increments the shared counter N times.
    // `sum++` is a plain read-modify-write of a shared variable, so increments
    // made by concurrently running threads can be lost.
    void Tsum()
    {
        for (int i = 0; i < N; i++)
        {
    
            sum++;
            // Alternative kept for experiments: do the increment as one
            // inline-assembly `add` instruction operating directly on memory.
            // asm volatile("add $1,%0": "+m"(sum));
        }
    };
    // Entry point: starts two threads that both run Tsum, calls join(), and
    // then prints the final value of the shared counter.
    int main()
    {
        create(Tsum);
        create(Tsum);
        join(); // from thread.h; presumably waits for both threads to finish
        printf("sum=%ld\n", sum);
    }

      

    编译器对内存访问 “eventually consistent” 的处理导致共享内存作为线程同步工具的失效。

    eventually consistent 最终 一致

    例如:源代码依次执行 x=1 --> x=2 --> x=3,编译器可以直接优化成 x=3

    O1优化 O2优化

    while true;do gcc -O1 sum.c -lpthread && ./a.out;done
    sum=100000000
    sum=100000000
    sum=100000000
    sum=100000000

    while true;do gcc -O2 sum.c -lpthread && ./a.out;done
    sum=200000000
    sum=200000000
    sum=200000000

    输入man 3 printf 命令,再输入 /thread 过滤

    man 3 printf
    │printf(), fprintf(), │ Thread safety │ MT-Safe locale │
    │sprintf(), snprintf(), │ │ │
    │vprintf(), vfprintf(), │ │ │
    │vsprintf(), vsnprintf() │ │ │
    └────────────────────────┴───────────────┴────────────────┘

    CONFORMING TO
    fprintf(), printf(), sprintf(), vprintf(), vfprintf(), vsprintf():
    POSIX.1-2001, POSIX.1-2008, C89, C99.

    snprintf(), vsnprintf(): POSIX.1-2001, POSIX.1-2008, C99.

    The dprintf() and vdprintf() functions were originally GNU extensions
    that were later standardized in POSIX.1-2008.

    Concerning the return value of snprintf(), SUSv2 and C99 contradict
    each other: when snprintf() is called with size=0 then SUSv2 stipulates
    an unspecified return value less than 1, while C99 allows str to be
    NULL in this case, and gives the return value (as always) as the number
    of characters that would have been written in case the output string
    has been large enough. POSIX.1-2001 and later align their specifica‐
    tion of snprintf() with C99.

    glibc 2.1 adds length modifiers hh, j, t, and z and conversion charac‐
    ters a and A.

    glibc 2.2 adds the conversion character F with C99 semantics, and the
    flag character I.

    NOTES
    Some programs imprudently rely on code such as the following

    sprintf(buf, "%s some further text", buf);
    /thread

    printf 还能在多线程程序里调用吗?

    // Prints "a" forever; meant to run at the same time as thread2.
    void thread1() {
      for (;;) {
        printf("a");
      }
    }
    
    // Prints "b" forever; meant to run at the same time as thread1.
    void thread2() {
      for (;;) {
        printf("b");
      }
    }
    

    我们都知道 printf 是有缓冲区的 (为什么?)

    • 如果执行 buf[pos++] = ch (pos 共享) 不就 💥 (出错) 了吗?

     

    修改为:加入内联汇编

            asm volatile("add $1,%0":"+m"(sum));

            asm volatile("lock add $1,%0":"+m"(sum));

    sum.c
    #include "thread.h"
    #define N 100000000
    long sum=0;
    // Thread body: adds 1 to the shared counter `sum`, N times, with no locking.
    void Tsum(){
        int done=0;
        while(done<N){
            sum++;
            done++;
        }
    }
    // Starts two Tsum threads, calls join() (from thread.h; presumably waits
    // for them to finish), then prints the resulting counter value.
    int main(){
        for(int started=0;started<2;started++){
            create(Tsum);
        }
        join();
        printf("sum=%ld\n",sum);
        return 0;
    }
    

      

    while true;do gcc sum.c -lpthread && ./a.out;done

    sum=199055958

    sum=196244234

    sum=196224238

    sum=191857318

    sum=200000000

    sum=197990013

    sum=197888006

    sum=200000000

    sum=200000000

    sum=198062222

    sum=194003487

    sum=197967435

    sum=197983213

    sum=198640457

     2**64-100

    18446744073709551516

     while true;do ./a.out;done

    Alipay_withdraw 单线程安全

    gcc alipay.c -lpthread && ./a.out

    balance = 18446744073709551516

    #include "thread.h"
    
    // Shared account balance. The type is unsigned, so subtracting more than
    // the current value wraps around to a huge number instead of going negative.
    unsigned long balance = 100;
    
    // Withdraws `amt` from the shared balance if the balance is large enough.
    // The check and the subtraction are two separate steps with a delay in
    // between; nothing here stops another thread from passing the same check
    // during that delay, after which both threads subtract.
    void Alipay_withdraw(int amt) {
      if (balance >= amt) {
        usleep(1); // unexpected delays
        balance -= amt;
      }
    }
    
    // Thread body: attempts a single withdrawal of 100. The thread id is unused.
    void Talipay(int id) {
      const int amount = 100;
      (void)id;
      Alipay_withdraw(amount);
    }
    
    // Runs two withdrawal threads, calls join() (from thread.h; presumably
    // waits for both), and prints the balance that is left.
    int main() {
      for (int started = 0; started < 2; started++) {
        create(Talipay);
      }
      join();
      printf("balance = %lu\n", balance);
      return 0;
    }
    

      

     山寨多线程支付

    原子性

    改写 thread.h 使线程拥有更大的栈

                  ┌─────────────┬────────────────────┐

                  │Architecture │ Default stack size │

                  ├─────────────┼────────────────────┤

                  │i386         │               2 MB │

                  ├─────────────┼────────────────────┤

                  │IA-64        │              32 MB │

                  ├─────────────┼────────────────────┤

                  │PowerPC      │               4 MB │

                  ├─────────────┼────────────────────┤

                  │S/390        │               2 MB │

                  ├─────────────┼────────────────────┤

                  │Sparc-32     │               2 MB │

                  ├─────────────┼────────────────────┤

                  │Sparc-64     │               4 MB │

                  ├─────────────┼────────────────────┤

                  │x86_64       │               2 MB │

                  └─────────────┴────────────────────┘

     

    查看函数信息 man 3 pthread_create

    clone 是实际创建线程的系统调用 (下面 strace 输出中显示为 clone3)

    https://jyywiki.cn/OS/2022/slides/3.slides#/

    多处理器编程

    strace

    • 程序 (源代码 S、二进制代码 C) = 状态机
      • 编译器 C=compile(S)
    • 应用视角的操作系统 = syscall 指令

    ##

    CPU 使用率超过了 100%

    https://jyywiki.cn/OS/2022/slides/3.slides#/1/4

    Hello, Multi-threaded World!

    #include "thread.h"
    
    // Prints "a" in an endless loop.
    void Ta() {
      for (;;) {
        printf("a");
      }
    }
    
    // Prints "b" in an endless loop.
    void Tb() {
      for (;;) {
        printf("b");
      }
    }
    
    // Starts one thread running Ta and one running Tb, in that order.
    int main() {
      create(Ta);
      create(Tb);
      return 0;
    }
    

    利用 thread.h 就可以写出利用多处理器的程序!

    • 操作系统会自动把线程放置在不同的处理器上
    • 在后台运行,可以看到 CPU 使用率超过了 100%

    CPU 使用率超过了 100%:指的是使用了超过1个CPU。

    #include "thread.h"
    
    // Thread-local state: each thread has its own copy of these variables.
    __thread char *base; // reference stack address recorded by the thread
    __thread char *cur;  // most recent stack address recorded by the thread
    __thread int id;     // thread number shown in the output
    
    // The accessors are kept out of line so that the thread-local accesses
    // appear as separate functions when the binary is inspected with objdump.
    __attribute__((noinline)) void set_cur(void *ptr) {
      cur = ptr;
    }
    
    __attribute__((noinline)) char *get_cur() {
      return cur;
    }
    
    // Recurses with no base case and reports how far the stack has grown.
    // n is the current recursion depth; the address of n stands in for the
    // current position on this thread's stack.
    void stackoverflow(int n) {
      set_cur(&n); // record the address of this frame's parameter
      if (n % 1024 == 0) { // report once every 1024 calls
        int sz = base - get_cur(); // bytes between the address saved in base and this frame
        printf("Stack size of T%d >= %d KB\n", id, sz / 1024);
      }
      stackoverflow(n + 1); // no termination condition; presumably runs until the stack is exhausted
    }
    
    // Thread body: remembers where this thread's stack starts (the address of
    // its own parameter), stores the thread number, and begins the recursion.
    void Tprobe(int tid) {
      base = (void *)&tid;
      id = tid;
      stackoverflow(0);
    }
    
    // Disables stdout buffering, then starts four probe threads.
    int main() {
      setbuf(stdout, NULL);
      int remaining = 4;
      while (remaining > 0) {
        create(Tprobe);
        remaining--;
      }
      return 0;
    }

    __attribute__((noinline)) 禁止函数被内联,便于用 objdump 在二进制代码中查看线程局部变量的实现。

    证明线程具有独立堆栈 (以及确定它们的范围)

    https://jyywiki.cn/pages/OS/2022/demos/stack-probe.c

    gcc stack-probe.c -lpthread &&./a.out  | sort -nk 6
    Stack size of T1 >= 0 KB
    Stack size of T2 >= 0 KB
    Stack size of T3 >= 0 KB
    Stack size of T4 >= 0 KB
    Stack size of T1 >= 64 KB
    Stack size of T2 >= 64 KB
    Stack size of T3 >= 64 KB
    Stack size of T4 >= 64 KB
    Stack size of T1 >= 128 KB
    Stack size of T2 >= 128 KB
    Stack size of T3 >= 128 KB
    Stack size of T4 >= 128 KB
    Stack size of T1 >= 192 KB
    Stack size of T2 >= 192 KB
    Stack size of T3 >= 192 KB
    Stack size of T4 >= 192 KB
    Stack size of T1 >= 256 KB
    Stack size of T2 >= 256 KB
    Stack size of T3 >= 256 KB
    Stack size of T4 >= 256 KB
    Stack size of T1 >= 320 KB
    Stack size of T2 >= 320 KB
    Stack size of T3 >= 320 KB
    Stack size of T4 >= 320 KB
    Stack size of T1 >= 384 KB
    Stack size of T2 >= 384 KB
    

      查看程序执行的系统调用

     strace ./a.out
    execve("./a.out", ["./a.out"], 0x7fff6c5d6c40 /* 24 vars */) = 0
    brk(NULL)                               = 0x55a2d5f75000
    arch_prctl(0x3001 /* ARCH_??? */, 0x7fff9e5ee140) = -1 EINVAL (Invalid argument)
    mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa770566000
    access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
    openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
    newfstatat(3, "", {st_mode=S_IFREG|0644, st_size=23743, ...}, AT_EMPTY_PATH) = 0
    mmap(NULL, 23743, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7fa770560000
    close(3)                                = 0
    openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
    read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0P\237\2\0\0\0\0\0"..., 832) = 832
    pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784
    pread64(3, "\4\0\0\0 \0\0\0\5\0\0\0GNU\0\2\0\0\300\4\0\0\0\3\0\0\0\0\0\0\0"..., 48, 848) = 48
    pread64(3, "\4\0\0\0\24\0\0\0\3\0\0\0GNU\0i8\235HZ\227\223\333\350s\360\352,\223\340."..., 68, 896) = 68
    newfstatat(3, "", {st_mode=S_IFREG|0644, st_size=2216304, ...}, AT_EMPTY_PATH) = 0
    pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784
    mmap(NULL, 2260560, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fa770338000
    mmap(0x7fa770360000, 1658880, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x28000) = 0x7fa770360000
    mmap(0x7fa7704f5000, 360448, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1bd000) = 0x7fa7704f5000
    mmap(0x7fa77054d000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x214000) = 0x7fa77054d000
    mmap(0x7fa770553000, 52816, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa770553000
    close(3)                                = 0
    mmap(NULL, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa770335000
    arch_prctl(ARCH_SET_FS, 0x7fa770335740) = 0
    set_tid_address(0x7fa770335a10)         = 18025
    set_robust_list(0x7fa770335a20, 24)     = 0
    rseq(0x7fa7703360e0, 0x20, 0, 0x53053053) = 0
    mprotect(0x7fa77054d000, 16384, PROT_READ) = 0
    mprotect(0x55a2d5573000, 4096, PROT_READ) = 0
    mprotect(0x7fa7705a0000, 8192, PROT_READ) = 0
    prlimit64(0, RLIMIT_STACK, NULL, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0
    munmap(0x7fa770560000, 23743)           = 0
    rt_sigaction(SIGRT_1, {sa_handler=0x7fa7703c98f0, sa_mask=[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x7fa77037a520}, NULL, 8) = 0
    rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
    mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7fa76fb34000
    mprotect(0x7fa76fb35000, 8388608, PROT_READ|PROT_WRITE) = 0
    getrandom("\xa9\x26\x87\x57\xa1\x6a\xdd\xbe", 8, GRND_NONBLOCK) = 8
    brk(NULL)                               = 0x55a2d5f75000
    brk(0x55a2d5f96000)                     = 0x55a2d5f96000
    rt_sigprocmask(SIG_BLOCK, ~[], [], 8)   = 0
    clone3({flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, child_tid=0x7fa770334910, parent_tid=0x7fa770334910, exit_signal=0, stack=0x7fa76fb34000, stack_size=0x7fff00, tls=0x7fa770334640} => {parent_tid=[18026]}, 88) = 18026
    rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
    mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7fa76f333000
    mprotect(0x7fa76f334000, 8388608, PROT_READ|PROT_WRITE) = 0
    rt_sigprocmask(SIG_BLOCK, ~[], [], 8)   = 0
    clone3({flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, child_tid=0x7fa76fb33910, parent_tid=0x7fa76fb33910, exit_signal=0, stack=0x7fa76f333000, stack_size=0x7fff00, tls=0x7fa76fb33640} => {parent_tid=[18027]}, 88) = 18027
    rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
    mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7fa76eb32000
    mprotect(0x7fa76eb33000, 8388608, PROT_READ|PROT_WRITE) = 0
    rt_sigprocmask(SIG_BLOCK, ~[], [], 8)   = 0
    clone3({flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, child_tid=0x7fa76f332910, parent_tid=0x7fa76f332910, exit_signal=0, stack=0x7fa76eb32000, stack_size=0x7fff00, tls=0x7fa76f332640} => {parent_tid=[18028]}, 88) = 18028
    rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
    mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7fa76e331000
    mprotect(0x7fa76e332000, 8388608, PROT_READ|PROT_WRITE) = 0
    rt_sigprocmask(SIG_BLOCK, ~[], [], 8)   = 0
    clone3({flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, child_tid=0x7fa76eb31910, parent_tid=0x7fa76eb31910, exit_signal=0, stack=0x7fa76e331000, stack_size=0x7fff00, tls=0x7fa76eb31640} => {parent_tid=[18029]}, 88) = 18029
    rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
    futex(0x7fa770334910, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, 18026, NULL, FUTEX_BITSET_MATCH_ANYStack size of T3 >= 0 KB
    Stack size of T3 >= 64 KB
    Stack size of T3 >= 128 KB
    Stack size of T3 >= 192 KB
    Stack size of T3 >= 256 KB
    Stack size of T3 >= 320 KB
    

    __attribute__((noinline))

  • 相关阅读:
    指针与数组关联导致的一些现象 分类: H_HISTORY 20130211 20:14 516人阅读 评论(0) 收藏
    宏定义一些内容 分类: H_HISTORY 20130207 23:20 585人阅读 评论(0) 收藏
    使用lstat()判断文件类型 分类: H_HISTORY 20130224 11:48 703人阅读 评论(0) 收藏
    关于VMware虚拟机的上网 分类: C_OHTERS 20130220 14:36 336人阅读 评论(0) 收藏
    Segmentation fault (core dumped) 分类: H_HISTORY 20130206 11:34 18800人阅读 评论(0) 收藏
    C语言内存分配时间 分类: H_HISTORY 20130211 10:51 1432人阅读 评论(3) 收藏
    GTK+与QT的对比 分类: H_HISTORY 20130205 09:27 3101人阅读 评论(0) 收藏
    枚举作为整数 分类: H_HISTORY 20130208 11:22 576人阅读 评论(0) 收藏
    01背包问题,动态规划求解
    求两个字符串的相似度或子串
  • 原文地址:https://www.cnblogs.com/rsapaper/p/16665176.html
Copyright © 2020-2023  润新知