• 反汇编容易反编译难


    最近受同学所托,将5个内嵌了MASM语法汇编代码的C函数翻译成纯C函数,以支持多种CPU指令集(比如x86/x64, arm, sparc, ...)。整个过程充满了艰辛,但也充满了乐趣。作为一个既喜欢C又喜欢汇编的程序员,在废寝忘食之余深深地体会到,“反汇编(disassemble)容易,反编译(decompile)难”。逆向工程实在是太不容易啦!下面给出一个简单点儿的例子予以说明。[P.S. 一肚子的Asm, C和gdb,终于派上了用场:-)]

    o void add16(word *, word *, word n)

     1 typedef unsigned char           byte;  /* 1 byte */
     2 typedef unsigned short          word;  /* 2 bytes */
     3 typedef unsigned int            dword; /* 4 bytes */
     4 typedef unsigned long long      qword; /* 8 bytes */
     5 
     6 void add16(word *a, word *b, word n)
     7 {
     8         word *dstp = a;
     9         word *srcp = b;
    10         word count = n;
    11 
    12         __asm {
    13                 xor  ecx, ecx
    14                 mov  cx, count
    15                 mov  esi, srcp
    16                 mov  edi, dstp
    17                 xor  ebx, ebx
    18 
    19         LOOP01:
    20                 xor  eax, eax
    21                 mov  ax, [esi]
    22                 add  eax, ebx
    23                 xor  edx, edx
    24                 mov  dx, [edi]
    25                 add  eax, edx
    26 
    27                 mov  [edi], ax
    28                 shr  eax, 16
    29                 mov  ebx, eax
    30 
    31                 add  esi, 2
    32                 add  edi, 2
    33                 loop LOOP01
    34 
    35                 add  [edi], bx
    36         }
    37 }

    作为一个对x86汇编熟悉N(>=12) 年的程序员,翻译上面的内嵌代码,花了足足5个小时。 5小时的艰苦历程如下:

    第1步: 将汇编代码摘取出来,用NASM实现 (本人不喜欢Windows编程,所以在Linux上用NASM)

    第2步: 给每一行汇编代码加注释,加完注释后的汇编源文件foo.asm如下:

     1 BITS 32
     2 
     3 SECTION .data
     4 
     5 count:                  equ 0x5
     6 dstp:                   dw  0x1234, 0x3456, 0xffff, 0x789a, 0x9abc, 0x0000
     7 srcp:                   dw  0xabcd, 0xffff, 0xffff, 0x0123, 0x2345, 0x0000
     8 
     9 SECTION .text
    10 
    11 global _start
    12 
    13 _start:
    14         xor  ecx, ecx           ; ecx = 0
    15         mov  cx, count          ; ecx = count
    16         mov  esi, srcp          ; esi = srcp
    17         mov  edi, dstp          ; edi = dstp
    18         xor  ebx, ebx           ; ebx = 0 /* carry */
    19 
    20     LOOP01:
    21         xor  eax, eax           ; eax = 0
    22         mov  ax, [esi]          ;  ax = *esi = *srcp
    23         add  eax, ebx           ; eax += ebx    /* eax += carry */
    24         xor  edx, edx           ; edx = 0
    25         mov  dx, [edi]          ;  dx = *edi = *dstp
    26         add  eax, edx           ; eax += edx;   /* eax += *dstp */
    27 
    28         mov  [edi], ax          ; *edi = ax, i.e. *dstp = ax
    29         shr  eax, 16            ; eax >>= 16
    30         mov  ebx, eax           ; ebx = eax     /* next carry */
    31 
    32         add  esi, 2             ; esi += 2      /* srcp++ */
    33         add  edi, 2             ; edi += 2      /* dstp++ */
    34         loop LOOP01             ; jmp back to LOOP01 until ecx == 0
    35 
    36         add  [edi], bx          ; *edi += bx
    37 
    38 _exit:
    39         mov eax, 1              ; syscall num of exit
    40         mov ebx, 0              ; error code
    41         int 0x80

    第3步: 编译foo.asm,用gdb调试

    $ nasm -f elf32 -g -F stabs foo.asm 
    $ ld -o foo foo.o

    用gdb单步调试的过程比较冗长,这里贴出最简版调试过程,

    $ gdb foo
    GNU gdb (Ubuntu 7.7.1-0ubuntu5~14.04.2) 7.7.1
    ...<snip>....................................
    (gdb)   set disassembly-flavor intel
    (gdb) #
    (gdb)
    (gdb)   disas _start
    Dump of assembler code for function _start:
       0x08048080 <+0>:     xor    ecx,ecx
       0x08048082 <+2>:     mov    cx,0x5
       0x08048086 <+6>:     mov    esi,0x80490cc
       0x0804808b <+11>:    mov    edi,0x80490c0
       0x08048090 <+16>:    xor    ebx,ebx
    End of assembler dump.
    (gdb) #
    (gdb)
    (gdb)   disas LOOP01
    Dump of assembler code for function LOOP01:
       0x08048092 <+0>:     xor    eax,eax
       0x08048094 <+2>:     mov    ax,WORD PTR [esi]
       0x08048097 <+5>:     add    eax,ebx
       0x08048099 <+7>:     xor    edx,edx
       0x0804809b <+9>:     mov    dx,WORD PTR [edi]
       0x0804809e <+12>:    add    eax,edx
       0x080480a0 <+14>:    mov    WORD PTR [edi],ax
       0x080480a3 <+17>:    shr    eax,0x10
       0x080480a6 <+20>:    mov    ebx,eax
       0x080480a8 <+22>:    add    esi,0x2
       0x080480ab <+25>:    add    edi,0x2
       0x080480ae <+28>:    loop   0x8048092 <LOOP01>
       0x080480b0 <+30>:    add    WORD PTR [edi],bx
    End of assembler dump.
    (gdb) #
    (gdb)
    (gdb)   disas _exit
    Dump of assembler code for function _exit:
       0x080480b3 <+0>:     mov    eax,0x1
       0x080480b8 <+5>:     mov    ebx,0x0
       0x080480bd <+10>:    int    0x80
    End of assembler dump.
    (gdb) #
    (gdb)
    (gdb)
    (gdb)
    (gdb)
    (gdb)
    (gdb) set disassembly-flavor intel
    (gdb) display /i $eip
    (gdb) #
    (gdb)
    (gdb) b _start
    Breakpoint 1 at 0x8048080
    (gdb) b _exit
    Breakpoint 2 at 0x80480b3
    (gdb) info b
    Num     Type           Disp Enb Address    What
    1       breakpoint     keep y   0x08048080 <_start>
    2       breakpoint     keep y   0x080480b3 <_exit>
    (gdb) #
    (gdb)
    (gdb) r
    Starting program: /var/tmp/sandbox/fanli/raw/05/cnblog/foo
    
    Breakpoint 1, 0x08048080 in _start ()
    1: x/i $eip
    => 0x8048080 <_start>:  xor    ecx,ecx
    (gdb) #
    (gdb)
    (gdb) ni
    0x08048082 in _start ()
    1: x/i $eip
    => 0x8048082 <_start+2>:        mov    cx,0x5
    (gdb)
    0x08048086 in _start ()
    1: x/i $eip
    => 0x8048086 <_start+6>:        mov    esi,0x80490cc
    (gdb)
    0x0804808b in _start ()
    1: x/i $eip
    => 0x804808b <_start+11>:       mov    edi,0x80490c0
    (gdb)
    0x08048090 in _start ()
    1: x/i $eip
    => 0x8048090 <_start+16>:       xor    ebx,ebx
    (gdb) info r ecx edi esi
    ecx            0x5      5
    edi            0x80490c0        134516928
    esi            0x80490cc        134516940
    (gdb) x /5hx 0x80490c0
    0x80490c0 <dstp>:       0x1234  0x3456  0xffff  0x789a  0x9abc
    (gdb) x /5hx 0x80490cc
    0x80490cc <srcp>:       0xabcd  0xffff  0xffff  0x0123  0x2345
    (gdb) #
    (gdb)
    (gdb) c
    Continuing.
    
    Breakpoint 2, 0x080480b3 in _exit ()
    1: x/i $eip
    => 0x80480b3 <_exit>:   mov    eax,0x1
    (gdb) #
    (gdb)
    (gdb) x /5hx 0x80490c0
    0x80490c0 <dstp>:       0xbe01  0x3455  0xffff  0x79be  0xbe01
    (gdb) x /5hx 0x80490cc
    0x80490cc <srcp>:       0xabcd  0xffff  0xffff  0x0123  0x2345
    (gdb) #
    (gdb)
    (gdb) ni
    0x080480b8 in _exit ()
    1: x/i $eip
    => 0x80480b8 <_exit+5>: mov    ebx,0x0
    (gdb)
    0x080480bd in _exit ()
    1: x/i $eip
    => 0x80480bd <_exit+10>:        int    0x80
    (gdb)
    [Inferior 1 (process 21920) exited normally]
    (gdb)

    从上面的调试过程可以看出,

            a: 0x1234  0x3456  0xffff  0x789a  0x9abc
            b: 0xabcd  0xffff  0xffff  0x0123  0x2345
    After a = a + b is done,
            a: 0xbe01  0x3455  0xffff  0x79be  0xbe01
    
    That is,
            (1) a = 0x9abc789affff34561234
            (2) b = 0x23450123ffffffffabcd
            (3) a += b
            (4) a = 0xbe0179beffff3455be01

    第4步:用Python验证一下上面的计算结果,

    $ python
    Python 2.7.6 (default, Oct 26 2016, 20:32:47)
    ...<snip>....................................
    >>> a = 0x9abc789affff34561234
    >>> b = 0x23450123ffffffffabcd
    >>> a += b
    >>> print "    a = 0x%x" % a
        a = 0xbe0179beffff3455be01

    第5步: 将foo.asm翻译成foo2.c

     1 #include <stdio.h>
     2 
     3 typedef unsigned char       byte;       /* 1 byte */
     4 typedef unsigned short      word;       /* 2 bytes */
     5 typedef unsigned int        dword;      /* 4 bytes */
     6 typedef unsigned long long  qword;      /* 8 bytes */
     7 
     8 void add16(word *a, word *b, word n)
     9 {
    10         word *dstp = a;
    11         word *srcp = b;
    12         word count = n;
    13 
    14         word carry = 0;
    15         for (word i = 0; i < count; i++) {
    16                 word p = *dstp;
    17                 word q = *srcp;
    18 
    19                 dword n = (dword)p + (dword)carry + (dword)q;
    20 
    21                 *dstp = n & 0xFFFF;      // *dstp :  low 16 bits of n
    22                 carry = (word)(n >> 16); // carry : high 16 bits of n
    23 
    24                 srcp++;
    25                 dstp++;
    26         }
    27 
    28         *dstp += carry;
    29 }
    30 
    31 static void dump(word a[], word n)
    32 {
    33         printf("\t%p: ", a);
    34         for (word i = 0; i < n; i++)
    35                 printf("0x%04x ", a[i]);
    35         printf("\n");
    37 }
    38 
    39 int main(int argc, char *argv[])
    40 {
    41         word src[] = {0xabcd, 0xffff, 0xffff, 0x0123, 0x2345, 0x0000};
    42         word dst[] = {0x1234, 0x3456, 0xffff, 0x789a, 0x9abc, 0x0000};
    43         word n   = 0x5;
    44 
    45         dump(dst, sizeof(dst)/sizeof(word));
    46         printf("+\n");
    47         dump(src, sizeof(src)/sizeof(word));
    48 
    49         add16(dst, src, n);
    50 
    51         printf("=\n");
    52         dump(dst, sizeof(dst)/sizeof(word));
    53 
    54         return 0;
    55 }

    o add16()截图

    第6步: 编译foo2.c并运行

    $ gcc -g -Wall -m32 -std=c99 -o foo2 foo2.c
    
    $ ./foo2
            0xbf9c1bc4: 0x1234 0x3456 0xffff 0x789a 0x9abc 0x0000
    +
            0xbf9c1bb8: 0xabcd 0xffff 0xffff 0x0123 0x2345 0x0000
    =
            0xbf9c1bc4: 0xbe01 0x3455 0xffff 0x79be 0xbe01 0x0000

    与foo.asm对应的运算结果做比较,

    0xbf9c1bc4: 0xbe01 0x3455 0xffff 0x79be 0xbe01 0x0000

    二者运算的结果完全一致,由此可见,add16()其实就是做大数加法

    • a = {a[0], a[1], ..., a[N]}, a[i] 为一个word, 占两个字节
    • b = {b[0], b[1], ..., b[N]},  b[i]为一个word, 占两个字节
    • a + b = {a[0]+b[0], a[1]+b[1], ..., a[N]+b[N]}
    • a[i] + b[i] 可能发生进位(carry), 将carry加到a[i+1]的位置上即可

    P.S. 在翻译过程中,我参考了下面两个与转移指令有关的文档。 (如果有兴趣,请阅读)

    1. 汇编语言转移指令规则汇总
    2. 常用汇编指令与标志位关系

    结束语: 上面给出的例子foo.asm只有一重循环,所以翻译成C代码相对简单。但是,如果有多重循环和多次跳转, 那么翻译起来就困难许多。例如:

     1 BITS 32
     2 
     3 SECTION .data
     4 
     5 adp:            dd      0x12345678, 0x9abcdef0, 0xffffffff, 0x9abcdefa, \
     6                         0x00000000
     7 adp_size:       equ     $ - adp
     8 bdp:            dd      0xbfffc061, 0xfedcba99, 0x76543211, 0xfedcba99, \
     9                         0x00000000
    10 bdp_size:       equ     $ - bdp
    11 cdp:            dd      0x00000000, 0x00000000, 0x00000000, 0x00000000, \
    12                         0x00000000, 0x00000000, 0x00000000, 0x00000000, \
    13                         0x00000000, 0x00000000
    14 cdp_size:       equ     $ - cdp
    15 
    16 SECTION .text
    17 
    18 global _start
    19 
    20 _start:
    21         mov  ecx, cdp           ; 001 ecx = cdp,  cdp[] = adp[] * bdp[]
    22         add  ecx, cdp_size      ; 002 ecx += cdp_size
    23 
    24         xor  edi, edi           ; 003 edi = 0
    25 
    26   LOOP1:                        ; 004
    27         mov  ebx, adp           ; 005 ebx = adp
    28         add  ebx, edi           ; 006 ebx += edi
    29         mov  eax, [ebx]         ; 007 eax = *ebx
    30         cmp  eax, 0             ; 008 if (eax == 0)
    31         je   NEXT3              ; 009     goto NEXT3
    32         xor  esi, esi           ; 010 esi = 0
    33         push eax                ; 011 save eax to stack (eax was *ebx = [adp+edi])
    34 
    35   LOOP2:                        ; 012
    36         mov  ebx, bdp           ; 013 ebx = bdp
    37         add  ebx, esi           ; 014 ebx += esi
    38         mov  eax, [ebx]         ; 015 eax = *ebx
    39         cmp  eax, 0             ; 016 if (eax == 0)
    40         je   NEXT2              ; 017     goto NEXT2
    41         pop  edx                ; 018 get edx from stack, which was pushed @ 011
    42         push edx                ; 019 save edx back to stack
    43         mul  edx                ; 020 edxeax = eax * edx
    44         mov  ebx, cdp           ; 021 ebx = cdp
    45         add  ebx, esi           ; 022 ebx += esi;
    46         add  ebx, edi           ; 023 ebx += edi;
    47         push ebx                ; 024 push ebx to stack
    48         add  [ebx], eax         ; 025 *ebx += eax;
    49 
    50   LOOP3:                        ; 026
    51         jnc  NEXT1              ; 027 == jae (>=)
    52         add  ebx, 4             ; 028 ebx += 4
    53         cmp  ebx, ecx           ; 029 if (ebx >= ecx) ; ecx = cdp + cdp_size
    54         jae  NEXT1              ; 030     goto NEXT1
    55         add  dword [ebx], 1     ; 031 *ebx += 1
    56         jmp  LOOP3              ; 032 go back to LOOP3
    57 
    58         NEXT1:                  ; 033 only used by LOOP3
    59             pop  ebx            ; 034 get ebx from stack
    60             add  ebx, 4         ; 035 ebx += 4
    61             add  [ebx], edx     ; 036 *ebx += edx
    62 
    63     LOOP4:                      ; 037
    64         jnc  NEXT2              ; 038 jnc (=jae)
    65         add  ebx, 4             ; 039 ebx += 4
    66         cmp  ebx, ecx           ; 040 if (ebx >= ecx)
    67         jae  NEXT2              ; 041     goto NEXT2
    68         add  dword [ebx], 1     ; 042 *ebx += 1
    69         jmp  LOOP4              ; 043 go back to LOOP4
    70 
    71         NEXT2:                  ; 044
    72             add  esi, 4         ; 045 esi += 4
    73             cmp  esi, bdp_size  ; 046 if (esi < bdp_size)
    74             jb   LOOP2          ; 047     go back to LOOP2
    75             pop  eax            ; 048 get eax from stack
    76 
    77     NEXT3:                      ; 049
    78         add  edi, 4             ; 050 edi += 4
    79         cmp  edi, adp_size      ; 051 if (edi < adp_size)
    80         jb   LOOP1              ; 052     go back to LOOP1
    81 
    82 _exit:
    83         mov eax, 1              ; syscall num of exit
    84         mov ebx, 0              ; error code
    85         int 0x80

    上面的代码一共包括4个LOOP,3个NEXT和10个跳转指令(e.g. jmp, jb, jae, jnc, je),翻译成C代码难度非常大。究其本质,实为做大数乘法。有关其翻译实现后的C代码,请参考前文。 反汇编(将机器码翻译成汇编代码)有现成的工具可用(e.g. gdb, objdump), 所以很容易。反编译(将汇编代码翻译成C代码),国外有收费的软件可以用(e.g. Hex-Rays Decompiler),但是也不能保证100%的正确性。所以,反汇编容易,反编译难,逆向工程很不容易。 (p.s. 有一个学信息工程专业的(但从来没做过程序员的)同学在微信群里用略带不屑的语气说“汇编是一种很古老的语言”,我真的很无语Orz...再高级的语言写的代码,也要变成机器码才能运行不是,似乎汇编是无法绕过的...)

  • 相关阅读:
    即时搜索(input框)
    HTML的页面IE注释
    浅谈attr()和prop()
    input---checked小问题
    鼠标右键事件
    JavaScript 异步开发全攻略(转)
    remove方法
    javascript 数组以及对象的深拷贝(复制数组或复制对象)的方法
    Hexo的更新 主题的更换
    深浅拷贝,原生和JQuery方法实现
  • 原文地址:https://www.cnblogs.com/idorax/p/7127744.html
Copyright © 2020-2023  润新知