STL - 33.1 std::string - 《初学者逆向工程（Reverse Engineering for Beginners）》

33.1 std::string

33.1 std::string

内部实现

许多string库在实现时使用的结构体都包含一个指向字符串缓冲区的指针、一个保存当前字符串长度的变量，以及一个保存当前缓冲区大小的变量。为了能够把缓冲区指针直接传递给处理以0结尾的ASCII字符串的函数，缓冲区中的字符串通常以0结尾。C++标准并没有规定std::string必须如何实现，不过通常都是按照上述方式实现的。另外，std::string并不是一个普通的类，而是类模板std::basic_string的实例（std::basic_string<char>），这样设计是为了能够支持不同的字符类型，如char、wchar_t等。

对于std::string，MSVC和GCC中的内部实现存在差异，下面依次进行说明

MSVC

在MSVC的实现中，如果字符串长度小于16个字符，字符串会就地存储在对象内部的缓冲区中，而不是存放在指针所指向的缓冲区里。这意味着短字符串在32位环境下至少占用16+4+4=24字节，在64位环境下至少占用16+8+8=32字节；当字符串长度达到或超过16个字符时，还要再加上堆中字符串缓冲区自身所占的空间。

#include <string>
#include <stdio.h>
// Hand-written mirror of the std::string object layout that this MSVC example
// inspects; dump_std_string() casts the address of a real std::string to it.
struct std_string
{
    // Both members start at offset 0 of the object and share the same 16 bytes.
    union
    {
        char buf[16]; // characters kept inside the object itself
        char* ptr; // pointer to a separately allocated character buffer
    } u;
    size_t size; // AKA 'Mysize' in MSVC
    size_t capacity; // AKA 'Myres' in MSVC
};
void dump_std_string(std::string s)
{
    struct std_string *p=(struct std_string*)&s;
    printf ("[%s] size:%d capacity:%d\n", p->size>16 ? p->u.ptr : p->u.buf, p->size, p->
    capacity);
};
// Demo driver: dumps a short and a long string, then hands the std::string
// objects to printf() without calling c_str().
int main()
{
    std::string s1="short string";
    std::string s2="string longer that 16 bytes";
    dump_std_string(s1);
    dump_std_string(s2);
    // that works without using c_str()
    // Deliberate misuse kept for demonstration: the address of the object is
    // passed where %s expects a pointer to characters.
    printf ("%s\n", &s1);
    // Deliberate misuse kept for demonstration: the object itself is passed
    // through the variadic argument list.
    printf ("%s\n", s2);
};

通过源代码可以清楚地看到这一点。如果字符串长度小于16个字符，存储字符的缓冲区就不需要在堆上分配。这种做法非常实用，因为实际程序中大量的字符串确实都很短。显然，微软的开发人员认为16个字符是一个合适的临界点。在main函数的末尾，虽然没有使用c_str()方法，但编译运行上面的代码后，两个字符串仍然都会打印在控制台上。对于第一个字符串（长度小于16个字符），存储字符串的缓冲区位于std::string对象的起始位置，printf函数把传入的对象地址当作指向以0结尾的字符数组的指针，因此上述代码可以正常运行。第二个字符串（超过16个字符）的打印方式更加危险：忘记写c_str()是程序员常犯的错误。这种错误可能在很长一段时间内都不会引起注意，直到某一天出现了一个很长的字符串，程序才会崩溃。而上述代码之所以能够工作，是因为指向字符串缓冲区的指针恰好位于结构体的起始位置。

GCC

GCC的实现中还多了一个引用计数变量。一个有趣的事实是：std::string对象中保存的指针并不指向这个结构体的起始位置，而是指向紧跟在结构体之后的字符缓冲区。在libstdc++-v3/include/bits/basic_string.h中我们可以看到，这样做主要是为了方便调试：

The reason you want _M_data pointing to the character %array and not the _Rep is so that the debugger can see the string contents. (Probably we should add a non-inline member to get the _Rep for the debugger to use, so users can check the actual string length.)

在我的例子中将考虑这一点：

#include <string>
#include <stdio.h>
// Hand-written mirror of the bookkeeping header that this GCC example reads;
// dump_std_string() expects it to sit immediately before the characters.
// NOTE(review): this matches the reference-counted layout described in the
// surrounding text - confirm it against the libstdc++ version in use.
struct std_string
{
    size_t length; // printed as "size" by dump_std_string()
    size_t capacity; // printed as "capacity" by dump_std_string()
    size_t refcount; // reference counter; not read by this example
};
// Prints the text, length and capacity of a std::string by reading the
// pointer stored in the object and the header located just before the
// characters it points to (reference-counted GCC layout only).
//
// s: the string to inspect; it is taken by value, so the copy is what gets
//    examined.
void dump_std_string(std::string s)
{
    // The first pointer-sized word of the object is read as the character
    // pointer.
    char *p1=*(char**)&s; // GCC type checking workaround
    // Step back over the header to reach the length/capacity fields.
    struct std_string *p2=(struct std_string*)(p1-sizeof(struct std_string));
    // The fields are size_t, so they are printed with %zu; %d expects an int
    // and is not a valid conversion for size_t on 64-bit targets.
    printf ("[%s] size:%zu capacity:%zu\n", p1, p2->length, p2->capacity);
}
// Demo driver: dumps a short and a long string, then prints both without
// c_str() by reading the character pointer straight out of each object.
int main()
{
    std::string s1="short string";
    std::string s2="string longer that 16 bytes";
    dump_std_string(s1);
    dump_std_string(s2);
    // GCC type checking workaround:
    // the cast reinterprets the start of each object as a char* so that the
    // call compiles; this is deliberate and kept for demonstration.
    printf ("%s\n", *(char**)&s1);
    printf ("%s\n", *(char**)&s2);
};

由于GCC的类型检查更严格，这里需要借助一些技巧（强制类型转换）才能重现前面提到的那种错误；尽管如此，即使不使用c_str()，printf在这里同样能够正常工作。

更复杂的例子

#include <string>
#include <stdio.h>
// Builds two strings, concatenates them with operator+ and prints the result.
// The assembly listings that follow are the compiler output for this function.
int main()
{
    std::string s1="Hello, ";
    std::string s2="world!\n";
    // s3 is a new string holding the characters of s1 followed by those of s2.
    std::string s3=s1+s2;
    // c_str() supplies the zero-terminated character pointer that %s expects.
    printf ("%s\n", s3.c_str());
}

; MSVC output for the example above: string constants first, then main().
$SG39512 DB ’Hello, ’, 00H
$SG39514 DB ’world!’, 0aH, 00H
$SG39581 DB ’%s’, 0aH, 00H
; Frame offsets of the three 24-byte std::string objects.
_s2$ = -72 ; size = 24
_s3$ = -48 ; size = 24
_s1$ = -24 ; size = 24
_main PROC
; Reserve 72 bytes: room for the three 24-byte objects.
sub esp, 72 ; 00000048H
; --- s1: pass the length (7) and the address of the first literal ---
push 7
push OFFSET $SG39512
; ecx = address of s1 (the object starts at _s1$[esp+80] after two pushes).
lea ecx, DWORD PTR _s1$[esp+80]
; Set up an empty string: dword at +20 (capacity) = 15, dword at +16
; (size) = 0, first byte of the in-object buffer = 0.
mov DWORD PTR _s1$[esp+100], 15 ; 0000000fH
mov DWORD PTR _s1$[esp+96], 0
mov BYTE PTR _s1$[esp+80], 0
call ?assign@?$basic_string@DU?$char_traits@D@std@@V?
$allocator@D@2@@std@@QAEAAV12@PBDI@Z ; std::basic_string<char,std::char_traits<char>,std::
allocator<char> >::assign
; --- s2: same sequence with the second literal ---
push 7
push OFFSET $SG39514
lea ecx, DWORD PTR _s2$[esp+80]
mov DWORD PTR _s2$[esp+100], 15 ; 0000000fH
mov DWORD PTR _s2$[esp+96], 0
mov BYTE PTR _s2$[esp+80], 0
call ?assign@?$basic_string@DU?$char_traits@D@std@@V?
$allocator@D@2@@std@@QAEAAV12@PBDI@Z ; std::basic_string<char,std::char_traits<char>,std::
allocator<char> >::assign
; --- s3 = s1 + s2: push the addresses of s2, s1 and s3. The displacement
; grows by 4 each time only because every push moves esp. ---
lea eax, DWORD PTR _s2$[esp+72]
push eax
lea eax, DWORD PTR _s1$[esp+76]
push eax
lea eax, DWORD PTR _s3$[esp+80]
push eax
call ??$?HDU?$char_traits@D@std@@V?$allocator@D@1@@std@@YA?AV?$basic_string@DU?
$char_traits@D@std@@V?$allocator@D@2@@0@ABV10@0@Z ; std::operator+<char,std::char_traits<char
>,std::allocator<char> >
; inlined c_str() method:
; Compare the dword at +20 of s3 (capacity) with 16. eax starts as the
; address of the in-object buffer and is replaced by the pointer stored at
; the start of the object when the capacity is 16 or more.
cmp DWORD PTR _s3$[esp+104], 16 ; 00000010H
lea eax, DWORD PTR _s3$[esp+84]
cmovae eax, DWORD PTR _s3$[esp+84]
push eax
push OFFSET $SG39581
call _printf
; Drop the five dwords pushed since the operator+ arguments.
add esp, 20 ; 00000014H
; --- cleanup of s3: free the buffer only if the capacity is 16 or more ---
cmp DWORD PTR _s3$[esp+92], 16 ; 00000010H
jb SHORT $LN119@main
push DWORD PTR _s3$[esp+72]
call ??3@YAXPAX@Z ; operator delete
add esp, 4
$LN119@main:
; --- cleanup of s2; the three mov instructions put s3 back into the empty
; state (capacity 15, size 0, first byte 0) and leave the flags of the
; cmp untouched for the jb below ---
cmp DWORD PTR _s2$[esp+92], 16 ; 00000010H
mov DWORD PTR _s3$[esp+92], 15 ; 0000000fH
mov DWORD PTR _s3$[esp+88], 0
mov BYTE PTR _s3$[esp+72], 0
jb SHORT $LN151@main
push DWORD PTR _s2$[esp+72]
call ??3@YAXPAX@Z ; operator delete
add esp, 4
$LN151@main:
; --- cleanup of s1; s2 is put back into the empty state the same way ---
cmp DWORD PTR _s1$[esp+92], 16 ; 00000010H
mov DWORD PTR _s2$[esp+92], 15 ; 0000000fH
mov DWORD PTR _s2$[esp+88], 0
mov BYTE PTR _s2$[esp+72], 0
jb SHORT $LN195@main
push DWORD PTR _s1$[esp+72]
call ??3@YAXPAX@Z ; operator delete
add esp, 4
$LN195@main:
; Return 0 and release the 72-byte frame.
xor eax, eax
add esp, 72 ; 00000048H
ret 0
_main ENDP

编译器并不会静态地构造string对象：既然缓冲区有可能需要位于堆中，静态构造本来也无法做到。以0结尾的ASCII字符串通常存储在数据节中，到了运行时，再通过assign方法完成s1和s2两个string对象的构造；s3则通过+操作符完成构造。可以注意到，上述代码中并没有对c_str()方法的调用，这是因为该函数非常小，编译器把它内联了：如果字符串短于16个字符，eax寄存器中保存的是指向对象内部缓冲区的指针，否则保存的是指向堆中字符串缓冲区的指针。接下来，不要忘了这三个对象还需要销毁：如果字符串长度达到或超过16个字符，就需要调用operator delete释放堆中的缓冲区；否则，由于三个std::string对象本身都存储在栈中，当函数结束时，它们会被自动释放。由此可以得到一个结论：短的字符串对象处理起来更快，因为堆访问操作较少。 GCC生成的代码甚至更简单（正如我之前提到的，GCC并不将短的字符串存储在结构体中）：

; GCC output for the same example: string constants first, then main().
.LC0:
.string "Hello, "
.LC1:
.string "world!\n"
main:
; Prologue: save the callee-saved registers, align the stack, reserve 32 bytes.
push ebp
mov ebp, esp
push edi
push esi
push ebx
and esp, -16
sub esp, 32
; Three pointer-sized stack slots are used below: edi = esp+20 (s1),
; esi = esp+24 (s2) and ebx = esp+28 (s3, also passed as the allocator
; argument of the two constructors).
lea ebx, [esp+28]
lea edi, [esp+20]
mov DWORD PTR [esp+8], ebx
lea esi, [esp+24]
; s1: std::string::string(char const*, std::allocator<char> const&)
mov DWORD PTR [esp+4], OFFSET FLAT:.LC0
mov DWORD PTR [esp], edi
call _ZNSsC1EPKcRKSaIcE
; s2: the same constructor with the second literal
mov DWORD PTR [esp+8], ebx
mov DWORD PTR [esp+4], OFFSET FLAT:.LC1
mov DWORD PTR [esp], esi
call _ZNSsC1EPKcRKSaIcE
; s3: std::string::string(std::string const&), a copy of s1 ...
mov DWORD PTR [esp+4], edi
mov DWORD PTR [esp], ebx
call _ZNSsC1ERKSs
; ... followed by std::string::append(std::string const&) with s2
mov DWORD PTR [esp+4], esi
mov DWORD PTR [esp], ebx
call _ZNSs6appendERKSs
; inlined c_str():
; The pointer stored in s3 is loaded and passed directly; the printf() call
; of the source appears here as a call to puts.
mov eax, DWORD PTR [esp+28]
mov DWORD PTR [esp], eax
call puts
; Cleanup of s3: load the pointer stored in the object, step back 12 bytes
; and pass that address to
; std::string::_Rep::_M_dispose(std::allocator<char> const&); esp+19 is
; passed as the allocator argument.
mov eax, DWORD PTR [esp+28]
lea ebx, [esp+19]
mov DWORD PTR [esp+4], ebx
sub eax, 12
mov DWORD PTR [esp], eax
call _ZNSs4_Rep10_M_disposeERKSaIcE
; Cleanup of s2, same pattern.
mov eax, DWORD PTR [esp+24]
mov DWORD PTR [esp+4], ebx
sub eax, 12
mov DWORD PTR [esp], eax
call _ZNSs4_Rep10_M_disposeERKSaIcE
; Cleanup of s1, same pattern.
mov eax, DWORD PTR [esp+20]
mov DWORD PTR [esp+4], ebx
sub eax, 12
mov DWORD PTR [esp], eax
call _ZNSs4_Rep10_M_disposeERKSaIcE
; Epilogue: restore the saved registers and return 0.
lea esp, [ebp-12]
xor eax, eax
pop ebx
pop esi
pop edi
pop ebp
ret

可以看到，传递给析构函数的并不是指向对象的指针，而是比对象中保存的字符串缓冲区地址靠前12个字节（即3个字）的地址，也就是结构体真正的起始位置。