Linux线程局部存储 Thread Local Storage

在C/C++程序中,全局变量默认是所有线程共享的,开发者需要处理多线程竞争问题。有些情况下我们需要保证一个线程独享一份数据,其它线程无法访问。典型的就是errno全局变量,它总是会保存当前线程最后一个调用的错误码,不会存在线程冲突。这个时候需要使用线程局部存储(TLS)来解决。

pthread的内存结构

在说明TLS之前,先了解下pthread的内存结构。glibc/nptl/descr.h中定义了线程重要的数据结构struct pthread,它描述了用户态线程的完整信息,每创建一个pthread线程,都在内存中有一个对应的pthread结构体。pthread结构非常复杂,与TLS有关的是specific_1stblock数组和specific二级数组,后面会做说明。

/* Number of key slots in one second-level block of thread-specific data.  */
#define PTHREAD_KEY_2NDLEVEL_SIZE       32
/* Number of second-level blocks needed to cover PTHREAD_KEYS_MAX keys,
   i.e. PTHREAD_KEYS_MAX / PTHREAD_KEY_2NDLEVEL_SIZE rounded up.  */
#define PTHREAD_KEY_1STLEVEL_SIZE \
  ((PTHREAD_KEYS_MAX + PTHREAD_KEY_2NDLEVEL_SIZE - 1) \
   / PTHREAD_KEY_2NDLEVEL_SIZE)

/* Per-thread descriptor (abridged excerpt: each `...' line stands for
   members that were left out).  Only the members relevant to
   thread-specific data are shown in full.  */
struct pthread
{
    /* Anonymous union: whichever `header' variant is selected shares its
       storage with 24 pointer-sized words of padding.  */
    union
  {
#if !TLS_DTV_AT_TP
    /* This overlaps the TCB as used for TLS without threads (see tls.h).  */
    tcbhead_t header;
#else
    struct
    {
      int multiple_threads;
      int gscope_flag;
    } header;
#endif

    void *__padding[24];
  };

  list_t list;
  pid_t tid;

  ...
  /* One slot of thread-specific data: the value stored for a key plus the
     key's sequence number at the time the value was stored.  */
  struct pthread_key_data
  {
    /* Sequence number.  We use uintptr_t to not require padding on
       32- and 64-bit machines.  On 64-bit machines it helps to avoid
       wrapping, too.  */
    uintptr_t seq;

    /* Data pointer.  */
    void *data;
  } specific_1stblock[PTHREAD_KEY_2NDLEVEL_SIZE];
  /* specific_1stblock is embedded in the descriptor itself and holds the
     slots for keys below PTHREAD_KEY_2NDLEVEL_SIZE.  */

  /* Two-level array for the thread-specific data.  */
  /* Each entry points to a block of PTHREAD_KEY_2NDLEVEL_SIZE slots; the
     entry for key K is specific[K / PTHREAD_KEY_2NDLEVEL_SIZE].  */
  struct pthread_key_data *specific[PTHREAD_KEY_1STLEVEL_SIZE];

  /* Flag which is set when specific data is set.  */
  bool specific_used;
  ...
}

__thread

在GCC/Clang编译环境中,可以使用__thread关键字来声明TLS变量。__thread是编译器扩展而不是C标准关键字,不同编译器的写法不同(例如MSVC使用__declspec(thread));C11标准提供了等价的_Thread_local。

在Xcode 13.2上测试仅i386架构不支持__thread

/* Thread-local where the toolchain supports it.  In the Xcode 13.2 test only
   the i386 target rejected __thread, so that target falls back to an ordinary
   static variable. */
#if !defined(__i386__)
static __thread char *g_thread_data = NULL;
#else
static char *g_thread_data = NULL;
#endif

使用__thread关键字声明的变量,存储在pthread结构体与栈空间之间的内存区域。也就是说,从内存布局上看,从高地址到低地址的内存分布是:pthread结构、__thread变量区域、栈区(栈底和__thread变量区顶相接)。

下面以Xcode 13.2/arm64运行的程序来说明这点。

/* Thread-local test variables: each thread sees its own copy, initialised
   to the values given here. */
__thread uint64_t g_tls_int = 6;
__thread char *g_tls_string = "easeapi.com";

/* Copies both thread-local variables into locals and prints them.
   The copies through `value` and `string` are deliberate: they make the
   address lookup and the load of each variable easy to spot in a
   disassembly of this function. */
void tls_test(void)
{
    uint64_t value = g_tls_int;
    /* uint64_t is `unsigned long` on some ABIs; the cast keeps the argument
       type in step with the %llu conversion everywhere. */
    printf("%llu", (unsigned long long)value);
    char *string = g_tls_string;
    printf("%s", string);
}

在tls_test入口处断点,查看对应的汇编程序,如下:

    0x104235240 <+0>:   sub    sp, sp, #0x40             ; =0x40 
    0x104235244 <+4>:   stp    x29, x30, [sp, #0x30]
    0x104235248 <+8>:   add    x29, sp, #0x30            ; =0x30 
    0x10423524c <+12>:  adrp   x0, 529
    0x104235250 <+16>:  add    x0, x0, #0xd70            ; =0xd70 
    0x104235254 <+20>:  ldr    x8, [x0]
    0x104235258 <+24>:  blr    x8
    0x10423525c <+28>:  str    x0, [sp, #0x10]
    0x104235260 <+32>:  adrp   x0, 529
    0x104235264 <+36>:  add    x0, x0, #0xd88            ; =0xd88 
    0x104235268 <+40>:  ldr    x8, [x0]
    0x10423526c <+44>:  blr    x8
    0x104235270 <+48>:  mov    x8, x0
    0x104235274 <+52>:  ldr    x0, [sp, #0x10]
    0x104235278 <+56>:  str    x8, [sp, #0x18]
    0x10423527c <+60>:  ldr    x8, [x0]
    0x104235280 <+64>:  stur   x8, [x29, #-0x8]
    0x104235284 <+68>:  ldur   x8, [x29, #-0x8]
    0x104235288 <+72>:  adrp   x0, 471
    0x10423528c <+76>:  add    x0, x0, #0x7fc            ; =0x7fc 
    0x104235290 <+80>:  mov    x9, sp
    0x104235294 <+84>:  str    x8, [x9]
    0x104235298 <+88>:  bl     0x104403be0               ; symbol stub for: printf
    0x10423529c <+92>:  ldr    x0, [sp, #0x18]
    0x1042352a0 <+96>:  ldr    x8, [x0]
    0x1042352a4 <+100>: stur   x8, [x29, #-0x10]
    0x1042352a8 <+104>: ldur   x8, [x29, #-0x10]
    0x1042352ac <+108>: adrp   x0, 471
    0x1042352b0 <+112>: add    x0, x0, #0x801            ; =0x801 
    0x1042352b4 <+116>: mov    x9, sp
    0x1042352b8 <+120>: str    x8, [x9]
    0x1042352bc <+124>: bl     0x104403be0               ; symbol stub for: printf
    0x1042352c0 <+128>: ldp    x29, x30, [sp, #0x30]
    0x1042352c4 <+132>: add    sp, sp, #0x40             ; =0x40 
    0x1042352c8 <+136>: ret 

0x104235274处,sp寄存器偏移0x10字节读取到x0。在0x104235278处读取x0寄存器的值(g_tls_int):

(lldb) register read x0
      x0 = 0x0000000281cf41a0
(lldb) memory read/1xg 0x0000000281cf41a0
0x281cf41a0: 0x0000000000000006

0x10423529c处,sp寄存器偏移0x18字节读取到x0。在0x1042352a0处读取x0寄存器的值(g_tls_string):

(lldb) register read x0
      x0 = 0x0000000281cf41a8
(lldb) memory read/1xg 0x0000000281cf41a8
0x281cf41a8: 0x000000010440c7f0
(lldb) memory read 0x000000010440c7f0
0x10440c7f0: 65 61 73 65 61 70 69 2e 63 6f 6d 00 25 6c 6c 75  easeapi.com.%llu
0x10440c800: 00 25 73 00 4d 79 41 70 70 6c 69 63 61 74 69 6f  .%s.MyApplicatio

从上面的测试结果来看,每个__thread变量的地址都是先通过blr x8调用取址函数得到,并暂存到栈上(sp+0x10、sp+0x18);之后读取__thread变量时,再通过sp偏移(向高地址偏移)取出该地址并访问对应的内存。

__thread修饰的变量必须是POD(Plain Old Data)类型,不支持class等高级语言特性。__thread变量在线程生命周期一直存在,在线程销毁时释放。需要注意的是,由于__thread并不能指定销毁方法,当我们定义一个__thread修饰的指针变量,并在线程运行中malloc内存后,线程结束仅会将__thread变量指针置NULL,需要开发者手动free内存。

/* Per-thread buffer pointer; NULL until the thread first calls tls_test(). */
__thread char *g_tls_string = NULL;

/* Lazily allocates the calling thread's zero-filled 1024-byte buffer.
   __thread provides no destructor hook, so the buffer has to be released
   by hand before the thread is destroyed. */
void tls_test(void)
{
    if (!g_tls_string) {
        g_tls_string = calloc(1024, 1);
    }
}

如果想要在线程结束时,自动完成malloc内存的释放,需要使用pthread specific相关的API。

pthread specific API

pthread同时提供了以下API实现TLS的功能:

//nptl/bits/pthreadtypes.h
/* Keys for thread-specific data */
typedef unsigned int pthread_key_t;

/* Creates a key and stores it through the first argument.  The second
   argument is an optional (nullable) destructor for the data associated
   with the key. */
int pthread_key_create(pthread_key_t *, void (* _Nullable)(void *));
/* Releases a key previously obtained from pthread_key_create. */
int pthread_key_delete(pthread_key_t);

/* Associates a pointer (which may be null) with the key for the calling
   thread. */
int pthread_setspecific(pthread_key_t , const void * _Nullable);
/* Returns the pointer the calling thread associated with the key; the
   result may be null. */
void* _Nullable pthread_getspecific(pthread_key_t);

pthread_key_create的第一个参数是pthread_key_t指针,用于接收创建成功返回的pthread_key_t,第二个参数是数据析构函数指针,会在线程销毁时执行。pthread_key_create成功后获得pthread_key_t,之后可通过pthread_key_t进行线程私有数据的读写。示例代码如下:

//create key
pthread_key_t key = 0;
pthread_key_create(&key, NULL); 

//write
struct easeapi_struct data;
pthread_setspecific(key, &struct_data);

//read
struct easeapi_struct* = (struct easeapi_struct *)pthread_getspecific(key)

每一个进程都有一个全局数组__pthread_keys来管理pthread_key_t。

//nptl/internaltypes.h:
/* Thread-local data handling.  */
/* One entry of the key table: the allocation state of a key (encoded in
   the parity of `seq') and the destructor registered for it.  */
struct pthread_key_struct
{
  /* Sequence numbers.  Even numbers indicated vacant entries.  Note
     that zero is even.  We use uintptr_t to not require padding on
     32- and 64-bit machines.  On 64-bit machines it helps to avoid
     wrapping, too.  */
  uintptr_t seq;

  /* Destructor for the data.  */
  void (*destr) (void *);
};

//sysdeps/unix/sysv/linux/bits/local_lim.h
/* This is the value this implementation supports.  */
/* It is also the number of entries in the __pthread_keys table.  */
#define PTHREAD_KEYS_MAX 1024

//nptl/pthread_keys.c
/* Table of the key information.  */
/* Indexed by the pthread_key_t value.  */
struct pthread_key_struct __pthread_keys[PTHREAD_KEYS_MAX];

struct pthread_key_struct结构中定义了seq和传入的析构函数的指针。一个程序同时最多可以创建PTHREAD_KEYS_MAX个pthread_key_t。pthread_key_t是全局的,但不同的线程通过pthread_key_t访问读写接口时,实际上操作的是不同的内存。

当执行pthread_key_create时,会从__pthread_keys数组中找到一个没有使用的pthread_key_struct结构,并对其seq加1。返回的pthread_key_t实际上就是这个pthread_key_struct在__pthread_keys数组中的序号。如下代码:

//nptl/pthread_key_create.c:
/* Allocate a thread-specific-data key.

   Scans __pthread_keys from index 0 upwards for the first slot whose
   sequence number passes both KEY_UNUSED and KEY_USABLE, and claims it by
   atomically replacing that sequence number with seq + 1.  On success the
   destructor DESTR is recorded in the slot, the slot index is stored in
   *KEY and 0 is returned.  If no slot could be claimed, EAGAIN is
   returned and *KEY is left untouched.  */
int
___pthread_key_create (pthread_key_t *key, void (*destr) (void *))
{
  /* Find a slot in __pthread_keys which is unused.  */
  for (size_t cnt = 0; cnt < PTHREAD_KEYS_MAX; ++cnt)
    {
      /* Snapshot of the slot's state; the compare-and-exchange below only
         succeeds if the slot still holds this value.  */
      uintptr_t seq = __pthread_keys[cnt].seq;

      /* NOTE(review): KEY_UNUSED and KEY_USABLE are defined outside this
         excerpt; KEY_USABLE presumably rejects sequence numbers that are
         about to wrap -- confirm against nptl/internaltypes.h.  The
         negation below implies the exchange macro yields zero when the
         swap took place.  */
      if (KEY_UNUSED (seq) && KEY_USABLE (seq)
   /* We found an unused slot.  Try to allocate it.  */
   && ! atomic_compare_and_exchange_bool_acq (&__pthread_keys[cnt].seq,
           seq + 1, seq))
 {
   /* Remember the destructor.  */
   __pthread_keys[cnt].destr = destr;

   /* Return the key to the caller.  */
   *key = cnt;

   /* The call succeeded.  */
   return 0;
 }
    }

  /* Every slot was either in use, unusable, or claimed concurrently.  */
  return EAGAIN;
}

当执行pthread_key_delete时,会根据pthread_key_t的序号,从__pthread_keys找到对应的pthread_key_struct,并对其seq加1。如下代码:

//nptl/pthread_key_delete.c
/* Release the key KEY.

   If KEY is below PTHREAD_KEYS_MAX and its slot is currently in use, the
   slot's sequence number is atomically replaced with seq + 1 and 0 is
   returned.  Otherwise (KEY out of range, slot not in use, or the
   exchange did not take place) EINVAL is returned.  Only the sequence
   number is touched here; neither the stored destructor nor any
   per-thread value is modified by this function.  */
int
___pthread_key_delete (pthread_key_t key)
{
  int result = EINVAL;

  if (__glibc_likely (key < PTHREAD_KEYS_MAX))
    {
      /* NOTE(review): the table field is declared uintptr_t but is copied
         into an unsigned int here; on targets where uintptr_t is wider the
         value would be truncated once it exceeds UINT_MAX -- confirm
         whether that can matter in practice.  */
      unsigned int seq = __pthread_keys[key].seq;

      /* The in-use case is marked as the expected one.  */
      if (__builtin_expect (! KEY_UNUSED (seq), 1)
   && ! atomic_compare_and_exchange_bool_acq (&__pthread_keys[key].seq,
           seq + 1, seq))
 /* We deleted a valid key.  */
 result = 0;
    }

  return result;
}

注意这里使用了atomic_compare_and_exchange_bool_acq来保证原子操作。

seq默认为0,无论是pthread_key_create还是pthread_key_delete都是对seq加1。当seq的值是偶数(包括0)时,表示当前pthread_key_struct未被使用,为奇数时表示在使用。

通过pthread_key_create分配pthread_key_t是全局的,但键值关联却是各线程独立的。在struct pthread结构体中有下面的定义:

 /* One slot of thread-specific data: the stored pointer together with a
    sequence number.  */
 struct pthread_key_data
  {
    /* Sequence number.  We use uintptr_t to not require padding on
       32- and 64-bit machines.  On 64-bit machines it helps to avoid
       wrapping, too.  */
    uintptr_t seq;

    /* Data pointer.  */
    void *data;
  } specific_1stblock[PTHREAD_KEY_2NDLEVEL_SIZE];
  /* specific_1stblock provides PTHREAD_KEY_2NDLEVEL_SIZE slots directly
     inside the enclosing structure.  */

  /* Two-level array for the thread-specific data.  */
  /* PTHREAD_KEY_1STLEVEL_SIZE pointers, each to an array of slots.  */
  struct pthread_key_data *specific[PTHREAD_KEY_1STLEVEL_SIZE];

struct pthread_key_data结构定义了当前线程存储TLS数据的指针data,seq和struct pthread_key_struct的seq一样,标识了对应的key是否创建。

specific_1stblock并没有设置和PTHREAD_KEYS_MAX一样的大小,而是设置为PTHREAD_KEY_2NDLEVEL_SIZE(32)大小,这应该是从节省内存的角度设计的,大部分情况下我们并不会使用很多TLS变量。

执行pthread_setspecific时,当pthread_key_t的值(即key在__pthread_keys中的序号)小于PTHREAD_KEY_2NDLEVEL_SIZE,直接使用specific_1stblock数组;当pthread_key_t的值大于等于PTHREAD_KEY_2NDLEVEL_SIZE时,再按需申请内存空间使用specific二级数组,值存储在specific[idx1st][idx2nd].data。

//nptl/pthread_setspecific.c
/* Store VALUE as the calling thread's data for KEY.

   Keys below PTHREAD_KEY_2NDLEVEL_SIZE use the specific_1stblock array of
   the thread descriptor; larger keys use a block reached through the
   `specific' pointer array, which is allocated with calloc the first time
   a non-null value is stored in it.

   Returns 0 on success, EINVAL if KEY is out of range or its table entry
   is reported unused by KEY_UNUSED, and ENOMEM if the block allocation
   fails.  */
int
___pthread_setspecific (pthread_key_t key, const void *value)
{
  struct pthread *self;
  unsigned int idx1st;
  unsigned int idx2nd;
  struct pthread_key_data *level2;
  /* NOTE(review): the sequence fields are declared uintptr_t, while this
     local copy is unsigned int -- confirm the narrowing is harmless on
     targets where uintptr_t is wider.  */
  unsigned int seq;

  self = THREAD_SELF;

  /* Special case access to the first 2nd-level block.  This is the
     usual case.  */
  if (__glibc_likely (key < PTHREAD_KEY_2NDLEVEL_SIZE))
    {
      /* Verify the key is sane.  */
      if (KEY_UNUSED ((seq = __pthread_keys[key].seq)))
 /* Not valid.  */
 return EINVAL;

      level2 = &self->specific_1stblock[key];

      /* Remember that we stored at least one set of data.  */
      /* On this path the flag is only raised for a non-null value.  */
      if (value != NULL)
 THREAD_SETMEM (self, specific_used, true);
    }
  else
    {
      /* The range check comes first so the table is never indexed with an
         out-of-range key.  */
      if (key >= PTHREAD_KEYS_MAX
   || KEY_UNUSED ((seq = __pthread_keys[key].seq)))
 /* Not valid.  */
 return EINVAL;

      /* Split the key into a block index and a slot index in the block.  */
      idx1st = key / PTHREAD_KEY_2NDLEVEL_SIZE;
      idx2nd = key % PTHREAD_KEY_2NDLEVEL_SIZE;

      /* This is the second level array.  Allocate it if necessary.  */
      level2 = THREAD_GETMEM_NC (self, specific, idx1st);
      if (level2 == NULL)
 {
   if (value == NULL)
     /* We don't have to do anything.  The value would in any case
        be NULL.  We can save the memory allocation.  */
     return 0;

   /* calloc zero-fills the block, so every slot in it starts with a
      null data pointer and a zero sequence number.  */
   level2
     = (struct pthread_key_data *) calloc (PTHREAD_KEY_2NDLEVEL_SIZE,
        sizeof (*level2));
   if (level2 == NULL)
     return ENOMEM;

   THREAD_SETMEM_NC (self, specific, idx1st, level2);
 }

      /* Pointer to the right array element.  */
      level2 = &level2[idx2nd];

      /* Remember that we stored at least one set of data.  */
      /* Unlike the first-block path, the flag is raised here even when
         VALUE is null.  */
      THREAD_SETMEM (self, specific_used, true);
    }

  /* Store the data and the sequence number so that we can recognize
     stale data.  */
  level2->seq = seq;
  /* The cast drops the const qualifier; only the pointer is stored, the
     pointed-to object is not copied.  */
  level2->data = (void *) value;

  return 0;
}

有了上面的分析,执行pthread_getspecific的逻辑就比较清晰了。

//nptl/pthread_getspecific.c

/* Return the calling thread's data pointer for KEY.

   NULL is returned when KEY is not below PTHREAD_KEYS_MAX, when the block
   that would hold the slot has not been allocated, when the slot holds
   NULL, or when the slot's recorded sequence number differs from the
   key's current one.  In the last case the slot's data pointer is also
   reset to NULL.  */
void *
___pthread_getspecific (pthread_key_t key)
{
  struct pthread_key_data *data;

  /* Special case access to the first 2nd-level block.  This is the
     usual case.  */
  if (__glibc_likely (key < PTHREAD_KEY_2NDLEVEL_SIZE))
    data = &THREAD_SELF->specific_1stblock[key];
  else
    {
      /* Verify the key is sane.  */
      if (key >= PTHREAD_KEYS_MAX)
 /* Not valid.  */
 return NULL;

      /* Split the key into a block index and a slot index in the block.  */
      unsigned int idx1st = key / PTHREAD_KEY_2NDLEVEL_SIZE;
      unsigned int idx2nd = key % PTHREAD_KEY_2NDLEVEL_SIZE;

      /* If the sequence number doesn't match or the key cannot be defined
  for this thread since the second level array is not allocated
  return NULL, too.  */
      struct pthread_key_data *level2 = THREAD_GETMEM_NC (THREAD_SELF,
         specific, idx1st);
      if (level2 == NULL)
 /* Not allocated, therefore no data.  */
 return NULL;

      /* There is data.  */
      data = &level2[idx2nd];
    }

  /* The key table is only consulted when the slot holds a non-null
     pointer.  */
  void *result = data->data;
  if (result != NULL)
    {
      uintptr_t seq = data->seq;

      /* A mismatch means the slot's value was stored under a different
         sequence number than the key has now: discard it and report
         NULL.  */
      if (__glibc_unlikely (seq != __pthread_keys[key].seq))
 result = data->data = NULL;
    }

  return result;
}

按照glibc的实现,当执行pthread_key_create获取的pthread_key_t应该是比较小的值才能优先使用specific_1stblock数组。但笔者在macOS环境测试发现获取的pthread_key_t比较大,这里应该是macOS具体的实现有和glibc不一致的地方?

__thread和pthread specific API对比

  • 存储区域/寻址方式不同

pthread specific API定义的数据,是通过struct pthread结构体的specific_1stblock数组和specific二级数组寻址,而__thread变量则是通过fp寄存器偏移寻址。

  • 性能/效率不同

由于__thread是通过fp寄存器偏移寻址,性能比pthread specific API高。

  • 能存储的数据不同

__thread只能修饰POD类型变量,对于指针类型的数据,有申请内存时需要手动销毁;而pthread specific API支持传入销毁方法,支持所有数据类型。

  • 支持的数据个数不同

理论上只要栈不被占满,__thread可以无限定义(存疑?);而pthread specific API只能创建PTHREAD_KEYS_MAX个key,但可以通过结构体等的方式,使用一个key存储多个值。

参考:glibc源码

其它文章

如何正确地获取线程ID?