Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 55 additions & 5 deletions TSRM/TSRM.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ typedef struct {
ts_allocate_ctor ctor;
ts_allocate_dtor dtor;
size_t fast_offset;
/* When set, storage comes from __thread memory instead of being allocated by TSRM. */
void *(*tls_addr)(void);
int done;
} tsrm_resource_type;

Expand Down Expand Up @@ -163,14 +165,19 @@ TSRM_API bool tsrm_startup(int expected_threads, int expected_resources, int deb

static void ts_free_resources(tsrm_tls_entry *thread_resources)
{
bool own_thread = thread_resources->thread_id == tsrm_thread_id();
Comment thread
henderkes marked this conversation as resolved.

/* Need to destroy in reverse order to respect dependencies. */
for (int i = thread_resources->count - 1; i >= 0; i--) {
if (!resource_types_table[i].done) {
if (resource_types_table[i].tls_addr && !own_thread) {
continue;
}
if (resource_types_table[i].dtor) {
resource_types_table[i].dtor(thread_resources->storage[i]);
}

if (!resource_types_table[i].fast_offset) {
if (!resource_types_table[i].fast_offset && !resource_types_table[i].tls_addr) {

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can't manually free __thread storage

free(thread_resources->storage[i]);
}
}
Expand Down Expand Up @@ -256,7 +263,10 @@ static void tsrm_update_active_threads(void)

p->storage = (void *) realloc(p->storage, sizeof(void *)*id_count);
for (j=p->count; j<id_count; j++) {
if (resource_types_table[j].fast_offset) {
if (resource_types_table[j].tls_addr) {
TSRM_ASSERT(p->thread_id == tsrm_thread_id());
p->storage[j] = resource_types_table[j].tls_addr();
} else if (resource_types_table[j].fast_offset) {
p->storage[j] = (void *) (((char*)p) + resource_types_table[j].fast_offset);
} else {
p->storage[j] = (void *) malloc(resource_types_table[j].size);
Expand Down Expand Up @@ -301,6 +311,7 @@ TSRM_API ts_rsrc_id ts_allocate_id(ts_rsrc_id *rsrc_id, size_t size, ts_allocate
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = 0;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = NULL;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0;

tsrm_update_active_threads();
Expand Down Expand Up @@ -359,6 +370,7 @@ TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, siz
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = *offset;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = NULL;
resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0;

tsrm_update_active_threads();
Expand All @@ -368,6 +380,41 @@ TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, siz
return *rsrc_id;
}/*}}}*/

/**
 * Allocates a resource id whose per-thread storage is a native __thread block
 * rather than memory allocated by TSRM.
 *
 * rsrc_id   receives the new (shuffled) resource id; set to 0 on failure
 * tls_addr  callback recorded in the resource type's tls_addr slot; it is
 *           invoked to obtain the storage address instead of malloc()
 * size      size of the resource in bytes, recorded in the type table
 * ctor      constructor recorded for the resource type (may be NULL)
 * dtor      destructor recorded for the resource type (may be NULL)
 *
 * Returns the new resource id, or 0 if the resource type table could not be
 * grown. The whole registration runs under tsmm_mutex.
 */
TSRM_API ts_rsrc_id ts_allocate_tls_id(ts_rsrc_id *rsrc_id, void *(*tls_addr)(void), size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor)
{/*{{{*/
	/* size is a size_t, so it has to be formatted with %zu, not %d. */
	TSRM_ERROR((TSRM_ERROR_LEVEL_CORE, "Obtaining a new TLS resource id, %zu bytes", size));

	tsrm_mutex_lock(tsmm_mutex);

	/* obtain a resource id */
	*rsrc_id = TSRM_SHUFFLE_RSRC_ID(id_count++);

	/* grow the resource type table when the new id does not fit */
	if (resource_types_table_size < id_count) {
		tsrm_resource_type *_tmp;
		_tmp = (tsrm_resource_type *) realloc(resource_types_table, sizeof(tsrm_resource_type)*id_count);
		if (!_tmp) {
			TSRM_ERROR((TSRM_ERROR_LEVEL_ERROR, "Unable to allocate storage for resource"));
			/* Hand the id back: the table was not grown, so leaving id_count
			 * incremented would let loops bounded by id_count index past the
			 * end of resource_types_table. */
			id_count--;
			*rsrc_id = 0;
			tsrm_mutex_unlock(tsmm_mutex);
			return 0;
		}
		resource_types_table = _tmp;
		resource_types_table_size = id_count;
	}

	/* Take the slot address only after the possible realloc() above. */
	tsrm_resource_type *type = &resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)];
	type->size = size;
	type->ctor = ctor;
	type->dtor = dtor;
	type->fast_offset = 0;      /* not a reserved-space ("fast") resource */
	type->tls_addr = tls_addr;  /* marks the storage as __thread-backed */
	type->done = 0;

	tsrm_update_active_threads();
	tsrm_mutex_unlock(tsmm_mutex);

	TSRM_ERROR((TSRM_ERROR_LEVEL_CORE, "Successfully allocated new TLS resource id %d", *rsrc_id));
	return *rsrc_id;
}/*}}}*/

static void set_thread_local_storage_resource_to(tsrm_tls_entry *thread_resource)
{
tsrm_tls_set(thread_resource);
Expand Down Expand Up @@ -397,7 +444,9 @@ static void allocate_new_resource(tsrm_tls_entry **thread_resources_ptr, THREAD_
if (resource_types_table[i].done) {
(*thread_resources_ptr)->storage[i] = NULL;
} else {
if (resource_types_table[i].fast_offset) {
if (resource_types_table[i].tls_addr) {
(*thread_resources_ptr)->storage[i] = resource_types_table[i].tls_addr();
} else if (resource_types_table[i].fast_offset) {
(*thread_resources_ptr)->storage[i] = (void *) (((char*)(*thread_resources_ptr)) + resource_types_table[i].fast_offset);
} else {
(*thread_resources_ptr)->storage[i] = (void *) malloc(resource_types_table[i].size);
Expand Down Expand Up @@ -485,7 +534,8 @@ TSRM_API void *ts_resource_ex(ts_rsrc_id id, THREAD_T *th_id)
/* In case that extensions don't use the pointer passed from the dtor, but incorrectly
* use the global pointer, we need to setup the global pointer temporarily here. */
set_thread_local_storage_resource_to(thread_resources);
/* Free up the old resource from the old thread instance */
/* Dead thread, recycled id: already freed, so just zero it. */
thread_resources->thread_id = 0;
ts_free_resources(thread_resources);
free(thread_resources);
/* Allocate a new resource at the same point in the linked list, and relink the next pointer */
Expand Down Expand Up @@ -559,7 +609,7 @@ void ts_free_id(ts_rsrc_id id)
if (resource_types_table[rsrc_id].dtor) {
resource_types_table[rsrc_id].dtor(p->storage[rsrc_id]);
}
if (!resource_types_table[rsrc_id].fast_offset) {
if (!resource_types_table[rsrc_id].fast_offset && !resource_types_table[rsrc_id].tls_addr) {
free(p->storage[rsrc_id]);
}
}
Expand Down
4 changes: 3 additions & 1 deletion TSRM/TSRM.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ TSRM_API ts_rsrc_id ts_allocate_id(ts_rsrc_id *rsrc_id, size_t size, ts_allocate
/* Fast resource in reserved (pre-allocated) space */
TSRM_API void tsrm_reserve(size_t size);
TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor);
/* Must be called at startup before any other thread exists. */
TSRM_API ts_rsrc_id ts_allocate_tls_id(ts_rsrc_id *rsrc_id, void *(*tls_addr)(void), size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor);

/* fetches the requested resource for the current thread */
TSRM_API void *ts_resource_ex(ts_rsrc_id id, THREAD_T *th_id);
Expand Down Expand Up @@ -155,7 +157,7 @@ TSRM_API bool tsrm_is_managed_thread(void);
#if !__has_attribute(tls_model) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__MUSL__) || defined(__HAIKU__)
# define TSRM_TLS_MODEL_ATTR
# define TSRM_TLS_MODEL_DEFAULT
#elif __PIC__
#elif __PIC__ && !defined(__PIE__)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A PIE program can use local-exec if it's the main executable. Only shared libraries (embed, extensions) need to fall back to initial-exec.

This alone would already be a small speedup (one fewer instruction per access)

# define TSRM_TLS_MODEL_ATTR __attribute__((tls_model("initial-exec")))
# define TSRM_TLS_MODEL_INITIAL_EXEC
#else
Expand Down
17 changes: 17 additions & 0 deletions Zend/Zend.m4
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,23 @@ AC_MSG_RESULT([$ZEND_ZTS])
AS_VAR_IF([ZEND_ZTS], [yes], [
AC_DEFINE([ZTS], [1], [Define to 1 if thread safety (ZTS) is enabled.])
AS_VAR_APPEND([CFLAGS], [" -DZTS"])

AC_CACHE_CHECK([for __thread support], [php_cv_have_thread_local], [
Comment thread
henderkes marked this conversation as resolved.
Outdated
AC_LINK_IFELSE([AC_LANG_PROGRAM(
[[static __thread int tls_var;]],
[[tls_var = 1; return tls_var;]])],
[php_cv_have_thread_local=yes], [php_cv_have_thread_local=no])
])
AS_VAR_IF([php_cv_have_thread_local], [yes], [
AC_DEFINE([ZEND_EG_TLS], [1],
[Define to hold EG()/CG() in a __thread variable under ZTS.])
AS_VAR_APPEND([CFLAGS], [" -DZEND_EG_TLS"])

dnl -mtls-size=12 drops the dead high-bits offset add from TLS access,
dnl valid while the thread-local block stays under 4 KiB.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would produce linker errors if the TLS size exceeded 4 KiB, but I did a test static build with 100 extensions statically compiled in (what a terrible idea) and the TLS size ended up at 3.7 KiB. I can't think of a way to test whether a link would succeed before compiling, so this is unconditional.

AX_CHECK_COMPILE_FLAG([-mtls-size=12],
[AS_VAR_APPEND([CFLAGS], [" -mtls-size=12"])])
])
])

AC_MSG_CHECKING([whether to enable Zend debugging])
Expand Down
13 changes: 13 additions & 0 deletions Zend/zend.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ ZEND_API int compiler_globals_id;
ZEND_API int executor_globals_id;
ZEND_API size_t compiler_globals_offset;
ZEND_API size_t executor_globals_offset;
# ifdef ZEND_EG_TLS
ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_executor_globals executor_globals_tls;
ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_compiler_globals compiler_globals_tls;

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly these could be embedded in the main _tsrm_ls_cache (TSRMLS_MAIN_CACHE_DEFINE):

ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR struct {
    void *cache;
    zend_executor_globals eg;
    zend_compiler_globals cg;
} _tsrm_ls_cache;

This would simplify JIT changes as we can access eg/cg at an offset from jit->tls.

Also the compiler may generate better code when both EG and CG are used in the same function.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a great idea, I'll give it a spin.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at it, it introduces tsrm macro changes in many places that assume _tsrm_ls_cache is a void*. Could deref address of .cache to keep callers as void* consumers without pulling in the whole struct.

It simplifies the jit a lot though and leaves the ability to move more things later, so I'll probably do it later.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll need to redo the JIT by hand; the LLM missed too many crucial parts and happily regressed when initial-exec wasn't available (i.e. on macOS and Windows).

/* Address callbacks handed to ts_allocate_tls_id. Taking the address inside a
 * function makes every calling thread resolve its own __thread block; a plain
 * &..._tls evaluated once at registration would only ever yield the
 * registering thread's address. */
static void *executor_globals_tls_addr(void)
{
	return &executor_globals_tls;
}

static void *compiler_globals_tls_addr(void)
{
	return &compiler_globals_tls;
}
# endif
static HashTable *global_function_table = NULL;
static HashTable *global_class_table = NULL;
static HashTable *global_constants_table = NULL;
Expand Down Expand Up @@ -1019,8 +1027,13 @@ void zend_startup(zend_utility_functions *utility_functions) /* {{{ */
zend_init_rsrc_list_dtors();

#ifdef ZTS
#ifdef ZEND_EG_TLS
ts_allocate_tls_id(&compiler_globals_id, compiler_globals_tls_addr, sizeof(zend_compiler_globals), (ts_allocate_ctor) compiler_globals_ctor, (ts_allocate_dtor) compiler_globals_dtor);
ts_allocate_tls_id(&executor_globals_id, executor_globals_tls_addr, sizeof(zend_executor_globals), (ts_allocate_ctor) executor_globals_ctor, (ts_allocate_dtor) executor_globals_dtor);
#else
ts_allocate_fast_id(&compiler_globals_id, &compiler_globals_offset, sizeof(zend_compiler_globals), (ts_allocate_ctor) compiler_globals_ctor, (ts_allocate_dtor) compiler_globals_dtor);
ts_allocate_fast_id(&executor_globals_id, &executor_globals_offset, sizeof(zend_executor_globals), (ts_allocate_ctor) executor_globals_ctor, (ts_allocate_dtor) executor_globals_dtor);
#endif
ts_allocate_fast_id(&language_scanner_globals_id, &language_scanner_globals_offset, sizeof(zend_php_scanner_globals), (ts_allocate_ctor) php_scanner_globals_ctor, NULL);
ts_allocate_fast_id(&ini_scanner_globals_id, &ini_scanner_globals_offset, sizeof(zend_ini_scanner_globals), (ts_allocate_ctor) ini_scanner_globals_ctor, NULL);
compiler_globals = ts_resource(compiler_globals_id);
Expand Down
14 changes: 12 additions & 2 deletions Zend/zend_globals_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,12 @@ BEGIN_EXTERN_C()

/* Compiler */
#ifdef ZTS
# define CG(v) ZEND_TSRMG_FAST(compiler_globals_offset, zend_compiler_globals *, v)
# ifdef ZEND_EG_TLS
extern ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_compiler_globals compiler_globals_tls;
# define CG(v) (compiler_globals_tls.v)
# else
# define CG(v) ZEND_TSRMG_FAST(compiler_globals_offset, zend_compiler_globals *, v)
# endif
#else
# define CG(v) (compiler_globals.v)
extern ZEND_API struct _zend_compiler_globals compiler_globals;
Expand All @@ -40,7 +45,12 @@ ZEND_API int zendparse(void);

/* Executor */
#ifdef ZTS
# define EG(v) ZEND_TSRMG_FAST(executor_globals_offset, zend_executor_globals *, v)
# ifdef ZEND_EG_TLS
extern ZEND_API TSRM_TLS TSRM_TLS_MODEL_ATTR zend_executor_globals executor_globals_tls;
# define EG(v) (executor_globals_tls.v)
# else
# define EG(v) ZEND_TSRMG_FAST(executor_globals_offset, zend_executor_globals *, v)
# endif
#else
# define EG(v) (executor_globals.v)
extern ZEND_API zend_executor_globals executor_globals;
Expand Down
6 changes: 5 additions & 1 deletion ext/opcache/jit/ir/ir_aarch64.dasc
Original file line number Diff line number Diff line change
Expand Up @@ -5868,8 +5868,12 @@ static void ir_emit_tls(ir_ctx *ctx, ir_ref def, ir_insn *insn)
| ldr Rx(reg), [Rx(reg), #insn->op3]
|| }
||# else
|| /* op2 == 0 with no index requests the bare thread pointer (used to form
|| * &EG/&CG with an add); a real TLS var never sits at tprel offset 0. */
|| if (insn->op2 != 0 || insn->op3 != IR_NULL) {
||//??? IR_ASSERT(insn->op2 <= LDR_STR_PIMM64);
| ldr Rx(reg), [Rx(reg), #insn->op2]
| ldr Rx(reg), [Rx(reg), #insn->op2]
|| }
||# endif
||#endif
if (IR_REG_SPILLED(ctx->regs[def][0])) {
Expand Down
Loading
Loading