diff --git a/apps/debug_menu.c b/apps/debug_menu.c index a11cff9350..61698f5025 100644 --- a/apps/debug_menu.c +++ b/apps/debug_menu.c @@ -151,25 +151,21 @@ static const char* threads_getname(int selected_item, void *data, selected_item -= NUM_CORES; #endif + const char *fmtstr = "%2d: ---"; + struct thread_debug_info threadinfo; - if (thread_get_debug_info(selected_item, &threadinfo) <= 0) + if (thread_get_debug_info(selected_item, &threadinfo) > 0) { - snprintf(buffer, buffer_len, "%2d: ---", selected_item); - return buffer; + fmtstr = "%2d:" IF_COP(" (%d)") " %s" IF_PRIO(" %d %d") + IFN_SDL(" %2d%%") " %s"; } - snprintf(buffer, buffer_len, - "%2d: " IF_COP("(%d) ") "%s " IF_PRIO("%d %d ") "%2d%% %s", + snprintf(buffer, buffer_len, fmtstr, selected_item, -#if NUM_CORES > 1 - threadinfo.core, -#endif + IF_COP(threadinfo.core,) threadinfo.statusstr, -#ifdef HAVE_PRIORITY_SCHEDULING - threadinfo.base_priority, - threadinfo.current_priority, -#endif - threadinfo.stack_usage, + IF_PRIO(threadinfo.base_priority, threadinfo.current_priority,) + IFN_SDL(threadinfo.stack_usage,) threadinfo.name); return buffer; @@ -187,16 +183,9 @@ static bool dbg_os(void) { struct simplelist_info info; simplelist_info_init(&info, IF_COP("Core and ") "Stack usage:", -#if NUM_CORES == 1 - MAXTHREADS, -#else - MAXTHREADS+NUM_CORES, -#endif - NULL); -#ifndef ROCKBOX_HAS_LOGF + MAXTHREADS IF_COP( + NUM_CORES ), NULL); info.hide_selection = true; info.scroll_all = true; -#endif info.action_callback = dbg_threads_action_callback; info.get_name = threads_getname; return simplelist_show_list(&info); diff --git a/firmware/asm/m68k/thread.c b/firmware/asm/m68k/thread.c index 7df89001d7..de07b29729 100644 --- a/firmware/asm/m68k/thread.c +++ b/firmware/asm/m68k/thread.c @@ -86,6 +86,8 @@ static inline void load_context(const void* addr) ); } + +#ifdef RB_PROFILE /*--------------------------------------------------------------------------- * Call this from asm to make sure the sp is pointing to the * correct place before the context is saved. @@ -99,3 +101,6 @@ static inline void _profile_thread_stopped(int current_thread) :: [id] "r" (current_thread) : "cc", "memory"); } + +#define profile_thread_stopped _profile_thread_stopped +#endif /* RB_PROFILE */ diff --git a/firmware/export/system.h b/firmware/export/system.h index 0a13ec2208..5064fcd91d 100644 --- a/firmware/export/system.h +++ b/firmware/export/system.h @@ -118,15 +118,17 @@ int get_cpu_boost_counter(void); #define ALIGN_UP(n, a) ALIGN_DOWN((n)+((a)-1),a) /* align start and end of buffer to nearest integer multiple of a */ -#define ALIGN_BUFFER(ptr,len,align) \ -{\ - uintptr_t tmp_ptr1 = (uintptr_t)ptr; \ - uintptr_t tmp_ptr2 = tmp_ptr1 + len;\ - tmp_ptr1 = ALIGN_UP(tmp_ptr1,align); \ - tmp_ptr2 = ALIGN_DOWN(tmp_ptr2,align); \ - len = tmp_ptr2 - tmp_ptr1; \ - ptr = (typeof(ptr))tmp_ptr1; \ -} +#define ALIGN_BUFFER(ptr, size, align) \ +({ \ + size_t __sz = (size); \ + size_t __ali = (align); \ + uintptr_t __a1 = (uintptr_t)(ptr); \ + uintptr_t __a2 = __a1 + __sz; \ + __a1 = ALIGN_UP(__a1, __ali); \ + __a2 = ALIGN_DOWN(__a2, __ali); \ + (ptr) = (typeof (ptr))__a1; \ + (size) = __a2 > __a1 ? 
__a2 - __a1 : 0; \ +}) #define PTR_ADD(ptr, x) ((typeof(ptr))((char*)(ptr) + (x))) #define PTR_SUB(ptr, x) ((typeof(ptr))((char*)(ptr) - (x))) @@ -150,11 +152,16 @@ int get_cpu_boost_counter(void); #endif /* Get the byte offset of a type's member */ -#define OFFSETOF(type, membername) ((off_t)&((type *)0)->membername) +#ifndef offsetof +#define offsetof(type, member) __builtin_offsetof(type, member) +#endif -/* Get the type pointer from one of its members */ -#define TYPE_FROM_MEMBER(type, memberptr, membername) \ - ((type *)((intptr_t)(memberptr) - OFFSETOF(type, membername))) +/* Get the containing item of *ptr in type */ +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + const typeof (((type *)0)->member) *__mptr = (ptr); \ + (type *)((void *)(__mptr) - offsetof(type, member)); }) +#endif /* returns index of first set bit or 32 if no bits are set */ #if defined(CPU_ARM) && ARM_ARCH >= 5 && !defined(__thumb__) @@ -324,6 +331,11 @@ static inline uint32_t swaw32_hw(uint32_t value) * for all ARM CPUs. */ #ifdef CPU_ARM #define HAVE_CPU_CACHE_ALIGN + #define MIN_STACK_ALIGN 8 +#endif + +#ifndef MIN_STACK_ALIGN +#define MIN_STACK_ALIGN (sizeof (uintptr_t)) #endif /* Calculate CACHEALIGN_SIZE from CACHEALIGN_BITS */ diff --git a/firmware/kernel/include/mrsw_lock.h b/firmware/kernel/include/mrsw_lock.h index d919f7be26..7511f87e93 100644 --- a/firmware/kernel/include/mrsw_lock.h +++ b/firmware/kernel/include/mrsw_lock.h @@ -39,10 +39,9 @@ */ struct mrsw_lock { - int volatile count; /* rd/wr counter; >0 = reader(s), <0 = writer */ - struct thread_entry *queue; - struct blocker_splay splay; /* priority inheritance info - for waiters */ + int volatile count; /* counter; >0 = reader(s), <0 = writer */ + struct __wait_queue queue; /* waiter list */ + struct blocker_splay splay; /* priority inheritance/owner info */ uint8_t rdrecursion[MAXTHREADS]; /* per-thread reader recursion counts */ IF_COP( struct corelock cl; ) }; diff --git a/firmware/kernel/include/mutex.h b/firmware/kernel/include/mutex.h index 72736ec8fd..b74bfe23f5 100644 --- a/firmware/kernel/include/mutex.h +++ b/firmware/kernel/include/mutex.h @@ -26,13 +26,13 @@ struct mutex { - struct thread_entry *queue; /* waiter list */ - int recursion; /* lock owner recursion count */ - struct blocker blocker; /* priority inheritance info - for waiters and owner*/ - IF_COP( struct corelock cl; ) /* multiprocessor sync */ + struct __wait_queue queue; /* waiter list */ + int recursion; /* lock owner recursion count */ + struct blocker blocker; /* priority inheritance info + for waiters and owner*/ + IF_COP( struct corelock cl; ) /* multiprocessor sync */ #ifdef HAVE_PRIORITY_SCHEDULING - bool no_preempt; + bool no_preempt; #endif }; diff --git a/firmware/kernel/include/queue.h b/firmware/kernel/include/queue.h index 3f24598d5b..afee4c90ff 100644 --- a/firmware/kernel/include/queue.h +++ b/firmware/kernel/include/queue.h @@ -88,7 +88,7 @@ struct queue_sender_list /* If non-NULL, there is a thread waiting for the corresponding event */ /* Must be statically allocated to put in non-cached ram. 
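The container_of/offsetof pair introduced in system.h above replaces OFFSETOF/TYPE_FROM_MEMBER and is what the new list handling builds on (see THREAD_FROM further down). A minimal sketch of the idiom, assuming GCC's typeof and statement-expression extensions just as the macro itself does; the struct names here are illustrative only:

    #include "system.h"   /* for container_of(), as added above */

    struct node { struct node *next; };            /* embedded list link */
    struct item { int value; struct node link; };  /* enclosing object */

    static struct item *item_from_node(struct node *n)
    {
        /* recover the struct item that contains the embedded node 'n',
           the same pattern THREAD_FROM() uses for struct thread_entry */
        return container_of(n, struct item, link);
    }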
*/ struct thread_entry *senders[QUEUE_LENGTH]; /* message->thread map */ - struct thread_entry *list; /* list of senders in map */ + struct __wait_queue list; /* list of senders in map */ /* Send info for last message dequeued or NULL if replied or not sent */ struct thread_entry * volatile curr_sender; #ifdef HAVE_PRIORITY_SCHEDULING @@ -108,7 +108,7 @@ struct queue_sender_list struct event_queue { - struct thread_entry *queue; /* waiter list */ + struct __wait_queue queue; /* waiter list */ struct queue_event events[QUEUE_LENGTH]; /* list of events */ unsigned int volatile read; /* head of queue */ unsigned int volatile write; /* tail of queue */ diff --git a/firmware/kernel/include/semaphore.h b/firmware/kernel/include/semaphore.h index 16095d9c2d..1d604a4e76 100644 --- a/firmware/kernel/include/semaphore.h +++ b/firmware/kernel/include/semaphore.h @@ -26,10 +26,10 @@ struct semaphore { - struct thread_entry *queue; /* Waiter list */ - int volatile count; /* # of waits remaining before unsignaled */ - int max; /* maximum # of waits to remain signaled */ - IF_COP( struct corelock cl; ) /* multiprocessor sync */ + struct __wait_queue queue; /* Waiter list */ + int volatile count; /* # of waits remaining before unsignaled */ + int max; /* maximum # of waits to remain signaled */ + IF_COP( struct corelock cl; ) /* multiprocessor sync */ }; extern void semaphore_init(struct semaphore *s, int max, int start); diff --git a/firmware/kernel/include/thread.h b/firmware/kernel/include/thread.h index 5a8bff0107..dfb632785e 100644 --- a/firmware/kernel/include/thread.h +++ b/firmware/kernel/include/thread.h @@ -26,6 +26,7 @@ #include #include "config.h" #include "gcc_extensions.h" +#include "linked_list.h" #include "bitarray.h" #include "corelock.h" @@ -52,7 +53,7 @@ #define PRIORITY_REALTIME_4 4 #define PRIORITY_REALTIME 4 /* Lowest realtime range */ #define PRIORITY_BUFFERING 15 /* Codec buffering thread */ -#define PRIORITY_USER_INTERFACE 16 /* The main thread */ +#define PRIORITY_USER_INTERFACE 16 /* For most UI thrads */ #define PRIORITY_RECORDING 16 /* Recording thread */ #define PRIORITY_PLAYBACK 16 /* Variable between this and MAX */ #define PRIORITY_PLAYBACK_MAX 5 /* Maximum allowable playback priority */ @@ -61,6 +62,7 @@ #define NUM_PRIORITIES 32 #define PRIORITY_IDLE 32 /* Priority representative of no tasks */ +#define PRIORITY_MAIN_THREAD PRIORITY_USER_INTERFACE #define IO_PRIORITY_IMMEDIATE 0 #define IO_PRIORITY_BACKGROUND 32 @@ -108,6 +110,9 @@ extern unsigned sleep(unsigned ticks); #define IFN_PRIO(...) __VA_ARGS__ #endif +#define __wait_queue lld_head +#define __wait_queue_node lld_node + /* Basic structure describing the owner of an object */ struct blocker { @@ -168,6 +173,7 @@ int thread_get_priority(unsigned int thread_id); void thread_set_io_priority(unsigned int thread_id, int io_priority); int thread_get_io_priority(unsigned int thread_id); #endif /* HAVE_IO_PRIORITY */ + #if NUM_CORES > 1 unsigned int switch_core(unsigned int new_core); #endif @@ -186,11 +192,21 @@ int core_get_debug_info(unsigned int core, struct core_debug_info *infop); #endif /* NUM_CORES */ +#ifdef HAVE_SDL_THREADS +#define IF_SDL(x...) x +#define IFN_SDL(x...) +#else +#define IF_SDL(x...) +#define IFN_SDL(x...) 
x +#endif + struct thread_debug_info { char statusstr[4]; char name[32]; +#ifndef HAVE_SDL_THREADS unsigned int stack_usage; +#endif #if NUM_CORES > 1 unsigned int core; #endif diff --git a/firmware/kernel/mrsw_lock.c b/firmware/kernel/mrsw_lock.c index 45c8801b74..b683f63d5f 100644 --- a/firmware/kernel/mrsw_lock.c +++ b/firmware/kernel/mrsw_lock.c @@ -19,7 +19,8 @@ * ****************************************************************************/ #include "kernel-internal.h" -#include "mrsw-lock.h" +#include +#include "mrsw_lock.h" #ifdef HAVE_PRIORITY_SCHEDULING @@ -34,13 +35,14 @@ mrsw_reader_claim(struct mrsw_lock *mrsw, struct thread_entry *current, static FORCE_INLINE void mrsw_reader_relinquish(struct mrsw_lock *mrsw, struct thread_entry *current, - int count, unsigned int slotnum) + struct thread_entry *first, int count, + unsigned int slotnum) { /* If no writer is queued or has ownership then noone is queued; if a writer owns it, then the reader would be blocked instead. Therefore, if the queue has threads, then the next after the owning readers is a writer and this is not the last reader. */ - if (mrsw->queue) + if (first) corelock_lock(&mrsw->splay.cl); threadbit_clear_bit(&mrsw->splay.mask, slotnum); @@ -61,10 +63,10 @@ mrsw_reader_relinquish(struct mrsw_lock *mrsw, struct thread_entry *current, threadbit_popcount(&mrsw->splay.mask)); /* switch owner to sole remaining reader */ slotnum = threadbit_ffs(&mrsw->splay.mask); - mrsw->splay.blocker.thread = thread_id_entry(slotnum); + mrsw->splay.blocker.thread = __thread_slot_entry(slotnum); } - if (mrsw->queue) + if (first) { priority_disinherit(current, &mrsw->splay.blocker); corelock_unlock(&mrsw->splay.cl); @@ -72,23 +74,25 @@ mrsw_reader_relinquish(struct mrsw_lock *mrsw, struct thread_entry *current, } static FORCE_INLINE unsigned int -mrsw_reader_wakeup_writer(struct mrsw_lock *mrsw, unsigned int slotnum) +mrsw_reader_wakeup_writer(struct mrsw_lock *mrsw, struct thread_entry *thread, + unsigned int slotnum) { threadbit_clear_bit(&mrsw->splay.mask, slotnum); - return wakeup_thread(&mrsw->queue, WAKEUP_TRANSFER); + return wakeup_thread(thread, WAKEUP_TRANSFER); } static FORCE_INLINE unsigned int -mrsw_writer_wakeup_writer(struct mrsw_lock *mrsw) +mrsw_writer_wakeup_writer(struct mrsw_lock *mrsw, struct thread_entry *thread) { - return wakeup_thread(&mrsw->queue, WAKEUP_TRANSFER); + return wakeup_thread(thread, WAKEUP_TRANSFER); + (void)mrsw; } static FORCE_INLINE unsigned int -mrsw_writer_wakeup_readers(struct mrsw_lock *mrsw) +mrsw_writer_wakeup_readers(struct mrsw_lock *mrsw, struct thread_entry *first) { - unsigned int result = wakeup_thread(&mrsw->queue, WAKEUP_TRANSFER_MULTI); - mrsw->count = thread_self_entry()->retval; + unsigned int result = wakeup_thread(first, WAKEUP_TRANSFER_MULTI); + mrsw->count = __running_self_entry()->retval; return result; } @@ -97,32 +101,36 @@ mrsw_writer_wakeup_readers(struct mrsw_lock *mrsw) #define mrsw_reader_claim(mrsw, current, count, slotnum) \ do {} while (0) -#define mrsw_reader_relinquish(mrsw, current, count, slotnum) \ +#define mrsw_reader_relinquish(mrsw, current, first, count, slotnum) \ do {} while (0) static FORCE_INLINE unsigned int -mrsw_reader_wakeup_writer(struct mrsw_lock *mrsw) +mrsw_reader_wakeup_writer(struct mrsw_lock *mrsw, struct thread_entry *thread) { - mrsw->splay.blocker.thread = mrsw->queue; - return wakeup_thread(&mrsw->queue); + mrsw->splay.blocker.thread = thread; + return wakeup_thread(thread); } static FORCE_INLINE unsigned int 
-mrsw_writer_wakeup_writer(struct mrsw_lock *mrsw) +mrsw_writer_wakeup_writer(struct mrsw_lock *mrsw, struct thread_entry *thread) { - mrsw->splay.blocker.thread = mrsw->queue; - return wakeup_thread(&mrsw->queue); + mrsw->splay.blocker.thread = thread; + return wakeup_thread(thread); } static FORCE_INLINE unsigned int -mrsw_writer_wakeup_readers(struct mrsw_lock *mrsw) +mrsw_writer_wakeup_readers(struct mrsw_lock *mrsw, struct thread_entry *first) { mrsw->splay.blocker.thread = NULL; - int count = 0; + int count = 1; - while (mrsw->queue && mrsw->queue->retval != 0) + while (1) { - wakeup_thread(&mrsw->queue); + wakeup_thread(first); + + if (!(first = WQ_THREAD_FIRST(&mrsw->queue)) || first->retval == 0) + break; + count++; } @@ -138,14 +146,11 @@ mrsw_writer_wakeup_readers(struct mrsw_lock *mrsw) void mrsw_init(struct mrsw_lock *mrsw) { mrsw->count = 0; - mrsw->queue = NULL; - mrsw->splay.blocker.thread = NULL; + wait_queue_init(&mrsw->queue); + blocker_splay_init(&mrsw->splay); #ifdef HAVE_PRIORITY_SCHEDULING - mrsw->splay.blocker.priority = PRIORITY_IDLE; - threadbit_clear(&mrsw->splay.mask); - corelock_init(&mrsw->splay.cl); memset(mrsw->rdrecursion, 0, sizeof (mrsw->rdrecursion)); -#endif /* HAVE_PRIORITY_SCHEDULING */ +#endif corelock_init(&mrsw->cl); } @@ -154,7 +159,7 @@ void mrsw_init(struct mrsw_lock *mrsw) * access recursively. The current writer is ignored and gets access. */ void mrsw_read_acquire(struct mrsw_lock *mrsw) { - struct thread_entry *current = thread_self_entry(); + struct thread_entry *current = __running_self_entry(); if (current == mrsw->splay.blocker.thread IF_PRIO( && mrsw->count < 0 )) return; /* Read request while holding write access; pass */ @@ -178,7 +183,7 @@ void mrsw_read_acquire(struct mrsw_lock *mrsw) int count = mrsw->count; - if (LIKELY(count >= 0 && !mrsw->queue)) + if (LIKELY(count >= 0 && mrsw->queue.head == NULL)) { /* Lock open to readers: IFN_PRIO, mrsw->count tracks reader recursion */ @@ -189,13 +194,10 @@ void mrsw_read_acquire(struct mrsw_lock *mrsw) } /* A writer owns it or is waiting; block... */ - IF_COP( current->obj_cl = &mrsw->cl; ) - IF_PRIO( current->blocker = &mrsw->splay.blocker; ) - current->bqp = &mrsw->queue; current->retval = 1; /* indicate multi-wake candidate */ disable_irq(); - block_thread(current, TIMEOUT_BLOCK); + block_thread(current, TIMEOUT_BLOCK, &mrsw->queue, &mrsw->splay.blocker); corelock_unlock(&mrsw->cl); @@ -207,7 +209,7 @@ void mrsw_read_acquire(struct mrsw_lock *mrsw) * leave opens up access to writer threads. The current writer is ignored. 
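For context on the mrsw_lock changes above and below: the lock's intended semantics are unchanged, any number of readers may hold it concurrently, a writer gets exclusive ownership, and both sides may recurse. A short usage sketch; the shared table and the helper functions are hypothetical:

    static struct mrsw_lock table_lock;   /* protects a hypothetical shared table */

    void table_module_init(void)
    {
        mrsw_init(&table_lock);
    }

    int table_lookup(int key)             /* many readers may hold this at once */
    {
        mrsw_read_acquire(&table_lock);
        int val = do_lookup(key);         /* hypothetical helper */
        mrsw_read_release(&table_lock);
        return val;
    }

    void table_update(int key, int val)   /* a writer gets exclusive access */
    {
        mrsw_write_acquire(&table_lock);
        do_update(key, val);              /* hypothetical helper */
        mrsw_write_release(&table_lock);
    }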
*/ void mrsw_read_release(struct mrsw_lock *mrsw) { - struct thread_entry *current = thread_self_entry(); + struct thread_entry *current = __running_self_entry(); if (current == mrsw->splay.blocker.thread IF_PRIO( && mrsw->count < 0 )) return; /* Read release while holding write access; ignore */ @@ -237,17 +239,18 @@ void mrsw_read_release(struct mrsw_lock *mrsw) unsigned int result = THREAD_NONE; const int oldlevel = disable_irq_save(); - if (--count == 0 && mrsw->queue) + struct thread_entry *thread = WQ_THREAD_FIRST(&mrsw->queue); + if (--count == 0 && thread != NULL) { /* No readers remain and a writer is waiting */ mrsw->count = -1; - result = mrsw_reader_wakeup_writer(mrsw IF_PRIO(, slotnum)); + result = mrsw_reader_wakeup_writer(mrsw, thread IF_PRIO(, slotnum)); } else { /* Giving up readership; we may be the last, or not */ mrsw->count = count; - mrsw_reader_relinquish(mrsw, current, count, slotnum); + mrsw_reader_relinquish(mrsw, current, thread, count, slotnum); } restore_irq(oldlevel); @@ -265,7 +268,7 @@ void mrsw_read_release(struct mrsw_lock *mrsw) * safely call recursively. */ void mrsw_write_acquire(struct mrsw_lock *mrsw) { - struct thread_entry *current = thread_self_entry(); + struct thread_entry *current = __running_self_entry(); if (current == mrsw->splay.blocker.thread) { @@ -288,13 +291,10 @@ void mrsw_write_acquire(struct mrsw_lock *mrsw) } /* Readers present or a writer owns it - block... */ - IF_COP( current->obj_cl = &mrsw->cl; ) - IF_PRIO( current->blocker = &mrsw->splay.blocker; ) - current->bqp = &mrsw->queue; current->retval = 0; /* indicate single-wake candidate */ disable_irq(); - block_thread(current, TIMEOUT_BLOCK); + block_thread(current, TIMEOUT_BLOCK, &mrsw->queue, &mrsw->splay.blocker); corelock_unlock(&mrsw->cl); @@ -305,9 +305,9 @@ void mrsw_write_acquire(struct mrsw_lock *mrsw) /* Release writer thread lock and open the lock to readers and writers */ void mrsw_write_release(struct mrsw_lock *mrsw) { - KERNEL_ASSERT(thread_self_entry() == mrsw->splay.blocker.thread, + KERNEL_ASSERT(__running_self_entry() == mrsw->splay.blocker.thread, "mrsw_write_release->wrong thread (%s != %s)\n", - thread_self_entry()->name, + __running_self_entry()->name, mrsw->splay.blocker.thread->name); int count = mrsw->count; @@ -323,15 +323,16 @@ void mrsw_write_release(struct mrsw_lock *mrsw) corelock_lock(&mrsw->cl); const int oldlevel = disable_irq_save(); - if (mrsw->queue == NULL) /* 'count' becomes zero */ + struct thread_entry *thread = WQ_THREAD_FIRST(&mrsw->queue); + if (thread == NULL) /* 'count' becomes zero */ { mrsw->splay.blocker.thread = NULL; mrsw->count = 0; } - else if (mrsw->queue->retval == 0) /* 'count' stays -1 */ - result = mrsw_writer_wakeup_writer(mrsw); - else /* 'count' becomes # of readers */ - result = mrsw_writer_wakeup_readers(mrsw); + else if (thread->retval == 0) /* 'count' stays -1 */ + result = mrsw_writer_wakeup_writer(mrsw, thread); + else /* 'count' becomes # of readers */ + result = mrsw_writer_wakeup_readers(mrsw, thread); restore_irq(oldlevel); corelock_unlock(&mrsw->cl); diff --git a/firmware/kernel/mutex.c b/firmware/kernel/mutex.c index e5729dc893..fc49cc6d09 100644 --- a/firmware/kernel/mutex.c +++ b/firmware/kernel/mutex.c @@ -30,20 +30,19 @@ * the object is available to other threads */ void mutex_init(struct mutex *m) { - corelock_init(&m->cl); - m->queue = NULL; + wait_queue_init(&m->queue); m->recursion = 0; - m->blocker.thread = NULL; + blocker_init(&m->blocker); #ifdef HAVE_PRIORITY_SCHEDULING - m->blocker.priority = 
PRIORITY_IDLE; m->no_preempt = false; #endif + corelock_init(&m->cl); } /* Gain ownership of a mutex object or block until it becomes free */ void mutex_lock(struct mutex *m) { - struct thread_entry *current = thread_self_entry(); + struct thread_entry *current = __running_self_entry(); if(current == m->blocker.thread) { @@ -65,12 +64,8 @@ void mutex_lock(struct mutex *m) } /* block until the lock is open... */ - IF_COP( current->obj_cl = &m->cl; ) - IF_PRIO( current->blocker = &m->blocker; ) - current->bqp = &m->queue; - disable_irq(); - block_thread(current, TIMEOUT_BLOCK); + block_thread(current, TIMEOUT_BLOCK, &m->queue, &m->blocker); corelock_unlock(&m->cl); @@ -82,10 +77,10 @@ void mutex_lock(struct mutex *m) void mutex_unlock(struct mutex *m) { /* unlocker not being the owner is an unlocking violation */ - KERNEL_ASSERT(m->blocker.thread == thread_self_entry(), + KERNEL_ASSERT(m->blocker.thread == __running_self_entry(), "mutex_unlock->wrong thread (%s != %s)\n", m->blocker.thread->name, - thread_self_entry()->name); + __running_self_entry()->name); if(m->recursion > 0) { @@ -98,7 +93,8 @@ void mutex_unlock(struct mutex *m) corelock_lock(&m->cl); /* transfer to next queued thread if any */ - if(LIKELY(m->queue == NULL)) + struct thread_entry *thread = WQ_THREAD_FIRST(&m->queue); + if(LIKELY(thread == NULL)) { /* no threads waiting - open the lock */ m->blocker.thread = NULL; @@ -107,11 +103,7 @@ void mutex_unlock(struct mutex *m) } const int oldlevel = disable_irq_save(); - /* Tranfer of owning thread is handled in the wakeup protocol - * if priorities are enabled otherwise just set it from the - * queue head. */ - IFN_PRIO( m->blocker.thread = m->queue; ) - unsigned int result = wakeup_thread(&m->queue, WAKEUP_TRANSFER); + unsigned int result = wakeup_thread(thread, WAKEUP_TRANSFER); restore_irq(oldlevel); corelock_unlock(&m->cl); diff --git a/firmware/kernel/pthread/thread.c b/firmware/kernel/pthread/thread.c index 354a946698..71cbd1d136 100644 --- a/firmware/kernel/pthread/thread.c +++ b/firmware/kernel/pthread/thread.c @@ -3,8 +3,8 @@ #include #include #include "/usr/include/semaphore.h" +#include "thread-internal.h" #include "kernel.h" -#include "thread.h" #define NSEC_PER_SEC 1000000000L static inline void timespec_add_ns(struct timespec *a, uint64_t ns) @@ -25,11 +25,6 @@ struct thread_init_data { __thread struct thread_entry *_current; -struct thread_entry* thread_self_entry(void) -{ - return _current; -} - unsigned int thread_self(void) { return (unsigned) pthread_self(); @@ -70,12 +65,10 @@ static void *trampoline(void *arg) if (data->start_frozen) { struct corelock thaw_lock; - struct thread_entry *queue = NULL; corelock_init(&thaw_lock); corelock_lock(&thaw_lock); _current->lock = &thaw_lock; - _current->bqp = &queue; sem_post(&data->init_sem); block_thread_switch(_current, _current->lock); _current->lock = NULL; @@ -97,7 +90,7 @@ void thread_thaw(unsigned int thread_id) if (e->lock) { corelock_lock(e->lock); - wakeup_thread(e->bqp); + wakeup_thread(e); corelock_unlock(e->lock); } /* else: no lock. 
must be running already */ @@ -135,7 +128,7 @@ unsigned int create_thread(void (*function)(void), data->entry = entry; pthread_cond_init(&entry->cond, NULL); entry->runnable = true; - entry->l = (struct thread_list) { NULL, NULL }; + sem_init(&data->init_sem, 0, 0); if (pthread_create(&retval, NULL, trampoline, data) < 0) @@ -153,58 +146,19 @@ unsigned int create_thread(void (*function)(void), return retval; } -static void add_to_list_l(struct thread_entry **list, - struct thread_entry *thread) -{ - if (*list == NULL) - { - /* Insert into unoccupied list */ - thread->l.next = thread; - thread->l.prev = thread; - *list = thread; - } - else - { - /* Insert last */ - thread->l.next = *list; - thread->l.prev = (*list)->l.prev; - thread->l.prev->l.next = thread; - (*list)->l.prev = thread; - } -} - -static void remove_from_list_l(struct thread_entry **list, - struct thread_entry *thread) -{ - if (thread == thread->l.next) - { - /* The only item */ - *list = NULL; - return; - } - - if (thread == *list) - { - /* List becomes next item */ - *list = thread->l.next; - } - - /* Fix links to jump over the removed entry. */ - thread->l.prev->l.next = thread->l.next; - thread->l.next->l.prev = thread->l.prev; -} - /* for block_thread(), _w_tmp() and wakeup_thread() t->lock must point * to a corelock instance, and this corelock must be held by the caller */ void block_thread_switch(struct thread_entry *t, struct corelock *cl) { t->runnable = false; - add_to_list_l(t->bqp, t); + if (wait_queue_ptr(t)) + wait_queue_register(t); while(!t->runnable) pthread_cond_wait(&t->cond, &cl->mutex); } -void block_thread_switch_w_tmo(struct thread_entry *t, int timeout, struct corelock *cl) +void block_thread_switch_w_tmo(struct thread_entry *t, int timeout, + struct corelock *cl) { int err = 0; struct timespec ts; @@ -213,30 +167,25 @@ void block_thread_switch_w_tmo(struct thread_entry *t, int timeout, struct corel timespec_add_ns(&ts, timeout * (NSEC_PER_SEC/HZ)); t->runnable = false; - add_to_list_l(t->bqp, t); + wait_queue_register(t->wqp, t); while(!t->runnable && !err) err = pthread_cond_timedwait(&t->cond, &cl->mutex, &ts); if (err == ETIMEDOUT) { /* the thread timed out and was not explicitely woken up. * we need to do this now to mark it runnable again */ - remove_from_list_l(t->bqp, t); t->runnable = true; - if (t->wakeup_ext_cb) - t->wakeup_ext_cb(t); + /* NOTE: objects do their own removal upon timer expiration */ } } -unsigned int wakeup_thread(struct thread_entry **list) +unsigned int wakeup_thread(struct thread_entry *t) { - struct thread_entry *t = *list; - if (t) - { - remove_from_list_l(list, t); - t->runnable = true; - pthread_cond_signal(&t->cond); - } - return THREAD_NONE; + if (t->wqp) + wait_queue_remove(t); + t->runnable = true; + pthread_cond_signal(&t->cond); + return THREAD_OK; } diff --git a/firmware/kernel/queue.c b/firmware/kernel/queue.c index 0ba7d7e00b..927e55274c 100644 --- a/firmware/kernel/queue.c +++ b/firmware/kernel/queue.c @@ -51,7 +51,7 @@ static struct * q->events[]: | XX | E1 | E2 | E3 | E4 | XX | * q->send->senders[]: | NULL | T1 | T2 | NULL | T3 | NULL | * \/ \/ \/ - * q->send->list: >->|T0|<->|T1|<->|T2|<-------->|T3|<-< + * q->send->list: 0<-|T0|<->|T1|<->|T2|<-------->|T3|->0 * q->send->curr_sender: /\ * * Thread has E0 in its own struct queue_event. @@ -65,20 +65,20 @@ static struct * more efficent to reject the majority of cases that don't need this * called. 
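queue_release_sender(), reworked just below, is what completes the synchronous send/reply handshake. Seen from the two threads involved, the exchange looks roughly like this, assuming queue_enable_queue_send() has been set up on the queue; the event id, payload and handler are made up for the example:

    /* client thread: blocks until the queue owner replies */
    intptr_t status = queue_send(&owner_queue, Q_MY_REQUEST, (intptr_t)&request);

    /* owning thread: */
    struct queue_event ev;
    queue_wait(&owner_queue, &ev);
    if (ev.id == Q_MY_REQUEST)
    {
        intptr_t result = handle_request((void *)ev.data);  /* hypothetical */
        queue_reply(&owner_queue, result);   /* wakes the blocked client */
    }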
*/ -static void queue_release_sender(struct thread_entry * volatile * sender, - intptr_t retval) +static void queue_release_sender_inner( + struct thread_entry * volatile * sender, intptr_t retval) { struct thread_entry *thread = *sender; - *sender = NULL; /* Clear slot. */ -#ifdef HAVE_WAKEUP_EXT_CB - thread->wakeup_ext_cb = NULL; /* Clear callback. */ -#endif thread->retval = retval; /* Assign thread-local return value. */ - *thread->bqp = thread; /* Move blocking queue head to thread since - wakeup_thread wakes the first thread in - the list. */ - wakeup_thread(thread->bqp, WAKEUP_RELEASE); + wakeup_thread(thread, WAKEUP_RELEASE); +} + +static inline void queue_release_sender( + struct thread_entry * volatile * sender, intptr_t retval) +{ + if(UNLIKELY(*sender)) + queue_release_sender_inner(sender, retval); } /* Releases any waiting threads that are queued with queue_send - @@ -93,26 +93,11 @@ static void queue_release_all_senders(struct event_queue *q) { struct thread_entry **spp = &q->send->senders[i & QUEUE_LENGTH_MASK]; - - if(*spp) - { - queue_release_sender(spp, 0); - } + queue_release_sender(spp, 0); } } } -#ifdef HAVE_WAKEUP_EXT_CB -/* Callback to do extra forced removal steps from sender list in addition - * to the normal blocking queue removal and priority dis-inherit */ -static void queue_remove_sender_thread_cb(struct thread_entry *thread) -{ - *((struct thread_entry **)thread->retval) = NULL; - thread->wakeup_ext_cb = NULL; - thread->retval = 0; -} -#endif /* HAVE_WAKEUP_EXT_CB */ - /* Enables queue_send on the specified queue - caller allocates the extra * data structure. Only queues which are taken to be owned by a thread should * enable this however an official owner is not compulsory but must be @@ -132,11 +117,12 @@ void queue_enable_queue_send(struct event_queue *q, if(send != NULL && q->send == NULL) { memset(send, 0, sizeof(*send)); + wait_queue_init(&send->list); #ifdef HAVE_PRIORITY_SCHEDULING - send->blocker.priority = PRIORITY_IDLE; + blocker_init(&send->blocker); if(owner_id != 0) { - send->blocker.thread = thread_id_entry(owner_id); + send->blocker.thread = __thread_id_entry(owner_id); q->blocker_p = &send->blocker; } #endif @@ -154,24 +140,14 @@ static inline void queue_do_unblock_sender(struct queue_sender_list *send, unsigned int i) { if(send) - { - struct thread_entry **spp = &send->senders[i]; - - if(UNLIKELY(*spp)) - { - queue_release_sender(spp, 0); - } - } + queue_release_sender(&send->senders[i], 0); } /* Perform the auto-reply sequence */ static inline void queue_do_auto_reply(struct queue_sender_list *send) { - if(send && send->curr_sender) - { - /* auto-reply */ + if(send) queue_release_sender(&send->curr_sender, 0); - } } /* Moves waiting thread's refrence from the senders array to the @@ -191,7 +167,6 @@ static inline void queue_do_fetch_sender(struct queue_sender_list *send, /* Move thread reference from array to the next thread that queue_reply will release */ send->curr_sender = *spp; - (*spp)->retval = (intptr_t)spp; *spp = NULL; } /* else message was posted asynchronously with queue_post */ @@ -205,18 +180,28 @@ static inline void queue_do_fetch_sender(struct queue_sender_list *send, #define queue_do_fetch_sender(send, rd) #endif /* HAVE_EXTENDED_MESSAGING_AND_NAME */ +static void queue_wake_waiter_inner(struct thread_entry *thread) +{ + wakeup_thread(thread, WAKEUP_DEFAULT); +} + +static inline void queue_wake_waiter(struct event_queue *q) +{ + struct thread_entry *thread = WQ_THREAD_FIRST(&q->queue); + if(thread != NULL) + 
queue_wake_waiter_inner(thread); +} + /* Queue must not be available for use during this call */ void queue_init(struct event_queue *q, bool register_queue) { int oldlevel = disable_irq_save(); if(register_queue) - { corelock_lock(&all_queues.cl); - } corelock_init(&q->cl); - q->queue = NULL; + wait_queue_init(&q->queue); /* What garbage is in write is irrelevant because of the masking design- * any other functions the empty the queue do this as well so that * queue_count and queue_empty return sane values in the case of a @@ -261,7 +246,7 @@ void queue_delete(struct event_queue *q) corelock_unlock(&all_queues.cl); /* Release thread(s) waiting on queue head */ - thread_queue_wake(&q->queue); + wait_queue_wake(&q->queue); #ifdef HAVE_EXTENDED_MESSAGING_AND_NAME if(q->send) @@ -293,7 +278,7 @@ void queue_wait(struct event_queue *q, struct queue_event *ev) #ifdef HAVE_PRIORITY_SCHEDULING KERNEL_ASSERT(QUEUE_GET_THREAD(q) == NULL || - QUEUE_GET_THREAD(q) == thread_self_entry(), + QUEUE_GET_THREAD(q) == __running_self_entry(), "queue_wait->wrong thread\n"); #endif @@ -307,18 +292,12 @@ void queue_wait(struct event_queue *q, struct queue_event *ev) while(1) { - struct thread_entry *current; - rd = q->read; if (rd != q->write) /* A waking message could disappear */ break; - current = thread_self_entry(); - - IF_COP( current->obj_cl = &q->cl; ) - current->bqp = &q->queue; - - block_thread(current, TIMEOUT_BLOCK); + struct thread_entry *current = __running_self_entry(); + block_thread(current, TIMEOUT_BLOCK, &q->queue, NULL); corelock_unlock(&q->cl); switch_thread(); @@ -349,16 +328,9 @@ void queue_wait_w_tmo(struct event_queue *q, struct queue_event *ev, int ticks) int oldlevel; unsigned int rd, wr; - /* this function works only with a positive number (or zero) of ticks */ - if (ticks == TIMEOUT_BLOCK) - { - queue_wait(q, ev); - return; - } - #ifdef HAVE_EXTENDED_MESSAGING_AND_NAME KERNEL_ASSERT(QUEUE_GET_THREAD(q) == NULL || - QUEUE_GET_THREAD(q) == thread_self_entry(), + QUEUE_GET_THREAD(q) == __running_self_entry(), "queue_wait_w_tmo->wrong thread\n"); #endif @@ -372,14 +344,10 @@ void queue_wait_w_tmo(struct event_queue *q, struct queue_event *ev, int ticks) rd = q->read; wr = q->write; - if (rd == wr && ticks > 0) + if (rd == wr && ticks != 0) { - struct thread_entry *current = thread_self_entry(); - - IF_COP( current->obj_cl = &q->cl; ) - current->bqp = &q->queue; - - block_thread(current, ticks); + struct thread_entry *current = __running_self_entry(); + block_thread(current, ticks, &q->queue, NULL); corelock_unlock(&q->cl); switch_thread(); @@ -389,6 +357,8 @@ void queue_wait_w_tmo(struct event_queue *q, struct queue_event *ev, int ticks) rd = q->read; wr = q->write; + + wait_queue_try_remove(current); } #ifdef HAVE_EXTENDED_MESSAGING_AND_NAME @@ -436,7 +406,7 @@ void queue_post(struct event_queue *q, long id, intptr_t data) queue_do_unblock_sender(q->send, wr); /* Wakeup a waiting thread if any */ - wakeup_thread(&q->queue, WAKEUP_DEFAULT); + queue_wake_waiter(q); corelock_unlock(&q->cl); restore_irq(oldlevel); @@ -465,28 +435,17 @@ intptr_t queue_send(struct event_queue *q, long id, intptr_t data) { struct queue_sender_list *send = q->send; struct thread_entry **spp = &send->senders[wr]; - struct thread_entry *current = thread_self_entry(); + struct thread_entry *current = __running_self_entry(); - if(UNLIKELY(*spp)) - { - /* overflow protect - unblock any thread waiting at this index */ - queue_release_sender(spp, 0); - } + /* overflow protect - unblock any thread waiting at this index 
*/ + queue_release_sender(spp, 0); /* Wakeup a waiting thread if any */ - wakeup_thread(&q->queue, WAKEUP_DEFAULT); + queue_wake_waiter(q); /* Save thread in slot, add to list and wait for reply */ *spp = current; - IF_COP( current->obj_cl = &q->cl; ) - IF_PRIO( current->blocker = q->blocker_p; ) -#ifdef HAVE_WAKEUP_EXT_CB - current->wakeup_ext_cb = queue_remove_sender_thread_cb; -#endif - current->retval = (intptr_t)spp; - current->bqp = &send->list; - - block_thread(current, TIMEOUT_BLOCK); + block_thread(current, TIMEOUT_BLOCK, &send->list, q->blocker_p); corelock_unlock(&q->cl); switch_thread(); @@ -495,7 +454,7 @@ intptr_t queue_send(struct event_queue *q, long id, intptr_t data) } /* Function as queue_post if sending is not enabled */ - wakeup_thread(&q->queue, WAKEUP_DEFAULT); + queue_wake_waiter(q); corelock_unlock(&q->cl); restore_irq(oldlevel); @@ -530,16 +489,12 @@ void queue_reply(struct event_queue *q, intptr_t retval) { if(q->send && q->send->curr_sender) { - struct queue_sender_list *sender; - int oldlevel = disable_irq_save(); corelock_lock(&q->cl); - sender = q->send; - - /* Double-check locking */ - if(LIKELY(sender && sender->curr_sender)) - queue_release_sender(&sender->curr_sender, retval); + struct queue_sender_list *send = q->send; + if(send) + queue_release_sender(&send->curr_sender, retval); corelock_unlock(&q->cl); restore_irq(oldlevel); diff --git a/firmware/kernel/semaphore.c b/firmware/kernel/semaphore.c index 1505038fbc..5e9e46798f 100644 --- a/firmware/kernel/semaphore.c +++ b/firmware/kernel/semaphore.c @@ -24,6 +24,7 @@ /**************************************************************************** * Simple semaphore functions ;) ****************************************************************************/ + /* Initialize the semaphore object. * max = maximum up count the semaphore may assume (max >= 1) * start = initial count of semaphore (0 <= count <= max) */ @@ -31,7 +32,7 @@ void semaphore_init(struct semaphore *s, int max, int start) { KERNEL_ASSERT(max > 0 && start >= 0 && start <= max, "semaphore_init->inv arg\n"); - s->queue = NULL; + wait_queue_init(&s->queue); s->max = max; s->count = start; corelock_init(&s->cl); @@ -42,44 +43,49 @@ void semaphore_init(struct semaphore *s, int max, int start) * safely be used in an ISR. */ int semaphore_wait(struct semaphore *s, int timeout) { - int ret; - int oldlevel; - int count; + int ret = OBJ_WAIT_TIMEDOUT; - oldlevel = disable_irq_save(); + int oldlevel = disable_irq_save(); corelock_lock(&s->cl); - count = s->count; - + int count = s->count; if(LIKELY(count > 0)) { /* count is not zero; down it */ s->count = count - 1; ret = OBJ_WAIT_SUCCEEDED; } - else if(timeout == 0) - { - /* just polling it */ - ret = OBJ_WAIT_TIMEDOUT; - } - else + else if(timeout != 0) { /* too many waits - block until count is upped... 
*/ - struct thread_entry * current = thread_self_entry(); - IF_COP( current->obj_cl = &s->cl; ) - current->bqp = &s->queue; - /* return value will be OBJ_WAIT_SUCCEEDED after wait if wake was - * explicit in semaphore_release */ - current->retval = OBJ_WAIT_TIMEDOUT; + struct thread_entry *current = __running_self_entry(); - block_thread(current, timeout); + block_thread(current, timeout, &s->queue, NULL); corelock_unlock(&s->cl); /* ...and turn control over to next thread */ switch_thread(); - return current->retval; + /* if explicit wake indicated; do no more */ + if(LIKELY(!wait_queue_ptr(current))) + return OBJ_WAIT_SUCCEEDED; + + disable_irq(); + corelock_lock(&s->cl); + + /* see if anyone got us after the expired wait */ + if(wait_queue_try_remove(current)) + { + count = s->count; + if(count > 0) + { + /* down it lately */ + s->count = count - 1; + ret = OBJ_WAIT_SUCCEEDED; + } + } } + /* else just polling it */ corelock_unlock(&s->cl); restore_irq(oldlevel); @@ -93,18 +99,17 @@ int semaphore_wait(struct semaphore *s, int timeout) void semaphore_release(struct semaphore *s) { unsigned int result = THREAD_NONE; - int oldlevel; - oldlevel = disable_irq_save(); + int oldlevel = disable_irq_save(); corelock_lock(&s->cl); - if(LIKELY(s->queue != NULL)) + struct thread_entry *thread = WQ_THREAD_FIRST(&s->queue); + if(LIKELY(thread != NULL)) { /* a thread was queued - wake it up and keep count at 0 */ KERNEL_ASSERT(s->count == 0, "semaphore_release->threads queued but count=%d!\n", s->count); - s->queue->retval = OBJ_WAIT_SUCCEEDED; /* indicate explicit wake */ - result = wakeup_thread(&s->queue, WAKEUP_DEFAULT); + result = wakeup_thread(thread, WAKEUP_DEFAULT); } else { diff --git a/firmware/kernel/thread-common.c b/firmware/kernel/thread-common.c index b8b8ffbd4c..aad6610feb 100644 --- a/firmware/kernel/thread-common.c +++ b/firmware/kernel/thread-common.c @@ -18,39 +18,222 @@ * KIND, either express or implied. * ****************************************************************************/ -#include "thread-internal.h" +#include "kernel-internal.h" #include "system.h" +/* Unless otherwise defined, do nothing */ +#ifndef YIELD_KERNEL_HOOK +#define YIELD_KERNEL_HOOK() false +#endif +#ifndef SLEEP_KERNEL_HOOK +#define SLEEP_KERNEL_HOOK(ticks) false +#endif + +const char __main_thread_name_str[] = "main"; + +/* Array indexing is more efficient in inlines if the elements are a native + word size (100s of bytes fewer instructions) */ + +#if NUM_CORES > 1 +static struct core_entry __core_entries[NUM_CORES] IBSS_ATTR; +struct core_entry *__cores[NUM_CORES] IBSS_ATTR; +#else +struct core_entry __cores[NUM_CORES] IBSS_ATTR; +#endif + +static struct thread_entry __thread_entries[MAXTHREADS] IBSS_ATTR; +struct thread_entry *__threads[MAXTHREADS] IBSS_ATTR; + + +/** Internal functions **/ + +/*--------------------------------------------------------------------------- + * Find an empty thread slot or NULL if none found. The slot returned will + * be locked on multicore. 
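The semaphore_wait() rework above changes how a timed-out waiter is accounted for, but the calling convention stays the same. A typical producer/consumer sketch; the driver names and handlers are illustrative:

    static struct semaphore data_ready;

    void driver_init(void)
    {
        semaphore_init(&data_ready, 1, 0);   /* max count 1, initially unsignalled */
    }

    void driver_data_irq(void)               /* e.g. from an interrupt handler */
    {
        semaphore_release(&data_ready);
    }

    void consumer_thread(void)
    {
        while (1)
        {
            if (semaphore_wait(&data_ready, HZ/10) == OBJ_WAIT_SUCCEEDED)
                process_data();              /* hypothetical */
            else
                handle_timeout();            /* OBJ_WAIT_TIMEDOUT, about 1/10 s */
        }
    }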
+ *--------------------------------------------------------------------------- + */ +static struct threadalloc +{ + threadbit_t avail; +#if NUM_CORES > 1 + struct corelock cl; +#endif +} threadalloc SHAREDBSS_ATTR; + +/*--------------------------------------------------------------------------- + * Initialize the thread allocator + *--------------------------------------------------------------------------- + */ +void thread_alloc_init(void) +{ + corelock_init(&threadalloc.cl); + + for (unsigned int core = 0; core < NUM_CORES; core++) + { + #if NUM_CORES > 1 + struct core_entry *c = &__core_entries[core]; + __cores[core] = c; + #else + struct core_entry *c = &__cores[core]; + #endif + rtr_queue_init(&c->rtr); + corelock_init(&c->rtr_cl); + tmo_queue_init(&c->tmo); + c->next_tmo_check = current_tick; /* Something not in the past */ + } + + for (unsigned int slotnum = 0; slotnum < MAXTHREADS; slotnum++) + { + struct thread_entry *t = &__thread_entries[slotnum]; + __threads[slotnum] = t; + corelock_init(&t->waiter_cl); + corelock_init(&t->slot_cl); + t->id = THREAD_ID_INIT(slotnum); + threadbit_set_bit(&threadalloc.avail, slotnum); + } +} + +/*--------------------------------------------------------------------------- + * Allocate a thread alot + *--------------------------------------------------------------------------- + */ +struct thread_entry * thread_alloc(void) +{ + struct thread_entry *thread = NULL; + + corelock_lock(&threadalloc.cl); + + unsigned int slotnum = threadbit_ffs(&threadalloc.avail); + if (slotnum < MAXTHREADS) + { + threadbit_clear_bit(&threadalloc.avail, slotnum); + thread = __threads[slotnum]; + } + + corelock_unlock(&threadalloc.cl); + + return thread; +} + +/*--------------------------------------------------------------------------- + * Free the thread slot of 'thread' + *--------------------------------------------------------------------------- + */ +void thread_free(struct thread_entry *thread) +{ + corelock_lock(&threadalloc.cl); + threadbit_set_bit(&threadalloc.avail, THREAD_ID_SLOT(thread->id)); + corelock_unlock(&threadalloc.cl); +} + +/*--------------------------------------------------------------------------- + * Assign the thread slot a new ID. Version is 0x00000100..0xffffff00. + *--------------------------------------------------------------------------- + */ +void new_thread_id(struct thread_entry *thread) +{ + uint32_t id = thread->id + (1u << THREAD_ID_VERSION_SHIFT); + + /* If wrapped to 0, make it 1 */ + if ((id & THREAD_ID_VERSION_MASK) == 0) + id |= (1u << THREAD_ID_VERSION_SHIFT); + + thread->id = id; +} + /*--------------------------------------------------------------------------- * Wakeup an entire queue of threads - returns bitwise-or of return bitmask - * from each operation or THREAD_NONE of nothing was awakened. Object owning - * the queue must be locked first. - * - * INTERNAL: Intended for use by kernel objects and not for programs. + * from each operation or THREAD_NONE of nothing was awakened. 
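new_thread_id() above bumps only the version part of a slot's id, so stale handles to a recycled slot stop matching. A worked example of the arithmetic, mirroring the THREAD_ID_* constants from thread-internal.h (the slot mask is the low byte of the 24/8 split):

    #include <assert.h>
    #include <stdint.h>

    /* layout: |VVVVVVVV|VVVVVVVV|VVVVVVVV|SSSSSSSS| */
    #define ID_VERSION_SHIFT 8
    #define ID_SLOT_MASK     0x000000ffu
    #define ID_INIT(n)       ((1u << ID_VERSION_SHIFT) | (n))
    #define ID_SLOT(id)      ((id) & ID_SLOT_MASK)

    static void id_example(void)
    {
        uint32_t id = ID_INIT(5);            /* slot 5, version 1: 0x00000105 */
        id += 1u << ID_VERSION_SHIFT;        /* slot reused:       0x00000205 */
        assert(ID_SLOT(id) == 5);            /* slot part never changes */
        /* a handle still carrying version 1 no longer equals 'id', and a
           version that wraps to 0 is bumped back to 1 by new_thread_id() */
    }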
*--------------------------------------------------------------------------- */ -unsigned int thread_queue_wake(struct thread_entry **list) +unsigned int wait_queue_wake(struct __wait_queue *wqp) { unsigned result = THREAD_NONE; + struct thread_entry *thread; - for (;;) - { - unsigned int rc = wakeup_thread(list, WAKEUP_DEFAULT); - - if (rc == THREAD_NONE) - break; /* No more threads */ - - result |= rc; - } + while ((thread = WQ_THREAD_FIRST(wqp))) + result |= wakeup_thread(thread, WAKEUP_DEFAULT); return result; } -/** Debug screen stuff **/ +/** Public functions **/ + +#ifdef RB_PROFILE +void profile_thread(void) +{ + profstart(THREAD_ID_SLOT(__running_self_entry()->id)); +} +#endif /*--------------------------------------------------------------------------- - * returns the stack space used in bytes + * Return the thread id of the calling thread + * -------------------------------------------------------------------------- + */ +unsigned int thread_self(void) +{ + return __running_self_entry()->id; +} + +/*--------------------------------------------------------------------------- + * Suspends a thread's execution for at least the specified number of ticks. + * + * May result in CPU core entering wait-for-interrupt mode if no other thread + * may be scheduled. + * + * NOTE: sleep(0) sleeps until the end of the current tick + * sleep(n) that doesn't result in rescheduling: + * n <= ticks suspended < n + 1 + * n to n+1 is a lower bound. Other factors may affect the actual time + * a thread is suspended before it runs again. + *--------------------------------------------------------------------------- + */ +unsigned sleep(unsigned ticks) +{ + /* In certain situations, certain bootloaders in particular, a normal + * threading call is inappropriate. */ + if (SLEEP_KERNEL_HOOK(ticks)) + return 0; /* Handled */ + + disable_irq(); + sleep_thread(ticks); + switch_thread(); + return 0; +} + +/*--------------------------------------------------------------------------- + * Elects another thread to run or, if no other thread may be made ready to + * run, immediately returns control back to the calling thread. + *--------------------------------------------------------------------------- + */ +void yield(void) +{ + /* In certain situations, certain bootloaders in particular, a normal + * threading call is inappropriate. */ + if (YIELD_KERNEL_HOOK()) + return; /* Handled */ + + switch_thread(); +} + + +/** Debug screen stuff **/ + +void format_thread_name(char *buf, size_t bufsize, + const struct thread_entry *thread) +{ + const char *name = thread->name; + if (!name) + name = ""; + + const char *fmt = *name ? "%s" : "%s%08lX"; + snprintf(buf, bufsize, fmt, name, thread->id); +} + +#ifndef HAVE_SDL_THREADS +/*--------------------------------------------------------------------------- + * Returns the maximum percentage of the stack ever used during runtime. *--------------------------------------------------------------------------- */ static unsigned int stack_usage(uintptr_t *stackptr, size_t stack_size) @@ -69,13 +252,9 @@ static unsigned int stack_usage(uintptr_t *stackptr, size_t stack_size) return usage; } +#endif /* HAVE_SDL_THREADS */ #if NUM_CORES > 1 -/*--------------------------------------------------------------------------- - * Returns the maximum percentage of the core's idle stack ever used during - * runtime. 
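stack_usage(), kept above for non-SDL builds only, reports how much of a thread stack has ever been touched. The usual Rockbox approach is watermarking: the stack is filled with a known pattern at creation (see DEADBEEF below) and usage is estimated by counting words that still hold the pattern. A simplified sketch of that idea, not the exact implementation:

    #include <stdint.h>
    #include <stddef.h>

    #define WATERMARK ((uintptr_t)0xdeadbeefdeadbeefull)   /* cf. DEADBEEF */

    /* fill once when the thread is created */
    static void stack_fill(uintptr_t *stack, size_t size)
    {
        for (size_t i = 0; i < size / sizeof (uintptr_t); i++)
            stack[i] = WATERMARK;
    }

    /* later: untouched words remain at the low end (stacks grow downward on
       the targets concerned), so count them and report the used percentage */
    static unsigned int stack_usage_pct(const uintptr_t *stack, size_t size)
    {
        size_t words = size / sizeof (uintptr_t);
        size_t unused = 0;
        while (unused < words && stack[unused] == WATERMARK)
            unused++;
        return words ? (unsigned int)((words - unused) * 100 / words) : 0;
    }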
- *--------------------------------------------------------------------------- - */ int core_get_debug_info(unsigned int core, struct core_debug_info *infop) { extern uintptr_t * const idle_stacks[NUM_CORES]; @@ -105,29 +284,29 @@ int thread_get_debug_info(unsigned int thread_id, if (!infop) return -1; - unsigned int slot = THREAD_ID_SLOT(thread_id); - if (slot >= MAXTHREADS) + unsigned int slotnum = THREAD_ID_SLOT(thread_id); + if (slotnum >= MAXTHREADS) return -1; - extern struct thread_entry threads[MAXTHREADS]; - struct thread_entry *thread = &threads[slot]; + struct thread_entry *thread = __thread_slot_entry(slotnum); int oldlevel = disable_irq_save(); - LOCK_THREAD(thread); + corelock_lock(&threadalloc.cl); + corelock_lock(&thread->slot_cl); unsigned int state = thread->state; - if (state != STATE_KILLED) - { - const char *name = thread->name; - if (!name) - name = ""; + int ret = 0; + if (threadbit_test_bit(&threadalloc.avail, slotnum) == 0) + { bool cpu_boost = false; #ifdef HAVE_SCHEDULER_BOOSTCTRL cpu_boost = thread->cpu_boost; #endif +#ifndef HAVE_SDL_THREADS infop->stack_usage = stack_usage(thread->stack, thread->stack_size); +#endif #if NUM_CORES > 1 infop->core = thread->core; #endif @@ -140,13 +319,13 @@ int thread_get_debug_info(unsigned int thread_id, cpu_boost ? '+' : (state == STATE_RUNNING ? '*' : ' '), status_chars[state]); - const char *fmt = *name ? "%s" : "%s%08lX"; - snprintf(infop->name, sizeof (infop->name), fmt, name, - thread->id); + format_thread_name(infop->name, sizeof (infop->name), thread); + ret = 1; } - UNLOCK_THREAD(thread); + corelock_unlock(&thread->slot_cl); + corelock_unlock(&threadalloc.cl); restore_irq(oldlevel); - return state == STATE_KILLED ? 0 : 1; + return ret; } diff --git a/firmware/kernel/thread-internal.h b/firmware/kernel/thread-internal.h index 894bd1fe7c..10606a54a6 100644 --- a/firmware/kernel/thread-internal.h +++ b/firmware/kernel/thread-internal.h @@ -78,30 +78,11 @@ struct priority_distribution #endif /* HAVE_PRIORITY_SCHEDULING */ -#ifdef HAVE_CORELOCK_OBJECT -/* Operations to be performed just before stopping a thread and starting - a new one if specified before calling switch_thread */ -enum -{ - TBOP_CLEAR = 0, /* No operation to do */ - TBOP_UNLOCK_CORELOCK, /* Unlock a corelock variable */ - TBOP_SWITCH_CORE, /* Call the core switch preparation routine */ -}; +#define __rtr_queue lldc_head +#define __rtr_queue_node lldc_node -struct thread_blk_ops -{ - struct corelock *cl_p; /* pointer to corelock */ - unsigned char flags; /* TBOP_* flags */ -}; -#endif /* NUM_CORES > 1 */ - -/* Link information for lists thread is in */ -struct thread_entry; /* forward */ -struct thread_list -{ - struct thread_entry *prev; /* Previous thread in a list */ - struct thread_entry *next; /* Next thread in a list */ -}; +#define __tmo_queue ll_head +#define __tmo_queue_node ll_node /* Information kept in each thread slot * members are arranged according to size - largest first - in order @@ -109,96 +90,55 @@ struct thread_list */ struct thread_entry { - struct regs context; /* Register context at switch - - _must_ be first member */ - uintptr_t *stack; /* Pointer to top of stack */ - const char *name; /* Thread name */ - long tmo_tick; /* Tick when thread should be woken from - timeout - - states: STATE_SLEEPING/STATE_BLOCKED_W_TMO */ - struct thread_list l; /* Links for blocked/waking/running - - circular linkage in both directions */ - struct thread_list tmo; /* Links for timeout list - - Circular in reverse direction, NULL-terminated in - 
forward direction - - states: STATE_SLEEPING/STATE_BLOCKED_W_TMO */ - struct thread_entry **bqp; /* Pointer to list variable in kernel - object where thread is blocked - used - for implicit unblock and explicit wake - states: STATE_BLOCKED/STATE_BLOCKED_W_TMO */ -#ifdef HAVE_CORELOCK_OBJECT - struct corelock *obj_cl; /* Object corelock where thead is blocked - - states: STATE_BLOCKED/STATE_BLOCKED_W_TMO */ - struct corelock waiter_cl; /* Corelock for thread_wait */ - struct corelock slot_cl; /* Corelock to lock thread slot */ - unsigned char core; /* The core to which thread belongs */ + struct regs context; /* Register context at switch - + _must_ be first member */ +#ifndef HAVE_SDL_THREADS + uintptr_t *stack; /* Pointer to top of stack */ #endif - struct thread_entry *queue; /* List of threads waiting for thread to be - removed */ -#ifdef HAVE_WAKEUP_EXT_CB - void (*wakeup_ext_cb)(struct thread_entry *thread); /* Callback that - performs special steps needed when being - forced off of an object's wait queue that - go beyond the standard wait queue removal - and priority disinheritance */ - /* Only enabled when using queue_send for now */ + const char *name; /* Thread name */ + long tmo_tick; /* Tick when thread should be woken */ + struct __rtr_queue_node rtr; /* Node for run queue */ + struct __tmo_queue_node tmo; /* Links for timeout list */ + struct __wait_queue_node wq; /* Node for wait queue */ + struct __wait_queue *volatile wqp; /* Pointer to registered wait queue */ +#if NUM_CORES > 1 + struct corelock waiter_cl; /* Corelock for thread_wait */ + struct corelock slot_cl; /* Corelock to lock thread slot */ + unsigned char core; /* The core to which thread belongs */ #endif -#if defined(HAVE_SEMAPHORE_OBJECTS) || \ - defined(HAVE_EXTENDED_MESSAGING_AND_NAME) || \ - NUM_CORES > 1 - volatile intptr_t retval; /* Return value from a blocked operation/ - misc. use */ -#endif - uint32_t id; /* Current slot id */ - int __errno; /* Thread error number (errno tls) */ + struct __wait_queue queue; /* List of threads waiting for thread to be + removed */ + volatile intptr_t retval; /* Return value from a blocked operation/ + misc. 
use */ + uint32_t id; /* Current slot id */ + int __errno; /* Thread error number (errno tls) */ #ifdef HAVE_PRIORITY_SCHEDULING /* Priority summary of owned objects that support inheritance */ - struct blocker *blocker; /* Pointer to blocker when this thread is blocked - on an object that supports PIP - - states: STATE_BLOCKED/STATE_BLOCKED_W_TMO */ + struct blocker *blocker; /* Pointer to blocker when this thread is blocked + on an object that supports PIP - + states: STATE_BLOCKED/STATE_BLOCKED_W_TMO */ struct priority_distribution pdist; /* Priority summary of owned objects - that have blocked threads and thread's own - base priority */ - int skip_count; /* Number of times skipped if higher priority - thread was running */ + that have blocked threads and thread's own + base priority */ + int skip_count; /* Number of times skipped if higher priority + thread was running */ unsigned char base_priority; /* Base priority (set explicitly during creation or thread_set_priority) */ - unsigned char priority; /* Scheduled priority (higher of base or - all threads blocked by this one) */ + unsigned char priority; /* Scheduled priority (higher of base or + all threads blocked by this one) */ #endif - unsigned short stack_size; /* Size of stack in bytes */ - unsigned char state; /* Thread slot state (STATE_*) */ +#ifndef HAVE_SDL_THREADS + unsigned short stack_size; /* Size of stack in bytes */ +#endif + unsigned char state; /* Thread slot state (STATE_*) */ #ifdef HAVE_SCHEDULER_BOOSTCTRL - unsigned char cpu_boost; /* CPU frequency boost flag */ + unsigned char cpu_boost; /* CPU frequency boost flag */ #endif #ifdef HAVE_IO_PRIORITY unsigned char io_priority; #endif }; -/* Information kept for each core - * Members are arranged for the same reason as in thread_entry - */ -struct core_entry -{ - /* "Active" lists - core is constantly active on these and are never - locked and interrupts do not access them */ - struct thread_entry *running; /* threads that are running (RTR) */ - struct thread_entry *timeout; /* threads that are on a timeout before - running again */ - struct thread_entry *block_task; /* Task going off running list */ -#ifdef HAVE_PRIORITY_SCHEDULING - struct priority_distribution rtr; /* Summary of running and ready-to-run - threads */ -#endif - long next_tmo_check; /* soonest time to check tmo threads */ -#ifdef HAVE_CORELOCK_OBJECT - struct thread_blk_ops blk_ops; /* operations to perform when - blocking a thread */ - struct corelock rtr_cl; /* Lock for rtr list */ -#endif /* NUM_CORES */ -}; - /* Thread ID, 32 bits = |VVVVVVVV|VVVVVVVV|VVVVVVVV|SSSSSSSS| */ #define THREAD_ID_VERSION_SHIFT 8 #define THREAD_ID_VERSION_MASK 0xffffff00 @@ -206,38 +146,128 @@ struct core_entry #define THREAD_ID_INIT(n) ((1u << THREAD_ID_VERSION_SHIFT) | (n)) #define THREAD_ID_SLOT(id) ((id) & THREAD_ID_SLOT_MASK) -/* Thread locking */ -#if NUM_CORES > 1 -#define LOCK_THREAD(thread) \ - ({ corelock_lock(&(thread)->slot_cl); }) -#define TRY_LOCK_THREAD(thread) \ - ({ corelock_try_lock(&(thread)->slot_cl); }) -#define UNLOCK_THREAD(thread) \ - ({ corelock_unlock(&(thread)->slot_cl); }) -#define UNLOCK_THREAD_AT_TASK_SWITCH(thread) \ - ({ unsigned int _core = (thread)->core; \ - cores[_core].blk_ops.flags |= TBOP_UNLOCK_CORELOCK; \ - cores[_core].blk_ops.cl_p = &(thread)->slot_cl; }) -#else /* NUM_CORES == 1*/ -#define LOCK_THREAD(thread) \ - ({ (void)(thread); }) -#define TRY_LOCK_THREAD(thread) \ - ({ (void)(thread); }) -#define UNLOCK_THREAD(thread) \ - ({ (void)(thread); }) -#define 
UNLOCK_THREAD_AT_TASK_SWITCH(thread) \ - ({ (void)(thread); }) -#endif /* NUM_CORES */ - #define DEADBEEF ((uintptr_t)0xdeadbeefdeadbeefull) +/* Information kept for each core + * Members are arranged for the same reason as in thread_entry + */ +struct core_entry +{ + /* "Active" lists - core is constantly active on these and are never + locked and interrupts do not access them */ + struct __rtr_queue rtr; /* Threads that are runnable */ + struct __tmo_queue tmo; /* Threads on a bounded wait */ + struct thread_entry *running; /* Currently running thread */ +#ifdef HAVE_PRIORITY_SCHEDULING + struct priority_distribution rtr_dist; /* Summary of runnables */ +#endif + long next_tmo_check; /* Next due timeout check */ +#if NUM_CORES > 1 + struct corelock rtr_cl; /* Lock for rtr list */ +#endif /* NUM_CORES */ +}; + +/* Hide a few scheduler details from itself to make allocation more flexible */ +#define __main_thread_name \ + ({ extern const char __main_thread_name_str[]; \ + __main_thread_name_str; }) + +static FORCE_INLINE + void * __get_main_stack(size_t *stacksize) +{ +#if (CONFIG_PLATFORM & PLATFORM_NATIVE) + extern uintptr_t stackbegin[]; + extern uintptr_t stackend[]; +#else + extern uintptr_t *stackbegin; + extern uintptr_t *stackend; +#endif + *stacksize = (uintptr_t)stackend - (uintptr_t)stackbegin; + return stackbegin; +} + +void format_thread_name(char *buf, size_t bufsize, + const struct thread_entry *thread); + +static FORCE_INLINE + struct core_entry * __core_id_entry(unsigned int core) +{ +#if NUM_CORES > 1 + extern struct core_entry * __cores[NUM_CORES]; + return __cores[core]; +#else + extern struct core_entry __cores[NUM_CORES]; + return &__cores[core]; +#endif +} + +#define __running_self_entry() \ + __core_id_entry(CURRENT_CORE)->running + +static FORCE_INLINE + struct thread_entry * __thread_slot_entry(unsigned int slotnum) +{ + extern struct thread_entry * __threads[MAXTHREADS]; + return __threads[slotnum]; +} + +#define __thread_id_entry(id) \ + __thread_slot_entry(THREAD_ID_SLOT(id)) + +#define THREAD_FROM(p, member) \ + container_of(p, struct thread_entry, member) + +#define RTR_EMPTY(rtrp) \ + ({ (rtrp)->head == NULL; }) + +#define RTR_THREAD_FIRST(rtrp) \ + ({ THREAD_FROM((rtrp)->head, rtr); }) + +#define RTR_THREAD_NEXT(thread) \ + ({ THREAD_FROM((thread)->rtr.next, rtr); }) + +#define TMO_THREAD_FIRST(tmop) \ + ({ struct __tmo_queue *__tmop = (tmop); \ + __tmop->head ? THREAD_FROM(__tmop->head, tmo) : NULL; }) + +#define TMO_THREAD_NEXT(thread) \ + ({ struct __tmo_queue_node *__next = (thread)->tmo.next; \ + __next ? THREAD_FROM(__next, tmo) : NULL; }) + +#define WQ_THREAD_FIRST(wqp) \ + ({ struct __wait_queue *__wqp = (wqp); \ + __wqp->head ? THREAD_FROM(__wqp->head, wq) : NULL; }) + +#define WQ_THREAD_NEXT(thread) \ + ({ struct __wait_queue_node *__next = (thread)->wq.next; \ + __next ? 
THREAD_FROM(__next, wq) : NULL; }) + +void thread_alloc_init(void) INIT_ATTR; +struct thread_entry * thread_alloc(void); +void thread_free(struct thread_entry *thread); +void new_thread_id(struct thread_entry *thread); + /* Switch to next runnable thread */ void switch_thread(void); /* Blocks a thread for at least the specified number of ticks (0 = wait until * next tick) */ void sleep_thread(int ticks); /* Blocks the current thread on a thread queue (< 0 == infinite) */ -void block_thread(struct thread_entry *current, int timeout); +void block_thread_(struct thread_entry *current, int timeout); + +#ifdef HAVE_PRIORITY_SCHEDULING +#define block_thread(thread, timeout, __wqp, bl) \ + ({ struct thread_entry *__t = (thread); \ + __t->wqp = (__wqp); \ + if (!__builtin_constant_p(bl) || (bl)) \ + __t->blocker = (bl); \ + block_thread_(__t, (timeout)); }) +#else +#define block_thread(thread, timeout, __wqp, bl...) \ + ({ struct thread_entry *__t = (thread); \ + __t->wqp = (__wqp); \ + block_thread_(__t, (timeout)); }) +#endif /* Return bit flags for thread wakeup */ #define THREAD_NONE 0x0 /* No thread woken up (exclusive) */ @@ -246,7 +276,7 @@ void block_thread(struct thread_entry *current, int timeout); higher priority than current were woken) */ /* A convenience function for waking an entire queue of threads. */ -unsigned int thread_queue_wake(struct thread_entry **list); +unsigned int wait_queue_wake(struct __wait_queue *wqp); /* Wakeup a thread at the head of a list */ enum wakeup_thread_protocol @@ -257,36 +287,139 @@ enum wakeup_thread_protocol WAKEUP_TRANSFER_MULTI, }; -unsigned int wakeup_thread_(struct thread_entry **list +unsigned int wakeup_thread_(struct thread_entry *thread IF_PRIO(, enum wakeup_thread_protocol proto)); #ifdef HAVE_PRIORITY_SCHEDULING -#define wakeup_thread(list, proto) \ - wakeup_thread_((list), (proto)) -#else /* !HAVE_PRIORITY_SCHEDULING */ -#define wakeup_thread(list, proto...) \ - wakeup_thread_((list)); -#endif /* HAVE_PRIORITY_SCHEDULING */ - -#ifdef HAVE_IO_PRIORITY -void thread_set_io_priority(unsigned int thread_id, int io_priority); -int thread_get_io_priority(unsigned int thread_id); -#endif /* HAVE_IO_PRIORITY */ -#if NUM_CORES > 1 -unsigned int switch_core(unsigned int new_core); +#define wakeup_thread(thread, proto) \ + wakeup_thread_((thread), (proto)) +#else +#define wakeup_thread(thread, proto...) \ + wakeup_thread_((thread)); #endif -/* Return the id of the calling thread. 
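/*
 * Illustrative sketch, not part of this patch: the THREAD_FROM()/WQ_THREAD_*()
 * macros above recover the owning struct thread_entry from a queue node that
 * is embedded inside it.  The portable container_of variant below (invented
 * names, no typeof/GCC extensions) shows the pointer arithmetic involved.
 */
#include <assert.h>
#include <stddef.h>

#define my_container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct node   { struct node *next; };
struct worker { int id; struct node wq; };      /* node embedded mid-struct */

int main(void)
{
    struct worker w = { .id = 42 };
    struct node *n = &w.wq;                     /* what a wait queue stores */
    struct worker *back = my_container_of(n, struct worker, wq);
    assert(back == &w && back->id == 42);
    return 0;
}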
*/ -unsigned int thread_self(void); - -/* Return the thread_entry for the calling thread */ -struct thread_entry* thread_self_entry(void); - -/* Return thread entry from id */ -struct thread_entry *thread_id_entry(unsigned int thread_id); - #ifdef RB_PROFILE void profile_thread(void); #endif +static inline void rtr_queue_init(struct __rtr_queue *rtrp) +{ + lldc_init(rtrp); +} + +static inline void rtr_queue_make_first(struct __rtr_queue *rtrp, + struct thread_entry *thread) +{ + rtrp->head = &thread->rtr; +} + +static inline void rtr_queue_add(struct __rtr_queue *rtrp, + struct thread_entry *thread) +{ + lldc_insert_last(rtrp, &thread->rtr); +} + +static inline void rtr_queue_remove(struct __rtr_queue *rtrp, + struct thread_entry *thread) +{ + lldc_remove(rtrp, &thread->rtr); +} + +#define TMO_NOT_QUEUED (NULL + 1) + +static inline bool tmo_is_queued(struct thread_entry *thread) +{ + return thread->tmo.next != TMO_NOT_QUEUED; +} + +static inline void tmo_set_dequeued(struct thread_entry *thread) +{ + thread->tmo.next = TMO_NOT_QUEUED; +} + +static inline void tmo_queue_init(struct __tmo_queue *tmop) +{ + ll_init(tmop); +} + +static inline void tmo_queue_expire(struct __tmo_queue *tmop, + struct thread_entry *prev, + struct thread_entry *thread) +{ + ll_remove_next(tmop, prev ? &prev->tmo : NULL); + tmo_set_dequeued(thread); +} + +static inline void tmo_queue_remove(struct __tmo_queue *tmop, + struct thread_entry *thread) +{ + if (tmo_is_queued(thread)) + { + ll_remove(tmop, &thread->tmo); + tmo_set_dequeued(thread); + } +} + +static inline void tmo_queue_register(struct __tmo_queue *tmop, + struct thread_entry *thread) +{ + if (!tmo_is_queued(thread)) + ll_insert_last(tmop, &thread->tmo); +} + +static inline void wait_queue_init(struct __wait_queue *wqp) +{ + lld_init(wqp); +} + +static inline void wait_queue_register(struct thread_entry *thread) +{ + lld_insert_last(thread->wqp, &thread->wq); +} + +static inline struct __wait_queue * + wait_queue_ptr(struct thread_entry *thread) +{ + return thread->wqp; +} + +static inline struct __wait_queue * + wait_queue_remove(struct thread_entry *thread) +{ + struct __wait_queue *wqp = thread->wqp; + thread->wqp = NULL; + lld_remove(wqp, &thread->wq); + return wqp; +} + +static inline struct __wait_queue * + wait_queue_try_remove(struct thread_entry *thread) +{ + struct __wait_queue *wqp = thread->wqp; + if (wqp) + { + thread->wqp = NULL; + lld_remove(wqp, &thread->wq); + } + + return wqp; +} + +static inline void blocker_init(struct blocker *bl) +{ + bl->thread = NULL; +#ifdef HAVE_PRIORITY_SCHEDULING + bl->priority = PRIORITY_IDLE; +#endif +} + +static inline void blocker_splay_init(struct blocker_splay *blsplay) +{ + blocker_init(&blsplay->blocker); +#ifdef HAVE_PRIORITY_SCHEDULING + threadbit_clear(&blsplay->mask); +#endif + corelock_init(&blsplay->cl); +} + #endif /* THREAD_INTERNAL_H */ diff --git a/firmware/kernel/thread.c b/firmware/kernel/thread.c index c148f6b76e..b916c3b521 100644 --- a/firmware/kernel/thread.c +++ b/firmware/kernel/thread.c @@ -37,11 +37,6 @@ #endif #include "core_alloc.h" -/**************************************************************************** - * ATTENTION!! * - * See notes below on implementing processor-specific portions! 
* - ***************************************************************************/ - /* Define THREAD_EXTRA_CHECKS as 1 to enable additional state checks */ #ifdef DEBUG #define THREAD_EXTRA_CHECKS 1 /* Always 1 for DEBUG */ @@ -49,7 +44,11 @@ #define THREAD_EXTRA_CHECKS 0 #endif -/** +/**************************************************************************** + * ATTENTION!! * + * See notes below on implementing processor-specific portions! * + **************************************************************************** + * * General locking order to guarantee progress. Order must be observed but * all stages are not nescessarily obligatory. Going from 1) to 3) is * perfectly legal. @@ -66,14 +65,14 @@ * unlock and the other processor's handler may proceed at that time. Not * nescessary when the resource in question is definitely not available to * interrupt handlers. - * + * * 2) Kernel Object * 1) May be needed beforehand if the kernel object allows dual-use such as * event queues. The kernel object must have a scheme to protect itself from * access by another processor and is responsible for serializing the calls - * to block_thread(_w_tmo) and wakeup_thread both to themselves and to each - * other. Objects' queues are also protected here. - * + * to block_thread and wakeup_thread both to themselves and to each other. + * Objects' queues are also protected here. + * * 3) Thread Slot * This locks access to the thread's slot such that its state cannot be * altered by another processor when a state change is in progress such as @@ -121,68 +120,62 @@ * available then some careful non-blocking synchonization is needed (as on * PP targets at the moment). *--------------------------------------------------------------------------- + * + * + *--------------------------------------------------------------------------- + * Priority distribution structure (one category for each possible priority): + * + * +----+----+----+ ... +------+ + * hist: | F0 | F1 | F2 | | Fn-1 | + * +----+----+----+ ... +------+ + * mask: | b0 | b1 | b2 | | bn-1 | + * +----+----+----+ ... +------+ + * + * F = count of threads at priority category n (frequency) + * b = bitmask of non-zero priority categories (occupancy) + * + * / if H[n] != 0 : 1 + * b[n] = | + * \ else : 0 + * + *--------------------------------------------------------------------------- + * Basic priority inheritance priotocol (PIP): + * + * Mn = mutex n, Tn = thread n + * + * A lower priority thread inherits the priority of the highest priority + * thread blocked waiting for it to complete an action (such as release a + * mutex or respond to a message via queue_send): + * + * 1) T2->M1->T1 + * + * T1 owns M1, T2 is waiting for M1 to realease M1. If T2 has a higher + * priority than T1 then T1 inherits the priority of T2. + * + * 2) T3 + * \/ + * T2->M1->T1 + * + * Situation is like 1) but T2 and T3 are both queued waiting for M1 and so + * T1 inherits the higher of T2 and T3. + * + * 3) T3->M2->T2->M1->T1 + * + * T1 owns M1, T2 owns M2. If T3 has a higher priority than both T1 and T2, + * then T1 inherits the priority of T3 through T2. + * + * Blocking chains can grow arbitrarily complex (though it's best that they + * not form at all very often :) and build-up from these units. + *--------------------------------------------------------------------------- */ - -/* Cast to the the machine pointer size, whose size could be < 4 or > 32 - * (someday :). 
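/*
 * Illustrative sketch, not part of this patch: the priority_distribution
 * described above, modelled with a plain 32-entry histogram plus a 32-bit
 * occupancy mask.  The best (numerically lowest) occupied level is simply the
 * first set bit of the mask, which is the job priobit_ffs() performs on the
 * real structure.  The level count of 32 and the idle level are assumptions
 * made only for this demo.
 */
#include <assert.h>
#include <stdint.h>

#define NUM_PRIO 32

struct prio_dist {
    uint8_t  hist[NUM_PRIO];   /* thread count per priority level */
    uint32_t mask;             /* bit n set <=> hist[n] != 0 */
};

static void prio_add(struct prio_dist *pd, int prio)
{
    if (++pd->hist[prio] == 1)
        pd->mask |= 1u << prio;
}

static void prio_subtract(struct prio_dist *pd, int prio)
{
    if (--pd->hist[prio] == 0)
        pd->mask &= ~(1u << prio);
}

static int prio_highest(const struct prio_dist *pd)
{
    /* __builtin_ctz() is undefined for 0, so treat "empty" as the idle level */
    return pd->mask ? __builtin_ctz(pd->mask) : NUM_PRIO - 1;
}

int main(void)
{
    struct prio_dist pd = {0};
    prio_add(&pd, 20);                /* two threads at a normal level */
    prio_add(&pd, 20);
    prio_add(&pd, 5);                 /* one high-priority (low number) thread */
    assert(prio_highest(&pd) == 5);
    prio_subtract(&pd, 5);
    assert(prio_highest(&pd) == 20);  /* falls back to the remaining level */
    return 0;
}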
*/ -static struct core_entry cores[NUM_CORES] IBSS_ATTR; -struct thread_entry threads[MAXTHREADS] IBSS_ATTR; - -static const char main_thread_name[] = "main"; -#if (CONFIG_PLATFORM & PLATFORM_NATIVE) -extern uintptr_t stackbegin[]; -extern uintptr_t stackend[]; -#else -extern uintptr_t *stackbegin; -extern uintptr_t *stackend; -#endif - -static inline void core_sleep(IF_COP_VOID(unsigned int core)) - __attribute__((always_inline)); - -void check_tmo_threads(void) - __attribute__((noinline)); - -static inline void block_thread_on_l(struct thread_entry *thread, unsigned state) - __attribute__((always_inline)); - -static void add_to_list_tmo(struct thread_entry *thread) - __attribute__((noinline)); - -static void core_schedule_wakeup(struct thread_entry *thread) - __attribute__((noinline)); - -#if NUM_CORES > 1 -static inline void run_blocking_ops( - unsigned int core, struct thread_entry *thread) - __attribute__((always_inline)); -#endif - -static void thread_stkov(struct thread_entry *thread) - __attribute__((noinline)); - -static inline void store_context(void* addr) - __attribute__((always_inline)); - -static inline void load_context(const void* addr) - __attribute__((always_inline)); - -#if NUM_CORES > 1 -static void thread_final_exit_do(struct thread_entry *current) - __attribute__((noinline)) NORETURN_ATTR USED_ATTR; -#else -static inline void thread_final_exit(struct thread_entry *current) - __attribute__((always_inline)) NORETURN_ATTR; -#endif - -void switch_thread(void) - __attribute__((noinline)); +static FORCE_INLINE void core_sleep(IF_COP_VOID(unsigned int core)); +static FORCE_INLINE void store_context(void* addr); +static FORCE_INLINE void load_context(const void* addr); /**************************************************************************** * Processor/OS-specific section - include necessary core support */ - #include "asm/thread.c" #if defined (CPU_PP) @@ -193,20 +186,17 @@ void switch_thread(void) * End Processor-specific section ***************************************************************************/ -static NO_INLINE +static NO_INLINE NORETURN_ATTR void thread_panicf(const char *msg, struct thread_entry *thread) { IF_COP( const unsigned int core = thread->core; ) - static char namebuf[sizeof (((struct thread_debug_info *)0)->name)]; - const char *name = thread->name; - if (!name) - name = ""; - snprintf(namebuf, sizeof (namebuf), *name ? 
"%s" : "%s%08lX", - name, (unsigned long)thread->id); + static char name[sizeof (((struct thread_debug_info *)0)->name)]; + format_thread_name(name, sizeof (name), thread); panicf ("%s %s" IF_COP(" (%d)"), msg, name IF_COP(, core)); + while (1); } -static void thread_stkov(struct thread_entry *thread) +static NO_INLINE void thread_stkov(struct thread_entry *thread) { thread_panicf("Stkov", thread); } @@ -218,36 +208,51 @@ static void thread_stkov(struct thread_entry *thread) ({ if (!({ exp; })) thread_panicf((msg), (thread)); }) #else #define THREAD_PANICF(msg, thread) \ - do {} while (0) + do {} while (1) #define THREAD_ASSERT(exp, msg, thread) \ do {} while (0) #endif /* THREAD_EXTRA_CHECKS */ +/* Thread locking */ +#if NUM_CORES > 1 +#define LOCK_THREAD(thread) \ + ({ corelock_lock(&(thread)->slot_cl); }) +#define TRY_LOCK_THREAD(thread) \ + ({ corelock_try_lock(&(thread)->slot_cl); }) +#define UNLOCK_THREAD(thread) \ + ({ corelock_unlock(&(thread)->slot_cl); }) +#else /* NUM_CORES == 1*/ +#define LOCK_THREAD(thread) \ + ({ (void)(thread); }) +#define TRY_LOCK_THREAD(thread) \ + ({ (void)(thread); }) +#define UNLOCK_THREAD(thread) \ + ({ (void)(thread); }) +#endif /* NUM_CORES */ + /* RTR list */ -#define RTR_LOCK(core) \ - ({ corelock_lock(&cores[core].rtr_cl); }) -#define RTR_UNLOCK(core) \ - ({ corelock_unlock(&cores[core].rtr_cl); }) +#define RTR_LOCK(corep) \ + corelock_lock(&(corep)->rtr_cl) +#define RTR_UNLOCK(corep) \ + corelock_unlock(&(corep)->rtr_cl) #ifdef HAVE_PRIORITY_SCHEDULING -#define rtr_add_entry(core, priority) \ - prio_add_entry(&cores[core].rtr, (priority)) +#define rtr_add_entry(corep, priority) \ + prio_add_entry(&(corep)->rtr_dist, (priority)) +#define rtr_subtract_entry(corep, priority) \ + prio_subtract_entry(&(corep)->rtr_dist, (priority)) +#define rtr_move_entry(corep, from, to) \ + prio_move_entry(&(corep)->rtr_dist, (from), (to)) +#else /* !HAVE_PRIORITY_SCHEDULING */ +#define rtr_add_entry(corep, priority) \ + do {} while (0) +#define rtr_subtract_entry(corep, priority) \ + do {} while (0) +#define rtr_move_entry(corep, from, to) \ + do {} while (0) +#endif /* HAVE_PRIORITY_SCHEDULING */ -#define rtr_subtract_entry(core, priority) \ - prio_subtract_entry(&cores[core].rtr, (priority)) - -#define rtr_move_entry(core, from, to) \ - prio_move_entry(&cores[core].rtr, (from), (to)) -#else -#define rtr_add_entry(core, priority) -#define rtr_add_entry_inl(core, priority) -#define rtr_subtract_entry(core, priority) -#define rtr_subtract_entry_inl(core, priotity) -#define rtr_move_entry(core, from, to) -#define rtr_move_entry_inl(core, from, to) -#endif - -static inline void thread_store_context(struct thread_entry *thread) +static FORCE_INLINE void thread_store_context(struct thread_entry *thread) { #if (CONFIG_PLATFORM & PLATFORM_HOSTED) thread->__errno = errno; @@ -255,7 +260,7 @@ static inline void thread_store_context(struct thread_entry *thread) store_context(&thread->context); } -static inline void thread_load_context(struct thread_entry *thread) +static FORCE_INLINE void thread_load_context(struct thread_entry *thread) { load_context(&thread->context); #if (CONFIG_PLATFORM & PLATFORM_HOSTED) @@ -263,23 +268,155 @@ static inline void thread_load_context(struct thread_entry *thread) #endif } -static inline unsigned int should_switch_tasks(void) +static FORCE_INLINE unsigned int +should_switch_tasks(struct thread_entry *thread) { - unsigned int result = THREAD_OK; +#ifdef HAVE_PRIORITY_SCHEDULING + const unsigned int core = CURRENT_CORE; +#if NUM_CORES > 1 + 
/* Forget about it if different CPU */ + if (thread->core != core) + return THREAD_OK; +#endif + /* Just woke something therefore a thread is on the run queue */ + struct thread_entry *current = + RTR_THREAD_FIRST(&__core_id_entry(core)->rtr); + if (LIKELY(thread->priority >= current->priority)) + return THREAD_OK; + + /* There is a thread ready to run of higher priority on the same + * core as the current one; recommend a task switch. */ + return THREAD_OK | THREAD_SWITCH; +#else + return THREAD_OK; +#endif /* HAVE_PRIORITY_SCHEDULING */ +} #ifdef HAVE_PRIORITY_SCHEDULING - struct thread_entry *current = cores[CURRENT_CORE].running; - if (current && - priobit_ffs(&cores[IF_COP_CORE(current->core)].rtr.mask) - < current->priority) - { - /* There is a thread ready to run of higher priority on the same - * core as the current one; recommend a task switch. */ - result |= THREAD_SWITCH; - } + +/*--------------------------------------------------------------------------- + * Increment frequency at category "priority" + *--------------------------------------------------------------------------- + */ +static inline unsigned int prio_add_entry( + struct priority_distribution *pd, int priority) +{ + unsigned int count = ++pd->hist[priority]; + if (count == 1) + priobit_set_bit(&pd->mask, priority); + return count; +} + +/*--------------------------------------------------------------------------- + * Decrement frequency at category "priority" + *--------------------------------------------------------------------------- + */ +static inline unsigned int prio_subtract_entry( + struct priority_distribution *pd, int priority) +{ + unsigned int count = --pd->hist[priority]; + if (count == 0) + priobit_clear_bit(&pd->mask, priority); + return count; +} + +/*--------------------------------------------------------------------------- + * Remove from one category and add to another + *--------------------------------------------------------------------------- + */ +static inline void prio_move_entry( + struct priority_distribution *pd, int from, int to) +{ + if (--pd->hist[from] == 0) + priobit_clear_bit(&pd->mask, from); + + if (++pd->hist[to] == 1) + priobit_set_bit(&pd->mask, to); +} + #endif /* HAVE_PRIORITY_SCHEDULING */ - return result; +/*--------------------------------------------------------------------------- + * Common init for new thread basic info + *--------------------------------------------------------------------------- + */ +static void new_thread_base_init(struct thread_entry *thread, + void **stackp, size_t *stack_sizep, + const char *name IF_PRIO(, int priority) + IF_COP(, unsigned int core)) +{ + ALIGN_BUFFER(*stackp, *stack_sizep, MIN_STACK_ALIGN); + thread->stack = *stackp; + thread->stack_size = *stack_sizep; + + thread->name = name; + wait_queue_init(&thread->queue); + thread->wqp = NULL; + tmo_set_dequeued(thread); +#ifdef HAVE_PRIORITY_SCHEDULING + thread->skip_count = 0; + thread->blocker = NULL; + thread->base_priority = priority; + thread->priority = priority; + memset(&thread->pdist, 0, sizeof(thread->pdist)); + prio_add_entry(&thread->pdist, priority); +#endif +#if NUM_CORES > 1 + thread->core = core; +#endif +#ifdef HAVE_SCHEDULER_BOOSTCTRL + thread->cpu_boost = 0; +#endif +#ifdef HAVE_IO_PRIORITY + /* Default to high (foreground) priority */ + thread->io_priority = IO_PRIORITY_IMMEDIATE; +#endif +} + +/*--------------------------------------------------------------------------- + * Move a thread onto the core's run queue and promote it + 
*--------------------------------------------------------------------------- + */ +static inline void core_rtr_add(struct core_entry *corep, + struct thread_entry *thread) +{ + RTR_LOCK(corep); + rtr_queue_add(&corep->rtr, thread); + rtr_add_entry(corep, thread->priority); +#ifdef HAVE_PRIORITY_SCHEDULING + thread->skip_count = thread->base_priority; +#endif + thread->state = STATE_RUNNING; + RTR_UNLOCK(corep); +} + +/*--------------------------------------------------------------------------- + * Remove a thread from the core's run queue + *--------------------------------------------------------------------------- + */ +static inline void core_rtr_remove(struct core_entry *corep, + struct thread_entry *thread) +{ + RTR_LOCK(corep); + rtr_queue_remove(&corep->rtr, thread); + rtr_subtract_entry(corep, thread->priority); + /* Does not demote state */ + RTR_UNLOCK(corep); +} + +/*--------------------------------------------------------------------------- + * Move a thread back to a running state on its core + *--------------------------------------------------------------------------- + */ +static NO_INLINE void core_schedule_wakeup(struct thread_entry *thread) +{ + const unsigned int core = IF_COP_CORE(thread->core); + struct core_entry *corep = __core_id_entry(core); + core_rtr_add(corep, thread); +#if NUM_CORES > 1 + if (core != CURRENT_CORE) + core_wake(core); +#endif } #ifdef HAVE_PRIORITY_SCHEDULING @@ -339,274 +476,20 @@ static inline void unlock_blocker_thread(struct blocker *bl) #endif /* NUM_CORES > 1*/ (void)bl; } -#endif /* HAVE_PRIORITY_SCHEDULING */ -/*--------------------------------------------------------------------------- - * Thread list structure - circular: - * +------------------------------+ - * | | - * +--+---+<-+---+<-+---+<-+---+<-+ - * Head->| T | | T | | T | | T | - * +->+---+->+---+->+---+->+---+--+ - * | | - * +------------------------------+ - *--------------------------------------------------------------------------- - */ - -/*--------------------------------------------------------------------------- - * Adds a thread to a list of threads using "insert last". Uses the "l" - * links. - *--------------------------------------------------------------------------- - */ -static void add_to_list_l(struct thread_entry **list, - struct thread_entry *thread) -{ - struct thread_entry *l = *list; - - if (l == NULL) - { - /* Insert into unoccupied list */ - thread->l.prev = thread; - thread->l.next = thread; - *list = thread; - return; - } - - /* Insert last */ - thread->l.prev = l->l.prev; - thread->l.next = l; - l->l.prev->l.next = thread; - l->l.prev = thread; -} - -/*--------------------------------------------------------------------------- - * Removes a thread from a list of threads. Uses the "l" links. - *--------------------------------------------------------------------------- - */ -static void remove_from_list_l(struct thread_entry **list, - struct thread_entry *thread) -{ - struct thread_entry *prev, *next; - - next = thread->l.next; - - if (thread == next) - { - /* The only item */ - *list = NULL; - return; - } - - if (thread == *list) - { - /* List becomes next item */ - *list = next; - } - - prev = thread->l.prev; - - /* Fix links to jump over the removed entry. 
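/*
 * Illustrative sketch, not part of this patch: the lldc_*() helpers behind
 * rtr_queue_add()/rtr_queue_remove() are defined outside this diff, so the
 * code below (invented names) simply mirrors what the removed
 * add_to_list_l()/remove_from_list_l() did: O(1) "insert last" and "remove"
 * on a circular doubly-linked list whose head marks the first runnable thread.
 */
#include <assert.h>
#include <stddef.h>

struct cnode { struct cnode *prev, *next; };
struct clist { struct cnode *head; };

static void clist_insert_last(struct clist *list, struct cnode *n)
{
    struct cnode *head = list->head;
    if (head == NULL)
    {
        n->prev = n->next = n;       /* first element links to itself */
        list->head = n;
        return;
    }
    n->prev = head->prev;            /* append just before the head */
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

static void clist_remove(struct clist *list, struct cnode *n)
{
    if (n->next == n)                /* only element */
    {
        list->head = NULL;
        return;
    }
    if (list->head == n)
        list->head = n->next;
    n->prev->next = n->next;
    n->next->prev = n->prev;
}

int main(void)
{
    struct clist rtr = { NULL };
    struct cnode a, b, c;
    clist_insert_last(&rtr, &a);
    clist_insert_last(&rtr, &b);
    clist_insert_last(&rtr, &c);
    assert(rtr.head == &a && a.next == &b && c.next == &a);
    clist_remove(&rtr, &a);
    assert(rtr.head == &b && b.prev == &c && c.next == &b);
    return 0;
}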
*/ - next->l.prev = prev; - prev->l.next = next; -} - -/*--------------------------------------------------------------------------- - * Timeout list structure - circular reverse (to make "remove item" O(1)), - * NULL-terminated forward (to ease the far more common forward traversal): - * +------------------------------+ - * | | - * +--+---+<-+---+<-+---+<-+---+<-+ - * Head->| T | | T | | T | | T | - * +---+->+---+->+---+->+---+-X - *--------------------------------------------------------------------------- - */ - -/*--------------------------------------------------------------------------- - * Add a thread from the core's timout list by linking the pointers in its - * tmo structure. - *--------------------------------------------------------------------------- - */ -static void add_to_list_tmo(struct thread_entry *thread) -{ - struct thread_entry *tmo = cores[IF_COP_CORE(thread->core)].timeout; - THREAD_ASSERT(thread->tmo.prev == NULL, - "add_to_list_tmo->already listed", thread); - - thread->tmo.next = NULL; - - if (tmo == NULL) - { - /* Insert into unoccupied list */ - thread->tmo.prev = thread; - cores[IF_COP_CORE(thread->core)].timeout = thread; - return; - } - - /* Insert Last */ - thread->tmo.prev = tmo->tmo.prev; - tmo->tmo.prev->tmo.next = thread; - tmo->tmo.prev = thread; -} - -/*--------------------------------------------------------------------------- - * Remove a thread from the core's timout list by unlinking the pointers in - * its tmo structure. Sets thread->tmo.prev to NULL to indicate the timeout - * is cancelled. - *--------------------------------------------------------------------------- - */ -static void remove_from_list_tmo(struct thread_entry *thread) -{ - struct thread_entry **list = &cores[IF_COP_CORE(thread->core)].timeout; - struct thread_entry *prev = thread->tmo.prev; - struct thread_entry *next = thread->tmo.next; - - THREAD_ASSERT(prev != NULL, "remove_from_list_tmo->not listed", thread); - - if (next != NULL) - next->tmo.prev = prev; - - if (thread == *list) - { - /* List becomes next item and empty if next == NULL */ - *list = next; - /* Mark as unlisted */ - thread->tmo.prev = NULL; - } - else - { - if (next == NULL) - (*list)->tmo.prev = prev; - prev->tmo.next = next; - /* Mark as unlisted */ - thread->tmo.prev = NULL; - } -} - -#ifdef HAVE_PRIORITY_SCHEDULING -/*--------------------------------------------------------------------------- - * Priority distribution structure (one category for each possible priority): - * - * +----+----+----+ ... +-----+ - * hist: | F0 | F1 | F2 | | F31 | - * +----+----+----+ ... +-----+ - * mask: | b0 | b1 | b2 | | b31 | - * +----+----+----+ ... +-----+ - * - * F = count of threads at priority category n (frequency) - * b = bitmask of non-zero priority categories (occupancy) - * - * / if H[n] != 0 : 1 - * b[n] = | - * \ else : 0 - * - *--------------------------------------------------------------------------- - * Basic priority inheritance priotocol (PIP): - * - * Mn = mutex n, Tn = thread n - * - * A lower priority thread inherits the priority of the highest priority - * thread blocked waiting for it to complete an action (such as release a - * mutex or respond to a message via queue_send): - * - * 1) T2->M1->T1 - * - * T1 owns M1, T2 is waiting for M1 to realease M1. If T2 has a higher - * priority than T1 then T1 inherits the priority of T2. - * - * 2) T3 - * \/ - * T2->M1->T1 - * - * Situation is like 1) but T2 and T3 are both queued waiting for M1 and so - * T1 inherits the higher of T2 and T3. 
- * - * 3) T3->M2->T2->M1->T1 - * - * T1 owns M1, T2 owns M2. If T3 has a higher priority than both T1 and T2, - * then T1 inherits the priority of T3 through T2. - * - * Blocking chains can grow arbitrarily complex (though it's best that they - * not form at all very often :) and build-up from these units. - *--------------------------------------------------------------------------- - */ - -/*--------------------------------------------------------------------------- - * Increment frequency at category "priority" - *--------------------------------------------------------------------------- - */ -static inline unsigned int prio_add_entry( - struct priority_distribution *pd, int priority) -{ - unsigned int count = ++pd->hist[priority]; - if (count == 1) - priobit_set_bit(&pd->mask, priority); - return count; -} - -/*--------------------------------------------------------------------------- - * Decrement frequency at category "priority" - *--------------------------------------------------------------------------- - */ -static inline unsigned int prio_subtract_entry( - struct priority_distribution *pd, int priority) -{ - unsigned int count = --pd->hist[priority]; - if (count == 0) - priobit_clear_bit(&pd->mask, priority); - return count; -} - -/*--------------------------------------------------------------------------- - * Remove from one category and add to another - *--------------------------------------------------------------------------- - */ -static inline void prio_move_entry( - struct priority_distribution *pd, int from, int to) -{ - if (--pd->hist[from] == 0) - priobit_clear_bit(&pd->mask, from); - - if (++pd->hist[to] == 1) - priobit_set_bit(&pd->mask, to); -} -#endif /* HAVE_PRIORITY_SCHEDULING */ - -/*--------------------------------------------------------------------------- - * Move a thread back to a running state on its core. - *--------------------------------------------------------------------------- - */ -static void core_schedule_wakeup(struct thread_entry *thread) -{ - const unsigned int core = IF_COP_CORE(thread->core); - - RTR_LOCK(core); - - thread->state = STATE_RUNNING; - - add_to_list_l(&cores[core].running, thread); - rtr_add_entry(core, thread->priority); - - RTR_UNLOCK(core); - -#if NUM_CORES > 1 - if (core != CURRENT_CORE) - core_wake(core); -#endif -} - -#ifdef HAVE_PRIORITY_SCHEDULING /*--------------------------------------------------------------------------- * Change the priority and rtr entry for a running thread *--------------------------------------------------------------------------- */ -static inline void set_running_thread_priority( +static inline void set_rtr_thread_priority( struct thread_entry *thread, int priority) { const unsigned int core = IF_COP_CORE(thread->core); - RTR_LOCK(core); - rtr_move_entry(core, thread->priority, priority); + struct core_entry *corep = __core_id_entry(core); + RTR_LOCK(corep); + rtr_move_entry(corep, thread->priority, priority); thread->priority = priority; - RTR_UNLOCK(core); + RTR_UNLOCK(corep); } /*--------------------------------------------------------------------------- @@ -619,30 +502,21 @@ static inline void set_running_thread_priority( * penalty under high contention. 
*--------------------------------------------------------------------------- */ -static int find_highest_priority_in_list_l( - struct thread_entry * const thread) +static int wait_queue_find_priority(struct __wait_queue *wqp) { - if (LIKELY(thread != NULL)) + int highest_priority = PRIORITY_IDLE; + struct thread_entry *thread = WQ_THREAD_FIRST(wqp); + + while (thread != NULL) { - /* Go though list until the ending up at the initial thread */ - int highest_priority = thread->priority; - struct thread_entry *curr = thread; + int priority = thread->priority; + if (priority < highest_priority) + highest_priority = priority; - do - { - int priority = curr->priority; - - if (priority < highest_priority) - highest_priority = priority; - - curr = curr->l.next; - } - while (curr != thread); - - return highest_priority; + thread = WQ_THREAD_NEXT(thread); } - return PRIORITY_IDLE; + return highest_priority; } /*--------------------------------------------------------------------------- @@ -666,7 +540,7 @@ static void inherit_priority( { /* Multiple owners */ struct blocker_splay *blsplay = (struct blocker_splay *)bl; - + /* Recurse down the all the branches of this; it's the only way. We might meet the same queue several times if more than one of these threads is waiting the same queue. That isn't a problem @@ -674,7 +548,7 @@ static void inherit_priority( FOR_EACH_BITARRAY_SET_BIT(&blsplay->mask, slotnum) { bl->priority = oldblpr; /* To see the change each time */ - blt = &threads[slotnum]; + blt = __thread_slot_entry(slotnum); LOCK_THREAD(blt); inherit_priority(blocker0, bl, blt, newblpr); } @@ -699,7 +573,7 @@ static void inherit_priority( if (blt->state == STATE_RUNNING) { - set_running_thread_priority(blt, newpr); + set_rtr_thread_priority(blt, newpr); break; /* Running: last in chain */ } @@ -714,7 +588,7 @@ static void inherit_priority( break; /* Full circle - deadlock! */ /* Blocker becomes current thread and the process repeats */ - struct thread_entry **bqp = blt->bqp; + struct __wait_queue *wqp = wait_queue_ptr(blt); struct thread_entry *t = blt; blt = lock_blocker_thread(bl); @@ -725,7 +599,7 @@ static void inherit_priority( if (newpr <= oldblpr) newblpr = newpr; else if (oldpr <= oldblpr) - newblpr = find_highest_priority_in_list_l(*bqp); + newblpr = wait_queue_find_priority(wqp); if (newblpr == oldblpr) break; /* Queue priority not changing */ @@ -735,22 +609,46 @@ static void inherit_priority( } /*--------------------------------------------------------------------------- - * Quick-disinherit of priority elevation. 'thread' must be a running thread. + * Quick-inherit of priority elevation. 'thread' must be not runnable *--------------------------------------------------------------------------- */ -static void priority_disinherit_internal(struct thread_entry *thread, - int blpr) +static void priority_inherit_internal_inner(struct thread_entry *thread, + int blpr) { - if (blpr < PRIORITY_IDLE && - prio_subtract_entry(&thread->pdist, blpr) == 0 && + if (prio_add_entry(&thread->pdist, blpr) == 1 && blpr < thread->priority) + thread->priority = blpr; +} + +static inline void priority_inherit_internal(struct thread_entry *thread, + int blpr) +{ + if (blpr < PRIORITY_IDLE) + priority_inherit_internal_inner(thread, blpr); +} + +/*--------------------------------------------------------------------------- + * Quick-disinherit of priority elevation. 
'thread' must current + *--------------------------------------------------------------------------- + */ +static void priority_disinherit_internal_inner(struct thread_entry *thread, + int blpr) +{ + if (prio_subtract_entry(&thread->pdist, blpr) == 0 && blpr <= thread->priority) { int priority = priobit_ffs(&thread->pdist.mask); if (priority != thread->priority) - set_running_thread_priority(thread, priority); + set_rtr_thread_priority(thread, priority); } } +static inline void priority_disinherit_internal(struct thread_entry *thread, + int blpr) +{ + if (blpr < PRIORITY_IDLE) + priority_disinherit_internal_inner(thread, blpr); +} + void priority_disinherit(struct thread_entry *thread, struct blocker *bl) { LOCK_THREAD(thread); @@ -767,30 +665,32 @@ static void wakeup_thread_queue_multi_transfer(struct thread_entry *thread) { /* All threads will have the same blocker and queue; only we are changing it now */ - struct thread_entry **bqp = thread->bqp; - struct blocker_splay *blsplay = (struct blocker_splay *)thread->blocker; - struct thread_entry *blt = blsplay->blocker.thread; + struct __wait_queue *wqp = wait_queue_ptr(thread); + struct blocker *bl = thread->blocker; + struct blocker_splay *blsplay = (struct blocker_splay *)bl; + struct thread_entry *blt = bl->thread; /* The first thread is already locked and is assumed tagged "multi" */ int count = 1; - struct thread_entry *temp_queue = NULL; - /* 'thread' is locked on entry */ + /* Multiple versions of the wait queue may be seen if doing more than + one thread; queue removal isn't destructive to the pointers of the node + being removed; this may lead to the blocker priority being wrong for a + time but it gets fixed up below after getting exclusive access to the + queue */ while (1) { - LOCK_THREAD(blt); - - remove_from_list_l(bqp, thread); thread->blocker = NULL; + wait_queue_remove(thread); - struct thread_entry *tnext = *bqp; + unsigned int slotnum = THREAD_ID_SLOT(thread->id); + threadbit_set_bit(&blsplay->mask, slotnum); + + struct thread_entry *tnext = WQ_THREAD_NEXT(thread); if (tnext == NULL || tnext->retval == 0) break; - add_to_list_l(&temp_queue, thread); - UNLOCK_THREAD(thread); - UNLOCK_THREAD(blt); count++; thread = tnext; @@ -798,65 +698,51 @@ static void wakeup_thread_queue_multi_transfer(struct thread_entry *thread) LOCK_THREAD(thread); } - int blpr = blsplay->blocker.priority; - priority_disinherit_internal(blt, blpr); - /* Locking order reverses here since the threads are no longer on the - queue side */ + queued side */ if (count > 1) - { - add_to_list_l(&temp_queue, thread); - UNLOCK_THREAD(thread); corelock_lock(&blsplay->cl); - blpr = find_highest_priority_in_list_l(*bqp); + LOCK_THREAD(blt); + + int blpr = bl->priority; + priority_disinherit_internal(blt, blpr); + + if (count > 1) + { blsplay->blocker.thread = NULL; - thread = temp_queue; - LOCK_THREAD(thread); + blpr = wait_queue_find_priority(wqp); + + FOR_EACH_BITARRAY_SET_BIT(&blsplay->mask, slotnum) + { + UNLOCK_THREAD(thread); + thread = __thread_slot_entry(slotnum); + LOCK_THREAD(thread); + priority_inherit_internal(thread, blpr); + core_schedule_wakeup(thread); + } } else { /* Becomes a simple, direct transfer */ - if (thread->priority <= blpr) - blpr = find_highest_priority_in_list_l(*bqp); blsplay->blocker.thread = thread; - } - blsplay->blocker.priority = blpr; - - while (1) - { - unsigned int slotnum = THREAD_ID_SLOT(thread->id); - threadbit_set_bit(&blsplay->mask, slotnum); - - if (blpr < PRIORITY_IDLE) - { - prio_add_entry(&thread->pdist, blpr); - 
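/*
 * Illustrative sketch, not part of this patch: the net effect of
 * priority_inherit_internal()/priority_disinherit_internal() above.  The
 * owner's effective priority is the best (numerically lowest) of its base
 * priority and every boost currently applied to it, and dropping a boost
 * recomputes that minimum.  The real code keeps this information in the
 * thread's pdist histogram/bitmask instead of rescanning a fixed array.
 */
#include <assert.h>

#define MAX_BOOSTS 8

struct owner {
    int base_priority;
    int boosts[MAX_BOOSTS];   /* active inherited priorities, -1 = unused */
    int priority;             /* effective (scheduled) priority */
};

static void recompute(struct owner *o)
{
    int best = o->base_priority;
    for (int i = 0; i < MAX_BOOSTS; i++)
        if (o->boosts[i] >= 0 && o->boosts[i] < best)
            best = o->boosts[i];
    o->priority = best;
}

static void inherit(struct owner *o, int blpr)
{
    for (int i = 0; i < MAX_BOOSTS; i++)
        if (o->boosts[i] < 0) { o->boosts[i] = blpr; break; }
    recompute(o);
}

static void disinherit(struct owner *o, int blpr)
{
    for (int i = 0; i < MAX_BOOSTS; i++)
        if (o->boosts[i] == blpr) { o->boosts[i] = -1; break; }
    recompute(o);
}

int main(void)
{
    struct owner t1 = { .base_priority = 20, .priority = 20,
                        .boosts = { -1, -1, -1, -1, -1, -1, -1, -1 } };
    inherit(&t1, 5);            /* a priority-5 waiter blocks on T1's mutex */
    assert(t1.priority == 5);   /* T1 now runs at the waiter's priority */
    disinherit(&t1, 5);         /* mutex released, boost dropped */
    assert(t1.priority == 20);  /* back to base priority */
    return 0;
}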
if (blpr < thread->priority) - thread->priority = blpr; - } - - if (count > 1) - remove_from_list_l(&temp_queue, thread); + if (thread->priority <= blpr) + blpr = wait_queue_find_priority(wqp); + priority_inherit_internal(thread, blpr); core_schedule_wakeup(thread); - - UNLOCK_THREAD(thread); - - thread = temp_queue; - if (thread == NULL) - break; - - LOCK_THREAD(thread); } + UNLOCK_THREAD(thread); + + bl->priority = blpr; + UNLOCK_THREAD(blt); if (count > 1) - { corelock_unlock(&blsplay->cl); - } blt->retval = count; } @@ -876,29 +762,20 @@ static void wakeup_thread_transfer(struct thread_entry *thread) struct blocker *bl = thread->blocker; struct thread_entry *blt = bl->thread; - THREAD_ASSERT(cores[CURRENT_CORE].running == blt, - "UPPT->wrong thread", cores[CURRENT_CORE].running); + THREAD_ASSERT(__running_self_entry() == blt, + "UPPT->wrong thread", __running_self_entry()); LOCK_THREAD(blt); - struct thread_entry **bqp = thread->bqp; - remove_from_list_l(bqp, thread); thread->blocker = NULL; + struct __wait_queue *wqp = wait_queue_remove(thread); int blpr = bl->priority; /* Remove the object's boost from the owning thread */ - if (prio_subtract_entry(&blt->pdist, blpr) == 0 && blpr <= blt->priority) - { - /* No more threads at this priority are waiting and the old level is - * at least the thread level */ - int priority = priobit_ffs(&blt->pdist.mask); - if (priority != blt->priority) - set_running_thread_priority(blt, priority); - } - - struct thread_entry *tnext = *bqp; + priority_disinherit_internal_inner(blt, blpr); + struct thread_entry *tnext = WQ_THREAD_FIRST(wqp); if (LIKELY(tnext == NULL)) { /* Expected shortcut - no more waiters */ @@ -906,20 +783,20 @@ static void wakeup_thread_transfer(struct thread_entry *thread) } else { - /* If lowering, we need to scan threads remaining in queue */ - int priority = thread->priority; - if (priority <= blpr) - blpr = find_highest_priority_in_list_l(tnext); + /* If thread is at the blocker priority, its removal may drop it */ + if (thread->priority <= blpr) + blpr = wait_queue_find_priority(wqp); - if (prio_add_entry(&thread->pdist, blpr) == 1 && blpr < priority) - thread->priority = blpr; /* Raise new owner */ + priority_inherit_internal_inner(thread, blpr); } + bl->thread = thread; /* This thread pwns */ + core_schedule_wakeup(thread); UNLOCK_THREAD(thread); - bl->thread = thread; /* This thread pwns */ - bl->priority = blpr; /* Save highest blocked priority */ + bl->priority = blpr; /* Save highest blocked priority */ + UNLOCK_THREAD(blt); } @@ -933,9 +810,9 @@ static void wakeup_thread_release(struct thread_entry *thread) { struct blocker *bl = thread->blocker; struct thread_entry *blt = lock_blocker_thread(bl); - struct thread_entry **bqp = thread->bqp; - remove_from_list_l(bqp, thread); + thread->blocker = NULL; + struct __wait_queue *wqp = wait_queue_remove(thread); /* Off to see the wizard... */ core_schedule_wakeup(thread); @@ -950,7 +827,7 @@ static void wakeup_thread_release(struct thread_entry *thread) UNLOCK_THREAD(thread); - int newblpr = find_highest_priority_in_list_l(*bqp); + int newblpr = wait_queue_find_priority(wqp); if (newblpr == bl->priority) { /* Blocker priority won't change */ @@ -963,25 +840,17 @@ static void wakeup_thread_release(struct thread_entry *thread) #endif /* HAVE_PRIORITY_SCHEDULING */ + /*--------------------------------------------------------------------------- * Explicitly wakeup a thread on a blocking queue. Only effects threads of * STATE_BLOCKED and STATE_BLOCKED_W_TMO. 
* - * This code should be considered a critical section by the caller meaning - * that the object's corelock should be held. - * - * INTERNAL: Intended for use by kernel objects and not for programs. + * INTERNAL: Intended for use by kernel and not programs. *--------------------------------------------------------------------------- */ -unsigned int wakeup_thread_(struct thread_entry **list +unsigned int wakeup_thread_(struct thread_entry *thread IF_PRIO(, enum wakeup_thread_protocol proto)) { - struct thread_entry *thread = *list; - - /* Check if there is a blocked thread at all. */ - if (*list == NULL) - return THREAD_NONE; - LOCK_THREAD(thread); /* Determine thread's current state. */ @@ -1008,24 +877,21 @@ unsigned int wakeup_thread_(struct thread_entry **list else #endif /* HAVE_PRIORITY_SCHEDULING */ { - /* No PIP - just boost the thread by aging */ -#ifdef HAVE_PRIORITY_SCHEDULING - thread->skip_count = thread->priority; -#endif /* HAVE_PRIORITY_SCHEDULING */ - remove_from_list_l(list, thread); + wait_queue_remove(thread); core_schedule_wakeup(thread); UNLOCK_THREAD(thread); } - return should_switch_tasks(); + return should_switch_tasks(thread); - /* Nothing to do. State is not blocked. */ - default: -#if THREAD_EXTRA_CHECKS - THREAD_PANICF("wakeup_thread->block invalid", thread); case STATE_RUNNING: - case STATE_KILLED: -#endif + if (wait_queue_try_remove(thread)) + { + UNLOCK_THREAD(thread); + return THREAD_OK; /* timed out */ + } + + default: UNLOCK_THREAD(thread); return THREAD_NONE; } @@ -1037,201 +903,102 @@ unsigned int wakeup_thread_(struct thread_entry **list * tick when the next check will occur. *--------------------------------------------------------------------------- */ -void check_tmo_threads(void) +static NO_INLINE void check_tmo_expired_inner(struct core_entry *corep) { - const unsigned int core = CURRENT_CORE; const long tick = current_tick; /* snapshot the current tick */ long next_tmo_check = tick + 60*HZ; /* minimum duration: once/minute */ - struct thread_entry *next = cores[core].timeout; + struct thread_entry *prev = NULL; + struct thread_entry *thread = TMO_THREAD_FIRST(&corep->tmo); /* If there are no processes waiting for a timeout, just keep the check tick from falling into the past. */ /* Break the loop once we have walked through the list of all * sleeping processes or have removed them all. */ - while (next != NULL) + while (thread != NULL) { /* Check sleeping threads. Allow interrupts between checks. */ enable_irq(); - struct thread_entry *curr = next; - - next = curr->tmo.next; + struct thread_entry *next = TMO_THREAD_NEXT(thread); /* Lock thread slot against explicit wakeup */ disable_irq(); - LOCK_THREAD(curr); + LOCK_THREAD(thread); - unsigned state = curr->state; + unsigned int state = thread->state; - if (state < TIMEOUT_STATE_FIRST) - { - /* Cleanup threads no longer on a timeout but still on the - * list. */ - remove_from_list_tmo(curr); - } - else if (LIKELY(TIME_BEFORE(tick, curr->tmo_tick))) + if (LIKELY(state >= TIMEOUT_STATE_FIRST && + TIME_BEFORE(tick, thread->tmo_tick))) { /* Timeout still pending - this will be the usual case */ - if (TIME_BEFORE(curr->tmo_tick, next_tmo_check)) + if (TIME_BEFORE(thread->tmo_tick, next_tmo_check)) { - /* Earliest timeout found so far - move the next check up - to its time */ - next_tmo_check = curr->tmo_tick; + /* Move the next check up to its time */ + next_tmo_check = thread->tmo_tick; } + + prev = thread; } else { - /* Sleep timeout has been reached so bring the thread back to - * life again. 
*/ - if (state == STATE_BLOCKED_W_TMO) - { -#ifdef HAVE_CORELOCK_OBJECT - /* Lock the waiting thread's kernel object */ - struct corelock *ocl = curr->obj_cl; + /* TODO: there are no priority-inheriting timeout blocks + right now but the procedure should be established */ - if (UNLIKELY(corelock_try_lock(ocl) == 0)) - { - /* Need to retry in the correct order though the need is - * unlikely */ - UNLOCK_THREAD(curr); - corelock_lock(ocl); - LOCK_THREAD(curr); + /* Sleep timeout has been reached / garbage collect stale list + items */ + tmo_queue_expire(&corep->tmo, prev, thread); - if (UNLIKELY(curr->state != STATE_BLOCKED_W_TMO)) - { - /* Thread was woken or removed explicitely while slot - * was unlocked */ - corelock_unlock(ocl); - remove_from_list_tmo(curr); - UNLOCK_THREAD(curr); - continue; - } - } -#endif /* NUM_CORES */ + if (state >= TIMEOUT_STATE_FIRST) + core_rtr_add(corep, thread); -#ifdef HAVE_WAKEUP_EXT_CB - if (curr->wakeup_ext_cb != NULL) - curr->wakeup_ext_cb(curr); -#endif - -#ifdef HAVE_PRIORITY_SCHEDULING - if (curr->blocker != NULL) - wakeup_thread_release(curr); - else -#endif - remove_from_list_l(curr->bqp, curr); - - corelock_unlock(ocl); - } - /* else state == STATE_SLEEPING */ - - remove_from_list_tmo(curr); - - RTR_LOCK(core); - - curr->state = STATE_RUNNING; - - add_to_list_l(&cores[core].running, curr); - rtr_add_entry(core, curr->priority); - - RTR_UNLOCK(core); + /* removed this one - prev doesn't change */ } - UNLOCK_THREAD(curr); + UNLOCK_THREAD(thread); + + thread = next; } - cores[core].next_tmo_check = next_tmo_check; + corep->next_tmo_check = next_tmo_check; +} + +static FORCE_INLINE void check_tmo_expired(struct core_entry *corep) +{ + if (!TIME_BEFORE(current_tick, corep->next_tmo_check)) + check_tmo_expired_inner(corep); } /*--------------------------------------------------------------------------- - * Performs operations that must be done before blocking a thread but after - * the state is saved. + * Prepares a the current thread to sleep forever or for the given duration. *--------------------------------------------------------------------------- */ -#if NUM_CORES > 1 -static inline void run_blocking_ops( - unsigned int core, struct thread_entry *thread) +static FORCE_INLINE void prepare_block(struct thread_entry *current, + unsigned int state, int timeout) { - struct thread_blk_ops *ops = &cores[core].blk_ops; - const unsigned flags = ops->flags; - - if (LIKELY(flags == TBOP_CLEAR)) - return; - - switch (flags) - { - case TBOP_SWITCH_CORE: - core_switch_blk_op(core, thread); - /* Fall-through */ - case TBOP_UNLOCK_CORELOCK: - corelock_unlock(ops->cl_p); - break; - } - - ops->flags = TBOP_CLEAR; -} -#endif /* NUM_CORES > 1 */ - -#ifdef RB_PROFILE -void profile_thread(void) -{ - profstart(cores[CURRENT_CORE].running - threads); -} -#endif - -/*--------------------------------------------------------------------------- - * Prepares a thread to block on an object's list and/or for a specified - * duration - expects object and slot to be appropriately locked if needed - * and interrupts to be masked. - *--------------------------------------------------------------------------- - */ -static inline void block_thread_on_l(struct thread_entry *thread, - unsigned state) -{ - /* If inlined, unreachable branches will be pruned with no size penalty - because state is passed as a constant parameter. 
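/*
 * Illustrative sketch, not part of this patch: TIME_BEFORE() is defined in
 * kernel headers outside this diff, so the signed-difference form below is an
 * assumed equivalent.  It demonstrates the bookkeeping that prepare_block()
 * and check_tmo_expired() rely on: next_tmo_check always holds the earliest
 * pending tmo_tick, and the comparison keeps working across tick-counter
 * wraparound on the usual two's-complement targets.
 */
#include <assert.h>

#define TICKS_BEFORE(a, b) ((long)((a) - (b)) < 0)   /* assumed semantics */

/* Fold one sleeper's wakeup tick into the "next check" minimum. */
static unsigned long earliest(unsigned long next_check, unsigned long tmo_tick)
{
    return TICKS_BEFORE(tmo_tick, next_check) ? tmo_tick : next_check;
}

int main(void)
{
    unsigned long now = 1000;
    unsigned long next_check = now + 60 * 100;       /* once-a-minute default */
    next_check = earliest(next_check, now + 500);
    next_check = earliest(next_check, now + 50);     /* earliest sleeper wins */
    assert(next_check == now + 50);

    /* Near wraparound the unsigned difference still casts to a small signed
     * value, so the ordering is preserved. */
    unsigned long late = (unsigned long)-10;         /* about to wrap */
    assert(TICKS_BEFORE(late, late + 20));
    assert(!TICKS_BEFORE(late + 20, late));
    return 0;
}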
*/ - const unsigned int core = IF_COP_CORE(thread->core); + const unsigned int core = IF_COP_CORE(current->core); /* Remove the thread from the list of running threads. */ - RTR_LOCK(core); - remove_from_list_l(&cores[core].running, thread); - rtr_subtract_entry(core, thread->priority); - RTR_UNLOCK(core); + struct core_entry *corep = __core_id_entry(core); + core_rtr_remove(corep, current); - /* Add a timeout to the block if not infinite */ - switch (state) + if (timeout >= 0) { - case STATE_BLOCKED: - case STATE_BLOCKED_W_TMO: - /* Put the thread into a new list of inactive threads. */ - add_to_list_l(thread->bqp, thread); + /* Sleep may expire. */ + long tmo_tick = current_tick + timeout; + current->tmo_tick = tmo_tick; + + if (TIME_BEFORE(tmo_tick, corep->next_tmo_check)) + corep->next_tmo_check = tmo_tick; + + tmo_queue_register(&corep->tmo, current); if (state == STATE_BLOCKED) - break; - - /* Fall-through */ - case STATE_SLEEPING: - /* If this thread times out sooner than any other thread, update - next_tmo_check to its timeout */ - if (TIME_BEFORE(thread->tmo_tick, cores[core].next_tmo_check)) - { - cores[core].next_tmo_check = thread->tmo_tick; - } - - if (thread->tmo.prev == NULL) - { - add_to_list_tmo(thread); - } - /* else thread was never removed from list - just keep it there */ - break; + state = STATE_BLOCKED_W_TMO; } - /* Remember the the next thread about to block. */ - cores[core].block_task = thread; - /* Report new state. */ - thread->state = state; + current->state = state; } /*--------------------------------------------------------------------------- @@ -1239,178 +1006,120 @@ static inline void block_thread_on_l(struct thread_entry *thread, * that removed itself from the running list first must specify itself in * the paramter. * - * INTERNAL: Intended for use by kernel and not for programs. + * INTERNAL: Intended for use by kernel and not programs. *--------------------------------------------------------------------------- */ void switch_thread(void) { - const unsigned int core = CURRENT_CORE; - struct thread_entry *block = cores[core].block_task; - struct thread_entry *thread = cores[core].running; + struct core_entry *corep = __core_id_entry(core); + struct thread_entry *thread = corep->running; - /* Get context to save - next thread to run is unknown until all wakeups - * are evaluated */ - if (block != NULL) + if (thread) { - cores[core].block_task = NULL; - -#if NUM_CORES > 1 - if (UNLIKELY(thread == block)) - { - /* This was the last thread running and another core woke us before - * reaching here. Force next thread selection to give tmo threads or - * other threads woken before this block a first chance. */ - block = NULL; - } - else +#ifdef RB_PROFILE + profile_thread_stopped(THREAD_ID_SLOT(thread->id)); #endif - { - /* Blocking task is the old one */ - thread = block; - } +#ifdef DEBUG + /* Check core_ctx buflib integrity */ + core_check_valid(); +#endif + thread_store_context(thread); + + /* Check if the current thread stack is overflown */ + if (UNLIKELY(thread->stack[0] != DEADBEEF) && thread->stack_size > 0) + thread_stkov(thread); } -#ifdef RB_PROFILE -#ifdef CPU_COLDFIRE - _profile_thread_stopped(thread->id & THREAD_ID_SLOT_MASK); -#else - profile_thread_stopped(thread->id & THREAD_ID_SLOT_MASK); -#endif -#endif + /* TODO: make a real idle task */ + for (;;) + { + disable_irq(); - /* Begin task switching by saving our current context so that we can - * restore the state of the current thread later to the point prior - * to this call. 
*/ - thread_store_context(thread); -#ifdef DEBUG - /* Check core_ctx buflib integrity */ - core_check_valid(); -#endif + /* Check for expired timeouts */ + check_tmo_expired(corep); - /* Check if the current thread stack is overflown */ - if (UNLIKELY(thread->stack[0] != DEADBEEF) && thread->stack_size > 0) - thread_stkov(thread); + RTR_LOCK(corep); -#if NUM_CORES > 1 - /* Run any blocking operations requested before switching/sleeping */ - run_blocking_ops(core, thread); -#endif + if (!RTR_EMPTY(&corep->rtr)) + break; + + thread = NULL; + + /* Enter sleep mode to reduce power usage */ + RTR_UNLOCK(corep); + core_sleep(IF_COP(core)); + + /* Awakened by interrupt or other CPU */ + } + + thread = (thread && thread->state == STATE_RUNNING) ? + RTR_THREAD_NEXT(thread) : RTR_THREAD_FIRST(&corep->rtr); #ifdef HAVE_PRIORITY_SCHEDULING - /* Reset the value of thread's skip count */ - thread->skip_count = 0; -#endif + /* Select the new task based on priorities and the last time a + * process got CPU time relative to the highest priority runnable + * task. If priority is not a feature, then FCFS is used (above). */ + int max = priobit_ffs(&corep->rtr_dist.mask); for (;;) { - /* If there are threads on a timeout and the earliest wakeup is due, - * check the list and wake any threads that need to start running - * again. */ - if (!TIME_BEFORE(current_tick, cores[core].next_tmo_check)) + int priority = thread->priority; + int diff; + + /* This ridiculously simple method of aging seems to work + * suspiciously well. It does tend to reward CPU hogs (under + * yielding) but that's generally not desirable at all. On + * the plus side, it, relatively to other threads, penalizes + * excess yielding which is good if some high priority thread + * is performing no useful work such as polling for a device + * to be ready. Of course, aging is only employed when higher + * and lower priority threads are runnable. The highest + * priority runnable thread(s) are never skipped unless a + * lower-priority process has aged sufficiently. Priorities + * of REALTIME class are run strictly according to priority + * thus are not subject to switchout due to lower-priority + * processes aging; they must give up the processor by going + * off the run list. */ + if (LIKELY(priority <= max) || + (priority > PRIORITY_REALTIME && + (diff = priority - max, ++thread->skip_count > diff*diff))) { - check_tmo_threads(); - } - - disable_irq(); - RTR_LOCK(core); - - thread = cores[core].running; - - if (UNLIKELY(thread == NULL)) - { - /* Enter sleep mode to reduce power usage - woken up on interrupt - * or wakeup request from another core - expected to enable - * interrupts. */ - RTR_UNLOCK(core); - core_sleep(IF_COP(core)); - } - else - { -#ifdef HAVE_PRIORITY_SCHEDULING - /* Select the new task based on priorities and the last time a - * process got CPU time relative to the highest priority runnable - * task. */ - int max = priobit_ffs(&cores[core].rtr.mask); - - if (block == NULL) - { - /* Not switching on a block, tentatively select next thread */ - thread = thread->l.next; - } - - for (;;) - { - int priority = thread->priority; - int diff; - - /* This ridiculously simple method of aging seems to work - * suspiciously well. It does tend to reward CPU hogs (under - * yielding) but that's generally not desirable at all. On - * the plus side, it, relatively to other threads, penalizes - * excess yielding which is good if some high priority thread - * is performing no useful work such as polling for a device - * to be ready. 
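/*
 * Illustrative sketch, not part of this patch: a tiny model of the aging test
 * in the selection loop above.  A runnable thread whose priority number is
 * worse than the best runnable level "max" is passed over until it has been
 * skipped more than (priority - max)^2 times.  The real scheduler also exempts
 * REALTIME-class levels and resets skip_count on requeue and on selection;
 * both are omitted here for brevity.
 */
#include <assert.h>

struct cand { int priority; int skip_count; };

/* Mirrors "priority <= max || ++skip_count > diff*diff". */
static int should_run(struct cand *t, int max)
{
    if (t->priority <= max)
        return 1;
    int diff = t->priority - max;
    return ++t->skip_count > diff * diff;
}

int main(void)
{
    struct cand low = { .priority = 22, .skip_count = 0 };
    int passed_over = 0;

    /* With the best runnable level stuck at 20, diff = 2, so the lower
     * priority thread loses 4 rounds and wins on the 5th look. */
    while (!should_run(&low, 20))
        passed_over++;

    assert(passed_over == 4 && low.skip_count == 5);
    return 0;
}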
Of course, aging is only employed when higher - * and lower priority threads are runnable. The highest - * priority runnable thread(s) are never skipped unless a - * lower-priority process has aged sufficiently. Priorities - * of REALTIME class are run strictly according to priority - * thus are not subject to switchout due to lower-priority - * processes aging; they must give up the processor by going - * off the run list. */ - if (LIKELY(priority <= max) || - (priority > PRIORITY_REALTIME && - (diff = priority - max, - ++thread->skip_count > diff*diff))) - { - cores[core].running = thread; - break; - } - - thread = thread->l.next; - } -#else - /* Without priority use a simple FCFS algorithm */ - if (block == NULL) - { - /* Not switching on a block, select next thread */ - thread = thread->l.next; - cores[core].running = thread; - } -#endif /* HAVE_PRIORITY_SCHEDULING */ - - RTR_UNLOCK(core); - enable_irq(); break; } + + thread = RTR_THREAD_NEXT(thread); } - /* And finally give control to the next thread. */ + thread->skip_count = 0; /* Reset aging counter */ +#endif /* HAVE_PRIORITY_SCHEDULING */ + + rtr_queue_make_first(&corep->rtr, thread); + corep->running = thread; + + RTR_UNLOCK(corep); + enable_irq(); + + /* And finally, give control to the next thread. */ thread_load_context(thread); #ifdef RB_PROFILE - profile_thread_started(thread->id & THREAD_ID_SLOT_MASK); + profile_thread_started(THREAD_ID_SLOT(thread->id)); #endif - } /*--------------------------------------------------------------------------- * Sleeps a thread for at least a specified number of ticks with zero being * a wait until the next tick. * - * INTERNAL: Intended for use by kernel and not for programs. + * INTERNAL: Intended for use by kernel and not programs. *--------------------------------------------------------------------------- */ void sleep_thread(int ticks) { - struct thread_entry *current = cores[CURRENT_CORE].running; - + struct thread_entry *current = __running_self_entry(); LOCK_THREAD(current); - - /* Set our timeout, remove from run list and join timeout list. */ - current->tmo_tick = current_tick + MAX(ticks, 0) + 1; - block_thread_on_l(current, STATE_SLEEPING); - + prepare_block(current, STATE_SLEEPING, MAX(ticks, 0) + 1); UNLOCK_THREAD(current); } @@ -1418,131 +1127,42 @@ void sleep_thread(int ticks) * Block a thread on a blocking queue for explicit wakeup. If timeout is * negative, the block is infinite. * - * INTERNAL: Intended for use by kernel objects and not for programs. + * INTERNAL: Intended for use by kernel and not programs. *--------------------------------------------------------------------------- */ -void block_thread(struct thread_entry *current, int timeout) +void block_thread_(struct thread_entry *current, int timeout) { LOCK_THREAD(current); - struct blocker *bl = NULL; #ifdef HAVE_PRIORITY_SCHEDULING - bl = current->blocker; - struct thread_entry *blt = bl ? 
lock_blocker_thread(bl) : NULL; + struct blocker *bl = current->blocker; + struct thread_entry *blt = NULL; + if (bl != NULL) + { + current->blocker = bl; + blt = lock_blocker_thread(bl); + } #endif /* HAVE_PRIORITY_SCHEDULING */ - if (LIKELY(timeout < 0)) + wait_queue_register(current); + prepare_block(current, STATE_BLOCKED, timeout); + +#ifdef HAVE_PRIORITY_SCHEDULING + if (bl != NULL) { - /* Block until explicitly woken */ - block_thread_on_l(current, STATE_BLOCKED); + int newblpr = current->priority; + UNLOCK_THREAD(current); + + if (newblpr < bl->priority) + inherit_priority(bl, bl, blt, newblpr); + else + unlock_blocker_thread(bl); /* Queue priority won't change */ } else - { - /* Set the state to blocked with the specified timeout */ - current->tmo_tick = current_tick + timeout; - block_thread_on_l(current, STATE_BLOCKED_W_TMO); - } - - if (bl == NULL) +#endif /* HAVE_PRIORITY_SCHEDULING */ { UNLOCK_THREAD(current); - return; } - -#ifdef HAVE_PRIORITY_SCHEDULING - int newblpr = current->priority; - UNLOCK_THREAD(current); - - if (newblpr >= bl->priority) - { - unlock_blocker_thread(bl); - return; /* Queue priority won't change */ - } - - inherit_priority(bl, bl, blt, newblpr); -#endif /* HAVE_PRIORITY_SCHEDULING */ -} - -/*--------------------------------------------------------------------------- - * Assign the thread slot a new ID. Version is 0x00000100..0xffffff00. - *--------------------------------------------------------------------------- - */ -static void new_thread_id(unsigned int slot_num, - struct thread_entry *thread) -{ - unsigned int version = - (thread->id + (1u << THREAD_ID_VERSION_SHIFT)) - & THREAD_ID_VERSION_MASK; - - /* If wrapped to 0, make it 1 */ - if (version == 0) - version = 1u << THREAD_ID_VERSION_SHIFT; - - thread->id = version | (slot_num & THREAD_ID_SLOT_MASK); -} - -/*--------------------------------------------------------------------------- - * Find an empty thread slot or MAXTHREADS if none found. The slot returned - * will be locked on multicore. - *--------------------------------------------------------------------------- - */ -static struct thread_entry * find_empty_thread_slot(void) -{ - /* Any slot could be on an interrupt-accessible list */ - IF_COP( int oldlevel = disable_irq_save(); ) - struct thread_entry *thread = NULL; - int n; - - for (n = 0; n < MAXTHREADS; n++) - { - /* Obtain current slot state - lock it on multicore */ - struct thread_entry *t = &threads[n]; - LOCK_THREAD(t); - - if (t->state == STATE_KILLED) - { - /* Slot is empty - leave it locked and caller will unlock */ - thread = t; - break; - } - - /* Finished examining slot - no longer busy - unlock on multicore */ - UNLOCK_THREAD(t); - } - - IF_COP( restore_irq(oldlevel); ) /* Reenable interrups - this slot is - not accesible to them yet */ - return thread; -} - -/*--------------------------------------------------------------------------- - * Return the thread_entry pointer for a thread_id. Return the current - * thread if the ID is (unsigned int)-1 (alias for current). 
- *--------------------------------------------------------------------------- - */ -struct thread_entry * thread_id_entry(unsigned int thread_id) -{ - return &threads[thread_id & THREAD_ID_SLOT_MASK]; -} - -/*--------------------------------------------------------------------------- - * Return the thread id of the calling thread - * -------------------------------------------------------------------------- - */ -unsigned int thread_self(void) -{ - return cores[CURRENT_CORE].running->id; -} - -/*--------------------------------------------------------------------------- - * Return the thread entry of the calling thread. - * - * INTERNAL: Intended for use by kernel and not for programs. - *--------------------------------------------------------------------------- - */ -struct thread_entry* thread_self_entry(void) -{ - return cores[CURRENT_CORE].running; } /*--------------------------------------------------------------------------- @@ -1552,9 +1172,8 @@ struct thread_entry* thread_self_entry(void) */ void core_idle(void) { - IF_COP( const unsigned int core = CURRENT_CORE; ) disable_irq(); - core_sleep(IF_COP(core)); + core_sleep(IF_COP(CURRENT_CORE)); } /*--------------------------------------------------------------------------- @@ -1570,141 +1189,64 @@ unsigned int create_thread(void (*function)(void), IF_PRIO(, int priority) IF_COP(, unsigned int core)) { - unsigned int i; - unsigned int stack_words; - uintptr_t stackptr, stackend; - struct thread_entry *thread; - unsigned state; - int oldlevel; - - thread = find_empty_thread_slot(); + struct thread_entry *thread = thread_alloc(); if (thread == NULL) - { return 0; - } - oldlevel = disable_irq_save(); + new_thread_base_init(thread, &stack, &stack_size, name + IF_PRIO(, priority) IF_COP(, core)); + + unsigned int stack_words = stack_size / sizeof (uintptr_t); + if (stack_words == 0) + return 0; /* Munge the stack to make it easy to spot stack overflows */ - stackptr = ALIGN_UP((uintptr_t)stack, sizeof (uintptr_t)); - stackend = ALIGN_DOWN((uintptr_t)stack + stack_size, sizeof (uintptr_t)); - stack_size = stackend - stackptr; - stack_words = stack_size / sizeof (uintptr_t); - - for (i = 0; i < stack_words; i++) - { - ((uintptr_t *)stackptr)[i] = DEADBEEF; - } - - /* Store interesting information */ - thread->name = name; - thread->stack = (uintptr_t *)stackptr; - thread->stack_size = stack_size; - thread->queue = NULL; -#ifdef HAVE_WAKEUP_EXT_CB - thread->wakeup_ext_cb = NULL; -#endif -#ifdef HAVE_SCHEDULER_BOOSTCTRL - thread->cpu_boost = 0; -#endif -#ifdef HAVE_PRIORITY_SCHEDULING - memset(&thread->pdist, 0, sizeof(thread->pdist)); - thread->blocker = NULL; - thread->base_priority = priority; - thread->priority = priority; - thread->skip_count = priority; - prio_add_entry(&thread->pdist, priority); -#endif - -#ifdef HAVE_IO_PRIORITY - /* Default to high (foreground) priority */ - thread->io_priority = IO_PRIORITY_IMMEDIATE; -#endif + for (unsigned int i = 0; i < stack_words; i++) + ((uintptr_t *)stack)[i] = DEADBEEF; #if NUM_CORES > 1 - thread->core = core; - /* Writeback stack munging or anything else before starting */ if (core != CURRENT_CORE) - { commit_dcache(); - } #endif - /* Thread is not on any timeout list but be a bit paranoid */ - thread->tmo.prev = NULL; - - state = (flags & CREATE_THREAD_FROZEN) ? 
-                            STATE_FROZEN : STATE_RUNNING;
-
-    thread->context.sp = (typeof (thread->context.sp))stackend;
-
-    /* Load the thread's context structure with needed startup information */
+    thread->context.sp = (typeof (thread->context.sp))(stack + stack_size);
     THREAD_STARTUP_INIT(core, thread, function);
 
-    thread->state = state;
-    i = thread->id; /* Snapshot while locked */
+    int oldlevel = disable_irq_save();
+    LOCK_THREAD(thread);
 
-    if (state == STATE_RUNNING)
+    thread->state = STATE_FROZEN;
+
+    if (!(flags & CREATE_THREAD_FROZEN))
         core_schedule_wakeup(thread);
 
+    unsigned int id = thread->id; /* Snapshot while locked */
+
     UNLOCK_THREAD(thread);
     restore_irq(oldlevel);
 
-    return i;
+    return id;
 }
 
-#ifdef HAVE_SCHEDULER_BOOSTCTRL
-/*---------------------------------------------------------------------------
- * Change the boost state of a thread boosting or unboosting the CPU
- * as required.
- *---------------------------------------------------------------------------
- */
-static inline void boost_thread(struct thread_entry *thread, bool boost)
-{
-    if ((thread->cpu_boost != 0) != boost)
-    {
-        thread->cpu_boost = boost;
-        cpu_boost(boost);
-    }
-}
-
-void trigger_cpu_boost(void)
-{
-    struct thread_entry *current = cores[CURRENT_CORE].running;
-    boost_thread(current, true);
-}
-
-void cancel_cpu_boost(void)
-{
-    struct thread_entry *current = cores[CURRENT_CORE].running;
-    boost_thread(current, false);
-}
-#endif /* HAVE_SCHEDULER_BOOSTCTRL */
-
 /*---------------------------------------------------------------------------
  * Block the current thread until another thread terminates. A thread may
- * wait on itself to terminate which prevents it from running again and it
- * will need to be killed externally.
+ * wait on itself to terminate, but that will deadlock.
+ *
  * Parameter is the ID as returned from create_thread().
  *---------------------------------------------------------------------------
  */
 void thread_wait(unsigned int thread_id)
 {
-    struct thread_entry *current = cores[CURRENT_CORE].running;
-    struct thread_entry *thread = thread_id_entry(thread_id);
+    struct thread_entry *current = __running_self_entry();
+    struct thread_entry *thread = __thread_id_entry(thread_id);
 
-    /* Lock thread-as-waitable-object lock */
     corelock_lock(&thread->waiter_cl);
 
-    /* Be sure it hasn't been killed yet */
     if (thread->id == thread_id && thread->state != STATE_KILLED)
     {
-        IF_COP( current->obj_cl = &thread->waiter_cl; )
-        current->bqp = &thread->queue;
-
-        disable_irq();
-        block_thread(current, TIMEOUT_BLOCK);
+        block_thread(current, TIMEOUT_BLOCK, &thread->queue, NULL);
 
         corelock_unlock(&thread->waiter_cl);
 
@@ -1716,36 +1258,35 @@ void thread_wait(unsigned int thread_id)
 }
 
 /*---------------------------------------------------------------------------
- * Exit the current thread. The Right Way to Do Things (TM).
+ * Exit the current thread
  *---------------------------------------------------------------------------
  */
-/* This is done to foil optimizations that may require the current stack,
- * such as optimizing subexpressions that put variables on the stack that
- * get used after switching stacks. 
*/
-#if NUM_CORES > 1
-/* Called by ASM stub */
-static void thread_final_exit_do(struct thread_entry *current)
-#else
-/* No special procedure is required before calling */
-static inline void thread_final_exit(struct thread_entry *current)
-#endif
+static USED_ATTR NORETURN_ATTR
+void thread_exit_final(struct thread_entry *current)
 {
-    /* At this point, this thread isn't using resources allocated for
-     * execution except the slot itself. */
+    /* Slot is no longer this thread */
+    new_thread_id(current);
+    current->name = NULL;
 
-    /* Signal this thread */
-    thread_queue_wake(&current->queue);
+    /* No longer using resources from creator */
+    wait_queue_wake(&current->queue);
+
+    UNLOCK_THREAD(current);
 
     corelock_unlock(&current->waiter_cl);
+
+    thread_free(current);
+
     switch_thread();
+
     /* This should never and must never be reached - if it is, the
      * state is corrupted */
     THREAD_PANICF("thread_exit->K:*R", current);
-    while (1);
 }
 
 void thread_exit(void)
 {
-    register struct thread_entry * current = cores[CURRENT_CORE].running;
+    struct core_entry *corep = __core_id_entry(CURRENT_CORE);
+    register struct thread_entry *current = corep->running;
 
     /* Cancel CPU boost if any */
     cancel_cpu_boost();
@@ -1764,24 +1305,21 @@ void thread_exit(void)
         thread_panicf("abandon ship!", current);
 #endif /* HAVE_PRIORITY_SCHEDULING */
 
-    if (current->tmo.prev != NULL)
-    {
-        /* Cancel pending timeout list removal */
-        remove_from_list_tmo(current);
-    }
+    /* Remove from scheduler lists */
+    tmo_queue_remove(&corep->tmo, current);
+    prepare_block(current, STATE_KILLED, -1);
+    corep->running = NULL; /* No switch_thread context save */
 
-    /* Switch tasks and never return */
-    block_thread_on_l(current, STATE_KILLED);
+#ifdef RB_PROFILE
+    profile_thread_stopped(THREAD_ID_SLOT(current->id));
+#endif
 
-    /* Slot must be unusable until thread is really gone */
-    UNLOCK_THREAD_AT_TASK_SWITCH(current);
-
-    /* Update ID for this slot */
-    new_thread_id(current->id, current);
-    current->name = NULL;
-
-    /* Do final cleanup and remove the thread */
-    thread_final_exit(current);
+    /* Do final release of resources and remove the thread */
+#if NUM_CORES > 1
+    thread_exit_finalize(current->core, current);
+#else
+    thread_exit_final(current);
+#endif
 }
 
 #ifdef HAVE_PRIORITY_SCHEDULING
@@ -1796,10 +1334,8 @@ int thread_set_priority(unsigned int thread_id, int priority)
         return -1; /* Invalid priority argument */
 
     int old_base_priority = -1;
-    struct thread_entry *thread = thread_id_entry(thread_id);
+    struct thread_entry *thread = __thread_id_entry(thread_id);
 
-    /* Thread could be on any list and therefore on an interrupt accessible
-       one - disable interrupts */
     const int oldlevel = disable_irq_save();
    LOCK_THREAD(thread);
 
@@ -1825,7 +1361,7 @@ int thread_set_priority(unsigned int thread_id, int priority)
     {
         /* This thread is running - just change location on the run
            queue. Also sets thread->priority. 
*/ - set_running_thread_priority(thread, new_priority); + set_rtr_thread_priority(thread, new_priority); goto done; } @@ -1838,7 +1374,7 @@ int thread_set_priority(unsigned int thread_id, int priority) } struct thread_entry *blt = lock_blocker_thread(bl); - struct thread_entry **bqp = thread->bqp; + struct __wait_queue *wqp = wait_queue_ptr(thread); thread->priority = new_priority; @@ -1850,7 +1386,7 @@ int thread_set_priority(unsigned int thread_id, int priority) if (new_priority < oldblpr) newblpr = new_priority; else if (old_priority <= oldblpr) - newblpr = find_highest_priority_in_list_l(*bqp); + newblpr = wait_queue_find_priority(wqp); if (newblpr == oldblpr) { @@ -1872,7 +1408,7 @@ done: */ int thread_get_priority(unsigned int thread_id) { - struct thread_entry *thread = thread_id_entry(thread_id); + struct thread_entry *thread = __thread_id_entry(thread_id); int base_priority = thread->base_priority; /* Simply check without locking slot. It may or may not be valid by the @@ -1888,13 +1424,13 @@ int thread_get_priority(unsigned int thread_id) #ifdef HAVE_IO_PRIORITY int thread_get_io_priority(unsigned int thread_id) { - struct thread_entry *thread = thread_id_entry(thread_id); + struct thread_entry *thread = __thread_id_entry(thread_id); return thread->io_priority; } void thread_set_io_priority(unsigned int thread_id,int io_priority) { - struct thread_entry *thread = thread_id_entry(thread_id); + struct thread_entry *thread = __thread_id_entry(thread_id); thread->io_priority = io_priority; } #endif @@ -1907,7 +1443,7 @@ void thread_set_io_priority(unsigned int thread_id,int io_priority) */ void thread_thaw(unsigned int thread_id) { - struct thread_entry *thread = thread_id_entry(thread_id); + struct thread_entry *thread = __thread_id_entry(thread_id); int oldlevel = disable_irq_save(); LOCK_THREAD(thread); @@ -1926,69 +1462,73 @@ void thread_thaw(unsigned int thread_id) * Switch the processor that the currently executing thread runs on. 
*--------------------------------------------------------------------------- */ +static USED_ATTR NORETURN_ATTR +void switch_core_final(unsigned int old_core, struct thread_entry *current) +{ + /* Old core won't be using slot resources at this point */ + core_schedule_wakeup(current); + UNLOCK_THREAD(current); +#ifdef RB_PROFILE + profile_thread_stopped(THREAD_ID_SLOT(current->id)); +#endif + switch_thread(); + /* not reached */ + THREAD_PANICF("switch_core_final->same core!", current); + (void)old_core; +} + unsigned int switch_core(unsigned int new_core) { - const unsigned int core = CURRENT_CORE; - struct thread_entry *current = cores[core].running; + const unsigned int old_core = CURRENT_CORE; + if (old_core == new_core) + return old_core; /* No change */ - if (core == new_core) - { - /* No change - just return same core */ - return core; - } + struct core_entry *corep = __core_id_entry(old_core); + struct thread_entry *current = corep->running; disable_irq(); LOCK_THREAD(current); - /* Get us off the running list for the current core */ - RTR_LOCK(core); - remove_from_list_l(&cores[core].running, current); - rtr_subtract_entry(core, current->priority); - RTR_UNLOCK(core); + /* Remove us from old core lists */ + tmo_queue_remove(&corep->tmo, current); + core_rtr_remove(corep, current); + corep->running = NULL; /* No switch_thread context save */ - /* Stash return value (old core) in a safe place */ - current->retval = core; - - /* If a timeout hadn't yet been cleaned-up it must be removed now or - * the other core will likely attempt a removal from the wrong list! */ - if (current->tmo.prev != NULL) - { - remove_from_list_tmo(current); - } - - /* Change the core number for this thread slot */ + /* Do the actual migration */ current->core = new_core; + switch_thread_core(old_core, current); - /* Do not use core_schedule_wakeup here since this will result in - * the thread starting to run on the other core before being finished on - * this one. Delay the list unlock to keep the other core stuck - * until this thread is ready. */ - RTR_LOCK(new_core); - - rtr_add_entry(new_core, current->priority); - add_to_list_l(&cores[new_core].running, current); - - /* Make a callback into device-specific code, unlock the wakeup list so - * that execution may resume on the new core, unlock our slot and finally - * restore the interrupt level */ - cores[core].blk_ops.flags = TBOP_SWITCH_CORE; - cores[core].blk_ops.cl_p = &cores[new_core].rtr_cl; - cores[core].block_task = current; - - UNLOCK_THREAD(current); - - /* Alert other core to activity */ - core_wake(new_core); - - /* Do the stack switching, cache_maintenence and switch_thread call - - requires native code */ - switch_thread_core(core, current); - - /* Finally return the old core to caller */ - return current->retval; + /* Executing on new core */ + return old_core; } #endif /* NUM_CORES > 1 */ +#ifdef HAVE_SCHEDULER_BOOSTCTRL +/*--------------------------------------------------------------------------- + * Change the boost state of a thread boosting or unboosting the CPU + * as required. 
+ *--------------------------------------------------------------------------- + */ +static inline void boost_thread(struct thread_entry *thread, bool boost) +{ + if ((thread->cpu_boost != 0) != boost) + { + thread->cpu_boost = boost; + cpu_boost(boost); + } +} + +void trigger_cpu_boost(void) +{ + boost_thread(__running_self_entry(), true); +} + +void cancel_cpu_boost(void) +{ + boost_thread(__running_self_entry(), false); +} +#endif /* HAVE_SCHEDULER_BOOSTCTRL */ + /*--------------------------------------------------------------------------- * Initialize threading API. This assumes interrupts are not yet enabled. On * multicore setups, no core is allowed to proceed until create_thread calls @@ -1998,127 +1538,56 @@ unsigned int switch_core(unsigned int new_core) void INIT_ATTR init_threads(void) { const unsigned int core = CURRENT_CORE; - struct thread_entry *thread; if (core == CPU) { - /* Initialize core locks and IDs in all slots */ - int n; - for (n = 0; n < MAXTHREADS; n++) + thread_alloc_init(); /* before using cores! */ + + /* Create main thread */ + struct thread_entry *thread = thread_alloc(); + if (thread == NULL) { - thread = &threads[n]; - corelock_init(&thread->waiter_cl); - corelock_init(&thread->slot_cl); - thread->id = THREAD_ID_INIT(n); + /* WTF? There really must be a slot available at this stage. + * This can fail if, for example, .bss isn't zero'ed out by the + * loader or threads is in the wrong section. */ + THREAD_PANICF("init_threads->no slot", NULL); } - } - /* CPU will initialize first and then sleep */ - thread = find_empty_thread_slot(); + size_t stack_size; + void *stack = __get_main_stack(&stack_size); + new_thread_base_init(thread, &stack, &stack_size, __main_thread_name + IF_PRIO(, PRIORITY_MAIN_THREAD) IF_COP(, core)); - if (thread == NULL) - { - /* WTF? There really must be a slot available at this stage. - * This can fail if, for example, .bss isn't zero'ed out by the loader - * or threads is in the wrong section. */ - THREAD_PANICF("init_threads->no slot", NULL); - } + struct core_entry *corep = __core_id_entry(core); + core_rtr_add(corep, thread); + corep->running = thread; - /* Initialize initially non-zero members of core */ - cores[core].next_tmo_check = current_tick; /* Something not in the past */ - - /* Initialize initially non-zero members of slot */ - UNLOCK_THREAD(thread); /* No sync worries yet */ - thread->name = main_thread_name; - thread->state = STATE_RUNNING; - IF_COP( thread->core = core; ) -#ifdef HAVE_PRIORITY_SCHEDULING - corelock_init(&cores[core].rtr_cl); - thread->base_priority = PRIORITY_USER_INTERFACE; - prio_add_entry(&thread->pdist, PRIORITY_USER_INTERFACE); - thread->priority = PRIORITY_USER_INTERFACE; - rtr_add_entry(core, PRIORITY_USER_INTERFACE); -#endif - - add_to_list_l(&cores[core].running, thread); - - if (core == CPU) - { - thread->stack = stackbegin; - thread->stack_size = (uintptr_t)stackend - (uintptr_t)stackbegin; -#if NUM_CORES > 1 /* This code path will not be run on single core targets */ - /* Wait for other processors to finish their inits since create_thread - * isn't safe to call until the kernel inits are done. The first - * threads created in the system must of course be created by CPU. - * Another possible approach is to initialize all cores and slots - * for each core by CPU, let the remainder proceed in parallel and - * signal CPU when all are finished. 
*/ - core_thread_init(CPU); - } - else - { - /* Initial stack is the idle stack */ - thread->stack = idle_stacks[core]; - thread->stack_size = IDLE_STACK_SIZE; - /* After last processor completes, it should signal all others to - * proceed or may signal the next and call thread_exit(). The last one - * to finish will signal CPU. */ - core_thread_init(core); - /* Other cores do not have a main thread - go idle inside switch_thread - * until a thread can run on the core. */ - thread_exit(); -#endif /* NUM_CORES */ - } #ifdef INIT_MAIN_THREAD - init_main_thread(&thread->context); + init_main_thread(&thread->context); #endif -} - -/* Unless otherwise defined, do nothing */ -#ifndef YIELD_KERNEL_HOOK -#define YIELD_KERNEL_HOOK() false -#endif -#ifndef SLEEP_KERNEL_HOOK -#define SLEEP_KERNEL_HOOK(ticks) false -#endif - -/*--------------------------------------------------------------------------- - * Suspends a thread's execution for at least the specified number of ticks. - * - * May result in CPU core entering wait-for-interrupt mode if no other thread - * may be scheduled. - * - * NOTE: sleep(0) sleeps until the end of the current tick - * sleep(n) that doesn't result in rescheduling: - * n <= ticks suspended < n + 1 - * n to n+1 is a lower bound. Other factors may affect the actual time - * a thread is suspended before it runs again. - *--------------------------------------------------------------------------- - */ -unsigned sleep(unsigned ticks) -{ - /* In certain situations, certain bootloaders in particular, a normal - * threading call is inappropriate. */ - if (SLEEP_KERNEL_HOOK(ticks)) - return 0; /* Handled */ - - disable_irq(); - sleep_thread(ticks); - switch_thread(); - return 0; -} - -/*--------------------------------------------------------------------------- - * Elects another thread to run or, if no other thread may be made ready to - * run, immediately returns control back to the calling thread. - *--------------------------------------------------------------------------- - */ -void yield(void) -{ - /* In certain situations, certain bootloaders in particular, a normal - * threading call is inappropriate. */ - if (YIELD_KERNEL_HOOK()) - return; /* handled */ - - switch_thread(); + } + +#if NUM_CORES > 1 + /* Boot CPU: + * Wait for other processors to finish their inits since create_thread + * isn't safe to call until the kernel inits are done. The first + * threads created in the system must of course be created by CPU. + * Another possible approach is to initialize all cores and slots + * for each core by CPU, let the remainder proceed in parallel and + * signal CPU when all are finished. + * + * Other: + * After last processor completes, it should signal all others to + * proceed or may signal the next and call thread_exit(). The last one + * to finish will signal CPU. 
+ */ + core_thread_init(core); + + if (core != CPU) + { + /* No main thread on coprocessors - go idle and wait */ + switch_thread(); + THREAD_PANICF("init_threads() - coprocessor returned", NULL); + } +#endif /* NUM_CORES */ } diff --git a/firmware/libc/errno.c b/firmware/libc/errno.c index 146d6196ca..0672768484 100644 --- a/firmware/libc/errno.c +++ b/firmware/libc/errno.c @@ -1,5 +1,5 @@ #include "../thread-internal.h" int * __errno(void) { - return &thread_self_entry()->__errno; + return &__running_self_entry()->__errno; } diff --git a/firmware/target/arm/pp/app-pp.lds b/firmware/target/arm/pp/app-pp.lds index e6c2b255dd..0b8cbd8430 100644 --- a/firmware/target/arm/pp/app-pp.lds +++ b/firmware/target/arm/pp/app-pp.lds @@ -125,6 +125,7 @@ SECTIONS .idle_stacks (NOLOAD) : { *(.idle_stacks) + . = ALIGN(8); #if NUM_CORES > 1 cpu_idlestackbegin = .; . += IDLE_STACK_SIZE; diff --git a/firmware/target/arm/pp/thread-pp.c b/firmware/target/arm/pp/thread-pp.c index 184d243e8d..0af8caa43a 100644 --- a/firmware/target/arm/pp/thread-pp.c +++ b/firmware/target/arm/pp/thread-pp.c @@ -82,46 +82,22 @@ static void INIT_ATTR core_thread_init(unsigned int core) * to use a stack from an unloaded module until another thread runs on it. *--------------------------------------------------------------------------- */ -static inline void NORETURN_ATTR __attribute__((always_inline)) - thread_final_exit(struct thread_entry *current) +static void __attribute__((naked, noinline, noreturn)) + thread_exit_finalize(unsigned int core, struct thread_entry *current) { asm volatile ( - "cmp %1, #0 \n" /* CPU? */ + "ldr r2, =idle_stacks \n" /* switch to idle stack */ + "ldr sp, [r2, r0, lsl #2] \n" + "add sp, sp, %0*4 \n" + "cmp r0, #0 \n" /* CPU? */ + "mov r4, r1 \n" "blne commit_dcache \n" - "mov r0, %0 \n" /* copy thread parameter */ - "mov sp, %2 \n" /* switch to idle stack */ - "bl thread_final_exit_do \n" /* finish removal */ - : : "r"(current), - "r"(current->core), - "r"(&idle_stacks[current->core][IDLE_STACK_WORDS]) - : "r0", "r1", "r2", "r3", "ip", "lr"); /* Because of flush call, - force inputs out - of scratch regs */ - while (1); -} + "mov r0, r4 \n" + "b thread_exit_final \n" + : : "i"(IDLE_STACK_WORDS)); -/*--------------------------------------------------------------------------- - * Perform core switch steps that need to take place inside switch_thread. - * - * These steps must take place while before changing the processor and after - * having entered switch_thread since switch_thread may not do a normal return - * because the stack being used for anything the compiler saved will not belong - * to the thread's destination core and it may have been recycled for other - * purposes by the time a normal context load has taken place. switch_thread - * will also clobber anything stashed in the thread's context or stored in the - * nonvolatile registers if it is saved there before the call since the - * compiler's order of operations cannot be known for certain. 
- */ -static void core_switch_blk_op(unsigned int core, struct thread_entry *thread) -{ - /* Flush our data to ram */ - commit_dcache(); - /* Stash thread in r4 slot */ - thread->context.r[0] = (uint32_t)thread; - /* Stash restart address in r5 slot */ - thread->context.r[1] = thread->context.start; - /* Save sp in context.sp while still running on old core */ - thread->context.sp = idle_stacks[core][IDLE_STACK_WORDS-1]; + while (1); + (void)core; (void)current; } /*--------------------------------------------------------------------------- @@ -136,31 +112,32 @@ static void core_switch_blk_op(unsigned int core, struct thread_entry *thread) /*--------------------------------------------------------------------------- * This actually performs the core switch. */ -static void __attribute__((naked)) - switch_thread_core(unsigned int core, struct thread_entry *thread) +static void __attribute__((naked, noinline)) + switch_thread_core(unsigned int old_core, struct thread_entry *thread) { - /* Pure asm for this because compiler behavior isn't sufficiently predictable. - * Stack access also isn't permitted until restoring the original stack and - * context. */ asm volatile ( - "stmfd sp!, { r4-r11, lr } \n" /* Stack all non-volatile context on current core */ - "ldr r2, =idle_stacks \n" /* r2 = &idle_stacks[core][IDLE_STACK_WORDS] */ - "ldr r2, [r2, r0, lsl #2] \n" - "add r2, r2, %0*4 \n" - "stmfd r2!, { sp } \n" /* save original stack pointer on idle stack */ - "mov sp, r2 \n" /* switch stacks */ - "adr r2, 1f \n" /* r2 = new core restart address */ - "str r2, [r1, #40] \n" /* thread->context.start = r2 */ - "ldr pc, =switch_thread \n" /* r0 = thread after call - see load_context */ - "1: \n" - "ldr sp, [r0, #32] \n" /* Reload original sp from context structure */ - "mov r1, #0 \n" /* Clear start address */ - "str r1, [r0, #40] \n" - "bl commit_discard_idcache \n" /* Invalidate new core's cache */ - "ldmfd sp!, { r4-r11, pc } \n" /* Restore non-volatile context to new core and return */ - : : "i"(IDLE_STACK_WORDS) - ); - (void)core; (void)thread; + "stmfd sp!, { r4-r5, lr } \n" /* can't use the first two ctx fields */ + "add r2, r1, #8 \n" + "stmia r2, { r6-r11, sp } \n" /* save remaining context */ + "adr r2, .new_core_restart \n" /* save context ptr + restart address */ + "str r2, [r1, #40] \n" /* make 'start' non-null */ + "stmia r1, { r1-r2 } \n" + "ldr r2, =idle_stacks \n" /* switch to idle stack on old core */ + "ldr sp, [r2, r0, lsl #2] \n" + "add sp, sp, %0*4 \n" + "stmfd sp!, { r0-r1 } \n" + "bl commit_dcache \n" /* write back everything */ + "ldmfd sp!, { r0-r1 } \n" + "b switch_core_final \n" + ".new_core_restart: \n" + "mov r1, #0 \n" /* mark as started */ + "str r1, [r0, #40] \n" + "add r0, r0, #8 \n" + "ldmia r0, { r6-r11, sp } \n" /* restore non-volatiles and stack */ + "bl commit_discard_idcache \n" /* invalidate new core's cache */ + "ldmfd sp!, { r4-r5, pc } \n" /* restore remaining context */ + : : "i"(IDLE_STACK_WORDS)); + (void)old_core; (void)thread; } /** PP-model-specific dual-core code **/ diff --git a/firmware/target/hosted/sdl/thread-sdl.c b/firmware/target/hosted/sdl/thread-sdl.c index fda877e0f5..a76941f103 100644 --- a/firmware/target/hosted/sdl/thread-sdl.c +++ b/firmware/target/hosted/sdl/thread-sdl.c @@ -32,13 +32,13 @@ #include "core_alloc.h" /* Define this as 1 to show informational messages that are not errors. */ -#define THREAD_SDL_DEBUGF_ENABLED 0 +#define THREAD_SDL_DEBUGF_ENABLED 1 #if THREAD_SDL_DEBUGF_ENABLED #define THREAD_SDL_DEBUGF(...) 
DEBUGF(__VA_ARGS__) -static char __name[32]; +static char __name[sizeof (((struct thread_debug_info *)0)->name)]; #define THREAD_SDL_GET_NAME(thread) \ - ({ thread_get_name(__name, ARRAYLEN(__name), thread); __name; }) + ({ format_thread_name(__name, sizeof (__name), thread); __name; }) #else #define THREAD_SDL_DEBUGF(...) #define THREAD_SDL_GET_NAME(thread) @@ -47,9 +47,6 @@ static char __name[32]; #define THREAD_PANICF(str...) \ ({ fprintf(stderr, str); exit(-1); }) -/* Thread/core entries as in rockbox core */ -static struct core_entry cores[NUM_CORES]; -struct thread_entry threads[MAXTHREADS]; /* Jump buffers for graceful exit - kernel threads don't stay neatly * in their start routines responding to messages so this is the only * way to get them back in there so they may exit */ @@ -74,7 +71,7 @@ void sim_thread_shutdown(void) /* Tell all threads jump back to their start routines, unlock and exit gracefully - we'll check each one in turn for it's status. Threads - _could_ terminate via remove_thread or multiple threads could exit + _could_ terminate via thread_exit or multiple threads could exit on each unlock but that is safe. */ /* Do this before trying to acquire lock */ @@ -86,7 +83,7 @@ void sim_thread_shutdown(void) /* Signal all threads on delay or block */ for (i = 0; i < MAXTHREADS; i++) { - struct thread_entry *thread = &threads[i]; + struct thread_entry *thread = __thread_slot_entry(i); if (thread->context.s == NULL) continue; SDL_SemPost(thread->context.s); @@ -95,7 +92,7 @@ void sim_thread_shutdown(void) /* Wait for all threads to finish and cleanup old ones. */ for (i = 0; i < MAXTHREADS; i++) { - struct thread_entry *thread = &threads[i]; + struct thread_entry *thread = __thread_slot_entry(i); SDL_Thread *t = thread->context.t; if (t != NULL) @@ -111,11 +108,11 @@ void sim_thread_shutdown(void) } else { - /* Wait on any previous thread in this location-- could be one not quite - * finished exiting but has just unlocked the mutex. If it's NULL, the - * call returns immediately. + /* Wait on any previous thread in this location-- could be one not + * quite finished exiting but has just unlocked the mutex. If it's + * NULL, the call returns immediately. * - * See remove_thread below for more information. */ + * See thread_exit below for more information. 
*/ SDL_WaitThread(thread->context.told, NULL); } } @@ -126,103 +123,6 @@ void sim_thread_shutdown(void) threads_status = THREADS_EXIT_COMMAND_DONE; } -static void new_thread_id(unsigned int slot_num, - struct thread_entry *thread) -{ - unsigned int version = - (thread->id + (1u << THREAD_ID_VERSION_SHIFT)) - & THREAD_ID_VERSION_MASK; - - if (version == 0) - version = 1u << THREAD_ID_VERSION_SHIFT; - - thread->id = version | (slot_num & THREAD_ID_SLOT_MASK); -} - -static struct thread_entry * find_empty_thread_slot(void) -{ - struct thread_entry *thread = NULL; - int n; - - for (n = 0; n < MAXTHREADS; n++) - { - int state = threads[n].state; - - if (state == STATE_KILLED) - { - thread = &threads[n]; - break; - } - } - - return thread; -} - - -/* Initialize SDL threading */ -void init_threads(void) -{ - static uintptr_t main_stack[] = { DEADBEEF, 0 }; - struct thread_entry *thread; - int n; - - memset(cores, 0, sizeof(cores)); - memset(threads, 0, sizeof(threads)); - - m = SDL_CreateMutex(); - - if (SDL_LockMutex(m) == -1) - { - fprintf(stderr, "Couldn't lock mutex\n"); - return; - } - - /* Initialize all IDs */ - for (n = 0; n < MAXTHREADS; n++) - threads[n].id = THREAD_ID_INIT(n); - - /* Slot 0 is reserved for the main thread - initialize it here and - then create the SDL thread - it is possible to have a quick, early - shutdown try to access the structure. */ - thread = &threads[0]; - thread->stack = main_stack; - thread->stack_size = sizeof (main_stack); - thread->name = "main"; - thread->state = STATE_RUNNING; - thread->context.s = SDL_CreateSemaphore(0); - thread->context.t = NULL; /* NULL for the implicit main thread */ - cores[CURRENT_CORE].running = thread; - - if (thread->context.s == NULL) - { - fprintf(stderr, "Failed to create main semaphore\n"); - return; - } - - /* Tell all threads jump back to their start routines, unlock and exit - gracefully - we'll check each one in turn for it's status. Threads - _could_ terminate via remove_thread or multiple threads could exit - on each unlock but that is safe. */ - - /* Setup jump for exit */ - if (setjmp(thread_jmpbufs[0]) == 0) - { - THREAD_SDL_DEBUGF("Main thread: %p\n", thread); - return; - } - - SDL_UnlockMutex(m); - - /* Set to 'COMMAND_DONE' when other rockbox threads have exited. */ - while (threads_status < THREADS_EXIT_COMMAND_DONE) - SDL_Delay(10); - - SDL_DestroyMutex(m); - - /* We're the main thead - perform exit - doesn't return. 
*/ - sim_do_exit(); -} - void sim_thread_exception_wait(void) { while (1) @@ -237,7 +137,7 @@ void sim_thread_exception_wait(void) void sim_thread_lock(void *me) { SDL_LockMutex(m); - cores[CURRENT_CORE].running = (struct thread_entry *)me; + __running_self_entry() = (struct thread_entry *)me; if (threads_status != THREADS_RUN) thread_exit(); @@ -245,70 +145,14 @@ void sim_thread_lock(void *me) void * sim_thread_unlock(void) { - struct thread_entry *current = cores[CURRENT_CORE].running; + struct thread_entry *current = __running_self_entry(); SDL_UnlockMutex(m); return current; } -struct thread_entry * thread_id_entry(unsigned int thread_id) -{ - return &threads[thread_id & THREAD_ID_SLOT_MASK]; -} - -static void add_to_list_l(struct thread_entry **list, - struct thread_entry *thread) -{ - if (*list == NULL) - { - /* Insert into unoccupied list */ - thread->l.next = thread; - thread->l.prev = thread; - *list = thread; - } - else - { - /* Insert last */ - thread->l.next = *list; - thread->l.prev = (*list)->l.prev; - thread->l.prev->l.next = thread; - (*list)->l.prev = thread; - } -} - -static void remove_from_list_l(struct thread_entry **list, - struct thread_entry *thread) -{ - if (thread == thread->l.next) - { - /* The only item */ - *list = NULL; - return; - } - - if (thread == *list) - { - /* List becomes next item */ - *list = thread->l.next; - } - - /* Fix links to jump over the removed entry. */ - thread->l.prev->l.next = thread->l.next; - thread->l.next->l.prev = thread->l.prev; -} - -unsigned int thread_self(void) -{ - return cores[CURRENT_CORE].running->id; -} - -struct thread_entry* thread_self_entry(void) -{ - return cores[CURRENT_CORE].running; -} - void switch_thread(void) { - struct thread_entry *current = cores[CURRENT_CORE].running; + struct thread_entry *current = __running_self_entry(); enable_irq(); @@ -346,17 +190,7 @@ void switch_thread(void) oldlevel = disable_irq_save(); - if (current->state == STATE_BLOCKED_W_TMO) - { - /* Timed out */ - remove_from_list_l(current->bqp, current); - -#ifdef HAVE_WAKEUP_EXT_CB - if (current->wakeup_ext_cb != NULL) - current->wakeup_ext_cb(current); -#endif - current->state = STATE_RUNNING; - } + current->state = STATE_RUNNING; if (result == SDL_MUTEX_TIMEDOUT) { @@ -384,7 +218,7 @@ void switch_thread(void) #ifdef DEBUG core_check_valid(); #endif - cores[CURRENT_CORE].running = current; + __running_self_entry() = current; if (threads_status != THREADS_RUN) thread_exit(); @@ -392,7 +226,7 @@ void switch_thread(void) void sleep_thread(int ticks) { - struct thread_entry *current = cores[CURRENT_CORE].running; + struct thread_entry *current = __running_self_entry(); int rem; current->state = STATE_SLEEPING; @@ -404,7 +238,7 @@ void sleep_thread(int ticks) current->tmo_tick = (1000/HZ) * ticks + ((1000/HZ)-1) - rem; } -void block_thread(struct thread_entry *current, int ticks) +void block_thread_(struct thread_entry *current, int ticks) { if (ticks < 0) current->state = STATE_BLOCKED; @@ -414,24 +248,19 @@ void block_thread(struct thread_entry *current, int ticks) current->tmo_tick = (1000/HZ)*ticks; } - add_to_list_l(current->bqp, current); + wait_queue_register(current); } -unsigned int wakeup_thread_(struct thread_entry **list) +unsigned int wakeup_thread_(struct thread_entry *thread) { - struct thread_entry *thread = *list; - - if (thread != NULL) + switch (thread->state) { - switch (thread->state) - { - case STATE_BLOCKED: - case STATE_BLOCKED_W_TMO: - remove_from_list_l(list, thread); - thread->state = STATE_RUNNING; - 
SDL_SemPost(thread->context.s); - return THREAD_OK; - } + case STATE_BLOCKED: + case STATE_BLOCKED_W_TMO: + wait_queue_remove(thread); + thread->state = STATE_RUNNING; + SDL_SemPost(thread->context.s); + return THREAD_OK; } return THREAD_NONE; @@ -439,7 +268,7 @@ unsigned int wakeup_thread_(struct thread_entry **list) void thread_thaw(unsigned int thread_id) { - struct thread_entry *thread = thread_id_entry(thread_id); + struct thread_entry *thread = __thread_id_entry(thread_id); if (thread->id == thread_id && thread->state == STATE_FROZEN) { @@ -450,15 +279,14 @@ void thread_thaw(unsigned int thread_id) int runthread(void *data) { - struct thread_entry *current; - jmp_buf *current_jmpbuf; - /* Cannot access thread variables before locking the mutex as the data structures may not be filled-in yet. */ SDL_LockMutex(m); - cores[CURRENT_CORE].running = (struct thread_entry *)data; - current = cores[CURRENT_CORE].running; - current_jmpbuf = &thread_jmpbufs[current - threads]; + + struct thread_entry *current = (struct thread_entry *)data; + __running_self_entry() = current; + + jmp_buf *current_jmpbuf = &thread_jmpbufs[THREAD_ID_SLOT(current->id)]; /* Setup jump for exit */ if (setjmp(*current_jmpbuf) == 0) @@ -469,14 +297,15 @@ int runthread(void *data) SDL_UnlockMutex(m); SDL_SemWait(current->context.s); SDL_LockMutex(m); - cores[CURRENT_CORE].running = current; + __running_self_entry() = current; } if (threads_status == THREADS_RUN) { current->context.start(); THREAD_SDL_DEBUGF("Thread Done: %d (%s)\n", - current - threads, THREAD_SDL_GET_NAME(current)); + THREAD_ID_SLOT(current->id), + THREAD_SDL_GET_NAME(current)); /* Thread routine returned - suicide */ } @@ -495,27 +324,23 @@ unsigned int create_thread(void (*function)(void), void* stack, size_t stack_size, unsigned flags, const char *name) { - struct thread_entry *thread; - SDL_Thread* t; - SDL_sem *s; - THREAD_SDL_DEBUGF("Creating thread: (%s)\n", name ? name : ""); - thread = find_empty_thread_slot(); + struct thread_entry *thread = thread_alloc(); if (thread == NULL) { DEBUGF("Failed to find thread slot\n"); return 0; } - s = SDL_CreateSemaphore(0); + SDL_sem *s = SDL_CreateSemaphore(0); if (s == NULL) { DEBUGF("Failed to create semaphore\n"); return 0; } - t = SDL_CreateThread(runthread, thread); + SDL_Thread *t = SDL_CreateThread(runthread, thread); if (t == NULL) { DEBUGF("Failed to create SDL thread\n"); @@ -523,12 +348,6 @@ unsigned int create_thread(void (*function)(void), return 0; } - unsigned int stack_words = stack_size / sizeof (uintptr_t); - for (unsigned int i = stack_words; i-- > 0;) - ((uintptr_t *)stack)[i] = DEADBEEF; - - thread->stack = stack; - thread->stack_size = stack_size; thread->name = name; thread->state = (flags & CREATE_THREAD_FROZEN) ? 
STATE_FROZEN : STATE_RUNNING; @@ -536,27 +355,22 @@ unsigned int create_thread(void (*function)(void), thread->context.t = t; thread->context.s = s; - THREAD_SDL_DEBUGF("New Thread: %d (%s)\n", - thread - threads, THREAD_SDL_GET_NAME(thread)); + THREAD_SDL_DEBUGF("New Thread: %lu (%s)\n", + (unsigned long)thread->id, + THREAD_SDL_GET_NAME(thread)); return thread->id; + (void)stack; (void)stack_size; } -static void remove_thread(unsigned int thread_id) +void thread_exit(void) { - struct thread_entry *current = cores[CURRENT_CORE].running; - struct thread_entry *thread = thread_id_entry(thread_id); - - SDL_Thread *t; - SDL_sem *s; - - if (thread->id != thread_id) - return; + struct thread_entry *current = __running_self_entry(); int oldlevel = disable_irq_save(); - t = thread->context.t; - s = thread->context.s; + SDL_Thread *t = current->context.t; + SDL_sem *s = current->context.s; /* Wait the last thread here and keep this one or SDL will leak it since * it doesn't free its own library allocations unless a wait is performed. @@ -566,59 +380,27 @@ static void remove_thread(unsigned int thread_id) * * However, see more below about SDL_KillThread. */ - SDL_WaitThread(thread->context.told, NULL); + SDL_WaitThread(current->context.told, NULL); - thread->context.t = NULL; - thread->context.s = NULL; - thread->context.told = t; + current->context.t = NULL; + current->context.s = NULL; + current->context.told = t; - if (thread != current) - { - switch (thread->state) - { - case STATE_BLOCKED: - case STATE_BLOCKED_W_TMO: - /* Remove thread from object it's waiting on */ - remove_from_list_l(thread->bqp, thread); - -#ifdef HAVE_WAKEUP_EXT_CB - if (thread->wakeup_ext_cb != NULL) - thread->wakeup_ext_cb(thread); -#endif - break; - } - - SDL_SemPost(s); - } - - THREAD_SDL_DEBUGF("Removing thread: %d (%s)\n", - thread - threads, THREAD_SDL_GET_NAME(thread)); - - new_thread_id(thread->id, thread); - thread->state = STATE_KILLED; - thread_queue_wake(&thread->queue); + unsigned int id = current->id; + new_thread_id(current); + current->state = STATE_KILLED; + wait_queue_wake(¤t->queue); SDL_DestroySemaphore(s); - if (thread == current) - { - /* Do a graceful exit - perform the longjmp back into the thread - function to return */ - restore_irq(oldlevel); - longjmp(thread_jmpbufs[current - threads], 1); - } - - /* SDL_KillThread frees the old pointer too because it uses SDL_WaitThread - * to wait for the host to remove it. 
*/ - thread->context.told = NULL; - SDL_KillThread(t); + /* Do a graceful exit - perform the longjmp back into the thread + function to return */ restore_irq(oldlevel); -} -void thread_exit(void) -{ - unsigned int id = thread_self(); - remove_thread(id); + thread_free(current); + + longjmp(thread_jmpbufs[THREAD_ID_SLOT(id)], 1); + /* This should never and must never be reached - if it is, the * state is corrupted */ THREAD_PANICF("thread_exit->K:*R (ID: %d)", id); @@ -627,44 +409,73 @@ void thread_exit(void) void thread_wait(unsigned int thread_id) { - struct thread_entry *current = cores[CURRENT_CORE].running; - struct thread_entry *thread = thread_id_entry(thread_id); + struct thread_entry *current = __running_self_entry(); + struct thread_entry *thread = __thread_id_entry(thread_id); if (thread->id == thread_id && thread->state != STATE_KILLED) { - current->bqp = &thread->queue; - block_thread(current, TIMEOUT_BLOCK); + block_thread(current, TIMEOUT_BLOCK, &thread->queue); switch_thread(); } } -/*--------------------------------------------------------------------------- - * Suspends a thread's execution for at least the specified number of ticks. - * - * May result in CPU core entering wait-for-interrupt mode if no other thread - * may be scheduled. - * - * NOTE: sleep(0) sleeps until the end of the current tick - * sleep(n) that doesn't result in rescheduling: - * n <= ticks suspended < n + 1 - * n to n+1 is a lower bound. Other factors may affect the actual time - * a thread is suspended before it runs again. - *--------------------------------------------------------------------------- - */ -unsigned sleep(unsigned ticks) +/* Initialize SDL threading */ +void init_threads(void) { - disable_irq(); - sleep_thread(ticks); - switch_thread(); - return 0; -} + m = SDL_CreateMutex(); -/*--------------------------------------------------------------------------- - * Elects another thread to run or, if no other thread may be made ready to - * run, immediately returns control back to the calling thread. - *--------------------------------------------------------------------------- - */ -void yield(void) -{ - switch_thread(); + if (SDL_LockMutex(m) == -1) + { + fprintf(stderr, "Couldn't lock mutex\n"); + return; + } + + thread_alloc_init(); + + struct thread_entry *thread = thread_alloc(); + if (thread == NULL) + { + fprintf(stderr, "Main thread alloc failed\n"); + return; + } + + /* Slot 0 is reserved for the main thread - initialize it here and + then create the SDL thread - it is possible to have a quick, early + shutdown try to access the structure. */ + thread->name = __main_thread_name; + thread->state = STATE_RUNNING; + thread->context.s = SDL_CreateSemaphore(0); + thread->context.t = NULL; /* NULL for the implicit main thread */ + __running_self_entry() = thread; + + if (thread->context.s == NULL) + { + fprintf(stderr, "Failed to create main semaphore\n"); + return; + } + + /* Tell all threads jump back to their start routines, unlock and exit + gracefully - we'll check each one in turn for it's status. Threads + _could_ terminate via thread_exit or multiple threads could exit + on each unlock but that is safe. */ + + /* Setup jump for exit */ + if (setjmp(thread_jmpbufs[THREAD_ID_SLOT(thread->id)]) == 0) + { + THREAD_SDL_DEBUGF("Main Thread: %lu (%s)\n", + (unsigned long)thread->id, + THREAD_SDL_GET_NAME(thread)); + return; + } + + SDL_UnlockMutex(m); + + /* Set to 'COMMAND_DONE' when other rockbox threads have exited. 
*/
+    while (threads_status < THREADS_EXIT_COMMAND_DONE)
+        SDL_Delay(10);
+
+    SDL_DestroyMutex(m);
+
+    /* We're the main thread - perform exit - doesn't return. */
+    sim_do_exit();
 }