--- src/sys/kern/usched_bsd4.c 2006/04/23 17:48:59 1.8 +++ src/sys/kern/usched_bsd4.c 2006/05/29 03:57:20 1.9 @@ -33,14 +33,17 @@ #include #include #include -#include #include #include #include +#include #include #include #include +#include +#include + /* * Priorities. Note that with 32 run queues per scheduler each queue * represents four priority levels. @@ -56,6 +59,7 @@ #define NQS 32 /* 32 run queues. */ #define PPQ (MAXPRI / NQS) /* priorities per queue */ +#define PPQMASK (PPQ - 1) /* * NICEPPQ - number of nice units per priority queue @@ -81,19 +85,25 @@ TAILQ_HEAD(rq, lwp); #define lwp_rqindex lwp_usdata.bsd4.rqindex #define lwp_origcpu lwp_usdata.bsd4.origcpu #define lwp_estcpu lwp_usdata.bsd4.estcpu +#define lwp_rqtype lwp_usdata.bsd4.rqtype static void bsd4_acquire_curproc(struct lwp *lp); static void bsd4_release_curproc(struct lwp *lp); static void bsd4_select_curproc(globaldata_t gd); static void bsd4_setrunqueue(struct lwp *lp); -static void bsd4_remrunqueue(struct lwp *lp); static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp); +static void bsd4_recalculate_estcpu(struct lwp *lp); static void bsd4_resetpriority(struct lwp *lp); static void bsd4_forking(struct lwp *plp, struct lwp *lp); static void bsd4_exiting(struct lwp *plp, struct lwp *lp); -static void bsd4_recalculate_estcpu(struct lwp *lp); +#ifdef SMP +static void need_user_resched_remote(void *dummy); +#endif +static struct lwp *chooseproc_locked(struct lwp *chklp); +static void bsd4_remrunqueue_locked(struct lwp *lp); +static void bsd4_setrunqueue_locked(struct lwp *lp); struct usched usched_bsd4 = { { NULL }, @@ -104,7 +114,6 @@ struct usched usched_bsd4 = { bsd4_release_curproc, bsd4_select_curproc, bsd4_setrunqueue, - bsd4_remrunqueue, bsd4_schedulerclock, bsd4_recalculate_estcpu, bsd4_resetpriority, @@ -113,6 +122,15 @@ struct usched usched_bsd4 = { NULL /* setcpumask not supported */ }; +struct usched_bsd4_pcpu { + struct thread helper_thread; + short 
rrcount; + short upri; + struct lwp *uschedcp; +}; + +typedef struct usched_bsd4_pcpu *bsd4_pcpu_t; + /* * We have NQS (32) run queues per scheduling class. For the normal * class, there are 128 priorities scaled onto these 32 queues. New @@ -125,20 +143,22 @@ struct usched usched_bsd4 = { * the state of all 32 queues and then a ffs() to find the first busy * queue. */ -static struct rq queues[NQS]; -static struct rq rtqueues[NQS]; -static struct rq idqueues[NQS]; -static u_int32_t queuebits; -static u_int32_t rtqueuebits; -static u_int32_t idqueuebits; -static cpumask_t curprocmask = -1; /* currently running a user process */ -static cpumask_t rdyprocmask; /* ready to accept a user process */ -static int runqcount; +static struct rq bsd4_queues[NQS]; +static struct rq bsd4_rtqueues[NQS]; +static struct rq bsd4_idqueues[NQS]; +static u_int32_t bsd4_queuebits; +static u_int32_t bsd4_rtqueuebits; +static u_int32_t bsd4_idqueuebits; +static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */ +static cpumask_t bsd4_rdyprocmask; /* ready to accept a user process */ +static int bsd4_runqcount; #ifdef SMP -static int scancpu; +static volatile int bsd4_scancpu; #endif +static struct spinlock bsd4_spin; +static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU]; -SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0, ""); #ifdef INVARIANTS static int usched_nonoptimal; SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW, @@ -150,12 +170,9 @@ SYSCTL_INT(_debug, OID_AUTO, usched_opti static int usched_debug = -1; SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, ""); #ifdef SMP -static int remote_resched = 1; static int remote_resched_nonaffinity; static int remote_resched_affinity; static int choose_affinity; -SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW, - &remote_resched, 0, "Resched to another cpu"); SYSCTL_INT(_debug, OID_AUTO, 
remote_resched_nonaffinity, CTLFLAG_RD, &remote_resched_nonaffinity, 0, "Number of remote rescheds"); SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD, @@ -179,106 +196,195 @@ rqinit(void *dummy) { int i; + spin_init(&bsd4_spin); for (i = 0; i < NQS; i++) { - TAILQ_INIT(&queues[i]); - TAILQ_INIT(&rtqueues[i]); - TAILQ_INIT(&idqueues[i]); + TAILQ_INIT(&bsd4_queues[i]); + TAILQ_INIT(&bsd4_rtqueues[i]); + TAILQ_INIT(&bsd4_idqueues[i]); } - atomic_clear_int(&curprocmask, 1); + atomic_clear_int(&bsd4_curprocmask, 1); } SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL) /* - * chooseproc() is called when a cpu needs a user process to LWKT schedule, - * it selects a user process and returns it. If chklp is non-NULL and chklp - * has a better or equal priority then the process that would otherwise be - * chosen, NULL is returned. + * BSD4_ACQUIRE_CURPROC * - * Until we fix the RUNQ code the chklp test has to be strict or we may - * bounce between processes trying to acquire the current process designation. + * This function is called when the kernel intends to return to userland. + * It is responsible for making the thread the current designated userland + * thread for this cpu, blocking if necessary. + * + * We are expected to handle userland reschedule requests here too. + * + * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE + * TO ANOTHER CPU! Because most of the kernel assumes that no migration will + * occur, this function is called only under very controlled circumstances. + * + * Basically we recalculate our estcpu to hopefully give us a more + * favorable disposition, setrunqueue, then wait for the curlwp + * designation to be handed to us (if the setrunqueue didn't do it). 
+ * + * MPSAFE */ -static -struct lwp * -chooseproc(struct lwp *chklp) +static void +bsd4_acquire_curproc(struct lwp *lp) { - struct lwp *lp; - struct rq *q; - u_int32_t *which; - u_int32_t pri; + globaldata_t gd = mycpu; + bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; - if (rtqueuebits) { - pri = bsfl(rtqueuebits); - q = &rtqueues[pri]; - which = &rtqueuebits; - } else if (queuebits) { - pri = bsfl(queuebits); - q = &queues[pri]; - which = &queuebits; - } else if (idqueuebits) { - pri = bsfl(idqueuebits); - q = &idqueues[pri]; - which = &idqueuebits; - } else { - return NULL; - } - lp = TAILQ_FIRST(q); - KASSERT(lp, ("chooseproc: no lwp on busy queue")); + /* + * Possibly select another thread, or keep the current thread. + */ + if (user_resched_wanted()) + bsd4_select_curproc(gd); /* - * If the passed lwp is reasonably close to the selected - * lwp , return NULL (indicating that should be kept). - * - * Note that we must error on the side of to avoid bouncing - * between threads in the acquire code. + * If uschedcp is still pointing to us, we're done */ - if (chklp) { - if (chklp->lwp_priority < lp->lwp_priority + PPQ) - return(NULL); - } + if (dd->uschedcp == lp) + return; -#ifdef SMP /* - * If the chosen lwp does not reside on this cpu spend a few - * cycles looking for a better candidate at the same priority level. - * This is a fallback check, setrunqueue() tries to wakeup the - * correct cpu and is our front-line affinity. + * If this cpu has no current thread, and the run queue is + * empty, we can safely select ourself. 
*/ - if (lp->lwp_thread->td_gd != mycpu && - (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL - ) { - if (chklp->lwp_thread->td_gd == mycpu) { - ++choose_affinity; - lp = chklp; - } + if (dd->uschedcp == NULL && bsd4_runqcount == 0) { + atomic_set_int(&bsd4_curprocmask, gd->gd_cpumask); + dd->uschedcp = lp; + dd->upri = lp->lwp_priority; + return; } -#endif - TAILQ_REMOVE(q, lp, lwp_procq); - --runqcount; - if (TAILQ_EMPTY(q)) - *which &= ~(1 << pri); - KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!")); - lp->lwp_proc->p_flag &= ~P_ONRUNQ; - return lp; + /* + * Adjust estcpu and recalculate our priority, then put us back on + * the user process scheduler's runq. Only increment the involuntary + * context switch count if the setrunqueue call did not immediately + * schedule us. + * + * Loop until we become the currently scheduled process. Note that + * calling setrunqueue can cause us to be migrated to another cpu + * after we switch away. + */ + do { + crit_enter(); + bsd4_recalculate_estcpu(lp); + lwkt_deschedule_self(gd->gd_curthread); + bsd4_setrunqueue(lp); + if ((gd->gd_curthread->td_flags & TDF_RUNQ) == 0) + ++lp->lwp_stats->p_ru.ru_nivcsw; + lwkt_switch(); + crit_exit(); + gd = mycpu; + dd = &bsd4_pcpu[gd->gd_cpuid]; + } while (dd->uschedcp != lp); + KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0); +} + +/* + * BSD4_RELEASE_CURPROC + * + * This routine detaches the current thread from the userland scheduler, + * usually because the thread needs to run in the kernel (at kernel priority) + * for a while. + * + * This routine is also responsible for selecting a new thread to + * make the current thread. + * + * NOTE: This implementation differs from the dummy example in that + * bsd4_select_curproc() is able to select the current process, whereas + * dummy_select_curproc() is not able to select the current process. + * This means we have to NULL out uschedcp. 
+ * + * Additionally, note that we may already be on a run queue if releasing + * via the lwkt_switch() in bsd4_setrunqueue(). + * + * WARNING! The MP lock may be in an unsynchronized state due to the + * way get_mplock() works and the fact that this function may be called + * from a passive release during a lwkt_switch(). try_mplock() will deal + * with this for us but you should be aware that td_mpcount may not be + * useable. + * + * MPSAFE + */ +static void +bsd4_release_curproc(struct lwp *lp) +{ + globaldata_t gd = mycpu; + bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; + + if (dd->uschedcp == lp) { + /* + * Note: we leave ou curprocmask bit set to prevent + * unnecessary scheduler helper wakeups. + * bsd4_select_curproc() will clean it up. + */ + KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0); + dd->uschedcp = NULL; /* don't let lp be selected */ + bsd4_select_curproc(gd); + } } -#ifdef SMP /* - * called via an ipi message to reschedule on another cpu. + * BSD4_SELECT_CURPROC + * + * Select a new current process for this cpu. This satisfies a user + * scheduler reschedule request so clear that too. + * + * This routine is also responsible for equal-priority round-robining, + * typically triggered from bsd4_schedulerclock(). In our dummy example + * all the 'user' threads are LWKT scheduled all at once and we just + * call lwkt_switch(). 
+ * + * MPSAFE */ static void -need_user_resched_remote(void *dummy) +bsd4_select_curproc(globaldata_t gd) { - need_user_resched(); -} + bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; + struct lwp *nlp; + int cpuid = gd->gd_cpuid; + crit_enter_gd(gd); + clear_user_resched(); /* This satisfied the reschedule request */ + dd->rrcount = 0; /* Reset the round-robin counter */ + + spin_lock_wr(&bsd4_spin); + if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) { + atomic_set_int(&bsd4_curprocmask, 1 << cpuid); + dd->upri = nlp->lwp_priority; + dd->uschedcp = nlp; + spin_unlock_wr(&bsd4_spin); +#ifdef SMP + lwkt_acquire(nlp->lwp_thread); #endif + lwkt_schedule(nlp->lwp_thread); + } else if (dd->uschedcp) { + dd->upri = dd->uschedcp->lwp_priority; + spin_unlock_wr(&bsd4_spin); + KKASSERT(bsd4_curprocmask & (1 << cpuid)); + } else if (bsd4_runqcount && (bsd4_rdyprocmask & (1 << cpuid))) { + atomic_clear_int(&bsd4_curprocmask, 1 << cpuid); + atomic_clear_int(&bsd4_rdyprocmask, 1 << cpuid); + dd->uschedcp = NULL; + dd->upri = PRIBASE_NULL; + spin_unlock_wr(&bsd4_spin); + lwkt_schedule(&dd->helper_thread); + } else { + dd->uschedcp = NULL; + dd->upri = PRIBASE_NULL; + atomic_clear_int(&bsd4_curprocmask, 1 << cpuid); + spin_unlock_wr(&bsd4_spin); + } + crit_exit_gd(gd); +} /* - * setrunqueue() 'wakes up' a 'user' process. GIANT must be held. The - * user process may represent any user process, including the current - * process. + * BSD4_SETRUNQUEUE + * + * This routine is called to schedule a new user process after a fork. + * + * The caller may set P_PASSIVE_ACQ in p_flag to indicate that we should + * attempt to leave the thread on the current cpu. * * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target * cpus in an attempt to keep the process on the current cpu at least for @@ -291,36 +397,38 @@ need_user_resched_remote(void *dummy) * priority then the processes running on other cpus, we will allow the * process to be stolen by another cpu. * - * WARNING! 
a thread can be acquired by another cpu the moment it is put - * on the user scheduler's run queue AND we release the MP lock. Since we - * release the MP lock before switching out another cpu may begin stealing - * our current thread before we are completely switched out! The - * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the - * thread before stealing it. - * - * NOTE on need_user_resched() calls: we have to call need_user_resched() - * if the new process is more important then the current process, or if - * the new process is the current process and is now less important then - * other processes. + * WARNING! This routine cannot block. bsd4_acquire_curproc() does + * a deschedule/switch interlock and we can be moved to another cpu + * the moment we are switched out. Our LWKT run state is the only + * thing preventing the transfer. + * + * The associated thread must NOT currently be scheduled (but can be the + * current process after it has been LWKT descheduled). It must NOT be on + * a bsd4 scheduler queue either. The purpose of this routine is to put + * it on a scheduler queue or make it the current user process and LWKT + * schedule it. It is possible that the thread is in the middle of a LWKT + * switchout on another cpu, lwkt_acquire() deals with that case. * - * The associated thread must NOT be scheduled. * The process must be runnable. - * This must be called at splhigh(). + * + * MPSAFE */ static void bsd4_setrunqueue(struct lwp *lp) { - struct rq *q; - struct globaldata *gd; - int pri; + globaldata_t gd; + bsd4_pcpu_t dd; int cpuid; - u_int32_t needresched; #ifdef SMP - int count; cpumask_t mask; + cpumask_t tmpmask; #endif - ASSERT_MP_LOCK_HELD(lp->lwp_thread); + /* + * First validate the process state relative to the current cpu. + * We don't need the spinlock for this, just a critical section. + * We are in control of the process. 
+ */ crit_enter(); KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN")); KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0, @@ -329,48 +437,54 @@ bsd4_setrunqueue(struct lwp *lp) KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0); /* - * Note: gd is the gd of the TARGET thread's cpu, not our cpu. + * Note: gd and dd are relative to the target thread's last cpu, + * NOT our current cpu. */ gd = lp->lwp_thread->td_gd; + dd = &bsd4_pcpu[gd->gd_cpuid]; /* + * If setrunqueue is being called due to being woken up, verses + * being called when aquiring the current process, recalculate + * estcpu. + * * Because recalculate is only called once or twice for long sleeps, * not every second forever while the process is sleeping, we have * to manually call it to resynchronize p_cpbase on wakeup or it * will wrap if the process was sleeping long enough (e.g. ~10 min * with the ACPI timer) and really mess up the nticks calculation. + * + * NOTE: because P_ONRUNQ is not set, bsd4_recalculate_estcpu()'s + * calls to resetpriority will just play with the processes priority + * fields and not mess with any queues, so it is MPSAFE in this + * context. */ - if (lp->lwp_slptime) { + if (lp->lwp_slptime && (lp->lwp_thread->td_flags & TDF_RUNNING) == 0) { bsd4_recalculate_estcpu(lp); lp->lwp_slptime = 0; } + /* - * We have not been released, make sure that we are not the currently - * designated process. + * This process is not supposed to be scheduled anywhere or assigned + * as the current process anywhere. Assert the condition. */ - KKASSERT(gd->gd_uschedcp != lp); + KKASSERT(dd->uschedcp != lp); /* - * Check cpu affinity. The associated thread is stable at the - * moment. Note that we may be checking another cpu here so we - * have to be careful. We are currently protected by the BGL. + * Check local cpu affinity. The associated thread is stable at + * the moment. Note that we may be checking another cpu here so we + * have to be careful. 
We can only assign uschedcp on OUR cpu. * * This allows us to avoid actually queueing the process. * acquire_curproc() will handle any threads we mistakenly schedule. */ cpuid = gd->gd_cpuid; - - if ((curprocmask & (1 << cpuid)) == 0) { - atomic_set_int(&curprocmask, 1 << cpuid); - gd->gd_uschedcp = lp; - gd->gd_upri = lp->lwp_priority; + if (gd == mycpu && (bsd4_curprocmask & (1 << cpuid)) == 0) { + atomic_set_int(&bsd4_curprocmask, 1 << cpuid); + dd->uschedcp = lp; + dd->upri = lp->lwp_priority; lwkt_schedule(lp->lwp_thread); - /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */ crit_exit(); -#ifdef SMP - if (gd != mycpu) - ++remote_resched_affinity; -#endif return; } @@ -379,189 +493,80 @@ bsd4_setrunqueue(struct lwp *lp) * to place this process on the userland scheduler's run queue for * action by the target cpu. */ - ++runqcount; - lp->lwp_proc->p_flag |= P_ONRUNQ; - if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) { - pri = (lp->lwp_priority & PRIMASK) / PPQ; - q = &queues[pri]; - queuebits |= 1 << pri; - needresched = (queuebits & ((1 << pri) - 1)); - } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME || - lp->lwp_rtprio.type == RTP_PRIO_FIFO) { - pri = (u_int8_t)lp->lwp_rtprio.prio; - q = &rtqueues[pri]; - rtqueuebits |= 1 << pri; - needresched = (rtqueuebits & ((1 << pri) - 1)); - } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) { - pri = (u_int8_t)lp->lwp_rtprio.prio; - q = &idqueues[pri]; - idqueuebits |= 1 << pri; - needresched = (idqueuebits & ((1 << pri) - 1)); - } else { - needresched = 0; - panic("setrunqueue: invalid rtprio type"); - } - KKASSERT(pri < 32); - lp->lwp_rqindex = pri; /* remember the queue index */ - TAILQ_INSERT_TAIL(q, lp, lwp_procq); - #ifdef SMP /* - * Either wakeup other cpus user thread scheduler or request - * preemption on other cpus (which will also wakeup a HLT). - * - * NOTE! gd and cpuid may still be our 'hint', not our current - * cpu info. + * XXX fixme. 
Could be part of a remrunqueue/setrunqueue + * operation when the priority is recalculated, so TDF_MIGRATING + * may already be set. */ - - count = runqcount; + if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0) + lwkt_giveaway(lp->lwp_thread); +#endif + spin_lock_wr(&bsd4_spin); + bsd4_setrunqueue_locked(lp); + spin_unlock_wr(&bsd4_spin); /* - * Check cpu affinity for user preemption (when the curprocmask bit - * is set). Note that gd_upri is a speculative field (we modify - * another cpu's gd_upri to avoid sending ipiq storms). + * gd and cpuid may still be our 'hint', not our current cpu info. + * + * Cpu locality of reference. If the LWP has higher priority + * (lower lwp_priority value) on its target cpu, reschedule on + * that cpu. */ - if (gd == mycpu) { - if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) { - if (lp->lwp_priority < gd->gd_upri - PPQ) { - gd->gd_upri = lp->lwp_priority; - gd->gd_rrcount = 0; - need_user_resched(); - --count; - } else if (gd->gd_uschedcp == lp && needresched) { - gd->gd_rrcount = 0; + if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) { + if (dd->upri > lp->lwp_priority) { /* heuristic */ + dd->upri = lp->lwp_priority; /* heuristic */ +#ifdef SMP + if (gd == mycpu) { need_user_resched(); - --count; + } else { + lwkt_send_ipiq(gd, need_user_resched_remote, + NULL); } - } - } else if (remote_resched) { - if (lp->lwp_priority < gd->gd_upri - PPQ) { - gd->gd_upri = lp->lwp_priority; - lwkt_send_ipiq(gd, need_user_resched_remote, NULL); - --count; - ++remote_resched_affinity; - } - } - - /* - * No affinity, first schedule to any cpus that do not have a current - * process. If there is a free cpu we always schedule to it. 
- */ - if (count && - (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 && - (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) { - if (!mask) - printf("lwp %d/%d nocpu to schedule it on\n", - lp->lwp_proc->p_pid, lp->lwp_tid); - while (mask && count) { - cpuid = bsfl(mask); - KKASSERT((curprocmask & (1 << cpuid)) == 0); - atomic_clear_int(&rdyprocmask, 1 << cpuid); - lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread); - --count; - mask &= ~(1 << cpuid); +#else + need_user_resched(); +#endif + crit_exit(); + return; } } +#ifdef SMP /* - * If there are still runnable processes try to wakeup a random - * cpu that is running a much lower priority process in order to - * preempt on it. Note that gd_upri is only a hint, so we can - * overwrite it from the wrong cpu. If we can't find one, we - * are SOL. + * Otherwise the LWP has a lower priority or we were asked not + * to reschedule. Look for an idle cpu whos scheduler helper + * is ready to accept more work. * - * We depress the priority check so multiple cpu bound programs - * do not bounce between cpus. Remember that the clock interrupt - * will also cause all cpus to reschedule. + * Look for an idle cpu starting at our rotator (bsd4_scancpu). * - * We must mask against rdyprocmask or we will race in the boot - * code (before all cpus have working scheduler helpers), plus - * some cpus might not be operational and/or not configured to - * handle user processes. 
- */ - if (count && remote_resched && ncpus > 1) { - cpuid = scancpu; - do { - if (++cpuid == ncpus) - cpuid = 0; - } while (cpuid == mycpu->gd_cpuid); - scancpu = cpuid; - - if (rdyprocmask & (1 << cpuid)) { - gd = globaldata_find(cpuid); - - if (lp->lwp_priority < gd->gd_upri - PPQ) { - gd->gd_upri = lp->lwp_priority; - lwkt_send_ipiq(gd, need_user_resched_remote, NULL); - ++remote_resched_nonaffinity; - } - } - } -#else - if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) { - if (lp->lwp_priority < gd->gd_upri - PPQ) { - gd->gd_upri = lp->lwp_priority; - gd->gd_rrcount = 0; - need_user_resched(); - } else if (gd->gd_uschedcp == lp && needresched) { - gd->gd_rrcount = 0; - need_user_resched(); - } + * If no cpus are ready to accept work, just return. + * + * XXX P_PASSIVE_ACQ + */ + mask = ~bsd4_curprocmask & bsd4_rdyprocmask & mycpu->gd_other_cpus; + if (mask) { + cpuid = bsd4_scancpu; + if (++cpuid == ncpus) + cpuid = 0; + tmpmask = ~((1 << cpuid) - 1); + if (mask & tmpmask) + cpuid = bsfl(mask & tmpmask); + else + cpuid = bsfl(mask); + atomic_clear_int(&bsd4_rdyprocmask, 1 << cpuid); + bsd4_scancpu = cpuid; + lwkt_schedule(&bsd4_pcpu[cpuid].helper_thread); } #endif crit_exit(); } /* - * remrunqueue() removes a given process from the run queue that it is on, - * clearing the queue busy bit if it becomes empty. This function is called - * when a userland process is selected for LWKT scheduling. Note that - * LWKT scheduling is an abstraction of 'curproc'.. there could very well be - * several userland processes whos threads are scheduled or otherwise in - * a special state, and such processes are NOT on the userland scheduler's - * run queue. - * - * This must be called at splhigh(). 
- */ -static void -bsd4_remrunqueue(struct lwp *lp) -{ - struct rq *q; - u_int32_t *which; - u_int8_t pri; - - ASSERT_MP_LOCK_HELD(lp->lwp_thread); - crit_enter(); - KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!")); - lp->lwp_proc->p_flag &= ~P_ONRUNQ; - --runqcount; - KKASSERT(runqcount >= 0); - pri = lp->lwp_rqindex; - if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) { - q = &queues[pri]; - which = &queuebits; - } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME || - lp->lwp_rtprio.type == RTP_PRIO_FIFO) { - q = &rtqueues[pri]; - which = &rtqueuebits; - } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) { - q = &idqueues[pri]; - which = &idqueuebits; - } else { - panic("remrunqueue: invalid rtprio type"); - } - TAILQ_REMOVE(q, lp, lwp_procq); - if (TAILQ_EMPTY(q)) { - KASSERT((*which & (1 << pri)) != 0, - ("remrunqueue: remove from empty queue")); - *which &= ~(1 << pri); - } - crit_exit(); -} - -/* * This routine is called from a systimer IPI. It MUST be MP-safe and - * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ. + * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on + * each cpu. + * + * Because this is effectively a 'fast' interrupt, we cannot safely * * MPSAFE */ @@ -570,13 +575,14 @@ void bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) { globaldata_t gd = mycpu; + bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; /* * Do we need to round-robin? We round-robin 10 times a second. * This should only occur for cpu-bound batch processes. 
*/ - if (++gd->gd_rrcount >= usched_bsd4_rrinterval) { - gd->gd_rrcount = 0; + if (++dd->rrcount >= usched_bsd4_rrinterval) { + dd->rrcount = 0; need_user_resched(); } @@ -594,254 +600,208 @@ bsd4_schedulerclock(struct lwp *lp, sysc */ if (lp->lwp_origcpu) --lp->lwp_origcpu; - - /* XXX optimize, avoid lock if no reset is required */ - if (try_mplock()) { - bsd4_resetpriority(lp); - rel_mplock(); - } + bsd4_resetpriority(lp); } /* - * Release the current process designation on p. P MUST BE CURPROC. - * Attempt to assign a new current process from the run queue. + * Called from acquire and from kern_synch's one-second timer (one of the + * callout helper threads) with a critical section held. * - * This function is called from exit1(), tsleep(), and the passive - * release code setup in //trap.c + * Decay p_estcpu based on the number of ticks we haven't been running + * and our p_nice. As the load increases each process observes a larger + * number of idle ticks (because other processes are running in them). + * This observation leads to a larger correction which tends to make the + * system more 'batchy'. * - * If we do not have or cannot get the MP lock we just wakeup the userland - * helper scheduler thread for this cpu to do the work for us. + * Note that no recalculation occurs for a process which sleeps and wakes + * up in the same tick. That is, a system doing thousands of context + * switches per second will still only do serious estcpu calculations + * ESTCPUFREQ times per second. * - * WARNING! The MP lock may be in an unsynchronized state due to the - * way get_mplock() works and the fact that this function may be called - * from a passive release during a lwkt_switch(). try_mplock() will deal - * with this for us but you should be aware that td_mpcount may not be - * useable. 
- */ -static void -bsd4_release_curproc(struct lwp *lp) -{ - int cpuid; - globaldata_t gd = mycpu; - - KKASSERT(lp->lwp_thread->td_gd == gd); - crit_enter(); - cpuid = gd->gd_cpuid; - - if (gd->gd_uschedcp == lp) { - if (try_mplock()) { - /* - * If we can obtain the MP lock we can directly - * select the next current process. - * - * bsd4_select_curproc() will adjust curprocmask - * for us. - */ - gd->gd_uschedcp = NULL; - gd->gd_upri = PRIBASE_NULL; - bsd4_select_curproc(gd); - rel_mplock(); - } else { - /* - * If we cannot obtain the MP lock schedule our - * helper thread to select the next current - * process. - * - * This is the only place where we adjust curprocmask - * and rdyprocmask without holding the MP lock. - */ - gd->gd_uschedcp = NULL; - gd->gd_upri = PRIBASE_NULL; - atomic_clear_int(&curprocmask, 1 << cpuid); - if (runqcount && (rdyprocmask & (1 << cpuid))) { - atomic_clear_int(&rdyprocmask, 1 << cpuid); - lwkt_schedule(&mycpu->gd_schedthread); - } - } - } - crit_exit(); -} - -/* - * Select a new current process, potentially retaining gd_uschedcp. However, - * be sure to round-robin. This routine is generally only called if a - * reschedule is requested and that typically only occurs if a new process - * has a better priority or when we are round-robining. - * - * NOTE: Must be called with giant held and the current cpu's gd. - * NOTE: The caller must handle the situation where it loses a - * uschedcp designation that it previously held, typically by - * calling acquire_curproc() again. - * NOTE: May not block + * MPSAFE */ static -void -bsd4_select_curproc(globaldata_t gd) +void +bsd4_recalculate_estcpu(struct lwp *lp) { - struct lwp *nlp; - int cpuid = gd->gd_cpuid; - void *old; - - clear_user_resched(); - get_mplock(); + globaldata_t gd = mycpu; + sysclock_t cpbase; + int loadfac; + int ndecay; + int nticks; + int nleft; /* - * Choose the next designated current user process. 
- * Note that we cannot schedule gd_schedthread - * if runqcount is 0 without creating a scheduling - * loop. - * - * We do not clear the user resched request here, - * we need to test it later when we re-acquire. - * - * NOTE: chooseproc returns NULL if the chosen lwp - * is gd_uschedcp. XXX needs cleanup. + * We have to subtract periodic to get the last schedclock + * timeout time, otherwise we would get the upcoming timeout. + * Keep in mind that a process can migrate between cpus and + * while the scheduler clock should be very close, boundary + * conditions could lead to a small negative delta. */ - old = gd->gd_uschedcp; - if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) { - atomic_set_int(&curprocmask, 1 << cpuid); - gd->gd_upri = nlp->lwp_priority; - gd->gd_uschedcp = nlp; - lwkt_acquire(nlp->lwp_thread); - lwkt_schedule(nlp->lwp_thread); - } else if (gd->gd_uschedcp) { - gd->gd_upri = gd->gd_uschedcp->lwp_priority; - KKASSERT(curprocmask & (1 << cpuid)); - } else if (runqcount && (rdyprocmask & (1 << cpuid))) { - /*gd->gd_uschedcp = NULL;*/ - atomic_clear_int(&curprocmask, 1 << cpuid); - atomic_clear_int(&rdyprocmask, 1 << cpuid); - lwkt_schedule(&gd->gd_schedthread); - } else { - /*gd->gd_uschedcp = NULL;*/ - atomic_clear_int(&curprocmask, 1 << cpuid); - } - rel_mplock(); -} + cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic; -/* - * Acquire the current process designation on the CURRENT process only. - * This function is called at kernel-user priority (not userland priority) - * when curlwp does not match gd_uschedcp. - * - * This function is only called just prior to returning to user mode. - * - * Basically we recalculate our estcpu to hopefully give us a more - * favorable disposition, setrunqueue, then wait for the curlwp - * designation to be handed to us (if the setrunqueue didn't do it). - * - * WARNING! THIS FUNCTION MAY CAUSE THE CURRENT THREAD TO MIGRATE TO - * ANOTHER CPU! 
Because most of the kernel assumes that no migration will - * occur, this function is called only under very controlled circumstances. - */ -static void -bsd4_acquire_curproc(struct lwp *lp) -{ - globaldata_t gd = mycpu; - - get_mplock(); - crit_enter(); + if (lp->lwp_slptime > 1) { + /* + * Too much time has passed, do a coarse correction. + */ + lp->lwp_estcpu = lp->lwp_estcpu >> 1; + bsd4_resetpriority(lp); + lp->lwp_cpbase = cpbase; + lp->lwp_cpticks = 0; + } else if (lp->lwp_cpbase != cpbase) { + /* + * Adjust estcpu if we are in a different tick. Don't waste + * time if we are in the same tick. + * + * First calculate the number of ticks in the measurement + * interval. The nticks calculation can wind up 0 due to + * a bug in the handling of lwp_slptime (as yet not found), + * so make sure we do not get a divide by 0 panic. + */ + nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic; + if (nticks <= 0) + nticks = 1; + updatepcpu(lp, lp->lwp_cpticks, nticks); - /* - * Recalculate our priority and put us back on the userland - * scheduler's runq. - * - * Only increment the involuntary context switch count if the - * setrunqueue call did not immediately schedule us. - */ - KKASSERT(lp == gd->gd_curthread->td_lwp); - bsd4_recalculate_estcpu(lp); - lwkt_deschedule_self(gd->gd_curthread); - bsd4_setrunqueue(lp); - if ((gd->gd_curthread->td_flags & TDF_RUNQ) == 0) - ++lp->lwp_stats->p_ru.ru_nivcsw; - lwkt_switch(); + if ((nleft = nticks - lp->lwp_cpticks) < 0) + nleft = 0; + if (usched_debug == lp->lwp_proc->p_pid) { + printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d", + lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu, + lp->lwp_cpticks, nticks, nleft); + } - /* - * Because we put ourselves back on the userland scheduler's run - * queue, WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU - */ - gd = mycpu; + /* + * Calculate a decay value based on ticks remaining scaled + * down by the instantanious load and p_nice. 
+ */ + if ((loadfac = bsd4_runqcount) < 2) + loadfac = 2; + ndecay = nleft * usched_bsd4_decay * 2 * + (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2); - /* - * We better be the current process when we wake up, and we had - * better not be on the run queue. - */ - KKASSERT(gd->gd_uschedcp == lp); - KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0); + /* + * Adjust p_estcpu. Handle a border case where batch jobs + * can get stalled long enough to decay to zero when they + * shouldn't. + */ + if (lp->lwp_estcpu > ndecay * 2) + lp->lwp_estcpu -= ndecay; + else + lp->lwp_estcpu >>= 1; - crit_exit(); - rel_mplock(); + if (usched_debug == lp->lwp_proc->p_pid) + printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu); + bsd4_resetpriority(lp); + lp->lwp_cpbase = cpbase; + lp->lwp_cpticks = 0; + } } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. + * + * This routine may be called with any process. + * + * This routine is called by fork1() for initial setup with the process + * off the run queue, and also may be called normally with the process on or + * off the run queue.
+ * + * MPSAFE */ static void bsd4_resetpriority(struct lwp *lp) { + bsd4_pcpu_t dd; int newpriority; - int opq; - int npq; - - ASSERT_MP_LOCK_HELD(curthread); + u_short newrqtype; + int reschedcpu; /* - * Set p_priority for general process comparisons + * Calculate the new priority and queue type */ - switch(lp->lwp_rtprio.type) { + crit_enter(); + spin_lock_wr(&bsd4_spin); + + newrqtype = lp->lwp_rtprio.type; + + switch(newrqtype) { case RTP_PRIO_REALTIME: - lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio; - return; + newpriority = PRIBASE_REALTIME + + (lp->lwp_rtprio.prio & PRIMASK); + break; case RTP_PRIO_NORMAL: + newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ; + newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ; + newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ / + NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ); + newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK); break; case RTP_PRIO_IDLE: - lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio; - return; + newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK); + break; case RTP_PRIO_THREAD: - lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio; - return; + newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK); + break; + default: + panic("Bad RTP_PRIO %d", newrqtype); + /* NOT REACHED */ } /* - * NORMAL priorities fall through. These are based on niceness - * and cpu use. Lower numbers == higher priorities. - * - * Calculate our priority based on our niceness and estimated cpu. - * Note that the nice value adjusts the baseline, which effects - * cpu bursts but does not effect overall cpu use between cpu-bound - * processes. The use of the nice field in the decay calculation - * controls the overall cpu use. - * - * This isn't an exact calculation. We fit the full nice and - * estcpu range into the priority range so the actual PPQ value - * is incorrect, but it's still a reasonable way to think about it. 
- */ - newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ; - newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ; - newpriority = newpriority * MAXPRI / - (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ); - newpriority = MIN(newpriority, MAXPRI - 1); /* sanity */ - newpriority = MAX(newpriority, 0); /* sanity */ - npq = newpriority / PPQ; - crit_enter(); - opq = (lp->lwp_priority & PRIMASK) / PPQ; - if (lp->lwp_proc->p_stat == SRUN && (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) { - /* - * We have to move the process to another queue - */ - bsd4_remrunqueue(lp); - lp->lwp_priority = PRIBASE_NORMAL + newpriority; - bsd4_setrunqueue(lp); + * The newpriority incorporates the queue type so do a simple masked + * check to determine if the process has moved to another queue. If + * it has, and it is currently on a run queue, then move it. + */ + if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) { + lp->lwp_priority = newpriority; + if (lp->lwp_proc->p_flag & P_ONRUNQ) { + bsd4_remrunqueue_locked(lp); + lp->lwp_rqtype = newrqtype; + lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; + bsd4_setrunqueue_locked(lp); + reschedcpu = lp->lwp_thread->td_gd->gd_cpuid; + } else { + lp->lwp_rqtype = newrqtype; + lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; + reschedcpu = -1; + } } else { - /* - * We can just adjust the priority and it will be picked - * up later. - */ - KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0); - lp->lwp_priority = PRIBASE_NORMAL + newpriority; + lp->lwp_rqtype = newrqtype; + lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ; + lp->lwp_priority = newpriority; + reschedcpu = -1; + } + spin_unlock_wr(&bsd4_spin); + + /* + * Determine if we need to reschedule the target cpu. Since at + * most we are moving an already-scheduled lwp around, we don't + * have to be fancy here. + */ + if (reschedcpu >= 0) { + dd = &bsd4_pcpu[reschedcpu]; + if (dd->uschedcp == lp) { + /* + * We don't need to reschedule ourselves. 
In fact, + * this could lead to a livelock. + */ + dd->upri = lp->lwp_priority; + } else if (dd->upri > lp->lwp_priority) { /* heuristic */ + dd->upri = lp->lwp_priority; /* heuristic */ +#ifdef SMP + if (reschedcpu == mycpu->gd_cpuid) { + need_user_resched(); + } else { + lwkt_send_ipiq(lp->lwp_thread->td_gd, + need_user_resched_remote, NULL); + } +#else + need_user_resched(); +#endif + } } crit_exit(); } @@ -886,100 +846,202 @@ bsd4_exiting(struct lwp *plp, struct lwp } } + /* - * Called from acquire and from kern_synch's one-second timer with a - * critical section held. + * chooseproc() is called when a cpu needs a user process to LWKT schedule, + * it selects a user process and returns it. If chklp is non-NULL and chklp + * has a better or equal priority than the process that would otherwise be + * chosen, NULL is returned. * - * Decay p_estcpu based on the number of ticks we haven't been running - * and our p_nice. As the load increases each process observes a larger - * number of idle ticks (because other processes are running in them). - * This observation leads to a larger correction which tends to make the - * system more 'batchy'. + * Until we fix the RUNQ code the chklp test has to be strict or we may + * bounce between processes trying to acquire the current process designation. * - * Note that no recalculation occurs for a process which sleeps and wakes - * up in the same tick. That is, a system doing thousands of context - * switches per second will still only do serious estcpu calculations - * ESTCPUFREQ times per second. + * MPSAFE - must be called with bsd4_spin exclusive held. The spinlock is + * left intact through the entire routine.
*/ static -void -bsd4_recalculate_estcpu(struct lwp *lp) +struct lwp * +chooseproc_locked(struct lwp *chklp) { - globaldata_t gd = mycpu; - sysclock_t cpbase; - int loadfac; - int ndecay; - int nticks; - int nleft; + struct lwp *lp; + struct rq *q; + u_int32_t *which; + u_int32_t pri; - ASSERT_MP_LOCK_HELD(curthread); + if (bsd4_rtqueuebits) { + pri = bsfl(bsd4_rtqueuebits); + q = &bsd4_rtqueues[pri]; + which = &bsd4_rtqueuebits; + } else if (bsd4_queuebits) { + pri = bsfl(bsd4_queuebits); + q = &bsd4_queues[pri]; + which = &bsd4_queuebits; + } else if (bsd4_idqueuebits) { + pri = bsfl(bsd4_idqueuebits); + q = &bsd4_idqueues[pri]; + which = &bsd4_idqueuebits; + } else { + return NULL; + } + lp = TAILQ_FIRST(q); + KASSERT(lp, ("chooseproc: no lwp on busy queue")); /* - * We have to subtract periodic to get the last schedclock - * timeout time, otherwise we would get the upcoming timeout. - * Keep in mind that a process can migrate between cpus and - * while the scheduler clock should be very close, boundary - * conditions could lead to a small negative delta. + * If the passed lwp (chklp) is reasonably close to the selected + * lwp (lp), return NULL (indicating that chklp should be kept). + * + * Note that we must err on the side of chklp to avoid bouncing + * between threads in the acquire code. */ - cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic; - - if (lp->lwp_slptime > 1) { - /* - * Too much time has passed, do a coarse correction. - */ - lp->lwp_estcpu = lp->lwp_estcpu >> 1; - bsd4_resetpriority(lp); - lp->lwp_cpbase = cpbase; - lp->lwp_cpticks = 0; - } else if (lp->lwp_cpbase != cpbase) { - /* - * Adjust estcpu if we are in a different tick. Don't waste - * time if we are in the same tick. - * - * First calculate the number of ticks in the measurement - * interval. The nticks calculation can wind up 0 due to - * a bug in the handling of lwp_slptime (as yet not found), - * so make sure we do not get a divide by 0 panic.
- */ - nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic; - if (nticks <= 0) - nticks = 1; - updatepcpu(lp, lp->lwp_cpticks, nticks); + if (chklp) { + if (chklp->lwp_priority < lp->lwp_priority + PPQ) + return(NULL); + } - if ((nleft = nticks - lp->lwp_cpticks) < 0) - nleft = 0; - if (usched_debug == lp->lwp_proc->p_pid) { - printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d", - lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu, - lp->lwp_cpticks, nticks, nleft); +#ifdef SMP + /* + * If the chosen lwp does not reside on this cpu spend a few + * cycles looking for a better candidate at the same priority level. + * This is a fallback check, setrunqueue() tries to wakeup the + * correct cpu and is our front-line affinity. + */ + if (lp->lwp_thread->td_gd != mycpu && + (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL + ) { + if (chklp->lwp_thread->td_gd == mycpu) { + ++choose_affinity; + lp = chklp; } + } +#endif - /* - * Calculate a decay value based on ticks remaining scaled - * down by the instantanious load and p_nice. - */ - if ((loadfac = runqcount) < 2) - loadfac = 2; - ndecay = nleft * usched_bsd4_decay * 2 * - (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2); + TAILQ_REMOVE(q, lp, lwp_procq); + --bsd4_runqcount; + if (TAILQ_EMPTY(q)) + *which &= ~(1 << pri); + KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!")); + lp->lwp_proc->p_flag &= ~P_ONRUNQ; + return lp; +} - /* - * Adjust p_estcpu. Handle a border case where batch jobs - * can get stalled long enough to decay to zero when they - * shouldn't. - */ - if (lp->lwp_estcpu > ndecay * 2) - lp->lwp_estcpu -= ndecay; - else - lp->lwp_estcpu >>= 1; +#ifdef SMP +/* + * Called via an ipi message to reschedule on another cpu. 
+ * + * MPSAFE + */ +static +void +need_user_resched_remote(void *dummy) +{ + need_user_resched(); +} - if (usched_debug == lp->lwp_proc->p_pid) - printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu); +#endif - bsd4_resetpriority(lp); - lp->lwp_cpbase = cpbase; - lp->lwp_cpticks = 0; + +/* + * bsd4_remrunqueue_locked() removes a given process from the run queue + * that it is on, clearing the queue busy bit if it becomes empty. + * + * Note that the user process scheduler is different from the LWKT scheduler. + * The user process scheduler only manages user processes but it uses LWKT + * underneath, and a user process operating in the kernel will often be + * 'released' from our management. + * + * MPSAFE - bsd4_spin must be held exclusively on call + */ +static void +bsd4_remrunqueue_locked(struct lwp *lp) +{ + struct rq *q; + u_int32_t *which; + u_int8_t pri; + + KKASSERT(lp->lwp_proc->p_flag & P_ONRUNQ); + lp->lwp_proc->p_flag &= ~P_ONRUNQ; + --bsd4_runqcount; + KKASSERT(bsd4_runqcount >= 0); + + pri = lp->lwp_rqindex; + switch(lp->lwp_rqtype) { + case RTP_PRIO_NORMAL: + q = &bsd4_queues[pri]; + which = &bsd4_queuebits; + break; + case RTP_PRIO_REALTIME: + case RTP_PRIO_FIFO: + q = &bsd4_rtqueues[pri]; + which = &bsd4_rtqueuebits; + break; + case RTP_PRIO_IDLE: + q = &bsd4_idqueues[pri]; + which = &bsd4_idqueuebits; + break; + default: + panic("remrunqueue: invalid rtprio type"); + /* NOT REACHED */ + } + TAILQ_REMOVE(q, lp, lwp_procq); + if (TAILQ_EMPTY(q)) { + KASSERT((*which & (1 << pri)) != 0, + ("remrunqueue: remove from empty queue")); + *which &= ~(1 << pri); + } +} + +/* + * bsd4_setrunqueue_locked() + * + * Add a process whose rqtype and rqindex had previously been calculated + * onto the appropriate run queue. Determine if the addition requires + * a reschedule on a cpu and return the cpuid or -1. + * + * NOTE: Lower priorities are better priorities.
+ * + * MPSAFE - bsd4_spin must be held exclusively on call + */ +static void +bsd4_setrunqueue_locked(struct lwp *lp) +{ + struct rq *q; + u_int32_t *which; + int pri; + + KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0); + lp->lwp_proc->p_flag |= P_ONRUNQ; + ++bsd4_runqcount; + + pri = lp->lwp_rqindex; + + switch(lp->lwp_rqtype) { + case RTP_PRIO_NORMAL: + q = &bsd4_queues[pri]; + which = &bsd4_queuebits; + break; + case RTP_PRIO_REALTIME: + case RTP_PRIO_FIFO: + q = &bsd4_rtqueues[pri]; + which = &bsd4_rtqueuebits; + break; + case RTP_PRIO_IDLE: + q = &bsd4_idqueues[pri]; + which = &bsd4_idqueuebits; + break; + default: + panic("remrunqueue: invalid rtprio type"); + /* NOT REACHED */ } + + /* + * Add to the correct queue and set the appropriate bit. If no + * lower priority (i.e. better) processes are in the queue then + * we want a reschedule, calculate the best cpu for the job. + * + * Always run reschedules on the LWPs original cpu. + */ + TAILQ_INSERT_TAIL(q, lp, lwp_procq); + *which |= 1 << pri; } #ifdef SMP @@ -991,29 +1053,71 @@ bsd4_recalculate_estcpu(struct lwp *lp) * need the helper since there is only one cpu. We can't use the idle * thread for this because we need to hold the MP lock. Additionally, * doing things this way allows us to HLT idle cpus on MP systems. 
+ * + * MPSAFE */ static void sched_thread(void *dummy) { - globaldata_t gd = mycpu; - int cpuid = gd->gd_cpuid; /* doesn't change */ - u_int32_t cpumask = 1 << cpuid; /* doesn't change */ + globaldata_t gd; + bsd4_pcpu_t dd; + struct lwp *nlp; + cpumask_t cpumask; + cpumask_t tmpmask; + int cpuid; + int tmpid; + + gd = mycpu; + cpuid = gd->gd_cpuid; /* doesn't change */ + cpumask = 1 << cpuid; /* doesn't change */ + dd = &bsd4_pcpu[cpuid]; + + /* + * Scheduler thread does not need to hold the MP lock + */ + rel_mplock(); - ASSERT_MP_LOCK_HELD(curthread); for (;;) { - struct lwp *nlp; - + crit_enter_gd(gd); lwkt_deschedule_self(gd->gd_curthread); /* interlock */ - atomic_set_int(&rdyprocmask, cpumask); - crit_enter_quick(gd->gd_curthread); - if ((curprocmask & cpumask) == 0 && (nlp = chooseproc(NULL)) != NULL) { - atomic_set_int(&curprocmask, cpumask); - gd->gd_upri = nlp->lwp_priority; - gd->gd_uschedcp = nlp; - lwkt_acquire(nlp->lwp_thread); - lwkt_schedule(nlp->lwp_thread); + spin_lock_wr(&bsd4_spin); + atomic_set_int(&bsd4_rdyprocmask, cpumask); + if ((bsd4_curprocmask & cpumask) == 0) { + if ((nlp = chooseproc_locked(NULL)) != NULL) { + atomic_set_int(&bsd4_curprocmask, cpumask); + dd->upri = nlp->lwp_priority; + dd->uschedcp = nlp; + spin_unlock_wr(&bsd4_spin); + lwkt_acquire(nlp->lwp_thread); + lwkt_schedule(nlp->lwp_thread); + } else { + spin_unlock_wr(&bsd4_spin); + } + } else { + /* + * Someone scheduled us but raced. In order to not lose + * track of the fact that there may be a LWP ready to go, + * forward the request to another cpu if available. + * + * Rotate through cpus starting with cpuid + 1. Since cpuid + * is already masked out by gd_other_cpus, just use ~cpumask. 
+ */ + tmpmask = ~bsd4_curprocmask & bsd4_rdyprocmask & + mycpu->gd_other_cpus; + if (tmpmask) { + if (tmpmask & ~(cpumask - 1)) + tmpid = bsfl(tmpmask & ~(cpumask - 1)); + else + tmpid = bsfl(tmpmask); + bsd4_scancpu = tmpid; + atomic_clear_int(&bsd4_rdyprocmask, 1 << tmpid); + spin_unlock_wr(&bsd4_spin); + lwkt_schedule(&bsd4_pcpu[tmpid].helper_thread); + } else { + spin_unlock_wr(&bsd4_spin); + } } - crit_exit_quick(gd->gd_curthread); + crit_exit_gd(gd); lwkt_switch(); } } @@ -1031,7 +1135,7 @@ sched_thread_cpu_init(void) printf("start scheduler helpers on cpus:"); for (i = 0; i < ncpus; ++i) { - globaldata_t dgd = globaldata_find(i); + bsd4_pcpu_t dd = &bsd4_pcpu[i]; cpumask_t mask = 1 << i; if ((mask & smp_active_mask) == 0) @@ -1040,7 +1144,7 @@ sched_thread_cpu_init(void) if (bootverbose) printf(" %d", i); - lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread, + lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread, TDF_STOPREQ, i, "usched %d", i); /* @@ -1048,8 +1152,8 @@ sched_thread_cpu_init(void) * been enabled in rqinit(). */ if (i) - atomic_clear_int(&curprocmask, mask); - atomic_set_int(&rdyprocmask, mask); + atomic_clear_int(&bsd4_curprocmask, mask); + atomic_set_int(&bsd4_rdyprocmask, mask); } if (bootverbose) printf("\n");