br.cond.sptk.many b7
 END(load_switch_stack)
 
+GLOBAL_ENTRY(prefetch_stack)
+       add r14 = -IA64_SWITCH_STACK_SIZE, sp   // r14 = switch_stack save area on current stack
+       add r15 = IA64_TASK_THREAD_KSP_OFFSET, in0 // r15 = &next->thread.ksp
+       ;;
+       ld8 r16 = [r15]                         // load next's stack pointer
+       lfetch.fault.excl [r14], 128            // prefetch switch_stack area for writing (128B lines)
+       ;;
+       lfetch.fault.excl [r14], 128
+       lfetch.fault [r16], 128                 // prefetch next's kernel stack for reading
+       ;;
+       lfetch.fault.excl [r14], 128
+       lfetch.fault [r16], 128
+       ;;
+       lfetch.fault.excl [r14], 128
+       lfetch.fault [r16], 128
+       ;;
+       lfetch.fault.excl [r14], 128
+       lfetch.fault [r16], 128
+       ;;
+       lfetch.fault [r16], 128
+       br.ret.sptk.many rp
+END(prefetch_stack)
+
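For readers less fluent in ia64 assembly: the routine above issues ten
lfetch operations at a 128-byte stride, five exclusive (write-intent)
prefetches covering the switch_stack save area at the top of the current
stack, interleaved with five plain prefetches of the incoming task's
kernel stack. A minimal C sketch of the same pattern, assuming GCC's
__builtin_prefetch; LINE and both pointer parameters are illustrative
stand-ins, not kernel interfaces:

	/* Illustrative sketch of the prefetch pattern; not kernel code. */
	#define LINE	128	/* stride used by the lfetch sequence above */

	static void prefetch_stack_sketch(const char *cur_switch_stack,
					  const char *next_stack)
	{
		int i;

		/* switch_stack area: about to be written, so write-intent */
		for (i = 0; i < 5; i++)
			__builtin_prefetch(cur_switch_stack + i * LINE, 1, 3);

		/* next task's kernel stack: about to be read */
		for (i = 0; i < 5; i++)
			__builtin_prefetch(next_stack + i * LINE, 0, 3);
	}
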
 GLOBAL_ENTRY(execve)
        mov r15=__NR_execve                     // put syscall number in place
        break __BREAK_SYSCALL
 
  */
 #define __ARCH_WANT_UNLOCKED_CTXSW
 
+#define ARCH_HAS_PREFETCH_SWITCH_STACK
 #define ia64_platform_is(x) (strcmp(x, platform_name) == 0)
 
 void cpu_idle_wait(void);
 
 #define GROUP_AT(gi, i) \
     ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
 
+#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
+extern void prefetch_stack(struct task_struct *t);
+#else
+static inline void prefetch_stack(struct task_struct *t) { }
+#endif
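The #ifdef pair above lets generic code call the hook unconditionally:
an architecture that defines ARCH_HAS_PREFETCH_SWITCH_STACK (as the ia64
hunk does) supplies the real routine, and every other architecture gets
an empty inline that the compiler optimizes away. A self-contained
sketch of the opt-in pattern, with a hypothetical demo caller:

	/* Standalone illustration of the opt-in hook; not kernel code. */
	struct task_struct;	/* opaque for this sketch */

	#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
	extern void prefetch_stack(struct task_struct *t);	/* arch-provided */
	#else
	static inline void prefetch_stack(struct task_struct *t) { }
	#endif

	/* demo_switch_path() is hypothetical, standing in for schedule() */
	static void demo_switch_path(struct task_struct *next)
	{
		prefetch_stack(next);	/* a no-op unless the arch opts in */
	}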
 
 struct audit_context;          /* See audit.c */
 struct mempolicy;
 
        if (next == rq->idle)
                schedstat_inc(rq, sched_goidle);
        prefetch(next);
+       prefetch_stack(next);                   /* warm next's kernel stack before switch_to() */
        clear_tsk_need_resched(prev);
        rcu_qsctr_inc(task_cpu(prev));