From: Tejun Heo Date: Mon, 15 May 2006 11:58:12 +0000 (+0900) Subject: [PATCH] libata-eh-fw: update ata_scsi_error() for new EH X-Git-Tag: v2.6.18-rc1~1079^2~98^2~26 X-Git-Url: http://pilppa.com/gitweb/?a=commitdiff_plain;h=ad9e27624479bd167dd7eac0cea4bb3ad13bc926;p=linux-2.6-omap-h63xx.git [PATCH] libata-eh-fw: update ata_scsi_error() for new EH Update ata_scsi_error() for new EH. ata_scsi_error() is responsible for claiming timed out qcs and invoking ->error_handler in safe and synchronized manner. As the state of the controller is unknown if a qc has timed out, the port is frozen in such cases. Note that ata_scsi_timed_out() isn't used for new EH. This is because a timed out qc cannot be claimed by EH without freezing the port and freezing the port in ata_scsi_timed_out() results in unnecessary abortion of other active qcs. ata_scsi_timed_out() can be removed once all drivers are converted to new EH. While at it, add 'TODO: kill' comments to old EH functions. Signed-off-by: Tejun Heo --- diff --git a/drivers/scsi/libata-eh.c b/drivers/scsi/libata-eh.c index cb4e2b8d32d..0803231f657 100644 --- a/drivers/scsi/libata-eh.c +++ b/drivers/scsi/libata-eh.c @@ -44,6 +44,8 @@ #include "libata.h" +static void __ata_port_freeze(struct ata_port *ap); + /** * ata_scsi_timed_out - SCSI layer time out callback * @cmd: timed out SCSI command @@ -55,6 +57,8 @@ * from finishing it by setting EH_SCHEDULED and return * EH_NOT_HANDLED. * + * TODO: kill this function once old EH is gone. + * * LOCKING: * Called from timer context * @@ -67,10 +71,16 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd) struct ata_port *ap = ata_shost_to_port(host); unsigned long flags; struct ata_queued_cmd *qc; - enum scsi_eh_timer_return ret = EH_HANDLED; + enum scsi_eh_timer_return ret; DPRINTK("ENTER\n"); + if (ap->ops->error_handler) { + ret = EH_NOT_HANDLED; + goto out; + } + + ret = EH_HANDLED; spin_lock_irqsave(&ap->host_set->lock, flags); qc = ata_qc_from_tag(ap, ap->active_tag); if (qc) { @@ -81,6 +91,7 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd) } spin_unlock_irqrestore(&ap->host_set->lock, flags); + out: DPRINTK("EXIT, ret=%d\n", ret); return ret; } @@ -100,21 +111,132 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd) void ata_scsi_error(struct Scsi_Host *host) { struct ata_port *ap = ata_shost_to_port(host); + spinlock_t *hs_lock = &ap->host_set->lock; + int i, repeat_cnt = ATA_EH_MAX_REPEAT; + unsigned long flags; DPRINTK("ENTER\n"); - /* synchronize with IRQ handler and port task */ - spin_unlock_wait(&ap->host_set->lock); + /* synchronize with port task */ ata_port_flush_task(ap); - WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL); + /* synchronize with host_set lock and sort out timeouts */ + + /* For new EH, all qcs are finished in one of three ways - + * normal completion, error completion, and SCSI timeout. + * Both cmpletions can race against SCSI timeout. When normal + * completion wins, the qc never reaches EH. When error + * completion wins, the qc has ATA_QCFLAG_FAILED set. + * + * When SCSI timeout wins, things are a bit more complex. + * Normal or error completion can occur after the timeout but + * before this point. In such cases, both types of + * completions are honored. A scmd is determined to have + * timed out iff its associated qc is active and not failed. + */ + if (ap->ops->error_handler) { + struct scsi_cmnd *scmd, *tmp; + int nr_timedout = 0; + + spin_lock_irqsave(hs_lock, flags); + + list_for_each_entry_safe(scmd, tmp, &host->eh_cmd_q, eh_entry) { + struct ata_queued_cmd *qc; + + for (i = 0; i < ATA_MAX_QUEUE; i++) { + qc = __ata_qc_from_tag(ap, i); + if (qc->flags & ATA_QCFLAG_ACTIVE && + qc->scsicmd == scmd) + break; + } + + if (i < ATA_MAX_QUEUE) { + /* the scmd has an associated qc */ + if (!(qc->flags & ATA_QCFLAG_FAILED)) { + /* which hasn't failed yet, timeout */ + qc->err_mask |= AC_ERR_TIMEOUT; + qc->flags |= ATA_QCFLAG_FAILED; + nr_timedout++; + } + } else { + /* Normal completion occurred after + * SCSI timeout but before this point. + * Successfully complete it. + */ + scmd->retries = scmd->allowed; + scsi_eh_finish_cmd(scmd, &ap->eh_done_q); + } + } + + /* If we have timed out qcs. They belong to EH from + * this point but the state of the controller is + * unknown. Freeze the port to make sure the IRQ + * handler doesn't diddle with those qcs. This must + * be done atomically w.r.t. setting QCFLAG_FAILED. + */ + if (nr_timedout) + __ata_port_freeze(ap); + + spin_unlock_irqrestore(hs_lock, flags); + } else + spin_unlock_wait(hs_lock); + + repeat: + /* invoke error handler */ + if (ap->ops->error_handler) { + /* clear EH pending */ + spin_lock_irqsave(hs_lock, flags); + ap->flags &= ~ATA_FLAG_EH_PENDING; + spin_unlock_irqrestore(hs_lock, flags); + + /* invoke EH */ + ap->ops->error_handler(ap); + + /* Exception might have happend after ->error_handler + * recovered the port but before this point. Repeat + * EH in such case. + */ + spin_lock_irqsave(hs_lock, flags); + + if (ap->flags & ATA_FLAG_EH_PENDING) { + if (--repeat_cnt) { + ata_port_printk(ap, KERN_INFO, + "EH pending after completion, " + "repeating EH (cnt=%d)\n", repeat_cnt); + spin_unlock_irqrestore(hs_lock, flags); + goto repeat; + } + ata_port_printk(ap, KERN_ERR, "EH pending after %d " + "tries, giving up\n", ATA_EH_MAX_REPEAT); + } - ap->ops->eng_timeout(ap); + /* Clear host_eh_scheduled while holding hs_lock such + * that if exception occurs after this point but + * before EH completion, SCSI midlayer will + * re-initiate EH. + */ + host->host_eh_scheduled = 0; + + spin_unlock_irqrestore(hs_lock, flags); + } else { + WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL); + ap->ops->eng_timeout(ap); + } + /* finish or retry handled scmd's and clean up */ WARN_ON(host->host_failed || !list_empty(&host->eh_cmd_q)); scsi_eh_flush_done_q(&ap->eh_done_q); + /* clean up */ + spin_lock_irqsave(hs_lock, flags); + + if (ap->flags & ATA_FLAG_RECOVERED) + ata_port_printk(ap, KERN_INFO, "EH complete\n"); + ap->flags &= ~ATA_FLAG_RECOVERED; + + spin_unlock_irqrestore(hs_lock, flags); + DPRINTK("EXIT\n"); } @@ -133,6 +255,8 @@ void ata_scsi_error(struct Scsi_Host *host) * an interrupt was not delivered to the driver, even though the * transaction completed successfully. * + * TODO: kill this function once old EH is gone. + * * LOCKING: * Inherited from SCSI layer (none, can sleep) */ @@ -198,6 +322,8 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc) * an interrupt was not delivered to the driver, even though the * transaction completed successfully. * + * TODO: kill this function once old EH is gone. + * * LOCKING: * Inherited from SCSI layer (none, can sleep) */ diff --git a/include/linux/libata.h b/include/linux/libata.h index 6758b4d374a..5ad50163c8e 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -225,6 +225,9 @@ enum { ATA_PORT_PRIMARY = (1 << 0), ATA_PORT_SECONDARY = (1 << 1), + /* max repeat if error condition is still set after ->error_handler */ + ATA_EH_MAX_REPEAT = 5, + /* how hard are we gonna try to probe/recover devices */ ATA_PROBE_MAX_TRIES = 3, };