/[linux-patches]/genpatches-2.6/trunk/2.6.19/4105_dm-bbr.patch
Gentoo

Contents of /genpatches-2.6/trunk/2.6.19/4105_dm-bbr.patch

Parent Directory Parent Directory | Revision Log Revision Log


Revision 274 - (hide annotations) (download)
Sun Jan 22 14:07:02 2006 UTC (12 years, 8 months ago) by phreak
Original Path: genpatches-2.6/trunk/2.6.16-pre/4305_dm-bbr.patch
File size: 32133 byte(s)
Rediffing 4305_dm-bbr.patch
1 phreak 274 Index: linux-git/drivers/md/dm-bbr.c
2     ===================================================================
3     --- /dev/null
4     +++ linux-git/drivers/md/dm-bbr.c
5 dsd 7 @@ -0,0 +1,1003 @@
6     +/*
7     + * (C) Copyright IBM Corp. 2002, 2004
8     + *
9     + * This program is free software; you can redistribute it and/or modify
10     + * it under the terms of the GNU General Public License as published by
11     + * the Free Software Foundation; either version 2 of the License, or
12     + * (at your option) any later version.
13     + *
14     + * This program is distributed in the hope that it will be useful,
15     + * but WITHOUT ANY WARRANTY; without even the implied warranty of
16     + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17     + * the GNU General Public License for more details.
18     + *
19     + * You should have received a copy of the GNU General Public License
20     + * along with this program; if not, write to the Free Software
21     + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22     + *
23     + * linux/drivers/md/dm-bbr.c
24     + *
25     + * Bad-block-relocation (BBR) target for device-mapper.
26     + *
27     + * The BBR target is designed to remap I/O write failures to another safe
28     + * location on disk. Note that most disk drives have BBR built into them,
29     + * this means that our software BBR will be only activated when all hardware
30     + * BBR replacement sectors have been used.
31     + */
32     +
33     +#include <linux/module.h>
34     +#include <linux/init.h>
35     +#include <linux/bio.h>
36     +#include <linux/spinlock.h>
37     +#include <linux/slab.h>
38     +#include <linux/mempool.h>
39     +#include <linux/workqueue.h>
40     +#include <linux/vmalloc.h>
41     +
42     +#include "dm.h"
43     +#include "dm-bio-list.h"
44     +#include "dm-bio-record.h"
45     +#include "dm-bbr.h"
46     +#include "dm-io.h"
47     +
/* Size of one sector in bytes. */
#define SECTOR_SIZE (1 << SECTOR_SHIFT)

/* Work-queue on which remapped/failed bios are processed synchronously. */
static struct workqueue_struct *dm_bbr_wq = NULL;
static void bbr_remap_handler(void *data);
/* Slab cache for runtime remap-tree nodes. */
static kmem_cache_t *bbr_remap_cache;
/* Slab cache and mempool backing the per-bio dm_bio_details snapshots. */
static kmem_cache_t *bbr_io_cache;
static mempool_t *bbr_io_pool;
55     +
/**
 * bbr_binary_tree_destroy
 *
 * Destroy the binary tree.
 *
 * Frees every node without recursion: repeatedly descend from the root
 * to a leaf, free that leaf, clear the parent's link to it, and restart
 * from the root until the root itself is freed.  Needs no stack and no
 * parent pointers, at the cost of re-walking from the root per node.
 **/
static void bbr_binary_tree_destroy(struct bbr_runtime_remap *root)
{
	/* 'link' remembers the parent's pointer to the node most recently
	 * descended into, so it can be NULLed after that node is freed. */
	struct bbr_runtime_remap **link = NULL;
	struct bbr_runtime_remap *node = root;

	while (node) {
		if (node->left) {
			link = &(node->left);
			node = node->left;
			continue;
		}
		if (node->right) {
			link = &(node->right);
			node = node->right;
			continue;
		}

		/* 'node' is now a leaf: release it. */
		kmem_cache_free(bbr_remap_cache, node);
		if (node == root) {
			/* If root is deleted, we're done. */
			break;
		}

		/* Back to root. */
		node = root;
		*link = NULL;
	}
}
89     +
90     +static void bbr_free_remap(struct bbr_private *bbr_id)
91     +{
92     + spin_lock_irq(&bbr_id->remap_root_lock);
93     + bbr_binary_tree_destroy(bbr_id->remap_root);
94     + bbr_id->remap_root = NULL;
95     + spin_unlock_irq(&bbr_id->remap_root_lock);
96     +}
97     +
98     +static struct bbr_private *bbr_alloc_private(void)
99     +{
100     + struct bbr_private *bbr_id;
101     +
102     + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
103     + if (bbr_id) {
104     + memset(bbr_id, 0, sizeof(*bbr_id));
105     + INIT_WORK(&bbr_id->remap_work, bbr_remap_handler, bbr_id);
106     + bbr_id->remap_root_lock = SPIN_LOCK_UNLOCKED;
107     + bbr_id->remap_ios_lock = SPIN_LOCK_UNLOCKED;
108     + bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
109     + }
110     +
111     + return bbr_id;
112     +}
113     +
114     +static void bbr_free_private(struct bbr_private *bbr_id)
115     +{
116     + if (bbr_id->bbr_table) {
117     + vfree(bbr_id->bbr_table);
118     + }
119     + bbr_free_remap(bbr_id);
120     + kfree(bbr_id);
121     +}
122     +
123     +static u32 crc_table[256];
124     +static u32 crc_table_built = 0;
125     +
126     +static void build_crc_table(void)
127     +{
128     + u32 i, j, crc;
129     +
130     + for (i = 0; i <= 255; i++) {
131     + crc = i;
132     + for (j = 8; j > 0; j--) {
133     + if (crc & 1)
134     + crc = (crc >> 1) ^ CRC_POLYNOMIAL;
135     + else
136     + crc >>= 1;
137     + }
138     + crc_table[i] = crc;
139     + }
140     + crc_table_built = 1;
141     +}
142     +
143     +static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
144     +{
145     + unsigned char *current_byte;
146     + u32 temp1, temp2, i;
147     +
148     + current_byte = (unsigned char *) buffer;
149     + /* Make sure the crc table is available */
150     + if (!crc_table_built)
151     + build_crc_table();
152     + /* Process each byte in the buffer. */
153     + for (i = 0; i < buffersize; i++) {
154     + temp1 = (crc >> 8) & 0x00FFFFFF;
155     + temp2 = crc_table[(crc ^ (u32) * current_byte) &
156     + (u32) 0xff];
157     + current_byte++;
158     + crc = temp1 ^ temp2;
159     + }
160     + return crc;
161     +}
162     +
163     +/**
164     + * le_bbr_table_sector_to_cpu
165     + *
166     + * Convert bbr meta data from on-disk (LE) format
167     + * to the native cpu endian format.
168     + **/
169     +static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
170     +{
171     + int i;
172     + p->signature = le32_to_cpup(&p->signature);
173     + p->crc = le32_to_cpup(&p->crc);
174     + p->sequence_number = le32_to_cpup(&p->sequence_number);
175     + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
176     + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
177     + p->entries[i].bad_sect =
178     + le64_to_cpup(&p->entries[i].bad_sect);
179     + p->entries[i].replacement_sect =
180     + le64_to_cpup(&p->entries[i].replacement_sect);
181     + }
182     +}
183     +
184     +/**
185     + * cpu_bbr_table_sector_to_le
186     + *
187     + * Convert bbr meta data from cpu endian format to on-disk (LE) format
188     + **/
189     +static void cpu_bbr_table_sector_to_le(struct bbr_table *p,
190     + struct bbr_table *le)
191     +{
192     + int i;
193     + le->signature = cpu_to_le32p(&p->signature);
194     + le->crc = cpu_to_le32p(&p->crc);
195     + le->sequence_number = cpu_to_le32p(&p->sequence_number);
196     + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
197     + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
198     + le->entries[i].bad_sect =
199     + cpu_to_le64p(&p->entries[i].bad_sect);
200     + le->entries[i].replacement_sect =
201     + cpu_to_le64p(&p->entries[i].replacement_sect);
202     + }
203     +}
204     +
/**
 * validate_bbr_table_sector
 *
 * Check the specified BBR table sector for a valid signature and CRC. If it's
 * valid, endian-convert the table sector.
 *
 * Returns 0 on success, -EINVAL if the signature is wrong, the CRC field
 * is zero, or the computed CRC does not match the stored one.  On success
 * the whole sector (crc field included) is left in CPU endianness.
 **/
static int validate_bbr_table_sector(struct bbr_table *p)
{
	int rc = 0;
	/* NOTE(review): these hold u32 CRC values in plain ints; works on
	 * the targets this supports, but u32 would be the cleaner type. */
	int org_crc, final_crc;

	if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
		DMERR("dm-bbr: BBR table signature doesn't match!");
		DMERR("dm-bbr: Found 0x%x. Expecting 0x%x",
		      le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
		rc = -EINVAL;
		goto out;
	}

	if (!p->crc) {
		DMERR("dm-bbr: BBR table sector has no CRC!");
		rc = -EINVAL;
		goto out;
	}

	/* The stored CRC was computed over the sector with its crc field
	 * zeroed, so save the on-disk value and clear it before
	 * recomputing. */
	org_crc = le32_to_cpup(&p->crc);
	p->crc = 0;
	final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
	if (final_crc != org_crc) {
		DMERR("dm-bbr: CRC failed!");
		DMERR("dm-bbr: Found 0x%x. Expecting 0x%x",
		      org_crc, final_crc);
		rc = -EINVAL;
		goto out;
	}

	/* Put the crc back in LE form so the full-sector conversion below
	 * leaves every field, crc included, in CPU endianness. */
	p->crc = cpu_to_le32p(&org_crc);
	le_bbr_table_sector_to_cpu(p);

out:
	return rc;
}
247     +
248     +/**
249     + * bbr_binary_tree_insert
250     + *
251     + * Insert a node into the binary tree.
252     + **/
253     +static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
254     + struct bbr_runtime_remap *newnode)
255     +{
256     + struct bbr_runtime_remap **node = root;
257     + while (node && *node) {
258     + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
259     + node = &((*node)->right);
260     + } else {
261     + node = &((*node)->left);
262     + }
263     + }
264     +
265     + newnode->left = newnode->right = NULL;
266     + *node = newnode;
267     +}
268     +
269     +/**
270     + * bbr_binary_search
271     + *
272     + * Search for a node that contains bad_sect == lsn.
273     + **/
274     +static struct bbr_runtime_remap *bbr_binary_search(
275     + struct bbr_runtime_remap *root,
276     + u64 lsn)
277     +{
278     + struct bbr_runtime_remap *node = root;
279     + while (node) {
280     + if (node->remap.bad_sect == lsn) {
281     + break;
282     + }
283     + if (lsn > node->remap.bad_sect) {
284     + node = node->right;
285     + } else {
286     + node = node->left;
287     + }
288     + }
289     + return node;
290     +}
291     +
292     +/**
293     + * bbr_insert_remap_entry
294     + *
295     + * Create a new remap entry and add it to the binary tree for this node.
296     + **/
297     +static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
298     + struct bbr_table_entry *new_bbr_entry)
299     +{
300     + struct bbr_runtime_remap *newnode;
301     +
302     + newnode = kmem_cache_alloc(bbr_remap_cache, GFP_NOIO);
303     + if (!newnode) {
304     + DMERR("dm-bbr: Could not allocate from remap cache!");
305     + return -ENOMEM;
306     + }
307     + newnode->remap.bad_sect = new_bbr_entry->bad_sect;
308     + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
309     + spin_lock_irq(&bbr_id->remap_root_lock);
310     + bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
311     + spin_unlock_irq(&bbr_id->remap_root_lock);
312     + return 0;
313     +}
314     +
/**
 * bbr_table_to_remap_list
 *
 * The on-disk bbr table is sorted by the replacement sector LBA. In order to
 * improve run time performance, the in memory remap list must be sorted by
 * the bad sector LBA. This function is called at discovery time to initialize
 * the remap list. This function assumes that at least one copy of meta data
 * is valid.
 *
 * Returns the total number of replacement blocks currently in use.
 *
 * NOTE(review): the -ENOMEM return of bbr_insert_remap_entry() is ignored
 * here, so an allocation failure silently drops a remap entry — confirm
 * whether discovery should fail instead.
 **/
static u32 bbr_table_to_remap_list(struct bbr_private *bbr_id)
{
	u32 in_use_blks = 0;
	int i, j;
	struct bbr_table *p;

	for (i = 0, p = bbr_id->bbr_table;
	     i < bbr_id->nr_sects_bbr_table;
	     i++, p++) {
		/* Table sectors are filled in order; the first sector with
		 * a zero in_use_cnt ends the list. */
		if (!p->in_use_cnt) {
			break;
		}
		in_use_blks += p->in_use_cnt;
		for (j = 0; j < p->in_use_cnt; j++) {
			bbr_insert_remap_entry(bbr_id, &p->entries[j]);
		}
	}
	if (in_use_blks) {
		char b[32];
		DMWARN("dm-bbr: There are %u BBR entries for device %s",
		       in_use_blks, format_dev_t(b, bbr_id->dev->bdev->bd_dev));
	}

	return in_use_blks;
}
349     +
350     +/**
351     + * bbr_search_remap_entry
352     + *
353     + * Search remap entry for the specified sector. If found, return a pointer to
354     + * the table entry. Otherwise, return NULL.
355     + **/
356     +static struct bbr_table_entry *bbr_search_remap_entry(
357     + struct bbr_private *bbr_id,
358     + u64 lsn)
359     +{
360     + struct bbr_runtime_remap *p;
361     +
362     + spin_lock_irq(&bbr_id->remap_root_lock);
363     + p = bbr_binary_search(bbr_id->remap_root, lsn);
364     + spin_unlock_irq(&bbr_id->remap_root_lock);
365     + if (p) {
366     + return (&p->remap);
367     + } else {
368     + return NULL;
369     + }
370     +}
371     +
372     +/**
373     + * bbr_remap
374     + *
375     + * If *lsn is in the remap table, return TRUE and modify *lsn,
376     + * else, return FALSE.
377     + **/
378     +static inline int bbr_remap(struct bbr_private *bbr_id,
379     + u64 *lsn)
380     +{
381     + struct bbr_table_entry *e;
382     +
383     + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
384     + e = bbr_search_remap_entry(bbr_id, *lsn);
385     + if (e) {
386     + *lsn = e->replacement_sect;
387     + return 1;
388     + }
389     + }
390     + return 0;
391     +}
392     +
393     +/**
394     + * bbr_remap_probe
395     + *
396     + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
397     + * table return TRUE, Else, return FALSE.
398     + **/
399     +static inline int bbr_remap_probe(struct bbr_private *bbr_id,
400     + u64 lsn, u64 nr_sects)
401     +{
402     + u64 tmp, cnt;
403     +
404     + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
405     + for (cnt = 0, tmp = lsn;
406     + cnt < nr_sects;
407     + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
408     + if (bbr_remap(bbr_id,&tmp)) {
409     + return 1;
410     + }
411     + }
412     + }
413     + return 0;
414     +}
415     +
/**
 * bbr_setup
 *
 * Read the remap tables from disk and set up the initial remap tree.
 *
 * Each table sector is read from the primary location (lba_table1); if
 * that read fails and a second copy exists (lba_table2), the same sector
 * is retried from the mirror.  Every sector must pass signature/CRC
 * validation or the whole setup fails.  Returns 0 on success, negative
 * errno otherwise.
 **/
static int bbr_setup(struct bbr_private *bbr_id)
{
	struct bbr_table *table = bbr_id->bbr_table;
	struct io_region job;
	unsigned long error;
	int i, rc = 0;

	job.bdev = bbr_id->dev->bdev;
	job.count = 1;

	/* Read and verify each BBR table sector individually. */
	for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
		job.sector = bbr_id->lba_table1 + i;
		rc = dm_io_sync_vm(1, &job, READ, table, &error);
		if (rc && bbr_id->lba_table2) {
			/* Primary copy failed; fall back to the mirror. */
			job.sector = bbr_id->lba_table2 + i;
			rc = dm_io_sync_vm(1, &job, READ, table, &error);
		}
		if (rc) {
			goto out;
		}

		rc = validate_bbr_table_sector(table);
		if (rc) {
			goto out;
		}
	}
	/* Build the runtime tree and publish the in-use count. */
	atomic_set(&bbr_id->in_use_replacement_blks,
		   bbr_table_to_remap_list(bbr_id));

out:
	if (rc) {
		DMERR("dm-bbr: error during device setup: %d", rc);
	}
	return rc;
}
457     +
/**
 * bbr_io_remap_error
 * @bbr_id: Private data for the BBR node.
 * @rw: READ or WRITE.
 * @starting_lsn: Starting sector of request to remap.
 * @count: Number of sectors in the request.
 * @page: Page containing the data for the request.
 * @offset: Byte-offset of the data within the page.
 *
 * For the requested range, try to write each sector individually. For each
 * sector that fails, find the next available remap location and write the
 * data to that new location. Then update the table and write both copies
 * of the table to disk. Finally, update the in-memory mapping and do any
 * other necessary bookkeeping.
 *
 * Returns 0 on success, -EIO when the replacement area is exhausted, or
 * the error from a failed table write / tree insert.
 *
 * NOTE(review): the same 'rw' is reused for the relocation I/O; the
 * header describes a write path — confirm the READ case is intentional.
 **/
static int bbr_io_remap_error(struct bbr_private *bbr_id,
			      int rw,
			      u64 starting_lsn,
			      u64 count,
			      struct page *page,
			      unsigned int offset)
{
	struct bbr_table *bbr_table;
	struct io_region job;
	struct page_list pl;
	unsigned long table_sector_index;
	unsigned long table_sector_offset;
	unsigned long index;
	unsigned long error;
	u64 lsn, new_lsn;
	char b[32];
	int rc;

	job.bdev = bbr_id->dev->bdev;
	job.count = 1;
	pl.page = page;
	pl.next = NULL;

	/* For each sector in the request. */
	for (lsn = 0; lsn < count; lsn++, offset += SECTOR_SIZE) {
		job.sector = starting_lsn + lsn;
		rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
		/* Keep retrying this sector with fresh replacement
		 * locations until one succeeds or we run out. */
		while (rc) {
			/* Find the next available relocation sector. */
			new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
			if (new_lsn >= bbr_id->nr_replacement_blks) {
				/* No more replacement sectors available. */
				return -EIO;
			}
			new_lsn += bbr_id->start_replacement_sect;

			/* Write the data to its new location. */
			DMWARN("dm-bbr: device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
			       format_dev_t(b, bbr_id->dev->bdev->bd_dev),
			       starting_lsn + lsn, new_lsn);
			job.sector = new_lsn;
			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
			if (rc) {
				/* This replacement sector is bad.
				 * Try the next one.
				 */
				DMERR("dm-bbr: device %s: replacement sector "PFU64" is bad. Skipping.",
				      format_dev_t(b, bbr_id->dev->bdev->bd_dev), new_lsn);
				atomic_inc(&bbr_id->in_use_replacement_blks);
				continue;
			}

			/* Add this new entry to the on-disk table. */
			table_sector_index = new_lsn -
					     bbr_id->start_replacement_sect;
			table_sector_offset = table_sector_index /
					      BBR_ENTRIES_PER_SECT;
			index = table_sector_index % BBR_ENTRIES_PER_SECT;

			bbr_table = &bbr_id->bbr_table[table_sector_offset];
			bbr_table->entries[index].bad_sect = starting_lsn + lsn;
			bbr_table->entries[index].replacement_sect = new_lsn;
			bbr_table->in_use_cnt++;
			bbr_table->sequence_number++;
			/* CRC is computed with the crc field zeroed. */
			bbr_table->crc = 0;
			bbr_table->crc = calculate_crc(INITIAL_CRC,
						       bbr_table,
						       sizeof(struct bbr_table));

			/* Write the table to disk (converted to LE in
			 * place, then converted back afterwards). */
			cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
			if (bbr_id->lba_table1) {
				job.sector = bbr_id->lba_table1 + table_sector_offset;
				rc = dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
			}
			if (bbr_id->lba_table2) {
				job.sector = bbr_id->lba_table2 + table_sector_offset;
				rc |= dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
			}
			le_bbr_table_sector_to_cpu(bbr_table);

			if (rc) {
				/* Error writing one of the tables to disk. */
				DMERR("dm-bbr: device %s: error updating BBR tables on disk.",
				      format_dev_t(b, bbr_id->dev->bdev->bd_dev));
				return rc;
			}

			/* Insert a new entry in the remapping binary-tree. */
			rc = bbr_insert_remap_entry(bbr_id,
						    &bbr_table->entries[index]);
			if (rc) {
				DMERR("dm-bbr: device %s: error adding new entry to remap tree.",
				      format_dev_t(b, bbr_id->dev->bdev->bd_dev));
				return rc;
			}

			atomic_inc(&bbr_id->in_use_replacement_blks);
		}
	}

	return 0;
}
576     +
/**
 * bbr_io_process_request
 *
 * For each sector in this request, check if the sector has already
 * been remapped. If so, process all previous sectors in the request,
 * followed by the remapped sector. Then reset the starting lsn and
 * count, and keep going with the rest of the request as if it were
 * a whole new request. If any of the sync_io's return an error,
 * call the remapper to relocate the bad sector(s).
 *
 * Returns 0 on success or the first unrecoverable I/O error.
 *
 * 2.5 Note: When switching over to bio's for the I/O path, we have made
 * the assumption that the I/O request described by the bio is one
 * virtually contiguous piece of memory (even though the bio vector
 * describes it using a series of physical page addresses).
 **/
static int bbr_io_process_request(struct bbr_private *bbr_id,
				  struct bio *bio)
{
	struct io_region job;
	u64 starting_lsn = bio->bi_sector;
	u64 count, lsn, remapped_lsn;
	struct page_list pl;
	unsigned int offset;
	unsigned long error;
	int i, rw = bio_data_dir(bio);
	int rc = 0;

	job.bdev = bbr_id->dev->bdev;
	pl.next = NULL;

	/* Each bio can contain multiple vectors, each with a different page.
	 * Treat each vector as a separate request.
	 */
	/* KMC: Is this the right way to walk the bvec list? */
	/* NOTE(review): bio->bi_idx is advanced destructively here; the
	 * bio is completed by the caller afterwards, so it is presumably
	 * never resubmitted — confirm. */
	for (i = 0;
	     i < bio->bi_vcnt;
	     i++, bio->bi_idx++, starting_lsn += count) {

		/* Bvec info: number of sectors, page,
		 * and byte-offset within page.
		 */
		count = bio_iovec(bio)->bv_len >> SECTOR_SHIFT;
		pl.page = bio_iovec(bio)->bv_page;
		offset = bio_iovec(bio)->bv_offset;

		/* For each sector in this bvec, check if the sector has
		 * already been remapped. If so, process all previous sectors
		 * in this request, followed by the remapped sector. Then reset
		 * the starting lsn and count and keep going with the rest of
		 * the request as if it were a whole new request.
		 */
		for (lsn = 0; lsn < count; lsn++) {
			remapped_lsn = starting_lsn + lsn;
			rc = bbr_remap(bbr_id, &remapped_lsn);
			if (!rc) {
				/* This sector is fine. */
				continue;
			}

			/* Process all sectors in the request up to this one. */
			if (lsn > 0) {
				job.sector = starting_lsn;
				job.count = lsn;
				rc = dm_io_sync(1, &job, rw, &pl,
						offset, &error);
				if (rc) {
					/* If this I/O failed, then one of the
					 * sectors in this request needs to be
					 * relocated.
					 */
					rc = bbr_io_remap_error(bbr_id, rw,
								starting_lsn,
								lsn, pl.page,
								offset);
					if (rc) {
						/* KMC: Return? Or continue to next bvec? */
						return rc;
					}
				}
				offset += (lsn << SECTOR_SHIFT);
			}

			/* Process the remapped sector. */
			job.sector = remapped_lsn;
			job.count = 1;
			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
			if (rc) {
				/* BUGBUG - Need more processing if this caused
				 * an error. If this I/O failed, then the
				 * existing remap is now bad, and we need to
				 * find a new remap. Can't use
				 * bbr_io_remap_error(), because the existing
				 * map entry needs to be changed, not added
				 * again, and the original table entry also
				 * needs to be changed.
				 */
				return rc;
			}

			/* Restart the per-sector scan just past the sector
			 * handled above: shrink the remaining window and let
			 * the loop increment bring lsn back to 0. */
			starting_lsn += (lsn + 1);
			count -= (lsn + 1);
			lsn = -1;
			offset += SECTOR_SIZE;
		}

		/* Check for any remaining sectors after the last split. This
		 * could potentially be the whole request, but that should be a
		 * rare case because requests should only be processed by the
		 * thread if we know an error occurred or they contained one or
		 * more remapped sectors.
		 */
		if (count) {
			job.sector = starting_lsn;
			job.count = count;
			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
			if (rc) {
				/* If this I/O failed, then one of the sectors
				 * in this request needs to be relocated.
				 */
				rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
							count, pl.page, offset);
				if (rc) {
					/* KMC: Return? Or continue to next bvec? */
					return rc;
				}
			}
		}
	}

	return 0;
}
708     +
709     +static void bbr_io_process_requests(struct bbr_private *bbr_id,
710     + struct bio *bio)
711     +{
712     + struct bio *next;
713     + int rc;
714     +
715     + while (bio) {
716     + next = bio->bi_next;
717     + bio->bi_next = NULL;
718     +
719     + rc = bbr_io_process_request(bbr_id, bio);
720     +
721     + bio_endio(bio, bio->bi_size, rc);
722     +
723     + bio = next;
724     + }
725     +}
726     +
727     +/**
728     + * bbr_remap_handler
729     + *
730     + * This is the handler for the bbr work-queue.
731     + *
732     + * I/O requests should only be sent to this handler if we know that:
733     + * a) the request contains at least one remapped sector.
734     + * or
735     + * b) the request caused an error on the normal I/O path.
736     + *
737     + * This function uses synchronous I/O, so sending a request to this
738     + * thread that doesn't need special processing will cause severe
739     + * performance degredation.
740     + **/
741     +static void bbr_remap_handler(void *data)
742     +{
743     + struct bbr_private *bbr_id = data;
744     + struct bio *bio;
745     + unsigned long flags;
746     +
747     + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
748     + bio = bio_list_get(&bbr_id->remap_ios);
749     + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
750     +
751     + bbr_io_process_requests(bbr_id, bio);
752     +}
753     +
/**
 * bbr_endio
 *
 * This is the callback for normal write requests. Check for an error
 * during the I/O, and send to the thread for processing if necessary.
 *
 * Returns the (possibly rewritten) error code; on failure with a saved
 * bio record it returns 1 after queuing the bio for retry —
 * NOTE(review): confirm 1 means "I/O still in flight" for this kernel's
 * dm end_io convention.
 **/
static int bbr_endio(struct dm_target *ti, struct bio *bio,
		     int error, union map_info *map_context)
{
	struct bbr_private *bbr_id = ti->private;
	struct dm_bio_details *bbr_io = map_context->ptr;

	if (error && bbr_io) {
		unsigned long flags;
		char b[32];

		/* Rewind the bio to its pre-map state so the remap thread
		 * can process it from scratch. */
		dm_bio_restore(bbr_io, bio);
		map_context->ptr = NULL;

		DMERR("dm-bbr: device %s: I/O failure on sector %lu. "
		      "Scheduling for retry.",
		      format_dev_t(b, bbr_id->dev->bdev->bd_dev),
		      (unsigned long)bio->bi_sector);

		spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
		bio_list_add(&bbr_id->remap_ios, bio);
		spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);

		queue_work(dm_bbr_wq, &bbr_id->remap_work);

		error = 1;
	}

	/* The bio-details snapshot is only needed for one round trip. */
	if (bbr_io)
		mempool_free(bbr_io, bbr_io_pool);

	return error;
}
792     +
/**
 * bbr_ctr
 *
 * Construct a bbr mapping.
 *
 * argv layout: device offset table1_lsn table2_lsn table_size
 *              start_replacement nr_replacement_blks block_size
 *
 * NOTE(review): the numeric arguments are parsed with simple_strtoull()
 * but 'end' is never checked, so malformed numbers silently parse as 0 —
 * confirm whether stricter validation is wanted.
 **/
static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct bbr_private *bbr_id;
	unsigned long block_size;
	char *end;
	int rc = -EINVAL;

	if (argc != 8) {
		ti->error = "dm-bbr requires exactly 8 arguments: "
			"device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
		goto out1;
	}

	bbr_id = bbr_alloc_private();
	if (!bbr_id) {
		ti->error = "dm-bbr: Error allocating bbr private data.";
		goto out1;
	}

	bbr_id->offset = simple_strtoull(argv[1], &end, 10);
	bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
	bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
	bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
	bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
	bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
	block_size = simple_strtoul(argv[7], &end, 10);
	bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);

	/* One struct bbr_table per table sector. */
	bbr_id->bbr_table = vmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT);
	if (!bbr_id->bbr_table) {
		ti->error = "dm-bbr: Error allocating bbr table.";
		goto out2;
	}

	if (dm_get_device(ti, argv[0], 0, ti->len,
			  dm_table_get_mode(ti->table), &bbr_id->dev)) {
		ti->error = "dm-bbr: Device lookup failed";
		goto out2;
	}

	rc = bbr_setup(bbr_id);
	if (rc) {
		ti->error = "dm-bbr: Device setup failed";
		goto out3;
	}

	ti->private = bbr_id;
	return 0;

out3:
	dm_put_device(ti, bbr_id->dev);
out2:
	/* Frees the table, the remap tree, and bbr_id itself. */
	bbr_free_private(bbr_id);
out1:
	return rc;
}
852     +
853     +static void bbr_dtr(struct dm_target *ti)
854     +{
855     + struct bbr_private *bbr_id = ti->private;
856     +
857     + dm_put_device(ti, bbr_id->dev);
858     + bbr_free_private(bbr_id);
859     +}
860     +
/**
 * bbr_map
 *
 * Map a bio for this target.  Fast path: when no sector of the request
 * is remapped, redirect the bio to the underlying device and record its
 * pre-map state so bbr_endio() can retry it on failure (return 1: dm
 * submits the remapped bio).  Slow path: queue the bio to the work-queue
 * for synchronous processing (return 0: dm considers it dispatched).
 **/
static int bbr_map(struct dm_target *ti, struct bio *bio,
		   union map_info *map_context)
{
	struct bbr_private *bbr_id = ti->private;
	struct dm_bio_details *bbr_io;
	unsigned long flags;
	int rc = 1;

	bio->bi_sector += bbr_id->offset;

	if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
	    !bbr_remap_probe(bbr_id, bio->bi_sector, bio_sectors(bio))) {
		/* No existing remaps or this request doesn't
		 * contain any remapped sectors.
		 */
		bio->bi_bdev = bbr_id->dev->bdev;

		/* Snapshot the bio so bbr_endio() can restore and retry
		 * it if submission fails. */
		bbr_io = mempool_alloc(bbr_io_pool, GFP_NOIO);
		dm_bio_record(bbr_io, bio);
		map_context->ptr = bbr_io;
	} else {
		/* This request has at least one remapped sector.
		 * Give it to the work-queue for processing.
		 */
		map_context->ptr = NULL;
		spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
		bio_list_add(&bbr_id->remap_ios, bio);
		spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);

		queue_work(dm_bbr_wq, &bbr_id->remap_work);
		rc = 0;
	}

	return rc;
}
896     +
/**
 * bbr_status
 *
 * Report target status.  INFO yields an empty string; TABLE emits the
 * same argument line the constructor accepts, with the block size
 * converted back from sectors to bytes.  Always returns 0.
 **/
static int bbr_status(struct dm_target *ti, status_type_t type,
		      char *result, unsigned int maxlen)
{
	struct bbr_private *bbr_id = ti->private;
	char b[BDEVNAME_SIZE];

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
			 format_dev_t(b, bbr_id->dev->bdev->bd_dev),
			 bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
			 bbr_id->nr_sects_bbr_table,
			 bbr_id->start_replacement_sect,
			 bbr_id->nr_replacement_blks,
			 bbr_id->blksize_in_sects << SECTOR_SHIFT);
		break;
	}
	return 0;
}
920     +
/* device-mapper registration for the "bbr" target type. */
static struct target_type bbr_target = {
	.name	= "bbr",
	.version= {1, 0, 1},
	.module	= THIS_MODULE,
	.ctr	= bbr_ctr,
	.dtr	= bbr_dtr,
	.map	= bbr_map,
	.end_io	= bbr_endio,	/* retries failed bios via the work-queue */
	.status	= bbr_status,
};
931     +
932     +int __init dm_bbr_init(void)
933     +{
934     + int rc;
935     +
936     + rc = dm_register_target(&bbr_target);
937     + if (rc) {
938     + DMERR("dm-bbr: error registering target.");
939     + goto err1;
940     + }
941     +
942     + bbr_remap_cache = kmem_cache_create("bbr-remap",
943     + sizeof(struct bbr_runtime_remap),
944     + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
945     + if (!bbr_remap_cache) {
946     + DMERR("dm-bbr: error creating remap cache.");
947     + rc = ENOMEM;
948     + goto err2;
949     + }
950     +
951     + bbr_io_cache = kmem_cache_create("bbr-io", sizeof(struct dm_bio_details),
952     + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
953     + if (!bbr_io_cache) {
954     + DMERR("dm-bbr: error creating io cache.");
955     + rc = ENOMEM;
956     + goto err3;
957     + }
958     +
959     + bbr_io_pool = mempool_create(256, mempool_alloc_slab,
960     + mempool_free_slab, bbr_io_cache);
961     + if (!bbr_io_pool) {
962     + DMERR("dm-bbr: error creating io mempool.");
963     + rc = ENOMEM;
964     + goto err4;
965     + }
966     +
967     + dm_bbr_wq = create_workqueue("dm-bbr");
968     + if (!dm_bbr_wq) {
969     + DMERR("dm-bbr: error creating work-queue.");
970     + rc = ENOMEM;
971     + goto err5;
972     + }
973     +
974     + rc = dm_io_get(1);
975     + if (rc) {
976     + DMERR("dm-bbr: error initializing I/O service.");
977     + goto err6;
978     + }
979     +
980     + return 0;
981     +
982     +err6:
983     + destroy_workqueue(dm_bbr_wq);
984     +err5:
985     + mempool_destroy(bbr_io_pool);
986     +err4:
987     + kmem_cache_destroy(bbr_io_cache);
988     +err3:
989     + kmem_cache_destroy(bbr_remap_cache);
990     +err2:
991     + dm_unregister_target(&bbr_target);
992     +err1:
993     + return rc;
994     +}
995     +
996     +void __exit dm_bbr_exit(void)
997     +{
998     + dm_io_put(1);
999     + destroy_workqueue(dm_bbr_wq);
1000     + mempool_destroy(bbr_io_pool);
1001     + kmem_cache_destroy(bbr_io_cache);
1002     + kmem_cache_destroy(bbr_remap_cache);
1003     + dm_unregister_target(&bbr_target);
1004     +}
1005     +
1006     +module_init(dm_bbr_init);
1007     +module_exit(dm_bbr_exit);
1008     +MODULE_LICENSE("GPL");
1009 phreak 274 Index: linux-git/drivers/md/dm-bbr.h
1010     ===================================================================
1011     --- /dev/null
1012     +++ linux-git/drivers/md/dm-bbr.h
1013 dsd 7 @@ -0,0 +1,125 @@
1014     +/*
1015     + * (C) Copyright IBM Corp. 2002, 2004
1016     + *
1017     + * This program is free software; you can redistribute it and/or modify
1018     + * it under the terms of the GNU General Public License as published by
1019     + * the Free Software Foundation; either version 2 of the License, or
1020     + * (at your option) any later version.
1021     + *
1022     + * This program is distributed in the hope that it will be useful,
1023     + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1024     + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1025     + * the GNU General Public License for more details.
1026     + *
1027     + * You should have received a copy of the GNU General Public License
1028     + * along with this program; if not, write to the Free Software
1029     + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1030     + *
1031     + * linux/drivers/md/dm-bbr.h
1032     + *
1033     + * Bad-block-relocation (BBR) target for device-mapper.
1034     + *
1035     + * The BBR target is designed to remap I/O write failures to another safe
1036     + * location on disk. Note that most disk drives have BBR built into them,
1037     + * this means that our software BBR will be only activated when all hardware
1038     + * BBR replacement sectors have been used.
1039     + */
1040     +
1041     +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
1042     +#define BBR_ENTRIES_PER_SECT 31
1043     +#define INITIAL_CRC 0xFFFFFFFF
1044     +#define CRC_POLYNOMIAL 0xEDB88320L
1045     +
1046     +/**
1047     + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
1048     + * Use these in place of %Ld, %Lu, and %Lx.
1049     + **/
1050     +#if BITS_PER_LONG > 32
1051     +#define PFU64 "%lu"
1052     +#else
1053     +#define PFU64 "%Lu"
1054     +#endif
1055     +
1056     +/**
1057     + * struct bbr_table_entry
1058     + * @bad_sect: LBA of bad location.
1059     + * @replacement_sect: LBA of new location.
1060     + *
1061     + * Structure to describe one BBR remap.
1062     + **/
1063     +struct bbr_table_entry {
1064     + u64 bad_sect;
1065     + u64 replacement_sect;
1066     +};
1067     +
1068     +/**
1069     + * struct bbr_table
1070     + * @signature: Signature on each BBR table sector.
1071     + * @crc: CRC for this table sector.
1072     + * @sequence_number: Used to resolve conflicts when primary and secondary
1073     + * tables do not match.
1074     + * @in_use_cnt: Number of in-use table entries.
1075     + * @entries: Actual table of remaps.
1076     + *
1077     + * Structure to describe each sector of the metadata table. Each sector in this
1078     + * table can describe 31 remapped sectors.
1079     + **/
1080     +struct bbr_table {
1081     + u32 signature;
1082     + u32 crc;
1083     + u32 sequence_number;
1084     + u32 in_use_cnt;
1085     + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
1086     +};
1087     +
1088     +/**
1089     + * struct bbr_runtime_remap
1090     + *
1091     + * Node in the binary tree used to keep track of remaps.
1092     + **/
1093     +struct bbr_runtime_remap {
1094     + struct bbr_table_entry remap;
1095     + struct bbr_runtime_remap *left;
1096     + struct bbr_runtime_remap *right;
1097     +};
1098     +
1099     +/**
1100     + * struct bbr_private
1101     + * @dev: Info about underlying device.
1102     + * @bbr_table: Copy of metadata table.
1103     + * @remap_root: Binary tree containing all remaps.
1104     + * @remap_root_lock: Lock for the binary tree.
1105     + * @remap_work: For adding work items to the work-queue.
1106     + * @remap_ios: List of I/Os for the work-queue to handle.
1107     + * @remap_ios_lock: Lock for the remap_ios list.
1108     + * @offset: LBA of data area.
1109     + * @lba_table1: LBA of primary BBR table.
1110     + * @lba_table2: LBA of secondary BBR table.
1111     + * @nr_sects_bbr_table: Size of each BBR table.
1112     + * @start_replacement_sect: LBA of start of replacement blocks.
1113     + * @nr_replacement_blks: Number of replacement blocks.
1114     + * @blksize_in_sects: Size of each block.
1115     + * @in_use_replacement_blks: Current number of remapped blocks.
1116     + *
1117     + * Private data for each BBR target.
1118     + **/
1119     +struct bbr_private {
1120     + struct dm_dev *dev;
1121     + struct bbr_table *bbr_table;
1122     + struct bbr_runtime_remap *remap_root;
1123     + spinlock_t remap_root_lock;
1124     +
1125     + struct work_struct remap_work;
1126     + struct bio_list remap_ios;
1127     + spinlock_t remap_ios_lock;
1128     +
1129     + u64 offset;
1130     + u64 lba_table1;
1131     + u64 lba_table2;
1132     + u64 nr_sects_bbr_table;
1133     + u64 start_replacement_sect;
1134     + u64 nr_replacement_blks;
1135     + u32 blksize_in_sects;
1136     + atomic_t in_use_replacement_blks;
1137     +};
1138     +
1139 phreak 274 Index: linux-git/drivers/md/Kconfig
1140     ===================================================================
1141     --- linux-git.orig/drivers/md/Kconfig
1142     +++ linux-git/drivers/md/Kconfig
1143 dsd 7 @@ -236,5 +236,16 @@ config DM_MULTIPATH_EMC
1144     ---help---
1145     Multipath support for EMC CX/AX series hardware.
1146    
1147     +config BLK_DEV_DM_BBR
1148     + tristate "Bad Block Relocation Device Target (EXPERIMENTAL)"
1149     + depends on BLK_DEV_DM && EXPERIMENTAL
1150     + ---help---
1151     + Support for devices with software-based bad-block-relocation.
1152     +
1153     + To compile this as a module, choose M here: the module will be
1154     + called dm-bbr.
1155     +
1156     + If unsure, say N.
1157     +
1158     endmenu
1159    
1160 phreak 274 Index: linux-git/drivers/md/Makefile
1161     ===================================================================
1162     --- linux-git.orig/drivers/md/Makefile
1163     +++ linux-git/drivers/md/Makefile
1164     @@ -37,6 +37,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc
1165 dsd 7 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
1166     obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
1167     obj-$(CONFIG_DM_ZERO) += dm-zero.o
1168     +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
1169    
1170     quiet_cmd_unroll = UNROLL $@
1171     cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \

  ViewVC Help
Powered by ViewVC 1.1.20