/[linux-patches]/genpatches-2.6/trunk/2.6.19/4105_dm-bbr.patch
Gentoo

Contents of /genpatches-2.6/trunk/2.6.19/4105_dm-bbr.patch

Parent Directory Parent Directory | Revision Log Revision Log


Revision 534 - (hide annotations) (download)
Fri Jul 7 07:35:20 2006 UTC (12 years, 5 months ago) by phreak
Original Path: genpatches-2.6/trunk/2.6.18-pre/4105_dm-bbr.patch
File size: 32162 byte(s)
Fixing 4105_dm-bbr.patch and 4300_squashfs-3.0.patch for 2.6.18-rc1
1 phreak 366 Index: linux-git/drivers/md/Kconfig
2     ===================================================================
3     --- linux-git.orig/drivers/md/Kconfig
4     +++ linux-git/drivers/md/Kconfig
5 phreak 520 @@ -249,5 +249,16 @@ config DM_MULTIPATH_EMC
6 phreak 366 ---help---
7     Multipath support for EMC CX/AX series hardware.
8    
9     +config BLK_DEV_DM_BBR
10     + tristate "Bad Block Relocation Device Target (EXPERIMENTAL)"
11     + depends on BLK_DEV_DM && EXPERIMENTAL
12     + ---help---
13     + Support for devices with software-based bad-block-relocation.
14     +
15     + To compile this as a module, choose M here: the module will be
16     + called dm-bbr.
17     +
18     + If unsure, say N.
19     +
20     endmenu
21    
22     Index: linux-git/drivers/md/Makefile
23     ===================================================================
24     --- linux-git.orig/drivers/md/Makefile
25     +++ linux-git/drivers/md/Makefile
26 phreak 520 @@ -36,6 +36,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc
27 phreak 366 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
28     obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
29     obj-$(CONFIG_DM_ZERO) += dm-zero.o
30     +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
31    
32     quiet_cmd_unroll = UNROLL $@
33     cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
34 phreak 274 Index: linux-git/drivers/md/dm-bbr.c
35     ===================================================================
36     --- /dev/null
37     +++ linux-git/drivers/md/dm-bbr.c
38 phreak 534 @@ -0,0 +1,1004 @@
39 dsd 7 +/*
40     + * (C) Copyright IBM Corp. 2002, 2004
41     + *
42     + * This program is free software; you can redistribute it and/or modify
43     + * it under the terms of the GNU General Public License as published by
44     + * the Free Software Foundation; either version 2 of the License, or
45     + * (at your option) any later version.
46     + *
47     + * This program is distributed in the hope that it will be useful,
48     + * but WITHOUT ANY WARRANTY; without even the implied warranty of
49     + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
50     + * the GNU General Public License for more details.
51     + *
52     + * You should have received a copy of the GNU General Public License
53     + * along with this program; if not, write to the Free Software
54     + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
55     + *
56     + * linux/drivers/md/dm-bbr.c
57     + *
58     + * Bad-block-relocation (BBR) target for device-mapper.
59     + *
60     + * The BBR target is designed to remap I/O write failures to another safe
61     + * location on disk. Note that most disk drives have BBR built into them,
62     + * this means that our software BBR will be only activated when all hardware
63     + * BBR replacement sectors have been used.
64     + */
65     +
66     +#include <linux/module.h>
67     +#include <linux/init.h>
68     +#include <linux/bio.h>
69     +#include <linux/spinlock.h>
70     +#include <linux/slab.h>
71     +#include <linux/mempool.h>
72     +#include <linux/workqueue.h>
73     +#include <linux/vmalloc.h>
74     +
75     +#include "dm.h"
76     +#include "dm-bio-list.h"
77     +#include "dm-bio-record.h"
78     +#include "dm-bbr.h"
79     +#include "dm-io.h"
80     +
81 phreak 534 +#define DM_MSG_PREFIX "bbr"
82 dsd 7 +#define SECTOR_SIZE (1 << SECTOR_SHIFT)
83     +
84     +static struct workqueue_struct *dm_bbr_wq = NULL;
85     +static void bbr_remap_handler(void *data);
86     +static kmem_cache_t *bbr_remap_cache;
87     +static kmem_cache_t *bbr_io_cache;
88     +static mempool_t *bbr_io_pool;
89     +
/**
 * bbr_binary_tree_destroy
 *
 * Destroy the binary tree.
 *
 * Iterative tear-down: repeatedly walk from the root down to a leaf,
 * free that leaf, clear the parent pointer that led to it, and start
 * over from the root.  Recursion is avoided so kernel-stack usage stays
 * bounded regardless of tree depth.
 **/
static void bbr_binary_tree_destroy(struct bbr_runtime_remap *root)
{
	/* Parent link that points at the node most recently descended to. */
	struct bbr_runtime_remap **link = NULL;
	struct bbr_runtime_remap *node = root;

	while (node) {
		/* Descend left, then right, until a leaf is reached. */
		if (node->left) {
			link = &(node->left);
			node = node->left;
			continue;
		}
		if (node->right) {
			link = &(node->right);
			node = node->right;
			continue;
		}

		/* 'node' is now a leaf: release it. */
		kmem_cache_free(bbr_remap_cache, node);
		if (node == root) {
			/* If root is deleted, we're done. */
			break;
		}

		/* Back to root.  Clear the freed leaf's parent link so the
		 * next pass does not revisit it. */
		node = root;
		*link = NULL;
	}
}
123     +
/* Free the entire runtime remap tree, holding the root lock so no
 * concurrent lookup can walk freed nodes. */
static void bbr_free_remap(struct bbr_private *bbr_id)
{
	spin_lock_irq(&bbr_id->remap_root_lock);
	bbr_binary_tree_destroy(bbr_id->remap_root);
	bbr_id->remap_root = NULL;
	spin_unlock_irq(&bbr_id->remap_root_lock);
}
131     +
132     +static struct bbr_private *bbr_alloc_private(void)
133     +{
134     + struct bbr_private *bbr_id;
135     +
136     + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
137     + if (bbr_id) {
138     + memset(bbr_id, 0, sizeof(*bbr_id));
139     + INIT_WORK(&bbr_id->remap_work, bbr_remap_handler, bbr_id);
140     + bbr_id->remap_root_lock = SPIN_LOCK_UNLOCKED;
141     + bbr_id->remap_ios_lock = SPIN_LOCK_UNLOCKED;
142     + bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
143     + }
144     +
145     + return bbr_id;
146     +}
147     +
148     +static void bbr_free_private(struct bbr_private *bbr_id)
149     +{
150     + if (bbr_id->bbr_table) {
151     + vfree(bbr_id->bbr_table);
152     + }
153     + bbr_free_remap(bbr_id);
154     + kfree(bbr_id);
155     +}
156     +
/* Lazily-built byte-wise CRC lookup table (right-shift/reflected form,
 * polynomial CRC_POLYNOMIAL from dm-bbr.h). */
static u32 crc_table[256];
static u32 crc_table_built = 0;

/* Populate crc_table[].
 * NOTE(review): not serialized; two CPUs hitting the first CRC call can
 * both run this.  Presumably harmless because both compute identical
 * values — confirm before relying on it. */
static void build_crc_table(void)
{
	u32 i, j, crc;

	for (i = 0; i <= 255; i++) {
		crc = i;
		/* Eight shift-or-xor steps per byte value. */
		for (j = 8; j > 0; j--) {
			if (crc & 1)
				crc = (crc >> 1) ^ CRC_POLYNOMIAL;
			else
				crc >>= 1;
		}
		crc_table[i] = crc;
	}
	crc_table_built = 1;
}
176     +
177     +static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
178     +{
179     + unsigned char *current_byte;
180     + u32 temp1, temp2, i;
181     +
182     + current_byte = (unsigned char *) buffer;
183     + /* Make sure the crc table is available */
184     + if (!crc_table_built)
185     + build_crc_table();
186     + /* Process each byte in the buffer. */
187     + for (i = 0; i < buffersize; i++) {
188     + temp1 = (crc >> 8) & 0x00FFFFFF;
189     + temp2 = crc_table[(crc ^ (u32) * current_byte) &
190     + (u32) 0xff];
191     + current_byte++;
192     + crc = temp1 ^ temp2;
193     + }
194     + return crc;
195     +}
196     +
/**
 * le_bbr_table_sector_to_cpu
 *
 * Convert bbr meta data from on-disk (LE) format
 * to the native cpu endian format.
 *
 * Converts the sector header fields and every table entry in place.
 **/
static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
{
	int i;
	p->signature = le32_to_cpup(&p->signature);
	p->crc = le32_to_cpup(&p->crc);
	p->sequence_number = le32_to_cpup(&p->sequence_number);
	p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
	for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
		p->entries[i].bad_sect =
			le64_to_cpup(&p->entries[i].bad_sect);
		p->entries[i].replacement_sect =
			le64_to_cpup(&p->entries[i].replacement_sect);
	}
}
217     +
/**
 * cpu_bbr_table_sector_to_le
 *
 * Convert bbr meta data from cpu endian format to on-disk (LE) format.
 *
 * @p and @le may point at the same sector (callers pass the same buffer
 * for an in-place conversion).
 **/
static void cpu_bbr_table_sector_to_le(struct bbr_table *p,
				       struct bbr_table *le)
{
	int i;
	le->signature = cpu_to_le32p(&p->signature);
	le->crc = cpu_to_le32p(&p->crc);
	le->sequence_number = cpu_to_le32p(&p->sequence_number);
	le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
	for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
		le->entries[i].bad_sect =
			cpu_to_le64p(&p->entries[i].bad_sect);
		le->entries[i].replacement_sect =
			cpu_to_le64p(&p->entries[i].replacement_sect);
	}
}
238     +
239     +/**
240     + * validate_bbr_table_sector
241     + *
242     + * Check the specified BBR table sector for a valid signature and CRC. If it's
243     + * valid, endian-convert the table sector.
244     + **/
245     +static int validate_bbr_table_sector(struct bbr_table *p)
246     +{
247     + int rc = 0;
248     + int org_crc, final_crc;
249     +
250     + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
251     + DMERR("dm-bbr: BBR table signature doesn't match!");
252     + DMERR("dm-bbr: Found 0x%x. Expecting 0x%x",
253     + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
254     + rc = -EINVAL;
255     + goto out;
256     + }
257     +
258     + if (!p->crc) {
259     + DMERR("dm-bbr: BBR table sector has no CRC!");
260     + rc = -EINVAL;
261     + goto out;
262     + }
263     +
264     + org_crc = le32_to_cpup(&p->crc);
265     + p->crc = 0;
266     + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
267     + if (final_crc != org_crc) {
268     + DMERR("dm-bbr: CRC failed!");
269     + DMERR("dm-bbr: Found 0x%x. Expecting 0x%x",
270     + org_crc, final_crc);
271     + rc = -EINVAL;
272     + goto out;
273     + }
274     +
275     + p->crc = cpu_to_le32p(&org_crc);
276     + le_bbr_table_sector_to_cpu(p);
277     +
278     +out:
279     + return rc;
280     +}
281     +
282     +/**
283     + * bbr_binary_tree_insert
284     + *
285     + * Insert a node into the binary tree.
286     + **/
287     +static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
288     + struct bbr_runtime_remap *newnode)
289     +{
290     + struct bbr_runtime_remap **node = root;
291     + while (node && *node) {
292     + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
293     + node = &((*node)->right);
294     + } else {
295     + node = &((*node)->left);
296     + }
297     + }
298     +
299     + newnode->left = newnode->right = NULL;
300     + *node = newnode;
301     +}
302     +
303     +/**
304     + * bbr_binary_search
305     + *
306     + * Search for a node that contains bad_sect == lsn.
307     + **/
308     +static struct bbr_runtime_remap *bbr_binary_search(
309     + struct bbr_runtime_remap *root,
310     + u64 lsn)
311     +{
312     + struct bbr_runtime_remap *node = root;
313     + while (node) {
314     + if (node->remap.bad_sect == lsn) {
315     + break;
316     + }
317     + if (lsn > node->remap.bad_sect) {
318     + node = node->right;
319     + } else {
320     + node = node->left;
321     + }
322     + }
323     + return node;
324     +}
325     +
/**
 * bbr_insert_remap_entry
 *
 * Create a new remap entry and add it to the binary tree for this node.
 *
 * Copies the table entry into a node from bbr_remap_cache and inserts it
 * under the root lock.  Returns 0 on success, -ENOMEM if the slab
 * allocation fails.
 **/
static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
				  struct bbr_table_entry *new_bbr_entry)
{
	struct bbr_runtime_remap *newnode;

	newnode = kmem_cache_alloc(bbr_remap_cache, GFP_NOIO);
	if (!newnode) {
		DMERR("dm-bbr: Could not allocate from remap cache!");
		return -ENOMEM;
	}
	newnode->remap.bad_sect = new_bbr_entry->bad_sect;
	newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
	spin_lock_irq(&bbr_id->remap_root_lock);
	bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
	spin_unlock_irq(&bbr_id->remap_root_lock);
	return 0;
}
348     +
/**
 * bbr_table_to_remap_list
 *
 * The on-disk bbr table is sorted by the replacement sector LBA. In order to
 * improve run time performance, the in memory remap list must be sorted by
 * the bad sector LBA. This function is called at discovery time to initialize
 * the remap list. This function assumes that at least one copy of meta data
 * is valid.
 *
 * Returns the total number of in-use replacement blocks found.
 **/
static u32 bbr_table_to_remap_list(struct bbr_private *bbr_id)
{
	u32 in_use_blks = 0;
	int i, j;
	struct bbr_table *p;

	for (i = 0, p = bbr_id->bbr_table;
	     i < bbr_id->nr_sects_bbr_table;
	     i++, p++) {
		/* Table sectors are filled in order; the first sector with a
		 * zero in_use_cnt ends the scan. */
		if (!p->in_use_cnt) {
			break;
		}
		in_use_blks += p->in_use_cnt;
		for (j = 0; j < p->in_use_cnt; j++) {
			/* NOTE(review): the return value of
			 * bbr_insert_remap_entry() (-ENOMEM on slab failure)
			 * is ignored here; a failed insert silently drops a
			 * remap from the in-memory tree. */
			bbr_insert_remap_entry(bbr_id, &p->entries[j]);
		}
	}
	if (in_use_blks) {
		char b[32];
		DMWARN("dm-bbr: There are %u BBR entries for device %s",
		       in_use_blks, format_dev_t(b, bbr_id->dev->bdev->bd_dev));
	}

	return in_use_blks;
}
383     +
384     +/**
385     + * bbr_search_remap_entry
386     + *
387     + * Search remap entry for the specified sector. If found, return a pointer to
388     + * the table entry. Otherwise, return NULL.
389     + **/
390     +static struct bbr_table_entry *bbr_search_remap_entry(
391     + struct bbr_private *bbr_id,
392     + u64 lsn)
393     +{
394     + struct bbr_runtime_remap *p;
395     +
396     + spin_lock_irq(&bbr_id->remap_root_lock);
397     + p = bbr_binary_search(bbr_id->remap_root, lsn);
398     + spin_unlock_irq(&bbr_id->remap_root_lock);
399     + if (p) {
400     + return (&p->remap);
401     + } else {
402     + return NULL;
403     + }
404     +}
405     +
406     +/**
407     + * bbr_remap
408     + *
409     + * If *lsn is in the remap table, return TRUE and modify *lsn,
410     + * else, return FALSE.
411     + **/
412     +static inline int bbr_remap(struct bbr_private *bbr_id,
413     + u64 *lsn)
414     +{
415     + struct bbr_table_entry *e;
416     +
417     + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
418     + e = bbr_search_remap_entry(bbr_id, *lsn);
419     + if (e) {
420     + *lsn = e->replacement_sect;
421     + return 1;
422     + }
423     + }
424     + return 0;
425     +}
426     +
/**
 * bbr_remap_probe
 *
 * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
 * table return TRUE, Else, return FALSE.
 *
 * Probes one sector per block (stepping by blksize_in_sects), which is
 * sufficient because remapping is done at block granularity.
 **/
static inline int bbr_remap_probe(struct bbr_private *bbr_id,
				  u64 lsn, u64 nr_sects)
{
	u64 tmp, cnt;

	if (atomic_read(&bbr_id->in_use_replacement_blks)) {
		for (cnt = 0, tmp = lsn;
		     cnt < nr_sects;
		     cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
			/* bbr_remap() modifies tmp on a hit; tmp is reset
			 * from lsn+cnt each iteration, so that is harmless. */
			if (bbr_remap(bbr_id,&tmp)) {
				return 1;
			}
		}
	}
	return 0;
}
449     +
/**
 * bbr_setup
 *
 * Read the remap tables from disk and set up the initial remap tree.
 *
 * Each table sector is read from the primary copy (lba_table1) and,
 * on failure, from the secondary copy (lba_table2) when one exists.
 * Returns 0 on success or the first I/O/validation error.
 **/
static int bbr_setup(struct bbr_private *bbr_id)
{
	struct bbr_table *table = bbr_id->bbr_table;
	struct io_region job;
	unsigned long error;
	int i, rc = 0;

	job.bdev = bbr_id->dev->bdev;
	job.count = 1;

	/* Read and verify each BBR table sector individually. */
	for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
		job.sector = bbr_id->lba_table1 + i;
		rc = dm_io_sync_vm(1, &job, READ, table, &error);
		/* Primary copy failed: fall back to the second table copy. */
		if (rc && bbr_id->lba_table2) {
			job.sector = bbr_id->lba_table2 + i;
			rc = dm_io_sync_vm(1, &job, READ, table, &error);
		}
		if (rc) {
			goto out;
		}

		rc = validate_bbr_table_sector(table);
		if (rc) {
			goto out;
		}
	}
	/* Seed the in-use counter from the on-disk tables. */
	atomic_set(&bbr_id->in_use_replacement_blks,
		   bbr_table_to_remap_list(bbr_id));

out:
	if (rc) {
		DMERR("dm-bbr: error during device setup: %d", rc);
	}
	return rc;
}
491     +
/**
 * bbr_io_remap_error
 * @bbr_id: Private data for the BBR node.
 * @rw: READ or WRITE.
 * @starting_lsn: Starting sector of request to remap.
 * @count: Number of sectors in the request.
 * @page: Page containing the data for the request.
 * @offset: Byte-offset of the data within the page.
 *
 * For the requested range, try to write each sector individually. For each
 * sector that fails, find the next available remap location and write the
 * data to that new location. Then update the table and write both copies
 * of the table to disk. Finally, update the in-memory mapping and do any
 * other necessary bookkeeping.
 *
 * Returns 0 on success, -EIO when no replacement sectors remain, or the
 * error from updating the on-disk tables / in-memory tree.
 **/
static int bbr_io_remap_error(struct bbr_private *bbr_id,
			      int rw,
			      u64 starting_lsn,
			      u64 count,
			      struct page *page,
			      unsigned int offset)
{
	struct bbr_table *bbr_table;
	struct io_region job;
	struct page_list pl;
	unsigned long table_sector_index;
	unsigned long table_sector_offset;
	unsigned long index;
	unsigned long error;
	u64 lsn, new_lsn;
	char b[32];
	int rc;

	job.bdev = bbr_id->dev->bdev;
	job.count = 1;
	pl.page = page;
	pl.next = NULL;

	/* For each sector in the request. */
	for (lsn = 0; lsn < count; lsn++, offset += SECTOR_SIZE) {
		job.sector = starting_lsn + lsn;
		rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
		/* Retry loop: keep claiming replacement sectors until one
		 * accepts the data or we run out. */
		while (rc) {
			/* Find the next available relocation sector. */
			new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
			if (new_lsn >= bbr_id->nr_replacement_blks) {
				/* No more replacement sectors available. */
				return -EIO;
			}
			new_lsn += bbr_id->start_replacement_sect;

			/* Write the data to its new location. */
			DMWARN("dm-bbr: device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
			       format_dev_t(b, bbr_id->dev->bdev->bd_dev),
			       starting_lsn + lsn, new_lsn);
			job.sector = new_lsn;
			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
			if (rc) {
				/* This replacement sector is bad.
				 * Try the next one.
				 */
				DMERR("dm-bbr: device %s: replacement sector "PFU64" is bad. Skipping.",
				      format_dev_t(b, bbr_id->dev->bdev->bd_dev), new_lsn);
				atomic_inc(&bbr_id->in_use_replacement_blks);
				continue;
			}

			/* Add this new entry to the on-disk table. */
			table_sector_index = new_lsn -
					     bbr_id->start_replacement_sect;
			table_sector_offset = table_sector_index /
					      BBR_ENTRIES_PER_SECT;
			index = table_sector_index % BBR_ENTRIES_PER_SECT;

			bbr_table = &bbr_id->bbr_table[table_sector_offset];
			bbr_table->entries[index].bad_sect = starting_lsn + lsn;
			bbr_table->entries[index].replacement_sect = new_lsn;
			bbr_table->in_use_cnt++;
			bbr_table->sequence_number++;
			/* CRC is computed with the crc field zeroed. */
			bbr_table->crc = 0;
			bbr_table->crc = calculate_crc(INITIAL_CRC,
						       bbr_table,
						       sizeof(struct bbr_table));

			/* Write the table to disk.  The sector is converted
			 * to LE in place, written to both copies, then
			 * converted back to CPU endian. */
			cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
			if (bbr_id->lba_table1) {
				job.sector = bbr_id->lba_table1 + table_sector_offset;
				rc = dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
			}
			if (bbr_id->lba_table2) {
				job.sector = bbr_id->lba_table2 + table_sector_offset;
				rc |= dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
			}
			le_bbr_table_sector_to_cpu(bbr_table);

			if (rc) {
				/* Error writing one of the tables to disk. */
				DMERR("dm-bbr: device %s: error updating BBR tables on disk.",
				      format_dev_t(b, bbr_id->dev->bdev->bd_dev));
				return rc;
			}

			/* Insert a new entry in the remapping binary-tree. */
			rc = bbr_insert_remap_entry(bbr_id,
						    &bbr_table->entries[index]);
			if (rc) {
				DMERR("dm-bbr: device %s: error adding new entry to remap tree.",
				      format_dev_t(b, bbr_id->dev->bdev->bd_dev));
				return rc;
			}

			atomic_inc(&bbr_id->in_use_replacement_blks);
		}
	}

	return 0;
}
610     +
/**
 * bbr_io_process_request
 *
 * For each sector in this request, check if the sector has already
 * been remapped. If so, process all previous sectors in the request,
 * followed by the remapped sector. Then reset the starting lsn and
 * count, and keep going with the rest of the request as if it were
 * a whole new request. If any of the sync_io's return an error,
 * call the remapper to relocate the bad sector(s).
 *
 * 2.5 Note: When switching over to bio's for the I/O path, we have made
 * the assumption that the I/O request described by the bio is one
 * virtually contiguous piece of memory (even though the bio vector
 * describes it using a series of physical page addresses).
 *
 * Returns 0 on success, or the first unrecoverable error.
 **/
static int bbr_io_process_request(struct bbr_private *bbr_id,
				  struct bio *bio)
{
	struct io_region job;
	u64 starting_lsn = bio->bi_sector;
	u64 count, lsn, remapped_lsn;
	struct page_list pl;
	unsigned int offset;
	unsigned long error;
	int i, rw = bio_data_dir(bio);
	int rc = 0;

	job.bdev = bbr_id->dev->bdev;
	pl.next = NULL;

	/* Each bio can contain multiple vectors, each with a different page.
	 * Treat each vector as a separate request.
	 */
	/* KMC: Is this the right way to walk the bvec list? */
	for (i = 0;
	     i < bio->bi_vcnt;
	     i++, bio->bi_idx++, starting_lsn += count) {

		/* Bvec info: number of sectors, page,
		 * and byte-offset within page.
		 */
		count = bio_iovec(bio)->bv_len >> SECTOR_SHIFT;
		pl.page = bio_iovec(bio)->bv_page;
		offset = bio_iovec(bio)->bv_offset;

		/* For each sector in this bvec, check if the sector has
		 * already been remapped. If so, process all previous sectors
		 * in this request, followed by the remapped sector. Then reset
		 * the starting lsn and count and keep going with the rest of
		 * the request as if it were a whole new request.
		 */
		for (lsn = 0; lsn < count; lsn++) {
			remapped_lsn = starting_lsn + lsn;
			rc = bbr_remap(bbr_id, &remapped_lsn);
			if (!rc) {
				/* This sector is fine. */
				continue;
			}

			/* Process all sectors in the request up to this one. */
			if (lsn > 0) {
				job.sector = starting_lsn;
				job.count = lsn;
				rc = dm_io_sync(1, &job, rw, &pl,
						offset, &error);
				if (rc) {
					/* If this I/O failed, then one of the
					 * sectors in this request needs to be
					 * relocated.
					 */
					rc = bbr_io_remap_error(bbr_id, rw,
								starting_lsn,
								lsn, pl.page,
								offset);
					if (rc) {
						/* KMC: Return? Or continue to next bvec? */
						return rc;
					}
				}
				offset += (lsn << SECTOR_SHIFT);
			}

			/* Process the remapped sector. */
			job.sector = remapped_lsn;
			job.count = 1;
			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
			if (rc) {
				/* BUGBUG - Need more processing if this caused
				 * an error. If this I/O failed, then the
				 * existing remap is now bad, and we need to
				 * find a new remap. Can't use
				 * bbr_io_remap_error(), because the existing
				 * map entry needs to be changed, not added
				 * again, and the original table entry also
				 * needs to be changed.
				 */
				return rc;
			}

			/* Restart the scan just past the remapped sector:
			 * lsn is set to -1 so the loop increment brings it
			 * back to 0 for the shortened range. */
			starting_lsn += (lsn + 1);
			count -= (lsn + 1);
			lsn = -1;
			offset += SECTOR_SIZE;
		}

		/* Check for any remaining sectors after the last split. This
		 * could potentially be the whole request, but that should be a
		 * rare case because requests should only be processed by the
		 * thread if we know an error occurred or they contained one or
		 * more remapped sectors.
		 */
		if (count) {
			job.sector = starting_lsn;
			job.count = count;
			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
			if (rc) {
				/* If this I/O failed, then one of the sectors
				 * in this request needs to be relocated.
				 */
				rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
							count, pl.page, offset);
				if (rc) {
					/* KMC: Return? Or continue to next bvec? */
					return rc;
				}
			}
		}
	}

	return 0;
}
742     +
743     +static void bbr_io_process_requests(struct bbr_private *bbr_id,
744     + struct bio *bio)
745     +{
746     + struct bio *next;
747     + int rc;
748     +
749     + while (bio) {
750     + next = bio->bi_next;
751     + bio->bi_next = NULL;
752     +
753     + rc = bbr_io_process_request(bbr_id, bio);
754     +
755     + bio_endio(bio, bio->bi_size, rc);
756     +
757     + bio = next;
758     + }
759     +}
760     +
/**
 * bbr_remap_handler
 *
 * This is the handler for the bbr work-queue.
 *
 * I/O requests should only be sent to this handler if we know that:
 * a) the request contains at least one remapped sector.
 * or
 * b) the request caused an error on the normal I/O path.
 *
 * This function uses synchronous I/O, so sending a request to this
 * thread that doesn't need special processing will cause severe
 * performance degradation.
 **/
static void bbr_remap_handler(void *data)
{
	struct bbr_private *bbr_id = data;
	struct bio *bio;
	unsigned long flags;

	/* Atomically take the whole pending list so producers can keep
	 * queueing while we process. */
	spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
	bio = bio_list_get(&bbr_id->remap_ios);
	spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);

	bbr_io_process_requests(bbr_id, bio);
}
787     +
/**
 * bbr_endio
 *
 * This is the callback for normal write requests. Check for an error
 * during the I/O, and send to the thread for processing if necessary.
 **/
static int bbr_endio(struct dm_target *ti, struct bio *bio,
		     int error, union map_info *map_context)
{
	struct bbr_private *bbr_id = ti->private;
	/* Saved bio state recorded by bbr_map(); NULL when the bio was
	 * routed through the work-queue instead of the fast path. */
	struct dm_bio_details *bbr_io = map_context->ptr;

	if (error && bbr_io) {
		unsigned long flags;
		char b[32];

		/* Restore the original bio fields before re-queueing it
		 * for the remap thread. */
		dm_bio_restore(bbr_io, bio);
		map_context->ptr = NULL;

		DMERR("dm-bbr: device %s: I/O failure on sector %lu. "
		      "Scheduling for retry.",
		      format_dev_t(b, bbr_id->dev->bdev->bd_dev),
		      (unsigned long)bio->bi_sector);

		spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
		bio_list_add(&bbr_id->remap_ios, bio);
		spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);

		queue_work(dm_bbr_wq, &bbr_id->remap_work);

		/* NOTE(review): returning 1 presumably tells the dm core
		 * not to complete this bio yet (the work-queue will end it)
		 * — confirm against the dm end_io contract for this kernel. */
		error = 1;
	}

	if (bbr_io)
		mempool_free(bbr_io, bbr_io_pool);

	return error;
}
826     +
827     +/**
828     + * Construct a bbr mapping
829     + **/
830     +static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
831     +{
832     + struct bbr_private *bbr_id;
833     + unsigned long block_size;
834     + char *end;
835     + int rc = -EINVAL;
836     +
837     + if (argc != 8) {
838     + ti->error = "dm-bbr requires exactly 8 arguments: "
839     + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
840     + goto out1;
841     + }
842     +
843     + bbr_id = bbr_alloc_private();
844     + if (!bbr_id) {
845     + ti->error = "dm-bbr: Error allocating bbr private data.";
846     + goto out1;
847     + }
848     +
849     + bbr_id->offset = simple_strtoull(argv[1], &end, 10);
850     + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
851     + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
852     + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
853     + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
854     + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
855     + block_size = simple_strtoul(argv[7], &end, 10);
856     + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
857     +
858     + bbr_id->bbr_table = vmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT);
859     + if (!bbr_id->bbr_table) {
860     + ti->error = "dm-bbr: Error allocating bbr table.";
861     + goto out2;
862     + }
863     +
864     + if (dm_get_device(ti, argv[0], 0, ti->len,
865     + dm_table_get_mode(ti->table), &bbr_id->dev)) {
866     + ti->error = "dm-bbr: Device lookup failed";
867     + goto out2;
868     + }
869     +
870     + rc = bbr_setup(bbr_id);
871     + if (rc) {
872     + ti->error = "dm-bbr: Device setup failed";
873     + goto out3;
874     + }
875     +
876     + ti->private = bbr_id;
877     + return 0;
878     +
879     +out3:
880     + dm_put_device(ti, bbr_id->dev);
881     +out2:
882     + bbr_free_private(bbr_id);
883     +out1:
884     + return rc;
885     +}
886     +
/* Destroy a bbr mapping: release the underlying device and free all
 * private state (table copy and remap tree). */
static void bbr_dtr(struct dm_target *ti)
{
	struct bbr_private *bbr_id = ti->private;

	dm_put_device(ti, bbr_id->dev);
	bbr_free_private(bbr_id);
}
894     +
/* Map a bio for the bbr target.
 *
 * Fast path (no remapped sectors in the request): redirect the bio to
 * the underlying device, recording its state so bbr_endio() can retry
 * on failure; return 1 so dm submits it.
 * Slow path (request touches a remapped sector): hand the bio to the
 * work-queue for synchronous processing and return 0 (already handled). */
static int bbr_map(struct dm_target *ti, struct bio *bio,
		   union map_info *map_context)
{
	struct bbr_private *bbr_id = ti->private;
	struct dm_bio_details *bbr_io;
	unsigned long flags;
	int rc = 1;

	/* Translate to the underlying device's sector space. */
	bio->bi_sector += bbr_id->offset;

	if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
	    !bbr_remap_probe(bbr_id, bio->bi_sector, bio_sectors(bio))) {
		/* No existing remaps or this request doesn't
		 * contain any remapped sectors.
		 */
		bio->bi_bdev = bbr_id->dev->bdev;

		/* Snapshot the bio so bbr_endio() can restore and retry it. */
		bbr_io = mempool_alloc(bbr_io_pool, GFP_NOIO);
		dm_bio_record(bbr_io, bio);
		map_context->ptr = bbr_io;
	} else {
		/* This request has at least one remapped sector.
		 * Give it to the work-queue for processing.
		 */
		map_context->ptr = NULL;
		spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
		bio_list_add(&bbr_id->remap_ios, bio);
		spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);

		queue_work(dm_bbr_wq, &bbr_id->remap_work);
		rc = 0;
	}

	return rc;
}
930     +
931     +static int bbr_status(struct dm_target *ti, status_type_t type,
932     + char *result, unsigned int maxlen)
933     +{
934     + struct bbr_private *bbr_id = ti->private;
935     + char b[BDEVNAME_SIZE];
936     +
937     + switch (type) {
938     + case STATUSTYPE_INFO:
939     + result[0] = '\0';
940     + break;
941     +
942     + case STATUSTYPE_TABLE:
943     + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
944     + format_dev_t(b, bbr_id->dev->bdev->bd_dev),
945     + bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
946     + bbr_id->nr_sects_bbr_table,
947     + bbr_id->start_replacement_sect,
948     + bbr_id->nr_replacement_blks,
949     + bbr_id->blksize_in_sects << SECTOR_SHIFT);
950     + break;
951     + }
952     + return 0;
953     +}
954     +
/*
 * Device-mapper registration record: binds the target name "bbr" to
 * its constructor/destructor, the per-bio map hook, the I/O-completion
 * hook (bbr_endio, where write failures are detected), and the status
 * reporter.
 */
static struct target_type bbr_target = {
	.name	= "bbr",
	.version= {1, 0, 1},
	.module	= THIS_MODULE,
	.ctr	= bbr_ctr,
	.dtr	= bbr_dtr,
	.map	= bbr_map,
	.end_io	= bbr_endio,
	.status	= bbr_status,
};
965     +
966     +int __init dm_bbr_init(void)
967     +{
968     + int rc;
969     +
970     + rc = dm_register_target(&bbr_target);
971     + if (rc) {
972     + DMERR("dm-bbr: error registering target.");
973     + goto err1;
974     + }
975     +
976     + bbr_remap_cache = kmem_cache_create("bbr-remap",
977     + sizeof(struct bbr_runtime_remap),
978     + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
979     + if (!bbr_remap_cache) {
980     + DMERR("dm-bbr: error creating remap cache.");
981     + rc = ENOMEM;
982     + goto err2;
983     + }
984     +
985     + bbr_io_cache = kmem_cache_create("bbr-io", sizeof(struct dm_bio_details),
986     + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
987     + if (!bbr_io_cache) {
988     + DMERR("dm-bbr: error creating io cache.");
989     + rc = ENOMEM;
990     + goto err3;
991     + }
992     +
993     + bbr_io_pool = mempool_create(256, mempool_alloc_slab,
994     + mempool_free_slab, bbr_io_cache);
995     + if (!bbr_io_pool) {
996     + DMERR("dm-bbr: error creating io mempool.");
997     + rc = ENOMEM;
998     + goto err4;
999     + }
1000     +
1001     + dm_bbr_wq = create_workqueue("dm-bbr");
1002     + if (!dm_bbr_wq) {
1003     + DMERR("dm-bbr: error creating work-queue.");
1004     + rc = ENOMEM;
1005     + goto err5;
1006     + }
1007     +
1008     + rc = dm_io_get(1);
1009     + if (rc) {
1010     + DMERR("dm-bbr: error initializing I/O service.");
1011     + goto err6;
1012     + }
1013     +
1014     + return 0;
1015     +
1016     +err6:
1017     + destroy_workqueue(dm_bbr_wq);
1018     +err5:
1019     + mempool_destroy(bbr_io_pool);
1020     +err4:
1021     + kmem_cache_destroy(bbr_io_cache);
1022     +err3:
1023     + kmem_cache_destroy(bbr_remap_cache);
1024     +err2:
1025     + dm_unregister_target(&bbr_target);
1026     +err1:
1027     + return rc;
1028     +}
1029     +
/*
 * dm_bbr_exit
 *
 * Module teardown: release the global resources in exactly the reverse
 * order of dm_bbr_init() — dm-io reference, work-queue, mempool, both
 * slab caches, then the target registration itself.
 */
void __exit dm_bbr_exit(void)
{
	dm_io_put(1);
	destroy_workqueue(dm_bbr_wq);
	mempool_destroy(bbr_io_pool);
	kmem_cache_destroy(bbr_io_cache);
	kmem_cache_destroy(bbr_remap_cache);
	dm_unregister_target(&bbr_target);
}
1039     +
/* Module entry/exit points and license declaration. */
module_init(dm_bbr_init);
module_exit(dm_bbr_exit);
MODULE_LICENSE("GPL");
1043 phreak 274 Index: linux-git/drivers/md/dm-bbr.h
1044     ===================================================================
1045     --- /dev/null
1046     +++ linux-git/drivers/md/dm-bbr.h
1047 dsd 7 @@ -0,0 +1,125 @@
1048     +/*
1049     + * (C) Copyright IBM Corp. 2002, 2004
1050     + *
1051     + * This program is free software; you can redistribute it and/or modify
1052     + * it under the terms of the GNU General Public License as published by
1053     + * the Free Software Foundation; either version 2 of the License, or
1054     + * (at your option) any later version.
1055     + *
1056     + * This program is distributed in the hope that it will be useful,
1057     + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1058     + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1059     + * the GNU General Public License for more details.
1060     + *
1061     + * You should have received a copy of the GNU General Public License
1062     + * along with this program; if not, write to the Free Software
1063     + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1064     + *
1065     + * linux/drivers/md/dm-bbr.h
1066     + *
1067     + * Bad-block-relocation (BBR) target for device-mapper.
1068     + *
1069     + * The BBR target is designed to remap I/O write failures to another safe
1070     + * location on disk. Note that most disk drives have BBR built into them,
1071     + * this means that our software BBR will be only activated when all hardware
1072     + * BBR replacement sectors have been used.
1073     + */
1074     +
/* Signature stamped into every on-disk BBR table sector ("BbrT" in ASCII). */
#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
/* Remap entries per 512-byte table sector (see struct bbr_table). */
#define BBR_ENTRIES_PER_SECT 31
/* Seed and polynomial for the table-sector CRC. */
#define INITIAL_CRC 0xFFFFFFFF
#define CRC_POLYNOMIAL 0xEDB88320L

/**
 * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
 * Use these in place of %Ld, %Lu, and %Lx.
 *
 * NOTE(review): the 64-bit branch assumes u64 is 'unsigned long' when
 * BITS_PER_LONG > 32 — true for this kernel era; confirm before reuse.
 **/
#if BITS_PER_LONG > 32
#define PFU64 "%lu"
#else
#define PFU64 "%Lu"
#endif
1089     +
/**
 * struct bbr_table_entry
 * @bad_sect: LBA of bad location.
 * @replacement_sect: LBA of new location.
 *
 * Structure to describe one BBR remap.  This is part of the on-disk
 * metadata format (embedded in struct bbr_table), so the field order
 * and widths must not change.
 **/
struct bbr_table_entry {
	u64 bad_sect;
	u64 replacement_sect;
};
1101     +
/**
 * struct bbr_table
 * @signature: Signature on each BBR table sector (BBR_TABLE_SIGNATURE).
 * @crc: CRC for this table sector.
 * @sequence_number: Used to resolve conflicts when primary and secondary
 * tables do not match.
 * @in_use_cnt: Number of in-use table entries.
 * @entries: Actual table of remaps.
 *
 * Structure to describe each sector of the metadata table. Each sector in this
 * table can describe 31 remapped sectors.
 *
 * Layout note: 4 u32 header fields (16 bytes) + 31 * sizeof(struct
 * bbr_table_entry) (31 * 16 = 496 bytes) = 512 bytes, i.e. exactly one
 * sector — do not reorder or resize fields.
 **/
struct bbr_table {
	u32 signature;
	u32 crc;
	u32 sequence_number;
	u32 in_use_cnt;
	struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
};
1121     +
/**
 * struct bbr_runtime_remap
 *
 * Node in the binary tree used to keep track of remaps at runtime
 * (in-memory only — never written to disk).  The tree lives in
 * bbr_private.remap_root and is guarded by remap_root_lock.
 **/
struct bbr_runtime_remap {
	struct bbr_table_entry remap;
	struct bbr_runtime_remap *left;
	struct bbr_runtime_remap *right;
};
1132     +
/**
 * struct bbr_private
 * @dev: Info about underlying device.
 * @bbr_table: Copy of metadata table.
 * @remap_root: Binary tree containing all remaps.
 * @remap_root_lock: Lock for the binary tree.
 * @remap_work: For adding work items to the work-queue.
 * @remap_ios: List of I/Os for the work-queue to handle.
 * @remap_ios_lock: Lock for the remap_ios list.
 * @offset: LBA of data area.
 * @lba_table1: LBA of primary BBR table.
 * @lba_table2: LBA of secondary BBR table.
 * @nr_sects_bbr_table: Size of each BBR table.
 * @nr_replacement_blks: Number of replacement blocks.
 * @start_replacement_sect: LBA of start of replacement blocks.
 * @blksize_in_sects: Size of each block.
 * @in_use_replacement_blks: Current number of remapped blocks.
 *
 * Private data for each BBR target (one instance per mapped device,
 * stored in dm_target->private by the constructor).
 **/
struct bbr_private {
	struct dm_dev *dev;
	struct bbr_table *bbr_table;
	struct bbr_runtime_remap *remap_root;
	spinlock_t remap_root_lock;

	/* Work-queue plumbing for requests that touch remapped sectors. */
	struct work_struct remap_work;
	struct bio_list remap_ios;
	spinlock_t remap_ios_lock;

	u64 offset;
	u64 lba_table1;
	u64 lba_table2;
	u64 nr_sects_bbr_table;
	u64 start_replacement_sect;
	u64 nr_replacement_blks;
	u32 blksize_in_sects;
	/* atomic: read lock-free in the bbr_map() fast path. */
	atomic_t in_use_replacement_blks;
};
1172     +

  ViewVC Help
Powered by ViewVC 1.1.20