/[linux-patches]/genpatches-2.6/trunk/2.6.19/4105_dm-bbr.patch
Gentoo

Contents of /genpatches-2.6/trunk/2.6.19/4105_dm-bbr.patch

Parent Directory Parent Directory | Revision Log Revision Log


Revision 665 - (show annotations) (download)
Sat Oct 7 10:34:09 2006 UTC (11 years, 8 months ago) by phreak
Original Path: genpatches-2.6/trunk/2.6.19-pre/4105_dm-bbr.patch
File size: 32199 byte(s)
Offset fixes for 2500_via-irq-quirk-revert.patch, 4000_deprecate-sk98lin.patch and 4105_dm-bbr.patch.
1 Index: linux-2.6.19/drivers/md/Kconfig
2 ===================================================================
3 --- linux-2.6.19.orig/drivers/md/Kconfig
4 +++ linux-2.6.19/drivers/md/Kconfig
5 @@ -261,6 +261,17 @@ config DM_MULTIPATH_EMC
6 ---help---
7 Multipath support for EMC CX/AX series hardware.
8
9 +config BLK_DEV_DM_BBR
10 + tristate "Bad Block Relocation Device Target (EXPERIMENTAL)"
11 + depends on BLK_DEV_DM && EXPERIMENTAL
12 + ---help---
13 + Support for devices with software-based bad-block-relocation.
14 +
15 + To compile this as a module, choose M here: the module will be
16 + called dm-bbr.
17 +
18 + If unsure, say N.
19 +
20 endmenu
21
22 endif
23 Index: linux-2.6.19/drivers/md/Makefile
24 ===================================================================
25 --- linux-2.6.19.orig/drivers/md/Makefile
26 +++ linux-2.6.19/drivers/md/Makefile
27 @@ -36,6 +36,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc
28 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
29 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
30 obj-$(CONFIG_DM_ZERO) += dm-zero.o
31 +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
32
33 quiet_cmd_unroll = UNROLL $@
34 cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
35 Index: linux-2.6.19/drivers/md/dm-bbr.c
36 ===================================================================
37 --- /dev/null
38 +++ linux-2.6.19/drivers/md/dm-bbr.c
39 @@ -0,0 +1,1004 @@
40 +/*
41 + * (C) Copyright IBM Corp. 2002, 2004
42 + *
43 + * This program is free software; you can redistribute it and/or modify
44 + * it under the terms of the GNU General Public License as published by
45 + * the Free Software Foundation; either version 2 of the License, or
46 + * (at your option) any later version.
47 + *
48 + * This program is distributed in the hope that it will be useful,
49 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
50 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
51 + * the GNU General Public License for more details.
52 + *
53 + * You should have received a copy of the GNU General Public License
54 + * along with this program; if not, write to the Free Software
55 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
56 + *
57 + * linux/drivers/md/dm-bbr.c
58 + *
59 + * Bad-block-relocation (BBR) target for device-mapper.
60 + *
61 + * The BBR target is designed to remap I/O write failures to another safe
62 + * location on disk. Note that most disk drives have BBR built into them,
63 + * this means that our software BBR will only be activated when all hardware
64 + * BBR replacement sectors have been used.
65 + */
66 +
67 +#include <linux/module.h>
68 +#include <linux/init.h>
69 +#include <linux/bio.h>
70 +#include <linux/spinlock.h>
71 +#include <linux/slab.h>
72 +#include <linux/mempool.h>
73 +#include <linux/workqueue.h>
74 +#include <linux/vmalloc.h>
75 +
76 +#include "dm.h"
77 +#include "dm-bio-list.h"
78 +#include "dm-bio-record.h"
79 +#include "dm-bbr.h"
80 +#include "dm-io.h"
81 +
82 +#define DM_MSG_PREFIX "bbr"
83 +#define SECTOR_SIZE (1 << SECTOR_SHIFT)
84 +
85 +static struct workqueue_struct *dm_bbr_wq = NULL;
86 +static void bbr_remap_handler(void *data);
87 +static kmem_cache_t *bbr_remap_cache;
88 +static kmem_cache_t *bbr_io_cache;
89 +static mempool_t *bbr_io_pool;
90 +
91 +/**
92 + * bbr_binary_tree_destroy
93 + *
94 + * Destroy the binary tree.
95 + **/
96 +static void bbr_binary_tree_destroy(struct bbr_runtime_remap *root)
97 +{
98 + struct bbr_runtime_remap **link = NULL;
99 + struct bbr_runtime_remap *node = root;
100 +
101 + while (node) {
102 + if (node->left) {
103 + link = &(node->left);
104 + node = node->left;
105 + continue;
106 + }
107 + if (node->right) {
108 + link = &(node->right);
109 + node = node->right;
110 + continue;
111 + }
112 +
113 + kmem_cache_free(bbr_remap_cache, node);
114 + if (node == root) {
115 + /* If root is deleted, we're done. */
116 + break;
117 + }
118 +
119 + /* Back to root. */
120 + node = root;
121 + *link = NULL;
122 + }
123 +}
124 +
125 +static void bbr_free_remap(struct bbr_private *bbr_id)
126 +{
127 + spin_lock_irq(&bbr_id->remap_root_lock);
128 + bbr_binary_tree_destroy(bbr_id->remap_root);
129 + bbr_id->remap_root = NULL;
130 + spin_unlock_irq(&bbr_id->remap_root_lock);
131 +}
132 +
133 +static struct bbr_private *bbr_alloc_private(void)
134 +{
135 + struct bbr_private *bbr_id;
136 +
137 + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
138 + if (bbr_id) {
139 + memset(bbr_id, 0, sizeof(*bbr_id));
140 + INIT_WORK(&bbr_id->remap_work, bbr_remap_handler, bbr_id);
141 + bbr_id->remap_root_lock = SPIN_LOCK_UNLOCKED;
142 + bbr_id->remap_ios_lock = SPIN_LOCK_UNLOCKED;
143 + bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
144 + }
145 +
146 + return bbr_id;
147 +}
148 +
149 +static void bbr_free_private(struct bbr_private *bbr_id)
150 +{
151 + if (bbr_id->bbr_table) {
152 + vfree(bbr_id->bbr_table);
153 + }
154 + bbr_free_remap(bbr_id);
155 + kfree(bbr_id);
156 +}
157 +
158 +static u32 crc_table[256];
159 +static u32 crc_table_built = 0;
160 +
161 +static void build_crc_table(void)
162 +{
163 + u32 i, j, crc;
164 +
165 + for (i = 0; i <= 255; i++) {
166 + crc = i;
167 + for (j = 8; j > 0; j--) {
168 + if (crc & 1)
169 + crc = (crc >> 1) ^ CRC_POLYNOMIAL;
170 + else
171 + crc >>= 1;
172 + }
173 + crc_table[i] = crc;
174 + }
175 + crc_table_built = 1;
176 +}
177 +
178 +static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
179 +{
180 + unsigned char *current_byte;
181 + u32 temp1, temp2, i;
182 +
183 + current_byte = (unsigned char *) buffer;
184 + /* Make sure the crc table is available */
185 + if (!crc_table_built)
186 + build_crc_table();
187 + /* Process each byte in the buffer. */
188 + for (i = 0; i < buffersize; i++) {
189 + temp1 = (crc >> 8) & 0x00FFFFFF;
190 + temp2 = crc_table[(crc ^ (u32) * current_byte) &
191 + (u32) 0xff];
192 + current_byte++;
193 + crc = temp1 ^ temp2;
194 + }
195 + return crc;
196 +}
197 +
198 +/**
199 + * le_bbr_table_sector_to_cpu
200 + *
201 + * Convert bbr meta data from on-disk (LE) format
202 + * to the native cpu endian format.
203 + **/
204 +static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
205 +{
206 + int i;
207 + p->signature = le32_to_cpup(&p->signature);
208 + p->crc = le32_to_cpup(&p->crc);
209 + p->sequence_number = le32_to_cpup(&p->sequence_number);
210 + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
211 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
212 + p->entries[i].bad_sect =
213 + le64_to_cpup(&p->entries[i].bad_sect);
214 + p->entries[i].replacement_sect =
215 + le64_to_cpup(&p->entries[i].replacement_sect);
216 + }
217 +}
218 +
219 +/**
220 + * cpu_bbr_table_sector_to_le
221 + *
222 + * Convert bbr meta data from cpu endian format to on-disk (LE) format
223 + **/
224 +static void cpu_bbr_table_sector_to_le(struct bbr_table *p,
225 + struct bbr_table *le)
226 +{
227 + int i;
228 + le->signature = cpu_to_le32p(&p->signature);
229 + le->crc = cpu_to_le32p(&p->crc);
230 + le->sequence_number = cpu_to_le32p(&p->sequence_number);
231 + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
232 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
233 + le->entries[i].bad_sect =
234 + cpu_to_le64p(&p->entries[i].bad_sect);
235 + le->entries[i].replacement_sect =
236 + cpu_to_le64p(&p->entries[i].replacement_sect);
237 + }
238 +}
239 +
240 +/**
241 + * validate_bbr_table_sector
242 + *
243 + * Check the specified BBR table sector for a valid signature and CRC. If it's
244 + * valid, endian-convert the table sector.
245 + **/
246 +static int validate_bbr_table_sector(struct bbr_table *p)
247 +{
248 + int rc = 0;
249 + int org_crc, final_crc;
250 +
251 + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
252 + DMERR("dm-bbr: BBR table signature doesn't match!");
253 + DMERR("dm-bbr: Found 0x%x. Expecting 0x%x",
254 + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
255 + rc = -EINVAL;
256 + goto out;
257 + }
258 +
259 + if (!p->crc) {
260 + DMERR("dm-bbr: BBR table sector has no CRC!");
261 + rc = -EINVAL;
262 + goto out;
263 + }
264 +
265 + org_crc = le32_to_cpup(&p->crc);
266 + p->crc = 0;
267 + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
268 + if (final_crc != org_crc) {
269 + DMERR("dm-bbr: CRC failed!");
270 + DMERR("dm-bbr: Found 0x%x. Expecting 0x%x",
271 + org_crc, final_crc);
272 + rc = -EINVAL;
273 + goto out;
274 + }
275 +
276 + p->crc = cpu_to_le32p(&org_crc);
277 + le_bbr_table_sector_to_cpu(p);
278 +
279 +out:
280 + return rc;
281 +}
282 +
283 +/**
284 + * bbr_binary_tree_insert
285 + *
286 + * Insert a node into the binary tree.
287 + **/
288 +static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
289 + struct bbr_runtime_remap *newnode)
290 +{
291 + struct bbr_runtime_remap **node = root;
292 + while (node && *node) {
293 + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
294 + node = &((*node)->right);
295 + } else {
296 + node = &((*node)->left);
297 + }
298 + }
299 +
300 + newnode->left = newnode->right = NULL;
301 + *node = newnode;
302 +}
303 +
304 +/**
305 + * bbr_binary_search
306 + *
307 + * Search for a node that contains bad_sect == lsn.
308 + **/
309 +static struct bbr_runtime_remap *bbr_binary_search(
310 + struct bbr_runtime_remap *root,
311 + u64 lsn)
312 +{
313 + struct bbr_runtime_remap *node = root;
314 + while (node) {
315 + if (node->remap.bad_sect == lsn) {
316 + break;
317 + }
318 + if (lsn > node->remap.bad_sect) {
319 + node = node->right;
320 + } else {
321 + node = node->left;
322 + }
323 + }
324 + return node;
325 +}
326 +
327 +/**
328 + * bbr_insert_remap_entry
329 + *
330 + * Create a new remap entry and add it to the binary tree for this node.
331 + **/
332 +static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
333 + struct bbr_table_entry *new_bbr_entry)
334 +{
335 + struct bbr_runtime_remap *newnode;
336 +
337 + newnode = kmem_cache_alloc(bbr_remap_cache, GFP_NOIO);
338 + if (!newnode) {
339 + DMERR("dm-bbr: Could not allocate from remap cache!");
340 + return -ENOMEM;
341 + }
342 + newnode->remap.bad_sect = new_bbr_entry->bad_sect;
343 + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
344 + spin_lock_irq(&bbr_id->remap_root_lock);
345 + bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
346 + spin_unlock_irq(&bbr_id->remap_root_lock);
347 + return 0;
348 +}
349 +
350 +/**
351 + * bbr_table_to_remap_list
352 + *
353 + * The on-disk bbr table is sorted by the replacement sector LBA. In order to
354 + * improve run time performance, the in memory remap list must be sorted by
355 + * the bad sector LBA. This function is called at discovery time to initialize
356 + * the remap list. This function assumes that at least one copy of meta data
357 + * is valid.
358 + **/
359 +static u32 bbr_table_to_remap_list(struct bbr_private *bbr_id)
360 +{
361 + u32 in_use_blks = 0;
362 + int i, j;
363 + struct bbr_table *p;
364 +
365 + for (i = 0, p = bbr_id->bbr_table;
366 + i < bbr_id->nr_sects_bbr_table;
367 + i++, p++) {
368 + if (!p->in_use_cnt) {
369 + break;
370 + }
371 + in_use_blks += p->in_use_cnt;
372 + for (j = 0; j < p->in_use_cnt; j++) {
373 + bbr_insert_remap_entry(bbr_id, &p->entries[j]);
374 + }
375 + }
376 + if (in_use_blks) {
377 + char b[32];
378 + DMWARN("dm-bbr: There are %u BBR entries for device %s",
379 + in_use_blks, format_dev_t(b, bbr_id->dev->bdev->bd_dev));
380 + }
381 +
382 + return in_use_blks;
383 +}
384 +
385 +/**
386 + * bbr_search_remap_entry
387 + *
388 + * Search remap entry for the specified sector. If found, return a pointer to
389 + * the table entry. Otherwise, return NULL.
390 + **/
391 +static struct bbr_table_entry *bbr_search_remap_entry(
392 + struct bbr_private *bbr_id,
393 + u64 lsn)
394 +{
395 + struct bbr_runtime_remap *p;
396 +
397 + spin_lock_irq(&bbr_id->remap_root_lock);
398 + p = bbr_binary_search(bbr_id->remap_root, lsn);
399 + spin_unlock_irq(&bbr_id->remap_root_lock);
400 + if (p) {
401 + return (&p->remap);
402 + } else {
403 + return NULL;
404 + }
405 +}
406 +
407 +/**
408 + * bbr_remap
409 + *
410 + * If *lsn is in the remap table, return TRUE and modify *lsn,
411 + * else, return FALSE.
412 + **/
413 +static inline int bbr_remap(struct bbr_private *bbr_id,
414 + u64 *lsn)
415 +{
416 + struct bbr_table_entry *e;
417 +
418 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
419 + e = bbr_search_remap_entry(bbr_id, *lsn);
420 + if (e) {
421 + *lsn = e->replacement_sect;
422 + return 1;
423 + }
424 + }
425 + return 0;
426 +}
427 +
428 +/**
429 + * bbr_remap_probe
430 + *
431 + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
432 + * table return TRUE, Else, return FALSE.
433 + **/
434 +static inline int bbr_remap_probe(struct bbr_private *bbr_id,
435 + u64 lsn, u64 nr_sects)
436 +{
437 + u64 tmp, cnt;
438 +
439 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
440 + for (cnt = 0, tmp = lsn;
441 + cnt < nr_sects;
442 + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
443 + if (bbr_remap(bbr_id,&tmp)) {
444 + return 1;
445 + }
446 + }
447 + }
448 + return 0;
449 +}
450 +
451 +/**
452 + * bbr_setup
453 + *
454 + * Read the remap tables from disk and set up the initial remap tree.
455 + **/
456 +static int bbr_setup(struct bbr_private *bbr_id)
457 +{
458 + struct bbr_table *table = bbr_id->bbr_table;
459 + struct io_region job;
460 + unsigned long error;
461 + int i, rc = 0;
462 +
463 + job.bdev = bbr_id->dev->bdev;
464 + job.count = 1;
465 +
466 + /* Read and verify each BBR table sector individually. */
467 + for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
468 + job.sector = bbr_id->lba_table1 + i;
469 + rc = dm_io_sync_vm(1, &job, READ, table, &error);
470 + if (rc && bbr_id->lba_table2) {
471 + job.sector = bbr_id->lba_table2 + i;
472 + rc = dm_io_sync_vm(1, &job, READ, table, &error);
473 + }
474 + if (rc) {
475 + goto out;
476 + }
477 +
478 + rc = validate_bbr_table_sector(table);
479 + if (rc) {
480 + goto out;
481 + }
482 + }
483 + atomic_set(&bbr_id->in_use_replacement_blks,
484 + bbr_table_to_remap_list(bbr_id));
485 +
486 +out:
487 + if (rc) {
488 + DMERR("dm-bbr: error during device setup: %d", rc);
489 + }
490 + return rc;
491 +}
492 +
493 +/**
494 + * bbr_io_remap_error
495 + * @bbr_id: Private data for the BBR node.
496 + * @rw: READ or WRITE.
497 + * @starting_lsn: Starting sector of request to remap.
498 + * @count: Number of sectors in the request.
499 + * @page: Page containing the data for the request.
500 + * @offset: Byte-offset of the data within the page.
501 + *
502 + * For the requested range, try to write each sector individually. For each
503 + * sector that fails, find the next available remap location and write the
504 + * data to that new location. Then update the table and write both copies
505 + * of the table to disk. Finally, update the in-memory mapping and do any
506 + * other necessary bookkeeping.
507 + **/
508 +static int bbr_io_remap_error(struct bbr_private *bbr_id,
509 + int rw,
510 + u64 starting_lsn,
511 + u64 count,
512 + struct page *page,
513 + unsigned int offset)
514 +{
515 + struct bbr_table *bbr_table;
516 + struct io_region job;
517 + struct page_list pl;
518 + unsigned long table_sector_index;
519 + unsigned long table_sector_offset;
520 + unsigned long index;
521 + unsigned long error;
522 + u64 lsn, new_lsn;
523 + char b[32];
524 + int rc;
525 +
526 + job.bdev = bbr_id->dev->bdev;
527 + job.count = 1;
528 + pl.page = page;
529 + pl.next = NULL;
530 +
531 + /* For each sector in the request. */
532 + for (lsn = 0; lsn < count; lsn++, offset += SECTOR_SIZE) {
533 + job.sector = starting_lsn + lsn;
534 + rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
535 + while (rc) {
536 + /* Find the next available relocation sector. */
537 + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
538 + if (new_lsn >= bbr_id->nr_replacement_blks) {
539 + /* No more replacement sectors available. */
540 + return -EIO;
541 + }
542 + new_lsn += bbr_id->start_replacement_sect;
543 +
544 + /* Write the data to its new location. */
545 + DMWARN("dm-bbr: device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
546 + format_dev_t(b, bbr_id->dev->bdev->bd_dev),
547 + starting_lsn + lsn, new_lsn);
548 + job.sector = new_lsn;
549 + rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
550 + if (rc) {
551 + /* This replacement sector is bad.
552 + * Try the next one.
553 + */
554 + DMERR("dm-bbr: device %s: replacement sector "PFU64" is bad. Skipping.",
555 + format_dev_t(b, bbr_id->dev->bdev->bd_dev), new_lsn);
556 + atomic_inc(&bbr_id->in_use_replacement_blks);
557 + continue;
558 + }
559 +
560 + /* Add this new entry to the on-disk table. */
561 + table_sector_index = new_lsn -
562 + bbr_id->start_replacement_sect;
563 + table_sector_offset = table_sector_index /
564 + BBR_ENTRIES_PER_SECT;
565 + index = table_sector_index % BBR_ENTRIES_PER_SECT;
566 +
567 + bbr_table = &bbr_id->bbr_table[table_sector_offset];
568 + bbr_table->entries[index].bad_sect = starting_lsn + lsn;
569 + bbr_table->entries[index].replacement_sect = new_lsn;
570 + bbr_table->in_use_cnt++;
571 + bbr_table->sequence_number++;
572 + bbr_table->crc = 0;
573 + bbr_table->crc = calculate_crc(INITIAL_CRC,
574 + bbr_table,
575 + sizeof(struct bbr_table));
576 +
577 + /* Write the table to disk. */
578 + cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
579 + if (bbr_id->lba_table1) {
580 + job.sector = bbr_id->lba_table1 + table_sector_offset;
581 + rc = dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
582 + }
583 + if (bbr_id->lba_table2) {
584 + job.sector = bbr_id->lba_table2 + table_sector_offset;
585 + rc |= dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
586 + }
587 + le_bbr_table_sector_to_cpu(bbr_table);
588 +
589 + if (rc) {
590 + /* Error writing one of the tables to disk. */
591 + DMERR("dm-bbr: device %s: error updating BBR tables on disk.",
592 + format_dev_t(b, bbr_id->dev->bdev->bd_dev));
593 + return rc;
594 + }
595 +
596 + /* Insert a new entry in the remapping binary-tree. */
597 + rc = bbr_insert_remap_entry(bbr_id,
598 + &bbr_table->entries[index]);
599 + if (rc) {
600 + DMERR("dm-bbr: device %s: error adding new entry to remap tree.",
601 + format_dev_t(b, bbr_id->dev->bdev->bd_dev));
602 + return rc;
603 + }
604 +
605 + atomic_inc(&bbr_id->in_use_replacement_blks);
606 + }
607 + }
608 +
609 + return 0;
610 +}
611 +
612 +/**
613 + * bbr_io_process_request
614 + *
615 + * For each sector in this request, check if the sector has already
616 + * been remapped. If so, process all previous sectors in the request,
617 + * followed by the remapped sector. Then reset the starting lsn and
618 + * count, and keep going with the rest of the request as if it were
619 + * a whole new request. If any of the sync_io's return an error,
620 + * call the remapper to relocate the bad sector(s).
621 + *
622 + * 2.5 Note: When switching over to bio's for the I/O path, we have made
623 + * the assumption that the I/O request described by the bio is one
624 + * virtually contiguous piece of memory (even though the bio vector
625 + * describes it using a series of physical page addresses).
626 + **/
627 +static int bbr_io_process_request(struct bbr_private *bbr_id,
628 + struct bio *bio)
629 +{
630 + struct io_region job;
631 + u64 starting_lsn = bio->bi_sector;
632 + u64 count, lsn, remapped_lsn;
633 + struct page_list pl;
634 + unsigned int offset;
635 + unsigned long error;
636 + int i, rw = bio_data_dir(bio);
637 + int rc = 0;
638 +
639 + job.bdev = bbr_id->dev->bdev;
640 + pl.next = NULL;
641 +
642 + /* Each bio can contain multiple vectors, each with a different page.
643 + * Treat each vector as a separate request.
644 + */
645 + /* KMC: Is this the right way to walk the bvec list? */
646 + for (i = 0;
647 + i < bio->bi_vcnt;
648 + i++, bio->bi_idx++, starting_lsn += count) {
649 +
650 + /* Bvec info: number of sectors, page,
651 + * and byte-offset within page.
652 + */
653 + count = bio_iovec(bio)->bv_len >> SECTOR_SHIFT;
654 + pl.page = bio_iovec(bio)->bv_page;
655 + offset = bio_iovec(bio)->bv_offset;
656 +
657 + /* For each sector in this bvec, check if the sector has
658 + * already been remapped. If so, process all previous sectors
659 + * in this request, followed by the remapped sector. Then reset
660 + * the starting lsn and count and keep going with the rest of
661 + * the request as if it were a whole new request.
662 + */
663 + for (lsn = 0; lsn < count; lsn++) {
664 + remapped_lsn = starting_lsn + lsn;
665 + rc = bbr_remap(bbr_id, &remapped_lsn);
666 + if (!rc) {
667 + /* This sector is fine. */
668 + continue;
669 + }
670 +
671 + /* Process all sectors in the request up to this one. */
672 + if (lsn > 0) {
673 + job.sector = starting_lsn;
674 + job.count = lsn;
675 + rc = dm_io_sync(1, &job, rw, &pl,
676 + offset, &error);
677 + if (rc) {
678 + /* If this I/O failed, then one of the
679 + * sectors in this request needs to be
680 + * relocated.
681 + */
682 + rc = bbr_io_remap_error(bbr_id, rw,
683 + starting_lsn,
684 + lsn, pl.page,
685 + offset);
686 + if (rc) {
687 + /* KMC: Return? Or continue to next bvec? */
688 + return rc;
689 + }
690 + }
691 + offset += (lsn << SECTOR_SHIFT);
692 + }
693 +
694 + /* Process the remapped sector. */
695 + job.sector = remapped_lsn;
696 + job.count = 1;
697 + rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
698 + if (rc) {
699 + /* BUGBUG - Need more processing if this caused
700 + * an error. If this I/O failed, then the
701 + * existing remap is now bad, and we need to
702 + * find a new remap. Can't use
703 + * bbr_io_remap_error(), because the existing
704 + * map entry needs to be changed, not added
705 + * again, and the original table entry also
706 + * needs to be changed.
707 + */
708 + return rc;
709 + }
710 +
711 + starting_lsn += (lsn + 1);
712 + count -= (lsn + 1);
713 + lsn = -1;
714 + offset += SECTOR_SIZE;
715 + }
716 +
717 + /* Check for any remaining sectors after the last split. This
718 + * could potentially be the whole request, but that should be a
719 + * rare case because requests should only be processed by the
720 + * thread if we know an error occurred or they contained one or
721 + * more remapped sectors.
722 + */
723 + if (count) {
724 + job.sector = starting_lsn;
725 + job.count = count;
726 + rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
727 + if (rc) {
728 + /* If this I/O failed, then one of the sectors
729 + * in this request needs to be relocated.
730 + */
731 + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
732 + count, pl.page, offset);
733 + if (rc) {
734 + /* KMC: Return? Or continue to next bvec? */
735 + return rc;
736 + }
737 + }
738 + }
739 + }
740 +
741 + return 0;
742 +}
743 +
744 +static void bbr_io_process_requests(struct bbr_private *bbr_id,
745 + struct bio *bio)
746 +{
747 + struct bio *next;
748 + int rc;
749 +
750 + while (bio) {
751 + next = bio->bi_next;
752 + bio->bi_next = NULL;
753 +
754 + rc = bbr_io_process_request(bbr_id, bio);
755 +
756 + bio_endio(bio, bio->bi_size, rc);
757 +
758 + bio = next;
759 + }
760 +}
761 +
762 +/**
763 + * bbr_remap_handler
764 + *
765 + * This is the handler for the bbr work-queue.
766 + *
767 + * I/O requests should only be sent to this handler if we know that:
768 + * a) the request contains at least one remapped sector.
769 + * or
770 + * b) the request caused an error on the normal I/O path.
771 + *
772 + * This function uses synchronous I/O, so sending a request to this
773 + * thread that doesn't need special processing will cause severe
774 + * performance degradation.
775 + **/
776 +static void bbr_remap_handler(void *data)
777 +{
778 + struct bbr_private *bbr_id = data;
779 + struct bio *bio;
780 + unsigned long flags;
781 +
782 + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
783 + bio = bio_list_get(&bbr_id->remap_ios);
784 + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
785 +
786 + bbr_io_process_requests(bbr_id, bio);
787 +}
788 +
789 +/**
790 + * bbr_endio
791 + *
792 + * This is the callback for normal write requests. Check for an error
793 + * during the I/O, and send to the thread for processing if necessary.
794 + **/
795 +static int bbr_endio(struct dm_target *ti, struct bio *bio,
796 + int error, union map_info *map_context)
797 +{
798 + struct bbr_private *bbr_id = ti->private;
799 + struct dm_bio_details *bbr_io = map_context->ptr;
800 +
801 + if (error && bbr_io) {
802 + unsigned long flags;
803 + char b[32];
804 +
805 + dm_bio_restore(bbr_io, bio);
806 + map_context->ptr = NULL;
807 +
808 + DMERR("dm-bbr: device %s: I/O failure on sector %lu. "
809 + "Scheduling for retry.",
810 + format_dev_t(b, bbr_id->dev->bdev->bd_dev),
811 + (unsigned long)bio->bi_sector);
812 +
813 + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
814 + bio_list_add(&bbr_id->remap_ios, bio);
815 + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
816 +
817 + queue_work(dm_bbr_wq, &bbr_id->remap_work);
818 +
819 + error = 1;
820 + }
821 +
822 + if (bbr_io)
823 + mempool_free(bbr_io, bbr_io_pool);
824 +
825 + return error;
826 +}
827 +
828 +/**
829 + * Construct a bbr mapping
830 + **/
831 +static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
832 +{
833 + struct bbr_private *bbr_id;
834 + unsigned long block_size;
835 + char *end;
836 + int rc = -EINVAL;
837 +
838 + if (argc != 8) {
839 + ti->error = "dm-bbr requires exactly 8 arguments: "
840 + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
841 + goto out1;
842 + }
843 +
844 + bbr_id = bbr_alloc_private();
845 + if (!bbr_id) {
846 + ti->error = "dm-bbr: Error allocating bbr private data.";
847 + goto out1;
848 + }
849 +
850 + bbr_id->offset = simple_strtoull(argv[1], &end, 10);
851 + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
852 + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
853 + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
854 + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
855 + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
856 + block_size = simple_strtoul(argv[7], &end, 10);
857 + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
858 +
859 + bbr_id->bbr_table = vmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT);
860 + if (!bbr_id->bbr_table) {
861 + ti->error = "dm-bbr: Error allocating bbr table.";
862 + goto out2;
863 + }
864 +
865 + if (dm_get_device(ti, argv[0], 0, ti->len,
866 + dm_table_get_mode(ti->table), &bbr_id->dev)) {
867 + ti->error = "dm-bbr: Device lookup failed";
868 + goto out2;
869 + }
870 +
871 + rc = bbr_setup(bbr_id);
872 + if (rc) {
873 + ti->error = "dm-bbr: Device setup failed";
874 + goto out3;
875 + }
876 +
877 + ti->private = bbr_id;
878 + return 0;
879 +
880 +out3:
881 + dm_put_device(ti, bbr_id->dev);
882 +out2:
883 + bbr_free_private(bbr_id);
884 +out1:
885 + return rc;
886 +}
887 +
888 +static void bbr_dtr(struct dm_target *ti)
889 +{
890 + struct bbr_private *bbr_id = ti->private;
891 +
892 + dm_put_device(ti, bbr_id->dev);
893 + bbr_free_private(bbr_id);
894 +}
895 +
896 +static int bbr_map(struct dm_target *ti, struct bio *bio,
897 + union map_info *map_context)
898 +{
899 + struct bbr_private *bbr_id = ti->private;
900 + struct dm_bio_details *bbr_io;
901 + unsigned long flags;
902 + int rc = 1;
903 +
904 + bio->bi_sector += bbr_id->offset;
905 +
906 + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
907 + !bbr_remap_probe(bbr_id, bio->bi_sector, bio_sectors(bio))) {
908 + /* No existing remaps or this request doesn't
909 + * contain any remapped sectors.
910 + */
911 + bio->bi_bdev = bbr_id->dev->bdev;
912 +
913 + bbr_io = mempool_alloc(bbr_io_pool, GFP_NOIO);
914 + dm_bio_record(bbr_io, bio);
915 + map_context->ptr = bbr_io;
916 + } else {
917 + /* This request has at least one remapped sector.
918 + * Give it to the work-queue for processing.
919 + */
920 + map_context->ptr = NULL;
921 + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
922 + bio_list_add(&bbr_id->remap_ios, bio);
923 + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
924 +
925 + queue_work(dm_bbr_wq, &bbr_id->remap_work);
926 + rc = 0;
927 + }
928 +
929 + return rc;
930 +}
931 +
932 +static int bbr_status(struct dm_target *ti, status_type_t type,
933 + char *result, unsigned int maxlen)
934 +{
935 + struct bbr_private *bbr_id = ti->private;
936 + char b[BDEVNAME_SIZE];
937 +
938 + switch (type) {
939 + case STATUSTYPE_INFO:
940 + result[0] = '\0';
941 + break;
942 +
943 + case STATUSTYPE_TABLE:
944 + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
945 + format_dev_t(b, bbr_id->dev->bdev->bd_dev),
946 + bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
947 + bbr_id->nr_sects_bbr_table,
948 + bbr_id->start_replacement_sect,
949 + bbr_id->nr_replacement_blks,
950 + bbr_id->blksize_in_sects << SECTOR_SHIFT);
951 + break;
952 + }
953 + return 0;
954 +}
955 +
956 +static struct target_type bbr_target = {
957 + .name = "bbr",
958 + .version= {1, 0, 1},
959 + .module = THIS_MODULE,
960 + .ctr = bbr_ctr,
961 + .dtr = bbr_dtr,
962 + .map = bbr_map,
963 + .end_io = bbr_endio,
964 + .status = bbr_status,
965 +};
966 +
967 +int __init dm_bbr_init(void)
968 +{
969 + int rc;
970 +
971 + rc = dm_register_target(&bbr_target);
972 + if (rc) {
973 + DMERR("dm-bbr: error registering target.");
974 + goto err1;
975 + }
976 +
977 + bbr_remap_cache = kmem_cache_create("bbr-remap",
978 + sizeof(struct bbr_runtime_remap),
979 + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
980 + if (!bbr_remap_cache) {
981 + DMERR("dm-bbr: error creating remap cache.");
982 + rc = ENOMEM;
983 + goto err2;
984 + }
985 +
986 + bbr_io_cache = kmem_cache_create("bbr-io", sizeof(struct dm_bio_details),
987 + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
988 + if (!bbr_io_cache) {
989 + DMERR("dm-bbr: error creating io cache.");
990 + rc = ENOMEM;
991 + goto err3;
992 + }
993 +
994 + bbr_io_pool = mempool_create(256, mempool_alloc_slab,
995 + mempool_free_slab, bbr_io_cache);
996 + if (!bbr_io_pool) {
997 + DMERR("dm-bbr: error creating io mempool.");
998 + rc = ENOMEM;
999 + goto err4;
1000 + }
1001 +
1002 + dm_bbr_wq = create_workqueue("dm-bbr");
1003 + if (!dm_bbr_wq) {
1004 + DMERR("dm-bbr: error creating work-queue.");
1005 + rc = ENOMEM;
1006 + goto err5;
1007 + }
1008 +
1009 + rc = dm_io_get(1);
1010 + if (rc) {
1011 + DMERR("dm-bbr: error initializing I/O service.");
1012 + goto err6;
1013 + }
1014 +
1015 + return 0;
1016 +
1017 +err6:
1018 + destroy_workqueue(dm_bbr_wq);
1019 +err5:
1020 + mempool_destroy(bbr_io_pool);
1021 +err4:
1022 + kmem_cache_destroy(bbr_io_cache);
1023 +err3:
1024 + kmem_cache_destroy(bbr_remap_cache);
1025 +err2:
1026 + dm_unregister_target(&bbr_target);
1027 +err1:
1028 + return rc;
1029 +}
1030 +
1031 +void __exit dm_bbr_exit(void)
1032 +{
1033 + dm_io_put(1);
1034 + destroy_workqueue(dm_bbr_wq);
1035 + mempool_destroy(bbr_io_pool);
1036 + kmem_cache_destroy(bbr_io_cache);
1037 + kmem_cache_destroy(bbr_remap_cache);
1038 + dm_unregister_target(&bbr_target);
1039 +}
1040 +
1041 +module_init(dm_bbr_init);
1042 +module_exit(dm_bbr_exit);
1043 +MODULE_LICENSE("GPL");
1044 Index: linux-2.6.19/drivers/md/dm-bbr.h
1045 ===================================================================
1046 --- /dev/null
1047 +++ linux-2.6.19/drivers/md/dm-bbr.h
1048 @@ -0,0 +1,125 @@
1049 +/*
1050 + * (C) Copyright IBM Corp. 2002, 2004
1051 + *
1052 + * This program is free software; you can redistribute it and/or modify
1053 + * it under the terms of the GNU General Public License as published by
1054 + * the Free Software Foundation; either version 2 of the License, or
1055 + * (at your option) any later version.
1056 + *
1057 + * This program is distributed in the hope that it will be useful,
1058 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1059 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1060 + * the GNU General Public License for more details.
1061 + *
1062 + * You should have received a copy of the GNU General Public License
1063 + * along with this program; if not, write to the Free Software
1064 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1065 + *
1066 + * linux/drivers/md/dm-bbr.h
1067 + *
1068 + * Bad-block-relocation (BBR) target for device-mapper.
1069 + *
1070 + * The BBR target is designed to remap I/O write failures to another safe
1071 + * location on disk. Note that most disk drives have BBR built into them,
1072 + * this means that our software BBR will be only activated when all hardware
1073 + * BBR replacement sectors have been used.
1074 + */
1075 +
1076 +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
1077 +#define BBR_ENTRIES_PER_SECT 31
1078 +#define INITIAL_CRC 0xFFFFFFFF
1079 +#define CRC_POLYNOMIAL 0xEDB88320L
1080 +
1081 +/**
1082 + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
1083 + * Use these in place of %Ld, %Lu, and %Lx.
1084 + **/
1085 +#if BITS_PER_LONG > 32
1086 +#define PFU64 "%lu"
1087 +#else
1088 +#define PFU64 "%Lu"
1089 +#endif
1090 +
1091 +/**
1092 + * struct bbr_table_entry
1093 + * @bad_sect: LBA of bad location.
1094 + * @replacement_sect: LBA of new location.
1095 + *
1096 + * Structure to describe one BBR remap.
1097 + **/
1098 +struct bbr_table_entry {
1099 + u64 bad_sect;
1100 + u64 replacement_sect;
1101 +};
1102 +
1103 +/**
1104 + * struct bbr_table
1105 + * @signature: Signature on each BBR table sector.
1106 + * @crc: CRC for this table sector.
1107 + * @sequence_number: Used to resolve conflicts when primary and secondary
1108 + * tables do not match.
1109 + * @in_use_cnt: Number of in-use table entries.
1110 + * @entries: Actual table of remaps.
1111 + *
1112 + * Structure to describe each sector of the metadata table. Each sector in this
1113 + * table can describe 31 remapped sectors.
1114 + **/
1115 +struct bbr_table {
1116 + u32 signature;
1117 + u32 crc;
1118 + u32 sequence_number;
1119 + u32 in_use_cnt;
1120 + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
1121 +};
1122 +
1123 +/**
1124 + * struct bbr_runtime_remap
1125 + *
1126 + * Node in the binary tree used to keep track of remaps.
1127 + **/
1128 +struct bbr_runtime_remap {
1129 + struct bbr_table_entry remap;
1130 + struct bbr_runtime_remap *left;
1131 + struct bbr_runtime_remap *right;
1132 +};
1133 +
1134 +/**
1135 + * struct bbr_private
1136 + * @dev: Info about underlying device.
1137 + * @bbr_table: Copy of metadata table.
1138 + * @remap_root: Binary tree containing all remaps.
1139 + * @remap_root_lock: Lock for the binary tree.
1140 + * @remap_work: For adding work items to the work-queue.
1141 + * @remap_ios: List of I/Os for the work-queue to handle.
1142 + * @remap_ios_lock: Lock for the remap_ios list.
1143 + * @offset: LBA of data area.
1144 + * @lba_table1: LBA of primary BBR table.
1145 + * @lba_table2: LBA of secondary BBR table.
1146 + * @nr_sects_bbr_table: Size of each BBR table.
1147 + * @nr_replacement_blks: Number of replacement blocks.
1148 + * @start_replacement_sect: LBA of start of replacement blocks.
1149 + * @blksize_in_sects: Size of each block.
1150 + * @in_use_replacement_blks: Current number of remapped blocks.
1151 + *
1152 + * Private data for each BBR target.
1153 + **/
1154 +struct bbr_private {
1155 + struct dm_dev *dev;
1156 + struct bbr_table *bbr_table;
1157 + struct bbr_runtime_remap *remap_root;
1158 + spinlock_t remap_root_lock;
1159 +
1160 + struct work_struct remap_work;
1161 + struct bio_list remap_ios;
1162 + spinlock_t remap_ios_lock;
1163 +
1164 + u64 offset;
1165 + u64 lba_table1;
1166 + u64 lba_table2;
1167 + u64 nr_sects_bbr_table;
1168 + u64 start_replacement_sect;
1169 + u64 nr_replacement_blks;
1170 + u32 blksize_in_sects;
1171 + atomic_t in_use_replacement_blks;
1172 +};
1173 +

  ViewVC Help
Powered by ViewVC 1.1.20