
Contents of /genpatches-2.6/tags/2.6.32-15/4100_dm-bbr.patch



Revision 1735
Wed Aug 4 11:25:09 2010 UTC by mpagano
File size: 33202 byte(s)
2.6.32-15 release
1 BBR Target, updated by dsd@gentoo.org
2
3 Incomplete changelog:
4 2008/06/16: updated for new API in 2.6.26
5 2007/07/08: updated for new API in 2.6.22
6
7 Index: linux-2.6.26-gentoo/drivers/md/Kconfig
8 ===================================================================
9 --- linux-2.6.26-gentoo.orig/drivers/md/Kconfig
10 +++ linux-2.6.26-gentoo/drivers/md/Kconfig
11 @@ -288,4 +288,15 @@ config DM_UEVENT
12 ---help---
13 Generate udev events for DM events.
14
15 +config BLK_DEV_DM_BBR
16 + tristate "Bad Block Relocation Device Target (EXPERIMENTAL)"
17 + depends on BLK_DEV_DM && EXPERIMENTAL
18 + ---help---
19 + Support for devices with software-based bad-block-relocation.
20 +
21 + To compile this as a module, choose M here: the module will be
22 + called dm-bbr.
23 +
24 + If unsure, say N.
25 +
26 endif # MD
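
Once the patch is applied, the target is enabled like any other device-mapper option; a .config fragment along these lines (BLK_DEV_DM and EXPERIMENTAL must already be set, per the depends line above) builds the dm-bbr module named in the help text:

    CONFIG_EXPERIMENTAL=y
    CONFIG_BLK_DEV_DM=y
    CONFIG_BLK_DEV_DM_BBR=m

after which it loads with modprobe dm-bbr.
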
27 Index: linux-2.6.26-gentoo/drivers/md/Makefile
28 ===================================================================
29 --- linux-2.6.26-gentoo.orig/drivers/md/Makefile
30 +++ linux-2.6.26-gentoo/drivers/md/Makefile
31 @@ -41,6 +41,7 @@ obj-$(CONFIG_DM_MULTIPATH_RDAC) += dm-rd
32 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
33 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o
34 obj-$(CONFIG_DM_ZERO) += dm-zero.o
35 +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
36
37 quiet_cmd_unroll = UNROLL $@
38 cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
39 Index: linux-2.6.26-gentoo/drivers/md/dm-bbr.c
40 ===================================================================
41 --- /dev/null
42 +++ linux-2.6.26-gentoo/drivers/md/dm-bbr.c
43 @@ -0,0 +1,1012 @@
44 +/*
45 + * (C) Copyright IBM Corp. 2002, 2004
46 + *
47 + * This program is free software; you can redistribute it and/or modify
48 + * it under the terms of the GNU General Public License as published by
49 + * the Free Software Foundation; either version 2 of the License, or
50 + * (at your option) any later version.
51 + *
52 + * This program is distributed in the hope that it will be useful,
53 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
54 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
55 + * the GNU General Public License for more details.
56 + *
57 + * You should have received a copy of the GNU General Public License
58 + * along with this program; if not, write to the Free Software
59 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
60 + *
61 + * linux/drivers/md/dm-bbr.c
62 + *
63 + * Bad-block-relocation (BBR) target for device-mapper.
64 + *
65 + * The BBR target is designed to remap I/O write failures to another safe
66 + * location on disk. Note that most disk drives have BBR built into them;
67 + * this means that our software BBR will only be activated when all hardware
68 + * BBR replacement sectors have been used.
69 + */
70 +
71 +#include <linux/module.h>
72 +#include <linux/init.h>
73 +#include <linux/bio.h>
74 +#include <linux/spinlock.h>
75 +#include <linux/slab.h>
76 +#include <linux/mempool.h>
77 +#include <linux/workqueue.h>
78 +#include <linux/vmalloc.h>
79 +#include <linux/dm-io.h>
80 +#include <linux/bio.h>
81 +
82 +#include "dm.h"
83 +#include "dm-bio-record.h"
84 +#include "dm-bbr.h"
85 +
86 +#define DM_MSG_PREFIX "bbr"
87 +#define SECTOR_SIZE (1 << SECTOR_SHIFT)
88 +
89 +static struct workqueue_struct *dm_bbr_wq = NULL;
90 +static void bbr_remap_handler(struct work_struct *work);
91 +static struct kmem_cache *bbr_remap_cache;
92 +static struct kmem_cache *bbr_io_cache;
93 +static mempool_t *bbr_io_pool;
94 +
95 +/**
96 + * bbr_binary_tree_destroy
97 + *
98 + * Destroy the binary tree.
99 + **/
100 +static void bbr_binary_tree_destroy(struct bbr_runtime_remap *root)
101 +{
102 + struct bbr_runtime_remap **link = NULL;
103 + struct bbr_runtime_remap *node = root;
104 +
105 + while (node) {
106 + if (node->left) {
107 + link = &node->left;
108 + node = node->left;
109 + continue;
110 + }
111 + if (node->right) {
112 + link = &node->right;
113 + node = node->right;
114 + continue;
115 + }
116 +
117 + kmem_cache_free(bbr_remap_cache, node);
118 + if (node == root) {
119 + /* If root is deleted, we're done. */
120 + break;
121 + }
122 +
123 + /* Back to root. */
124 + node = root;
125 + *link = NULL;
126 + }
127 +}
128 +
129 +static void bbr_free_remap(struct bbr_private *bbr_id)
130 +{
131 + spin_lock_irq(&bbr_id->remap_root_lock);
132 + bbr_binary_tree_destroy(bbr_id->remap_root);
133 + bbr_id->remap_root = NULL;
134 + spin_unlock_irq(&bbr_id->remap_root_lock);
135 +}
136 +
137 +static struct bbr_private *bbr_alloc_private(void)
138 +{
139 + struct bbr_private *bbr_id;
140 +
141 + bbr_id = kzalloc(sizeof(*bbr_id), GFP_KERNEL);
142 + if (bbr_id == NULL)
143 + return NULL;
144 +
145 + INIT_WORK(&bbr_id->remap_work, bbr_remap_handler);
146 + spin_lock_init(&bbr_id->remap_root_lock);
147 + spin_lock_init(&bbr_id->remap_ios_lock);
148 + bbr_id->in_use_replacement_blks = (atomic_t) ATOMIC_INIT(0);
149 +
150 + return bbr_id;
151 +}
152 +
153 +static void bbr_free_private(struct bbr_private *bbr_id)
154 +{
155 + vfree(bbr_id->bbr_table);
156 + bbr_free_remap(bbr_id);
157 + kfree(bbr_id);
158 +}
159 +
160 +static u32 crc_table[256];
161 +static u32 crc_table_built = 0;
162 +
163 +static void build_crc_table(void)
164 +{
165 + u32 i, j, crc;
166 +
167 + for (i = 0; i <= 255; i++) {
168 + crc = i;
169 + for (j = 8; j > 0; j--) {
170 + if (crc & 1)
171 + crc = (crc >> 1) ^ CRC_POLYNOMIAL;
172 + else
173 + crc >>= 1;
174 + }
175 + crc_table[i] = crc;
176 + }
177 + crc_table_built = 1;
178 +}
179 +
180 +static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
181 +{
182 + unsigned char *current_byte;
183 + u32 temp1, temp2, i;
184 +
185 + current_byte = (unsigned char *) buffer;
186 + /* Make sure the crc table is available */
187 + if (!crc_table_built)
188 + build_crc_table();
189 + /* Process each byte in the buffer. */
190 + for (i = 0; i < buffersize; i++) {
191 + temp1 = (crc >> 8) & 0x00FFFFFF;
192 + temp2 = crc_table[(crc ^ (u32) * current_byte) &
193 + (u32) 0xff];
194 + current_byte++;
195 + crc = temp1 ^ temp2;
196 + }
197 + return crc;
198 +}
199 +
200 +/**
201 + * le_bbr_table_sector_to_cpu
202 + *
203 + * Convert bbr metadata from on-disk (LE) format
204 + * to the native CPU endian format.
205 + **/
206 +static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
207 +{
208 + int i;
209 + p->signature = le32_to_cpup(&p->signature);
210 + p->crc = le32_to_cpup(&p->crc);
211 + p->sequence_number = le32_to_cpup(&p->sequence_number);
212 + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
213 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
214 + p->entries[i].bad_sect =
215 + le64_to_cpup(&p->entries[i].bad_sect);
216 + p->entries[i].replacement_sect =
217 + le64_to_cpup(&p->entries[i].replacement_sect);
218 + }
219 +}
220 +
221 +/**
222 + * cpu_bbr_table_sector_to_le
223 + *
224 + * Convert bbr metadata from CPU endian format to on-disk (LE) format
225 + **/
226 +static void cpu_bbr_table_sector_to_le(struct bbr_table *p,
227 + struct bbr_table *le)
228 +{
229 + int i;
230 + le->signature = cpu_to_le32p(&p->signature);
231 + le->crc = cpu_to_le32p(&p->crc);
232 + le->sequence_number = cpu_to_le32p(&p->sequence_number);
233 + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
234 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
235 + le->entries[i].bad_sect =
236 + cpu_to_le64p(&p->entries[i].bad_sect);
237 + le->entries[i].replacement_sect =
238 + cpu_to_le64p(&p->entries[i].replacement_sect);
239 + }
240 +}
241 +
242 +/**
243 + * validate_bbr_table_sector
244 + *
245 + * Check the specified BBR table sector for a valid signature and CRC. If it's
246 + * valid, endian-convert the table sector.
247 + **/
248 +static int validate_bbr_table_sector(struct bbr_table *p)
249 +{
250 + int org_crc, final_crc;
251 +
252 + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
253 + DMERR("BBR table signature doesn't match!");
254 + DMERR("Found 0x%x. Expecting 0x%x",
255 + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
256 + return -EINVAL;
257 + }
258 +
259 + if (!p->crc) {
260 + DMERR("BBR table sector has no CRC!");
261 + return -EINVAL;
262 + }
263 +
264 + org_crc = le32_to_cpup(&p->crc);
265 + p->crc = 0;
266 + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
267 + if (final_crc != org_crc) {
268 + DMERR("CRC failed!");
269 + DMERR("Found 0x%x. Expecting 0x%x",
270 + org_crc, final_crc);
271 + return -EINVAL;
272 + }
273 +
274 + p->crc = cpu_to_le32p(&org_crc);
275 + le_bbr_table_sector_to_cpu(p);
276 +
277 + return 0;
278 +}
279 +
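
For reference, the CRC machinery above is plain reflected CRC-32: polynomial 0xEDB88320, initial value 0xFFFFFFFF (INITIAL_CRC), and notably no final inversion. A minimal userspace sketch of the same computation and of the validate-by-zeroing round trip used in validate_bbr_table_sector(), with hypothetical names and a little-endian host assumed for brevity:

    #include <stdint.h>
    #include <string.h>

    #define INITIAL_CRC    0xFFFFFFFFu
    #define CRC_POLYNOMIAL 0xEDB88320u

    /* Bitwise equivalent of build_crc_table() + calculate_crc():
     * reflected CRC-32, no final inversion. */
    static uint32_t bbr_crc(uint32_t crc, const void *buf, size_t len)
    {
        const unsigned char *p = buf;
        while (len--) {
            crc ^= *p++;
            for (int i = 0; i < 8; i++)
                crc = (crc & 1) ? (crc >> 1) ^ CRC_POLYNOMIAL : crc >> 1;
        }
        return crc;
    }

    /* The stored CRC covers the whole 512-byte table sector with the
     * crc field itself set to zero; little-endian host assumed. */
    static int sector_crc_ok(unsigned char sector[512])
    {
        uint32_t stored, zero = 0, computed;

        memcpy(&stored, sector + 4, 4);   /* crc is the second u32 field */
        memcpy(sector + 4, &zero, 4);
        computed = bbr_crc(INITIAL_CRC, sector, 512);
        memcpy(sector + 4, &stored, 4);   /* restore */
        return computed == stored;
    }
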
280 +/**
281 + * bbr_binary_tree_insert
282 + *
283 + * Insert a node into the binary tree.
284 + **/
285 +static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
286 + struct bbr_runtime_remap *newnode)
287 +{
288 + struct bbr_runtime_remap **node = root;
289 + while (node && *node) {
290 + node = (newnode->remap.bad_sect > (*node)->remap.bad_sect) ?
291 + &(*node)->right : &(*node)->left;
292 + }
293 +
294 + newnode->left = newnode->right = NULL;
295 + *node = newnode;
296 +}
297 +
298 +/**
299 + * bbr_binary_search
300 + *
301 + * Search for a node that contains bad_sect == lsn.
302 + **/
303 +static struct bbr_runtime_remap *bbr_binary_search(
304 + struct bbr_runtime_remap *root,
305 + u64 lsn)
306 +{
307 + struct bbr_runtime_remap *node = root;
308 + while (node) {
309 + if (node->remap.bad_sect == lsn)
310 + break;
311 +
312 + node = (lsn > node->remap.bad_sect) ? node->right : node->left;
313 + }
314 + return node;
315 +}
316 +
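
The remap structure is a plain, unbalanced binary search tree keyed on bad_sect; insertion walks a pointer-to-pointer link down to the first empty slot. A self-contained userspace sketch of the same two walks (hypothetical names):

    #include <stdint.h>
    #include <stdio.h>

    struct node {
        uint64_t bad_sect, replacement_sect;
        struct node *left, *right;
    };

    /* Same pointer-to-pointer descent as bbr_binary_tree_insert(). */
    static void tree_insert(struct node **root, struct node *n)
    {
        struct node **link = root;
        while (*link)
            link = (n->bad_sect > (*link)->bad_sect) ? &(*link)->right
                                                     : &(*link)->left;
        n->left = n->right = NULL;
        *link = n;
    }

    /* Same descent as bbr_binary_search(). */
    static struct node *tree_search(struct node *root, uint64_t lsn)
    {
        while (root && root->bad_sect != lsn)
            root = (lsn > root->bad_sect) ? root->right : root->left;
        return root;
    }

    int main(void)
    {
        struct node a = { .bad_sect = 100, .replacement_sect = 2000 };
        struct node b = { .bad_sect = 7,   .replacement_sect = 2001 };
        struct node *root = NULL, *hit;

        tree_insert(&root, &a);
        tree_insert(&root, &b);
        hit = tree_search(root, 7);
        printf("remap: %llu -> %llu\n",
               (unsigned long long)hit->bad_sect,
               (unsigned long long)hit->replacement_sect);
        return 0;
    }

Nothing rebalances the tree, so it can degenerate toward a list; that is presumably acceptable because the number of remapped sectors on a healthy disk is expected to stay small.
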
317 +/**
318 + * bbr_insert_remap_entry
319 + *
320 + * Create a new remap entry and add it to the binary tree for this node.
321 + **/
322 +static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
323 + struct bbr_table_entry *new_bbr_entry)
324 +{
325 + struct bbr_runtime_remap *newnode;
326 +
327 + newnode = kmem_cache_alloc(bbr_remap_cache, GFP_NOIO);
328 + if (!newnode) {
329 + DMERR("Could not allocate from remap cache!");
330 + return -ENOMEM;
331 + }
332 + newnode->remap.bad_sect = new_bbr_entry->bad_sect;
333 + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
334 + spin_lock_irq(&bbr_id->remap_root_lock);
335 + bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
336 + spin_unlock_irq(&bbr_id->remap_root_lock);
337 + return 0;
338 +}
339 +
340 +/**
341 + * bbr_table_to_remap_list
342 + *
343 + * The on-disk bbr table is sorted by the replacement sector LBA. To
344 + * improve run-time performance, the in-memory remap tree must be keyed by
345 + * the bad sector LBA. This function is called at discovery time to
346 + * initialize the remap tree. It assumes that at least one copy of the
347 + * metadata is valid.
348 + **/
349 +static u32 bbr_table_to_remap_list(struct bbr_private *bbr_id)
350 +{
351 + u32 in_use_blks = 0;
352 + int i, j;
353 + struct bbr_table *p;
354 +
355 + for (i = 0, p = bbr_id->bbr_table;
356 + i < bbr_id->nr_sects_bbr_table;
357 + i++, p++) {
358 + if (!p->in_use_cnt)
359 + break;
360 +
361 + in_use_blks += p->in_use_cnt;
362 + for (j = 0; j < p->in_use_cnt; j++)
363 + bbr_insert_remap_entry(bbr_id, &p->entries[j]);
364 + }
365 + if (in_use_blks) {
366 + char b[32];
367 + DMWARN("There are %u BBR entries for device %s",
368 + in_use_blks, format_dev_t(b, bbr_id->dev->bdev->bd_dev));
369 + }
370 +
371 + return in_use_blks;
372 +}
373 +
374 +/**
375 + * bbr_search_remap_entry
376 + *
377 + * Search remap entry for the specified sector. If found, return a pointer to
378 + * the table entry. Otherwise, return NULL.
379 + **/
380 +static struct bbr_table_entry *bbr_search_remap_entry(
381 + struct bbr_private *bbr_id,
382 + u64 lsn)
383 +{
384 + struct bbr_runtime_remap *p;
385 +
386 + spin_lock_irq(&bbr_id->remap_root_lock);
387 + p = bbr_binary_search(bbr_id->remap_root, lsn);
388 + spin_unlock_irq(&bbr_id->remap_root_lock);
389 + return (p) ? &p->remap : NULL;
390 +}
391 +
392 +/**
393 + * bbr_remap
394 + *
395 + * If *lsn is in the remap table, return TRUE and modify *lsn;
396 + * otherwise, return FALSE.
397 + **/
398 +static int bbr_remap(struct bbr_private *bbr_id,
399 + u64 *lsn)
400 +{
401 + struct bbr_table_entry *e;
402 +
403 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
404 + e = bbr_search_remap_entry(bbr_id, *lsn);
405 + if (e) {
406 + *lsn = e->replacement_sect;
407 + return 1;
408 + }
409 + }
410 + return 0;
411 +}
412 +
413 +/**
414 + * bbr_remap_probe
415 + *
416 + * If any of the sectors in the range [lsn, lsn+nr_sects) are in the remap
417 + * table, return TRUE; otherwise, return FALSE.
418 + **/
419 +static int bbr_remap_probe(struct bbr_private *bbr_id,
420 + u64 lsn, u64 nr_sects)
421 +{
422 + u64 tmp, cnt;
423 +
424 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
425 + for (cnt = 0, tmp = lsn;
426 + cnt < nr_sects;
427 + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
428 + if (bbr_remap(bbr_id,&tmp))
429 + return 1;
430 + }
431 + }
432 + return 0;
433 +}
434 +
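
As a worked example of the stride above: with lsn = 100, nr_sects = 16 and blksize_in_sects = 8 (illustrative numbers), the loop calls bbr_remap() for sectors 100 and 108 only, i.e. one probe per block-sized step across the range rather than one per sector.
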
435 +static int rw_table(struct bbr_private *bbr_id, void *vma,
436 + struct dm_io_region *ptr, int rw)
437 +{
438 + bbr_id->vma_io_req.bi_rw = rw;
439 + bbr_id->vma_io_req.mem.ptr.vma = vma;
440 + bbr_id->vma_io_req.notify.fn = NULL;
441 +
442 + return dm_io(&bbr_id->vma_io_req, 1, ptr, NULL);
443 +}
444 +
445 +static int io_sync(struct bbr_private *bbr_id, struct page_list *pl,
446 + unsigned offset, struct dm_io_region *ptr, int rw)
447 +{
448 + bbr_id->page_io_req.bi_rw = rw;
449 + bbr_id->page_io_req.mem.ptr.pl = pl;
450 + bbr_id->page_io_req.mem.offset = offset;
451 + bbr_id->page_io_req.notify.fn = NULL;
452 +
453 + return dm_io(&bbr_id->page_io_req, 1, ptr, NULL);
454 +}
455 +
456 +/**
457 + * bbr_setup
458 + *
459 + * Read the remap tables from disk and set up the initial remap tree.
460 + **/
461 +static int bbr_setup(struct bbr_private *bbr_id)
462 +{
463 + struct bbr_table *table = bbr_id->bbr_table;
464 + struct dm_io_region job;
465 + int i, rc = 0;
466 +
467 + job.bdev = bbr_id->dev->bdev;
468 + job.count = 1;
469 +
470 + /* Read and verify each BBR table sector individually. */
471 + for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
472 + job.sector = bbr_id->lba_table1 + i;
473 + rc = rw_table(bbr_id, table, &job, READ);
474 + if (rc && bbr_id->lba_table2) {
475 + job.sector = bbr_id->lba_table2 + i;
476 + rc = rw_table(bbr_id, table, &job, READ);
477 + }
478 + if (rc)
479 + goto out;
480 +
481 + rc = validate_bbr_table_sector(table);
482 + if (rc)
483 + goto out;
484 + }
485 + atomic_set(&bbr_id->in_use_replacement_blks,
486 + bbr_table_to_remap_list(bbr_id));
487 +
488 +out:
489 + if (rc)
490 + DMERR("error during device setup: %d", rc);
491 + return rc;
492 +}
493 +
494 +/**
495 + * bbr_io_remap_error
496 + * @bbr_id: Private data for the BBR node.
497 + * @rw: READ or WRITE.
498 + * @starting_lsn: Starting sector of request to remap.
499 + * @count: Number of sectors in the request.
500 + * @page: Page containing the data for the request.
501 + * @offset: Byte-offset of the data within the page.
502 + *
503 + * For the requested range, try to write each sector individually. For each
504 + * sector that fails, find the next available remap location and write the
505 + * data to that new location. Then update the table and write both copies
506 + * of the table to disk. Finally, update the in-memory mapping and do any
507 + * other necessary bookkeeping.
508 + **/
509 +static int bbr_io_remap_error(struct bbr_private *bbr_id,
510 + int rw,
511 + u64 starting_lsn,
512 + u64 count,
513 + struct page *page,
514 + unsigned int offset)
515 +{
516 + struct bbr_table *bbr_table;
517 + struct dm_io_region job;
518 + struct page_list pl;
519 + unsigned long table_sector_index;
520 + unsigned long table_sector_offset;
521 + unsigned long index;
522 + u64 lsn, new_lsn;
523 + char b[32];
524 + int rc;
525 +
526 + job.bdev = bbr_id->dev->bdev;
527 + job.count = 1;
528 + pl.page = page;
529 + pl.next = NULL;
530 +
531 + /* For each sector in the request. */
532 + for (lsn = 0; lsn < count; lsn++, offset += SECTOR_SIZE) {
533 + job.sector = starting_lsn + lsn;
534 + rc = io_sync(bbr_id, &pl, offset, &job, rw);
535 + while (rc) {
536 + /* Find the next available relocation sector. */
537 + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
538 + if (new_lsn >= bbr_id->nr_replacement_blks) {
539 + /* No more replacement sectors available. */
540 + return -EIO;
541 + }
542 + new_lsn += bbr_id->start_replacement_sect;
543 +
544 + /* Write the data to its new location. */
545 + DMWARN("device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
546 + format_dev_t(b, bbr_id->dev->bdev->bd_dev),
547 + starting_lsn + lsn, new_lsn);
548 + job.sector = new_lsn;
549 + rc = io_sync(bbr_id, &pl, offset, &job, rw);
550 + if (rc) {
551 + /* This replacement sector is bad.
552 + * Try the next one.
553 + */
554 + DMERR("device %s: replacement sector "PFU64" is bad. Skipping.",
555 + format_dev_t(b, bbr_id->dev->bdev->bd_dev), new_lsn);
556 + atomic_inc(&bbr_id->in_use_replacement_blks);
557 + continue;
558 + }
559 +
560 + /* Add this new entry to the on-disk table. */
561 + table_sector_index = new_lsn -
562 + bbr_id->start_replacement_sect;
563 + table_sector_offset = table_sector_index /
564 + BBR_ENTRIES_PER_SECT;
565 + index = table_sector_index % BBR_ENTRIES_PER_SECT;
566 +
567 + bbr_table = &bbr_id->bbr_table[table_sector_offset];
568 + bbr_table->entries[index].bad_sect = starting_lsn + lsn;
569 + bbr_table->entries[index].replacement_sect = new_lsn;
570 + bbr_table->in_use_cnt++;
571 + bbr_table->sequence_number++;
572 + bbr_table->crc = 0;
573 + bbr_table->crc = calculate_crc(INITIAL_CRC,
574 + bbr_table,
575 + sizeof(struct bbr_table));
576 +
577 + /* Write the table to disk. */
578 + cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
579 + if (bbr_id->lba_table1) {
580 + job.sector = bbr_id->lba_table1 + table_sector_offset;
581 + rc = rw_table(bbr_id, bbr_table, &job, WRITE);
582 + }
583 + if (bbr_id->lba_table2) {
584 + job.sector = bbr_id->lba_table2 + table_sector_offset;
585 + rc |= rw_table(bbr_id, bbr_table, &job, WRITE);
586 + }
587 + le_bbr_table_sector_to_cpu(bbr_table);
588 +
589 + if (rc) {
590 + /* Error writing one of the tables to disk. */
591 + DMERR("device %s: error updating BBR tables on disk.",
592 + format_dev_t(b, bbr_id->dev->bdev->bd_dev));
593 + return rc;
594 + }
595 +
596 + /* Insert a new entry in the remapping binary-tree. */
597 + rc = bbr_insert_remap_entry(bbr_id,
598 + &bbr_table->entries[index]);
599 + if (rc) {
600 + DMERR("device %s: error adding new entry to remap tree.",
601 + format_dev_t(b, bbr_id->dev->bdev->bd_dev));
602 + return rc;
603 + }
604 +
605 + atomic_inc(&bbr_id->in_use_replacement_blks);
606 + }
607 + }
608 +
609 + return 0;
610 +}
611 +
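
To make the table bookkeeping above concrete: assuming start_replacement_sect = 2000 and a relocation to new_lsn = 2047 (illustrative numbers), table_sector_index is 47, so the entry lands in BBR table sector 47 / 31 = 1 (BBR_ENTRIES_PER_SECT is 31) at slot 47 % 31 = 16, and that single updated sector is then rewritten to both on-disk table copies.
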
612 +/**
613 + * bbr_io_process_request
614 + *
615 + * For each sector in this request, check if the sector has already
616 + * been remapped. If so, process all previous sectors in the request,
617 + * followed by the remapped sector. Then reset the starting lsn and
618 + * count, and keep going with the rest of the request as if it were
619 + * a whole new request. If any of the sync_io's return an error,
620 + * call the remapper to relocate the bad sector(s).
621 + *
622 + * 2.5 Note: When switching over to bios for the I/O path, we have made
623 + * the assumption that the I/O request described by the bio is one
624 + * virtually contiguous piece of memory (even though the bio vector
625 + * describes it using a series of physical page addresses).
626 + **/
627 +static int bbr_io_process_request(struct bbr_private *bbr_id,
628 + struct bio *bio)
629 +{
630 + struct dm_io_region job;
631 + u64 starting_lsn = bio->bi_sector;
632 + u64 count, lsn, remapped_lsn;
633 + struct page_list pl;
634 + unsigned int offset;
635 + int i, rw = bio_data_dir(bio);
636 + int rc = 0;
637 +
638 + job.bdev = bbr_id->dev->bdev;
639 + pl.next = NULL;
640 +
641 + /* Each bio can contain multiple vectors, each with a different page.
642 + * Treat each vector as a separate request.
643 + */
644 + /* KMC: Is this the right way to walk the bvec list? */
645 + for (i = 0;
646 + i < bio->bi_vcnt;
647 + i++, bio->bi_idx++, starting_lsn += count) {
648 +
649 + /* Bvec info: number of sectors, page,
650 + * and byte-offset within page.
651 + */
652 + count = bio_iovec(bio)->bv_len >> SECTOR_SHIFT;
653 + pl.page = bio_iovec(bio)->bv_page;
654 + offset = bio_iovec(bio)->bv_offset;
655 +
656 + /* For each sector in this bvec, check if the sector has
657 + * already been remapped. If so, process all previous sectors
658 + * in this request, followed by the remapped sector. Then reset
659 + * the starting lsn and count and keep going with the rest of
660 + * the request as if it were a whole new request.
661 + */
662 + for (lsn = 0; lsn < count; lsn++) {
663 + remapped_lsn = starting_lsn + lsn;
664 + rc = bbr_remap(bbr_id, &remapped_lsn);
665 + if (!rc) {
666 + /* This sector is fine. */
667 + continue;
668 + }
669 +
670 + /* Process all sectors in the request up to this one. */
671 + if (lsn > 0) {
672 + job.sector = starting_lsn;
673 + job.count = lsn;
674 + rc = io_sync(bbr_id, &pl, offset, &job, rw);
675 + if (rc) {
676 + /* If this I/O failed, then one of the
677 + * sectors in this request needs to be
678 + * relocated.
679 + */
680 + rc = bbr_io_remap_error(bbr_id, rw,
681 + starting_lsn,
682 + lsn, pl.page,
683 + offset);
684 + if (rc) {
685 + /* KMC: Return? Or continue to next bvec? */
686 + return rc;
687 + }
688 + }
689 + offset += (lsn << SECTOR_SHIFT);
690 + }
691 +
692 + /* Process the remapped sector. */
693 + job.sector = remapped_lsn;
694 + job.count = 1;
695 + rc = io_sync(bbr_id, &pl, offset, &job, rw);
696 + if (rc) {
697 + /* BUGBUG - Need more processing if this caused
698 + * an error. If this I/O failed, then the
699 + * existing remap is now bad, and we need to
700 + * find a new remap. Can't use
701 + * bbr_io_remap_error(), because the existing
702 + * map entry needs to be changed, not added
703 + * again, and the original table entry also
704 + * needs to be changed.
705 + */
706 + return rc;
707 + }
708 +
709 + starting_lsn += (lsn + 1);
710 + count -= (lsn + 1);
711 + lsn = -1;
712 + offset += SECTOR_SIZE;
713 + }
714 +
715 + /* Check for any remaining sectors after the last split. This
716 + * could potentially be the whole request, but that should be a
717 + * rare case because requests should only be processed by the
718 + * thread if we know an error occurred or they contained one or
719 + * more remapped sectors.
720 + */
721 + if (count) {
722 + job.sector = starting_lsn;
723 + job.count = count;
724 + rc = io_sync(bbr_id, &pl, offset, &job, rw);
725 + if (rc) {
726 + /* If this I/O failed, then one of the sectors
727 + * in this request needs to be relocated.
728 + */
729 + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
730 + count, pl.page, offset);
731 + if (rc) {
732 + /* KMC: Return? Or continue to next bvec? */
733 + return rc;
734 + }
735 + }
736 + }
737 + }
738 +
739 + return 0;
740 +}
741 +
742 +static void bbr_io_process_requests(struct bbr_private *bbr_id,
743 + struct bio *bio)
744 +{
745 + struct bio *next;
746 + int rc;
747 +
748 + while (bio) {
749 + next = bio->bi_next;
750 + bio->bi_next = NULL;
751 +
752 + rc = bbr_io_process_request(bbr_id, bio);
753 +
754 + bio_endio(bio, rc);
755 +
756 + bio = next;
757 + }
758 +}
759 +
760 +/**
761 + * bbr_remap_handler
762 + *
763 + * This is the handler for the bbr work-queue.
764 + *
765 + * I/O requests should only be sent to this handler if we know that:
766 + * a) the request contains at least one remapped sector.
767 + * or
768 + * b) the request caused an error on the normal I/O path.
769 + *
770 + * This function uses synchronous I/O, so sending a request to this
771 + * thread that doesn't need special processing will cause severe
772 + * performance degradation.
773 + **/
774 +static void bbr_remap_handler(struct work_struct *work)
775 +{
776 + struct bbr_private *bbr_id =
777 + container_of(work, struct bbr_private, remap_work);
778 + struct bio *bio;
779 + unsigned long flags;
780 +
781 + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
782 + bio = bio_list_get(&bbr_id->remap_ios);
783 + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
784 +
785 + bbr_io_process_requests(bbr_id, bio);
786 +}
787 +
788 +/**
789 + * bbr_endio
790 + *
791 + * This is the callback for normal write requests. Check for an error
792 + * during the I/O, and send to the thread for processing if necessary.
793 + **/
794 +static int bbr_endio(struct dm_target *ti, struct bio *bio,
795 + int error, union map_info *map_context)
796 +{
797 + struct bbr_private *bbr_id = ti->private;
798 + struct dm_bio_details *bbr_io = map_context->ptr;
799 +
800 + if (error && bbr_io) {
801 + unsigned long flags;
802 + char b[32];
803 +
804 + dm_bio_restore(bbr_io, bio);
805 + map_context->ptr = NULL;
806 +
807 + DMERR("device %s: I/O failure on sector %lu. "
808 + "Scheduling for retry.",
809 + format_dev_t(b, bbr_id->dev->bdev->bd_dev),
810 + (unsigned long)bio->bi_sector);
811 +
812 + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
813 + bio_list_add(&bbr_id->remap_ios, bio);
814 + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
815 +
816 + queue_work(dm_bbr_wq, &bbr_id->remap_work);
817 +
818 + error = 1;
819 + }
820 +
821 + if (bbr_io)
822 + mempool_free(bbr_io, bbr_io_pool);
823 +
824 + return error;
825 +}
826 +
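
A note on the return value: in the 2.6 device-mapper API, returning 1 from an end_io hook (DM_ENDIO_INCOMPLETE) tells the core that the target has taken ownership of the bio and will complete it itself, which here happens later on the remap work queue.
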
827 +/**
828 + * Construct a bbr mapping
829 + **/
830 +static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
831 +{
832 + struct bbr_private *bbr_id;
833 + unsigned long block_size;
834 + char *end;
835 + int rc = -EINVAL;
836 +
837 + if (argc != 8) {
838 + ti->error = "dm-bbr requires exactly 8 arguments: "
839 + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
840 + goto out1;
841 + }
842 +
843 + bbr_id = bbr_alloc_private();
844 + if (!bbr_id) {
845 + ti->error = "dm-bbr: Error allocating bbr private data.";
846 + goto out1;
847 + }
848 +
849 + bbr_id->offset = simple_strtoull(argv[1], &end, 10);
850 + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
851 + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
852 + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
853 + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
854 + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
855 + block_size = simple_strtoul(argv[7], &end, 10);
856 + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
857 +
858 + bbr_id->vma_io_req.mem.type = DM_IO_VMA;
859 + bbr_id->vma_io_req.client = dm_io_client_create(1);
860 + if (IS_ERR(bbr_id->vma_io_req.client)) {
861 + rc = PTR_ERR(bbr_id->vma_io_req.client);
862 + DMWARN("couldn't allocate disk VMA io client");
863 + goto out2;
864 + }
865 +
866 + bbr_id->page_io_req.mem.type = DM_IO_PAGE_LIST;
867 + bbr_id->page_io_req.client = dm_io_client_create(1);
868 + if (IS_ERR(bbr_id->page_io_req.client)) {
869 + rc = PTR_ERR(bbr_id->page_io_req.client);
870 + DMWARN("couldn't allocate pagelist io client");
871 + goto out3;
872 + }
873 +
874 + bbr_id->bbr_table = vmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT);
875 + if (!bbr_id->bbr_table) {
876 + ti->error = "dm-bbr: Error allocating bbr table.";
877 + goto out4;
878 + }
879 +
880 + if (dm_get_device(ti, argv[0], 0, ti->len,
881 + dm_table_get_mode(ti->table), &bbr_id->dev)) {
882 + ti->error = "dm-bbr: Device lookup failed";
883 + goto out4;
884 + }
885 +
886 + rc = bbr_setup(bbr_id);
887 + if (rc) {
888 + ti->error = "dm-bbr: Device setup failed";
889 + goto out5;
890 + }
891 +
892 + ti->private = bbr_id;
893 + return 0;
894 +
895 +out5:
896 + dm_put_device(ti, bbr_id->dev);
897 +out4:
898 + dm_io_client_destroy(bbr_id->page_io_req.client);
899 +out3:
900 + dm_io_client_destroy(bbr_id->vma_io_req.client);
901 +out2:
902 + bbr_free_private(bbr_id);
903 +out1:
904 + return rc;
905 +}
906 +
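
For reference, the eight constructor arguments map directly onto a dmsetup table line. A purely hypothetical layout (a 1 GiB data area at offset 0, with both table copies and the replacement pool placed right after it; every number is illustrative):

    dmsetup create bbr0 --table "0 2097152 bbr /dev/sdb1 0 2097152 2097215 63 2097278 500 4096"

That is: device /dev/sdb1, data offset 0, primary table at sector 2097152, secondary table at 2097215, 63 table sectors (31 entries each), replacement blocks starting at sector 2097278, 500 of them, 4096 bytes per block (8 sectors, matching the block_size >> SECTOR_SHIFT conversion above).
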
907 +static void bbr_dtr(struct dm_target *ti)
908 +{
909 + struct bbr_private *bbr_id = ti->private;
910 +
911 + dm_put_device(ti, bbr_id->dev);
912 + dm_io_client_destroy(bbr_id->page_io_req.client);
913 + dm_io_client_destroy(bbr_id->vma_io_req.client);
914 + bbr_free_private(bbr_id);
915 +}
916 +
917 +static int bbr_map(struct dm_target *ti, struct bio *bio,
918 + union map_info *map_context)
919 +{
920 + struct bbr_private *bbr_id = ti->private;
921 + struct dm_bio_details *bbr_io;
922 + unsigned long flags;
923 + int rc = 1;
924 +
925 + bio->bi_sector += bbr_id->offset;
926 +
927 + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
928 + !bbr_remap_probe(bbr_id, bio->bi_sector, bio_sectors(bio))) {
929 + /* No existing remaps or this request doesn't
930 + * contain any remapped sectors.
931 + */
932 + bio->bi_bdev = bbr_id->dev->bdev;
933 +
934 + bbr_io = mempool_alloc(bbr_io_pool, GFP_NOIO);
935 + dm_bio_record(bbr_io, bio);
936 + map_context->ptr = bbr_io;
937 + } else {
938 + /* This request has at least one remapped sector.
939 + * Give it to the work-queue for processing.
940 + */
941 + map_context->ptr = NULL;
942 + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
943 + bio_list_add(&bbr_id->remap_ios, bio);
944 + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
945 +
946 + queue_work(dm_bbr_wq, &bbr_id->remap_work);
947 + rc = 0;
948 + }
949 +
950 + return rc;
951 +}
952 +
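
The return codes follow the dm map convention: 1 (DM_MAPIO_REMAPPED) tells the core to dispatch the re-targeted bio itself, while 0 (DM_MAPIO_SUBMITTED) means the target has queued the bio, here onto the remap work queue.
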
953 +static int bbr_status(struct dm_target *ti, status_type_t type,
954 + char *result, unsigned int maxlen)
955 +{
956 + struct bbr_private *bbr_id = ti->private;
957 + char b[BDEVNAME_SIZE];
958 +
959 + switch (type) {
960 + case STATUSTYPE_INFO:
961 + result[0] = '\0';
962 + break;
963 +
964 + case STATUSTYPE_TABLE:
965 + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
966 + format_dev_t(b, bbr_id->dev->bdev->bd_dev),
967 + bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
968 + bbr_id->nr_sects_bbr_table,
969 + bbr_id->start_replacement_sect,
970 + bbr_id->nr_replacement_blks,
971 + bbr_id->blksize_in_sects << SECTOR_SHIFT);
972 + break;
973 + }
974 + return 0;
975 +}
976 +
977 +static struct target_type bbr_target = {
978 + .name = "bbr",
979 + .version = {1, 0, 1},
980 + .module = THIS_MODULE,
981 + .ctr = bbr_ctr,
982 + .dtr = bbr_dtr,
983 + .map = bbr_map,
984 + .end_io = bbr_endio,
985 + .status = bbr_status,
986 +};
987 +
988 +int __init dm_bbr_init(void)
989 +{
990 + int rc;
991 +
992 + rc = dm_register_target(&bbr_target);
993 + if (rc) {
994 + DMERR("error registering target.");
995 + goto err1;
996 + }
997 +
998 + bbr_remap_cache = kmem_cache_create("bbr-remap",
999 + sizeof(struct bbr_runtime_remap),
1000 + 0, SLAB_HWCACHE_ALIGN, NULL);
1001 + if (!bbr_remap_cache) {
1002 + DMERR("error creating remap cache.");
1003 + rc = -ENOMEM;
1004 + goto err2;
1005 + }
1006 +
1007 + bbr_io_cache = kmem_cache_create("bbr-io", sizeof(struct dm_bio_details),
1008 + 0, SLAB_HWCACHE_ALIGN, NULL);
1009 + if (!bbr_io_cache) {
1010 + DMERR("error creating io cache.");
1011 + rc = -ENOMEM;
1012 + goto err3;
1013 + }
1014 +
1015 + bbr_io_pool = mempool_create(256, mempool_alloc_slab,
1016 + mempool_free_slab, bbr_io_cache);
1017 + if (!bbr_io_pool) {
1018 + DMERR("error creating io mempool.");
1019 + rc = -ENOMEM;
1020 + goto err4;
1021 + }
1022 +
1023 + dm_bbr_wq = create_workqueue("dm-bbr");
1024 + if (!dm_bbr_wq) {
1025 + DMERR("error creating work-queue.");
1026 + rc = -ENOMEM;
1027 + goto err5;
1028 + }
1029 +
1030 + return 0;
1031 +
1032 +err5:
1033 + mempool_destroy(bbr_io_pool);
1034 +err4:
1035 + kmem_cache_destroy(bbr_io_cache);
1036 +err3:
1037 + kmem_cache_destroy(bbr_remap_cache);
1038 +err2:
1039 + dm_unregister_target(&bbr_target);
1040 +err1:
1041 + return rc;
1042 +}
1043 +
1044 +void __exit dm_bbr_exit(void)
1045 +{
1046 + destroy_workqueue(dm_bbr_wq);
1047 + mempool_destroy(bbr_io_pool);
1048 + kmem_cache_destroy(bbr_io_cache);
1049 + kmem_cache_destroy(bbr_remap_cache);
1050 + dm_unregister_target(&bbr_target);
1051 +}
1052 +
1053 +module_init(dm_bbr_init);
1054 +module_exit(dm_bbr_exit);
1055 +MODULE_LICENSE("GPL");
1056 Index: linux-2.6.26-gentoo/drivers/md/dm-bbr.h
1057 ===================================================================
1058 --- /dev/null
1059 +++ linux-2.6.26-gentoo/drivers/md/dm-bbr.h
1060 @@ -0,0 +1,130 @@
1061 +/*
1062 + * (C) Copyright IBM Corp. 2002, 2004
1063 + *
1064 + * This program is free software; you can redistribute it and/or modify
1065 + * it under the terms of the GNU General Public License as published by
1066 + * the Free Software Foundation; either version 2 of the License, or
1067 + * (at your option) any later version.
1068 + *
1069 + * This program is distributed in the hope that it will be useful,
1070 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1071 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1072 + * the GNU General Public License for more details.
1073 + *
1074 + * You should have received a copy of the GNU General Public License
1075 + * along with this program; if not, write to the Free Software
1076 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1077 + *
1078 + * linux/drivers/md/dm-bbr.h
1079 + *
1080 + * Bad-block-relocation (BBR) target for device-mapper.
1081 + *
1082 + * The BBR target is designed to remap I/O write failures to another safe
1083 + * location on disk. Note that most disk drives have BBR built into them;
1084 + * this means that our software BBR will only be activated when all hardware
1085 + * BBR replacement sectors have been used.
1086 + */
1087 +
1088 +#include <linux/dm-io.h>
1089 +
1090 +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
1091 +#define BBR_ENTRIES_PER_SECT 31
1092 +#define INITIAL_CRC 0xFFFFFFFF
1093 +#define CRC_POLYNOMIAL 0xEDB88320L
1094 +
1095 +/**
1096 + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
1097 + * Use these in place of %Ld, %Lu, and %Lx.
1098 + **/
1099 +#if BITS_PER_LONG > 32
1100 +#define PFU64 "%llu"
1101 +#else
1102 +#define PFU64 "%Lu"
1103 +#endif
1104 +
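
For example, DMWARN("bad sector "PFU64, lsn) prints a u64 portably on both 32- and 64-bit builds; dm-bbr.c above uses exactly this pattern in its remap messages.
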
1105 +/**
1106 + * struct bbr_table_entry
1107 + * @bad_sect: LBA of bad location.
1108 + * @replacement_sect: LBA of new location.
1109 + *
1110 + * Structure to describe one BBR remap.
1111 + **/
1112 +struct bbr_table_entry {
1113 + u64 bad_sect;
1114 + u64 replacement_sect;
1115 +};
1116 +
1117 +/**
1118 + * struct bbr_table
1119 + * @signature: Signature on each BBR table sector.
1120 + * @crc: CRC for this table sector.
1121 + * @sequence_number: Used to resolve conflicts when primary and secondary
1122 + * tables do not match.
1123 + * @in_use_cnt: Number of in-use table entries.
1124 + * @entries: Actual table of remaps.
1125 + *
1126 + * Structure to describe each sector of the metadata table. Each sector in this
1127 + * table can describe 31 remapped sectors.
1128 + **/
1129 +struct bbr_table {
1130 + u32 signature;
1131 + u32 crc;
1132 + u32 sequence_number;
1133 + u32 in_use_cnt;
1134 + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
1135 +};
1136 +
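
The layout works out to exactly one 512-byte sector: a 16-byte header plus 31 entries of 16 bytes each. A compile-time check of that invariant, as a hypothetical C11 userspace sketch with the kernel types swapped for stdint ones:

    #include <assert.h>
    #include <stdint.h>

    struct entry { uint64_t bad_sect, replacement_sect; };    /* 16 bytes */

    struct table {
        uint32_t signature, crc, sequence_number, in_use_cnt; /* 16-byte header */
        struct entry entries[31];                             /* BBR_ENTRIES_PER_SECT */
    };

    static_assert(sizeof(struct table) == 512,
                  "a bbr_table must fill one 512-byte sector exactly");
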
1137 +/**
1138 + * struct bbr_runtime_remap
1139 + *
1140 + * Node in the binary tree used to keep track of remaps.
1141 + **/
1142 +struct bbr_runtime_remap {
1143 + struct bbr_table_entry remap;
1144 + struct bbr_runtime_remap *left;
1145 + struct bbr_runtime_remap *right;
1146 +};
1147 +
1148 +/**
1149 + * struct bbr_private
1150 + * @dev: Info about underlying device.
1151 + * @bbr_table: Copy of metadata table.
1152 + * @remap_root: Binary tree containing all remaps.
1153 + * @remap_root_lock: Lock for the binary tree.
1154 + * @remap_work: For adding work items to the work-queue.
1155 + * @remap_ios: List of I/Os for the work-queue to handle.
1156 + * @remap_ios_lock: Lock for the remap_ios list.
1157 + * @offset: LBA of data area.
1158 + * @lba_table1: LBA of primary BBR table.
1159 + * @lba_table2: LBA of secondary BBR table.
1160 + * @nr_sects_bbr_table: Size of each BBR table.
1161 + * @nr_replacement_blks: Number of replacement blocks.
1162 + * @start_replacement_sect: LBA of start of replacement blocks.
1163 + * @blksize_in_sects: Size of each block.
1164 + * @in_use_replacement_blks: Current number of remapped blocks.
1165 + *
1166 + * Private data for each BBR target.
1167 + **/
1168 +struct bbr_private {
1169 + struct dm_dev *dev;
1170 + struct bbr_table *bbr_table;
1171 + struct bbr_runtime_remap *remap_root;
1172 + spinlock_t remap_root_lock;
1173 +
1174 + struct dm_io_request vma_io_req;
1175 + struct dm_io_request page_io_req;
1176 +
1177 + struct work_struct remap_work;
1178 + struct bio_list remap_ios;
1179 + spinlock_t remap_ios_lock;
1180 +
1181 + u64 offset;
1182 + u64 lba_table1;
1183 + u64 lba_table2;
1184 + u64 nr_sects_bbr_table;
1185 + u64 start_replacement_sect;
1186 + u64 nr_replacement_blks;
1187 + u32 blksize_in_sects;
1188 + atomic_t in_use_replacement_blks;
1189 +};
1190 +
