/[linux-patches]/hardened/2.6/trunk/2.6.23/4105_dm-bbr.patch

Revision 1157 (Fri Oct 12 23:22:36 2007 UTC) by phreak
File size: 32532 bytes
Initial patchset for 2.6.23.
BBR Target, updated by dsd@gentoo.org

Incomplete changelog:
2007/07/08: updated for new API in 2.6.22

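Usage sketch (illustrative; not part of the original patch description): the
target's dmsetup table line takes eight arguments (device, data offset,
primary table LBA, secondary table LBA, table size in sectors, first
replacement sector, number of replacement blocks, and block size in bytes).
All sector numbers below are hypothetical:

  # hypothetical layout: 1 GiB data area, two 33-sector tables, spare blocks
  echo "0 2097152 bbr /dev/sdb 0 2097153 2097186 33 2097219 1023 4096" | dmsetup create bbr0
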
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -276,4 +276,15 @@ config DM_DELAY
 
 	If unsure, say N.
 
+config BLK_DEV_DM_BBR
+	tristate "Bad Block Relocation Device Target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	  Support for devices with software-based bad-block-relocation.
+
+	  To compile this as a module, choose M here: the module will be
+	  called dm-bbr.
+
+	  If unsure, say N.
+
 endif # MD
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_DM_MULTIPATH_RDAC) += dm-rd
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
+obj-$(CONFIG_BLK_DEV_DM_BBR)	+= dm-bbr.o
 
 quiet_cmd_unroll = UNROLL $@
   cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
--- /dev/null
+++ b/drivers/md/dm-bbr.c
@@ -0,0 +1,1012 @@
+/*
+ * (C) Copyright IBM Corp. 2002, 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * linux/drivers/md/dm-bbr.c
+ *
+ * Bad-block-relocation (BBR) target for device-mapper.
+ *
+ * The BBR target is designed to remap I/O write failures to another safe
+ * location on disk. Note that most disk drives have BBR built into them;
+ * this means that our software BBR will only be activated when all hardware
+ * BBR replacement sectors have been used.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/workqueue.h>
+#include <linux/vmalloc.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "dm-bio-record.h"
+#include "dm-bbr.h"
+#include "dm-io.h"
+
+#define DM_MSG_PREFIX "bbr"
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+
+static struct workqueue_struct *dm_bbr_wq = NULL;
+static void bbr_remap_handler(struct work_struct *work);
+static struct kmem_cache *bbr_remap_cache;
+static struct kmem_cache *bbr_io_cache;
+static mempool_t *bbr_io_pool;
+
+/**
+ * bbr_binary_tree_destroy
+ *
+ * Destroy the binary tree.
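+ *
+ * (Editorial note: each pass of the loop below walks from the root down to
+ * a leaf, frees that leaf, and restarts from the root. Worst case this is
+ * O(n * height), but it avoids recursion and unbounded stack use, and it
+ * only runs at destructor time.)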
+ **/
+static void bbr_binary_tree_destroy(struct bbr_runtime_remap *root)
+{
+	struct bbr_runtime_remap **link = NULL;
+	struct bbr_runtime_remap *node = root;
+
+	while (node) {
+		if (node->left) {
+			link = &node->left;
+			node = node->left;
+			continue;
+		}
+		if (node->right) {
+			link = &node->right;
+			node = node->right;
+			continue;
+		}
+
+		kmem_cache_free(bbr_remap_cache, node);
+		if (node == root) {
+			/* If root is deleted, we're done. */
+			break;
+		}
+
+		/* Back to root. */
+		node = root;
+		*link = NULL;
+	}
+}
+
+static void bbr_free_remap(struct bbr_private *bbr_id)
+{
+	spin_lock_irq(&bbr_id->remap_root_lock);
+	bbr_binary_tree_destroy(bbr_id->remap_root);
+	bbr_id->remap_root = NULL;
+	spin_unlock_irq(&bbr_id->remap_root_lock);
+}
+
+static struct bbr_private *bbr_alloc_private(void)
+{
+	struct bbr_private *bbr_id;
+
+	bbr_id = kzalloc(sizeof(*bbr_id), GFP_KERNEL);
+	if (bbr_id == NULL)
+		return NULL;
+
+	INIT_WORK(&bbr_id->remap_work, bbr_remap_handler);
+	spin_lock_init(&bbr_id->remap_root_lock);
+	spin_lock_init(&bbr_id->remap_ios_lock);
+	bbr_id->in_use_replacement_blks = (atomic_t) ATOMIC_INIT(0);
+
+	return bbr_id;
+}
+
+static void bbr_free_private(struct bbr_private *bbr_id)
+{
+	vfree(bbr_id->bbr_table);
+	bbr_free_remap(bbr_id);
+	kfree(bbr_id);
+}
+
+static u32 crc_table[256];
+static u32 crc_table_built = 0;
+
+static void build_crc_table(void)
+{
+	u32 i, j, crc;
+
+	for (i = 0; i <= 255; i++) {
+		crc = i;
+		for (j = 8; j > 0; j--) {
+			if (crc & 1)
+				crc = (crc >> 1) ^ CRC_POLYNOMIAL;
+			else
+				crc >>= 1;
+		}
+		crc_table[i] = crc;
+	}
+	crc_table_built = 1;
+}
+
+static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
+{
+	unsigned char *current_byte;
+	u32 temp1, temp2, i;
+
+	current_byte = (unsigned char *) buffer;
+	/* Make sure the crc table is available */
+	if (!crc_table_built)
+		build_crc_table();
+	/* Process each byte in the buffer. */
+	for (i = 0; i < buffersize; i++) {
+		temp1 = (crc >> 8) & 0x00FFFFFF;
+		temp2 = crc_table[(crc ^ (u32)*current_byte) & (u32) 0xff];
+		current_byte++;
+		crc = temp1 ^ temp2;
+	}
+	return crc;
+}
+
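+/*
+ * (Editorial note: build_crc_table()/calculate_crc() implement the standard
+ * reflected CRC-32 (polynomial 0xEDB88320). Seeded with INITIAL_CRC and with
+ * no final XOR, calculate_crc(INITIAL_CRC, buf, len) appears to match the
+ * kernel's crc32_le(0xFFFFFFFF, buf, len); the local copy keeps the on-disk
+ * format self-contained.)
+ */
+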
+/**
+ * le_bbr_table_sector_to_cpu
+ *
+ * Convert bbr meta data from on-disk (LE) format
+ * to the native cpu endian format.
+ **/
+static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
+{
+	int i;
+	p->signature = le32_to_cpup(&p->signature);
+	p->crc = le32_to_cpup(&p->crc);
+	p->sequence_number = le32_to_cpup(&p->sequence_number);
+	p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
+	for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
+		p->entries[i].bad_sect =
+			le64_to_cpup(&p->entries[i].bad_sect);
+		p->entries[i].replacement_sect =
+			le64_to_cpup(&p->entries[i].replacement_sect);
+	}
+}
+
+/**
+ * cpu_bbr_table_sector_to_le
+ *
+ * Convert bbr meta data from cpu endian format to on-disk (LE) format
+ **/
+static void cpu_bbr_table_sector_to_le(struct bbr_table *p,
+				       struct bbr_table *le)
+{
+	int i;
+	le->signature = cpu_to_le32p(&p->signature);
+	le->crc = cpu_to_le32p(&p->crc);
+	le->sequence_number = cpu_to_le32p(&p->sequence_number);
+	le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
+	for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
+		le->entries[i].bad_sect =
+			cpu_to_le64p(&p->entries[i].bad_sect);
+		le->entries[i].replacement_sect =
+			cpu_to_le64p(&p->entries[i].replacement_sect);
+	}
+}
+
+/**
+ * validate_bbr_table_sector
+ *
+ * Check the specified BBR table sector for a valid signature and CRC. If it's
+ * valid, endian-convert the table sector.
+ **/
+static int validate_bbr_table_sector(struct bbr_table *p)
+{
+	int org_crc, final_crc;
+
+	if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
+		DMERR("BBR table signature doesn't match!");
+		DMERR("Found 0x%x. Expecting 0x%x",
+		      le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
+		return -EINVAL;
+	}
+
+	if (!p->crc) {
+		DMERR("BBR table sector has no CRC!");
+		return -EINVAL;
+	}
+
+	org_crc = le32_to_cpup(&p->crc);
+	p->crc = 0;
+	final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
+	if (final_crc != org_crc) {
+		DMERR("CRC failed!");
+		DMERR("Found 0x%x. Expecting 0x%x",
+		      org_crc, final_crc);
+		return -EINVAL;
+	}
+
+	p->crc = cpu_to_le32p(&org_crc);
+	le_bbr_table_sector_to_cpu(p);
+
+	return 0;
+}
+
+/**
+ * bbr_binary_tree_insert
+ *
+ * Insert a node into the binary tree.
+ **/
+static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
+				   struct bbr_runtime_remap *newnode)
+{
+	struct bbr_runtime_remap **node = root;
+	while (node && *node) {
+		node = (newnode->remap.bad_sect > (*node)->remap.bad_sect) ?
+			&(*node)->right : &(*node)->left;
+	}
+
+	newnode->left = newnode->right = NULL;
+	*node = newnode;
+}
+
+/**
+ * bbr_binary_search
+ *
+ * Search for a node that contains bad_sect == lsn.
+ **/
+static struct bbr_runtime_remap *bbr_binary_search(
+	struct bbr_runtime_remap *root,
+	u64 lsn)
+{
+	struct bbr_runtime_remap *node = root;
+	while (node) {
+		if (node->remap.bad_sect == lsn)
+			break;
+
+		node = (lsn > node->remap.bad_sect) ? node->right : node->left;
+	}
+	return node;
+}
+
+/**
+ * bbr_insert_remap_entry
+ *
+ * Create a new remap entry and add it to the binary tree for this node.
+ **/
+static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
+				  struct bbr_table_entry *new_bbr_entry)
+{
+	struct bbr_runtime_remap *newnode;
+
+	newnode = kmem_cache_alloc(bbr_remap_cache, GFP_NOIO);
+	if (!newnode) {
+		DMERR("Could not allocate from remap cache!");
+		return -ENOMEM;
+	}
+	newnode->remap.bad_sect = new_bbr_entry->bad_sect;
+	newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
+	spin_lock_irq(&bbr_id->remap_root_lock);
+	bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
+	spin_unlock_irq(&bbr_id->remap_root_lock);
+	return 0;
+}
+
+/**
+ * bbr_table_to_remap_list
+ *
+ * The on-disk bbr table is sorted by the replacement sector LBA. In order to
+ * improve run time performance, the in memory remap list must be sorted by
+ * the bad sector LBA. This function is called at discovery time to initialize
+ * the remap list. This function assumes that at least one copy of meta data
+ * is valid.
+ **/
+static u32 bbr_table_to_remap_list(struct bbr_private *bbr_id)
+{
+	u32 in_use_blks = 0;
+	int i, j;
+	struct bbr_table *p;
+
+	for (i = 0, p = bbr_id->bbr_table;
+	     i < bbr_id->nr_sects_bbr_table;
+	     i++, p++) {
+		if (!p->in_use_cnt)
+			break;
+
+		in_use_blks += p->in_use_cnt;
+		for (j = 0; j < p->in_use_cnt; j++)
+			bbr_insert_remap_entry(bbr_id, &p->entries[j]);
+	}
+	if (in_use_blks) {
+		char b[32];
+		DMWARN("There are %u BBR entries for device %s",
+		       in_use_blks, format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+	}
+
+	return in_use_blks;
+}
+
+/**
+ * bbr_search_remap_entry
+ *
+ * Search remap entry for the specified sector. If found, return a pointer to
+ * the table entry. Otherwise, return NULL.
+ **/
+static struct bbr_table_entry *bbr_search_remap_entry(
+	struct bbr_private *bbr_id,
+	u64 lsn)
+{
+	struct bbr_runtime_remap *p;
+
+	spin_lock_irq(&bbr_id->remap_root_lock);
+	p = bbr_binary_search(bbr_id->remap_root, lsn);
+	spin_unlock_irq(&bbr_id->remap_root_lock);
+	return (p) ? &p->remap : NULL;
+}
+
+/**
+ * bbr_remap
+ *
+ * If *lsn is in the remap table, return TRUE and modify *lsn;
+ * otherwise, return FALSE.
+ **/
+static int bbr_remap(struct bbr_private *bbr_id,
+		     u64 *lsn)
+{
+	struct bbr_table_entry *e;
+
+	if (atomic_read(&bbr_id->in_use_replacement_blks)) {
+		e = bbr_search_remap_entry(bbr_id, *lsn);
+		if (e) {
+			*lsn = e->replacement_sect;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/**
+ * bbr_remap_probe
+ *
+ * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
+ * table, return TRUE; otherwise, return FALSE.
+ **/
+static int bbr_remap_probe(struct bbr_private *bbr_id,
+			   u64 lsn, u64 nr_sects)
+{
+	u64 tmp, cnt;
+
+	if (atomic_read(&bbr_id->in_use_replacement_blks)) {
+		for (cnt = 0, tmp = lsn;
+		     cnt < nr_sects;
+		     cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
+			if (bbr_remap(bbr_id, &tmp))
+				return 1;
+		}
+	}
+	return 0;
+}
+
+static int rw_table(struct bbr_private *bbr_id, void *vma,
+		    struct io_region *ptr, int rw)
+{
+	bbr_id->vma_io_req.bi_rw = rw;
+	bbr_id->vma_io_req.mem.ptr.vma = vma;
+	bbr_id->vma_io_req.notify.fn = NULL;
+
+	return dm_io(&bbr_id->vma_io_req, 1, ptr, NULL);
+}
+
+static int io_sync(struct bbr_private *bbr_id, struct page_list *pl,
+		   unsigned offset, struct io_region *ptr, int rw)
+{
+	bbr_id->page_io_req.bi_rw = rw;
+	bbr_id->page_io_req.mem.ptr.pl = pl;
+	bbr_id->page_io_req.mem.offset = offset;
+	bbr_id->page_io_req.notify.fn = NULL;
+
+	return dm_io(&bbr_id->page_io_req, 1, ptr, NULL);
+}
+
+/**
+ * bbr_setup
+ *
+ * Read the remap tables from disk and set up the initial remap tree.
+ **/
+static int bbr_setup(struct bbr_private *bbr_id)
+{
+	struct bbr_table *table = bbr_id->bbr_table;
+	struct io_region job;
+	int i, rc = 0;
+
+	job.bdev = bbr_id->dev->bdev;
+	job.count = 1;
+
+	/* Read and verify each BBR table sector individually. */
+	for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
+		job.sector = bbr_id->lba_table1 + i;
+		rc = rw_table(bbr_id, table, &job, READ);
+		if (rc && bbr_id->lba_table2) {
+			job.sector = bbr_id->lba_table2 + i;
+			rc = rw_table(bbr_id, table, &job, READ);
+		}
+		if (rc)
+			goto out;
+
+		rc = validate_bbr_table_sector(table);
+		if (rc)
+			goto out;
+	}
+	atomic_set(&bbr_id->in_use_replacement_blks,
+		   bbr_table_to_remap_list(bbr_id));
+
+out:
+	if (rc)
+		DMERR("error during device setup: %d", rc);
+	return rc;
+}
+
+/**
+ * bbr_io_remap_error
+ * @bbr_id: Private data for the BBR node.
+ * @rw: READ or WRITE.
+ * @starting_lsn: Starting sector of request to remap.
+ * @count: Number of sectors in the request.
+ * @page: Page containing the data for the request.
+ * @offset: Byte-offset of the data within the page.
+ *
+ * For the requested range, try to write each sector individually. For each
+ * sector that fails, find the next available remap location and write the
+ * data to that new location. Then update the table and write both copies
+ * of the table to disk. Finally, update the in-memory mapping and do any
+ * other necessary bookkeeping.
+ **/
+static int bbr_io_remap_error(struct bbr_private *bbr_id,
+			      int rw,
+			      u64 starting_lsn,
+			      u64 count,
+			      struct page *page,
+			      unsigned int offset)
+{
+	struct bbr_table *bbr_table;
+	struct io_region job;
+	struct page_list pl;
+	unsigned long table_sector_index;
+	unsigned long table_sector_offset;
+	unsigned long index;
+	u64 lsn, new_lsn;
+	char b[32];
+	int rc;
+
+	job.bdev = bbr_id->dev->bdev;
+	job.count = 1;
+	pl.page = page;
+	pl.next = NULL;
+
+	/* For each sector in the request. */
+	for (lsn = 0; lsn < count; lsn++, offset += SECTOR_SIZE) {
+		job.sector = starting_lsn + lsn;
+		rc = io_sync(bbr_id, &pl, offset, &job, rw);
+		while (rc) {
+			/* Find the next available relocation sector. */
+			new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
+			if (new_lsn >= bbr_id->nr_replacement_blks) {
+				/* No more replacement sectors available. */
+				return -EIO;
+			}
+			new_lsn += bbr_id->start_replacement_sect;
+
+			/* Write the data to its new location. */
+			DMWARN("device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
+			       format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+			       starting_lsn + lsn, new_lsn);
+			job.sector = new_lsn;
+			rc = io_sync(bbr_id, &pl, offset, &job, rw);
+			if (rc) {
+				/* This replacement sector is bad.
+				 * Try the next one.
+				 */
+				DMERR("device %s: replacement sector "PFU64" is bad. Skipping.",
+				      format_dev_t(b, bbr_id->dev->bdev->bd_dev), new_lsn);
+				atomic_inc(&bbr_id->in_use_replacement_blks);
+				continue;
+			}
+
+			/* Add this new entry to the on-disk table. */
+			table_sector_index = new_lsn -
+					     bbr_id->start_replacement_sect;
+			table_sector_offset = table_sector_index /
+					      BBR_ENTRIES_PER_SECT;
+			index = table_sector_index % BBR_ENTRIES_PER_SECT;
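+			/* Example (illustrative values): replacement index 40
+			 * lands in table sector 40 / 31 = 1, entry 40 % 31 = 9.
+			 */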
+
+			bbr_table = &bbr_id->bbr_table[table_sector_offset];
+			bbr_table->entries[index].bad_sect = starting_lsn + lsn;
+			bbr_table->entries[index].replacement_sect = new_lsn;
+			bbr_table->in_use_cnt++;
+			bbr_table->sequence_number++;
+			bbr_table->crc = 0;
+			bbr_table->crc = calculate_crc(INITIAL_CRC,
+						       bbr_table,
+						       sizeof(struct bbr_table));
+
+			/* Write the table to disk. */
+			cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
+			if (bbr_id->lba_table1) {
+				job.sector = bbr_id->lba_table1 + table_sector_offset;
+				rc = rw_table(bbr_id, bbr_table, &job, WRITE);
+			}
+			if (bbr_id->lba_table2) {
+				job.sector = bbr_id->lba_table2 + table_sector_offset;
+				rc |= rw_table(bbr_id, bbr_table, &job, WRITE);
+			}
+			le_bbr_table_sector_to_cpu(bbr_table);
+
+			if (rc) {
+				/* Error writing one of the tables to disk. */
+				DMERR("device %s: error updating BBR tables on disk.",
+				      format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+				return rc;
+			}
+
+			/* Insert a new entry in the remapping binary-tree. */
+			rc = bbr_insert_remap_entry(bbr_id,
+						    &bbr_table->entries[index]);
+			if (rc) {
+				DMERR("device %s: error adding new entry to remap tree.",
+				      format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+				return rc;
+			}
+
+			atomic_inc(&bbr_id->in_use_replacement_blks);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * bbr_io_process_request
+ *
+ * For each sector in this request, check if the sector has already
+ * been remapped. If so, process all previous sectors in the request,
+ * followed by the remapped sector. Then reset the starting lsn and
+ * count, and keep going with the rest of the request as if it were
+ * a whole new request. If any of the sync_io's return an error,
+ * call the remapper to relocate the bad sector(s).
+ *
+ * 2.5 Note: When switching over to bio's for the I/O path, we have made
+ * the assumption that the I/O request described by the bio is one
+ * virtually contiguous piece of memory (even though the bio vector
+ * describes it using a series of physical page addresses).
+ **/
+static int bbr_io_process_request(struct bbr_private *bbr_id,
+				  struct bio *bio)
+{
+	struct io_region job;
+	u64 starting_lsn = bio->bi_sector;
+	u64 count, lsn, remapped_lsn;
+	struct page_list pl;
+	unsigned int offset;
+	int i, rw = bio_data_dir(bio);
+	int rc = 0;
+
+	job.bdev = bbr_id->dev->bdev;
+	pl.next = NULL;
+
+	/* Each bio can contain multiple vectors, each with a different page.
+	 * Treat each vector as a separate request.
+	 */
+	/* KMC: Is this the right way to walk the bvec list? */
+	for (i = 0;
+	     i < bio->bi_vcnt;
+	     i++, bio->bi_idx++, starting_lsn += count) {
+
+		/* Bvec info: number of sectors, page,
+		 * and byte-offset within page.
+		 */
+		count = bio_iovec(bio)->bv_len >> SECTOR_SHIFT;
+		pl.page = bio_iovec(bio)->bv_page;
+		offset = bio_iovec(bio)->bv_offset;
+
+		/* For each sector in this bvec, check if the sector has
+		 * already been remapped. If so, process all previous sectors
+		 * in this request, followed by the remapped sector. Then reset
+		 * the starting lsn and count and keep going with the rest of
+		 * the request as if it were a whole new request.
+		 */
+		for (lsn = 0; lsn < count; lsn++) {
+			remapped_lsn = starting_lsn + lsn;
+			rc = bbr_remap(bbr_id, &remapped_lsn);
+			if (!rc) {
+				/* This sector is fine. */
+				continue;
+			}
+
+			/* Process all sectors in the request up to this one. */
+			if (lsn > 0) {
+				job.sector = starting_lsn;
+				job.count = lsn;
+				rc = io_sync(bbr_id, &pl, offset, &job, rw);
+				if (rc) {
+					/* If this I/O failed, then one of the
+					 * sectors in this request needs to be
+					 * relocated.
+					 */
+					rc = bbr_io_remap_error(bbr_id, rw,
+								starting_lsn,
+								lsn, pl.page,
+								offset);
+					if (rc) {
+						/* KMC: Return? Or continue to next bvec? */
+						return rc;
+					}
+				}
+				offset += (lsn << SECTOR_SHIFT);
+			}
+
+			/* Process the remapped sector. */
+			job.sector = remapped_lsn;
+			job.count = 1;
+			rc = io_sync(bbr_id, &pl, offset, &job, rw);
+			if (rc) {
+				/* BUGBUG - Need more processing if this caused
+				 * an error. If this I/O failed, then the
+				 * existing remap is now bad, and we need to
+				 * find a new remap. Can't use
+				 * bbr_io_remap_error(), because the existing
+				 * map entry needs to be changed, not added
+				 * again, and the original table entry also
+				 * needs to be changed.
+				 */
+				return rc;
+			}
+
+			starting_lsn += (lsn + 1);
+			count -= (lsn + 1);
+			lsn = -1;
+			offset += SECTOR_SIZE;
+		}
+
+		/* Check for any remaining sectors after the last split. This
+		 * could potentially be the whole request, but that should be a
+		 * rare case because requests should only be processed by the
+		 * thread if we know an error occurred or they contained one or
+		 * more remapped sectors.
+		 */
+		if (count) {
+			job.sector = starting_lsn;
+			job.count = count;
+			rc = io_sync(bbr_id, &pl, offset, &job, rw);
+			if (rc) {
+				/* If this I/O failed, then one of the sectors
+				 * in this request needs to be relocated.
+				 */
+				rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
+							count, pl.page, offset);
+				if (rc) {
+					/* KMC: Return? Or continue to next bvec? */
+					return rc;
+				}
+			}
+		}
+	}
+
+	return 0;
+}
+
+static void bbr_io_process_requests(struct bbr_private *bbr_id,
+				    struct bio *bio)
+{
+	struct bio *next;
+	int rc;
+
+	while (bio) {
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+
+		rc = bbr_io_process_request(bbr_id, bio);
+
+		bio_endio(bio, bio->bi_size, rc);
+
+		bio = next;
+	}
+}
+
+/**
+ * bbr_remap_handler
+ *
+ * This is the handler for the bbr work-queue.
+ *
+ * I/O requests should only be sent to this handler if we know that:
+ * a) the request contains at least one remapped sector.
+ * or
+ * b) the request caused an error on the normal I/O path.
+ *
+ * This function uses synchronous I/O, so sending a request to this
+ * thread that doesn't need special processing will cause severe
+ * performance degradation.
+ **/
+static void bbr_remap_handler(struct work_struct *work)
+{
+	struct bbr_private *bbr_id =
+		container_of(work, struct bbr_private, remap_work);
+	struct bio *bio;
+	unsigned long flags;
+
+	spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+	bio = bio_list_get(&bbr_id->remap_ios);
+	spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+	bbr_io_process_requests(bbr_id, bio);
+}
+
+/**
+ * bbr_endio
+ *
+ * This is the callback for normal write requests. Check for an error
+ * during the I/O, and send to the thread for processing if necessary.
+ **/
+static int bbr_endio(struct dm_target *ti, struct bio *bio,
+		     int error, union map_info *map_context)
+{
+	struct bbr_private *bbr_id = ti->private;
+	struct dm_bio_details *bbr_io = map_context->ptr;
+
+	if (error && bbr_io) {
+		unsigned long flags;
+		char b[32];
+
+		dm_bio_restore(bbr_io, bio);
+		map_context->ptr = NULL;
+
+		DMERR("device %s: I/O failure on sector %lu. "
+		      "Scheduling for retry.",
+		      format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+		      (unsigned long)bio->bi_sector);
+
+		spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+		bio_list_add(&bbr_id->remap_ios, bio);
+		spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+		queue_work(dm_bbr_wq, &bbr_id->remap_work);
+
+		error = 1;
+	}
+
+	if (bbr_io)
+		mempool_free(bbr_io, bbr_io_pool);
+
+	return error;
+}
+
+/**
+ * Construct a bbr mapping
+ **/
+static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct bbr_private *bbr_id;
+	unsigned long block_size;
+	char *end;
+	int rc = -EINVAL;
+
+	if (argc != 8) {
+		ti->error = "dm-bbr requires exactly 8 arguments: "
+			    "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
+		goto out1;
+	}
+
+	bbr_id = bbr_alloc_private();
+	if (!bbr_id) {
+		ti->error = "dm-bbr: Error allocating bbr private data.";
+		goto out1;
+	}
+
+	bbr_id->offset = simple_strtoull(argv[1], &end, 10);
+	bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
+	bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
+	bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
+	bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
+	bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
+	block_size = simple_strtoul(argv[7], &end, 10);
+	bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
+
+	bbr_id->vma_io_req.mem.type = DM_IO_VMA;
+	bbr_id->vma_io_req.client = dm_io_client_create(1);
+	if (IS_ERR(bbr_id->vma_io_req.client)) {
+		rc = PTR_ERR(bbr_id->vma_io_req.client);
+		DMWARN("couldn't allocate disk VMA io client");
+		goto out2;
+	}
+
+	bbr_id->page_io_req.mem.type = DM_IO_PAGE_LIST;
+	bbr_id->page_io_req.client = dm_io_client_create(1);
+	if (IS_ERR(bbr_id->page_io_req.client)) {
+		rc = PTR_ERR(bbr_id->page_io_req.client);
+		DMWARN("couldn't allocate pagelist io client");
+		goto out3;
+	}
+
+	bbr_id->bbr_table = vmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT);
+	if (!bbr_id->bbr_table) {
+		ti->error = "dm-bbr: Error allocating bbr table.";
+		goto out4;
+	}
+
+	if (dm_get_device(ti, argv[0], 0, ti->len,
+			  dm_table_get_mode(ti->table), &bbr_id->dev)) {
+		ti->error = "dm-bbr: Device lookup failed";
+		goto out4;
+	}
+
+	rc = bbr_setup(bbr_id);
+	if (rc) {
+		ti->error = "dm-bbr: Device setup failed";
+		goto out5;
+	}
+
+	ti->private = bbr_id;
+	return 0;
+
+out5:
+	dm_put_device(ti, bbr_id->dev);
+out4:
+	dm_io_client_destroy(bbr_id->page_io_req.client);
+out3:
+	dm_io_client_destroy(bbr_id->vma_io_req.client);
+out2:
+	bbr_free_private(bbr_id);
+out1:
+	return rc;
+}
+
+static void bbr_dtr(struct dm_target *ti)
+{
+	struct bbr_private *bbr_id = ti->private;
+
+	dm_put_device(ti, bbr_id->dev);
+	dm_io_client_destroy(bbr_id->page_io_req.client);
+	dm_io_client_destroy(bbr_id->vma_io_req.client);
+	bbr_free_private(bbr_id);
+}
+
+static int bbr_map(struct dm_target *ti, struct bio *bio,
+		   union map_info *map_context)
+{
+	struct bbr_private *bbr_id = ti->private;
+	struct dm_bio_details *bbr_io;
+	unsigned long flags;
+	int rc = 1;
+
+	bio->bi_sector += bbr_id->offset;
+
+	if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
+	    !bbr_remap_probe(bbr_id, bio->bi_sector, bio_sectors(bio))) {
+		/* No existing remaps or this request doesn't
+		 * contain any remapped sectors.
+		 */
+		bio->bi_bdev = bbr_id->dev->bdev;
+
+		bbr_io = mempool_alloc(bbr_io_pool, GFP_NOIO);
+		dm_bio_record(bbr_io, bio);
+		map_context->ptr = bbr_io;
+	} else {
+		/* This request has at least one remapped sector.
+		 * Give it to the work-queue for processing.
+		 */
+		map_context->ptr = NULL;
+		spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+		bio_list_add(&bbr_id->remap_ios, bio);
+		spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+		queue_work(dm_bbr_wq, &bbr_id->remap_work);
+		rc = 0;
+	}
+
+	return rc;
+}
+
+static int bbr_status(struct dm_target *ti, status_type_t type,
+		      char *result, unsigned int maxlen)
+{
+	struct bbr_private *bbr_id = ti->private;
+	char b[BDEVNAME_SIZE];
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
+			 format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+			 bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
+			 bbr_id->nr_sects_bbr_table,
+			 bbr_id->start_replacement_sect,
+			 bbr_id->nr_replacement_blks,
+			 bbr_id->blksize_in_sects << SECTOR_SHIFT);
+		break;
+	}
+	return 0;
+}
+
+static struct target_type bbr_target = {
+	.name	 = "bbr",
+	.version = {1, 0, 1},
+	.module	 = THIS_MODULE,
+	.ctr	 = bbr_ctr,
+	.dtr	 = bbr_dtr,
+	.map	 = bbr_map,
+	.end_io	 = bbr_endio,
+	.status	 = bbr_status,
+};
+
+int __init dm_bbr_init(void)
+{
+	int rc;
+
+	rc = dm_register_target(&bbr_target);
+	if (rc) {
+		DMERR("error registering target.");
+		goto err1;
+	}
+
+	bbr_remap_cache = kmem_cache_create("bbr-remap",
+					    sizeof(struct bbr_runtime_remap),
+					    0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!bbr_remap_cache) {
+		DMERR("error creating remap cache.");
+		rc = -ENOMEM;
+		goto err2;
+	}
+
+	bbr_io_cache = kmem_cache_create("bbr-io", sizeof(struct dm_bio_details),
+					 0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!bbr_io_cache) {
+		DMERR("error creating io cache.");
+		rc = -ENOMEM;
+		goto err3;
+	}
+
+	bbr_io_pool = mempool_create(256, mempool_alloc_slab,
+				     mempool_free_slab, bbr_io_cache);
+	if (!bbr_io_pool) {
+		DMERR("error creating io mempool.");
+		rc = -ENOMEM;
+		goto err4;
+	}
+
+	dm_bbr_wq = create_workqueue("dm-bbr");
+	if (!dm_bbr_wq) {
+		DMERR("error creating work-queue.");
+		rc = -ENOMEM;
+		goto err5;
+	}
+
+	return 0;
+
+err5:
+	mempool_destroy(bbr_io_pool);
+err4:
+	kmem_cache_destroy(bbr_io_cache);
+err3:
+	kmem_cache_destroy(bbr_remap_cache);
+err2:
+	dm_unregister_target(&bbr_target);
+err1:
+	return rc;
+}
+
+void __exit dm_bbr_exit(void)
+{
+	destroy_workqueue(dm_bbr_wq);
+	mempool_destroy(bbr_io_pool);
+	kmem_cache_destroy(bbr_io_cache);
+	kmem_cache_destroy(bbr_remap_cache);
+	dm_unregister_target(&bbr_target);
+}
+
+module_init(dm_bbr_init);
+module_exit(dm_bbr_exit);
+MODULE_LICENSE("GPL");
--- /dev/null
+++ b/drivers/md/dm-bbr.h
@@ -0,0 +1,130 @@
+/*
+ * (C) Copyright IBM Corp. 2002, 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * linux/drivers/md/dm-bbr.h
+ *
+ * Bad-block-relocation (BBR) target for device-mapper.
+ *
+ * The BBR target is designed to remap I/O write failures to another safe
+ * location on disk. Note that most disk drives have BBR built into them;
+ * this means that our software BBR will only be activated when all hardware
+ * BBR replacement sectors have been used.
+ */
+
+#include "dm-io.h"
+
+#define BBR_TABLE_SIGNATURE	0x42627254 /* BbrT */
+#define BBR_ENTRIES_PER_SECT	31
+#define INITIAL_CRC		0xFFFFFFFF
+#define CRC_POLYNOMIAL		0xEDB88320L
+
+/**
+ * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
+ * Use these in place of %Ld, %Lu, and %Lx.
+ **/
+#if BITS_PER_LONG > 32
+#define PFU64 "%llu"
+#else
+#define PFU64 "%Lu"
+#endif
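+
+/* Usage example (illustrative): DMERR("bad sector " PFU64, sect) expands to
+ * "%llu" on 64-bit builds and "%Lu" on 32-bit builds. */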
+
+/**
+ * struct bbr_table_entry
+ * @bad_sect:		LBA of bad location.
+ * @replacement_sect:	LBA of new location.
+ *
+ * Structure to describe one BBR remap.
+ **/
+struct bbr_table_entry {
+	u64 bad_sect;
+	u64 replacement_sect;
+};
+
+/**
+ * struct bbr_table
+ * @signature:		Signature on each BBR table sector.
+ * @crc:		CRC for this table sector.
+ * @sequence_number:	Used to resolve conflicts when primary and secondary
+ *			tables do not match.
+ * @in_use_cnt:		Number of in-use table entries.
+ * @entries:		Actual table of remaps.
+ *
+ * Structure to describe each sector of the metadata table. Each sector in this
+ * table can describe 31 remapped sectors.
+ **/
+struct bbr_table {
+	u32 signature;
+	u32 crc;
+	u32 sequence_number;
+	u32 in_use_cnt;
+	struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
+};
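+
+/*
+ * Size check (informational): the four u32 header fields take 16 bytes and
+ * the 31 entries take 31 * 16 = 496 bytes, so one struct bbr_table fills
+ * exactly one 512-byte sector.
+ */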
+
+/**
+ * struct bbr_runtime_remap
+ *
+ * Node in the binary tree used to keep track of remaps.
+ **/
+struct bbr_runtime_remap {
+	struct bbr_table_entry remap;
+	struct bbr_runtime_remap *left;
+	struct bbr_runtime_remap *right;
+};
+
+/**
+ * struct bbr_private
+ * @dev:			Info about underlying device.
+ * @bbr_table:			Copy of metadata table.
+ * @remap_root:			Binary tree containing all remaps.
+ * @remap_root_lock:		Lock for the binary tree.
+ * @vma_io_req:			dm-io request state for table (VMA) I/O.
+ * @page_io_req:		dm-io request state for data (page-list) I/O.
+ * @remap_work:			For adding work items to the work-queue.
+ * @remap_ios:			List of I/Os for the work-queue to handle.
+ * @remap_ios_lock:		Lock for the remap_ios list.
+ * @offset:			LBA of data area.
+ * @lba_table1:			LBA of primary BBR table.
+ * @lba_table2:			LBA of secondary BBR table.
+ * @nr_sects_bbr_table:		Size of each BBR table.
+ * @nr_replacement_blks:	Number of replacement blocks.
+ * @start_replacement_sect:	LBA of start of replacement blocks.
+ * @blksize_in_sects:		Size of each block.
+ * @in_use_replacement_blks:	Current number of remapped blocks.
+ *
+ * Private data for each BBR target.
+ **/
+struct bbr_private {
+	struct dm_dev *dev;
+	struct bbr_table *bbr_table;
+	struct bbr_runtime_remap *remap_root;
+	spinlock_t remap_root_lock;
+
+	struct dm_io_request vma_io_req;
+	struct dm_io_request page_io_req;
+
+	struct work_struct remap_work;
+	struct bio_list remap_ios;
+	spinlock_t remap_ios_lock;
+
+	u64 offset;
+	u64 lba_table1;
+	u64 lba_table2;
+	u64 nr_sects_bbr_table;
+	u64 start_replacement_sect;
+	u64 nr_replacement_blks;
+	u32 blksize_in_sects;
+	atomic_t in_use_replacement_blks;
+};
+
