/[linux-patches]/genpatches-2.6/tags/2.6.20-9/4105_dm-bbr.patch

Revision 927 - Thu Apr 26 15:33:43 2007 UTC by phreak
File size: 31898 byte(s)
2.6.20-9 release
BBR Target.

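The target takes a standard device-mapper table line; the eight target
arguments are the ones parsed by bbr_ctr() below. A hypothetical
invocation (all numbers made up for illustration) could look like:

    echo "0 2097152 bbr /dev/sdb1 0 2097153 2097185 32 2097217 63 4096" | \
        dmsetup create bbr0

i.e. device, data offset, the LBAs of the two table copies, the table size
in sectors, the start and count of the replacement blocks, and the block
size in bytes.
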
Index: linux-2.6.20-rc6/drivers/md/Kconfig
===================================================================
--- linux-2.6.20-rc6.orig/drivers/md/Kconfig
+++ linux-2.6.20-rc6/drivers/md/Kconfig
@@ -262,6 +262,17 @@ config DM_MULTIPATH_EMC
 	---help---
 	  Multipath support for EMC CX/AX series hardware.
 
+config BLK_DEV_DM_BBR
+	tristate "Bad Block Relocation Device Target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	  Support for devices with software-based bad-block-relocation.
+
+	  To compile this as a module, choose M here: the module will be
+	  called dm-bbr.
+
+	  If unsure, say N.
+
 endmenu
 
 endif
Index: linux-2.6.20-rc6/drivers/md/Makefile
===================================================================
--- linux-2.6.20-rc6.orig/drivers/md/Makefile
+++ linux-2.6.20-rc6/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
+obj-$(CONFIG_BLK_DEV_DM_BBR)	+= dm-bbr.o
 
 quiet_cmd_unroll = UNROLL $@
 cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
Index: linux-2.6.20-rc6/drivers/md/dm-bbr.c
===================================================================
--- /dev/null
+++ linux-2.6.20-rc6/drivers/md/dm-bbr.c
@@ -0,0 +1,998 @@
+/*
+ * (C) Copyright IBM Corp. 2002, 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * linux/drivers/md/dm-bbr.c
+ *
+ * Bad-block-relocation (BBR) target for device-mapper.
+ *
+ * The BBR target is designed to remap I/O write failures to another safe
+ * location on disk. Note that most disk drives have BBR built into them;
+ * this means that our software BBR will only be activated once all of the
+ * hardware BBR replacement sectors have been used.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/workqueue.h>
+#include <linux/vmalloc.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "dm-bio-record.h"
+#include "dm-bbr.h"
+#include "dm-io.h"
+
+#define DM_MSG_PREFIX "bbr"
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+
+static struct workqueue_struct *dm_bbr_wq = NULL;
+static void bbr_remap_handler(struct work_struct *work);
+static struct kmem_cache *bbr_remap_cache;
+static struct kmem_cache *bbr_io_cache;
+static mempool_t *bbr_io_pool;
+
+/**
+ * bbr_binary_tree_destroy
+ *
+ * Destroy the binary tree.
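+ * Non-recursive: walk down to a leaf, free it, clear the link from its
+ * parent, then restart from the root until the root itself is freed.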
+ **/
+static void bbr_binary_tree_destroy(struct bbr_runtime_remap *root)
+{
+	struct bbr_runtime_remap **link = NULL;
+	struct bbr_runtime_remap *node = root;
+
+	while (node) {
+		if (node->left) {
+			link = &node->left;
+			node = node->left;
+			continue;
+		}
+		if (node->right) {
+			link = &node->right;
+			node = node->right;
+			continue;
+		}
+
+		kmem_cache_free(bbr_remap_cache, node);
+		if (node == root) {
+			/* If root is deleted, we're done. */
+			break;
+		}
+
+		/* Back to root. */
+		node = root;
+		*link = NULL;
+	}
+}
+
+static void bbr_free_remap(struct bbr_private *bbr_id)
+{
+	spin_lock_irq(&bbr_id->remap_root_lock);
+	bbr_binary_tree_destroy(bbr_id->remap_root);
+	bbr_id->remap_root = NULL;
+	spin_unlock_irq(&bbr_id->remap_root_lock);
+}
+
+static struct bbr_private *bbr_alloc_private(void)
+{
+	struct bbr_private *bbr_id;
+
+	bbr_id = kzalloc(sizeof(*bbr_id), GFP_KERNEL);
+	if (bbr_id == NULL)
+		return NULL;
+
+	INIT_WORK(&bbr_id->remap_work, bbr_remap_handler);
+	spin_lock_init(&bbr_id->remap_root_lock);
+	spin_lock_init(&bbr_id->remap_ios_lock);
+	bbr_id->in_use_replacement_blks = (atomic_t) ATOMIC_INIT(0);
+
+	return bbr_id;
+}
+
+static void bbr_free_private(struct bbr_private *bbr_id)
+{
+	vfree(bbr_id->bbr_table);
+	bbr_free_remap(bbr_id);
+	kfree(bbr_id);
+}
+
+static u32 crc_table[256];
+static u32 crc_table_built = 0;
+
+static void build_crc_table(void)
+{
+	u32 i, j, crc;
+
+	for (i = 0; i <= 255; i++) {
+		crc = i;
+		for (j = 8; j > 0; j--) {
+			if (crc & 1)
+				crc = (crc >> 1) ^ CRC_POLYNOMIAL;
+			else
+				crc >>= 1;
+		}
+		crc_table[i] = crc;
+	}
+	crc_table_built = 1;
+}
+
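+/*
+ * The table above and the loop below implement the standard reflected
+ * CRC-32 (polynomial 0xEDB88320), one byte at a time, seeded with
+ * INITIAL_CRC and stored without a final XOR.
+ */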
+static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
+{
+	unsigned char *current_byte;
+	u32 temp1, temp2, i;
+
+	current_byte = (unsigned char *) buffer;
+	/* Make sure the crc table is available */
+	if (!crc_table_built)
+		build_crc_table();
+	/* Process each byte in the buffer. */
+	for (i = 0; i < buffersize; i++) {
+		temp1 = (crc >> 8) & 0x00FFFFFF;
+		temp2 = crc_table[(crc ^ (u32) *current_byte) &
+				  (u32) 0xff];
+		current_byte++;
+		crc = temp1 ^ temp2;
+	}
+	return crc;
+}
+
+/**
+ * le_bbr_table_sector_to_cpu
+ *
+ * Convert BBR metadata from on-disk (little-endian) format
+ * to the native CPU endianness.
+ **/
+static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
+{
+	int i;
+	p->signature = le32_to_cpup(&p->signature);
+	p->crc = le32_to_cpup(&p->crc);
+	p->sequence_number = le32_to_cpup(&p->sequence_number);
+	p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
+	for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
+		p->entries[i].bad_sect =
+			le64_to_cpup(&p->entries[i].bad_sect);
+		p->entries[i].replacement_sect =
+			le64_to_cpup(&p->entries[i].replacement_sect);
+	}
+}
+
+/**
+ * cpu_bbr_table_sector_to_le
+ *
+ * Convert BBR metadata from native CPU endianness to on-disk (LE) format.
+ **/
+static void cpu_bbr_table_sector_to_le(struct bbr_table *p,
+				       struct bbr_table *le)
+{
+	int i;
+	le->signature = cpu_to_le32p(&p->signature);
+	le->crc = cpu_to_le32p(&p->crc);
+	le->sequence_number = cpu_to_le32p(&p->sequence_number);
+	le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
+	for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
+		le->entries[i].bad_sect =
+			cpu_to_le64p(&p->entries[i].bad_sect);
+		le->entries[i].replacement_sect =
+			cpu_to_le64p(&p->entries[i].replacement_sect);
+	}
+}
+
+/**
+ * validate_bbr_table_sector
+ *
+ * Check the specified BBR table sector for a valid signature and CRC. If it's
+ * valid, endian-convert the table sector.
+ **/
+static int validate_bbr_table_sector(struct bbr_table *p)
+{
+	int org_crc, final_crc;
+
+	if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
+		DMERR("BBR table signature doesn't match!");
+		DMERR("Found 0x%x. Expecting 0x%x",
+		      le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
+		return -EINVAL;
+	}
+
+	if (!p->crc) {
+		DMERR("BBR table sector has no CRC!");
+		return -EINVAL;
+	}
+
+	org_crc = le32_to_cpup(&p->crc);
+	p->crc = 0;
+	final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
+	if (final_crc != org_crc) {
+		DMERR("CRC failed!");
+		DMERR("Found 0x%x. Expecting 0x%x",
+		      org_crc, final_crc);
+		return -EINVAL;
+	}
+
+	p->crc = cpu_to_le32p(&org_crc);
+	le_bbr_table_sector_to_cpu(p);
+
+	return 0;
+}
+
+/**
+ * bbr_binary_tree_insert
+ *
+ * Insert a node into the binary tree.
+ **/
+static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
+				   struct bbr_runtime_remap *newnode)
+{
+	struct bbr_runtime_remap **node = root;
+	while (node && *node) {
+		node = (newnode->remap.bad_sect > (*node)->remap.bad_sect) ?
+			&(*node)->right : &(*node)->left;
+	}
+
+	newnode->left = newnode->right = NULL;
+	*node = newnode;
+}
+
+/**
+ * bbr_binary_search
+ *
+ * Search for a node that contains bad_sect == lsn.
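+ * The tree is a plain, unbalanced BST, which is adequate for the small
+ * number of remaps expected in practice.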
+ **/
+static struct bbr_runtime_remap *bbr_binary_search(
+	struct bbr_runtime_remap *root,
+	u64 lsn)
+{
+	struct bbr_runtime_remap *node = root;
+	while (node) {
+		if (node->remap.bad_sect == lsn)
+			break;
+
+		node = (lsn > node->remap.bad_sect) ? node->right : node->left;
+	}
+	return node;
+}
+
+/**
+ * bbr_insert_remap_entry
+ *
+ * Create a new remap entry and add it to the binary tree for this node.
+ **/
+static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
+				  struct bbr_table_entry *new_bbr_entry)
+{
+	struct bbr_runtime_remap *newnode;
+
+	newnode = kmem_cache_alloc(bbr_remap_cache, GFP_NOIO);
+	if (!newnode) {
+		DMERR("Could not allocate from remap cache!");
+		return -ENOMEM;
+	}
+	newnode->remap.bad_sect = new_bbr_entry->bad_sect;
+	newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
+	spin_lock_irq(&bbr_id->remap_root_lock);
+	bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
+	spin_unlock_irq(&bbr_id->remap_root_lock);
+	return 0;
+}
+
+/**
+ * bbr_table_to_remap_list
+ *
+ * The on-disk BBR table is sorted by the replacement sector LBA. To
+ * improve run-time performance, the in-memory remap list must be sorted
+ * by the bad sector LBA. This function is called at discovery time to
+ * initialize the remap list. It assumes that at least one copy of the
+ * metadata is valid.
+ **/
+static u32 bbr_table_to_remap_list(struct bbr_private *bbr_id)
+{
+	u32 in_use_blks = 0;
+	int i, j;
+	struct bbr_table *p;
+
+	for (i = 0, p = bbr_id->bbr_table;
+	     i < bbr_id->nr_sects_bbr_table;
+	     i++, p++) {
+		if (!p->in_use_cnt)
+			break;
+
+		in_use_blks += p->in_use_cnt;
+		for (j = 0; j < p->in_use_cnt; j++)
+			bbr_insert_remap_entry(bbr_id, &p->entries[j]);
+	}
+	if (in_use_blks) {
+		char b[32];
+		DMWARN("There are %u BBR entries for device %s",
+		       in_use_blks, format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+	}
+
+	return in_use_blks;
+}
+
+/**
+ * bbr_search_remap_entry
+ *
+ * Search remap entry for the specified sector. If found, return a pointer to
+ * the table entry. Otherwise, return NULL.
+ **/
+static struct bbr_table_entry *bbr_search_remap_entry(
+	struct bbr_private *bbr_id,
+	u64 lsn)
+{
+	struct bbr_runtime_remap *p;
+
+	spin_lock_irq(&bbr_id->remap_root_lock);
+	p = bbr_binary_search(bbr_id->remap_root, lsn);
+	spin_unlock_irq(&bbr_id->remap_root_lock);
+	return (p) ? &p->remap : NULL;
+}
+
+/**
+ * bbr_remap
+ *
+ * If *lsn is in the remap table, return TRUE and modify *lsn;
+ * else return FALSE.
+ **/
+static int bbr_remap(struct bbr_private *bbr_id,
+		     u64 *lsn)
+{
+	struct bbr_table_entry *e;
+
+	if (atomic_read(&bbr_id->in_use_replacement_blks)) {
+		e = bbr_search_remap_entry(bbr_id, *lsn);
+		if (e) {
+			*lsn = e->replacement_sect;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/**
+ * bbr_remap_probe
+ *
+ * If any of the sectors in the range [lsn, lsn+nr_sects) is in the remap
+ * table, return TRUE; else return FALSE.
+ **/
+static int bbr_remap_probe(struct bbr_private *bbr_id,
+			   u64 lsn, u64 nr_sects)
+{
+	u64 tmp, cnt;
+
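+	/* The remap table works at block granularity, so the range is
+	 * probed one block (blksize_in_sects) at a time. */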
+	if (atomic_read(&bbr_id->in_use_replacement_blks)) {
+		for (cnt = 0, tmp = lsn;
+		     cnt < nr_sects;
+		     cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
+			if (bbr_remap(bbr_id, &tmp))
+				return 1;
+		}
+	}
+	return 0;
+}
+
+/**
+ * bbr_setup
+ *
+ * Read the remap tables from disk and set up the initial remap tree.
+ **/
+static int bbr_setup(struct bbr_private *bbr_id)
+{
+	struct bbr_table *table = bbr_id->bbr_table;
+	struct io_region job;
+	unsigned long error;
+	int i, rc = 0;
+
+	job.bdev = bbr_id->dev->bdev;
+	job.count = 1;
+
+	/* Read and verify each BBR table sector individually. */
+	for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
+		job.sector = bbr_id->lba_table1 + i;
+		rc = dm_io_sync_vm(1, &job, READ, table, &error);
+		if (rc && bbr_id->lba_table2) {
+			job.sector = bbr_id->lba_table2 + i;
+			rc = dm_io_sync_vm(1, &job, READ, table, &error);
+		}
+		if (rc)
+			goto out;
+
+		rc = validate_bbr_table_sector(table);
+		if (rc)
+			goto out;
+	}
+	atomic_set(&bbr_id->in_use_replacement_blks,
+		   bbr_table_to_remap_list(bbr_id));
+
+out:
+	if (rc)
+		DMERR("error during device setup: %d", rc);
+	return rc;
+}
+
+/**
+ * bbr_io_remap_error
+ * @bbr_id:		Private data for the BBR node.
+ * @rw:			READ or WRITE.
+ * @starting_lsn:	Starting sector of request to remap.
+ * @count:		Number of sectors in the request.
+ * @page:		Page containing the data for the request.
+ * @offset:		Byte-offset of the data within the page.
+ *
+ * For the requested range, try to write each sector individually. For each
+ * sector that fails, find the next available remap location and write the
+ * data to that new location. Then update the table and write both copies
+ * of the table to disk. Finally, update the in-memory mapping and do any
+ * other necessary bookkeeping.
+ **/
+static int bbr_io_remap_error(struct bbr_private *bbr_id,
+			      int rw,
+			      u64 starting_lsn,
+			      u64 count,
+			      struct page *page,
+			      unsigned int offset)
+{
+	struct bbr_table *bbr_table;
+	struct io_region job;
+	struct page_list pl;
+	unsigned long table_sector_index;
+	unsigned long table_sector_offset;
+	unsigned long index;
+	unsigned long error;
+	u64 lsn, new_lsn;
+	char b[32];
+	int rc;
+
+	job.bdev = bbr_id->dev->bdev;
+	job.count = 1;
+	pl.page = page;
+	pl.next = NULL;
+
+	/* For each sector in the request. */
+	for (lsn = 0; lsn < count; lsn++, offset += SECTOR_SIZE) {
+		job.sector = starting_lsn + lsn;
+		rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+		while (rc) {
+			/* Find the next available relocation sector. */
+			new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
+			if (new_lsn >= bbr_id->nr_replacement_blks) {
+				/* No more replacement sectors available. */
+				return -EIO;
+			}
+			new_lsn += bbr_id->start_replacement_sect;
+
+			/* Write the data to its new location. */
+			DMWARN("device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
+			       format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+			       starting_lsn + lsn, new_lsn);
+			job.sector = new_lsn;
+			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+			if (rc) {
+				/* This replacement sector is bad.
+				 * Try the next one.
+				 */
+				DMERR("device %s: replacement sector "PFU64" is bad. Skipping.",
+				      format_dev_t(b, bbr_id->dev->bdev->bd_dev), new_lsn);
+				atomic_inc(&bbr_id->in_use_replacement_blks);
+				continue;
+			}
+
+			/* Add this new entry to the on-disk table. */
+			table_sector_index = new_lsn -
+					     bbr_id->start_replacement_sect;
+			table_sector_offset = table_sector_index /
+					      BBR_ENTRIES_PER_SECT;
+			index = table_sector_index % BBR_ENTRIES_PER_SECT;
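+			/* e.g. table_sector_index 40 maps to table sector 1,
+			 * entry 9 (40 / 31 = 1, 40 % 31 = 9). */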
+
+			bbr_table = &bbr_id->bbr_table[table_sector_offset];
+			bbr_table->entries[index].bad_sect = starting_lsn + lsn;
+			bbr_table->entries[index].replacement_sect = new_lsn;
+			bbr_table->in_use_cnt++;
+			bbr_table->sequence_number++;
+			bbr_table->crc = 0;
+			bbr_table->crc = calculate_crc(INITIAL_CRC,
+						       bbr_table,
+						       sizeof(struct bbr_table));
+
+			/* Write the table to disk. */
+			cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
+			if (bbr_id->lba_table1) {
+				job.sector = bbr_id->lba_table1 + table_sector_offset;
+				rc = dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
+			}
+			if (bbr_id->lba_table2) {
+				job.sector = bbr_id->lba_table2 + table_sector_offset;
+				rc |= dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
+			}
+			le_bbr_table_sector_to_cpu(bbr_table);
+
+			if (rc) {
+				/* Error writing one of the tables to disk. */
+				DMERR("device %s: error updating BBR tables on disk.",
+				      format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+				return rc;
+			}
+
+			/* Insert a new entry in the remapping binary-tree. */
+			rc = bbr_insert_remap_entry(bbr_id,
+						    &bbr_table->entries[index]);
+			if (rc) {
+				DMERR("device %s: error adding new entry to remap tree.",
+				      format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+				return rc;
+			}
+
+			atomic_inc(&bbr_id->in_use_replacement_blks);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * bbr_io_process_request
+ *
+ * For each sector in this request, check whether the sector has already
+ * been remapped. If so, process all previous sectors in the request,
+ * followed by the remapped sector. Then reset the starting lsn and
+ * count, and keep going with the rest of the request as if it were
+ * a whole new request. If any of the synchronous I/Os returns an error,
+ * call the remapper to relocate the bad sector(s).
+ *
+ * 2.5 note: when switching over to bios for the I/O path, we have made
+ * the assumption that the I/O request described by the bio is one
+ * virtually contiguous piece of memory (even though the bio vector
+ * describes it using a series of physical page addresses).
+ **/
+static int bbr_io_process_request(struct bbr_private *bbr_id,
+				  struct bio *bio)
+{
+	struct io_region job;
+	u64 starting_lsn = bio->bi_sector;
+	u64 count, lsn, remapped_lsn;
+	struct page_list pl;
+	unsigned int offset;
+	unsigned long error;
+	int i, rw = bio_data_dir(bio);
+	int rc = 0;
+
+	job.bdev = bbr_id->dev->bdev;
+	pl.next = NULL;
+
+	/* Each bio can contain multiple vectors, each with a different page.
+	 * Treat each vector as a separate request.
+	 */
+	/* KMC: Is this the right way to walk the bvec list? */
+	for (i = 0;
+	     i < bio->bi_vcnt;
+	     i++, bio->bi_idx++, starting_lsn += count) {
+
+		/* Bvec info: number of sectors, page,
+		 * and byte-offset within page.
+		 */
+		count = bio_iovec(bio)->bv_len >> SECTOR_SHIFT;
+		pl.page = bio_iovec(bio)->bv_page;
+		offset = bio_iovec(bio)->bv_offset;
+
+		/* For each sector in this bvec, check if the sector has
+		 * already been remapped. If so, process all previous sectors
+		 * in this request, followed by the remapped sector. Then reset
+		 * the starting lsn and count and keep going with the rest of
+		 * the request as if it were a whole new request.
+		 */
+		for (lsn = 0; lsn < count; lsn++) {
+			remapped_lsn = starting_lsn + lsn;
+			rc = bbr_remap(bbr_id, &remapped_lsn);
+			if (!rc) {
+				/* This sector is fine. */
+				continue;
+			}
+
+			/* Process all sectors in the request up to this one. */
+			if (lsn > 0) {
+				job.sector = starting_lsn;
+				job.count = lsn;
+				rc = dm_io_sync(1, &job, rw, &pl,
+						offset, &error);
+				if (rc) {
+					/* If this I/O failed, then one of the
+					 * sectors in this request needs to be
+					 * relocated.
+					 */
+					rc = bbr_io_remap_error(bbr_id, rw,
+								starting_lsn,
+								lsn, pl.page,
+								offset);
+					if (rc) {
+						/* KMC: Return? Or continue to next bvec? */
+						return rc;
+					}
+				}
+				offset += (lsn << SECTOR_SHIFT);
+			}
+
+			/* Process the remapped sector. */
+			job.sector = remapped_lsn;
+			job.count = 1;
+			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+			if (rc) {
+				/* BUGBUG - Need more processing if this caused
+				 * an error. If this I/O failed, then the
+				 * existing remap is now bad, and we need to
+				 * find a new remap. Can't use
+				 * bbr_io_remap_error(), because the existing
+				 * map entry needs to be changed, not added
+				 * again, and the original table entry also
+				 * needs to be changed.
+				 */
+				return rc;
+			}
+
+			starting_lsn += (lsn + 1);
+			count -= (lsn + 1);
+			lsn = -1;
+			offset += SECTOR_SIZE;
+		}
+
+		/* Check for any remaining sectors after the last split. This
+		 * could potentially be the whole request, but that should be a
+		 * rare case because requests should only be processed by the
+		 * thread if we know an error occurred or they contained one or
+		 * more remapped sectors.
+		 */
+		if (count) {
+			job.sector = starting_lsn;
+			job.count = count;
+			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+			if (rc) {
+				/* If this I/O failed, then one of the sectors
+				 * in this request needs to be relocated.
+				 */
+				rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
+							count, pl.page, offset);
+				if (rc) {
+					/* KMC: Return? Or continue to next bvec? */
+					return rc;
+				}
+			}
+		}
+	}
+
+	return 0;
+}
+
+static void bbr_io_process_requests(struct bbr_private *bbr_id,
+				    struct bio *bio)
+{
+	struct bio *next;
+	int rc;
+
+	while (bio) {
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+
+		rc = bbr_io_process_request(bbr_id, bio);
+
+		bio_endio(bio, bio->bi_size, rc);
+
+		bio = next;
+	}
+}
+
+/**
+ * bbr_remap_handler
+ *
+ * This is the handler for the bbr work-queue.
+ *
+ * I/O requests should only be sent to this handler if we know that:
+ * a) the request contains at least one remapped sector,
+ *    or
+ * b) the request caused an error on the normal I/O path.
+ *
+ * This function uses synchronous I/O, so sending a request to this
+ * thread that doesn't need special processing will cause severe
+ * performance degradation.
+ **/
+static void bbr_remap_handler(struct work_struct *work)
+{
+	struct bbr_private *bbr_id =
+		container_of(work, struct bbr_private, remap_work);
+	struct bio *bio;
+	unsigned long flags;
+
+	spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+	bio = bio_list_get(&bbr_id->remap_ios);
+	spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+	bbr_io_process_requests(bbr_id, bio);
+}
+
+/**
+ * bbr_endio
+ *
+ * This is the callback for normal write requests. Check for an error
+ * during the I/O, and send to the thread for processing if necessary.
+ **/
+static int bbr_endio(struct dm_target *ti, struct bio *bio,
+		     int error, union map_info *map_context)
+{
+	struct bbr_private *bbr_id = ti->private;
+	struct dm_bio_details *bbr_io = map_context->ptr;
+
+	if (error && bbr_io) {
+		unsigned long flags;
+		char b[32];
+
+		dm_bio_restore(bbr_io, bio);
+		map_context->ptr = NULL;
+
+		DMERR("device %s: I/O failure on sector %lu. "
+		      "Scheduling for retry.",
+		      format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+		      (unsigned long)bio->bi_sector);
+
+		spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+		bio_list_add(&bbr_id->remap_ios, bio);
+		spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+		queue_work(dm_bbr_wq, &bbr_id->remap_work);
+
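+		/* A positive return tells the dm core that this bio has
+		 * been taken over; the work-queue path above will complete
+		 * it once the retry finishes. */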
+		error = 1;
+	}
+
+	if (bbr_io)
+		mempool_free(bbr_io, bbr_io_pool);
+
+	return error;
+}
+
+/**
+ * Construct a bbr mapping
+ **/
+static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct bbr_private *bbr_id;
+	unsigned long block_size;
+	char *end;
+	int rc = -EINVAL;
+
+	if (argc != 8) {
+		ti->error = "dm-bbr requires exactly 8 arguments: "
+			    "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
+		goto out1;
+	}
+
+	bbr_id = bbr_alloc_private();
+	if (!bbr_id) {
+		ti->error = "dm-bbr: Error allocating bbr private data.";
+		goto out1;
+	}
+
+	bbr_id->offset = simple_strtoull(argv[1], &end, 10);
+	bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
+	bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
+	bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
+	bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
+	bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
+	block_size = simple_strtoul(argv[7], &end, 10);
+	bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
+
+	bbr_id->bbr_table = vmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT);
+	if (!bbr_id->bbr_table) {
+		ti->error = "dm-bbr: Error allocating bbr table.";
+		goto out2;
+	}
+
+	if (dm_get_device(ti, argv[0], 0, ti->len,
+			  dm_table_get_mode(ti->table), &bbr_id->dev)) {
+		ti->error = "dm-bbr: Device lookup failed";
+		goto out2;
+	}
+
+	rc = bbr_setup(bbr_id);
+	if (rc) {
+		ti->error = "dm-bbr: Device setup failed";
+		goto out3;
+	}
+
+	ti->private = bbr_id;
+	return 0;
+
+out3:
+	dm_put_device(ti, bbr_id->dev);
+out2:
+	bbr_free_private(bbr_id);
+out1:
+	return rc;
+}
+
+static void bbr_dtr(struct dm_target *ti)
+{
+	struct bbr_private *bbr_id = ti->private;
+
+	dm_put_device(ti, bbr_id->dev);
+	bbr_free_private(bbr_id);
+}
+
+static int bbr_map(struct dm_target *ti, struct bio *bio,
+		   union map_info *map_context)
+{
+	struct bbr_private *bbr_id = ti->private;
+	struct dm_bio_details *bbr_io;
+	unsigned long flags;
+	int rc = 1;
+
+	bio->bi_sector += bbr_id->offset;
+
+	if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
+	    !bbr_remap_probe(bbr_id, bio->bi_sector, bio_sectors(bio))) {
+		/* No existing remaps or this request doesn't
+		 * contain any remapped sectors.
+		 */
+		bio->bi_bdev = bbr_id->dev->bdev;
+
+		bbr_io = mempool_alloc(bbr_io_pool, GFP_NOIO);
+		dm_bio_record(bbr_io, bio);
+		map_context->ptr = bbr_io;
+	} else {
+		/* This request has at least one remapped sector.
+		 * Give it to the work-queue for processing.
+		 */
+		map_context->ptr = NULL;
+		spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+		bio_list_add(&bbr_id->remap_ios, bio);
+		spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+		queue_work(dm_bbr_wq, &bbr_id->remap_work);
+		rc = 0;
+	}
+
+	return rc;
+}
+
+static int bbr_status(struct dm_target *ti, status_type_t type,
+		      char *result, unsigned int maxlen)
+{
+	struct bbr_private *bbr_id = ti->private;
+	char b[BDEVNAME_SIZE];
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
+			 format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+			 bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
+			 bbr_id->nr_sects_bbr_table,
+			 bbr_id->start_replacement_sect,
+			 bbr_id->nr_replacement_blks,
+			 bbr_id->blksize_in_sects << SECTOR_SHIFT);
+		break;
+	}
+	return 0;
+}
+
+static struct target_type bbr_target = {
+	.name	= "bbr",
+	.version = {1, 0, 1},
+	.module	= THIS_MODULE,
+	.ctr	= bbr_ctr,
+	.dtr	= bbr_dtr,
+	.map	= bbr_map,
+	.end_io	= bbr_endio,
+	.status	= bbr_status,
+};
+
+int __init dm_bbr_init(void)
+{
+	int rc;
+
+	rc = dm_register_target(&bbr_target);
+	if (rc) {
+		DMERR("error registering target.");
+		goto err1;
+	}
+
+	bbr_remap_cache = kmem_cache_create("bbr-remap",
+					    sizeof(struct bbr_runtime_remap),
+					    0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!bbr_remap_cache) {
+		DMERR("error creating remap cache.");
+		rc = -ENOMEM;
+		goto err2;
+	}
+
+	bbr_io_cache = kmem_cache_create("bbr-io", sizeof(struct dm_bio_details),
+					 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!bbr_io_cache) {
+		DMERR("error creating io cache.");
+		rc = -ENOMEM;
+		goto err3;
+	}
+
+	bbr_io_pool = mempool_create(256, mempool_alloc_slab,
+				     mempool_free_slab, bbr_io_cache);
+	if (!bbr_io_pool) {
+		DMERR("error creating io mempool.");
+		rc = -ENOMEM;
+		goto err4;
+	}
+
+	dm_bbr_wq = create_workqueue("dm-bbr");
+	if (!dm_bbr_wq) {
+		DMERR("error creating work-queue.");
+		rc = -ENOMEM;
+		goto err5;
+	}
+
+	rc = dm_io_get(1);
+	if (rc) {
+		DMERR("error initializing I/O service.");
+		goto err6;
+	}
+
+	return 0;
+
+err6:
+	destroy_workqueue(dm_bbr_wq);
+err5:
+	mempool_destroy(bbr_io_pool);
+err4:
+	kmem_cache_destroy(bbr_io_cache);
+err3:
+	kmem_cache_destroy(bbr_remap_cache);
+err2:
+	dm_unregister_target(&bbr_target);
+err1:
+	return rc;
+}
+
+void __exit dm_bbr_exit(void)
+{
+	dm_io_put(1);
+	destroy_workqueue(dm_bbr_wq);
+	mempool_destroy(bbr_io_pool);
+	kmem_cache_destroy(bbr_io_cache);
+	kmem_cache_destroy(bbr_remap_cache);
+	dm_unregister_target(&bbr_target);
+}
+
+module_init(dm_bbr_init);
+module_exit(dm_bbr_exit);
+MODULE_LICENSE("GPL");
Index: linux-2.6.20-rc6/drivers/md/dm-bbr.h
===================================================================
--- /dev/null
+++ linux-2.6.20-rc6/drivers/md/dm-bbr.h
@@ -0,0 +1,128 @@
+/*
+ * (C) Copyright IBM Corp. 2002, 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * linux/drivers/md/dm-bbr.h
+ *
+ * Bad-block-relocation (BBR) target for device-mapper.
+ *
+ * The BBR target is designed to remap I/O write failures to another safe
+ * location on disk. Note that most disk drives have BBR built into them;
+ * this means that our software BBR will only be activated once all of the
+ * hardware BBR replacement sectors have been used.
+ */
+
+#define BBR_TABLE_SIGNATURE	0x42627254 /* BbrT */
+#define BBR_ENTRIES_PER_SECT	31
+#define INITIAL_CRC		0xFFFFFFFF
+#define CRC_POLYNOMIAL		0xEDB88320L
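+
+/* A 16-byte header plus 31 16-byte entries makes struct bbr_table below
+ * exactly 512 bytes: one entire sector per table sector. */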
+
+/**
+ * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
+ * Use these in place of %Ld, %Lu, and %Lx.
+ **/
+#if BITS_PER_LONG > 32
+#define PFU64 "%lu"
+#else
+#define PFU64 "%Lu"
+#endif
+
+/**
+ * struct bbr_table_entry
+ * @bad_sect:		LBA of bad location.
+ * @replacement_sect:	LBA of new location.
+ *
+ * Structure to describe one BBR remap.
+ **/
+struct bbr_table_entry {
+	u64 bad_sect;
+	u64 replacement_sect;
+};
+
+/**
+ * struct bbr_table
+ * @signature:		Signature on each BBR table sector.
+ * @crc:		CRC for this table sector.
+ * @sequence_number:	Used to resolve conflicts when primary and secondary
+ *			tables do not match.
+ * @in_use_cnt:		Number of in-use table entries.
+ * @entries:		Actual table of remaps.
+ *
+ * Structure to describe each sector of the metadata table. Each sector in this
+ * table can describe 31 remapped sectors.
+ **/
+struct bbr_table {
+	u32 signature;
+	u32 crc;
+	u32 sequence_number;
+	u32 in_use_cnt;
+	struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
+};
+
+/**
+ * struct bbr_runtime_remap
+ *
+ * Node in the binary tree used to keep track of remaps.
+ **/
+struct bbr_runtime_remap {
+	struct bbr_table_entry remap;
+	struct bbr_runtime_remap *left;
+	struct bbr_runtime_remap *right;
+};
+
+/**
+ * struct bbr_private
+ * @dev:			Info about underlying device.
+ * @bbr_table:			Copy of metadata table.
+ * @remap_root:			Binary tree containing all remaps.
+ * @remap_root_lock:		Lock for the binary tree.
+ * @remap_work:			For adding work items to the work-queue.
+ * @remap_ios:			List of I/Os for the work-queue to handle.
+ * @remap_ios_lock:		Lock for the remap_ios list.
+ * @offset:			LBA of data area.
+ * @lba_table1:			LBA of primary BBR table.
+ * @lba_table2:			LBA of secondary BBR table.
+ * @nr_sects_bbr_table:		Size of each BBR table.
+ * @nr_replacement_blks:	Number of replacement blocks.
+ * @start_replacement_sect:	LBA of start of replacement blocks.
+ * @blksize_in_sects:		Size of each block.
+ * @in_use_replacement_blks:	Current number of remapped blocks.
+ *
+ * Private data for each BBR target.
+ **/
+struct bbr_private {
+	struct dm_dev *dev;
+	struct bbr_table *bbr_table;
+	struct bbr_runtime_remap *remap_root;
+	spinlock_t remap_root_lock;
+
+	struct work_struct remap_work;
+	struct bio_list remap_ios;
+	spinlock_t remap_ios_lock;
+
+	u64 offset;
+	u64 lba_table1;
+	u64 lba_table2;
+	u64 nr_sects_bbr_table;
+	u64 start_replacement_sect;
+	u64 nr_replacement_blks;
+	u32 blksize_in_sects;
+	atomic_t in_use_replacement_blks;
+};
+
