
Contents of /genpatches-2.6/trunk/2.6.22-pre/4105_dm-bbr.patch



Revision 963 - Sat May 19 09:58:57 2007 UTC by phreak
File size: 31515 byte(s)
Starting branch for 2.6.22.
BBR Target.

---
 drivers/md/Kconfig  |   11
 drivers/md/Makefile |    1
 drivers/md/dm-bbr.c |  982 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-bbr.h |  125 ++++++
 4 files changed, 1119 insertions(+)

--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -271,6 +271,17 @@ config DM_DELAY

 	  If unsure, say N.

+config BLK_DEV_DM_BBR
+	tristate "Bad Block Relocation Device Target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	  Support for devices with software-based bad-block-relocation.
+
+	  To compile this as a module, choose M here: the module will be
+	  called dm-bbr.
+
+	  If unsure, say N.
+
 endmenu

 endif
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
+obj-$(CONFIG_BLK_DEV_DM_BBR)	+= dm-bbr.o

 quiet_cmd_unroll = UNROLL $@
 cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
--- /dev/null
+++ b/drivers/md/dm-bbr.c
@@ -0,0 +1,982 @@
+/*
+ * (C) Copyright IBM Corp. 2002, 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * linux/drivers/md/dm-bbr.c
+ *
+ * Bad-block-relocation (BBR) target for device-mapper.
+ *
+ * The BBR target is designed to remap I/O write failures to another safe
+ * location on disk. Note that most disk drives have BBR built into them;
+ * this means that our software BBR will only be activated when all hardware
+ * BBR replacement sectors have been used.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/workqueue.h>
+#include <linux/vmalloc.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "dm-bio-record.h"
+#include "dm-bbr.h"
+#include "dm-io.h"
+
+#define DM_MSG_PREFIX "bbr"
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+
+static struct workqueue_struct *dm_bbr_wq = NULL;
+static void bbr_remap_handler(struct work_struct *work);
+static struct kmem_cache *bbr_remap_cache;
+static struct kmem_cache *bbr_io_cache;
+static mempool_t *bbr_io_pool;
+
+/**
+ * bbr_binary_tree_destroy
+ *
+ * Destroy the binary tree.
+ **/
+static void bbr_binary_tree_destroy(struct bbr_runtime_remap *root)
+{
+	struct bbr_runtime_remap **link = NULL;
+	struct bbr_runtime_remap *node = root;
+
+	while (node) {
+		if (node->left) {
+			link = &node->left;
+			node = node->left;
+			continue;
+		}
+		if (node->right) {
+			link = &node->right;
+			node = node->right;
+			continue;
+		}
+
+		kmem_cache_free(bbr_remap_cache, node);
+		if (node == root) {
+			/* If root is deleted, we're done. */
+			break;
+		}
+
+		/* Back to root. */
+		node = root;
+		*link = NULL;
+	}
+}
+
+static void bbr_free_remap(struct bbr_private *bbr_id)
+{
+	spin_lock_irq(&bbr_id->remap_root_lock);
+	bbr_binary_tree_destroy(bbr_id->remap_root);
+	bbr_id->remap_root = NULL;
+	spin_unlock_irq(&bbr_id->remap_root_lock);
+}
+
+static struct bbr_private *bbr_alloc_private(void)
+{
+	struct bbr_private *bbr_id;
+
+	bbr_id = kzalloc(sizeof(*bbr_id), GFP_KERNEL);
+	if (bbr_id == NULL)
+		return NULL;
+
+	INIT_WORK(&bbr_id->remap_work, bbr_remap_handler);
+	spin_lock_init(&bbr_id->remap_root_lock);
+	spin_lock_init(&bbr_id->remap_ios_lock);
+	bbr_id->in_use_replacement_blks = (atomic_t) ATOMIC_INIT(0);
+
+	return bbr_id;
+}
+
+static void bbr_free_private(struct bbr_private *bbr_id)
+{
+	vfree(bbr_id->bbr_table);
+	bbr_free_remap(bbr_id);
+	kfree(bbr_id);
+}
+
+static u32 crc_table[256];
+static u32 crc_table_built = 0;
+
+static void build_crc_table(void)
+{
+	u32 i, j, crc;
+
+	for (i = 0; i <= 255; i++) {
+		crc = i;
+		for (j = 8; j > 0; j--) {
+			if (crc & 1)
+				crc = (crc >> 1) ^ CRC_POLYNOMIAL;
+			else
+				crc >>= 1;
+		}
+		crc_table[i] = crc;
+	}
+	crc_table_built = 1;
+}
+
+static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
+{
+	unsigned char *current_byte;
+	u32 temp1, temp2, i;
+
+	current_byte = (unsigned char *) buffer;
+	/* Make sure the crc table is available */
+	if (!crc_table_built)
+		build_crc_table();
+	/* Process each byte in the buffer. */
+	for (i = 0; i < buffersize; i++) {
+		temp1 = (crc >> 8) & 0x00FFFFFF;
+		temp2 = crc_table[(crc ^ (u32) * current_byte) &
+				  (u32) 0xff];
+		current_byte++;
+		crc = temp1 ^ temp2;
+	}
+	return crc;
+}
+
+/**
+ * le_bbr_table_sector_to_cpu
+ *
+ * Convert bbr meta data from on-disk (LE) format
+ * to the native cpu endian format.
+ **/
+static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
+{
+	int i;
+	p->signature = le32_to_cpup(&p->signature);
+	p->crc = le32_to_cpup(&p->crc);
+	p->sequence_number = le32_to_cpup(&p->sequence_number);
+	p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
+	for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
+		p->entries[i].bad_sect =
+			le64_to_cpup(&p->entries[i].bad_sect);
+		p->entries[i].replacement_sect =
+			le64_to_cpup(&p->entries[i].replacement_sect);
+	}
+}
+
+/**
+ * cpu_bbr_table_sector_to_le
+ *
+ * Convert bbr meta data from cpu endian format to on-disk (LE) format
+ **/
+static void cpu_bbr_table_sector_to_le(struct bbr_table *p,
+				       struct bbr_table *le)
+{
+	int i;
+	le->signature = cpu_to_le32p(&p->signature);
+	le->crc = cpu_to_le32p(&p->crc);
+	le->sequence_number = cpu_to_le32p(&p->sequence_number);
+	le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
+	for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
+		le->entries[i].bad_sect =
+			cpu_to_le64p(&p->entries[i].bad_sect);
+		le->entries[i].replacement_sect =
+			cpu_to_le64p(&p->entries[i].replacement_sect);
+	}
+}
+
+/**
+ * validate_bbr_table_sector
+ *
+ * Check the specified BBR table sector for a valid signature and CRC. If it's
+ * valid, endian-convert the table sector.
+ **/
+static int validate_bbr_table_sector(struct bbr_table *p)
+{
+	int org_crc, final_crc;
+
+	if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
+		DMERR("BBR table signature doesn't match!");
+		DMERR("Found 0x%x. Expecting 0x%x",
+		      le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
+		return -EINVAL;
+	}
+
+	if (!p->crc) {
+		DMERR("BBR table sector has no CRC!");
+		return -EINVAL;
+	}
+
+	org_crc = le32_to_cpup(&p->crc);
+	p->crc = 0;
+	final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
+	if (final_crc != org_crc) {
+		DMERR("CRC failed!");
+		DMERR("Found 0x%x. Expecting 0x%x",
+		      org_crc, final_crc);
+		return -EINVAL;
+	}
+
+	p->crc = cpu_to_le32p(&org_crc);
+	le_bbr_table_sector_to_cpu(p);
+
+	return 0;
+}
+
+/**
+ * bbr_binary_tree_insert
+ *
+ * Insert a node into the binary tree.
+ **/
+static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
+				   struct bbr_runtime_remap *newnode)
+{
+	struct bbr_runtime_remap **node = root;
+	while (node && *node) {
+		node = (newnode->remap.bad_sect > (*node)->remap.bad_sect) ?
+			&(*node)->right : &(*node)->left;
+	}
+
+	newnode->left = newnode->right = NULL;
+	*node = newnode;
+}
+
+/**
+ * bbr_binary_search
+ *
+ * Search for a node that contains bad_sect == lsn.
+ **/
+static struct bbr_runtime_remap *bbr_binary_search(
+	struct bbr_runtime_remap *root,
+	u64 lsn)
+{
+	struct bbr_runtime_remap *node = root;
+	while (node) {
+		if (node->remap.bad_sect == lsn)
+			break;
+
+		node = (lsn > node->remap.bad_sect) ? node->right : node->left;
+	}
+	return node;
+}
+
+/**
+ * bbr_insert_remap_entry
+ *
+ * Create a new remap entry and add it to the binary tree for this node.
+ **/
+static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
+				  struct bbr_table_entry *new_bbr_entry)
+{
+	struct bbr_runtime_remap *newnode;
+
+	newnode = kmem_cache_alloc(bbr_remap_cache, GFP_NOIO);
+	if (!newnode) {
+		DMERR("Could not allocate from remap cache!");
+		return -ENOMEM;
+	}
+	newnode->remap.bad_sect = new_bbr_entry->bad_sect;
+	newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
+	spin_lock_irq(&bbr_id->remap_root_lock);
+	bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
+	spin_unlock_irq(&bbr_id->remap_root_lock);
+	return 0;
+}
+
+/**
+ * bbr_table_to_remap_list
+ *
+ * The on-disk bbr table is sorted by the replacement sector LBA. In order to
+ * improve run time performance, the in memory remap list must be sorted by
+ * the bad sector LBA. This function is called at discovery time to initialize
+ * the remap list. This function assumes that at least one copy of meta data
+ * is valid.
+ **/
+static u32 bbr_table_to_remap_list(struct bbr_private *bbr_id)
+{
+	u32 in_use_blks = 0;
+	int i, j;
+	struct bbr_table *p;
+
+	for (i = 0, p = bbr_id->bbr_table;
+	     i < bbr_id->nr_sects_bbr_table;
+	     i++, p++) {
+		if (!p->in_use_cnt)
+			break;
+
+		in_use_blks += p->in_use_cnt;
+		for (j = 0; j < p->in_use_cnt; j++)
+			bbr_insert_remap_entry(bbr_id, &p->entries[j]);
+	}
+	if (in_use_blks) {
+		char b[32];
+		DMWARN("There are %u BBR entries for device %s",
+		       in_use_blks, format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+	}
+
+	return in_use_blks;
+}
+
+/**
+ * bbr_search_remap_entry
+ *
+ * Search remap entry for the specified sector. If found, return a pointer to
+ * the table entry. Otherwise, return NULL.
+ **/
+static struct bbr_table_entry *bbr_search_remap_entry(
+	struct bbr_private *bbr_id,
+	u64 lsn)
+{
+	struct bbr_runtime_remap *p;
+
+	spin_lock_irq(&bbr_id->remap_root_lock);
+	p = bbr_binary_search(bbr_id->remap_root, lsn);
+	spin_unlock_irq(&bbr_id->remap_root_lock);
+	return (p) ? &p->remap : NULL;
+}
+
+/**
+ * bbr_remap
+ *
+ * If *lsn is in the remap table, modify *lsn and return TRUE;
+ * else return FALSE.
+ **/
+static int bbr_remap(struct bbr_private *bbr_id,
+		     u64 *lsn)
+{
+	struct bbr_table_entry *e;
+
+	if (atomic_read(&bbr_id->in_use_replacement_blks)) {
+		e = bbr_search_remap_entry(bbr_id, *lsn);
+		if (e) {
+			*lsn = e->replacement_sect;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/**
+ * bbr_remap_probe
+ *
+ * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
+ * table, return TRUE; else return FALSE.
+ **/
+static int bbr_remap_probe(struct bbr_private *bbr_id,
+			   u64 lsn, u64 nr_sects)
+{
+	u64 tmp, cnt;
+
+	if (atomic_read(&bbr_id->in_use_replacement_blks)) {
+		for (cnt = 0, tmp = lsn;
+		     cnt < nr_sects;
+		     cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
+			if (bbr_remap(bbr_id, &tmp))
+				return 1;
+		}
+	}
+	return 0;
+}
+
+/**
+ * bbr_setup
+ *
+ * Read the remap tables from disk and set up the initial remap tree.
+ **/
+static int bbr_setup(struct bbr_private *bbr_id)
+{
+	struct bbr_table *table = bbr_id->bbr_table;
+	struct io_region job;
+	unsigned long error;
+	int i, rc = 0;
+
+	job.bdev = bbr_id->dev->bdev;
+	job.count = 1;
+
+	/* Read and verify each BBR table sector individually. */
+	for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
+		job.sector = bbr_id->lba_table1 + i;
+		rc = dm_io_sync_vm(1, &job, READ, table, &error);
+		if (rc && bbr_id->lba_table2) {
+			job.sector = bbr_id->lba_table2 + i;
+			rc = dm_io_sync_vm(1, &job, READ, table, &error);
+		}
+		if (rc)
+			goto out;
+
+		rc = validate_bbr_table_sector(table);
+		if (rc)
+			goto out;
+	}
+	atomic_set(&bbr_id->in_use_replacement_blks,
+		   bbr_table_to_remap_list(bbr_id));
+
+out:
+	if (rc)
+		DMERR("error during device setup: %d", rc);
+	return rc;
+}
+
+/**
+ * bbr_io_remap_error
+ * @bbr_id:		Private data for the BBR node.
+ * @rw:			READ or WRITE.
+ * @starting_lsn:	Starting sector of request to remap.
+ * @count:		Number of sectors in the request.
+ * @page:		Page containing the data for the request.
+ * @offset:		Byte-offset of the data within the page.
+ *
+ * For the requested range, try to write each sector individually. For each
+ * sector that fails, find the next available remap location and write the
+ * data to that new location. Then update the table and write both copies
+ * of the table to disk. Finally, update the in-memory mapping and do any
+ * other necessary bookkeeping.
+ **/
+static int bbr_io_remap_error(struct bbr_private *bbr_id,
+			      int rw,
+			      u64 starting_lsn,
+			      u64 count,
+			      struct page *page,
+			      unsigned int offset)
+{
+	struct bbr_table *bbr_table;
+	struct io_region job;
+	struct page_list pl;
+	unsigned long table_sector_index;
+	unsigned long table_sector_offset;
+	unsigned long index;
+	unsigned long error;
+	u64 lsn, new_lsn;
+	char b[32];
+	int rc;
+
+	job.bdev = bbr_id->dev->bdev;
+	job.count = 1;
+	pl.page = page;
+	pl.next = NULL;
+
+	/* For each sector in the request. */
+	for (lsn = 0; lsn < count; lsn++, offset += SECTOR_SIZE) {
+		job.sector = starting_lsn + lsn;
+		rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+		while (rc) {
+			/* Find the next available relocation sector. */
+			new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
+			if (new_lsn >= bbr_id->nr_replacement_blks) {
+				/* No more replacement sectors available. */
+				return -EIO;
+			}
+			new_lsn += bbr_id->start_replacement_sect;
+
+			/* Write the data to its new location. */
+			DMWARN("device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
+			       format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+			       starting_lsn + lsn, new_lsn);
+			job.sector = new_lsn;
+			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+			if (rc) {
+				/* This replacement sector is bad.
+				 * Try the next one.
+				 */
+				DMERR("device %s: replacement sector "PFU64" is bad. Skipping.",
+				      format_dev_t(b, bbr_id->dev->bdev->bd_dev), new_lsn);
+				atomic_inc(&bbr_id->in_use_replacement_blks);
+				continue;
+			}
+
+			/* Add this new entry to the on-disk table. */
+			table_sector_index = new_lsn -
+					     bbr_id->start_replacement_sect;
+			table_sector_offset = table_sector_index /
+					      BBR_ENTRIES_PER_SECT;
+			index = table_sector_index % BBR_ENTRIES_PER_SECT;
+
+			bbr_table = &bbr_id->bbr_table[table_sector_offset];
+			bbr_table->entries[index].bad_sect = starting_lsn + lsn;
+			bbr_table->entries[index].replacement_sect = new_lsn;
+			bbr_table->in_use_cnt++;
+			bbr_table->sequence_number++;
+			bbr_table->crc = 0;
+			bbr_table->crc = calculate_crc(INITIAL_CRC,
+						       bbr_table,
+						       sizeof(struct bbr_table));
+
+			/* Write the table to disk. */
+			cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
+			if (bbr_id->lba_table1) {
+				job.sector = bbr_id->lba_table1 + table_sector_offset;
+				rc = dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
+			}
+			if (bbr_id->lba_table2) {
+				job.sector = bbr_id->lba_table2 + table_sector_offset;
+				rc |= dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
+			}
+			le_bbr_table_sector_to_cpu(bbr_table);
+
+			if (rc) {
+				/* Error writing one of the tables to disk. */
+				DMERR("device %s: error updating BBR tables on disk.",
+				      format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+				return rc;
+			}
+
+			/* Insert a new entry in the remapping binary-tree. */
+			rc = bbr_insert_remap_entry(bbr_id,
+						    &bbr_table->entries[index]);
+			if (rc) {
+				DMERR("device %s: error adding new entry to remap tree.",
+				      format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+				return rc;
+			}
+
+			atomic_inc(&bbr_id->in_use_replacement_blks);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * bbr_io_process_request
+ *
+ * For each sector in this request, check if the sector has already
+ * been remapped. If so, process all previous sectors in the request,
+ * followed by the remapped sector. Then reset the starting lsn and
+ * count, and keep going with the rest of the request as if it were
+ * a whole new request. If any of the sync_io's return an error,
+ * call the remapper to relocate the bad sector(s).
+ *
+ * 2.5 Note: When switching over to bio's for the I/O path, we have made
+ * the assumption that the I/O request described by the bio is one
+ * virtually contiguous piece of memory (even though the bio vector
+ * describes it using a series of physical page addresses).
+ **/
+static int bbr_io_process_request(struct bbr_private *bbr_id,
+				  struct bio *bio)
+{
+	struct io_region job;
+	u64 starting_lsn = bio->bi_sector;
+	u64 count, lsn, remapped_lsn;
+	struct page_list pl;
+	unsigned int offset;
+	unsigned long error;
+	int i, rw = bio_data_dir(bio);
+	int rc = 0;
+
+	job.bdev = bbr_id->dev->bdev;
+	pl.next = NULL;
+
+	/* Each bio can contain multiple vectors, each with a different page.
+	 * Treat each vector as a separate request.
+	 */
+	/* KMC: Is this the right way to walk the bvec list? */
+	for (i = 0;
+	     i < bio->bi_vcnt;
+	     i++, bio->bi_idx++, starting_lsn += count) {
+
+		/* Bvec info: number of sectors, page,
+		 * and byte-offset within page.
+		 */
+		count = bio_iovec(bio)->bv_len >> SECTOR_SHIFT;
+		pl.page = bio_iovec(bio)->bv_page;
+		offset = bio_iovec(bio)->bv_offset;
+
+		/* For each sector in this bvec, check if the sector has
+		 * already been remapped. If so, process all previous sectors
+		 * in this request, followed by the remapped sector. Then reset
+		 * the starting lsn and count and keep going with the rest of
+		 * the request as if it were a whole new request.
+		 */
+		for (lsn = 0; lsn < count; lsn++) {
+			remapped_lsn = starting_lsn + lsn;
+			rc = bbr_remap(bbr_id, &remapped_lsn);
+			if (!rc) {
+				/* This sector is fine. */
+				continue;
+			}
+
+			/* Process all sectors in the request up to this one. */
+			if (lsn > 0) {
+				job.sector = starting_lsn;
+				job.count = lsn;
+				rc = dm_io_sync(1, &job, rw, &pl,
+						offset, &error);
+				if (rc) {
+					/* If this I/O failed, then one of the
+					 * sectors in this request needs to be
+					 * relocated.
+					 */
+					rc = bbr_io_remap_error(bbr_id, rw,
+								starting_lsn,
+								lsn, pl.page,
+								offset);
+					if (rc) {
+						/* KMC: Return? Or continue to next bvec? */
+						return rc;
+					}
+				}
+				offset += (lsn << SECTOR_SHIFT);
+			}
+
+			/* Process the remapped sector. */
+			job.sector = remapped_lsn;
+			job.count = 1;
+			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+			if (rc) {
+				/* BUGBUG - Need more processing if this caused
+				 * an error. If this I/O failed, then the
+				 * existing remap is now bad, and we need to
+				 * find a new remap. Can't use
+				 * bbr_io_remap_error(), because the existing
+				 * map entry needs to be changed, not added
+				 * again, and the original table entry also
+				 * needs to be changed.
+				 */
+				return rc;
+			}
+
+			starting_lsn += (lsn + 1);
+			count -= (lsn + 1);
+			lsn = -1;
+			offset += SECTOR_SIZE;
+		}
+
+		/* Check for any remaining sectors after the last split. This
+		 * could potentially be the whole request, but that should be a
+		 * rare case because requests should only be processed by the
+		 * thread if we know an error occurred or they contained one or
+		 * more remapped sectors.
+		 */
+		if (count) {
+			job.sector = starting_lsn;
+			job.count = count;
+			rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+			if (rc) {
+				/* If this I/O failed, then one of the sectors
+				 * in this request needs to be relocated.
+				 */
+				rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
+							count, pl.page, offset);
+				if (rc) {
+					/* KMC: Return? Or continue to next bvec? */
+					return rc;
+				}
+			}
+		}
+	}
+
+	return 0;
+}
+
+static void bbr_io_process_requests(struct bbr_private *bbr_id,
+				    struct bio *bio)
+{
+	struct bio *next;
+	int rc;
+
+	while (bio) {
+		next = bio->bi_next;
+		bio->bi_next = NULL;
+
+		rc = bbr_io_process_request(bbr_id, bio);
+
+		bio_endio(bio, bio->bi_size, rc);
+
+		bio = next;
+	}
+}
+
+/**
+ * bbr_remap_handler
+ *
+ * This is the handler for the bbr work-queue.
+ *
+ * I/O requests should only be sent to this handler if we know that:
+ * a) the request contains at least one remapped sector.
+ *    or
+ * b) the request caused an error on the normal I/O path.
+ *
+ * This function uses synchronous I/O, so sending a request to this
+ * thread that doesn't need special processing will cause severe
+ * performance degradation.
+ **/
+static void bbr_remap_handler(struct work_struct *work)
+{
+	struct bbr_private *bbr_id =
+		container_of(work, struct bbr_private, remap_work);
+	struct bio *bio;
+	unsigned long flags;
+
+	spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+	bio = bio_list_get(&bbr_id->remap_ios);
+	spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+	bbr_io_process_requests(bbr_id, bio);
+}
+
+/**
+ * bbr_endio
+ *
+ * This is the callback for normal write requests. Check for an error
+ * during the I/O, and send to the thread for processing if necessary.
+ **/
+static int bbr_endio(struct dm_target *ti, struct bio *bio,
+		     int error, union map_info *map_context)
+{
+	struct bbr_private *bbr_id = ti->private;
+	struct dm_bio_details *bbr_io = map_context->ptr;
+
+	if (error && bbr_io) {
+		unsigned long flags;
+		char b[32];
+
+		dm_bio_restore(bbr_io, bio);
+		map_context->ptr = NULL;
+
+		DMERR("device %s: I/O failure on sector %lu. "
+		      "Scheduling for retry.",
+		      format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+		      (unsigned long)bio->bi_sector);
+
+		spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+		bio_list_add(&bbr_id->remap_ios, bio);
+		spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+		queue_work(dm_bbr_wq, &bbr_id->remap_work);
+
+		error = 1;
+	}
+
+	if (bbr_io)
+		mempool_free(bbr_io, bbr_io_pool);
+
+	return error;
+}
+
+/**
+ * Construct a bbr mapping
+ **/
+static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct bbr_private *bbr_id;
+	unsigned long block_size;
+	char *end;
+	int rc = -EINVAL;
+
+	if (argc != 8) {
+		ti->error = "dm-bbr requires exactly 8 arguments: "
+			    "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
+		goto out1;
+	}
+
+	bbr_id = bbr_alloc_private();
+	if (!bbr_id) {
+		ti->error = "dm-bbr: Error allocating bbr private data.";
+		goto out1;
+	}
+
+	bbr_id->offset = simple_strtoull(argv[1], &end, 10);
+	bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
+	bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
+	bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
+	bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
+	bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
+	block_size = simple_strtoul(argv[7], &end, 10);
+	bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
+
+	bbr_id->bbr_table = vmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT);
+	if (!bbr_id->bbr_table) {
+		ti->error = "dm-bbr: Error allocating bbr table.";
+		goto out2;
+	}
+
+	if (dm_get_device(ti, argv[0], 0, ti->len,
+			  dm_table_get_mode(ti->table), &bbr_id->dev)) {
+		ti->error = "dm-bbr: Device lookup failed";
+		goto out2;
+	}
+
+	rc = bbr_setup(bbr_id);
+	if (rc) {
+		ti->error = "dm-bbr: Device setup failed";
+		goto out3;
+	}
+
+	ti->private = bbr_id;
+	return 0;
+
+out3:
+	dm_put_device(ti, bbr_id->dev);
+out2:
+	bbr_free_private(bbr_id);
+out1:
+	return rc;
+}
+
+static void bbr_dtr(struct dm_target *ti)
+{
+	struct bbr_private *bbr_id = ti->private;
+
+	dm_put_device(ti, bbr_id->dev);
+	bbr_free_private(bbr_id);
+}
+
+static int bbr_map(struct dm_target *ti, struct bio *bio,
+		   union map_info *map_context)
+{
+	struct bbr_private *bbr_id = ti->private;
+	struct dm_bio_details *bbr_io;
+	unsigned long flags;
+	int rc = 1;
+
+	bio->bi_sector += bbr_id->offset;
+
+	if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
+	    !bbr_remap_probe(bbr_id, bio->bi_sector, bio_sectors(bio))) {
+		/* No existing remaps or this request doesn't
+		 * contain any remapped sectors.
+		 */
+		bio->bi_bdev = bbr_id->dev->bdev;
+
+		bbr_io = mempool_alloc(bbr_io_pool, GFP_NOIO);
+		dm_bio_record(bbr_io, bio);
+		map_context->ptr = bbr_io;
+	} else {
+		/* This request has at least one remapped sector.
+		 * Give it to the work-queue for processing.
+		 */
+		map_context->ptr = NULL;
+		spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+		bio_list_add(&bbr_id->remap_ios, bio);
+		spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+		queue_work(dm_bbr_wq, &bbr_id->remap_work);
+		rc = 0;
+	}
+
+	return rc;
+}
+
+static int bbr_status(struct dm_target *ti, status_type_t type,
+		      char *result, unsigned int maxlen)
+{
+	struct bbr_private *bbr_id = ti->private;
+	char b[BDEVNAME_SIZE];
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
+			 format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+			 bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
+			 bbr_id->nr_sects_bbr_table,
+			 bbr_id->start_replacement_sect,
+			 bbr_id->nr_replacement_blks,
+			 bbr_id->blksize_in_sects << SECTOR_SHIFT);
+		break;
+	}
+	return 0;
+}
+
+static struct target_type bbr_target = {
+	.name	= "bbr",
+	.version= {1, 0, 1},
+	.module	= THIS_MODULE,
+	.ctr	= bbr_ctr,
+	.dtr	= bbr_dtr,
+	.map	= bbr_map,
+	.end_io	= bbr_endio,
+	.status	= bbr_status,
+};
+
+int __init dm_bbr_init(void)
+{
+	int rc;
+
+	rc = dm_register_target(&bbr_target);
+	if (rc) {
+		DMERR("error registering target.");
+		goto err1;
+	}
+
+	bbr_remap_cache = kmem_cache_create("bbr-remap",
+					    sizeof(struct bbr_runtime_remap),
+					    0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!bbr_remap_cache) {
+		DMERR("error creating remap cache.");
+		rc = -ENOMEM;
+		goto err2;
+	}
+
+	bbr_io_cache = kmem_cache_create("bbr-io", sizeof(struct dm_bio_details),
+					 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!bbr_io_cache) {
+		DMERR("error creating io cache.");
+		rc = -ENOMEM;
+		goto err3;
+	}
+
+	bbr_io_pool = mempool_create(256, mempool_alloc_slab,
+				     mempool_free_slab, bbr_io_cache);
+	if (!bbr_io_pool) {
+		DMERR("error creating io mempool.");
+		rc = -ENOMEM;
+		goto err4;
+	}
+
+	dm_bbr_wq = create_workqueue("dm-bbr");
+	if (!dm_bbr_wq) {
+		DMERR("error creating work-queue.");
+		rc = -ENOMEM;
+		goto err5;
+	}
+
+	rc = dm_io_get(1);
+	if (rc) {
+		DMERR("error initializing I/O service.");
+		goto err6;
+	}
+
+	return 0;
+
+err6:
+	destroy_workqueue(dm_bbr_wq);
+err5:
+	mempool_destroy(bbr_io_pool);
+err4:
+	kmem_cache_destroy(bbr_io_cache);
+err3:
+	kmem_cache_destroy(bbr_remap_cache);
+err2:
+	dm_unregister_target(&bbr_target);
+err1:
+	return rc;
+}
+
+void __exit dm_bbr_exit(void)
+{
+	dm_io_put(1);
+	destroy_workqueue(dm_bbr_wq);
+	mempool_destroy(bbr_io_pool);
+	kmem_cache_destroy(bbr_io_cache);
+	kmem_cache_destroy(bbr_remap_cache);
+	dm_unregister_target(&bbr_target);
+}
+
+module_init(dm_bbr_init);
+module_exit(dm_bbr_exit);
+MODULE_LICENSE("GPL");
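
The table-sector checksum used above is a standard table-driven, reflected CRC-32 (polynomial 0xEDB88320), computed over the entire 512-byte struct bbr_table with the crc field zeroed first (see validate_bbr_table_sector() and the update path in bbr_io_remap_error()). Note that calculate_crc() starts from INITIAL_CRC (0xFFFFFFFF) but never applies the customary final XOR, so the stored value is the raw shift-register contents rather than a zlib-style crc32(). A minimal user-space sketch of the same computation, for illustration only (it is not part of the patch, and the helper names here are invented):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define INITIAL_CRC	0xFFFFFFFFu
#define CRC_POLYNOMIAL	0xEDB88320u

/* Same table construction as build_crc_table() in dm-bbr.c. */
static uint32_t crc_table[256];

static void build_table(void)
{
	for (uint32_t i = 0; i < 256; i++) {
		uint32_t crc = i;
		for (int j = 0; j < 8; j++)
			crc = (crc & 1) ? (crc >> 1) ^ CRC_POLYNOMIAL : crc >> 1;
		crc_table[i] = crc;
	}
}

/* Same per-byte update as calculate_crc(): shift right, xor table entry. */
static uint32_t crc_update(uint32_t crc, const void *buf, size_t len)
{
	const unsigned char *p = buf;

	while (len--)
		crc = (crc >> 8) ^ crc_table[(crc ^ *p++) & 0xff];
	return crc;
}

int main(void)
{
	/* A fake 512-byte table sector with the crc field already zeroed,
	 * mirroring how validate_bbr_table_sector() recomputes the CRC
	 * before comparing it against the stored value. */
	unsigned char sector[512];

	memset(sector, 0, sizeof(sector));
	build_table();
	printf("crc = 0x%08x\n",
	       crc_update(INITIAL_CRC, sector, sizeof(sector)));
	return 0;
}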
--- /dev/null
+++ b/drivers/md/dm-bbr.h
@@ -0,0 +1,125 @@
+/*
+ * (C) Copyright IBM Corp. 2002, 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * linux/drivers/md/dm-bbr.h
+ *
+ * Bad-block-relocation (BBR) target for device-mapper.
+ *
+ * The BBR target is designed to remap I/O write failures to another safe
+ * location on disk. Note that most disk drives have BBR built into them;
+ * this means that our software BBR will only be activated when all hardware
+ * BBR replacement sectors have been used.
+ */
+
+#define BBR_TABLE_SIGNATURE	0x42627254 /* BbrT */
+#define BBR_ENTRIES_PER_SECT	31
+#define INITIAL_CRC		0xFFFFFFFF
+#define CRC_POLYNOMIAL		0xEDB88320L
+
+/**
+ * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
+ * Use these in place of %Ld, %Lu, and %Lx.
+ **/
+#if BITS_PER_LONG > 32
+#define PFU64 "%lu"
+#else
+#define PFU64 "%Lu"
+#endif
+
+/**
+ * struct bbr_table_entry
+ * @bad_sect:		LBA of bad location.
+ * @replacement_sect:	LBA of new location.
+ *
+ * Structure to describe one BBR remap.
+ **/
+struct bbr_table_entry {
+	u64 bad_sect;
+	u64 replacement_sect;
+};
+
+/**
+ * struct bbr_table
+ * @signature:		Signature on each BBR table sector.
+ * @crc:		CRC for this table sector.
+ * @sequence_number:	Used to resolve conflicts when primary and secondary
+ *			tables do not match.
+ * @in_use_cnt:		Number of in-use table entries.
+ * @entries:		Actual table of remaps.
+ *
+ * Structure to describe each sector of the metadata table. Each sector in this
+ * table can describe 31 remapped sectors.
+ **/
+struct bbr_table {
+	u32 signature;
+	u32 crc;
+	u32 sequence_number;
+	u32 in_use_cnt;
+	struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
+};
+
+/**
+ * struct bbr_runtime_remap
+ *
+ * Node in the binary tree used to keep track of remaps.
+ **/
+struct bbr_runtime_remap {
+	struct bbr_table_entry remap;
+	struct bbr_runtime_remap *left;
+	struct bbr_runtime_remap *right;
+};
+
+/**
+ * struct bbr_private
+ * @dev:			Info about underlying device.
+ * @bbr_table:			Copy of metadata table.
+ * @remap_root:			Binary tree containing all remaps.
+ * @remap_root_lock:		Lock for the binary tree.
+ * @remap_work:			For adding work items to the work-queue.
+ * @remap_ios:			List of I/Os for the work-queue to handle.
+ * @remap_ios_lock:		Lock for the remap_ios list.
+ * @offset:			LBA of data area.
+ * @lba_table1:			LBA of primary BBR table.
+ * @lba_table2:			LBA of secondary BBR table.
+ * @nr_sects_bbr_table:		Size of each BBR table.
+ * @nr_replacement_blks:	Number of replacement blocks.
+ * @start_replacement_sect:	LBA of start of replacement blocks.
+ * @blksize_in_sects:		Size of each block.
+ * @in_use_replacement_blks:	Current number of remapped blocks.
+ *
+ * Private data for each BBR target.
+ **/
+struct bbr_private {
+	struct dm_dev *dev;
+	struct bbr_table *bbr_table;
+	struct bbr_runtime_remap *remap_root;
+	spinlock_t remap_root_lock;
+
+	struct work_struct remap_work;
+	struct bio_list remap_ios;
+	spinlock_t remap_ios_lock;
+
+	u64 offset;
+	u64 lba_table1;
+	u64 lba_table2;
+	u64 nr_sects_bbr_table;
+	u64 start_replacement_sect;
+	u64 nr_replacement_blks;
+	u32 blksize_in_sects;
+	atomic_t in_use_replacement_blks;
+};
+
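
For orientation: bbr_ctr() above takes exactly eight arguments, in this order: device, data offset, primary table LBA, secondary table LBA, table size in sectors, first replacement sector, number of replacement blocks, and block size in bytes. A dmsetup table line would follow the same order, i.e. something like "0 <len> bbr <dev> <offset> <table1_lsn> <table2_lsn> <table_size> <start_replacement> <nr_replacement_blks> <block_size>" (historically these values came from EVMS-created metadata). Each struct bbr_table is exactly one 512-byte sector: a 16-byte header plus 31 entries of 16 bytes each, which is why BBR_ENTRIES_PER_SECT is 31. bbr_io_remap_error() locates the table slot for a newly consumed replacement block with simple div/mod arithmetic on that constant. A small stand-alone sketch of that bookkeeping, using made-up geometry values (not taken from the patch):

#include <stdint.h>
#include <stdio.h>

#define BBR_ENTRIES_PER_SECT	31	/* 512-byte sector = 16-byte header
					 * + 31 * 16-byte entries */

int main(void)
{
	/* Illustrative geometry: replacement area starts at sector 2048. */
	uint64_t start_replacement_sect = 2048;

	/* Suppose the 40th replacement block (index 39) was just consumed,
	 * so its LBA is start_replacement_sect + 39 (cf. new_lsn above). */
	uint64_t new_lsn = start_replacement_sect + 39;

	/* Same arithmetic as bbr_io_remap_error(): which table sector holds
	 * the new entry, and which slot within that sector. */
	uint64_t table_sector_index  = new_lsn - start_replacement_sect;
	uint64_t table_sector_offset = table_sector_index / BBR_ENTRIES_PER_SECT;
	uint64_t index               = table_sector_index % BBR_ENTRIES_PER_SECT;

	/* Prints: replacement #39 -> table sector 1, entry 8 */
	printf("replacement #%llu -> table sector %llu, entry %llu\n",
	       (unsigned long long)table_sector_index,
	       (unsigned long long)table_sector_offset,
	       (unsigned long long)index);
	return 0;
}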
