Contents of /genpatches-2.6/trunk/2.6.18-pre/4105_dm-bbr.patch

Revision 534
Fri Jul 7 07:35:20 2006 UTC by phreak
File size: 32162 byte(s)
Fixing 4105_dm-bbr.patch and 4300_squashfs-3.0.patch for 2.6.18-rc1
Index: linux-git/drivers/md/Kconfig
===================================================================
--- linux-git.orig/drivers/md/Kconfig
+++ linux-git/drivers/md/Kconfig
@@ -249,5 +249,16 @@ config DM_MULTIPATH_EMC
 ---help---
 Multipath support for EMC CX/AX series hardware.

+config BLK_DEV_DM_BBR
+ tristate "Bad Block Relocation Device Target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
+ Support for devices with software-based bad-block-relocation.
+
+ To compile this as a module, choose M here: the module will be
+ called dm-bbr.
+
+ If unsure, say N.
+
 endmenu

Index: linux-git/drivers/md/Makefile
===================================================================
--- linux-git.orig/drivers/md/Makefile
+++ linux-git/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc
 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
 obj-$(CONFIG_DM_ZERO) += dm-zero.o
+obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o

 quiet_cmd_unroll = UNROLL $@
 cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
Index: linux-git/drivers/md/dm-bbr.c
===================================================================
--- /dev/null
+++ linux-git/drivers/md/dm-bbr.c
@@ -0,0 +1,1004 @@
+/*
+ * (C) Copyright IBM Corp. 2002, 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * linux/drivers/md/dm-bbr.c
+ *
+ * Bad-block-relocation (BBR) target for device-mapper.
+ *
+ * The BBR target is designed to remap I/O write failures to another safe
+ * location on disk. Note that most disk drives have BBR built into them;
+ * this means that our software BBR will only be activated when all
+ * hardware BBR replacement sectors have been used.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/workqueue.h>
+#include <linux/vmalloc.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "dm-bio-record.h"
+#include "dm-bbr.h"
+#include "dm-io.h"
+
+#define DM_MSG_PREFIX "bbr"
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+
+static struct workqueue_struct *dm_bbr_wq = NULL;
+static void bbr_remap_handler(void *data);
+static kmem_cache_t *bbr_remap_cache;
+static kmem_cache_t *bbr_io_cache;
+static mempool_t *bbr_io_pool;
+
+/**
+ * bbr_binary_tree_destroy
+ *
+ * Destroy the binary tree.
+ **/
+static void bbr_binary_tree_destroy(struct bbr_runtime_remap *root)
+{
+ struct bbr_runtime_remap **link = NULL;
+ struct bbr_runtime_remap *node = root;
+
+ while (node) {
+ if (node->left) {
+ link = &(node->left);
+ node = node->left;
+ continue;
+ }
+ if (node->right) {
+ link = &(node->right);
+ node = node->right;
+ continue;
+ }
+
+ kmem_cache_free(bbr_remap_cache, node);
+ if (node == root) {
+ /* If root is deleted, we're done. */
+ break;
+ }
+
+ /* Back to root. */
+ node = root;
+ *link = NULL;
+ }
+}
+
+static void bbr_free_remap(struct bbr_private *bbr_id)
+{
+ spin_lock_irq(&bbr_id->remap_root_lock);
+ bbr_binary_tree_destroy(bbr_id->remap_root);
+ bbr_id->remap_root = NULL;
+ spin_unlock_irq(&bbr_id->remap_root_lock);
+}
+
+static struct bbr_private *bbr_alloc_private(void)
+{
+ struct bbr_private *bbr_id;
+
+ bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
+ if (bbr_id) {
+ memset(bbr_id, 0, sizeof(*bbr_id));
+ INIT_WORK(&bbr_id->remap_work, bbr_remap_handler, bbr_id);
+ bbr_id->remap_root_lock = SPIN_LOCK_UNLOCKED;
+ bbr_id->remap_ios_lock = SPIN_LOCK_UNLOCKED;
+ bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
+ }
+
+ return bbr_id;
+}
+
+static void bbr_free_private(struct bbr_private *bbr_id)
+{
+ if (bbr_id->bbr_table) {
+ vfree(bbr_id->bbr_table);
+ }
+ bbr_free_remap(bbr_id);
+ kfree(bbr_id);
+}
+
+static u32 crc_table[256];
+static u32 crc_table_built = 0;
+
+static void build_crc_table(void)
+{
+ u32 i, j, crc;
+
+ for (i = 0; i <= 255; i++) {
+ crc = i;
+ for (j = 8; j > 0; j--) {
+ if (crc & 1)
+ crc = (crc >> 1) ^ CRC_POLYNOMIAL;
+ else
+ crc >>= 1;
+ }
+ crc_table[i] = crc;
+ }
+ crc_table_built = 1;
+}
+
+static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
+{
+ unsigned char *current_byte;
+ u32 temp1, temp2, i;
+
+ current_byte = (unsigned char *) buffer;
+ /* Make sure the crc table is available */
+ if (!crc_table_built)
+ build_crc_table();
+ /* Process each byte in the buffer. */
+ for (i = 0; i < buffersize; i++) {
+ temp1 = (crc >> 8) & 0x00FFFFFF;
+ temp2 = crc_table[(crc ^ (u32) * current_byte) &
+ (u32) 0xff];
+ current_byte++;
+ crc = temp1 ^ temp2;
+ }
+ return crc;
+}
+
+/**
+ * le_bbr_table_sector_to_cpu
+ *
+ * Convert bbr meta data from on-disk (LE) format
+ * to the native cpu endian format.
+ **/
+static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
+{
+ int i;
+ p->signature = le32_to_cpup(&p->signature);
+ p->crc = le32_to_cpup(&p->crc);
+ p->sequence_number = le32_to_cpup(&p->sequence_number);
+ p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
+ for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
+ p->entries[i].bad_sect =
+ le64_to_cpup(&p->entries[i].bad_sect);
+ p->entries[i].replacement_sect =
+ le64_to_cpup(&p->entries[i].replacement_sect);
+ }
+}
+
+/**
+ * cpu_bbr_table_sector_to_le
+ *
+ * Convert bbr meta data from cpu endian format to on-disk (LE) format
+ **/
+static void cpu_bbr_table_sector_to_le(struct bbr_table *p,
+ struct bbr_table *le)
+{
+ int i;
+ le->signature = cpu_to_le32p(&p->signature);
+ le->crc = cpu_to_le32p(&p->crc);
+ le->sequence_number = cpu_to_le32p(&p->sequence_number);
+ le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
+ for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
+ le->entries[i].bad_sect =
+ cpu_to_le64p(&p->entries[i].bad_sect);
+ le->entries[i].replacement_sect =
+ cpu_to_le64p(&p->entries[i].replacement_sect);
+ }
+}
+
+/**
+ * validate_bbr_table_sector
+ *
+ * Check the specified BBR table sector for a valid signature and CRC. If it's
+ * valid, endian-convert the table sector.
+ **/
+static int validate_bbr_table_sector(struct bbr_table *p)
+{
+ int rc = 0;
+ int org_crc, final_crc;
+
+ if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
+ DMERR("dm-bbr: BBR table signature doesn't match!");
+ DMERR("dm-bbr: Found 0x%x. Expecting 0x%x",
+ le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (!p->crc) {
+ DMERR("dm-bbr: BBR table sector has no CRC!");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ org_crc = le32_to_cpup(&p->crc);
+ p->crc = 0;
+ final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
+ if (final_crc != org_crc) {
+ DMERR("dm-bbr: CRC failed!");
+ DMERR("dm-bbr: Found 0x%x. Expecting 0x%x",
+ org_crc, final_crc);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ p->crc = cpu_to_le32p(&org_crc);
+ le_bbr_table_sector_to_cpu(p);
+
+out:
+ return rc;
+}
+
+/**
+ * bbr_binary_tree_insert
+ *
+ * Insert a node into the binary tree.
+ **/
+static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
+ struct bbr_runtime_remap *newnode)
+{
+ struct bbr_runtime_remap **node = root;
+ while (node && *node) {
+ if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
+ node = &((*node)->right);
+ } else {
+ node = &((*node)->left);
+ }
+ }
+
+ newnode->left = newnode->right = NULL;
+ *node = newnode;
+}
+
+/**
+ * bbr_binary_search
+ *
+ * Search for a node that contains bad_sect == lsn.
+ **/
+static struct bbr_runtime_remap *bbr_binary_search(
+ struct bbr_runtime_remap *root,
+ u64 lsn)
+{
+ struct bbr_runtime_remap *node = root;
+ while (node) {
+ if (node->remap.bad_sect == lsn) {
+ break;
+ }
+ if (lsn > node->remap.bad_sect) {
+ node = node->right;
+ } else {
+ node = node->left;
+ }
+ }
+ return node;
+}
+
+/**
+ * bbr_insert_remap_entry
+ *
+ * Create a new remap entry and add it to the binary tree for this node.
+ **/
+static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
+ struct bbr_table_entry *new_bbr_entry)
+{
+ struct bbr_runtime_remap *newnode;
+
+ newnode = kmem_cache_alloc(bbr_remap_cache, GFP_NOIO);
+ if (!newnode) {
+ DMERR("dm-bbr: Could not allocate from remap cache!");
+ return -ENOMEM;
+ }
+ newnode->remap.bad_sect = new_bbr_entry->bad_sect;
+ newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
+ spin_lock_irq(&bbr_id->remap_root_lock);
+ bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
+ spin_unlock_irq(&bbr_id->remap_root_lock);
+ return 0;
+}
+
+/**
+ * bbr_table_to_remap_list
+ *
+ * The on-disk bbr table is sorted by the replacement sector LBA. In order to
+ * improve run time performance, the in memory remap list must be sorted by
+ * the bad sector LBA. This function is called at discovery time to initialize
+ * the remap list. This function assumes that at least one copy of meta data
+ * is valid.
+ **/
+static u32 bbr_table_to_remap_list(struct bbr_private *bbr_id)
+{
+ u32 in_use_blks = 0;
+ int i, j;
+ struct bbr_table *p;
+
+ for (i = 0, p = bbr_id->bbr_table;
+ i < bbr_id->nr_sects_bbr_table;
+ i++, p++) {
+ if (!p->in_use_cnt) {
+ break;
+ }
+ in_use_blks += p->in_use_cnt;
+ for (j = 0; j < p->in_use_cnt; j++) {
+ bbr_insert_remap_entry(bbr_id, &p->entries[j]);
+ }
+ }
+ if (in_use_blks) {
+ char b[32];
+ DMWARN("dm-bbr: There are %u BBR entries for device %s",
+ in_use_blks, format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+ }
+
+ return in_use_blks;
+}
+
+/**
+ * bbr_search_remap_entry
+ *
+ * Search remap entry for the specified sector. If found, return a pointer to
+ * the table entry. Otherwise, return NULL.
+ **/
+static struct bbr_table_entry *bbr_search_remap_entry(
+ struct bbr_private *bbr_id,
+ u64 lsn)
+{
+ struct bbr_runtime_remap *p;
+
+ spin_lock_irq(&bbr_id->remap_root_lock);
+ p = bbr_binary_search(bbr_id->remap_root, lsn);
+ spin_unlock_irq(&bbr_id->remap_root_lock);
+ if (p) {
+ return (&p->remap);
+ } else {
+ return NULL;
+ }
+}
+
+/**
+ * bbr_remap
+ *
+ * If *lsn is in the remap table, return TRUE and modify *lsn,
+ * else, return FALSE.
+ **/
+static inline int bbr_remap(struct bbr_private *bbr_id,
+ u64 *lsn)
+{
+ struct bbr_table_entry *e;
+
+ if (atomic_read(&bbr_id->in_use_replacement_blks)) {
+ e = bbr_search_remap_entry(bbr_id, *lsn);
+ if (e) {
+ *lsn = e->replacement_sect;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/**
+ * bbr_remap_probe
+ *
+ * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
+ * table, return TRUE. Otherwise, return FALSE.
+ **/
+static inline int bbr_remap_probe(struct bbr_private *bbr_id,
+ u64 lsn, u64 nr_sects)
+{
+ u64 tmp, cnt;
+
+ if (atomic_read(&bbr_id->in_use_replacement_blks)) {
+ for (cnt = 0, tmp = lsn;
+ cnt < nr_sects;
+ cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
+ if (bbr_remap(bbr_id,&tmp)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * bbr_setup
+ *
+ * Read the remap tables from disk and set up the initial remap tree.
+ **/
+static int bbr_setup(struct bbr_private *bbr_id)
+{
+ struct bbr_table *table = bbr_id->bbr_table;
+ struct io_region job;
+ unsigned long error;
+ int i, rc = 0;
+
+ job.bdev = bbr_id->dev->bdev;
+ job.count = 1;
+
+ /* Read and verify each BBR table sector individually. */
+ for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
+ job.sector = bbr_id->lba_table1 + i;
+ rc = dm_io_sync_vm(1, &job, READ, table, &error);
+ if (rc && bbr_id->lba_table2) {
+ job.sector = bbr_id->lba_table2 + i;
+ rc = dm_io_sync_vm(1, &job, READ, table, &error);
+ }
+ if (rc) {
+ goto out;
+ }
+
+ rc = validate_bbr_table_sector(table);
+ if (rc) {
+ goto out;
+ }
+ }
+ atomic_set(&bbr_id->in_use_replacement_blks,
+ bbr_table_to_remap_list(bbr_id));
+
+out:
+ if (rc) {
+ DMERR("dm-bbr: error during device setup: %d", rc);
+ }
+ return rc;
+}
+
+/**
+ * bbr_io_remap_error
+ * @bbr_id: Private data for the BBR node.
+ * @rw: READ or WRITE.
+ * @starting_lsn: Starting sector of request to remap.
+ * @count: Number of sectors in the request.
+ * @page: Page containing the data for the request.
+ * @offset: Byte-offset of the data within the page.
+ *
+ * For the requested range, try to write each sector individually. For each
+ * sector that fails, find the next available remap location and write the
+ * data to that new location. Then update the table and write both copies
+ * of the table to disk. Finally, update the in-memory mapping and do any
+ * other necessary bookkeeping.
+ **/
+static int bbr_io_remap_error(struct bbr_private *bbr_id,
+ int rw,
+ u64 starting_lsn,
+ u64 count,
+ struct page *page,
+ unsigned int offset)
+{
+ struct bbr_table *bbr_table;
+ struct io_region job;
+ struct page_list pl;
+ unsigned long table_sector_index;
+ unsigned long table_sector_offset;
+ unsigned long index;
+ unsigned long error;
+ u64 lsn, new_lsn;
+ char b[32];
+ int rc;
+
+ job.bdev = bbr_id->dev->bdev;
+ job.count = 1;
+ pl.page = page;
+ pl.next = NULL;
+
+ /* For each sector in the request. */
+ for (lsn = 0; lsn < count; lsn++, offset += SECTOR_SIZE) {
+ job.sector = starting_lsn + lsn;
+ rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+ while (rc) {
+ /* Find the next available relocation sector. */
+ new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
+ if (new_lsn >= bbr_id->nr_replacement_blks) {
+ /* No more replacement sectors available. */
+ return -EIO;
+ }
+ new_lsn += bbr_id->start_replacement_sect;
+
+ /* Write the data to its new location. */
+ DMWARN("dm-bbr: device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
+ format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+ starting_lsn + lsn, new_lsn);
+ job.sector = new_lsn;
+ rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+ if (rc) {
+ /* This replacement sector is bad.
+ * Try the next one.
+ */
+ DMERR("dm-bbr: device %s: replacement sector "PFU64" is bad. Skipping.",
+ format_dev_t(b, bbr_id->dev->bdev->bd_dev), new_lsn);
+ atomic_inc(&bbr_id->in_use_replacement_blks);
+ continue;
+ }
+
+ /* Add this new entry to the on-disk table. */
+ table_sector_index = new_lsn -
+ bbr_id->start_replacement_sect;
+ table_sector_offset = table_sector_index /
+ BBR_ENTRIES_PER_SECT;
+ index = table_sector_index % BBR_ENTRIES_PER_SECT;
+
+ bbr_table = &bbr_id->bbr_table[table_sector_offset];
+ bbr_table->entries[index].bad_sect = starting_lsn + lsn;
+ bbr_table->entries[index].replacement_sect = new_lsn;
+ bbr_table->in_use_cnt++;
+ bbr_table->sequence_number++;
+ bbr_table->crc = 0;
+ bbr_table->crc = calculate_crc(INITIAL_CRC,
+ bbr_table,
+ sizeof(struct bbr_table));
+
+ /* Write the table to disk. */
+ cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
+ if (bbr_id->lba_table1) {
+ job.sector = bbr_id->lba_table1 + table_sector_offset;
+ rc = dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
+ }
+ if (bbr_id->lba_table2) {
+ job.sector = bbr_id->lba_table2 + table_sector_offset;
+ rc |= dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
+ }
+ le_bbr_table_sector_to_cpu(bbr_table);
+
+ if (rc) {
+ /* Error writing one of the tables to disk. */
+ DMERR("dm-bbr: device %s: error updating BBR tables on disk.",
+ format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+ return rc;
+ }
+
+ /* Insert a new entry in the remapping binary-tree. */
+ rc = bbr_insert_remap_entry(bbr_id,
+ &bbr_table->entries[index]);
+ if (rc) {
+ DMERR("dm-bbr: device %s: error adding new entry to remap tree.",
+ format_dev_t(b, bbr_id->dev->bdev->bd_dev));
+ return rc;
+ }
+
+ atomic_inc(&bbr_id->in_use_replacement_blks);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * bbr_io_process_request
+ *
+ * For each sector in this request, check if the sector has already
+ * been remapped. If so, process all previous sectors in the request,
+ * followed by the remapped sector. Then reset the starting lsn and
+ * count, and keep going with the rest of the request as if it were
+ * a whole new request. If any of the sync_io's return an error,
+ * call the remapper to relocate the bad sector(s).
+ *
+ * 2.5 Note: When switching over to bio's for the I/O path, we have made
+ * the assumption that the I/O request described by the bio is one
+ * virtually contiguous piece of memory (even though the bio vector
+ * describes it using a series of physical page addresses).
+ **/
+static int bbr_io_process_request(struct bbr_private *bbr_id,
+ struct bio *bio)
+{
+ struct io_region job;
+ u64 starting_lsn = bio->bi_sector;
+ u64 count, lsn, remapped_lsn;
+ struct page_list pl;
+ unsigned int offset;
+ unsigned long error;
+ int i, rw = bio_data_dir(bio);
+ int rc = 0;
+
+ job.bdev = bbr_id->dev->bdev;
+ pl.next = NULL;
+
+ /* Each bio can contain multiple vectors, each with a different page.
+ * Treat each vector as a separate request.
+ */
+ /* KMC: Is this the right way to walk the bvec list? */
+ for (i = 0;
+ i < bio->bi_vcnt;
+ i++, bio->bi_idx++, starting_lsn += count) {
+
+ /* Bvec info: number of sectors, page,
+ * and byte-offset within page.
+ */
+ count = bio_iovec(bio)->bv_len >> SECTOR_SHIFT;
+ pl.page = bio_iovec(bio)->bv_page;
+ offset = bio_iovec(bio)->bv_offset;
+
+ /* For each sector in this bvec, check if the sector has
+ * already been remapped. If so, process all previous sectors
+ * in this request, followed by the remapped sector. Then reset
+ * the starting lsn and count and keep going with the rest of
+ * the request as if it were a whole new request.
+ */
+ for (lsn = 0; lsn < count; lsn++) {
+ remapped_lsn = starting_lsn + lsn;
+ rc = bbr_remap(bbr_id, &remapped_lsn);
+ if (!rc) {
+ /* This sector is fine. */
+ continue;
+ }
+
+ /* Process all sectors in the request up to this one. */
+ if (lsn > 0) {
+ job.sector = starting_lsn;
+ job.count = lsn;
+ rc = dm_io_sync(1, &job, rw, &pl,
+ offset, &error);
+ if (rc) {
+ /* If this I/O failed, then one of the
+ * sectors in this request needs to be
+ * relocated.
+ */
+ rc = bbr_io_remap_error(bbr_id, rw,
+ starting_lsn,
+ lsn, pl.page,
+ offset);
+ if (rc) {
+ /* KMC: Return? Or continue to next bvec? */
+ return rc;
+ }
+ }
+ offset += (lsn << SECTOR_SHIFT);
+ }
+
+ /* Process the remapped sector. */
+ job.sector = remapped_lsn;
+ job.count = 1;
+ rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+ if (rc) {
+ /* BUGBUG - Need more processing if this caused
+ * an error. If this I/O failed, then the
+ * existing remap is now bad, and we need to
+ * find a new remap. Can't use
+ * bbr_io_remap_error(), because the existing
+ * map entry needs to be changed, not added
+ * again, and the original table entry also
+ * needs to be changed.
+ */
+ return rc;
+ }
+
+ starting_lsn += (lsn + 1);
+ count -= (lsn + 1);
+ lsn = -1;
+ offset += SECTOR_SIZE;
+ }
+
+ /* Check for any remaining sectors after the last split. This
+ * could potentially be the whole request, but that should be a
+ * rare case because requests should only be processed by the
+ * thread if we know an error occurred or they contained one or
+ * more remapped sectors.
+ */
+ if (count) {
+ job.sector = starting_lsn;
+ job.count = count;
+ rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
+ if (rc) {
+ /* If this I/O failed, then one of the sectors
+ * in this request needs to be relocated.
+ */
+ rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
+ count, pl.page, offset);
+ if (rc) {
+ /* KMC: Return? Or continue to next bvec? */
+ return rc;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+static void bbr_io_process_requests(struct bbr_private *bbr_id,
+ struct bio *bio)
+{
+ struct bio *next;
+ int rc;
+
+ while (bio) {
+ next = bio->bi_next;
+ bio->bi_next = NULL;
+
+ rc = bbr_io_process_request(bbr_id, bio);
+
+ bio_endio(bio, bio->bi_size, rc);
+
+ bio = next;
+ }
+}
+
+/**
+ * bbr_remap_handler
+ *
+ * This is the handler for the bbr work-queue.
+ *
+ * I/O requests should only be sent to this handler if we know that:
+ * a) the request contains at least one remapped sector.
+ * or
+ * b) the request caused an error on the normal I/O path.
+ *
+ * This function uses synchronous I/O, so sending a request to this
+ * thread that doesn't need special processing will cause severe
+ * performance degradation.
+ **/
+static void bbr_remap_handler(void *data)
+{
+ struct bbr_private *bbr_id = data;
+ struct bio *bio;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+ bio = bio_list_get(&bbr_id->remap_ios);
+ spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+ bbr_io_process_requests(bbr_id, bio);
+}
+
+/**
+ * bbr_endio
+ *
+ * This is the callback for normal write requests. Check for an error
+ * during the I/O, and send to the thread for processing if necessary.
+ **/
+static int bbr_endio(struct dm_target *ti, struct bio *bio,
+ int error, union map_info *map_context)
+{
+ struct bbr_private *bbr_id = ti->private;
+ struct dm_bio_details *bbr_io = map_context->ptr;
+
+ if (error && bbr_io) {
+ unsigned long flags;
+ char b[32];
+
+ dm_bio_restore(bbr_io, bio);
+ map_context->ptr = NULL;
+
+ DMERR("dm-bbr: device %s: I/O failure on sector %lu. "
+ "Scheduling for retry.",
+ format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+ (unsigned long)bio->bi_sector);
+
+ spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+ bio_list_add(&bbr_id->remap_ios, bio);
+ spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+ queue_work(dm_bbr_wq, &bbr_id->remap_work);
+
+ error = 1;
+ }
+
+ if (bbr_io)
+ mempool_free(bbr_io, bbr_io_pool);
+
+ return error;
+}
+
+/**
+ * Construct a bbr mapping
+ **/
+static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct bbr_private *bbr_id;
+ unsigned long block_size;
+ char *end;
+ int rc = -EINVAL;
+
+ if (argc != 8) {
+ ti->error = "dm-bbr requires exactly 8 arguments: "
+ "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
+ goto out1;
+ }
+
+ bbr_id = bbr_alloc_private();
+ if (!bbr_id) {
+ ti->error = "dm-bbr: Error allocating bbr private data.";
+ goto out1;
+ }
+
+ bbr_id->offset = simple_strtoull(argv[1], &end, 10);
+ bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
+ bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
+ bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
+ bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
+ bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
+ block_size = simple_strtoul(argv[7], &end, 10);
+ bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
+
+ bbr_id->bbr_table = vmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT);
+ if (!bbr_id->bbr_table) {
+ ti->error = "dm-bbr: Error allocating bbr table.";
+ goto out2;
+ }
+
+ if (dm_get_device(ti, argv[0], 0, ti->len,
+ dm_table_get_mode(ti->table), &bbr_id->dev)) {
+ ti->error = "dm-bbr: Device lookup failed";
+ goto out2;
+ }
+
+ rc = bbr_setup(bbr_id);
+ if (rc) {
+ ti->error = "dm-bbr: Device setup failed";
+ goto out3;
+ }
+
+ ti->private = bbr_id;
+ return 0;
+
+out3:
+ dm_put_device(ti, bbr_id->dev);
+out2:
+ bbr_free_private(bbr_id);
+out1:
+ return rc;
+}
+
+static void bbr_dtr(struct dm_target *ti)
+{
+ struct bbr_private *bbr_id = ti->private;
+
+ dm_put_device(ti, bbr_id->dev);
+ bbr_free_private(bbr_id);
+}
+
+static int bbr_map(struct dm_target *ti, struct bio *bio,
+ union map_info *map_context)
+{
+ struct bbr_private *bbr_id = ti->private;
+ struct dm_bio_details *bbr_io;
+ unsigned long flags;
+ int rc = 1;
+
+ bio->bi_sector += bbr_id->offset;
+
+ if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
+ !bbr_remap_probe(bbr_id, bio->bi_sector, bio_sectors(bio))) {
+ /* No existing remaps or this request doesn't
+ * contain any remapped sectors.
+ */
+ bio->bi_bdev = bbr_id->dev->bdev;
+
+ bbr_io = mempool_alloc(bbr_io_pool, GFP_NOIO);
+ dm_bio_record(bbr_io, bio);
+ map_context->ptr = bbr_io;
+ } else {
+ /* This request has at least one remapped sector.
+ * Give it to the work-queue for processing.
+ */
+ map_context->ptr = NULL;
+ spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
+ bio_list_add(&bbr_id->remap_ios, bio);
+ spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
+
+ queue_work(dm_bbr_wq, &bbr_id->remap_work);
+ rc = 0;
+ }
+
+ return rc;
+}
+
+static int bbr_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned int maxlen)
+{
+ struct bbr_private *bbr_id = ti->private;
+ char b[BDEVNAME_SIZE];
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
+ format_dev_t(b, bbr_id->dev->bdev->bd_dev),
+ bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
+ bbr_id->nr_sects_bbr_table,
+ bbr_id->start_replacement_sect,
+ bbr_id->nr_replacement_blks,
+ bbr_id->blksize_in_sects << SECTOR_SHIFT);
+ break;
+ }
+ return 0;
+}
+
+static struct target_type bbr_target = {
+ .name = "bbr",
+ .version= {1, 0, 1},
+ .module = THIS_MODULE,
+ .ctr = bbr_ctr,
+ .dtr = bbr_dtr,
+ .map = bbr_map,
+ .end_io = bbr_endio,
+ .status = bbr_status,
+};
+
+int __init dm_bbr_init(void)
+{
+ int rc;
+
+ rc = dm_register_target(&bbr_target);
+ if (rc) {
+ DMERR("dm-bbr: error registering target.");
+ goto err1;
+ }
+
+ bbr_remap_cache = kmem_cache_create("bbr-remap",
+ sizeof(struct bbr_runtime_remap),
+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!bbr_remap_cache) {
+ DMERR("dm-bbr: error creating remap cache.");
+ rc = -ENOMEM;
+ goto err2;
+ }
+
+ bbr_io_cache = kmem_cache_create("bbr-io", sizeof(struct dm_bio_details),
+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!bbr_io_cache) {
+ DMERR("dm-bbr: error creating io cache.");
+ rc = -ENOMEM;
+ goto err3;
+ }
+
+ bbr_io_pool = mempool_create(256, mempool_alloc_slab,
+ mempool_free_slab, bbr_io_cache);
+ if (!bbr_io_pool) {
+ DMERR("dm-bbr: error creating io mempool.");
+ rc = -ENOMEM;
+ goto err4;
+ }
+
+ dm_bbr_wq = create_workqueue("dm-bbr");
+ if (!dm_bbr_wq) {
+ DMERR("dm-bbr: error creating work-queue.");
+ rc = -ENOMEM;
+ goto err5;
+ }
+
+ rc = dm_io_get(1);
+ if (rc) {
+ DMERR("dm-bbr: error initializing I/O service.");
+ goto err6;
+ }
+
+ return 0;
+
+err6:
+ destroy_workqueue(dm_bbr_wq);
+err5:
+ mempool_destroy(bbr_io_pool);
+err4:
+ kmem_cache_destroy(bbr_io_cache);
+err3:
+ kmem_cache_destroy(bbr_remap_cache);
+err2:
+ dm_unregister_target(&bbr_target);
+err1:
+ return rc;
+}
+
+void __exit dm_bbr_exit(void)
+{
+ dm_io_put(1);
+ destroy_workqueue(dm_bbr_wq);
+ mempool_destroy(bbr_io_pool);
+ kmem_cache_destroy(bbr_io_cache);
+ kmem_cache_destroy(bbr_remap_cache);
+ dm_unregister_target(&bbr_target);
+}
+
+module_init(dm_bbr_init);
+module_exit(dm_bbr_exit);
+MODULE_LICENSE("GPL");
Index: linux-git/drivers/md/dm-bbr.h
===================================================================
--- /dev/null
+++ linux-git/drivers/md/dm-bbr.h
@@ -0,0 +1,125 @@
+/*
+ * (C) Copyright IBM Corp. 2002, 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * linux/drivers/md/dm-bbr.h
+ *
+ * Bad-block-relocation (BBR) target for device-mapper.
+ *
+ * The BBR target is designed to remap I/O write failures to another safe
+ * location on disk. Note that most disk drives have BBR built into them;
+ * this means that our software BBR will only be activated when all
+ * hardware BBR replacement sectors have been used.
+ */
+
+#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
+#define BBR_ENTRIES_PER_SECT 31
+#define INITIAL_CRC 0xFFFFFFFF
+#define CRC_POLYNOMIAL 0xEDB88320L
+
+/**
+ * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
+ * Use these in place of %Ld, %Lu, and %Lx.
+ **/
+#if BITS_PER_LONG > 32
+#define PFU64 "%lu"
+#else
+#define PFU64 "%Lu"
+#endif
+
+/**
+ * struct bbr_table_entry
+ * @bad_sect: LBA of bad location.
+ * @replacement_sect: LBA of new location.
+ *
+ * Structure to describe one BBR remap.
+ **/
+struct bbr_table_entry {
+ u64 bad_sect;
+ u64 replacement_sect;
+};
+
+/**
+ * struct bbr_table
+ * @signature: Signature on each BBR table sector.
+ * @crc: CRC for this table sector.
+ * @sequence_number: Used to resolve conflicts when primary and secondary
+ * tables do not match.
+ * @in_use_cnt: Number of in-use table entries.
+ * @entries: Actual table of remaps.
+ *
+ * Structure to describe each sector of the metadata table. Each sector in this
+ * table can describe 31 remapped sectors.
+ **/
+struct bbr_table {
+ u32 signature;
+ u32 crc;
+ u32 sequence_number;
+ u32 in_use_cnt;
+ struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
+};
+
+/**
+ * struct bbr_runtime_remap
+ *
+ * Node in the binary tree used to keep track of remaps.
+ **/
+struct bbr_runtime_remap {
+ struct bbr_table_entry remap;
+ struct bbr_runtime_remap *left;
+ struct bbr_runtime_remap *right;
+};
+
+/**
+ * struct bbr_private
+ * @dev: Info about underlying device.
+ * @bbr_table: Copy of metadata table.
+ * @remap_root: Binary tree containing all remaps.
+ * @remap_root_lock: Lock for the binary tree.
+ * @remap_work: For adding work items to the work-queue.
+ * @remap_ios: List of I/Os for the work-queue to handle.
+ * @remap_ios_lock: Lock for the remap_ios list.
+ * @offset: LBA of data area.
+ * @lba_table1: LBA of primary BBR table.
+ * @lba_table2: LBA of secondary BBR table.
+ * @nr_sects_bbr_table: Size of each BBR table.
+ * @nr_replacement_blks: Number of replacement blocks.
+ * @start_replacement_sect: LBA of start of replacement blocks.
+ * @blksize_in_sects: Size of each block.
+ * @in_use_replacement_blks: Current number of remapped blocks.
+ *
+ * Private data for each BBR target.
+ **/
+struct bbr_private {
+ struct dm_dev *dev;
+ struct bbr_table *bbr_table;
+ struct bbr_runtime_remap *remap_root;
+ spinlock_t remap_root_lock;
+
+ struct work_struct remap_work;
+ struct bio_list remap_ios;
+ spinlock_t remap_ios_lock;
+
+ u64 offset;
+ u64 lba_table1;
+ u64 lba_table2;
+ u64 nr_sects_bbr_table;
+ u64 start_replacement_sect;
+ u64 nr_replacement_blks;
+ u32 blksize_in_sects;
+ atomic_t in_use_replacement_blks;
+};
+
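Usage note: as bbr_ctr() above shows, the target takes eight arguments on the device-mapper table line (device, data offset, primary and secondary table LBAs, table size in sectors, first replacement sector, number of replacement blocks, and block size in bytes). A purely hypothetical invocation, with made-up LBAs standing in for a layout that would normally be produced by a metadata tool such as EVMS, might look like:

    echo "0 2097152 bbr /dev/sdb1 0 2097160 2097224 63 2097288 128 4096" | dmsetup create test-bbr

The sector count (2097152) and all metadata LBAs here are illustrative only; the real values come from whatever tool laid out the BBR metadata on the device.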

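For reference, the metadata checksum computed by calculate_crc() above is a table-driven CRC-32 with the reflected polynomial 0xEDB88320, an initial value of 0xFFFFFFFF, and no final inversion. A minimal userspace C sketch that reproduces the same value, handy when inspecting on-disk BBR table sectors by hand, could look like this (the function name bbr_crc32 is illustrative, not part of the patch):

    #include <stdint.h>
    #include <stddef.h>

    #define INITIAL_CRC    0xFFFFFFFFu
    #define CRC_POLYNOMIAL 0xEDB88320u

    /* Same table-driven CRC-32 as the kernel code above: no final XOR,
     * so the result matches calculate_crc(INITIAL_CRC, buf, len). */
    static uint32_t bbr_crc32(uint32_t crc, const void *buf, size_t len)
    {
        static uint32_t table[256];
        static int built;
        const unsigned char *p = buf;

        if (!built) {
            for (uint32_t i = 0; i < 256; i++) {
                uint32_t c = i;
                for (int j = 0; j < 8; j++)
                    c = (c & 1) ? (c >> 1) ^ CRC_POLYNOMIAL : c >> 1;
                table[i] = c;
            }
            built = 1;
        }
        /* (crc >> 8) already has a zero top byte for uint32_t, so this is
         * equivalent to the kernel's (crc >> 8) & 0x00FFFFFF. */
        while (len--)
            crc = (crc >> 8) ^ table[(crc ^ *p++) & 0xff];
        return crc;
    }

As in validate_bbr_table_sector(), the crc field of the 512-byte table sector must be zeroed before checksumming, and the stored value is compared without any final inversion.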