/[linux-patches]/genpatches-2.6/trunk/2.6.18-pre/4105_dm-bbr.patch
Gentoo

Contents of /genpatches-2.6/trunk/2.6.18-pre/4105_dm-bbr.patch

Parent Directory | Revision Log


Revision 358 - (show annotations) (download)
Sat Apr 1 05:31:54 2006 UTC (12 years, 5 months ago) by phreak
Original Path: genpatches-2.6/trunk/2.6.17-pre/4105_dm-bbr.patch
File size: 32133 byte(s)
Cloning 2.6.16 to start work on 2.6.17
1 Index: linux-git/drivers/md/dm-bbr.c
2 ===================================================================
3 --- /dev/null
4 +++ linux-git/drivers/md/dm-bbr.c
5 @@ -0,0 +1,1003 @@
6 +/*
7 + * (C) Copyright IBM Corp. 2002, 2004
8 + *
9 + * This program is free software; you can redistribute it and/or modify
10 + * it under the terms of the GNU General Public License as published by
11 + * the Free Software Foundation; either version 2 of the License, or
12 + * (at your option) any later version.
13 + *
14 + * This program is distributed in the hope that it will be useful,
15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17 + * the GNU General Public License for more details.
18 + *
19 + * You should have received a copy of the GNU General Public License
20 + * along with this program; if not, write to the Free Software
21 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 + *
23 + * linux/drivers/md/dm-bbr.c
24 + *
25 + * Bad-block-relocation (BBR) target for device-mapper.
26 + *
27 + * The BBR target is designed to remap I/O write failures to another safe
28 + * location on disk. Note that most disk drives have BBR built into them;
29 + * this means that our software BBR will be only activated when all hardware
30 + * BBR replacement sectors have been used.
31 + */
32 +
33 +#include <linux/module.h>
34 +#include <linux/init.h>
35 +#include <linux/bio.h>
36 +#include <linux/spinlock.h>
37 +#include <linux/slab.h>
38 +#include <linux/mempool.h>
39 +#include <linux/workqueue.h>
40 +#include <linux/vmalloc.h>
41 +
42 +#include "dm.h"
43 +#include "dm-bio-list.h"
44 +#include "dm-bio-record.h"
45 +#include "dm-bbr.h"
46 +#include "dm-io.h"
47 +
48 +#define SECTOR_SIZE (1 << SECTOR_SHIFT)
49 +
50 +static struct workqueue_struct *dm_bbr_wq = NULL;
51 +static void bbr_remap_handler(void *data);
52 +static kmem_cache_t *bbr_remap_cache;
53 +static kmem_cache_t *bbr_io_cache;
54 +static mempool_t *bbr_io_pool;
55 +
56 +/**
57 + * bbr_binary_tree_destroy
58 + *
59 + * Destroy the binary tree.
60 + **/
61 +static void bbr_binary_tree_destroy(struct bbr_runtime_remap *root)
62 +{
63 + struct bbr_runtime_remap **link = NULL;
64 + struct bbr_runtime_remap *node = root;
65 +
66 + while (node) {
67 + if (node->left) {
68 + link = &(node->left);
69 + node = node->left;
70 + continue;
71 + }
72 + if (node->right) {
73 + link = &(node->right);
74 + node = node->right;
75 + continue;
76 + }
77 +
78 + kmem_cache_free(bbr_remap_cache, node);
79 + if (node == root) {
80 + /* If root is deleted, we're done. */
81 + break;
82 + }
83 +
84 + /* Back to root. */
85 + node = root;
86 + *link = NULL;
87 + }
88 +}
89 +
90 +static void bbr_free_remap(struct bbr_private *bbr_id)
91 +{
92 + spin_lock_irq(&bbr_id->remap_root_lock);
93 + bbr_binary_tree_destroy(bbr_id->remap_root);
94 + bbr_id->remap_root = NULL;
95 + spin_unlock_irq(&bbr_id->remap_root_lock);
96 +}
97 +
98 +static struct bbr_private *bbr_alloc_private(void)
99 +{
100 + struct bbr_private *bbr_id;
101 +
102 + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
103 + if (bbr_id) {
104 + memset(bbr_id, 0, sizeof(*bbr_id));
105 + INIT_WORK(&bbr_id->remap_work, bbr_remap_handler, bbr_id);
106 + bbr_id->remap_root_lock = SPIN_LOCK_UNLOCKED;
107 + bbr_id->remap_ios_lock = SPIN_LOCK_UNLOCKED;
108 + bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
109 + }
110 +
111 + return bbr_id;
112 +}
113 +
114 +static void bbr_free_private(struct bbr_private *bbr_id)
115 +{
116 + if (bbr_id->bbr_table) {
117 + vfree(bbr_id->bbr_table);
118 + }
119 + bbr_free_remap(bbr_id);
120 + kfree(bbr_id);
121 +}
122 +
123 +static u32 crc_table[256];
124 +static u32 crc_table_built = 0;
125 +
126 +static void build_crc_table(void)
127 +{
128 + u32 i, j, crc;
129 +
130 + for (i = 0; i <= 255; i++) {
131 + crc = i;
132 + for (j = 8; j > 0; j--) {
133 + if (crc & 1)
134 + crc = (crc >> 1) ^ CRC_POLYNOMIAL;
135 + else
136 + crc >>= 1;
137 + }
138 + crc_table[i] = crc;
139 + }
140 + crc_table_built = 1;
141 +}
142 +
143 +static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
144 +{
145 + unsigned char *current_byte;
146 + u32 temp1, temp2, i;
147 +
148 + current_byte = (unsigned char *) buffer;
149 + /* Make sure the crc table is available */
150 + if (!crc_table_built)
151 + build_crc_table();
152 + /* Process each byte in the buffer. */
153 + for (i = 0; i < buffersize; i++) {
154 + temp1 = (crc >> 8) & 0x00FFFFFF;
155 + temp2 = crc_table[(crc ^ (u32) * current_byte) &
156 + (u32) 0xff];
157 + current_byte++;
158 + crc = temp1 ^ temp2;
159 + }
160 + return crc;
161 +}
162 +
163 +/**
164 + * le_bbr_table_sector_to_cpu
165 + *
166 + * Convert bbr meta data from on-disk (LE) format
167 + * to the native cpu endian format.
168 + **/
169 +static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
170 +{
171 + int i;
172 + p->signature = le32_to_cpup(&p->signature);
173 + p->crc = le32_to_cpup(&p->crc);
174 + p->sequence_number = le32_to_cpup(&p->sequence_number);
175 + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
176 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
177 + p->entries[i].bad_sect =
178 + le64_to_cpup(&p->entries[i].bad_sect);
179 + p->entries[i].replacement_sect =
180 + le64_to_cpup(&p->entries[i].replacement_sect);
181 + }
182 +}
183 +
184 +/**
185 + * cpu_bbr_table_sector_to_le
186 + *
187 + * Convert bbr meta data from cpu endian format to on-disk (LE) format
188 + **/
189 +static void cpu_bbr_table_sector_to_le(struct bbr_table *p,
190 + struct bbr_table *le)
191 +{
192 + int i;
193 + le->signature = cpu_to_le32p(&p->signature);
194 + le->crc = cpu_to_le32p(&p->crc);
195 + le->sequence_number = cpu_to_le32p(&p->sequence_number);
196 + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
197 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
198 + le->entries[i].bad_sect =
199 + cpu_to_le64p(&p->entries[i].bad_sect);
200 + le->entries[i].replacement_sect =
201 + cpu_to_le64p(&p->entries[i].replacement_sect);
202 + }
203 +}
204 +
205 +/**
206 + * validate_bbr_table_sector
207 + *
208 + * Check the specified BBR table sector for a valid signature and CRC. If it's
209 + * valid, endian-convert the table sector.
210 + **/
211 +static int validate_bbr_table_sector(struct bbr_table *p)
212 +{
213 + int rc = 0;
214 + int org_crc, final_crc;
215 +
216 + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
217 + DMERR("dm-bbr: BBR table signature doesn't match!");
218 + DMERR("dm-bbr: Found 0x%x. Expecting 0x%x",
219 + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
220 + rc = -EINVAL;
221 + goto out;
222 + }
223 +
224 + if (!p->crc) {
225 + DMERR("dm-bbr: BBR table sector has no CRC!");
226 + rc = -EINVAL;
227 + goto out;
228 + }
229 +
230 + org_crc = le32_to_cpup(&p->crc);
231 + p->crc = 0;
232 + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
233 + if (final_crc != org_crc) {
234 + DMERR("dm-bbr: CRC failed!");
235 + DMERR("dm-bbr: Found 0x%x. Expecting 0x%x",
236 + org_crc, final_crc);
237 + rc = -EINVAL;
238 + goto out;
239 + }
240 +
241 + p->crc = cpu_to_le32p(&org_crc);
242 + le_bbr_table_sector_to_cpu(p);
243 +
244 +out:
245 + return rc;
246 +}
247 +
248 +/**
249 + * bbr_binary_tree_insert
250 + *
251 + * Insert a node into the binary tree.
252 + **/
253 +static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
254 + struct bbr_runtime_remap *newnode)
255 +{
256 + struct bbr_runtime_remap **node = root;
257 + while (node && *node) {
258 + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
259 + node = &((*node)->right);
260 + } else {
261 + node = &((*node)->left);
262 + }
263 + }
264 +
265 + newnode->left = newnode->right = NULL;
266 + *node = newnode;
267 +}
268 +
269 +/**
270 + * bbr_binary_search
271 + *
272 + * Search for a node that contains bad_sect == lsn.
273 + **/
274 +static struct bbr_runtime_remap *bbr_binary_search(
275 + struct bbr_runtime_remap *root,
276 + u64 lsn)
277 +{
278 + struct bbr_runtime_remap *node = root;
279 + while (node) {
280 + if (node->remap.bad_sect == lsn) {
281 + break;
282 + }
283 + if (lsn > node->remap.bad_sect) {
284 + node = node->right;
285 + } else {
286 + node = node->left;
287 + }
288 + }
289 + return node;
290 +}
291 +
292 +/**
293 + * bbr_insert_remap_entry
294 + *
295 + * Create a new remap entry and add it to the binary tree for this node.
296 + **/
297 +static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
298 + struct bbr_table_entry *new_bbr_entry)
299 +{
300 + struct bbr_runtime_remap *newnode;
301 +
302 + newnode = kmem_cache_alloc(bbr_remap_cache, GFP_NOIO);
303 + if (!newnode) {
304 + DMERR("dm-bbr: Could not allocate from remap cache!");
305 + return -ENOMEM;
306 + }
307 + newnode->remap.bad_sect = new_bbr_entry->bad_sect;
308 + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
309 + spin_lock_irq(&bbr_id->remap_root_lock);
310 + bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
311 + spin_unlock_irq(&bbr_id->remap_root_lock);
312 + return 0;
313 +}
314 +
315 +/**
316 + * bbr_table_to_remap_list
317 + *
318 + * The on-disk bbr table is sorted by the replacement sector LBA. In order to
319 + * improve run time performance, the in memory remap list must be sorted by
320 + * the bad sector LBA. This function is called at discovery time to initialize
321 + * the remap list. This function assumes that at least one copy of meta data
322 + * is valid.
323 + **/
324 +static u32 bbr_table_to_remap_list(struct bbr_private *bbr_id)
325 +{
326 + u32 in_use_blks = 0;
327 + int i, j;
328 + struct bbr_table *p;
329 +
330 + for (i = 0, p = bbr_id->bbr_table;
331 + i < bbr_id->nr_sects_bbr_table;
332 + i++, p++) {
333 + if (!p->in_use_cnt) {
334 + break;
335 + }
336 + in_use_blks += p->in_use_cnt;
337 + for (j = 0; j < p->in_use_cnt; j++) {
338 + bbr_insert_remap_entry(bbr_id, &p->entries[j]);
339 + }
340 + }
341 + if (in_use_blks) {
342 + char b[32];
343 + DMWARN("dm-bbr: There are %u BBR entries for device %s",
344 + in_use_blks, format_dev_t(b, bbr_id->dev->bdev->bd_dev));
345 + }
346 +
347 + return in_use_blks;
348 +}
349 +
350 +/**
351 + * bbr_search_remap_entry
352 + *
353 + * Search remap entry for the specified sector. If found, return a pointer to
354 + * the table entry. Otherwise, return NULL.
355 + **/
356 +static struct bbr_table_entry *bbr_search_remap_entry(
357 + struct bbr_private *bbr_id,
358 + u64 lsn)
359 +{
360 + struct bbr_runtime_remap *p;
361 +
362 + spin_lock_irq(&bbr_id->remap_root_lock);
363 + p = bbr_binary_search(bbr_id->remap_root, lsn);
364 + spin_unlock_irq(&bbr_id->remap_root_lock);
365 + if (p) {
366 + return (&p->remap);
367 + } else {
368 + return NULL;
369 + }
370 +}
371 +
372 +/**
373 + * bbr_remap
374 + *
375 + * If *lsn is in the remap table, return TRUE and modify *lsn,
376 + * else, return FALSE.
377 + **/
378 +static inline int bbr_remap(struct bbr_private *bbr_id,
379 + u64 *lsn)
380 +{
381 + struct bbr_table_entry *e;
382 +
383 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
384 + e = bbr_search_remap_entry(bbr_id, *lsn);
385 + if (e) {
386 + *lsn = e->replacement_sect;
387 + return 1;
388 + }
389 + }
390 + return 0;
391 +}
392 +
393 +/**
394 + * bbr_remap_probe
395 + *
396 + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
397 + * table return TRUE, Else, return FALSE.
398 + **/
399 +static inline int bbr_remap_probe(struct bbr_private *bbr_id,
400 + u64 lsn, u64 nr_sects)
401 +{
402 + u64 tmp, cnt;
403 +
404 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
405 + for (cnt = 0, tmp = lsn;
406 + cnt < nr_sects;
407 + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
408 + if (bbr_remap(bbr_id,&tmp)) {
409 + return 1;
410 + }
411 + }
412 + }
413 + return 0;
414 +}
415 +
416 +/**
417 + * bbr_setup
418 + *
419 + * Read the remap tables from disk and set up the initial remap tree.
420 + **/
421 +static int bbr_setup(struct bbr_private *bbr_id)
422 +{
423 + struct bbr_table *table = bbr_id->bbr_table;
424 + struct io_region job;
425 + unsigned long error;
426 + int i, rc = 0;
427 +
428 + job.bdev = bbr_id->dev->bdev;
429 + job.count = 1;
430 +
431 + /* Read and verify each BBR table sector individually. */
432 + for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
433 + job.sector = bbr_id->lba_table1 + i;
434 + rc = dm_io_sync_vm(1, &job, READ, table, &error);
435 + if (rc && bbr_id->lba_table2) {
436 + job.sector = bbr_id->lba_table2 + i;
437 + rc = dm_io_sync_vm(1, &job, READ, table, &error);
438 + }
439 + if (rc) {
440 + goto out;
441 + }
442 +
443 + rc = validate_bbr_table_sector(table);
444 + if (rc) {
445 + goto out;
446 + }
447 + }
448 + atomic_set(&bbr_id->in_use_replacement_blks,
449 + bbr_table_to_remap_list(bbr_id));
450 +
451 +out:
452 + if (rc) {
453 + DMERR("dm-bbr: error during device setup: %d", rc);
454 + }
455 + return rc;
456 +}
457 +
458 +/**
459 + * bbr_io_remap_error
460 + * @bbr_id: Private data for the BBR node.
461 + * @rw: READ or WRITE.
462 + * @starting_lsn: Starting sector of request to remap.
463 + * @count: Number of sectors in the request.
464 + * @page: Page containing the data for the request.
465 + * @offset: Byte-offset of the data within the page.
466 + *
467 + * For the requested range, try to write each sector individually. For each
468 + * sector that fails, find the next available remap location and write the
469 + * data to that new location. Then update the table and write both copies
470 + * of the table to disk. Finally, update the in-memory mapping and do any
471 + * other necessary bookkeeping.
472 + **/
473 +static int bbr_io_remap_error(struct bbr_private *bbr_id,
474 + int rw,
475 + u64 starting_lsn,
476 + u64 count,
477 + struct page *page,
478 + unsigned int offset)
479 +{
480 + struct bbr_table *bbr_table;
481 + struct io_region job;
482 + struct page_list pl;
483 + unsigned long table_sector_index;
484 + unsigned long table_sector_offset;
485 + unsigned long index;
486 + unsigned long error;
487 + u64 lsn, new_lsn;
488 + char b[32];
489 + int rc;
490 +
491 + job.bdev = bbr_id->dev->bdev;
492 + job.count = 1;
493 + pl.page = page;
494 + pl.next = NULL;
495 +
496 + /* For each sector in the request. */
497 + for (lsn = 0; lsn < count; lsn++, offset += SECTOR_SIZE) {
498 + job.sector = starting_lsn + lsn;
499 + rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
500 + while (rc) {
501 + /* Find the next available relocation sector. */
502 + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
503 + if (new_lsn >= bbr_id->nr_replacement_blks) {
504 + /* No more replacement sectors available. */
505 + return -EIO;
506 + }
507 + new_lsn += bbr_id->start_replacement_sect;
508 +
509 + /* Write the data to its new location. */
510 + DMWARN("dm-bbr: device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
511 + format_dev_t(b, bbr_id->dev->bdev->bd_dev),
512 + starting_lsn + lsn, new_lsn);
513 + job.sector = new_lsn;
514 + rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
515 + if (rc) {
516 + /* This replacement sector is bad.
517 + * Try the next one.
518 + */
519 + DMERR("dm-bbr: device %s: replacement sector "PFU64" is bad. Skipping.",
520 + format_dev_t(b, bbr_id->dev->bdev->bd_dev), new_lsn);
521 + atomic_inc(&bbr_id->in_use_replacement_blks);
522 + continue;
523 + }
524 +
525 + /* Add this new entry to the on-disk table. */
526 + table_sector_index = new_lsn -
527 + bbr_id->start_replacement_sect;
528 + table_sector_offset = table_sector_index /
529 + BBR_ENTRIES_PER_SECT;
530 + index = table_sector_index % BBR_ENTRIES_PER_SECT;
531 +
532 + bbr_table = &bbr_id->bbr_table[table_sector_offset];
533 + bbr_table->entries[index].bad_sect = starting_lsn + lsn;
534 + bbr_table->entries[index].replacement_sect = new_lsn;
535 + bbr_table->in_use_cnt++;
536 + bbr_table->sequence_number++;
537 + bbr_table->crc = 0;
538 + bbr_table->crc = calculate_crc(INITIAL_CRC,
539 + bbr_table,
540 + sizeof(struct bbr_table));
541 +
542 + /* Write the table to disk. */
543 + cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
544 + if (bbr_id->lba_table1) {
545 + job.sector = bbr_id->lba_table1 + table_sector_offset;
546 + rc = dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
547 + }
548 + if (bbr_id->lba_table2) {
549 + job.sector = bbr_id->lba_table2 + table_sector_offset;
550 + rc |= dm_io_sync_vm(1, &job, WRITE, bbr_table, &error);
551 + }
552 + le_bbr_table_sector_to_cpu(bbr_table);
553 +
554 + if (rc) {
555 + /* Error writing one of the tables to disk. */
556 + DMERR("dm-bbr: device %s: error updating BBR tables on disk.",
557 + format_dev_t(b, bbr_id->dev->bdev->bd_dev));
558 + return rc;
559 + }
560 +
561 + /* Insert a new entry in the remapping binary-tree. */
562 + rc = bbr_insert_remap_entry(bbr_id,
563 + &bbr_table->entries[index]);
564 + if (rc) {
565 + DMERR("dm-bbr: device %s: error adding new entry to remap tree.",
566 + format_dev_t(b, bbr_id->dev->bdev->bd_dev));
567 + return rc;
568 + }
569 +
570 + atomic_inc(&bbr_id->in_use_replacement_blks);
571 + }
572 + }
573 +
574 + return 0;
575 +}
576 +
577 +/**
578 + * bbr_io_process_request
579 + *
580 + * For each sector in this request, check if the sector has already
581 + * been remapped. If so, process all previous sectors in the request,
582 + * followed by the remapped sector. Then reset the starting lsn and
583 + * count, and keep going with the rest of the request as if it were
584 + * a whole new request. If any of the sync_io's return an error,
585 + * call the remapper to relocate the bad sector(s).
586 + *
587 + * 2.5 Note: When switching over to bio's for the I/O path, we have made
588 + * the assumption that the I/O request described by the bio is one
589 + * virtually contiguous piece of memory (even though the bio vector
590 + * describes it using a series of physical page addresses).
591 + **/
592 +static int bbr_io_process_request(struct bbr_private *bbr_id,
593 + struct bio *bio)
594 +{
595 + struct io_region job;
596 + u64 starting_lsn = bio->bi_sector;
597 + u64 count, lsn, remapped_lsn;
598 + struct page_list pl;
599 + unsigned int offset;
600 + unsigned long error;
601 + int i, rw = bio_data_dir(bio);
602 + int rc = 0;
603 +
604 + job.bdev = bbr_id->dev->bdev;
605 + pl.next = NULL;
606 +
607 + /* Each bio can contain multiple vectors, each with a different page.
608 + * Treat each vector as a separate request.
609 + */
610 + /* KMC: Is this the right way to walk the bvec list? */
611 + for (i = 0;
612 + i < bio->bi_vcnt;
613 + i++, bio->bi_idx++, starting_lsn += count) {
614 +
615 + /* Bvec info: number of sectors, page,
616 + * and byte-offset within page.
617 + */
618 + count = bio_iovec(bio)->bv_len >> SECTOR_SHIFT;
619 + pl.page = bio_iovec(bio)->bv_page;
620 + offset = bio_iovec(bio)->bv_offset;
621 +
622 + /* For each sector in this bvec, check if the sector has
623 + * already been remapped. If so, process all previous sectors
624 + * in this request, followed by the remapped sector. Then reset
625 + * the starting lsn and count and keep going with the rest of
626 + * the request as if it were a whole new request.
627 + */
628 + for (lsn = 0; lsn < count; lsn++) {
629 + remapped_lsn = starting_lsn + lsn;
630 + rc = bbr_remap(bbr_id, &remapped_lsn);
631 + if (!rc) {
632 + /* This sector is fine. */
633 + continue;
634 + }
635 +
636 + /* Process all sectors in the request up to this one. */
637 + if (lsn > 0) {
638 + job.sector = starting_lsn;
639 + job.count = lsn;
640 + rc = dm_io_sync(1, &job, rw, &pl,
641 + offset, &error);
642 + if (rc) {
643 + /* If this I/O failed, then one of the
644 + * sectors in this request needs to be
645 + * relocated.
646 + */
647 + rc = bbr_io_remap_error(bbr_id, rw,
648 + starting_lsn,
649 + lsn, pl.page,
650 + offset);
651 + if (rc) {
652 + /* KMC: Return? Or continue to next bvec? */
653 + return rc;
654 + }
655 + }
656 + offset += (lsn << SECTOR_SHIFT);
657 + }
658 +
659 + /* Process the remapped sector. */
660 + job.sector = remapped_lsn;
661 + job.count = 1;
662 + rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
663 + if (rc) {
664 + /* BUGBUG - Need more processing if this caused
665 + * an error. If this I/O failed, then the
666 + * existing remap is now bad, and we need to
667 + * find a new remap. Can't use
668 + * bbr_io_remap_error(), because the existing
669 + * map entry needs to be changed, not added
670 + * again, and the original table entry also
671 + * needs to be changed.
672 + */
673 + return rc;
674 + }
675 +
676 + starting_lsn += (lsn + 1);
677 + count -= (lsn + 1);
678 + lsn = -1;
679 + offset += SECTOR_SIZE;
680 + }
681 +
682 + /* Check for any remaining sectors after the last split. This
683 + * could potentially be the whole request, but that should be a
684 + * rare case because requests should only be processed by the
685 + * thread if we know an error occurred or they contained one or
686 + * more remapped sectors.
687 + */
688 + if (count) {
689 + job.sector = starting_lsn;
690 + job.count = count;
691 + rc = dm_io_sync(1, &job, rw, &pl, offset, &error);
692 + if (rc) {
693 + /* If this I/O failed, then one of the sectors
694 + * in this request needs to be relocated.
695 + */
696 + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
697 + count, pl.page, offset);
698 + if (rc) {
699 + /* KMC: Return? Or continue to next bvec? */
700 + return rc;
701 + }
702 + }
703 + }
704 + }
705 +
706 + return 0;
707 +}
708 +
709 +static void bbr_io_process_requests(struct bbr_private *bbr_id,
710 + struct bio *bio)
711 +{
712 + struct bio *next;
713 + int rc;
714 +
715 + while (bio) {
716 + next = bio->bi_next;
717 + bio->bi_next = NULL;
718 +
719 + rc = bbr_io_process_request(bbr_id, bio);
720 +
721 + bio_endio(bio, bio->bi_size, rc);
722 +
723 + bio = next;
724 + }
725 +}
726 +
727 +/**
728 + * bbr_remap_handler
729 + *
730 + * This is the handler for the bbr work-queue.
731 + *
732 + * I/O requests should only be sent to this handler if we know that:
733 + * a) the request contains at least one remapped sector.
734 + * or
735 + * b) the request caused an error on the normal I/O path.
736 + *
737 + * This function uses synchronous I/O, so sending a request to this
738 + * thread that doesn't need special processing will cause severe
739 + * performance degradation.
740 + **/
741 +static void bbr_remap_handler(void *data)
742 +{
743 + struct bbr_private *bbr_id = data;
744 + struct bio *bio;
745 + unsigned long flags;
746 +
747 + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
748 + bio = bio_list_get(&bbr_id->remap_ios);
749 + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
750 +
751 + bbr_io_process_requests(bbr_id, bio);
752 +}
753 +
754 +/**
755 + * bbr_endio
756 + *
757 + * This is the callback for normal write requests. Check for an error
758 + * during the I/O, and send to the thread for processing if necessary.
759 + **/
760 +static int bbr_endio(struct dm_target *ti, struct bio *bio,
761 + int error, union map_info *map_context)
762 +{
763 + struct bbr_private *bbr_id = ti->private;
764 + struct dm_bio_details *bbr_io = map_context->ptr;
765 +
766 + if (error && bbr_io) {
767 + unsigned long flags;
768 + char b[32];
769 +
770 + dm_bio_restore(bbr_io, bio);
771 + map_context->ptr = NULL;
772 +
773 + DMERR("dm-bbr: device %s: I/O failure on sector %lu. "
774 + "Scheduling for retry.",
775 + format_dev_t(b, bbr_id->dev->bdev->bd_dev),
776 + (unsigned long)bio->bi_sector);
777 +
778 + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
779 + bio_list_add(&bbr_id->remap_ios, bio);
780 + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
781 +
782 + queue_work(dm_bbr_wq, &bbr_id->remap_work);
783 +
784 + error = 1;
785 + }
786 +
787 + if (bbr_io)
788 + mempool_free(bbr_io, bbr_io_pool);
789 +
790 + return error;
791 +}
792 +
793 +/**
794 + * Construct a bbr mapping
795 + **/
796 +static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
797 +{
798 + struct bbr_private *bbr_id;
799 + unsigned long block_size;
800 + char *end;
801 + int rc = -EINVAL;
802 +
803 + if (argc != 8) {
804 + ti->error = "dm-bbr requires exactly 8 arguments: "
805 + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
806 + goto out1;
807 + }
808 +
809 + bbr_id = bbr_alloc_private();
810 + if (!bbr_id) {
811 + ti->error = "dm-bbr: Error allocating bbr private data.";
812 + goto out1;
813 + }
814 +
815 + bbr_id->offset = simple_strtoull(argv[1], &end, 10);
816 + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
817 + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
818 + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
819 + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
820 + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
821 + block_size = simple_strtoul(argv[7], &end, 10);
822 + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
823 +
824 + bbr_id->bbr_table = vmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT);
825 + if (!bbr_id->bbr_table) {
826 + ti->error = "dm-bbr: Error allocating bbr table.";
827 + goto out2;
828 + }
829 +
830 + if (dm_get_device(ti, argv[0], 0, ti->len,
831 + dm_table_get_mode(ti->table), &bbr_id->dev)) {
832 + ti->error = "dm-bbr: Device lookup failed";
833 + goto out2;
834 + }
835 +
836 + rc = bbr_setup(bbr_id);
837 + if (rc) {
838 + ti->error = "dm-bbr: Device setup failed";
839 + goto out3;
840 + }
841 +
842 + ti->private = bbr_id;
843 + return 0;
844 +
845 +out3:
846 + dm_put_device(ti, bbr_id->dev);
847 +out2:
848 + bbr_free_private(bbr_id);
849 +out1:
850 + return rc;
851 +}
852 +
853 +static void bbr_dtr(struct dm_target *ti)
854 +{
855 + struct bbr_private *bbr_id = ti->private;
856 +
857 + dm_put_device(ti, bbr_id->dev);
858 + bbr_free_private(bbr_id);
859 +}
860 +
861 +static int bbr_map(struct dm_target *ti, struct bio *bio,
862 + union map_info *map_context)
863 +{
864 + struct bbr_private *bbr_id = ti->private;
865 + struct dm_bio_details *bbr_io;
866 + unsigned long flags;
867 + int rc = 1;
868 +
869 + bio->bi_sector += bbr_id->offset;
870 +
871 + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
872 + !bbr_remap_probe(bbr_id, bio->bi_sector, bio_sectors(bio))) {
873 + /* No existing remaps or this request doesn't
874 + * contain any remapped sectors.
875 + */
876 + bio->bi_bdev = bbr_id->dev->bdev;
877 +
878 + bbr_io = mempool_alloc(bbr_io_pool, GFP_NOIO);
879 + dm_bio_record(bbr_io, bio);
880 + map_context->ptr = bbr_io;
881 + } else {
882 + /* This request has at least one remapped sector.
883 + * Give it to the work-queue for processing.
884 + */
885 + map_context->ptr = NULL;
886 + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags);
887 + bio_list_add(&bbr_id->remap_ios, bio);
888 + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags);
889 +
890 + queue_work(dm_bbr_wq, &bbr_id->remap_work);
891 + rc = 0;
892 + }
893 +
894 + return rc;
895 +}
896 +
897 +static int bbr_status(struct dm_target *ti, status_type_t type,
898 + char *result, unsigned int maxlen)
899 +{
900 + struct bbr_private *bbr_id = ti->private;
901 + char b[BDEVNAME_SIZE];
902 +
903 + switch (type) {
904 + case STATUSTYPE_INFO:
905 + result[0] = '\0';
906 + break;
907 +
908 + case STATUSTYPE_TABLE:
909 + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
910 + format_dev_t(b, bbr_id->dev->bdev->bd_dev),
911 + bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
912 + bbr_id->nr_sects_bbr_table,
913 + bbr_id->start_replacement_sect,
914 + bbr_id->nr_replacement_blks,
915 + bbr_id->blksize_in_sects << SECTOR_SHIFT);
916 + break;
917 + }
918 + return 0;
919 +}
920 +
921 +static struct target_type bbr_target = {
922 + .name = "bbr",
923 + .version= {1, 0, 1},
924 + .module = THIS_MODULE,
925 + .ctr = bbr_ctr,
926 + .dtr = bbr_dtr,
927 + .map = bbr_map,
928 + .end_io = bbr_endio,
929 + .status = bbr_status,
930 +};
931 +
932 +int __init dm_bbr_init(void)
933 +{
934 +	int rc;
935 +
936 +	rc = dm_register_target(&bbr_target);
937 +	if (rc) {
938 +		DMERR("dm-bbr: error registering target.");
939 +		goto err1;
940 +	}
941 +
942 +	bbr_remap_cache = kmem_cache_create("bbr-remap",
943 +			sizeof(struct bbr_runtime_remap),
944 +			0, SLAB_HWCACHE_ALIGN, NULL, NULL);
945 +	if (!bbr_remap_cache) {
946 +		DMERR("dm-bbr: error creating remap cache.");
947 +		rc = -ENOMEM; /* kernel convention: error paths return negative errno */
948 +		goto err2;
949 +	}
950 +
951 +	bbr_io_cache = kmem_cache_create("bbr-io", sizeof(struct dm_bio_details),
952 +			0, SLAB_HWCACHE_ALIGN, NULL, NULL);
953 +	if (!bbr_io_cache) {
954 +		DMERR("dm-bbr: error creating io cache.");
955 +		rc = -ENOMEM;
956 +		goto err3;
957 +	}
958 +
959 +	bbr_io_pool = mempool_create(256, mempool_alloc_slab,
960 +			mempool_free_slab, bbr_io_cache);
961 +	if (!bbr_io_pool) {
962 +		DMERR("dm-bbr: error creating io mempool.");
963 +		rc = -ENOMEM;
964 +		goto err4;
965 +	}
966 +
967 +	dm_bbr_wq = create_workqueue("dm-bbr");
968 +	if (!dm_bbr_wq) {
969 +		DMERR("dm-bbr: error creating work-queue.");
970 +		rc = -ENOMEM;
971 +		goto err5;
972 +	}
973 +
974 +	rc = dm_io_get(1);
975 +	if (rc) {
976 +		DMERR("dm-bbr: error initializing I/O service.");
977 +		goto err6;
978 +	}
979 +
980 +	return 0;
981 +
982 +err6:
983 +	destroy_workqueue(dm_bbr_wq);
984 +err5:
985 +	mempool_destroy(bbr_io_pool);
986 +err4:
987 +	kmem_cache_destroy(bbr_io_cache);
988 +err3:
989 +	kmem_cache_destroy(bbr_remap_cache);
990 +err2:
991 +	dm_unregister_target(&bbr_target);
992 +err1:
993 +	return rc;
994 +}
995 +
996 +void __exit dm_bbr_exit(void)
997 +{
998 + dm_io_put(1);
999 + destroy_workqueue(dm_bbr_wq);
1000 + mempool_destroy(bbr_io_pool);
1001 + kmem_cache_destroy(bbr_io_cache);
1002 + kmem_cache_destroy(bbr_remap_cache);
1003 + dm_unregister_target(&bbr_target);
1004 +}
1005 +
1006 +module_init(dm_bbr_init);
1007 +module_exit(dm_bbr_exit);
1008 +MODULE_LICENSE("GPL");
1009 Index: linux-git/drivers/md/dm-bbr.h
1010 ===================================================================
1011 --- /dev/null
1012 +++ linux-git/drivers/md/dm-bbr.h
1013 @@ -0,0 +1,125 @@
1014 +/*
1015 + * (C) Copyright IBM Corp. 2002, 2004
1016 + *
1017 + * This program is free software; you can redistribute it and/or modify
1018 + * it under the terms of the GNU General Public License as published by
1019 + * the Free Software Foundation; either version 2 of the License, or
1020 + * (at your option) any later version.
1021 + *
1022 + * This program is distributed in the hope that it will be useful,
1023 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1024 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1025 + * the GNU General Public License for more details.
1026 + *
1027 + * You should have received a copy of the GNU General Public License
1028 + * along with this program; if not, write to the Free Software
1029 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1030 + *
1031 + * linux/drivers/md/dm-bbr.h
1032 + *
1033 + * Bad-block-relocation (BBR) target for device-mapper.
1034 + *
1035 + * The BBR target is designed to remap I/O write failures to another safe
1036 + * location on disk. Note that most disk drives have BBR built into them;
1037 + * this means that our software BBR will be only activated when all hardware
1038 + * BBR replacement sectors have been used.
1039 + */
1040 +
1041 +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
1042 +#define BBR_ENTRIES_PER_SECT 31
1043 +#define INITIAL_CRC 0xFFFFFFFF
1044 +#define CRC_POLYNOMIAL 0xEDB88320L
1045 +
1046 +/**
1047 + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
1048 + * Use these in place of %Ld, %Lu, and %Lx.
1049 + **/
1050 +#if BITS_PER_LONG > 32
1051 +#define PFU64 "%lu"
1052 +#else
1053 +#define PFU64 "%Lu"
1054 +#endif
1055 +
1056 +/**
1057 + * struct bbr_table_entry
1058 + * @bad_sect: LBA of bad location.
1059 + * @replacement_sect: LBA of new location.
1060 + *
1061 + * Structure to describe one BBR remap.
1062 + **/
1063 +struct bbr_table_entry {
1064 + u64 bad_sect;
1065 + u64 replacement_sect;
1066 +};
1067 +
1068 +/**
1069 + * struct bbr_table
1070 + * @signature: Signature on each BBR table sector.
1071 + * @crc: CRC for this table sector.
1072 + * @sequence_number: Used to resolve conflicts when primary and secondary
1073 + * tables do not match.
1074 + * @in_use_cnt: Number of in-use table entries.
1075 + * @entries: Actual table of remaps.
1076 + *
1077 + * Structure to describe each sector of the metadata table. Each sector in this
1078 + * table can describe 31 remapped sectors.
1079 + **/
1080 +struct bbr_table {
1081 + u32 signature;
1082 + u32 crc;
1083 + u32 sequence_number;
1084 + u32 in_use_cnt;
1085 + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
1086 +};
1087 +
1088 +/**
1089 + * struct bbr_runtime_remap
1090 + *
1091 + * Node in the binary tree used to keep track of remaps.
1092 + **/
1093 +struct bbr_runtime_remap {
1094 + struct bbr_table_entry remap;
1095 + struct bbr_runtime_remap *left;
1096 + struct bbr_runtime_remap *right;
1097 +};
1098 +
1099 +/**
1100 + * struct bbr_private
1101 + * @dev: Info about underlying device.
1102 + * @bbr_table: Copy of metadata table.
1103 + * @remap_root: Binary tree containing all remaps.
1104 + * @remap_root_lock: Lock for the binary tree.
1105 + * @remap_work: For adding work items to the work-queue.
1106 + * @remap_ios: List of I/Os for the work-queue to handle.
1107 + * @remap_ios_lock: Lock for the remap_ios list.
1108 + * @offset: LBA of data area.
1109 + * @lba_table1: LBA of primary BBR table.
1110 + * @lba_table2: LBA of secondary BBR table.
1111 + * @nr_sects_bbr_table: Size of each BBR table.
1112 + * @nr_replacement_blks: Number of replacement blocks.
1113 + * @start_replacement_sect: LBA of start of replacement blocks.
1114 + * @blksize_in_sects: Size of each block.
1115 + * @in_use_replacement_blks: Current number of remapped blocks.
1116 + *
1117 + * Private data for each BBR target.
1118 + **/
1119 +struct bbr_private {
1120 + struct dm_dev *dev;
1121 + struct bbr_table *bbr_table;
1122 + struct bbr_runtime_remap *remap_root;
1123 + spinlock_t remap_root_lock;
1124 +
1125 + struct work_struct remap_work;
1126 + struct bio_list remap_ios;
1127 + spinlock_t remap_ios_lock;
1128 +
1129 + u64 offset;
1130 + u64 lba_table1;
1131 + u64 lba_table2;
1132 + u64 nr_sects_bbr_table;
1133 + u64 start_replacement_sect;
1134 + u64 nr_replacement_blks;
1135 + u32 blksize_in_sects;
1136 + atomic_t in_use_replacement_blks;
1137 +};
1138 +
1139 Index: linux-git/drivers/md/Kconfig
1140 ===================================================================
1141 --- linux-git.orig/drivers/md/Kconfig
1142 +++ linux-git/drivers/md/Kconfig
1143 @@ -236,5 +236,16 @@ config DM_MULTIPATH_EMC
1144 ---help---
1145 Multipath support for EMC CX/AX series hardware.
1146
1147 +config BLK_DEV_DM_BBR
1148 + tristate "Bad Block Relocation Device Target (EXPERIMENTAL)"
1149 + depends on BLK_DEV_DM && EXPERIMENTAL
1150 + ---help---
1151 + Support for devices with software-based bad-block-relocation.
1152 +
1153 + To compile this as a module, choose M here: the module will be
1154 + called dm-bbr.
1155 +
1156 + If unsure, say N.
1157 +
1158 endmenu
1159
1160 Index: linux-git/drivers/md/Makefile
1161 ===================================================================
1162 --- linux-git.orig/drivers/md/Makefile
1163 +++ linux-git/drivers/md/Makefile
1164 @@ -37,6 +37,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc
1165 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
1166 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
1167 obj-$(CONFIG_DM_ZERO) += dm-zero.o
1168 +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
1169
1170 quiet_cmd_unroll = UNROLL $@
1171 cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \

  ViewVC Help
Powered by ViewVC 1.1.20