Contents of /genpatches-2.6/tags/3.0-30/1800_fix-zcache-build.patch (Gentoo linux-patches repository)


Revision 2206
Mon Sep 17 18:58:14 2012 UTC by mpagano
File size: 94560 byte(s)
3.0-30 release
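
In short, the patch appears to address a kbuild naming clash: a composite module object (zcache.o, linked from several .o files) cannot also be the compile target of a source file of the same name (zcache.c), so the driver source is carried as zcache_drv.c and listed explicitly in the module's object list. The deleted and re-added file reference the same blob (77ac2d4), i.e. the driver code itself is unchanged. A rough sketch of the Makefile as it stands after the first hunk below:

    # drivers/staging/zcache/Makefile after this patch (sketch):
    # zcache.o (the module) is linked from zcache_drv.o and tmem.o,
    # so no constituent object shares the module's own name.
    zcache-y := zcache_drv.o tmem.o

    obj-$(CONFIG_ZCACHE) += zcache.o
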
1 diff --git a/drivers/staging/zcache/Makefile b/drivers/staging/zcache/Makefile
2 index f5ec64f..e3c945f 100644
3 --- a/drivers/staging/zcache/Makefile
4 +++ b/drivers/staging/zcache/Makefile
5 @@ -1,3 +1,3 @@
6 -zcache-y := tmem.o
7 +zcache-y := zcache_drv.o tmem.o
8
9 obj-$(CONFIG_ZCACHE) += zcache.o
10 diff --git a/drivers/staging/zcache/zcache.c b/drivers/staging/zcache/zcache.c
11 deleted file mode 100644
12 index 77ac2d4..0000000
13 --- a/drivers/staging/zcache/zcache.c
14 +++ /dev/null
15 @@ -1,1661 +0,0 @@
16 -/*
17 - * zcache.c
18 - *
19 - * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
20 - * Copyright (c) 2010,2011, Nitin Gupta
21 - *
22 - * Zcache provides an in-kernel "host implementation" for transcendent memory
23 - * and, thus indirectly, for cleancache and frontswap. Zcache includes two
24 - * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
25 - * 1) "compression buddies" ("zbud") is used for ephemeral pages
26 - * 2) xvmalloc is used for persistent pages.
27 - * Xvmalloc (based on the TLSF allocator) has very low fragmentation
28 - * so maximizes space efficiency, while zbud allows pairs (and potentially,
29 - * in the future, more than a pair of) compressed pages to be closely linked
30 - * so that reclaiming can be done via the kernel's physical-page-oriented
31 - * "shrinker" interface.
32 - *
33 - * [1] For a definition of page-accessible memory (aka PAM), see:
34 - * http://marc.info/?l=linux-mm&m=127811271605009
35 - */
36 -
37 -#include <linux/cpu.h>
38 -#include <linux/highmem.h>
39 -#include <linux/list.h>
40 -#include <linux/lzo.h>
41 -#include <linux/slab.h>
42 -#include <linux/spinlock.h>
43 -#include <linux/types.h>
44 -#include <linux/atomic.h>
45 -#include "tmem.h"
46 -
47 -#include "../zram/xvmalloc.h" /* if built in drivers/staging */
48 -
49 -#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
50 -#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
51 -#endif
52 -#ifdef CONFIG_CLEANCACHE
53 -#include <linux/cleancache.h>
54 -#endif
55 -#ifdef CONFIG_FRONTSWAP
56 -#include <linux/frontswap.h>
57 -#endif
58 -
59 -#if 0
60 -/* this is more aggressive but may cause other problems? */
61 -#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
62 -#else
63 -#define ZCACHE_GFP_MASK \
64 - (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
65 -#endif
66 -
67 -/**********
68 - * Compression buddies ("zbud") provides for packing two (or, possibly
69 - * in the future, more) compressed ephemeral pages into a single "raw"
70 - * (physical) page and tracking them with data structures so that
71 - * the raw pages can be easily reclaimed.
72 - *
73 - * A zbud page ("zbpg") is an aligned page containing a list_head,
74 - * a lock, and two "zbud headers". The remainder of the physical
75 - * page is divided up into aligned 64-byte "chunks" which contain
76 - * the compressed data for zero, one, or two zbuds. Each zbpg
77 - * resides on: (1) an "unused list" if it has no zbuds; (2) a
78 - * "buddied" list if it is fully populated with two zbuds; or
79 - * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
80 - * the one unbuddied zbud uses. The data inside a zbpg cannot be
81 - * read or written unless the zbpg's lock is held.
82 - */
83 -
84 -#define ZBH_SENTINEL 0x43214321
85 -#define ZBPG_SENTINEL 0xdeadbeef
86 -
87 -#define ZBUD_MAX_BUDS 2
88 -
89 -struct zbud_hdr {
90 - uint32_t pool_id;
91 - struct tmem_oid oid;
92 - uint32_t index;
93 - uint16_t size; /* compressed size in bytes, zero means unused */
94 - DECL_SENTINEL
95 -};
96 -
97 -struct zbud_page {
98 - struct list_head bud_list;
99 - spinlock_t lock;
100 - struct zbud_hdr buddy[ZBUD_MAX_BUDS];
101 - DECL_SENTINEL
102 - /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
103 -};
104 -
105 -#define CHUNK_SHIFT 6
106 -#define CHUNK_SIZE (1 << CHUNK_SHIFT)
107 -#define CHUNK_MASK (~(CHUNK_SIZE-1))
108 -#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \
109 - CHUNK_MASK) >> CHUNK_SHIFT)
110 -#define MAX_CHUNK (NCHUNKS-1)
111 -
112 -static struct {
113 - struct list_head list;
114 - unsigned count;
115 -} zbud_unbuddied[NCHUNKS];
116 -/* list N contains pages with N chunks USED and NCHUNKS-N unused */
117 -/* element 0 is never used but optimizing that isn't worth it */
118 -static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
119 -
120 -struct list_head zbud_buddied_list;
121 -static unsigned long zcache_zbud_buddied_count;
122 -
123 -/* protects the buddied list and all unbuddied lists */
124 -static DEFINE_SPINLOCK(zbud_budlists_spinlock);
125 -
126 -static LIST_HEAD(zbpg_unused_list);
127 -static unsigned long zcache_zbpg_unused_list_count;
128 -
129 -/* protects the unused page list */
130 -static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);
131 -
132 -static atomic_t zcache_zbud_curr_raw_pages;
133 -static atomic_t zcache_zbud_curr_zpages;
134 -static unsigned long zcache_zbud_curr_zbytes;
135 -static unsigned long zcache_zbud_cumul_zpages;
136 -static unsigned long zcache_zbud_cumul_zbytes;
137 -static unsigned long zcache_compress_poor;
138 -
139 -/* forward references */
140 -static void *zcache_get_free_page(void);
141 -static void zcache_free_page(void *p);
142 -
143 -/*
144 - * zbud helper functions
145 - */
146 -
147 -static inline unsigned zbud_max_buddy_size(void)
148 -{
149 - return MAX_CHUNK << CHUNK_SHIFT;
150 -}
151 -
152 -static inline unsigned zbud_size_to_chunks(unsigned size)
153 -{
154 - BUG_ON(size == 0 || size > zbud_max_buddy_size());
155 - return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
156 -}
157 -
158 -static inline int zbud_budnum(struct zbud_hdr *zh)
159 -{
160 - unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
161 - struct zbud_page *zbpg = NULL;
162 - unsigned budnum = -1U;
163 - int i;
164 -
165 - for (i = 0; i < ZBUD_MAX_BUDS; i++)
166 - if (offset == offsetof(typeof(*zbpg), buddy[i])) {
167 - budnum = i;
168 - break;
169 - }
170 - BUG_ON(budnum == -1U);
171 - return budnum;
172 -}
173 -
174 -static char *zbud_data(struct zbud_hdr *zh, unsigned size)
175 -{
176 - struct zbud_page *zbpg;
177 - char *p;
178 - unsigned budnum;
179 -
180 - ASSERT_SENTINEL(zh, ZBH);
181 - budnum = zbud_budnum(zh);
182 - BUG_ON(size == 0 || size > zbud_max_buddy_size());
183 - zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
184 - ASSERT_SPINLOCK(&zbpg->lock);
185 - p = (char *)zbpg;
186 - if (budnum == 0)
187 - p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
188 - CHUNK_MASK);
189 - else if (budnum == 1)
190 - p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
191 - return p;
192 -}
193 -
194 -/*
195 - * zbud raw page management
196 - */
197 -
198 -static struct zbud_page *zbud_alloc_raw_page(void)
199 -{
200 - struct zbud_page *zbpg = NULL;
201 - struct zbud_hdr *zh0, *zh1;
202 - bool recycled = 0;
203 -
204 - /* if any pages on the zbpg list, use one */
205 - spin_lock(&zbpg_unused_list_spinlock);
206 - if (!list_empty(&zbpg_unused_list)) {
207 - zbpg = list_first_entry(&zbpg_unused_list,
208 - struct zbud_page, bud_list);
209 - list_del_init(&zbpg->bud_list);
210 - zcache_zbpg_unused_list_count--;
211 - recycled = 1;
212 - }
213 - spin_unlock(&zbpg_unused_list_spinlock);
214 - if (zbpg == NULL)
215 - /* none on zbpg list, try to get a kernel page */
216 - zbpg = zcache_get_free_page();
217 - if (likely(zbpg != NULL)) {
218 - INIT_LIST_HEAD(&zbpg->bud_list);
219 - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
220 - spin_lock_init(&zbpg->lock);
221 - if (recycled) {
222 - ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
223 - SET_SENTINEL(zbpg, ZBPG);
224 - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
225 - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
226 - } else {
227 - atomic_inc(&zcache_zbud_curr_raw_pages);
228 - INIT_LIST_HEAD(&zbpg->bud_list);
229 - SET_SENTINEL(zbpg, ZBPG);
230 - zh0->size = 0; zh1->size = 0;
231 - tmem_oid_set_invalid(&zh0->oid);
232 - tmem_oid_set_invalid(&zh1->oid);
233 - }
234 - }
235 - return zbpg;
236 -}
237 -
238 -static void zbud_free_raw_page(struct zbud_page *zbpg)
239 -{
240 - struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
241 -
242 - ASSERT_SENTINEL(zbpg, ZBPG);
243 - BUG_ON(!list_empty(&zbpg->bud_list));
244 - ASSERT_SPINLOCK(&zbpg->lock);
245 - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
246 - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
247 - INVERT_SENTINEL(zbpg, ZBPG);
248 - spin_unlock(&zbpg->lock);
249 - spin_lock(&zbpg_unused_list_spinlock);
250 - list_add(&zbpg->bud_list, &zbpg_unused_list);
251 - zcache_zbpg_unused_list_count++;
252 - spin_unlock(&zbpg_unused_list_spinlock);
253 -}
254 -
255 -/*
256 - * core zbud handling routines
257 - */
258 -
259 -static unsigned zbud_free(struct zbud_hdr *zh)
260 -{
261 - unsigned size;
262 -
263 - ASSERT_SENTINEL(zh, ZBH);
264 - BUG_ON(!tmem_oid_valid(&zh->oid));
265 - size = zh->size;
266 - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
267 - zh->size = 0;
268 - tmem_oid_set_invalid(&zh->oid);
269 - INVERT_SENTINEL(zh, ZBH);
270 - zcache_zbud_curr_zbytes -= size;
271 - atomic_dec(&zcache_zbud_curr_zpages);
272 - return size;
273 -}
274 -
275 -static void zbud_free_and_delist(struct zbud_hdr *zh)
276 -{
277 - unsigned chunks;
278 - struct zbud_hdr *zh_other;
279 - unsigned budnum = zbud_budnum(zh), size;
280 - struct zbud_page *zbpg =
281 - container_of(zh, struct zbud_page, buddy[budnum]);
282 -
283 - spin_lock(&zbpg->lock);
284 - if (list_empty(&zbpg->bud_list)) {
285 - /* ignore zombie page... see zbud_evict_pages() */
286 - spin_unlock(&zbpg->lock);
287 - return;
288 - }
289 - size = zbud_free(zh);
290 - ASSERT_SPINLOCK(&zbpg->lock);
291 - zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
292 - if (zh_other->size == 0) { /* was unbuddied: unlist and free */
293 - chunks = zbud_size_to_chunks(size) ;
294 - spin_lock(&zbud_budlists_spinlock);
295 - BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
296 - list_del_init(&zbpg->bud_list);
297 - zbud_unbuddied[chunks].count--;
298 - spin_unlock(&zbud_budlists_spinlock);
299 - zbud_free_raw_page(zbpg);
300 - } else { /* was buddied: move remaining buddy to unbuddied list */
301 - chunks = zbud_size_to_chunks(zh_other->size) ;
302 - spin_lock(&zbud_budlists_spinlock);
303 - list_del_init(&zbpg->bud_list);
304 - zcache_zbud_buddied_count--;
305 - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
306 - zbud_unbuddied[chunks].count++;
307 - spin_unlock(&zbud_budlists_spinlock);
308 - spin_unlock(&zbpg->lock);
309 - }
310 -}
311 -
312 -static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid,
313 - uint32_t index, struct page *page,
314 - void *cdata, unsigned size)
315 -{
316 - struct zbud_hdr *zh0, *zh1, *zh = NULL;
317 - struct zbud_page *zbpg = NULL, *ztmp;
318 - unsigned nchunks;
319 - char *to;
320 - int i, found_good_buddy = 0;
321 -
322 - nchunks = zbud_size_to_chunks(size) ;
323 - for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
324 - spin_lock(&zbud_budlists_spinlock);
325 - if (!list_empty(&zbud_unbuddied[i].list)) {
326 - list_for_each_entry_safe(zbpg, ztmp,
327 - &zbud_unbuddied[i].list, bud_list) {
328 - if (spin_trylock(&zbpg->lock)) {
329 - found_good_buddy = i;
330 - goto found_unbuddied;
331 - }
332 - }
333 - }
334 - spin_unlock(&zbud_budlists_spinlock);
335 - }
336 - /* didn't find a good buddy, try allocating a new page */
337 - zbpg = zbud_alloc_raw_page();
338 - if (unlikely(zbpg == NULL))
339 - goto out;
340 - /* ok, have a page, now compress the data before taking locks */
341 - spin_lock(&zbpg->lock);
342 - spin_lock(&zbud_budlists_spinlock);
343 - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
344 - zbud_unbuddied[nchunks].count++;
345 - zh = &zbpg->buddy[0];
346 - goto init_zh;
347 -
348 -found_unbuddied:
349 - ASSERT_SPINLOCK(&zbpg->lock);
350 - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
351 - BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
352 - if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
353 - ASSERT_SENTINEL(zh0, ZBH);
354 - zh = zh1;
355 - } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
356 - ASSERT_SENTINEL(zh1, ZBH);
357 - zh = zh0;
358 - } else
359 - BUG();
360 - list_del_init(&zbpg->bud_list);
361 - zbud_unbuddied[found_good_buddy].count--;
362 - list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
363 - zcache_zbud_buddied_count++;
364 -
365 -init_zh:
366 - SET_SENTINEL(zh, ZBH);
367 - zh->size = size;
368 - zh->index = index;
369 - zh->oid = *oid;
370 - zh->pool_id = pool_id;
371 - /* can wait to copy the data until the list locks are dropped */
372 - spin_unlock(&zbud_budlists_spinlock);
373 -
374 - to = zbud_data(zh, size);
375 - memcpy(to, cdata, size);
376 - spin_unlock(&zbpg->lock);
377 - zbud_cumul_chunk_counts[nchunks]++;
378 - atomic_inc(&zcache_zbud_curr_zpages);
379 - zcache_zbud_cumul_zpages++;
380 - zcache_zbud_curr_zbytes += size;
381 - zcache_zbud_cumul_zbytes += size;
382 -out:
383 - return zh;
384 -}
385 -
386 -static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
387 -{
388 - struct zbud_page *zbpg;
389 - unsigned budnum = zbud_budnum(zh);
390 - size_t out_len = PAGE_SIZE;
391 - char *to_va, *from_va;
392 - unsigned size;
393 - int ret = 0;
394 -
395 - zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
396 - spin_lock(&zbpg->lock);
397 - if (list_empty(&zbpg->bud_list)) {
398 - /* ignore zombie page... see zbud_evict_pages() */
399 - ret = -EINVAL;
400 - goto out;
401 - }
402 - ASSERT_SENTINEL(zh, ZBH);
403 - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
404 - to_va = kmap_atomic(page, KM_USER0);
405 - size = zh->size;
406 - from_va = zbud_data(zh, size);
407 - ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
408 - BUG_ON(ret != LZO_E_OK);
409 - BUG_ON(out_len != PAGE_SIZE);
410 - kunmap_atomic(to_va, KM_USER0);
411 -out:
412 - spin_unlock(&zbpg->lock);
413 - return ret;
414 -}
415 -
416 -/*
417 - * The following routines handle shrinking of ephemeral pages by evicting
418 - * pages "least valuable" first.
419 - */
420 -
421 -static unsigned long zcache_evicted_raw_pages;
422 -static unsigned long zcache_evicted_buddied_pages;
423 -static unsigned long zcache_evicted_unbuddied_pages;
424 -
425 -static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid);
426 -static void zcache_put_pool(struct tmem_pool *pool);
427 -
428 -/*
429 - * Flush and free all zbuds in a zbpg, then free the pageframe
430 - */
431 -static void zbud_evict_zbpg(struct zbud_page *zbpg)
432 -{
433 - struct zbud_hdr *zh;
434 - int i, j;
435 - uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS];
436 - struct tmem_oid oid[ZBUD_MAX_BUDS];
437 - struct tmem_pool *pool;
438 -
439 - ASSERT_SPINLOCK(&zbpg->lock);
440 - BUG_ON(!list_empty(&zbpg->bud_list));
441 - for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
442 - zh = &zbpg->buddy[i];
443 - if (zh->size) {
444 - pool_id[j] = zh->pool_id;
445 - oid[j] = zh->oid;
446 - index[j] = zh->index;
447 - j++;
448 - zbud_free(zh);
449 - }
450 - }
451 - spin_unlock(&zbpg->lock);
452 - for (i = 0; i < j; i++) {
453 - pool = zcache_get_pool_by_id(pool_id[i]);
454 - if (pool != NULL) {
455 - tmem_flush_page(pool, &oid[i], index[i]);
456 - zcache_put_pool(pool);
457 - }
458 - }
459 - ASSERT_SENTINEL(zbpg, ZBPG);
460 - spin_lock(&zbpg->lock);
461 - zbud_free_raw_page(zbpg);
462 -}
463 -
464 -/*
465 - * Free nr pages. This code is funky because we want to hold the locks
466 - * protecting various lists for as short a time as possible, and in some
467 - * circumstances the list may change asynchronously when the list lock is
468 - * not held. In some cases we also trylock not only to avoid waiting on a
469 - * page in use by another cpu, but also to avoid potential deadlock due to
470 - * lock inversion.
471 - */
472 -static void zbud_evict_pages(int nr)
473 -{
474 - struct zbud_page *zbpg;
475 - int i;
476 -
477 - /* first try freeing any pages on unused list */
478 -retry_unused_list:
479 - spin_lock_bh(&zbpg_unused_list_spinlock);
480 - if (!list_empty(&zbpg_unused_list)) {
481 - /* can't walk list here, since it may change when unlocked */
482 - zbpg = list_first_entry(&zbpg_unused_list,
483 - struct zbud_page, bud_list);
484 - list_del_init(&zbpg->bud_list);
485 - zcache_zbpg_unused_list_count--;
486 - atomic_dec(&zcache_zbud_curr_raw_pages);
487 - spin_unlock_bh(&zbpg_unused_list_spinlock);
488 - zcache_free_page(zbpg);
489 - zcache_evicted_raw_pages++;
490 - if (--nr <= 0)
491 - goto out;
492 - goto retry_unused_list;
493 - }
494 - spin_unlock_bh(&zbpg_unused_list_spinlock);
495 -
496 - /* now try freeing unbuddied pages, starting with least space avail */
497 - for (i = 0; i < MAX_CHUNK; i++) {
498 -retry_unbud_list_i:
499 - spin_lock_bh(&zbud_budlists_spinlock);
500 - if (list_empty(&zbud_unbuddied[i].list)) {
501 - spin_unlock_bh(&zbud_budlists_spinlock);
502 - continue;
503 - }
504 - list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
505 - if (unlikely(!spin_trylock(&zbpg->lock)))
506 - continue;
507 - list_del_init(&zbpg->bud_list);
508 - zbud_unbuddied[i].count--;
509 - spin_unlock(&zbud_budlists_spinlock);
510 - zcache_evicted_unbuddied_pages++;
511 - /* want budlists unlocked when doing zbpg eviction */
512 - zbud_evict_zbpg(zbpg);
513 - local_bh_enable();
514 - if (--nr <= 0)
515 - goto out;
516 - goto retry_unbud_list_i;
517 - }
518 - spin_unlock_bh(&zbud_budlists_spinlock);
519 - }
520 -
521 - /* as a last resort, free buddied pages */
522 -retry_bud_list:
523 - spin_lock_bh(&zbud_budlists_spinlock);
524 - if (list_empty(&zbud_buddied_list)) {
525 - spin_unlock_bh(&zbud_budlists_spinlock);
526 - goto out;
527 - }
528 - list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
529 - if (unlikely(!spin_trylock(&zbpg->lock)))
530 - continue;
531 - list_del_init(&zbpg->bud_list);
532 - zcache_zbud_buddied_count--;
533 - spin_unlock(&zbud_budlists_spinlock);
534 - zcache_evicted_buddied_pages++;
535 - /* want budlists unlocked when doing zbpg eviction */
536 - zbud_evict_zbpg(zbpg);
537 - local_bh_enable();
538 - if (--nr <= 0)
539 - goto out;
540 - goto retry_bud_list;
541 - }
542 - spin_unlock_bh(&zbud_budlists_spinlock);
543 -out:
544 - return;
545 -}
546 -
547 -static void zbud_init(void)
548 -{
549 - int i;
550 -
551 - INIT_LIST_HEAD(&zbud_buddied_list);
552 - zcache_zbud_buddied_count = 0;
553 - for (i = 0; i < NCHUNKS; i++) {
554 - INIT_LIST_HEAD(&zbud_unbuddied[i].list);
555 - zbud_unbuddied[i].count = 0;
556 - }
557 -}
558 -
559 -#ifdef CONFIG_SYSFS
560 -/*
561 - * These sysfs routines show a nice distribution of how many zbpg's are
562 - * currently (and have ever been placed) in each unbuddied list. It's fun
563 - * to watch but can probably go away before final merge.
564 - */
565 -static int zbud_show_unbuddied_list_counts(char *buf)
566 -{
567 - int i;
568 - char *p = buf;
569 -
570 - for (i = 0; i < NCHUNKS - 1; i++)
571 - p += sprintf(p, "%u ", zbud_unbuddied[i].count);
572 - p += sprintf(p, "%d\n", zbud_unbuddied[i].count);
573 - return p - buf;
574 -}
575 -
576 -static int zbud_show_cumul_chunk_counts(char *buf)
577 -{
578 - unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
579 - unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
580 - unsigned long total_chunks_lte_42 = 0;
581 - char *p = buf;
582 -
583 - for (i = 0; i < NCHUNKS; i++) {
584 - p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
585 - chunks += zbud_cumul_chunk_counts[i];
586 - total_chunks += zbud_cumul_chunk_counts[i];
587 - sum_total_chunks += i * zbud_cumul_chunk_counts[i];
588 - if (i == 21)
589 - total_chunks_lte_21 = total_chunks;
590 - if (i == 32)
591 - total_chunks_lte_32 = total_chunks;
592 - if (i == 42)
593 - total_chunks_lte_42 = total_chunks;
594 - }
595 - p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
596 - total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
597 - chunks == 0 ? 0 : sum_total_chunks / chunks);
598 - return p - buf;
599 -}
600 -#endif
601 -
602 -/**********
603 - * This "zv" PAM implementation combines the TLSF-based xvMalloc
604 - * with lzo1x compression to maximize the amount of data that can
605 - * be packed into a physical page.
606 - *
607 - * Zv represents a PAM page with the index and object (plus a "size" value
608 - * necessary for decompression) immediately preceding the compressed data.
609 - */
610 -
611 -#define ZVH_SENTINEL 0x43214321
612 -
613 -struct zv_hdr {
614 - uint32_t pool_id;
615 - struct tmem_oid oid;
616 - uint32_t index;
617 - DECL_SENTINEL
618 -};
619 -
620 -static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;
621 -
622 -static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,
623 - struct tmem_oid *oid, uint32_t index,
624 - void *cdata, unsigned clen)
625 -{
626 - struct page *page;
627 - struct zv_hdr *zv = NULL;
628 - uint32_t offset;
629 - int ret;
630 -
631 - BUG_ON(!irqs_disabled());
632 - ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr),
633 - &page, &offset, ZCACHE_GFP_MASK);
634 - if (unlikely(ret))
635 - goto out;
636 - zv = kmap_atomic(page, KM_USER0) + offset;
637 - zv->index = index;
638 - zv->oid = *oid;
639 - zv->pool_id = pool_id;
640 - SET_SENTINEL(zv, ZVH);
641 - memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
642 - kunmap_atomic(zv, KM_USER0);
643 -out:
644 - return zv;
645 -}
646 -
647 -static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
648 -{
649 - unsigned long flags;
650 - struct page *page;
651 - uint32_t offset;
652 - uint16_t size;
653 -
654 - ASSERT_SENTINEL(zv, ZVH);
655 - size = xv_get_object_size(zv) - sizeof(*zv);
656 - BUG_ON(size == 0 || size > zv_max_page_size);
657 - INVERT_SENTINEL(zv, ZVH);
658 - page = virt_to_page(zv);
659 - offset = (unsigned long)zv & ~PAGE_MASK;
660 - local_irq_save(flags);
661 - xv_free(xvpool, page, offset);
662 - local_irq_restore(flags);
663 -}
664 -
665 -static void zv_decompress(struct page *page, struct zv_hdr *zv)
666 -{
667 - size_t clen = PAGE_SIZE;
668 - char *to_va;
669 - unsigned size;
670 - int ret;
671 -
672 - ASSERT_SENTINEL(zv, ZVH);
673 - size = xv_get_object_size(zv) - sizeof(*zv);
674 - BUG_ON(size == 0 || size > zv_max_page_size);
675 - to_va = kmap_atomic(page, KM_USER0);
676 - ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
677 - size, to_va, &clen);
678 - kunmap_atomic(to_va, KM_USER0);
679 - BUG_ON(ret != LZO_E_OK);
680 - BUG_ON(clen != PAGE_SIZE);
681 -}
682 -
683 -/*
684 - * zcache core code starts here
685 - */
686 -
687 -/* useful stats not collected by cleancache or frontswap */
688 -static unsigned long zcache_flush_total;
689 -static unsigned long zcache_flush_found;
690 -static unsigned long zcache_flobj_total;
691 -static unsigned long zcache_flobj_found;
692 -static unsigned long zcache_failed_eph_puts;
693 -static unsigned long zcache_failed_pers_puts;
694 -
695 -#define MAX_POOLS_PER_CLIENT 16
696 -
697 -static struct {
698 - struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
699 - struct xv_pool *xvpool;
700 -} zcache_client;
701 -
702 -/*
703 - * Tmem operations assume the poolid implies the invoking client.
704 - * Zcache only has one client (the kernel itself), so translate
705 - * the poolid into the tmem_pool allocated for it. A KVM version
706 - * of zcache would have one client per guest and each client might
707 - * have a poolid==N.
708 - */
709 -static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid)
710 -{
711 - struct tmem_pool *pool = NULL;
712 -
713 - if (poolid >= 0) {
714 - pool = zcache_client.tmem_pools[poolid];
715 - if (pool != NULL)
716 - atomic_inc(&pool->refcount);
717 - }
718 - return pool;
719 -}
720 -
721 -static void zcache_put_pool(struct tmem_pool *pool)
722 -{
723 - if (pool != NULL)
724 - atomic_dec(&pool->refcount);
725 -}
726 -
727 -/* counters for debugging */
728 -static unsigned long zcache_failed_get_free_pages;
729 -static unsigned long zcache_failed_alloc;
730 -static unsigned long zcache_put_to_flush;
731 -static unsigned long zcache_aborted_preload;
732 -static unsigned long zcache_aborted_shrink;
733 -
734 -/*
735 - * Ensure that memory allocation requests in zcache don't result
736 - * in direct reclaim requests via the shrinker, which would cause
737 - * an infinite loop. Maybe a GFP flag would be better?
738 - */
739 -static DEFINE_SPINLOCK(zcache_direct_reclaim_lock);
740 -
741 -/*
742 - * for now, used named slabs so can easily track usage; later can
743 - * either just use kmalloc, or perhaps add a slab-like allocator
744 - * to more carefully manage total memory utilization
745 - */
746 -static struct kmem_cache *zcache_objnode_cache;
747 -static struct kmem_cache *zcache_obj_cache;
748 -static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
749 -static unsigned long zcache_curr_obj_count_max;
750 -static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
751 -static unsigned long zcache_curr_objnode_count_max;
752 -
753 -/*
754 - * to avoid memory allocation recursion (e.g. due to direct reclaim), we
755 - * preload all necessary data structures so the hostops callbacks never
756 - * actually do a malloc
757 - */
758 -struct zcache_preload {
759 - void *page;
760 - struct tmem_obj *obj;
761 - int nr;
762 - struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
763 -};
764 -static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
765 -
766 -static int zcache_do_preload(struct tmem_pool *pool)
767 -{
768 - struct zcache_preload *kp;
769 - struct tmem_objnode *objnode;
770 - struct tmem_obj *obj;
771 - void *page;
772 - int ret = -ENOMEM;
773 -
774 - if (unlikely(zcache_objnode_cache == NULL))
775 - goto out;
776 - if (unlikely(zcache_obj_cache == NULL))
777 - goto out;
778 - if (!spin_trylock(&zcache_direct_reclaim_lock)) {
779 - zcache_aborted_preload++;
780 - goto out;
781 - }
782 - preempt_disable();
783 - kp = &__get_cpu_var(zcache_preloads);
784 - while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
785 - preempt_enable_no_resched();
786 - objnode = kmem_cache_alloc(zcache_objnode_cache,
787 - ZCACHE_GFP_MASK);
788 - if (unlikely(objnode == NULL)) {
789 - zcache_failed_alloc++;
790 - goto unlock_out;
791 - }
792 - preempt_disable();
793 - kp = &__get_cpu_var(zcache_preloads);
794 - if (kp->nr < ARRAY_SIZE(kp->objnodes))
795 - kp->objnodes[kp->nr++] = objnode;
796 - else
797 - kmem_cache_free(zcache_objnode_cache, objnode);
798 - }
799 - preempt_enable_no_resched();
800 - obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
801 - if (unlikely(obj == NULL)) {
802 - zcache_failed_alloc++;
803 - goto unlock_out;
804 - }
805 - page = (void *)__get_free_page(ZCACHE_GFP_MASK);
806 - if (unlikely(page == NULL)) {
807 - zcache_failed_get_free_pages++;
808 - kmem_cache_free(zcache_obj_cache, obj);
809 - goto unlock_out;
810 - }
811 - preempt_disable();
812 - kp = &__get_cpu_var(zcache_preloads);
813 - if (kp->obj == NULL)
814 - kp->obj = obj;
815 - else
816 - kmem_cache_free(zcache_obj_cache, obj);
817 - if (kp->page == NULL)
818 - kp->page = page;
819 - else
820 - free_page((unsigned long)page);
821 - ret = 0;
822 -unlock_out:
823 - spin_unlock(&zcache_direct_reclaim_lock);
824 -out:
825 - return ret;
826 -}
827 -
828 -static void *zcache_get_free_page(void)
829 -{
830 - struct zcache_preload *kp;
831 - void *page;
832 -
833 - kp = &__get_cpu_var(zcache_preloads);
834 - page = kp->page;
835 - BUG_ON(page == NULL);
836 - kp->page = NULL;
837 - return page;
838 -}
839 -
840 -static void zcache_free_page(void *p)
841 -{
842 - free_page((unsigned long)p);
843 -}
844 -
845 -/*
846 - * zcache implementation for tmem host ops
847 - */
848 -
849 -static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
850 -{
851 - struct tmem_objnode *objnode = NULL;
852 - unsigned long count;
853 - struct zcache_preload *kp;
854 -
855 - kp = &__get_cpu_var(zcache_preloads);
856 - if (kp->nr <= 0)
857 - goto out;
858 - objnode = kp->objnodes[kp->nr - 1];
859 - BUG_ON(objnode == NULL);
860 - kp->objnodes[kp->nr - 1] = NULL;
861 - kp->nr--;
862 - count = atomic_inc_return(&zcache_curr_objnode_count);
863 - if (count > zcache_curr_objnode_count_max)
864 - zcache_curr_objnode_count_max = count;
865 -out:
866 - return objnode;
867 -}
868 -
869 -static void zcache_objnode_free(struct tmem_objnode *objnode,
870 - struct tmem_pool *pool)
871 -{
872 - atomic_dec(&zcache_curr_objnode_count);
873 - BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
874 - kmem_cache_free(zcache_objnode_cache, objnode);
875 -}
876 -
877 -static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
878 -{
879 - struct tmem_obj *obj = NULL;
880 - unsigned long count;
881 - struct zcache_preload *kp;
882 -
883 - kp = &__get_cpu_var(zcache_preloads);
884 - obj = kp->obj;
885 - BUG_ON(obj == NULL);
886 - kp->obj = NULL;
887 - count = atomic_inc_return(&zcache_curr_obj_count);
888 - if (count > zcache_curr_obj_count_max)
889 - zcache_curr_obj_count_max = count;
890 - return obj;
891 -}
892 -
893 -static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
894 -{
895 - atomic_dec(&zcache_curr_obj_count);
896 - BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
897 - kmem_cache_free(zcache_obj_cache, obj);
898 -}
899 -
900 -static struct tmem_hostops zcache_hostops = {
901 - .obj_alloc = zcache_obj_alloc,
902 - .obj_free = zcache_obj_free,
903 - .objnode_alloc = zcache_objnode_alloc,
904 - .objnode_free = zcache_objnode_free,
905 -};
906 -
907 -/*
908 - * zcache implementations for PAM page descriptor ops
909 - */
910 -
911 -static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
912 -static unsigned long zcache_curr_eph_pampd_count_max;
913 -static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
914 -static unsigned long zcache_curr_pers_pampd_count_max;
915 -
916 -/* forward reference */
917 -static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
918 -
919 -static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
920 - uint32_t index, struct page *page)
921 -{
922 - void *pampd = NULL, *cdata;
923 - size_t clen;
924 - int ret;
925 - bool ephemeral = is_ephemeral(pool);
926 - unsigned long count;
927 -
928 - if (ephemeral) {
929 - ret = zcache_compress(page, &cdata, &clen);
930 - if (ret == 0)
931 -
932 - goto out;
933 - if (clen == 0 || clen > zbud_max_buddy_size()) {
934 - zcache_compress_poor++;
935 - goto out;
936 - }
937 - pampd = (void *)zbud_create(pool->pool_id, oid, index,
938 - page, cdata, clen);
939 - if (pampd != NULL) {
940 - count = atomic_inc_return(&zcache_curr_eph_pampd_count);
941 - if (count > zcache_curr_eph_pampd_count_max)
942 - zcache_curr_eph_pampd_count_max = count;
943 - }
944 - } else {
945 - /*
946 - * FIXME: This is all the "policy" there is for now.
947 - * 3/4 totpages should allow ~37% of RAM to be filled with
948 - * compressed frontswap pages
949 - */
950 - if (atomic_read(&zcache_curr_pers_pampd_count) >
951 - 3 * totalram_pages / 4)
952 - goto out;
953 - ret = zcache_compress(page, &cdata, &clen);
954 - if (ret == 0)
955 - goto out;
956 - if (clen > zv_max_page_size) {
957 - zcache_compress_poor++;
958 - goto out;
959 - }
960 - pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id,
961 - oid, index, cdata, clen);
962 - if (pampd == NULL)
963 - goto out;
964 - count = atomic_inc_return(&zcache_curr_pers_pampd_count);
965 - if (count > zcache_curr_pers_pampd_count_max)
966 - zcache_curr_pers_pampd_count_max = count;
967 - }
968 -out:
969 - return pampd;
970 -}
971 -
972 -/*
973 - * fill the pageframe corresponding to the struct page with the data
974 - * from the passed pampd
975 - */
976 -static int zcache_pampd_get_data(struct page *page, void *pampd,
977 - struct tmem_pool *pool)
978 -{
979 - int ret = 0;
980 -
981 - if (is_ephemeral(pool))
982 - ret = zbud_decompress(page, pampd);
983 - else
984 - zv_decompress(page, pampd);
985 - return ret;
986 -}
987 -
988 -/*
989 - * free the pampd and remove it from any zcache lists
990 - * pampd must no longer be pointed to from any tmem data structures!
991 - */
992 -static void zcache_pampd_free(void *pampd, struct tmem_pool *pool)
993 -{
994 - if (is_ephemeral(pool)) {
995 - zbud_free_and_delist((struct zbud_hdr *)pampd);
996 - atomic_dec(&zcache_curr_eph_pampd_count);
997 - BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
998 - } else {
999 - zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd);
1000 - atomic_dec(&zcache_curr_pers_pampd_count);
1001 - BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
1002 - }
1003 -}
1004 -
1005 -static struct tmem_pamops zcache_pamops = {
1006 - .create = zcache_pampd_create,
1007 - .get_data = zcache_pampd_get_data,
1008 - .free = zcache_pampd_free,
1009 -};
1010 -
1011 -/*
1012 - * zcache compression/decompression and related per-cpu stuff
1013 - */
1014 -
1015 -#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
1016 -#define LZO_DSTMEM_PAGE_ORDER 1
1017 -static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
1018 -static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
1019 -
1020 -static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
1021 -{
1022 - int ret = 0;
1023 - unsigned char *dmem = __get_cpu_var(zcache_dstmem);
1024 - unsigned char *wmem = __get_cpu_var(zcache_workmem);
1025 - char *from_va;
1026 -
1027 - BUG_ON(!irqs_disabled());
1028 - if (unlikely(dmem == NULL || wmem == NULL))
1029 - goto out; /* no buffer, so can't compress */
1030 - from_va = kmap_atomic(from, KM_USER0);
1031 - mb();
1032 - ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
1033 - BUG_ON(ret != LZO_E_OK);
1034 - *out_va = dmem;
1035 - kunmap_atomic(from_va, KM_USER0);
1036 - ret = 1;
1037 -out:
1038 - return ret;
1039 -}
1040 -
1041 -
1042 -static int zcache_cpu_notifier(struct notifier_block *nb,
1043 - unsigned long action, void *pcpu)
1044 -{
1045 - int cpu = (long)pcpu;
1046 - struct zcache_preload *kp;
1047 -
1048 - switch (action) {
1049 - case CPU_UP_PREPARE:
1050 - per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
1051 - GFP_KERNEL | __GFP_REPEAT,
1052 - LZO_DSTMEM_PAGE_ORDER),
1053 - per_cpu(zcache_workmem, cpu) =
1054 - kzalloc(LZO1X_MEM_COMPRESS,
1055 - GFP_KERNEL | __GFP_REPEAT);
1056 - break;
1057 - case CPU_DEAD:
1058 - case CPU_UP_CANCELED:
1059 - free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
1060 - LZO_DSTMEM_PAGE_ORDER);
1061 - per_cpu(zcache_dstmem, cpu) = NULL;
1062 - kfree(per_cpu(zcache_workmem, cpu));
1063 - per_cpu(zcache_workmem, cpu) = NULL;
1064 - kp = &per_cpu(zcache_preloads, cpu);
1065 - while (kp->nr) {
1066 - kmem_cache_free(zcache_objnode_cache,
1067 - kp->objnodes[kp->nr - 1]);
1068 - kp->objnodes[kp->nr - 1] = NULL;
1069 - kp->nr--;
1070 - }
1071 - kmem_cache_free(zcache_obj_cache, kp->obj);
1072 - free_page((unsigned long)kp->page);
1073 - break;
1074 - default:
1075 - break;
1076 - }
1077 - return NOTIFY_OK;
1078 -}
1079 -
1080 -static struct notifier_block zcache_cpu_notifier_block = {
1081 - .notifier_call = zcache_cpu_notifier
1082 -};
1083 -
1084 -#ifdef CONFIG_SYSFS
1085 -#define ZCACHE_SYSFS_RO(_name) \
1086 - static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1087 - struct kobj_attribute *attr, char *buf) \
1088 - { \
1089 - return sprintf(buf, "%lu\n", zcache_##_name); \
1090 - } \
1091 - static struct kobj_attribute zcache_##_name##_attr = { \
1092 - .attr = { .name = __stringify(_name), .mode = 0444 }, \
1093 - .show = zcache_##_name##_show, \
1094 - }
1095 -
1096 -#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
1097 - static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1098 - struct kobj_attribute *attr, char *buf) \
1099 - { \
1100 - return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
1101 - } \
1102 - static struct kobj_attribute zcache_##_name##_attr = { \
1103 - .attr = { .name = __stringify(_name), .mode = 0444 }, \
1104 - .show = zcache_##_name##_show, \
1105 - }
1106 -
1107 -#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
1108 - static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1109 - struct kobj_attribute *attr, char *buf) \
1110 - { \
1111 - return _func(buf); \
1112 - } \
1113 - static struct kobj_attribute zcache_##_name##_attr = { \
1114 - .attr = { .name = __stringify(_name), .mode = 0444 }, \
1115 - .show = zcache_##_name##_show, \
1116 - }
1117 -
1118 -ZCACHE_SYSFS_RO(curr_obj_count_max);
1119 -ZCACHE_SYSFS_RO(curr_objnode_count_max);
1120 -ZCACHE_SYSFS_RO(flush_total);
1121 -ZCACHE_SYSFS_RO(flush_found);
1122 -ZCACHE_SYSFS_RO(flobj_total);
1123 -ZCACHE_SYSFS_RO(flobj_found);
1124 -ZCACHE_SYSFS_RO(failed_eph_puts);
1125 -ZCACHE_SYSFS_RO(failed_pers_puts);
1126 -ZCACHE_SYSFS_RO(zbud_curr_zbytes);
1127 -ZCACHE_SYSFS_RO(zbud_cumul_zpages);
1128 -ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
1129 -ZCACHE_SYSFS_RO(zbud_buddied_count);
1130 -ZCACHE_SYSFS_RO(zbpg_unused_list_count);
1131 -ZCACHE_SYSFS_RO(evicted_raw_pages);
1132 -ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
1133 -ZCACHE_SYSFS_RO(evicted_buddied_pages);
1134 -ZCACHE_SYSFS_RO(failed_get_free_pages);
1135 -ZCACHE_SYSFS_RO(failed_alloc);
1136 -ZCACHE_SYSFS_RO(put_to_flush);
1137 -ZCACHE_SYSFS_RO(aborted_preload);
1138 -ZCACHE_SYSFS_RO(aborted_shrink);
1139 -ZCACHE_SYSFS_RO(compress_poor);
1140 -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
1141 -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
1142 -ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
1143 -ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
1144 -ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
1145 - zbud_show_unbuddied_list_counts);
1146 -ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
1147 - zbud_show_cumul_chunk_counts);
1148 -
1149 -static struct attribute *zcache_attrs[] = {
1150 - &zcache_curr_obj_count_attr.attr,
1151 - &zcache_curr_obj_count_max_attr.attr,
1152 - &zcache_curr_objnode_count_attr.attr,
1153 - &zcache_curr_objnode_count_max_attr.attr,
1154 - &zcache_flush_total_attr.attr,
1155 - &zcache_flobj_total_attr.attr,
1156 - &zcache_flush_found_attr.attr,
1157 - &zcache_flobj_found_attr.attr,
1158 - &zcache_failed_eph_puts_attr.attr,
1159 - &zcache_failed_pers_puts_attr.attr,
1160 - &zcache_compress_poor_attr.attr,
1161 - &zcache_zbud_curr_raw_pages_attr.attr,
1162 - &zcache_zbud_curr_zpages_attr.attr,
1163 - &zcache_zbud_curr_zbytes_attr.attr,
1164 - &zcache_zbud_cumul_zpages_attr.attr,
1165 - &zcache_zbud_cumul_zbytes_attr.attr,
1166 - &zcache_zbud_buddied_count_attr.attr,
1167 - &zcache_zbpg_unused_list_count_attr.attr,
1168 - &zcache_evicted_raw_pages_attr.attr,
1169 - &zcache_evicted_unbuddied_pages_attr.attr,
1170 - &zcache_evicted_buddied_pages_attr.attr,
1171 - &zcache_failed_get_free_pages_attr.attr,
1172 - &zcache_failed_alloc_attr.attr,
1173 - &zcache_put_to_flush_attr.attr,
1174 - &zcache_aborted_preload_attr.attr,
1175 - &zcache_aborted_shrink_attr.attr,
1176 - &zcache_zbud_unbuddied_list_counts_attr.attr,
1177 - &zcache_zbud_cumul_chunk_counts_attr.attr,
1178 - NULL,
1179 -};
1180 -
1181 -static struct attribute_group zcache_attr_group = {
1182 - .attrs = zcache_attrs,
1183 - .name = "zcache",
1184 -};
1185 -
1186 -#endif /* CONFIG_SYSFS */
1187 -/*
1188 - * When zcache is disabled ("frozen"), pools can be created and destroyed,
1189 - * but all puts (and thus all other operations that require memory allocation)
1190 - * must fail. If zcache is unfrozen, accepts puts, then frozen again,
1191 - * data consistency requires all puts while frozen to be converted into
1192 - * flushes.
1193 - */
1194 -static bool zcache_freeze;
1195 -
1196 -/*
1197 - * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
1198 - */
1199 -static int shrink_zcache_memory(struct shrinker *shrink,
1200 - struct shrink_control *sc)
1201 -{
1202 - int ret = -1;
1203 - int nr = sc->nr_to_scan;
1204 - gfp_t gfp_mask = sc->gfp_mask;
1205 -
1206 - if (nr >= 0) {
1207 - if (!(gfp_mask & __GFP_FS))
1208 - /* does this case really need to be skipped? */
1209 - goto out;
1210 - if (spin_trylock(&zcache_direct_reclaim_lock)) {
1211 - zbud_evict_pages(nr);
1212 - spin_unlock(&zcache_direct_reclaim_lock);
1213 - } else
1214 - zcache_aborted_shrink++;
1215 - }
1216 - ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
1217 -out:
1218 - return ret;
1219 -}
1220 -
1221 -static struct shrinker zcache_shrinker = {
1222 - .shrink = shrink_zcache_memory,
1223 - .seeks = DEFAULT_SEEKS,
1224 -};
1225 -
1226 -/*
1227 - * zcache shims between cleancache/frontswap ops and tmem
1228 - */
1229 -
1230 -static int zcache_put_page(int pool_id, struct tmem_oid *oidp,
1231 - uint32_t index, struct page *page)
1232 -{
1233 - struct tmem_pool *pool;
1234 - int ret = -1;
1235 -
1236 - BUG_ON(!irqs_disabled());
1237 - pool = zcache_get_pool_by_id(pool_id);
1238 - if (unlikely(pool == NULL))
1239 - goto out;
1240 - if (!zcache_freeze && zcache_do_preload(pool) == 0) {
1241 - /* preload does preempt_disable on success */
1242 - ret = tmem_put(pool, oidp, index, page);
1243 - if (ret < 0) {
1244 - if (is_ephemeral(pool))
1245 - zcache_failed_eph_puts++;
1246 - else
1247 - zcache_failed_pers_puts++;
1248 - }
1249 - zcache_put_pool(pool);
1250 - preempt_enable_no_resched();
1251 - } else {
1252 - zcache_put_to_flush++;
1253 - if (atomic_read(&pool->obj_count) > 0)
1254 - /* the put fails whether the flush succeeds or not */
1255 - (void)tmem_flush_page(pool, oidp, index);
1256 - zcache_put_pool(pool);
1257 - }
1258 -out:
1259 - return ret;
1260 -}
1261 -
1262 -static int zcache_get_page(int pool_id, struct tmem_oid *oidp,
1263 - uint32_t index, struct page *page)
1264 -{
1265 - struct tmem_pool *pool;
1266 - int ret = -1;
1267 - unsigned long flags;
1268 -
1269 - local_irq_save(flags);
1270 - pool = zcache_get_pool_by_id(pool_id);
1271 - if (likely(pool != NULL)) {
1272 - if (atomic_read(&pool->obj_count) > 0)
1273 - ret = tmem_get(pool, oidp, index, page);
1274 - zcache_put_pool(pool);
1275 - }
1276 - local_irq_restore(flags);
1277 - return ret;
1278 -}
1279 -
1280 -static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index)
1281 -{
1282 - struct tmem_pool *pool;
1283 - int ret = -1;
1284 - unsigned long flags;
1285 -
1286 - local_irq_save(flags);
1287 - zcache_flush_total++;
1288 - pool = zcache_get_pool_by_id(pool_id);
1289 - if (likely(pool != NULL)) {
1290 - if (atomic_read(&pool->obj_count) > 0)
1291 - ret = tmem_flush_page(pool, oidp, index);
1292 - zcache_put_pool(pool);
1293 - }
1294 - if (ret >= 0)
1295 - zcache_flush_found++;
1296 - local_irq_restore(flags);
1297 - return ret;
1298 -}
1299 -
1300 -static int zcache_flush_object(int pool_id, struct tmem_oid *oidp)
1301 -{
1302 - struct tmem_pool *pool;
1303 - int ret = -1;
1304 - unsigned long flags;
1305 -
1306 - local_irq_save(flags);
1307 - zcache_flobj_total++;
1308 - pool = zcache_get_pool_by_id(pool_id);
1309 - if (likely(pool != NULL)) {
1310 - if (atomic_read(&pool->obj_count) > 0)
1311 - ret = tmem_flush_object(pool, oidp);
1312 - zcache_put_pool(pool);
1313 - }
1314 - if (ret >= 0)
1315 - zcache_flobj_found++;
1316 - local_irq_restore(flags);
1317 - return ret;
1318 -}
1319 -
1320 -static int zcache_destroy_pool(int pool_id)
1321 -{
1322 - struct tmem_pool *pool = NULL;
1323 - int ret = -1;
1324 -
1325 - if (pool_id < 0)
1326 - goto out;
1327 - pool = zcache_client.tmem_pools[pool_id];
1328 - if (pool == NULL)
1329 - goto out;
1330 - zcache_client.tmem_pools[pool_id] = NULL;
1331 - /* wait for pool activity on other cpus to quiesce */
1332 - while (atomic_read(&pool->refcount) != 0)
1333 - ;
1334 - local_bh_disable();
1335 - ret = tmem_destroy_pool(pool);
1336 - local_bh_enable();
1337 - kfree(pool);
1338 - pr_info("zcache: destroyed pool id=%d\n", pool_id);
1339 -out:
1340 - return ret;
1341 -}
1342 -
1343 -static int zcache_new_pool(uint32_t flags)
1344 -{
1345 - int poolid = -1;
1346 - struct tmem_pool *pool;
1347 -
1348 - pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
1349 - if (pool == NULL) {
1350 - pr_info("zcache: pool creation failed: out of memory\n");
1351 - goto out;
1352 - }
1353 -
1354 - for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
1355 - if (zcache_client.tmem_pools[poolid] == NULL)
1356 - break;
1357 - if (poolid >= MAX_POOLS_PER_CLIENT) {
1358 - pr_info("zcache: pool creation failed: max exceeded\n");
1359 - kfree(pool);
1360 - poolid = -1;
1361 - goto out;
1362 - }
1363 - atomic_set(&pool->refcount, 0);
1364 - pool->client = &zcache_client;
1365 - pool->pool_id = poolid;
1366 - tmem_new_pool(pool, flags);
1367 - zcache_client.tmem_pools[poolid] = pool;
1368 - pr_info("zcache: created %s tmem pool, id=%d\n",
1369 - flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
1370 - poolid);
1371 -out:
1372 - return poolid;
1373 -}
1374 -
1375 -/**********
1376 - * Two kernel functionalities currently can be layered on top of tmem.
1377 - * These are "cleancache" which is used as a second-chance cache for clean
1378 - * page cache pages; and "frontswap" which is used for swap pages
1379 - * to avoid writes to disk. A generic "shim" is provided here for each
1380 - * to translate in-kernel semantics to zcache semantics.
1381 - */
1382 -
1383 -#ifdef CONFIG_CLEANCACHE
1384 -static void zcache_cleancache_put_page(int pool_id,
1385 - struct cleancache_filekey key,
1386 - pgoff_t index, struct page *page)
1387 -{
1388 - u32 ind = (u32) index;
1389 - struct tmem_oid oid = *(struct tmem_oid *)&key;
1390 -
1391 - if (likely(ind == index))
1392 - (void)zcache_put_page(pool_id, &oid, index, page);
1393 -}
1394 -
1395 -static int zcache_cleancache_get_page(int pool_id,
1396 - struct cleancache_filekey key,
1397 - pgoff_t index, struct page *page)
1398 -{
1399 - u32 ind = (u32) index;
1400 - struct tmem_oid oid = *(struct tmem_oid *)&key;
1401 - int ret = -1;
1402 -
1403 - if (likely(ind == index))
1404 - ret = zcache_get_page(pool_id, &oid, index, page);
1405 - return ret;
1406 -}
1407 -
1408 -static void zcache_cleancache_flush_page(int pool_id,
1409 - struct cleancache_filekey key,
1410 - pgoff_t index)
1411 -{
1412 - u32 ind = (u32) index;
1413 - struct tmem_oid oid = *(struct tmem_oid *)&key;
1414 -
1415 - if (likely(ind == index))
1416 - (void)zcache_flush_page(pool_id, &oid, ind);
1417 -}
1418 -
1419 -static void zcache_cleancache_flush_inode(int pool_id,
1420 - struct cleancache_filekey key)
1421 -{
1422 - struct tmem_oid oid = *(struct tmem_oid *)&key;
1423 -
1424 - (void)zcache_flush_object(pool_id, &oid);
1425 -}
1426 -
1427 -static void zcache_cleancache_flush_fs(int pool_id)
1428 -{
1429 - if (pool_id >= 0)
1430 - (void)zcache_destroy_pool(pool_id);
1431 -}
1432 -
1433 -static int zcache_cleancache_init_fs(size_t pagesize)
1434 -{
1435 - BUG_ON(sizeof(struct cleancache_filekey) !=
1436 - sizeof(struct tmem_oid));
1437 - BUG_ON(pagesize != PAGE_SIZE);
1438 - return zcache_new_pool(0);
1439 -}
1440 -
1441 -static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
1442 -{
1443 - /* shared pools are unsupported and map to private */
1444 - BUG_ON(sizeof(struct cleancache_filekey) !=
1445 - sizeof(struct tmem_oid));
1446 - BUG_ON(pagesize != PAGE_SIZE);
1447 - return zcache_new_pool(0);
1448 -}
1449 -
1450 -static struct cleancache_ops zcache_cleancache_ops = {
1451 - .put_page = zcache_cleancache_put_page,
1452 - .get_page = zcache_cleancache_get_page,
1453 - .flush_page = zcache_cleancache_flush_page,
1454 - .flush_inode = zcache_cleancache_flush_inode,
1455 - .flush_fs = zcache_cleancache_flush_fs,
1456 - .init_shared_fs = zcache_cleancache_init_shared_fs,
1457 - .init_fs = zcache_cleancache_init_fs
1458 -};
1459 -
1460 -struct cleancache_ops zcache_cleancache_register_ops(void)
1461 -{
1462 - struct cleancache_ops old_ops =
1463 - cleancache_register_ops(&zcache_cleancache_ops);
1464 -
1465 - return old_ops;
1466 -}
1467 -#endif
1468 -
1469 -#ifdef CONFIG_FRONTSWAP
1470 -/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1471 -static int zcache_frontswap_poolid = -1;
1472 -
1473 -/*
1474 - * Swizzling increases objects per swaptype, increasing tmem concurrency
1475 - * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
1476 - */
1477 -#define SWIZ_BITS 4
1478 -#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
1479 -#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
1480 -#define iswiz(_ind) (_ind >> SWIZ_BITS)
1481 -
1482 -static inline struct tmem_oid oswiz(unsigned type, u32 ind)
1483 -{
1484 - struct tmem_oid oid = { .oid = { 0 } };
1485 - oid.oid[0] = _oswiz(type, ind);
1486 - return oid;
1487 -}
1488 -
1489 -static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
1490 - struct page *page)
1491 -{
1492 - u64 ind64 = (u64)offset;
1493 - u32 ind = (u32)offset;
1494 - struct tmem_oid oid = oswiz(type, ind);
1495 - int ret = -1;
1496 - unsigned long flags;
1497 -
1498 - BUG_ON(!PageLocked(page));
1499 - if (likely(ind64 == ind)) {
1500 - local_irq_save(flags);
1501 - ret = zcache_put_page(zcache_frontswap_poolid, &oid,
1502 - iswiz(ind), page);
1503 - local_irq_restore(flags);
1504 - }
1505 - return ret;
1506 -}
1507 -
1508 -/* returns 0 if the page was successfully gotten from frontswap, -1 if
1509 - * was not present (should never happen!) */
1510 -static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
1511 - struct page *page)
1512 -{
1513 - u64 ind64 = (u64)offset;
1514 - u32 ind = (u32)offset;
1515 - struct tmem_oid oid = oswiz(type, ind);
1516 - int ret = -1;
1517 -
1518 - BUG_ON(!PageLocked(page));
1519 - if (likely(ind64 == ind))
1520 - ret = zcache_get_page(zcache_frontswap_poolid, &oid,
1521 - iswiz(ind), page);
1522 - return ret;
1523 -}
1524 -
1525 -/* flush a single page from frontswap */
1526 -static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
1527 -{
1528 - u64 ind64 = (u64)offset;
1529 - u32 ind = (u32)offset;
1530 - struct tmem_oid oid = oswiz(type, ind);
1531 -
1532 - if (likely(ind64 == ind))
1533 - (void)zcache_flush_page(zcache_frontswap_poolid, &oid,
1534 - iswiz(ind));
1535 -}
1536 -
1537 -/* flush all pages from the passed swaptype */
1538 -static void zcache_frontswap_flush_area(unsigned type)
1539 -{
1540 - struct tmem_oid oid;
1541 - int ind;
1542 -
1543 - for (ind = SWIZ_MASK; ind >= 0; ind--) {
1544 - oid = oswiz(type, ind);
1545 - (void)zcache_flush_object(zcache_frontswap_poolid, &oid);
1546 - }
1547 -}
1548 -
1549 -static void zcache_frontswap_init(unsigned ignored)
1550 -{
1551 - /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1552 - if (zcache_frontswap_poolid < 0)
1553 - zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST);
1554 -}
1555 -
1556 -static struct frontswap_ops zcache_frontswap_ops = {
1557 - .put_page = zcache_frontswap_put_page,
1558 - .get_page = zcache_frontswap_get_page,
1559 - .flush_page = zcache_frontswap_flush_page,
1560 - .flush_area = zcache_frontswap_flush_area,
1561 - .init = zcache_frontswap_init
1562 -};
1563 -
1564 -struct frontswap_ops zcache_frontswap_register_ops(void)
1565 -{
1566 - struct frontswap_ops old_ops =
1567 - frontswap_register_ops(&zcache_frontswap_ops);
1568 -
1569 - return old_ops;
1570 -}
1571 -#endif
1572 -
1573 -/*
1574 - * zcache initialization
1575 - * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
1576 - * NOTHING HAPPENS!
1577 - */
1578 -
1579 -static int zcache_enabled;
1580 -
1581 -static int __init enable_zcache(char *s)
1582 -{
1583 - zcache_enabled = 1;
1584 - return 1;
1585 -}
1586 -__setup("zcache", enable_zcache);
1587 -
1588 -/* allow independent dynamic disabling of cleancache and frontswap */
1589 -
1590 -static int use_cleancache = 1;
1591 -
1592 -static int __init no_cleancache(char *s)
1593 -{
1594 - use_cleancache = 0;
1595 - return 1;
1596 -}
1597 -
1598 -__setup("nocleancache", no_cleancache);
1599 -
1600 -static int use_frontswap = 1;
1601 -
1602 -static int __init no_frontswap(char *s)
1603 -{
1604 - use_frontswap = 0;
1605 - return 1;
1606 -}
1607 -
1608 -__setup("nofrontswap", no_frontswap);
1609 -
1610 -static int __init zcache_init(void)
1611 -{
1612 -#ifdef CONFIG_SYSFS
1613 - int ret = 0;
1614 -
1615 - ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
1616 - if (ret) {
1617 - pr_err("zcache: can't create sysfs\n");
1618 - goto out;
1619 - }
1620 -#endif /* CONFIG_SYSFS */
1621 -#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
1622 - if (zcache_enabled) {
1623 - unsigned int cpu;
1624 -
1625 - tmem_register_hostops(&zcache_hostops);
1626 - tmem_register_pamops(&zcache_pamops);
1627 - ret = register_cpu_notifier(&zcache_cpu_notifier_block);
1628 - if (ret) {
1629 - pr_err("zcache: can't register cpu notifier\n");
1630 - goto out;
1631 - }
1632 - for_each_online_cpu(cpu) {
1633 - void *pcpu = (void *)(long)cpu;
1634 - zcache_cpu_notifier(&zcache_cpu_notifier_block,
1635 - CPU_UP_PREPARE, pcpu);
1636 - }
1637 - }
1638 - zcache_objnode_cache = kmem_cache_create("zcache_objnode",
1639 - sizeof(struct tmem_objnode), 0, 0, NULL);
1640 - zcache_obj_cache = kmem_cache_create("zcache_obj",
1641 - sizeof(struct tmem_obj), 0, 0, NULL);
1642 -#endif
1643 -#ifdef CONFIG_CLEANCACHE
1644 - if (zcache_enabled && use_cleancache) {
1645 - struct cleancache_ops old_ops;
1646 -
1647 - zbud_init();
1648 - register_shrinker(&zcache_shrinker);
1649 - old_ops = zcache_cleancache_register_ops();
1650 - pr_info("zcache: cleancache enabled using kernel "
1651 - "transcendent memory and compression buddies\n");
1652 - if (old_ops.init_fs != NULL)
1653 - pr_warning("zcache: cleancache_ops overridden");
1654 - }
1655 -#endif
1656 -#ifdef CONFIG_FRONTSWAP
1657 - if (zcache_enabled && use_frontswap) {
1658 - struct frontswap_ops old_ops;
1659 -
1660 - zcache_client.xvpool = xv_create_pool();
1661 - if (zcache_client.xvpool == NULL) {
1662 - pr_err("zcache: can't create xvpool\n");
1663 - goto out;
1664 - }
1665 - old_ops = zcache_frontswap_register_ops();
1666 - pr_info("zcache: frontswap enabled using kernel "
1667 - "transcendent memory and xvmalloc\n");
1668 - if (old_ops.init != NULL)
1669 - pr_warning("ktmem: frontswap_ops overridden");
1670 - }
1671 -#endif
1672 -out:
1673 - return ret;
1674 -}
1675 -
1676 -module_init(zcache_init)
1677 diff --git a/drivers/staging/zcache/zcache_drv.c b/drivers/staging/zcache/zcache_drv.c
1678 new file mode 100644
1679 index 0000000..77ac2d4
1680 --- /dev/null
1681 +++ b/drivers/staging/zcache/zcache_drv.c
1682 @@ -0,0 +1,1661 @@
1683 +/*
1684 + * zcache.c
1685 + *
1686 + * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
1687 + * Copyright (c) 2010,2011, Nitin Gupta
1688 + *
1689 + * Zcache provides an in-kernel "host implementation" for transcendent memory
1690 + * and, thus indirectly, for cleancache and frontswap. Zcache includes two
1691 + * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
1692 + * 1) "compression buddies" ("zbud") is used for ephemeral pages
1693 + * 2) xvmalloc is used for persistent pages.
1694 + * Xvmalloc (based on the TLSF allocator) has very low fragmentation
1695 + * so maximizes space efficiency, while zbud allows pairs (and potentially,
1696 + * in the future, more than a pair of) compressed pages to be closely linked
1697 + * so that reclaiming can be done via the kernel's physical-page-oriented
1698 + * "shrinker" interface.
1699 + *
1700 + * [1] For a definition of page-accessible memory (aka PAM), see:
1701 + * http://marc.info/?l=linux-mm&m=127811271605009
1702 + */
1703 +
1704 +#include <linux/cpu.h>
1705 +#include <linux/highmem.h>
1706 +#include <linux/list.h>
1707 +#include <linux/lzo.h>
1708 +#include <linux/slab.h>
1709 +#include <linux/spinlock.h>
1710 +#include <linux/types.h>
1711 +#include <linux/atomic.h>
1712 +#include "tmem.h"
1713 +
1714 +#include "../zram/xvmalloc.h" /* if built in drivers/staging */
1715 +
1716 +#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
1717 +#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
1718 +#endif
1719 +#ifdef CONFIG_CLEANCACHE
1720 +#include <linux/cleancache.h>
1721 +#endif
1722 +#ifdef CONFIG_FRONTSWAP
1723 +#include <linux/frontswap.h>
1724 +#endif
1725 +
1726 +#if 0
1727 +/* this is more aggressive but may cause other problems? */
1728 +#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
1729 +#else
1730 +#define ZCACHE_GFP_MASK \
1731 + (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
1732 +#endif
1733 +
1734 +/**********
1735 + * Compression buddies ("zbud") provides for packing two (or, possibly
1736 + * in the future, more) compressed ephemeral pages into a single "raw"
1737 + * (physical) page and tracking them with data structures so that
1738 + * the raw pages can be easily reclaimed.
1739 + *
1740 + * A zbud page ("zbpg") is an aligned page containing a list_head,
1741 + * a lock, and two "zbud headers". The remainder of the physical
1742 + * page is divided up into aligned 64-byte "chunks" which contain
1743 + * the compressed data for zero, one, or two zbuds. Each zbpg
1744 + * resides on: (1) an "unused list" if it has no zbuds; (2) a
1745 + * "buddied" list if it is fully populated with two zbuds; or
1746 + * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
1747 + * the one unbuddied zbud uses. The data inside a zbpg cannot be
1748 + * read or written unless the zbpg's lock is held.
1749 + */
1750 +
1751 +#define ZBH_SENTINEL 0x43214321
1752 +#define ZBPG_SENTINEL 0xdeadbeef
1753 +
1754 +#define ZBUD_MAX_BUDS 2
1755 +
1756 +struct zbud_hdr {
1757 + uint32_t pool_id;
1758 + struct tmem_oid oid;
1759 + uint32_t index;
1760 + uint16_t size; /* compressed size in bytes, zero means unused */
1761 + DECL_SENTINEL
1762 +};
1763 +
1764 +struct zbud_page {
1765 + struct list_head bud_list;
1766 + spinlock_t lock;
1767 + struct zbud_hdr buddy[ZBUD_MAX_BUDS];
1768 + DECL_SENTINEL
1769 + /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
1770 +};
1771 +
1772 +#define CHUNK_SHIFT 6
1773 +#define CHUNK_SIZE (1 << CHUNK_SHIFT)
1774 +#define CHUNK_MASK (~(CHUNK_SIZE-1))
1775 +#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \
1776 + CHUNK_MASK) >> CHUNK_SHIFT)
1777 +#define MAX_CHUNK (NCHUNKS-1)
1778 +
1779 +static struct {
1780 + struct list_head list;
1781 + unsigned count;
1782 +} zbud_unbuddied[NCHUNKS];
1783 +/* list N contains pages with N chunks USED and NCHUNKS-N unused */
1784 +/* element 0 is never used but optimizing that isn't worth it */
1785 +static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
1786 +
1787 +struct list_head zbud_buddied_list;
1788 +static unsigned long zcache_zbud_buddied_count;
1789 +
1790 +/* protects the buddied list and all unbuddied lists */
1791 +static DEFINE_SPINLOCK(zbud_budlists_spinlock);
1792 +
1793 +static LIST_HEAD(zbpg_unused_list);
1794 +static unsigned long zcache_zbpg_unused_list_count;
1795 +
1796 +/* protects the unused page list */
1797 +static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);
1798 +
1799 +static atomic_t zcache_zbud_curr_raw_pages;
1800 +static atomic_t zcache_zbud_curr_zpages;
1801 +static unsigned long zcache_zbud_curr_zbytes;
1802 +static unsigned long zcache_zbud_cumul_zpages;
1803 +static unsigned long zcache_zbud_cumul_zbytes;
1804 +static unsigned long zcache_compress_poor;
1805 +
1806 +/* forward references */
1807 +static void *zcache_get_free_page(void);
1808 +static void zcache_free_page(void *p);
1809 +
1810 +/*
1811 + * zbud helper functions
1812 + */
1813 +
1814 +static inline unsigned zbud_max_buddy_size(void)
1815 +{
1816 + return MAX_CHUNK << CHUNK_SHIFT;
1817 +}
1818 +
1819 +static inline unsigned zbud_size_to_chunks(unsigned size)
1820 +{
1821 + BUG_ON(size == 0 || size > zbud_max_buddy_size());
1822 + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
1823 +}
1824 +
1825 +static inline int zbud_budnum(struct zbud_hdr *zh)
1826 +{
1827 + unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
1828 + struct zbud_page *zbpg = NULL;
1829 + unsigned budnum = -1U;
1830 + int i;
1831 +
1832 + for (i = 0; i < ZBUD_MAX_BUDS; i++)
1833 + if (offset == offsetof(typeof(*zbpg), buddy[i])) {
1834 + budnum = i;
1835 + break;
1836 + }
1837 + BUG_ON(budnum == -1U);
1838 + return budnum;
1839 +}
1840 +
1841 +static char *zbud_data(struct zbud_hdr *zh, unsigned size)
1842 +{
1843 + struct zbud_page *zbpg;
1844 + char *p;
1845 + unsigned budnum;
1846 +
1847 + ASSERT_SENTINEL(zh, ZBH);
1848 + budnum = zbud_budnum(zh);
1849 + BUG_ON(size == 0 || size > zbud_max_buddy_size());
1850 + zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
1851 + ASSERT_SPINLOCK(&zbpg->lock);
1852 + p = (char *)zbpg;
1853 + if (budnum == 0)
1854 + p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
1855 + CHUNK_MASK);
1856 + else if (budnum == 1)
1857 + p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
1858 + return p;
1859 +}
1860 +
1861 +/*
1862 + * zbud raw page management
1863 + */
1864 +
1865 +static struct zbud_page *zbud_alloc_raw_page(void)
1866 +{
1867 + struct zbud_page *zbpg = NULL;
1868 + struct zbud_hdr *zh0, *zh1;
1869 + bool recycled = 0;
1870 +
1871 + /* if any pages on the zbpg list, use one */
1872 + spin_lock(&zbpg_unused_list_spinlock);
1873 + if (!list_empty(&zbpg_unused_list)) {
1874 + zbpg = list_first_entry(&zbpg_unused_list,
1875 + struct zbud_page, bud_list);
1876 + list_del_init(&zbpg->bud_list);
1877 + zcache_zbpg_unused_list_count--;
1878 + recycled = 1;
1879 + }
1880 + spin_unlock(&zbpg_unused_list_spinlock);
1881 + if (zbpg == NULL)
1882 + /* none on zbpg list, try to get a kernel page */
1883 + zbpg = zcache_get_free_page();
1884 + if (likely(zbpg != NULL)) {
1885 + INIT_LIST_HEAD(&zbpg->bud_list);
1886 + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
1887 + spin_lock_init(&zbpg->lock);
1888 + if (recycled) {
1889 + ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
1890 + SET_SENTINEL(zbpg, ZBPG);
1891 + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
1892 + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
1893 + } else {
1894 + atomic_inc(&zcache_zbud_curr_raw_pages);
1895 + INIT_LIST_HEAD(&zbpg->bud_list);
1896 + SET_SENTINEL(zbpg, ZBPG);
1897 + zh0->size = 0; zh1->size = 0;
1898 + tmem_oid_set_invalid(&zh0->oid);
1899 + tmem_oid_set_invalid(&zh1->oid);
1900 + }
1901 + }
1902 + return zbpg;
1903 +}
1904 +
1905 +static void zbud_free_raw_page(struct zbud_page *zbpg)
1906 +{
1907 + struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
1908 +
1909 + ASSERT_SENTINEL(zbpg, ZBPG);
1910 + BUG_ON(!list_empty(&zbpg->bud_list));
1911 + ASSERT_SPINLOCK(&zbpg->lock);
1912 + BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
1913 + BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
1914 + INVERT_SENTINEL(zbpg, ZBPG);
1915 + spin_unlock(&zbpg->lock);
1916 + spin_lock(&zbpg_unused_list_spinlock);
1917 + list_add(&zbpg->bud_list, &zbpg_unused_list);
1918 + zcache_zbpg_unused_list_count++;
1919 + spin_unlock(&zbpg_unused_list_spinlock);
1920 +}
1921 +
1922 +/*
1923 + * core zbud handling routines
1924 + */
1925 +
1926 +static unsigned zbud_free(struct zbud_hdr *zh)
1927 +{
1928 + unsigned size;
1929 +
1930 + ASSERT_SENTINEL(zh, ZBH);
1931 + BUG_ON(!tmem_oid_valid(&zh->oid));
1932 + size = zh->size;
1933 + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
1934 + zh->size = 0;
1935 + tmem_oid_set_invalid(&zh->oid);
1936 + INVERT_SENTINEL(zh, ZBH);
1937 + zcache_zbud_curr_zbytes -= size;
1938 + atomic_dec(&zcache_zbud_curr_zpages);
1939 + return size;
1940 +}
1941 +
1942 +static void zbud_free_and_delist(struct zbud_hdr *zh)
1943 +{
1944 + unsigned chunks;
1945 + struct zbud_hdr *zh_other;
1946 + unsigned budnum = zbud_budnum(zh), size;
1947 + struct zbud_page *zbpg =
1948 + container_of(zh, struct zbud_page, buddy[budnum]);
1949 +
1950 + spin_lock(&zbpg->lock);
1951 + if (list_empty(&zbpg->bud_list)) {
1952 + /* ignore zombie page... see zbud_evict_pages() */
1953 + spin_unlock(&zbpg->lock);
1954 + return;
1955 + }
1956 + size = zbud_free(zh);
1957 + ASSERT_SPINLOCK(&zbpg->lock);
1958 + zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
1959 + if (zh_other->size == 0) { /* was unbuddied: unlist and free */
1960 + chunks = zbud_size_to_chunks(size) ;
1961 + spin_lock(&zbud_budlists_spinlock);
1962 + BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
1963 + list_del_init(&zbpg->bud_list);
1964 + zbud_unbuddied[chunks].count--;
1965 + spin_unlock(&zbud_budlists_spinlock);
1966 + zbud_free_raw_page(zbpg);
1967 + } else { /* was buddied: move remaining buddy to unbuddied list */
1968 + chunks = zbud_size_to_chunks(zh_other->size) ;
1969 + spin_lock(&zbud_budlists_spinlock);
1970 + list_del_init(&zbpg->bud_list);
1971 + zcache_zbud_buddied_count--;
1972 + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
1973 + zbud_unbuddied[chunks].count++;
1974 + spin_unlock(&zbud_budlists_spinlock);
1975 + spin_unlock(&zbpg->lock);
1976 + }
1977 +}
1978 +
1979 +static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid,
1980 + uint32_t index, struct page *page,
1981 + void *cdata, unsigned size)
1982 +{
1983 + struct zbud_hdr *zh0, *zh1, *zh = NULL;
1984 + struct zbud_page *zbpg = NULL, *ztmp;
1985 + unsigned nchunks;
1986 + char *to;
1987 + int i, found_good_buddy = 0;
1988 +
1989 + nchunks = zbud_size_to_chunks(size) ;
1990 + for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
1991 + spin_lock(&zbud_budlists_spinlock);
1992 + if (!list_empty(&zbud_unbuddied[i].list)) {
1993 + list_for_each_entry_safe(zbpg, ztmp,
1994 + &zbud_unbuddied[i].list, bud_list) {
1995 + if (spin_trylock(&zbpg->lock)) {
1996 + found_good_buddy = i;
1997 + goto found_unbuddied;
1998 + }
1999 + }
2000 + }
2001 + spin_unlock(&zbud_budlists_spinlock);
2002 + }
2003 + /* didn't find a good buddy, try allocating a new page */
2004 + zbpg = zbud_alloc_raw_page();
2005 + if (unlikely(zbpg == NULL))
2006 + goto out;
2007 + /* ok, have a page, now compress the data before taking locks */
2008 + spin_lock(&zbpg->lock);
2009 + spin_lock(&zbud_budlists_spinlock);
2010 + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
2011 + zbud_unbuddied[nchunks].count++;
2012 + zh = &zbpg->buddy[0];
2013 + goto init_zh;
2014 +
2015 +found_unbuddied:
2016 + ASSERT_SPINLOCK(&zbpg->lock);
2017 + zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
2018 + BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
2019 + if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
2020 + ASSERT_SENTINEL(zh0, ZBH);
2021 + zh = zh1;
2022 + } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
2023 + ASSERT_SENTINEL(zh1, ZBH);
2024 + zh = zh0;
2025 + } else
2026 + BUG();
2027 + list_del_init(&zbpg->bud_list);
2028 + zbud_unbuddied[found_good_buddy].count--;
2029 + list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
2030 + zcache_zbud_buddied_count++;
2031 +
2032 +init_zh:
2033 + SET_SENTINEL(zh, ZBH);
2034 + zh->size = size;
2035 + zh->index = index;
2036 + zh->oid = *oid;
2037 + zh->pool_id = pool_id;
2038 + /* can wait to copy the data until the list locks are dropped */
2039 + spin_unlock(&zbud_budlists_spinlock);
2040 +
2041 + to = zbud_data(zh, size);
2042 + memcpy(to, cdata, size);
2043 + spin_unlock(&zbpg->lock);
2044 + zbud_cumul_chunk_counts[nchunks]++;
2045 + atomic_inc(&zcache_zbud_curr_zpages);
2046 + zcache_zbud_cumul_zpages++;
2047 + zcache_zbud_curr_zbytes += size;
2048 + zcache_zbud_cumul_zbytes += size;
2049 +out:
2050 + return zh;
2051 +}
2052 +
2053 +static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
2054 +{
2055 + struct zbud_page *zbpg;
2056 + unsigned budnum = zbud_budnum(zh);
2057 + size_t out_len = PAGE_SIZE;
2058 + char *to_va, *from_va;
2059 + unsigned size;
2060 + int ret = 0;
2061 +
2062 + zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
2063 + spin_lock(&zbpg->lock);
2064 + if (list_empty(&zbpg->bud_list)) {
2065 + /* ignore zombie page... see zbud_evict_pages() */
2066 + ret = -EINVAL;
2067 + goto out;
2068 + }
2069 + ASSERT_SENTINEL(zh, ZBH);
2070 + BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
2071 + to_va = kmap_atomic(page, KM_USER0);
2072 + size = zh->size;
2073 + from_va = zbud_data(zh, size);
2074 + ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
2075 + BUG_ON(ret != LZO_E_OK);
2076 + BUG_ON(out_len != PAGE_SIZE);
2077 + kunmap_atomic(to_va, KM_USER0);
2078 +out:
2079 + spin_unlock(&zbpg->lock);
2080 + return ret;
2081 +}
2082 +
2083 +/*
2084 + * The following routines handle shrinking of ephemeral pages by evicting
2085 + * pages "least valuable" first.
2086 + */
2087 +
2088 +static unsigned long zcache_evicted_raw_pages;
2089 +static unsigned long zcache_evicted_buddied_pages;
2090 +static unsigned long zcache_evicted_unbuddied_pages;
2091 +
2092 +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid);
2093 +static void zcache_put_pool(struct tmem_pool *pool);
2094 +
2095 +/*
2096 + * Flush and free all zbuds in a zbpg, then free the pageframe
2097 + */
2098 +static void zbud_evict_zbpg(struct zbud_page *zbpg)
2099 +{
2100 + struct zbud_hdr *zh;
2101 + int i, j;
2102 + uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS];
2103 + struct tmem_oid oid[ZBUD_MAX_BUDS];
2104 + struct tmem_pool *pool;
2105 +
2106 + ASSERT_SPINLOCK(&zbpg->lock);
2107 + BUG_ON(!list_empty(&zbpg->bud_list));
2108 + for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
2109 + zh = &zbpg->buddy[i];
2110 + if (zh->size) {
2111 + pool_id[j] = zh->pool_id;
2112 + oid[j] = zh->oid;
2113 + index[j] = zh->index;
2114 + j++;
2115 + zbud_free(zh);
2116 + }
2117 + }
2118 + spin_unlock(&zbpg->lock);
2119 + for (i = 0; i < j; i++) {
2120 + pool = zcache_get_pool_by_id(pool_id[i]);
2121 + if (pool != NULL) {
2122 + tmem_flush_page(pool, &oid[i], index[i]);
2123 + zcache_put_pool(pool);
2124 + }
2125 + }
2126 + ASSERT_SENTINEL(zbpg, ZBPG);
2127 + spin_lock(&zbpg->lock);
2128 + zbud_free_raw_page(zbpg);
2129 +}
2130 +
2131 +/*
2132 + * Free nr pages. This code is funky because we want to hold the locks
2133 + * protecting various lists for as short a time as possible, and in some
2134 + * circumstances the list may change asynchronously when the list lock is
2135 + * not held. In some cases we also trylock not only to avoid waiting on a
2136 + * page in use by another cpu, but also to avoid potential deadlock due to
2137 + * lock inversion.
2138 + */
2139 +static void zbud_evict_pages(int nr)
2140 +{
2141 + struct zbud_page *zbpg;
2142 + int i;
2143 +
2144 + /* first try freeing any pages on unused list */
2145 +retry_unused_list:
2146 + spin_lock_bh(&zbpg_unused_list_spinlock);
2147 + if (!list_empty(&zbpg_unused_list)) {
2148 + /* can't walk list here, since it may change when unlocked */
2149 + zbpg = list_first_entry(&zbpg_unused_list,
2150 + struct zbud_page, bud_list);
2151 + list_del_init(&zbpg->bud_list);
2152 + zcache_zbpg_unused_list_count--;
2153 + atomic_dec(&zcache_zbud_curr_raw_pages);
2154 + spin_unlock_bh(&zbpg_unused_list_spinlock);
2155 + zcache_free_page(zbpg);
2156 + zcache_evicted_raw_pages++;
2157 + if (--nr <= 0)
2158 + goto out;
2159 + goto retry_unused_list;
2160 + }
2161 + spin_unlock_bh(&zbpg_unused_list_spinlock);
2162 +
2163 + /* now try freeing unbuddied pages, starting with least space avail */
2164 + for (i = 0; i < MAX_CHUNK; i++) {
2165 +retry_unbud_list_i:
2166 + spin_lock_bh(&zbud_budlists_spinlock);
2167 + if (list_empty(&zbud_unbuddied[i].list)) {
2168 + spin_unlock_bh(&zbud_budlists_spinlock);
2169 + continue;
2170 + }
2171 + list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
2172 + if (unlikely(!spin_trylock(&zbpg->lock)))
2173 + continue;
2174 + list_del_init(&zbpg->bud_list);
2175 + zbud_unbuddied[i].count--;
2176 + spin_unlock(&zbud_budlists_spinlock);
2177 + zcache_evicted_unbuddied_pages++;
2178 + /* want budlists unlocked when doing zbpg eviction */
2179 + zbud_evict_zbpg(zbpg);
2180 + local_bh_enable();
2181 + if (--nr <= 0)
2182 + goto out;
2183 + goto retry_unbud_list_i;
2184 + }
2185 + spin_unlock_bh(&zbud_budlists_spinlock);
2186 + }
2187 +
2188 + /* as a last resort, free buddied pages */
2189 +retry_bud_list:
2190 + spin_lock_bh(&zbud_budlists_spinlock);
2191 + if (list_empty(&zbud_buddied_list)) {
2192 + spin_unlock_bh(&zbud_budlists_spinlock);
2193 + goto out;
2194 + }
2195 + list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
2196 + if (unlikely(!spin_trylock(&zbpg->lock)))
2197 + continue;
2198 + list_del_init(&zbpg->bud_list);
2199 + zcache_zbud_buddied_count--;
2200 + spin_unlock(&zbud_budlists_spinlock);
2201 + zcache_evicted_buddied_pages++;
2202 + /* want budlists unlocked when doing zbpg eviction */
2203 + zbud_evict_zbpg(zbpg);
2204 + local_bh_enable();
2205 + if (--nr <= 0)
2206 + goto out;
2207 + goto retry_bud_list;
2208 + }
2209 + spin_unlock_bh(&zbud_budlists_spinlock);
2210 +out:
2211 + return;
2212 +}
2213 +
2214 +static void zbud_init(void)
2215 +{
2216 + int i;
2217 +
2218 + INIT_LIST_HEAD(&zbud_buddied_list);
2219 + zcache_zbud_buddied_count = 0;
2220 + for (i = 0; i < NCHUNKS; i++) {
2221 + INIT_LIST_HEAD(&zbud_unbuddied[i].list);
2222 + zbud_unbuddied[i].count = 0;
2223 + }
2224 +}
2225 +
2226 +#ifdef CONFIG_SYSFS
2227 +/*
2228 + * These sysfs routines show a nice distribution of how many zbpg's are
2229 + * currently (and have ever been placed) in each unbuddied list. It's fun
2230 + * to watch but can probably go away before final merge.
2231 + */
2232 +static int zbud_show_unbuddied_list_counts(char *buf)
2233 +{
2234 + int i;
2235 + char *p = buf;
2236 +
2237 + for (i = 0; i < NCHUNKS - 1; i++)
2238 + p += sprintf(p, "%u ", zbud_unbuddied[i].count);
2239 + p += sprintf(p, "%d\n", zbud_unbuddied[i].count);
2240 + return p - buf;
2241 +}
2242 +
2243 +static int zbud_show_cumul_chunk_counts(char *buf)
2244 +{
2245 + unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
2246 + unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
2247 + unsigned long total_chunks_lte_42 = 0;
2248 + char *p = buf;
2249 +
2250 + for (i = 0; i < NCHUNKS; i++) {
2251 + p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
2252 + chunks += zbud_cumul_chunk_counts[i];
2253 + total_chunks += zbud_cumul_chunk_counts[i];
2254 + sum_total_chunks += i * zbud_cumul_chunk_counts[i];
2255 + if (i == 21)
2256 + total_chunks_lte_21 = total_chunks;
2257 + if (i == 32)
2258 + total_chunks_lte_32 = total_chunks;
2259 + if (i == 42)
2260 + total_chunks_lte_42 = total_chunks;
2261 + }
2262 + p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
2263 + total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
2264 + chunks == 0 ? 0 : sum_total_chunks / chunks);
2265 + return p - buf;
2266 +}
2267 +#endif
2268 +
2269 +/**********
2270 + * This "zv" PAM implementation combines the TLSF-based xvMalloc
2271 + * with lzo1x compression to maximize the amount of data that can
2272 + * be packed into a physical page.
2273 + *
2274 + * Zv represents a PAM page with the index and object (plus a "size" value
2275 + * necessary for decompression) immediately preceding the compressed data.
2276 + */
2277 +
2278 +#define ZVH_SENTINEL 0x43214321
2279 +
2280 +struct zv_hdr {
2281 + uint32_t pool_id;
2282 + struct tmem_oid oid;
2283 + uint32_t index;
2284 + DECL_SENTINEL
2285 +};
2286 +
2287 +static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;
2288 +
2289 +static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,
2290 + struct tmem_oid *oid, uint32_t index,
2291 + void *cdata, unsigned clen)
2292 +{
2293 + struct page *page;
2294 + struct zv_hdr *zv = NULL;
2295 + uint32_t offset;
2296 + int ret;
2297 +
2298 + BUG_ON(!irqs_disabled());
2299 + ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr),
2300 + &page, &offset, ZCACHE_GFP_MASK);
2301 + if (unlikely(ret))
2302 + goto out;
2303 + zv = kmap_atomic(page, KM_USER0) + offset;
2304 + zv->index = index;
2305 + zv->oid = *oid;
2306 + zv->pool_id = pool_id;
2307 + SET_SENTINEL(zv, ZVH);
2308 + memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
2309 + kunmap_atomic(zv, KM_USER0);
2310 +out:
2311 + return zv;
2312 +}
2313 +
2314 +static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
2315 +{
2316 + unsigned long flags;
2317 + struct page *page;
2318 + uint32_t offset;
2319 + uint16_t size;
2320 +
2321 + ASSERT_SENTINEL(zv, ZVH);
2322 + size = xv_get_object_size(zv) - sizeof(*zv);
2323 + BUG_ON(size == 0 || size > zv_max_page_size);
2324 + INVERT_SENTINEL(zv, ZVH);
2325 + page = virt_to_page(zv);
2326 + offset = (unsigned long)zv & ~PAGE_MASK;
2327 + local_irq_save(flags);
2328 + xv_free(xvpool, page, offset);
2329 + local_irq_restore(flags);
2330 +}
2331 +
2332 +static void zv_decompress(struct page *page, struct zv_hdr *zv)
2333 +{
2334 + size_t clen = PAGE_SIZE;
2335 + char *to_va;
2336 + unsigned size;
2337 + int ret;
2338 +
2339 + ASSERT_SENTINEL(zv, ZVH);
2340 + size = xv_get_object_size(zv) - sizeof(*zv);
2341 + BUG_ON(size == 0 || size > zv_max_page_size);
2342 + to_va = kmap_atomic(page, KM_USER0);
2343 + ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
2344 + size, to_va, &clen);
2345 + kunmap_atomic(to_va, KM_USER0);
2346 + BUG_ON(ret != LZO_E_OK);
2347 + BUG_ON(clen != PAGE_SIZE);
2348 +}
2349 +
2350 +/*
2351 + * zcache core code starts here
2352 + */
2353 +
2354 +/* useful stats not collected by cleancache or frontswap */
2355 +static unsigned long zcache_flush_total;
2356 +static unsigned long zcache_flush_found;
2357 +static unsigned long zcache_flobj_total;
2358 +static unsigned long zcache_flobj_found;
2359 +static unsigned long zcache_failed_eph_puts;
2360 +static unsigned long zcache_failed_pers_puts;
2361 +
2362 +#define MAX_POOLS_PER_CLIENT 16
2363 +
2364 +static struct {
2365 + struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
2366 + struct xv_pool *xvpool;
2367 +} zcache_client;
2368 +
2369 +/*
2370 + * Tmem operations assume the poolid implies the invoking client.
2371 + * Zcache only has one client (the kernel itself), so translate
2372 + * the poolid into the tmem_pool allocated for it. A KVM version
2373 + * of zcache would have one client per guest and each client might
2374 + * have a poolid==N.
2375 + */
2376 +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid)
2377 +{
2378 + struct tmem_pool *pool = NULL;
2379 +
2380 + if (poolid >= 0) {
2381 + pool = zcache_client.tmem_pools[poolid];
2382 + if (pool != NULL)
2383 + atomic_inc(&pool->refcount);
2384 + }
2385 + return pool;
2386 +}
2387 +
2388 +static void zcache_put_pool(struct tmem_pool *pool)
2389 +{
2390 + if (pool != NULL)
2391 + atomic_dec(&pool->refcount);
2392 +}
2393 +
2394 +/* counters for debugging */
2395 +static unsigned long zcache_failed_get_free_pages;
2396 +static unsigned long zcache_failed_alloc;
2397 +static unsigned long zcache_put_to_flush;
2398 +static unsigned long zcache_aborted_preload;
2399 +static unsigned long zcache_aborted_shrink;
2400 +
2401 +/*
2402 + * Ensure that memory allocation requests in zcache don't result
2403 + * in direct reclaim requests via the shrinker, which would cause
2404 + * an infinite loop. Maybe a GFP flag would be better?
2405 + */
2406 +static DEFINE_SPINLOCK(zcache_direct_reclaim_lock);
2407 +
2408 +/*
2409 + * for now, used named slabs so can easily track usage; later can
2410 + * either just use kmalloc, or perhaps add a slab-like allocator
2411 + * to more carefully manage total memory utilization
2412 + */
2413 +static struct kmem_cache *zcache_objnode_cache;
2414 +static struct kmem_cache *zcache_obj_cache;
2415 +static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
2416 +static unsigned long zcache_curr_obj_count_max;
2417 +static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
2418 +static unsigned long zcache_curr_objnode_count_max;
2419 +
2420 +/*
2421 + * to avoid memory allocation recursion (e.g. due to direct reclaim), we
2422 + * preload all necessary data structures so the hostops callbacks never
2423 + * actually do a malloc
2424 + */
2425 +struct zcache_preload {
2426 + void *page;
2427 + struct tmem_obj *obj;
2428 + int nr;
2429 + struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
2430 +};
2431 +static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
2432 +
2433 +static int zcache_do_preload(struct tmem_pool *pool)
2434 +{
2435 + struct zcache_preload *kp;
2436 + struct tmem_objnode *objnode;
2437 + struct tmem_obj *obj;
2438 + void *page;
2439 + int ret = -ENOMEM;
2440 +
2441 + if (unlikely(zcache_objnode_cache == NULL))
2442 + goto out;
2443 + if (unlikely(zcache_obj_cache == NULL))
2444 + goto out;
2445 + if (!spin_trylock(&zcache_direct_reclaim_lock)) {
2446 + zcache_aborted_preload++;
2447 + goto out;
2448 + }
2449 + preempt_disable();
2450 + kp = &__get_cpu_var(zcache_preloads);
2451 + while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
2452 + preempt_enable_no_resched();
2453 + objnode = kmem_cache_alloc(zcache_objnode_cache,
2454 + ZCACHE_GFP_MASK);
2455 + if (unlikely(objnode == NULL)) {
2456 + zcache_failed_alloc++;
2457 + goto unlock_out;
2458 + }
2459 + preempt_disable();
2460 + kp = &__get_cpu_var(zcache_preloads);
2461 + if (kp->nr < ARRAY_SIZE(kp->objnodes))
2462 + kp->objnodes[kp->nr++] = objnode;
2463 + else
2464 + kmem_cache_free(zcache_objnode_cache, objnode);
2465 + }
2466 + preempt_enable_no_resched();
2467 + obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
2468 + if (unlikely(obj == NULL)) {
2469 + zcache_failed_alloc++;
2470 + goto unlock_out;
2471 + }
2472 + page = (void *)__get_free_page(ZCACHE_GFP_MASK);
2473 + if (unlikely(page == NULL)) {
2474 + zcache_failed_get_free_pages++;
2475 + kmem_cache_free(zcache_obj_cache, obj);
2476 + goto unlock_out;
2477 + }
2478 + preempt_disable();
2479 + kp = &__get_cpu_var(zcache_preloads);
2480 + if (kp->obj == NULL)
2481 + kp->obj = obj;
2482 + else
2483 + kmem_cache_free(zcache_obj_cache, obj);
2484 + if (kp->page == NULL)
2485 + kp->page = page;
2486 + else
2487 + free_page((unsigned long)page);
2488 + ret = 0;
2489 +unlock_out:
2490 + spin_unlock(&zcache_direct_reclaim_lock);
2491 +out:
2492 + return ret;
2493 +}
2494 +
2495 +static void *zcache_get_free_page(void)
2496 +{
2497 + struct zcache_preload *kp;
2498 + void *page;
2499 +
2500 + kp = &__get_cpu_var(zcache_preloads);
2501 + page = kp->page;
2502 + BUG_ON(page == NULL);
2503 + kp->page = NULL;
2504 + return page;
2505 +}
2506 +
2507 +static void zcache_free_page(void *p)
2508 +{
2509 + free_page((unsigned long)p);
2510 +}
2511 +
2512 +/*
2513 + * zcache implementation for tmem host ops
2514 + */
2515 +
2516 +static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
2517 +{
2518 + struct tmem_objnode *objnode = NULL;
2519 + unsigned long count;
2520 + struct zcache_preload *kp;
2521 +
2522 + kp = &__get_cpu_var(zcache_preloads);
2523 + if (kp->nr <= 0)
2524 + goto out;
2525 + objnode = kp->objnodes[kp->nr - 1];
2526 + BUG_ON(objnode == NULL);
2527 + kp->objnodes[kp->nr - 1] = NULL;
2528 + kp->nr--;
2529 + count = atomic_inc_return(&zcache_curr_objnode_count);
2530 + if (count > zcache_curr_objnode_count_max)
2531 + zcache_curr_objnode_count_max = count;
2532 +out:
2533 + return objnode;
2534 +}
2535 +
2536 +static void zcache_objnode_free(struct tmem_objnode *objnode,
2537 + struct tmem_pool *pool)
2538 +{
2539 + atomic_dec(&zcache_curr_objnode_count);
2540 + BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
2541 + kmem_cache_free(zcache_objnode_cache, objnode);
2542 +}
2543 +
2544 +static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
2545 +{
2546 + struct tmem_obj *obj = NULL;
2547 + unsigned long count;
2548 + struct zcache_preload *kp;
2549 +
2550 + kp = &__get_cpu_var(zcache_preloads);
2551 + obj = kp->obj;
2552 + BUG_ON(obj == NULL);
2553 + kp->obj = NULL;
2554 + count = atomic_inc_return(&zcache_curr_obj_count);
2555 + if (count > zcache_curr_obj_count_max)
2556 + zcache_curr_obj_count_max = count;
2557 + return obj;
2558 +}
2559 +
2560 +static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
2561 +{
2562 + atomic_dec(&zcache_curr_obj_count);
2563 + BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
2564 + kmem_cache_free(zcache_obj_cache, obj);
2565 +}
2566 +
2567 +static struct tmem_hostops zcache_hostops = {
2568 + .obj_alloc = zcache_obj_alloc,
2569 + .obj_free = zcache_obj_free,
2570 + .objnode_alloc = zcache_objnode_alloc,
2571 + .objnode_free = zcache_objnode_free,
2572 +};
2573 +
2574 +/*
2575 + * zcache implementations for PAM page descriptor ops
2576 + */
2577 +
2578 +static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
2579 +static unsigned long zcache_curr_eph_pampd_count_max;
2580 +static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
2581 +static unsigned long zcache_curr_pers_pampd_count_max;
2582 +
2583 +/* forward reference */
2584 +static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
2585 +
2586 +static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
2587 + uint32_t index, struct page *page)
2588 +{
2589 + void *pampd = NULL, *cdata;
2590 + size_t clen;
2591 + int ret;
2592 + bool ephemeral = is_ephemeral(pool);
2593 + unsigned long count;
2594 +
2595 + if (ephemeral) {
2596 + ret = zcache_compress(page, &cdata, &clen);
2597 + if (ret == 0)
2598 +
2599 + goto out;
2600 + if (clen == 0 || clen > zbud_max_buddy_size()) {
2601 + zcache_compress_poor++;
2602 + goto out;
2603 + }
2604 + pampd = (void *)zbud_create(pool->pool_id, oid, index,
2605 + page, cdata, clen);
2606 + if (pampd != NULL) {
2607 + count = atomic_inc_return(&zcache_curr_eph_pampd_count);
2608 + if (count > zcache_curr_eph_pampd_count_max)
2609 + zcache_curr_eph_pampd_count_max = count;
2610 + }
2611 + } else {
2612 + /*
2613 + * FIXME: This is all the "policy" there is for now.
2614 + * 3/4 totpages should allow ~37% of RAM to be filled with
2615 + * compressed frontswap pages
2616 + */
2617 + if (atomic_read(&zcache_curr_pers_pampd_count) >
2618 + 3 * totalram_pages / 4)
2619 + goto out;
2620 + ret = zcache_compress(page, &cdata, &clen);
2621 + if (ret == 0)
2622 + goto out;
2623 + if (clen > zv_max_page_size) {
2624 + zcache_compress_poor++;
2625 + goto out;
2626 + }
2627 + pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id,
2628 + oid, index, cdata, clen);
2629 + if (pampd == NULL)
2630 + goto out;
2631 + count = atomic_inc_return(&zcache_curr_pers_pampd_count);
2632 + if (count > zcache_curr_pers_pampd_count_max)
2633 + zcache_curr_pers_pampd_count_max = count;
2634 + }
2635 +out:
2636 + return pampd;
2637 +}
2638 +
2639 +/*
2640 + * fill the pageframe corresponding to the struct page with the data
2641 + * from the passed pampd
2642 + */
2643 +static int zcache_pampd_get_data(struct page *page, void *pampd,
2644 + struct tmem_pool *pool)
2645 +{
2646 + int ret = 0;
2647 +
2648 + if (is_ephemeral(pool))
2649 + ret = zbud_decompress(page, pampd);
2650 + else
2651 + zv_decompress(page, pampd);
2652 + return ret;
2653 +}
2654 +
2655 +/*
2656 + * free the pampd and remove it from any zcache lists
2657 + * pampd must no longer be pointed to from any tmem data structures!
2658 + */
2659 +static void zcache_pampd_free(void *pampd, struct tmem_pool *pool)
2660 +{
2661 + if (is_ephemeral(pool)) {
2662 + zbud_free_and_delist((struct zbud_hdr *)pampd);
2663 + atomic_dec(&zcache_curr_eph_pampd_count);
2664 + BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
2665 + } else {
2666 + zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd);
2667 + atomic_dec(&zcache_curr_pers_pampd_count);
2668 + BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
2669 + }
2670 +}
2671 +
2672 +static struct tmem_pamops zcache_pamops = {
2673 + .create = zcache_pampd_create,
2674 + .get_data = zcache_pampd_get_data,
2675 + .free = zcache_pampd_free,
2676 +};
2677 +
2678 +/*
2679 + * zcache compression/decompression and related per-cpu stuff
2680 + */
2681 +
2682 +#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
2683 +#define LZO_DSTMEM_PAGE_ORDER 1
2684 +static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
2685 +static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
2686 +
2687 +static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
2688 +{
2689 + int ret = 0;
2690 + unsigned char *dmem = __get_cpu_var(zcache_dstmem);
2691 + unsigned char *wmem = __get_cpu_var(zcache_workmem);
2692 + char *from_va;
2693 +
2694 + BUG_ON(!irqs_disabled());
2695 + if (unlikely(dmem == NULL || wmem == NULL))
2696 + goto out; /* no buffer, so can't compress */
2697 + from_va = kmap_atomic(from, KM_USER0);
2698 + mb();
2699 + ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
2700 + BUG_ON(ret != LZO_E_OK);
2701 + *out_va = dmem;
2702 + kunmap_atomic(from_va, KM_USER0);
2703 + ret = 1;
2704 +out:
2705 + return ret;
2706 +}
2707 +
2708 +
2709 +static int zcache_cpu_notifier(struct notifier_block *nb,
2710 + unsigned long action, void *pcpu)
2711 +{
2712 + int cpu = (long)pcpu;
2713 + struct zcache_preload *kp;
2714 +
2715 + switch (action) {
2716 + case CPU_UP_PREPARE:
2717 + per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
2718 + GFP_KERNEL | __GFP_REPEAT,
2719 + LZO_DSTMEM_PAGE_ORDER),
2720 + per_cpu(zcache_workmem, cpu) =
2721 + kzalloc(LZO1X_MEM_COMPRESS,
2722 + GFP_KERNEL | __GFP_REPEAT);
2723 + break;
2724 + case CPU_DEAD:
2725 + case CPU_UP_CANCELED:
2726 + free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
2727 + LZO_DSTMEM_PAGE_ORDER);
2728 + per_cpu(zcache_dstmem, cpu) = NULL;
2729 + kfree(per_cpu(zcache_workmem, cpu));
2730 + per_cpu(zcache_workmem, cpu) = NULL;
2731 + kp = &per_cpu(zcache_preloads, cpu);
2732 + while (kp->nr) {
2733 + kmem_cache_free(zcache_objnode_cache,
2734 + kp->objnodes[kp->nr - 1]);
2735 + kp->objnodes[kp->nr - 1] = NULL;
2736 + kp->nr--;
2737 + }
2738 + kmem_cache_free(zcache_obj_cache, kp->obj);
2739 + free_page((unsigned long)kp->page);
2740 + break;
2741 + default:
2742 + break;
2743 + }
2744 + return NOTIFY_OK;
2745 +}
2746 +
2747 +static struct notifier_block zcache_cpu_notifier_block = {
2748 + .notifier_call = zcache_cpu_notifier
2749 +};
2750 +
2751 +#ifdef CONFIG_SYSFS
2752 +#define ZCACHE_SYSFS_RO(_name) \
2753 + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
2754 + struct kobj_attribute *attr, char *buf) \
2755 + { \
2756 + return sprintf(buf, "%lu\n", zcache_##_name); \
2757 + } \
2758 + static struct kobj_attribute zcache_##_name##_attr = { \
2759 + .attr = { .name = __stringify(_name), .mode = 0444 }, \
2760 + .show = zcache_##_name##_show, \
2761 + }
2762 +
2763 +#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
2764 + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
2765 + struct kobj_attribute *attr, char *buf) \
2766 + { \
2767 + return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
2768 + } \
2769 + static struct kobj_attribute zcache_##_name##_attr = { \
2770 + .attr = { .name = __stringify(_name), .mode = 0444 }, \
2771 + .show = zcache_##_name##_show, \
2772 + }
2773 +
2774 +#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
2775 + static ssize_t zcache_##_name##_show(struct kobject *kobj, \
2776 + struct kobj_attribute *attr, char *buf) \
2777 + { \
2778 + return _func(buf); \
2779 + } \
2780 + static struct kobj_attribute zcache_##_name##_attr = { \
2781 + .attr = { .name = __stringify(_name), .mode = 0444 }, \
2782 + .show = zcache_##_name##_show, \
2783 + }
2784 +
2785 +ZCACHE_SYSFS_RO(curr_obj_count_max);
2786 +ZCACHE_SYSFS_RO(curr_objnode_count_max);
2787 +ZCACHE_SYSFS_RO(flush_total);
2788 +ZCACHE_SYSFS_RO(flush_found);
2789 +ZCACHE_SYSFS_RO(flobj_total);
2790 +ZCACHE_SYSFS_RO(flobj_found);
2791 +ZCACHE_SYSFS_RO(failed_eph_puts);
2792 +ZCACHE_SYSFS_RO(failed_pers_puts);
2793 +ZCACHE_SYSFS_RO(zbud_curr_zbytes);
2794 +ZCACHE_SYSFS_RO(zbud_cumul_zpages);
2795 +ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
2796 +ZCACHE_SYSFS_RO(zbud_buddied_count);
2797 +ZCACHE_SYSFS_RO(zbpg_unused_list_count);
2798 +ZCACHE_SYSFS_RO(evicted_raw_pages);
2799 +ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
2800 +ZCACHE_SYSFS_RO(evicted_buddied_pages);
2801 +ZCACHE_SYSFS_RO(failed_get_free_pages);
2802 +ZCACHE_SYSFS_RO(failed_alloc);
2803 +ZCACHE_SYSFS_RO(put_to_flush);
2804 +ZCACHE_SYSFS_RO(aborted_preload);
2805 +ZCACHE_SYSFS_RO(aborted_shrink);
2806 +ZCACHE_SYSFS_RO(compress_poor);
2807 +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
2808 +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
2809 +ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
2810 +ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
2811 +ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
2812 + zbud_show_unbuddied_list_counts);
2813 +ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
2814 + zbud_show_cumul_chunk_counts);
2815 +
2816 +static struct attribute *zcache_attrs[] = {
2817 + &zcache_curr_obj_count_attr.attr,
2818 + &zcache_curr_obj_count_max_attr.attr,
2819 + &zcache_curr_objnode_count_attr.attr,
2820 + &zcache_curr_objnode_count_max_attr.attr,
2821 + &zcache_flush_total_attr.attr,
2822 + &zcache_flobj_total_attr.attr,
2823 + &zcache_flush_found_attr.attr,
2824 + &zcache_flobj_found_attr.attr,
2825 + &zcache_failed_eph_puts_attr.attr,
2826 + &zcache_failed_pers_puts_attr.attr,
2827 + &zcache_compress_poor_attr.attr,
2828 + &zcache_zbud_curr_raw_pages_attr.attr,
2829 + &zcache_zbud_curr_zpages_attr.attr,
2830 + &zcache_zbud_curr_zbytes_attr.attr,
2831 + &zcache_zbud_cumul_zpages_attr.attr,
2832 + &zcache_zbud_cumul_zbytes_attr.attr,
2833 + &zcache_zbud_buddied_count_attr.attr,
2834 + &zcache_zbpg_unused_list_count_attr.attr,
2835 + &zcache_evicted_raw_pages_attr.attr,
2836 + &zcache_evicted_unbuddied_pages_attr.attr,
2837 + &zcache_evicted_buddied_pages_attr.attr,
2838 + &zcache_failed_get_free_pages_attr.attr,
2839 + &zcache_failed_alloc_attr.attr,
2840 + &zcache_put_to_flush_attr.attr,
2841 + &zcache_aborted_preload_attr.attr,
2842 + &zcache_aborted_shrink_attr.attr,
2843 + &zcache_zbud_unbuddied_list_counts_attr.attr,
2844 + &zcache_zbud_cumul_chunk_counts_attr.attr,
2845 + NULL,
2846 +};
2847 +
2848 +static struct attribute_group zcache_attr_group = {
2849 + .attrs = zcache_attrs,
2850 + .name = "zcache",
2851 +};
2852 +
2853 +#endif /* CONFIG_SYSFS */
2854 +/*
2855 + * When zcache is disabled ("frozen"), pools can be created and destroyed,
2856 + * but all puts (and thus all other operations that require memory allocation)
2857 + * must fail. If zcache is unfrozen, accepts puts, then frozen again,
2858 + * data consistency requires all puts while frozen to be converted into
2859 + * flushes.
2860 + */
2861 +static bool zcache_freeze;
2862 +
2863 +/*
2864 + * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
2865 + */
2866 +static int shrink_zcache_memory(struct shrinker *shrink,
2867 + struct shrink_control *sc)
2868 +{
2869 + int ret = -1;
2870 + int nr = sc->nr_to_scan;
2871 + gfp_t gfp_mask = sc->gfp_mask;
2872 +
2873 + if (nr >= 0) {
2874 + if (!(gfp_mask & __GFP_FS))
2875 + /* does this case really need to be skipped? */
2876 + goto out;
2877 + if (spin_trylock(&zcache_direct_reclaim_lock)) {
2878 + zbud_evict_pages(nr);
2879 + spin_unlock(&zcache_direct_reclaim_lock);
2880 + } else
2881 + zcache_aborted_shrink++;
2882 + }
2883 + ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
2884 +out:
2885 + return ret;
2886 +}
2887 +
2888 +static struct shrinker zcache_shrinker = {
2889 + .shrink = shrink_zcache_memory,
2890 + .seeks = DEFAULT_SEEKS,
2891 +};
2892 +
2893 +/*
2894 + * zcache shims between cleancache/frontswap ops and tmem
2895 + */
2896 +
2897 +static int zcache_put_page(int pool_id, struct tmem_oid *oidp,
2898 + uint32_t index, struct page *page)
2899 +{
2900 + struct tmem_pool *pool;
2901 + int ret = -1;
2902 +
2903 + BUG_ON(!irqs_disabled());
2904 + pool = zcache_get_pool_by_id(pool_id);
2905 + if (unlikely(pool == NULL))
2906 + goto out;
2907 + if (!zcache_freeze && zcache_do_preload(pool) == 0) {
2908 + /* preload does preempt_disable on success */
2909 + ret = tmem_put(pool, oidp, index, page);
2910 + if (ret < 0) {
2911 + if (is_ephemeral(pool))
2912 + zcache_failed_eph_puts++;
2913 + else
2914 + zcache_failed_pers_puts++;
2915 + }
2916 + zcache_put_pool(pool);
2917 + preempt_enable_no_resched();
2918 + } else {
2919 + zcache_put_to_flush++;
2920 + if (atomic_read(&pool->obj_count) > 0)
2921 + /* the put fails whether the flush succeeds or not */
2922 + (void)tmem_flush_page(pool, oidp, index);
2923 + zcache_put_pool(pool);
2924 + }
2925 +out:
2926 + return ret;
2927 +}
2928 +
2929 +static int zcache_get_page(int pool_id, struct tmem_oid *oidp,
2930 + uint32_t index, struct page *page)
2931 +{
2932 + struct tmem_pool *pool;
2933 + int ret = -1;
2934 + unsigned long flags;
2935 +
2936 + local_irq_save(flags);
2937 + pool = zcache_get_pool_by_id(pool_id);
2938 + if (likely(pool != NULL)) {
2939 + if (atomic_read(&pool->obj_count) > 0)
2940 + ret = tmem_get(pool, oidp, index, page);
2941 + zcache_put_pool(pool);
2942 + }
2943 + local_irq_restore(flags);
2944 + return ret;
2945 +}
2946 +
2947 +static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index)
2948 +{
2949 + struct tmem_pool *pool;
2950 + int ret = -1;
2951 + unsigned long flags;
2952 +
2953 + local_irq_save(flags);
2954 + zcache_flush_total++;
2955 + pool = zcache_get_pool_by_id(pool_id);
2956 + if (likely(pool != NULL)) {
2957 + if (atomic_read(&pool->obj_count) > 0)
2958 + ret = tmem_flush_page(pool, oidp, index);
2959 + zcache_put_pool(pool);
2960 + }
2961 + if (ret >= 0)
2962 + zcache_flush_found++;
2963 + local_irq_restore(flags);
2964 + return ret;
2965 +}
2966 +
2967 +static int zcache_flush_object(int pool_id, struct tmem_oid *oidp)
2968 +{
2969 + struct tmem_pool *pool;
2970 + int ret = -1;
2971 + unsigned long flags;
2972 +
2973 + local_irq_save(flags);
2974 + zcache_flobj_total++;
2975 + pool = zcache_get_pool_by_id(pool_id);
2976 + if (likely(pool != NULL)) {
2977 + if (atomic_read(&pool->obj_count) > 0)
2978 + ret = tmem_flush_object(pool, oidp);
2979 + zcache_put_pool(pool);
2980 + }
2981 + if (ret >= 0)
2982 + zcache_flobj_found++;
2983 + local_irq_restore(flags);
2984 + return ret;
2985 +}
2986 +
2987 +static int zcache_destroy_pool(int pool_id)
2988 +{
2989 + struct tmem_pool *pool = NULL;
2990 + int ret = -1;
2991 +
2992 + if (pool_id < 0)
2993 + goto out;
2994 + pool = zcache_client.tmem_pools[pool_id];
2995 + if (pool == NULL)
2996 + goto out;
2997 + zcache_client.tmem_pools[pool_id] = NULL;
2998 + /* wait for pool activity on other cpus to quiesce */
2999 + while (atomic_read(&pool->refcount) != 0)
3000 + ;
3001 + local_bh_disable();
3002 + ret = tmem_destroy_pool(pool);
3003 + local_bh_enable();
3004 + kfree(pool);
3005 + pr_info("zcache: destroyed pool id=%d\n", pool_id);
3006 +out:
3007 + return ret;
3008 +}
3009 +
3010 +static int zcache_new_pool(uint32_t flags)
3011 +{
3012 + int poolid = -1;
3013 + struct tmem_pool *pool;
3014 +
3015 + pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
3016 + if (pool == NULL) {
3017 + pr_info("zcache: pool creation failed: out of memory\n");
3018 + goto out;
3019 + }
3020 +
3021 + for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
3022 + if (zcache_client.tmem_pools[poolid] == NULL)
3023 + break;
3024 + if (poolid >= MAX_POOLS_PER_CLIENT) {
3025 + pr_info("zcache: pool creation failed: max exceeded\n");
3026 + kfree(pool);
3027 + poolid = -1;
3028 + goto out;
3029 + }
3030 + atomic_set(&pool->refcount, 0);
3031 + pool->client = &zcache_client;
3032 + pool->pool_id = poolid;
3033 + tmem_new_pool(pool, flags);
3034 + zcache_client.tmem_pools[poolid] = pool;
3035 + pr_info("zcache: created %s tmem pool, id=%d\n",
3036 + flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
3037 + poolid);
3038 +out:
3039 + return poolid;
3040 +}
3041 +
3042 +/**********
3043 + * Two kernel functionalities currently can be layered on top of tmem.
3044 + * These are "cleancache" which is used as a second-chance cache for clean
3045 + * page cache pages; and "frontswap" which is used for swap pages
3046 + * to avoid writes to disk. A generic "shim" is provided here for each
3047 + * to translate in-kernel semantics to zcache semantics.
3048 + */
3049 +
3050 +#ifdef CONFIG_CLEANCACHE
3051 +static void zcache_cleancache_put_page(int pool_id,
3052 + struct cleancache_filekey key,
3053 + pgoff_t index, struct page *page)
3054 +{
3055 + u32 ind = (u32) index;
3056 + struct tmem_oid oid = *(struct tmem_oid *)&key;
3057 +
3058 + if (likely(ind == index))
3059 + (void)zcache_put_page(pool_id, &oid, index, page);
3060 +}
3061 +
3062 +static int zcache_cleancache_get_page(int pool_id,
3063 + struct cleancache_filekey key,
3064 + pgoff_t index, struct page *page)
3065 +{
3066 + u32 ind = (u32) index;
3067 + struct tmem_oid oid = *(struct tmem_oid *)&key;
3068 + int ret = -1;
3069 +
3070 + if (likely(ind == index))
3071 + ret = zcache_get_page(pool_id, &oid, index, page);
3072 + return ret;
3073 +}
3074 +
3075 +static void zcache_cleancache_flush_page(int pool_id,
3076 + struct cleancache_filekey key,
3077 + pgoff_t index)
3078 +{
3079 + u32 ind = (u32) index;
3080 + struct tmem_oid oid = *(struct tmem_oid *)&key;
3081 +
3082 + if (likely(ind == index))
3083 + (void)zcache_flush_page(pool_id, &oid, ind);
3084 +}
3085 +
3086 +static void zcache_cleancache_flush_inode(int pool_id,
3087 + struct cleancache_filekey key)
3088 +{
3089 + struct tmem_oid oid = *(struct tmem_oid *)&key;
3090 +
3091 + (void)zcache_flush_object(pool_id, &oid);
3092 +}
3093 +
3094 +static void zcache_cleancache_flush_fs(int pool_id)
3095 +{
3096 + if (pool_id >= 0)
3097 + (void)zcache_destroy_pool(pool_id);
3098 +}
3099 +
3100 +static int zcache_cleancache_init_fs(size_t pagesize)
3101 +{
3102 + BUG_ON(sizeof(struct cleancache_filekey) !=
3103 + sizeof(struct tmem_oid));
3104 + BUG_ON(pagesize != PAGE_SIZE);
3105 + return zcache_new_pool(0);
3106 +}
3107 +
3108 +static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
3109 +{
3110 + /* shared pools are unsupported and map to private */
3111 + BUG_ON(sizeof(struct cleancache_filekey) !=
3112 + sizeof(struct tmem_oid));
3113 + BUG_ON(pagesize != PAGE_SIZE);
3114 + return zcache_new_pool(0);
3115 +}
3116 +
3117 +static struct cleancache_ops zcache_cleancache_ops = {
3118 + .put_page = zcache_cleancache_put_page,
3119 + .get_page = zcache_cleancache_get_page,
3120 + .flush_page = zcache_cleancache_flush_page,
3121 + .flush_inode = zcache_cleancache_flush_inode,
3122 + .flush_fs = zcache_cleancache_flush_fs,
3123 + .init_shared_fs = zcache_cleancache_init_shared_fs,
3124 + .init_fs = zcache_cleancache_init_fs
3125 +};
3126 +
3127 +struct cleancache_ops zcache_cleancache_register_ops(void)
3128 +{
3129 + struct cleancache_ops old_ops =
3130 + cleancache_register_ops(&zcache_cleancache_ops);
3131 +
3132 + return old_ops;
3133 +}
3134 +#endif
3135 +
3136 +#ifdef CONFIG_FRONTSWAP
3137 +/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
3138 +static int zcache_frontswap_poolid = -1;
3139 +
3140 +/*
3141 + * Swizzling increases objects per swaptype, increasing tmem concurrency
3142 + * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
3143 + */
3144 +#define SWIZ_BITS 4
3145 +#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
3146 +#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
3147 +#define iswiz(_ind) (_ind >> SWIZ_BITS)
3148 +
3149 +static inline struct tmem_oid oswiz(unsigned type, u32 ind)
3150 +{
3151 + struct tmem_oid oid = { .oid = { 0 } };
3152 + oid.oid[0] = _oswiz(type, ind);
3153 + return oid;
3154 +}
3155 +
3156 +static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
3157 + struct page *page)
3158 +{
3159 + u64 ind64 = (u64)offset;
3160 + u32 ind = (u32)offset;
3161 + struct tmem_oid oid = oswiz(type, ind);
3162 + int ret = -1;
3163 + unsigned long flags;
3164 +
3165 + BUG_ON(!PageLocked(page));
3166 + if (likely(ind64 == ind)) {
3167 + local_irq_save(flags);
3168 + ret = zcache_put_page(zcache_frontswap_poolid, &oid,
3169 + iswiz(ind), page);
3170 + local_irq_restore(flags);
3171 + }
3172 + return ret;
3173 +}
3174 +
3175 +/* returns 0 if the page was successfully gotten from frontswap, -1 if
3176 + * was not present (should never happen!) */
3177 +static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
3178 + struct page *page)
3179 +{
3180 + u64 ind64 = (u64)offset;
3181 + u32 ind = (u32)offset;
3182 + struct tmem_oid oid = oswiz(type, ind);
3183 + int ret = -1;
3184 +
3185 + BUG_ON(!PageLocked(page));
3186 + if (likely(ind64 == ind))
3187 + ret = zcache_get_page(zcache_frontswap_poolid, &oid,
3188 + iswiz(ind), page);
3189 + return ret;
3190 +}
3191 +
3192 +/* flush a single page from frontswap */
3193 +static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
3194 +{
3195 + u64 ind64 = (u64)offset;
3196 + u32 ind = (u32)offset;
3197 + struct tmem_oid oid = oswiz(type, ind);
3198 +
3199 + if (likely(ind64 == ind))
3200 + (void)zcache_flush_page(zcache_frontswap_poolid, &oid,
3201 + iswiz(ind));
3202 +}
3203 +
3204 +/* flush all pages from the passed swaptype */
3205 +static void zcache_frontswap_flush_area(unsigned type)
3206 +{
3207 + struct tmem_oid oid;
3208 + int ind;
3209 +
3210 + for (ind = SWIZ_MASK; ind >= 0; ind--) {
3211 + oid = oswiz(type, ind);
3212 + (void)zcache_flush_object(zcache_frontswap_poolid, &oid);
3213 + }
3214 +}
3215 +
3216 +static void zcache_frontswap_init(unsigned ignored)
3217 +{
3218 + /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
3219 + if (zcache_frontswap_poolid < 0)
3220 + zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST);
3221 +}
3222 +
3223 +static struct frontswap_ops zcache_frontswap_ops = {
3224 + .put_page = zcache_frontswap_put_page,
3225 + .get_page = zcache_frontswap_get_page,
3226 + .flush_page = zcache_frontswap_flush_page,
3227 + .flush_area = zcache_frontswap_flush_area,
3228 + .init = zcache_frontswap_init
3229 +};
3230 +
3231 +struct frontswap_ops zcache_frontswap_register_ops(void)
3232 +{
3233 + struct frontswap_ops old_ops =
3234 + frontswap_register_ops(&zcache_frontswap_ops);
3235 +
3236 + return old_ops;
3237 +}
3238 +#endif
3239 +
3240 +/*
3241 + * zcache initialization
3242 + * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
3243 + * NOTHING HAPPENS!
3244 + */
3245 +
3246 +static int zcache_enabled;
3247 +
3248 +static int __init enable_zcache(char *s)
3249 +{
3250 + zcache_enabled = 1;
3251 + return 1;
3252 +}
3253 +__setup("zcache", enable_zcache);
3254 +
3255 +/* allow independent dynamic disabling of cleancache and frontswap */
3256 +
3257 +static int use_cleancache = 1;
3258 +
3259 +static int __init no_cleancache(char *s)
3260 +{
3261 + use_cleancache = 0;
3262 + return 1;
3263 +}
3264 +
3265 +__setup("nocleancache", no_cleancache);
3266 +
3267 +static int use_frontswap = 1;
3268 +
3269 +static int __init no_frontswap(char *s)
3270 +{
3271 + use_frontswap = 0;
3272 + return 1;
3273 +}
3274 +
3275 +__setup("nofrontswap", no_frontswap);
3276 +
3277 +static int __init zcache_init(void)
3278 +{
3279 +#ifdef CONFIG_SYSFS
3280 + int ret = 0;
3281 +
3282 + ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
3283 + if (ret) {
3284 + pr_err("zcache: can't create sysfs\n");
3285 + goto out;
3286 + }
3287 +#endif /* CONFIG_SYSFS */
3288 +#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
3289 + if (zcache_enabled) {
3290 + unsigned int cpu;
3291 +
3292 + tmem_register_hostops(&zcache_hostops);
3293 + tmem_register_pamops(&zcache_pamops);
3294 + ret = register_cpu_notifier(&zcache_cpu_notifier_block);
3295 + if (ret) {
3296 + pr_err("zcache: can't register cpu notifier\n");
3297 + goto out;
3298 + }
3299 + for_each_online_cpu(cpu) {
3300 + void *pcpu = (void *)(long)cpu;
3301 + zcache_cpu_notifier(&zcache_cpu_notifier_block,
3302 + CPU_UP_PREPARE, pcpu);
3303 + }
3304 + }
3305 + zcache_objnode_cache = kmem_cache_create("zcache_objnode",
3306 + sizeof(struct tmem_objnode), 0, 0, NULL);
3307 + zcache_obj_cache = kmem_cache_create("zcache_obj",
3308 + sizeof(struct tmem_obj), 0, 0, NULL);
3309 +#endif
3310 +#ifdef CONFIG_CLEANCACHE
3311 + if (zcache_enabled && use_cleancache) {
3312 + struct cleancache_ops old_ops;
3313 +
3314 + zbud_init();
3315 + register_shrinker(&zcache_shrinker);
3316 + old_ops = zcache_cleancache_register_ops();
3317 + pr_info("zcache: cleancache enabled using kernel "
3318 + "transcendent memory and compression buddies\n");
3319 + if (old_ops.init_fs != NULL)
3320 + pr_warning("zcache: cleancache_ops overridden");
3321 + }
3322 +#endif
3323 +#ifdef CONFIG_FRONTSWAP
3324 + if (zcache_enabled && use_frontswap) {
3325 + struct frontswap_ops old_ops;
3326 +
3327 + zcache_client.xvpool = xv_create_pool();
3328 + if (zcache_client.xvpool == NULL) {
3329 + pr_err("zcache: can't create xvpool\n");
3330 + goto out;
3331 + }
3332 + old_ops = zcache_frontswap_register_ops();
3333 + pr_info("zcache: frontswap enabled using kernel "
3334 + "transcendent memory and xvmalloc\n");
3335 + if (old_ops.init != NULL)
3336 + pr_warning("ktmem: frontswap_ops overridden");
3337 + }
3338 +#endif
3339 +out:
3340 + return ret;
3341 +}
3342 +
3343 +module_init(zcache_init)
