
Contents of /genpatches-2.6/tags/3.0-30/1038_linux-3.0.39.patch



Revision 2206
Mon Sep 17 18:58:14 2012 UTC by mpagano
File size: 76598 byte(s)
3.0-30 release
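
Several hunks below replace the old mems_allowed_change_disable counter and its memory barriers with a seqcount: get_mems_allowed() now returns a cookie from read_seqcount_begin(&current->mems_allowed_seq), and put_mems_allowed() reports whether mems_allowed changed underneath the caller, so allocators retry instead of failing spuriously when a cpuset update races with an allocation. The sketch that follows only distills that retry idiom from the mm/filemap.c and mm/page_alloc.c hunks; spread_page_alloc() is a hypothetical wrapper name used purely for illustration (the actual caller in the patch is __page_cache_alloc()), while get_mems_allowed(), put_mems_allowed(), cpuset_mem_spread_node() and alloc_pages_exact_node() are the interfaces the patch itself uses:

	/* Illustrative only: mirrors the __page_cache_alloc() hunk below */
	static struct page *spread_page_alloc(gfp_t gfp)
	{
		struct page *page;
		unsigned int cpuset_mems_cookie;

		do {
			/* snapshot current->mems_allowed_seq */
			cpuset_mems_cookie = get_mems_allowed();

			/* pick a node and try the allocation under that snapshot */
			page = alloc_pages_exact_node(cpuset_mem_spread_node(),
						      gfp, 0);

			/*
			 * Retry only if the allocation failed and mems_allowed
			 * changed concurrently (put_mems_allowed() returns
			 * false in that case).
			 */
		} while (!put_mems_allowed(cpuset_mems_cookie) && !page);

		return page;
	}

The other recurring change is threading enum migrate_mode (MIGRATE_ASYNC never blocks, MIGRATE_SYNC_LIGHT may block on most operations but not ->writepage, MIGRATE_SYNC may block fully) through the page-migration and compaction paths in place of the old bool sync flag.
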
1 diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
2 index 12cecc8..4a37c47 100644
3 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
4 +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
5 @@ -379,10 +379,10 @@ EVENT_PROCESS:
6
7 # To closer match vmstat scanning statistics, only count isolate_both
8 # and isolate_inactive as scanning. isolate_active is rotation
9 - # isolate_inactive == 0
10 - # isolate_active == 1
11 - # isolate_both == 2
12 - if ($isolate_mode != 1) {
13 + # isolate_inactive == 1
14 + # isolate_active == 2
15 + # isolate_both == 3
16 + if ($isolate_mode != 2) {
17 $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned;
18 }
19 $perprocesspid{$process_pid}->{HIGH_NR_CONTIG_DIRTY} += $nr_contig_dirty;
20 diff --git a/Makefile b/Makefile
21 index 5fdfaa8..3ec1722 100644
22 --- a/Makefile
23 +++ b/Makefile
24 @@ -1,6 +1,6 @@
25 VERSION = 3
26 PATCHLEVEL = 0
27 -SUBLEVEL = 38
28 +SUBLEVEL = 39
29 EXTRAVERSION =
30 NAME = Sneaky Weasel
31
32 diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
33 index 97f8bf6..adda036 100644
34 --- a/arch/mips/include/asm/thread_info.h
35 +++ b/arch/mips/include/asm/thread_info.h
36 @@ -60,6 +60,8 @@ struct thread_info {
37 register struct thread_info *__current_thread_info __asm__("$28");
38 #define current_thread_info() __current_thread_info
39
40 +#endif /* !__ASSEMBLY__ */
41 +
42 /* thread information allocation */
43 #if defined(CONFIG_PAGE_SIZE_4KB) && defined(CONFIG_32BIT)
44 #define THREAD_SIZE_ORDER (1)
45 @@ -97,8 +99,6 @@ register struct thread_info *__current_thread_info __asm__("$28");
46
47 #define free_thread_info(info) kfree(info)
48
49 -#endif /* !__ASSEMBLY__ */
50 -
51 #define PREEMPT_ACTIVE 0x10000000
52
53 /*
54 diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
55 index a81176f..be281c6 100644
56 --- a/arch/mips/kernel/vmlinux.lds.S
57 +++ b/arch/mips/kernel/vmlinux.lds.S
58 @@ -1,5 +1,6 @@
59 #include <asm/asm-offsets.h>
60 #include <asm/page.h>
61 +#include <asm/thread_info.h>
62 #include <asm-generic/vmlinux.lds.h>
63
64 #undef mips
65 @@ -73,7 +74,7 @@ SECTIONS
66 .data : { /* Data */
67 . = . + DATAOFFSET; /* for CONFIG_MAPPED_KERNEL */
68
69 - INIT_TASK_DATA(PAGE_SIZE)
70 + INIT_TASK_DATA(THREAD_SIZE)
71 NOSAVE_DATA
72 CACHELINE_ALIGNED_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT)
73 READ_MOSTLY_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT)
74 diff --git a/drivers/base/memory.c b/drivers/base/memory.c
75 index 45d7c8f..5fb6aae 100644
76 --- a/drivers/base/memory.c
77 +++ b/drivers/base/memory.c
78 @@ -224,13 +224,48 @@ int memory_isolate_notify(unsigned long val, void *v)
79 }
80
81 /*
82 + * The probe routines leave the pages reserved, just as the bootmem code does.
83 + * Make sure they're still that way.
84 + */
85 +static bool pages_correctly_reserved(unsigned long start_pfn,
86 + unsigned long nr_pages)
87 +{
88 + int i, j;
89 + struct page *page;
90 + unsigned long pfn = start_pfn;
91 +
92 + /*
93 + * memmap between sections is not contiguous except with
94 + * SPARSEMEM_VMEMMAP. We lookup the page once per section
95 + * and assume memmap is contiguous within each section
96 + */
97 + for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
98 + if (WARN_ON_ONCE(!pfn_valid(pfn)))
99 + return false;
100 + page = pfn_to_page(pfn);
101 +
102 + for (j = 0; j < PAGES_PER_SECTION; j++) {
103 + if (PageReserved(page + j))
104 + continue;
105 +
106 + printk(KERN_WARNING "section number %ld page number %d "
107 + "not reserved, was it already online?\n",
108 + pfn_to_section_nr(pfn), j);
109 +
110 + return false;
111 + }
112 + }
113 +
114 + return true;
115 +}
116 +
117 +/*
118 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
119 * OK to have direct references to sparsemem variables in here.
120 */
121 static int
122 memory_block_action(unsigned long phys_index, unsigned long action)
123 {
124 - int i;
125 unsigned long start_pfn, start_paddr;
126 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
127 struct page *first_page;
128 @@ -238,26 +273,13 @@ memory_block_action(unsigned long phys_index, unsigned long action)
129
130 first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);
131
132 - /*
133 - * The probe routines leave the pages reserved, just
134 - * as the bootmem code does. Make sure they're still
135 - * that way.
136 - */
137 - if (action == MEM_ONLINE) {
138 - for (i = 0; i < nr_pages; i++) {
139 - if (PageReserved(first_page+i))
140 - continue;
141 -
142 - printk(KERN_WARNING "section number %ld page number %d "
143 - "not reserved, was it already online?\n",
144 - phys_index, i);
145 - return -EBUSY;
146 - }
147 - }
148 -
149 switch (action) {
150 case MEM_ONLINE:
151 start_pfn = page_to_pfn(first_page);
152 +
153 + if (!pages_correctly_reserved(start_pfn, nr_pages))
154 + return -EBUSY;
155 +
156 ret = online_pages(start_pfn, nr_pages);
157 break;
158 case MEM_OFFLINE:
159 diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
160 index 9bfd057..42ef54f 100644
161 --- a/drivers/md/dm-raid1.c
162 +++ b/drivers/md/dm-raid1.c
163 @@ -1210,7 +1210,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
164 * We need to dec pending if this was a write.
165 */
166 if (rw == WRITE) {
167 - if (!(bio->bi_rw & REQ_FLUSH))
168 + if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)))
169 dm_rh_dec(ms->rh, map_context->ll);
170 return error;
171 }
172 diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
173 index 7771ed2..69732e0 100644
174 --- a/drivers/md/dm-region-hash.c
175 +++ b/drivers/md/dm-region-hash.c
176 @@ -404,6 +404,9 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
177 return;
178 }
179
180 + if (bio->bi_rw & REQ_DISCARD)
181 + return;
182 +
183 /* We must inform the log that the sync count has changed. */
184 log->type->set_region_sync(log, region, 0);
185
186 @@ -524,7 +527,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
187 struct bio *bio;
188
189 for (bio = bios->head; bio; bio = bio->bi_next) {
190 - if (bio->bi_rw & REQ_FLUSH)
191 + if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))
192 continue;
193 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
194 }
195 diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
196 index 1ac8db5d..57106a9 100644
197 --- a/fs/btrfs/disk-io.c
198 +++ b/fs/btrfs/disk-io.c
199 @@ -801,7 +801,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
200
201 #ifdef CONFIG_MIGRATION
202 static int btree_migratepage(struct address_space *mapping,
203 - struct page *newpage, struct page *page)
204 + struct page *newpage, struct page *page,
205 + enum migrate_mode mode)
206 {
207 /*
208 * we can't safely write a btree page from here,
209 @@ -816,7 +817,7 @@ static int btree_migratepage(struct address_space *mapping,
210 if (page_has_private(page) &&
211 !try_to_release_page(page, GFP_KERNEL))
212 return -EAGAIN;
213 - return migrate_page(mapping, newpage, page);
214 + return migrate_page(mapping, newpage, page, mode);
215 }
216 #endif
217
218 diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
219 index 6751e74..c71032b 100644
220 --- a/fs/cifs/readdir.c
221 +++ b/fs/cifs/readdir.c
222 @@ -85,9 +85,12 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
223
224 dentry = d_lookup(parent, name);
225 if (dentry) {
226 - /* FIXME: check for inode number changes? */
227 - if (dentry->d_inode != NULL)
228 + inode = dentry->d_inode;
229 + /* update inode in place if i_ino didn't change */
230 + if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
231 + cifs_fattr_to_inode(inode, fattr);
232 return dentry;
233 + }
234 d_drop(dentry);
235 dput(dentry);
236 }
237 diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
238 index 8b0c875..6327a06 100644
239 --- a/fs/hugetlbfs/inode.c
240 +++ b/fs/hugetlbfs/inode.c
241 @@ -568,7 +568,8 @@ static int hugetlbfs_set_page_dirty(struct page *page)
242 }
243
244 static int hugetlbfs_migrate_page(struct address_space *mapping,
245 - struct page *newpage, struct page *page)
246 + struct page *newpage, struct page *page,
247 + enum migrate_mode mode)
248 {
249 int rc;
250
251 diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
252 index 2a55347..4f10d81 100644
253 --- a/fs/nfs/internal.h
254 +++ b/fs/nfs/internal.h
255 @@ -315,7 +315,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data);
256
257 #ifdef CONFIG_MIGRATION
258 extern int nfs_migrate_page(struct address_space *,
259 - struct page *, struct page *);
260 + struct page *, struct page *, enum migrate_mode);
261 #else
262 #define nfs_migrate_page NULL
263 #endif
264 diff --git a/fs/nfs/write.c b/fs/nfs/write.c
265 index f2f80c0..58bb999 100644
266 --- a/fs/nfs/write.c
267 +++ b/fs/nfs/write.c
268 @@ -1662,7 +1662,7 @@ out_error:
269
270 #ifdef CONFIG_MIGRATION
271 int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
272 - struct page *page)
273 + struct page *page, enum migrate_mode mode)
274 {
275 /*
276 * If PagePrivate is set, then the page is currently associated with
277 @@ -1677,7 +1677,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
278
279 nfs_fscache_release_page(page, GFP_KERNEL);
280
281 - return migrate_page(mapping, newpage, page);
282 + return migrate_page(mapping, newpage, page, mode);
283 }
284 #endif
285
286 diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
287 index c606f01..1250016 100644
288 --- a/fs/ubifs/sb.c
289 +++ b/fs/ubifs/sb.c
290 @@ -715,8 +715,12 @@ static int fixup_free_space(struct ubifs_info *c)
291 lnum = ubifs_next_log_lnum(c, lnum);
292 }
293
294 - /* Fixup the current log head */
295 - err = fixup_leb(c, c->lhead_lnum, c->lhead_offs);
296 + /*
297 + * Fixup the log head which contains the only a CS node at the
298 + * beginning.
299 + */
300 + err = fixup_leb(c, c->lhead_lnum,
301 + ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size));
302 if (err)
303 goto out;
304
305 diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
306 index e9eaec5..7a7e5fd 100644
307 --- a/include/linux/cpuset.h
308 +++ b/include/linux/cpuset.h
309 @@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void);
310 extern void cpuset_print_task_mems_allowed(struct task_struct *p);
311
312 /*
313 - * reading current mems_allowed and mempolicy in the fastpath must protected
314 - * by get_mems_allowed()
315 + * get_mems_allowed is required when making decisions involving mems_allowed
316 + * such as during page allocation. mems_allowed can be updated in parallel
317 + * and depending on the new value an operation can fail potentially causing
318 + * process failure. A retry loop with get_mems_allowed and put_mems_allowed
319 + * prevents these artificial failures.
320 */
321 -static inline void get_mems_allowed(void)
322 +static inline unsigned int get_mems_allowed(void)
323 {
324 - current->mems_allowed_change_disable++;
325 -
326 - /*
327 - * ensure that reading mems_allowed and mempolicy happens after the
328 - * update of ->mems_allowed_change_disable.
329 - *
330 - * the write-side task finds ->mems_allowed_change_disable is not 0,
331 - * and knows the read-side task is reading mems_allowed or mempolicy,
332 - * so it will clear old bits lazily.
333 - */
334 - smp_mb();
335 + return read_seqcount_begin(&current->mems_allowed_seq);
336 }
337
338 -static inline void put_mems_allowed(void)
339 +/*
340 + * If this returns false, the operation that took place after get_mems_allowed
341 + * may have failed. It is up to the caller to retry the operation if
342 + * appropriate.
343 + */
344 +static inline bool put_mems_allowed(unsigned int seq)
345 {
346 - /*
347 - * ensure that reading mems_allowed and mempolicy before reducing
348 - * mems_allowed_change_disable.
349 - *
350 - * the write-side task will know that the read-side task is still
351 - * reading mems_allowed or mempolicy, don't clears old bits in the
352 - * nodemask.
353 - */
354 - smp_mb();
355 - --ACCESS_ONCE(current->mems_allowed_change_disable);
356 + return !read_seqcount_retry(&current->mems_allowed_seq, seq);
357 }
358
359 static inline void set_mems_allowed(nodemask_t nodemask)
360 {
361 task_lock(current);
362 + write_seqcount_begin(&current->mems_allowed_seq);
363 current->mems_allowed = nodemask;
364 + write_seqcount_end(&current->mems_allowed_seq);
365 task_unlock(current);
366 }
367
368 @@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
369 {
370 }
371
372 -static inline void get_mems_allowed(void)
373 +static inline unsigned int get_mems_allowed(void)
374 {
375 + return 0;
376 }
377
378 -static inline void put_mems_allowed(void)
379 +static inline bool put_mems_allowed(unsigned int seq)
380 {
381 + return true;
382 }
383
384 #endif /* !CONFIG_CPUSETS */
385 diff --git a/include/linux/fs.h b/include/linux/fs.h
386 index 96b1035..212ea7b 100644
387 --- a/include/linux/fs.h
388 +++ b/include/linux/fs.h
389 @@ -523,6 +523,7 @@ enum positive_aop_returns {
390 struct page;
391 struct address_space;
392 struct writeback_control;
393 +enum migrate_mode;
394
395 struct iov_iter {
396 const struct iovec *iov;
397 @@ -607,9 +608,12 @@ struct address_space_operations {
398 loff_t offset, unsigned long nr_segs);
399 int (*get_xip_mem)(struct address_space *, pgoff_t, int,
400 void **, unsigned long *);
401 - /* migrate the contents of a page to the specified target */
402 + /*
403 + * migrate the contents of a page to the specified target. If sync
404 + * is false, it must not block.
405 + */
406 int (*migratepage) (struct address_space *,
407 - struct page *, struct page *);
408 + struct page *, struct page *, enum migrate_mode);
409 int (*launder_page) (struct page *);
410 int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
411 unsigned long);
412 @@ -2478,7 +2482,8 @@ extern int generic_check_addressable(unsigned, u64);
413
414 #ifdef CONFIG_MIGRATION
415 extern int buffer_migrate_page(struct address_space *,
416 - struct page *, struct page *);
417 + struct page *, struct page *,
418 + enum migrate_mode);
419 #else
420 #define buffer_migrate_page NULL
421 #endif
422 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
423 index 580f70c..5e41a8e 100644
424 --- a/include/linux/init_task.h
425 +++ b/include/linux/init_task.h
426 @@ -30,6 +30,13 @@ extern struct fs_struct init_fs;
427 #define INIT_THREADGROUP_FORK_LOCK(sig)
428 #endif
429
430 +#ifdef CONFIG_CPUSETS
431 +#define INIT_CPUSET_SEQ \
432 + .mems_allowed_seq = SEQCNT_ZERO,
433 +#else
434 +#define INIT_CPUSET_SEQ
435 +#endif
436 +
437 #define INIT_SIGNALS(sig) { \
438 .nr_threads = 1, \
439 .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
440 @@ -193,6 +200,7 @@ extern struct cred init_cred;
441 INIT_FTRACE_GRAPH \
442 INIT_TRACE_RECURSION \
443 INIT_TASK_RCU_PREEMPT(tsk) \
444 + INIT_CPUSET_SEQ \
445 }
446
447
448 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
449 index 313a00e..4a8da84 100644
450 --- a/include/linux/memcontrol.h
451 +++ b/include/linux/memcontrol.h
452 @@ -35,7 +35,8 @@ enum mem_cgroup_page_stat_item {
453 extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
454 struct list_head *dst,
455 unsigned long *scanned, int order,
456 - int mode, struct zone *z,
457 + isolate_mode_t mode,
458 + struct zone *z,
459 struct mem_cgroup *mem_cont,
460 int active, int file);
461
462 diff --git a/include/linux/migrate.h b/include/linux/migrate.h
463 index e39aeec..eaf8674 100644
464 --- a/include/linux/migrate.h
465 +++ b/include/linux/migrate.h
466 @@ -6,18 +6,31 @@
467
468 typedef struct page *new_page_t(struct page *, unsigned long private, int **);
469
470 +/*
471 + * MIGRATE_ASYNC means never block
472 + * MIGRATE_SYNC_LIGHT in the current implementation means to allow blocking
473 + * on most operations but not ->writepage as the potential stall time
474 + * is too significant
475 + * MIGRATE_SYNC will block when migrating pages
476 + */
477 +enum migrate_mode {
478 + MIGRATE_ASYNC,
479 + MIGRATE_SYNC_LIGHT,
480 + MIGRATE_SYNC,
481 +};
482 +
483 #ifdef CONFIG_MIGRATION
484 #define PAGE_MIGRATION 1
485
486 extern void putback_lru_pages(struct list_head *l);
487 extern int migrate_page(struct address_space *,
488 - struct page *, struct page *);
489 + struct page *, struct page *, enum migrate_mode);
490 extern int migrate_pages(struct list_head *l, new_page_t x,
491 unsigned long private, bool offlining,
492 - bool sync);
493 + enum migrate_mode mode);
494 extern int migrate_huge_pages(struct list_head *l, new_page_t x,
495 unsigned long private, bool offlining,
496 - bool sync);
497 + enum migrate_mode mode);
498
499 extern int fail_migrate_page(struct address_space *,
500 struct page *, struct page *);
501 @@ -36,10 +49,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
502 static inline void putback_lru_pages(struct list_head *l) {}
503 static inline int migrate_pages(struct list_head *l, new_page_t x,
504 unsigned long private, bool offlining,
505 - bool sync) { return -ENOSYS; }
506 + enum migrate_mode mode) { return -ENOSYS; }
507 static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
508 unsigned long private, bool offlining,
509 - bool sync) { return -ENOSYS; }
510 + enum migrate_mode mode) { return -ENOSYS; }
511
512 static inline int migrate_prep(void) { return -ENOSYS; }
513 static inline int migrate_prep_local(void) { return -ENOSYS; }
514 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
515 index aa2d80b..b32f3f9 100644
516 --- a/include/linux/mmzone.h
517 +++ b/include/linux/mmzone.h
518 @@ -158,6 +158,20 @@ static inline int is_unevictable_lru(enum lru_list l)
519 return (l == LRU_UNEVICTABLE);
520 }
521
522 +/* Isolate inactive pages */
523 +#define ISOLATE_INACTIVE ((__force isolate_mode_t)0x1)
524 +/* Isolate active pages */
525 +#define ISOLATE_ACTIVE ((__force isolate_mode_t)0x2)
526 +/* Isolate clean file */
527 +#define ISOLATE_CLEAN ((__force isolate_mode_t)0x4)
528 +/* Isolate unmapped file */
529 +#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8)
530 +/* Isolate for asynchronous migration */
531 +#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x10)
532 +
533 +/* LRU Isolation modes. */
534 +typedef unsigned __bitwise__ isolate_mode_t;
535 +
536 enum zone_watermarks {
537 WMARK_MIN,
538 WMARK_LOW,
539 diff --git a/include/linux/sched.h b/include/linux/sched.h
540 index 4ef452b..443ec43 100644
541 --- a/include/linux/sched.h
542 +++ b/include/linux/sched.h
543 @@ -1484,7 +1484,7 @@ struct task_struct {
544 #endif
545 #ifdef CONFIG_CPUSETS
546 nodemask_t mems_allowed; /* Protected by alloc_lock */
547 - int mems_allowed_change_disable;
548 + seqcount_t mems_allowed_seq; /* Seqence no to catch updates */
549 int cpuset_mem_spread_rotor;
550 int cpuset_slab_spread_rotor;
551 #endif
552 diff --git a/include/linux/swap.h b/include/linux/swap.h
553 index a273468..e73799d 100644
554 --- a/include/linux/swap.h
555 +++ b/include/linux/swap.h
556 @@ -243,11 +243,6 @@ static inline void lru_cache_add_file(struct page *page)
557 __lru_cache_add(page, LRU_INACTIVE_FILE);
558 }
559
560 -/* LRU Isolation modes. */
561 -#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
562 -#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
563 -#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
564 -
565 /* linux/mm/vmscan.c */
566 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
567 gfp_t gfp_mask, nodemask_t *mask);
568 @@ -259,7 +254,7 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
569 unsigned int swappiness,
570 struct zone *zone,
571 unsigned long *nr_scanned);
572 -extern int __isolate_lru_page(struct page *page, int mode, int file);
573 +extern int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file);
574 extern unsigned long shrink_all_memory(unsigned long nr_pages);
575 extern int vm_swappiness;
576 extern int remove_mapping(struct address_space *mapping, struct page *page);
577 diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
578 index b2c33bd..edc4b3d 100644
579 --- a/include/trace/events/vmscan.h
580 +++ b/include/trace/events/vmscan.h
581 @@ -179,6 +179,83 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re
582 TP_ARGS(nr_reclaimed)
583 );
584
585 +TRACE_EVENT(mm_shrink_slab_start,
586 + TP_PROTO(struct shrinker *shr, struct shrink_control *sc,
587 + long nr_objects_to_shrink, unsigned long pgs_scanned,
588 + unsigned long lru_pgs, unsigned long cache_items,
589 + unsigned long long delta, unsigned long total_scan),
590 +
591 + TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs,
592 + cache_items, delta, total_scan),
593 +
594 + TP_STRUCT__entry(
595 + __field(struct shrinker *, shr)
596 + __field(void *, shrink)
597 + __field(long, nr_objects_to_shrink)
598 + __field(gfp_t, gfp_flags)
599 + __field(unsigned long, pgs_scanned)
600 + __field(unsigned long, lru_pgs)
601 + __field(unsigned long, cache_items)
602 + __field(unsigned long long, delta)
603 + __field(unsigned long, total_scan)
604 + ),
605 +
606 + TP_fast_assign(
607 + __entry->shr = shr;
608 + __entry->shrink = shr->shrink;
609 + __entry->nr_objects_to_shrink = nr_objects_to_shrink;
610 + __entry->gfp_flags = sc->gfp_mask;
611 + __entry->pgs_scanned = pgs_scanned;
612 + __entry->lru_pgs = lru_pgs;
613 + __entry->cache_items = cache_items;
614 + __entry->delta = delta;
615 + __entry->total_scan = total_scan;
616 + ),
617 +
618 + TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld",
619 + __entry->shrink,
620 + __entry->shr,
621 + __entry->nr_objects_to_shrink,
622 + show_gfp_flags(__entry->gfp_flags),
623 + __entry->pgs_scanned,
624 + __entry->lru_pgs,
625 + __entry->cache_items,
626 + __entry->delta,
627 + __entry->total_scan)
628 +);
629 +
630 +TRACE_EVENT(mm_shrink_slab_end,
631 + TP_PROTO(struct shrinker *shr, int shrinker_retval,
632 + long unused_scan_cnt, long new_scan_cnt),
633 +
634 + TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt),
635 +
636 + TP_STRUCT__entry(
637 + __field(struct shrinker *, shr)
638 + __field(void *, shrink)
639 + __field(long, unused_scan)
640 + __field(long, new_scan)
641 + __field(int, retval)
642 + __field(long, total_scan)
643 + ),
644 +
645 + TP_fast_assign(
646 + __entry->shr = shr;
647 + __entry->shrink = shr->shrink;
648 + __entry->unused_scan = unused_scan_cnt;
649 + __entry->new_scan = new_scan_cnt;
650 + __entry->retval = shrinker_retval;
651 + __entry->total_scan = new_scan_cnt - unused_scan_cnt;
652 + ),
653 +
654 + TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d",
655 + __entry->shrink,
656 + __entry->shr,
657 + __entry->unused_scan,
658 + __entry->new_scan,
659 + __entry->total_scan,
660 + __entry->retval)
661 +);
662
663 DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
664
665 @@ -189,7 +266,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
666 unsigned long nr_lumpy_taken,
667 unsigned long nr_lumpy_dirty,
668 unsigned long nr_lumpy_failed,
669 - int isolate_mode),
670 + isolate_mode_t isolate_mode),
671
672 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode),
673
674 @@ -201,7 +278,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
675 __field(unsigned long, nr_lumpy_taken)
676 __field(unsigned long, nr_lumpy_dirty)
677 __field(unsigned long, nr_lumpy_failed)
678 - __field(int, isolate_mode)
679 + __field(isolate_mode_t, isolate_mode)
680 ),
681
682 TP_fast_assign(
683 @@ -235,7 +312,7 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
684 unsigned long nr_lumpy_taken,
685 unsigned long nr_lumpy_dirty,
686 unsigned long nr_lumpy_failed,
687 - int isolate_mode),
688 + isolate_mode_t isolate_mode),
689
690 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode)
691
692 @@ -250,7 +327,7 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
693 unsigned long nr_lumpy_taken,
694 unsigned long nr_lumpy_dirty,
695 unsigned long nr_lumpy_failed,
696 - int isolate_mode),
697 + isolate_mode_t isolate_mode),
698
699 TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode)
700
701 diff --git a/kernel/cpuset.c b/kernel/cpuset.c
702 index 9c9b754..b2e84bd 100644
703 --- a/kernel/cpuset.c
704 +++ b/kernel/cpuset.c
705 @@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task)
706 struct cpuset, css);
707 }
708
709 +#ifdef CONFIG_NUMA
710 +static inline bool task_has_mempolicy(struct task_struct *task)
711 +{
712 + return task->mempolicy;
713 +}
714 +#else
715 +static inline bool task_has_mempolicy(struct task_struct *task)
716 +{
717 + return false;
718 +}
719 +#endif
720 +
721 +
722 /* bits in struct cpuset flags field */
723 typedef enum {
724 CS_CPU_EXCLUSIVE,
725 @@ -949,7 +962,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
726 static void cpuset_change_task_nodemask(struct task_struct *tsk,
727 nodemask_t *newmems)
728 {
729 -repeat:
730 + bool need_loop;
731 +
732 /*
733 * Allow tasks that have access to memory reserves because they have
734 * been OOM killed to get memory anywhere.
735 @@ -960,46 +974,27 @@ repeat:
736 return;
737
738 task_lock(tsk);
739 - nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
740 - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
741 -
742 -
743 /*
744 - * ensure checking ->mems_allowed_change_disable after setting all new
745 - * allowed nodes.
746 - *
747 - * the read-side task can see an nodemask with new allowed nodes and
748 - * old allowed nodes. and if it allocates page when cpuset clears newly
749 - * disallowed ones continuous, it can see the new allowed bits.
750 - *
751 - * And if setting all new allowed nodes is after the checking, setting
752 - * all new allowed nodes and clearing newly disallowed ones will be done
753 - * continuous, and the read-side task may find no node to alloc page.
754 + * Determine if a loop is necessary if another thread is doing
755 + * get_mems_allowed(). If at least one node remains unchanged and
756 + * tsk does not have a mempolicy, then an empty nodemask will not be
757 + * possible when mems_allowed is larger than a word.
758 */
759 - smp_mb();
760 + need_loop = task_has_mempolicy(tsk) ||
761 + !nodes_intersects(*newmems, tsk->mems_allowed);
762
763 - /*
764 - * Allocation of memory is very fast, we needn't sleep when waiting
765 - * for the read-side.
766 - */
767 - while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
768 - task_unlock(tsk);
769 - if (!task_curr(tsk))
770 - yield();
771 - goto repeat;
772 - }
773 + if (need_loop)
774 + write_seqcount_begin(&tsk->mems_allowed_seq);
775
776 - /*
777 - * ensure checking ->mems_allowed_change_disable before clearing all new
778 - * disallowed nodes.
779 - *
780 - * if clearing newly disallowed bits before the checking, the read-side
781 - * task may find no node to alloc page.
782 - */
783 - smp_mb();
784 + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
785 + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
786
787 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
788 tsk->mems_allowed = *newmems;
789 +
790 + if (need_loop)
791 + write_seqcount_end(&tsk->mems_allowed_seq);
792 +
793 task_unlock(tsk);
794 }
795
796 diff --git a/kernel/fork.c b/kernel/fork.c
797 index 4712e3e..3d42aa3 100644
798 --- a/kernel/fork.c
799 +++ b/kernel/fork.c
800 @@ -985,6 +985,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
801 #ifdef CONFIG_CGROUPS
802 init_rwsem(&sig->threadgroup_fork_lock);
803 #endif
804 +#ifdef CONFIG_CPUSETS
805 + seqcount_init(&tsk->mems_allowed_seq);
806 +#endif
807
808 sig->oom_adj = current->signal->oom_adj;
809 sig->oom_score_adj = current->signal->oom_score_adj;
810 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
811 index f1eb182..61fc450 100644
812 --- a/kernel/time/ntp.c
813 +++ b/kernel/time/ntp.c
814 @@ -375,7 +375,9 @@ int second_overflow(unsigned long secs)
815 time_state = TIME_DEL;
816 break;
817 case TIME_INS:
818 - if (secs % 86400 == 0) {
819 + if (!(time_status & STA_INS))
820 + time_state = TIME_OK;
821 + else if (secs % 86400 == 0) {
822 leap = -1;
823 time_state = TIME_OOP;
824 time_tai++;
825 @@ -384,7 +386,9 @@ int second_overflow(unsigned long secs)
826 }
827 break;
828 case TIME_DEL:
829 - if ((secs + 1) % 86400 == 0) {
830 + if (!(time_status & STA_DEL))
831 + time_state = TIME_OK;
832 + else if ((secs + 1) % 86400 == 0) {
833 leap = 1;
834 time_tai--;
835 time_state = TIME_WAIT;
836 diff --git a/mm/compaction.c b/mm/compaction.c
837 index adc5336..8ea7308 100644
838 --- a/mm/compaction.c
839 +++ b/mm/compaction.c
840 @@ -35,10 +35,6 @@ struct compact_control {
841 unsigned long migrate_pfn; /* isolate_migratepages search base */
842 bool sync; /* Synchronous migration */
843
844 - /* Account for isolated anon and file pages */
845 - unsigned long nr_anon;
846 - unsigned long nr_file;
847 -
848 unsigned int order; /* order a direct compactor needs */
849 int migratetype; /* MOVABLE, RECLAIMABLE etc */
850 struct zone *zone;
851 @@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
852 static void acct_isolated(struct zone *zone, struct compact_control *cc)
853 {
854 struct page *page;
855 - unsigned int count[NR_LRU_LISTS] = { 0, };
856 + unsigned int count[2] = { 0, };
857
858 - list_for_each_entry(page, &cc->migratepages, lru) {
859 - int lru = page_lru_base_type(page);
860 - count[lru]++;
861 - }
862 + list_for_each_entry(page, &cc->migratepages, lru)
863 + count[!!page_is_file_cache(page)]++;
864
865 - cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
866 - cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
867 - __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
868 - __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
869 + __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
870 + __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
871 }
872
873 /* Similar to reclaim, but different enough that they don't share logic */
874 @@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
875 unsigned long last_pageblock_nr = 0, pageblock_nr;
876 unsigned long nr_scanned = 0, nr_isolated = 0;
877 struct list_head *migratelist = &cc->migratepages;
878 + isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
879
880 /* Do not scan outside zone boundaries */
881 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
882 @@ -378,8 +371,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
883 continue;
884 }
885
886 + if (!cc->sync)
887 + mode |= ISOLATE_ASYNC_MIGRATE;
888 +
889 /* Try isolate the page */
890 - if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
891 + if (__isolate_lru_page(page, mode, 0) != 0)
892 continue;
893
894 VM_BUG_ON(PageTransCompound(page));
895 @@ -581,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
896 nr_migrate = cc->nr_migratepages;
897 err = migrate_pages(&cc->migratepages, compaction_alloc,
898 (unsigned long)cc, false,
899 - cc->sync);
900 + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
901 update_nr_listpages(cc);
902 nr_remaining = cc->nr_migratepages;
903
904 diff --git a/mm/filemap.c b/mm/filemap.c
905 index b7d8603..10481eb 100644
906 --- a/mm/filemap.c
907 +++ b/mm/filemap.c
908 @@ -516,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
909 struct page *page;
910
911 if (cpuset_do_page_mem_spread()) {
912 - get_mems_allowed();
913 - n = cpuset_mem_spread_node();
914 - page = alloc_pages_exact_node(n, gfp, 0);
915 - put_mems_allowed();
916 + unsigned int cpuset_mems_cookie;
917 + do {
918 + cpuset_mems_cookie = get_mems_allowed();
919 + n = cpuset_mem_spread_node();
920 + page = alloc_pages_exact_node(n, gfp, 0);
921 + } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
922 +
923 return page;
924 }
925 return alloc_pages(gfp, 0);
926 diff --git a/mm/hugetlb.c b/mm/hugetlb.c
927 index 05f8fd4..ae60a53 100644
928 --- a/mm/hugetlb.c
929 +++ b/mm/hugetlb.c
930 @@ -460,8 +460,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
931 struct zonelist *zonelist;
932 struct zone *zone;
933 struct zoneref *z;
934 + unsigned int cpuset_mems_cookie;
935
936 - get_mems_allowed();
937 +retry_cpuset:
938 + cpuset_mems_cookie = get_mems_allowed();
939 zonelist = huge_zonelist(vma, address,
940 htlb_alloc_mask, &mpol, &nodemask);
941 /*
942 @@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
943 }
944 }
945 }
946 -err:
947 +
948 mpol_cond_put(mpol);
949 - put_mems_allowed();
950 + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
951 + goto retry_cpuset;
952 return page;
953 +
954 +err:
955 + mpol_cond_put(mpol);
956 + return NULL;
957 }
958
959 static void update_and_free_page(struct hstate *h, struct page *page)
960 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
961 index ffb99b4..57cdf5a 100644
962 --- a/mm/memcontrol.c
963 +++ b/mm/memcontrol.c
964 @@ -1251,7 +1251,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
965 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
966 struct list_head *dst,
967 unsigned long *scanned, int order,
968 - int mode, struct zone *z,
969 + isolate_mode_t mode,
970 + struct zone *z,
971 struct mem_cgroup *mem_cont,
972 int active, int file)
973 {
974 diff --git a/mm/memory-failure.c b/mm/memory-failure.c
975 index 740c4f5..6496748 100644
976 --- a/mm/memory-failure.c
977 +++ b/mm/memory-failure.c
978 @@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags)
979 page_is_file_cache(page));
980 list_add(&page->lru, &pagelist);
981 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
982 - 0, true);
983 + 0, MIGRATE_SYNC);
984 if (ret) {
985 putback_lru_pages(&pagelist);
986 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
987 diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
988 index c46887b..ae5a3f2 100644
989 --- a/mm/memory_hotplug.c
990 +++ b/mm/memory_hotplug.c
991 @@ -747,7 +747,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
992 }
993 /* this function returns # of failed pages */
994 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
995 - true, true);
996 + true, MIGRATE_SYNC);
997 if (ret)
998 putback_lru_pages(&source);
999 }
1000 diff --git a/mm/mempolicy.c b/mm/mempolicy.c
1001 index 3dac2d1..cff919f 100644
1002 --- a/mm/mempolicy.c
1003 +++ b/mm/mempolicy.c
1004 @@ -926,7 +926,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1005
1006 if (!list_empty(&pagelist)) {
1007 err = migrate_pages(&pagelist, new_node_page, dest,
1008 - false, true);
1009 + false, MIGRATE_SYNC);
1010 if (err)
1011 putback_lru_pages(&pagelist);
1012 }
1013 @@ -1810,18 +1810,24 @@ struct page *
1014 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1015 unsigned long addr, int node)
1016 {
1017 - struct mempolicy *pol = get_vma_policy(current, vma, addr);
1018 + struct mempolicy *pol;
1019 struct zonelist *zl;
1020 struct page *page;
1021 + unsigned int cpuset_mems_cookie;
1022 +
1023 +retry_cpuset:
1024 + pol = get_vma_policy(current, vma, addr);
1025 + cpuset_mems_cookie = get_mems_allowed();
1026
1027 - get_mems_allowed();
1028 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1029 unsigned nid;
1030
1031 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1032 mpol_cond_put(pol);
1033 page = alloc_page_interleave(gfp, order, nid);
1034 - put_mems_allowed();
1035 + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1036 + goto retry_cpuset;
1037 +
1038 return page;
1039 }
1040 zl = policy_zonelist(gfp, pol, node);
1041 @@ -1832,7 +1838,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1042 struct page *page = __alloc_pages_nodemask(gfp, order,
1043 zl, policy_nodemask(gfp, pol));
1044 __mpol_put(pol);
1045 - put_mems_allowed();
1046 + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1047 + goto retry_cpuset;
1048 return page;
1049 }
1050 /*
1051 @@ -1840,7 +1847,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1052 */
1053 page = __alloc_pages_nodemask(gfp, order, zl,
1054 policy_nodemask(gfp, pol));
1055 - put_mems_allowed();
1056 + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1057 + goto retry_cpuset;
1058 return page;
1059 }
1060
1061 @@ -1867,11 +1875,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1062 {
1063 struct mempolicy *pol = current->mempolicy;
1064 struct page *page;
1065 + unsigned int cpuset_mems_cookie;
1066
1067 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1068 pol = &default_policy;
1069
1070 - get_mems_allowed();
1071 +retry_cpuset:
1072 + cpuset_mems_cookie = get_mems_allowed();
1073 +
1074 /*
1075 * No reference counting needed for current->mempolicy
1076 * nor system default_policy
1077 @@ -1882,7 +1893,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1078 page = __alloc_pages_nodemask(gfp, order,
1079 policy_zonelist(gfp, pol, numa_node_id()),
1080 policy_nodemask(gfp, pol));
1081 - put_mems_allowed();
1082 +
1083 + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1084 + goto retry_cpuset;
1085 +
1086 return page;
1087 }
1088 EXPORT_SYMBOL(alloc_pages_current);
1089 diff --git a/mm/migrate.c b/mm/migrate.c
1090 index 14d0a6a..480714b 100644
1091 --- a/mm/migrate.c
1092 +++ b/mm/migrate.c
1093 @@ -220,6 +220,56 @@ out:
1094 pte_unmap_unlock(ptep, ptl);
1095 }
1096
1097 +#ifdef CONFIG_BLOCK
1098 +/* Returns true if all buffers are successfully locked */
1099 +static bool buffer_migrate_lock_buffers(struct buffer_head *head,
1100 + enum migrate_mode mode)
1101 +{
1102 + struct buffer_head *bh = head;
1103 +
1104 + /* Simple case, sync compaction */
1105 + if (mode != MIGRATE_ASYNC) {
1106 + do {
1107 + get_bh(bh);
1108 + lock_buffer(bh);
1109 + bh = bh->b_this_page;
1110 +
1111 + } while (bh != head);
1112 +
1113 + return true;
1114 + }
1115 +
1116 + /* async case, we cannot block on lock_buffer so use trylock_buffer */
1117 + do {
1118 + get_bh(bh);
1119 + if (!trylock_buffer(bh)) {
1120 + /*
1121 + * We failed to lock the buffer and cannot stall in
1122 + * async migration. Release the taken locks
1123 + */
1124 + struct buffer_head *failed_bh = bh;
1125 + put_bh(failed_bh);
1126 + bh = head;
1127 + while (bh != failed_bh) {
1128 + unlock_buffer(bh);
1129 + put_bh(bh);
1130 + bh = bh->b_this_page;
1131 + }
1132 + return false;
1133 + }
1134 +
1135 + bh = bh->b_this_page;
1136 + } while (bh != head);
1137 + return true;
1138 +}
1139 +#else
1140 +static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
1141 + enum migrate_mode mode)
1142 +{
1143 + return true;
1144 +}
1145 +#endif /* CONFIG_BLOCK */
1146 +
1147 /*
1148 * Replace the page in the mapping.
1149 *
1150 @@ -229,7 +279,8 @@ out:
1151 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
1152 */
1153 static int migrate_page_move_mapping(struct address_space *mapping,
1154 - struct page *newpage, struct page *page)
1155 + struct page *newpage, struct page *page,
1156 + struct buffer_head *head, enum migrate_mode mode)
1157 {
1158 int expected_count;
1159 void **pslot;
1160 @@ -259,6 +310,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
1161 }
1162
1163 /*
1164 + * In the async migration case of moving a page with buffers, lock the
1165 + * buffers using trylock before the mapping is moved. If the mapping
1166 + * was moved, we later failed to lock the buffers and could not move
1167 + * the mapping back due to an elevated page count, we would have to
1168 + * block waiting on other references to be dropped.
1169 + */
1170 + if (mode == MIGRATE_ASYNC && head &&
1171 + !buffer_migrate_lock_buffers(head, mode)) {
1172 + page_unfreeze_refs(page, expected_count);
1173 + spin_unlock_irq(&mapping->tree_lock);
1174 + return -EAGAIN;
1175 + }
1176 +
1177 + /*
1178 * Now we know that no one else is looking at the page.
1179 */
1180 get_page(newpage); /* add cache reference */
1181 @@ -415,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page);
1182 * Pages are locked upon entry and exit.
1183 */
1184 int migrate_page(struct address_space *mapping,
1185 - struct page *newpage, struct page *page)
1186 + struct page *newpage, struct page *page,
1187 + enum migrate_mode mode)
1188 {
1189 int rc;
1190
1191 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
1192
1193 - rc = migrate_page_move_mapping(mapping, newpage, page);
1194 + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
1195
1196 if (rc)
1197 return rc;
1198 @@ -438,28 +504,28 @@ EXPORT_SYMBOL(migrate_page);
1199 * exist.
1200 */
1201 int buffer_migrate_page(struct address_space *mapping,
1202 - struct page *newpage, struct page *page)
1203 + struct page *newpage, struct page *page, enum migrate_mode mode)
1204 {
1205 struct buffer_head *bh, *head;
1206 int rc;
1207
1208 if (!page_has_buffers(page))
1209 - return migrate_page(mapping, newpage, page);
1210 + return migrate_page(mapping, newpage, page, mode);
1211
1212 head = page_buffers(page);
1213
1214 - rc = migrate_page_move_mapping(mapping, newpage, page);
1215 + rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
1216
1217 if (rc)
1218 return rc;
1219
1220 - bh = head;
1221 - do {
1222 - get_bh(bh);
1223 - lock_buffer(bh);
1224 - bh = bh->b_this_page;
1225 -
1226 - } while (bh != head);
1227 + /*
1228 + * In the async case, migrate_page_move_mapping locked the buffers
1229 + * with an IRQ-safe spinlock held. In the sync case, the buffers
1230 + * need to be locked now
1231 + */
1232 + if (mode != MIGRATE_ASYNC)
1233 + BUG_ON(!buffer_migrate_lock_buffers(head, mode));
1234
1235 ClearPagePrivate(page);
1236 set_page_private(newpage, page_private(page));
1237 @@ -536,10 +602,14 @@ static int writeout(struct address_space *mapping, struct page *page)
1238 * Default handling if a filesystem does not provide a migration function.
1239 */
1240 static int fallback_migrate_page(struct address_space *mapping,
1241 - struct page *newpage, struct page *page)
1242 + struct page *newpage, struct page *page, enum migrate_mode mode)
1243 {
1244 - if (PageDirty(page))
1245 + if (PageDirty(page)) {
1246 + /* Only writeback pages in full synchronous migration */
1247 + if (mode != MIGRATE_SYNC)
1248 + return -EBUSY;
1249 return writeout(mapping, page);
1250 + }
1251
1252 /*
1253 * Buffers may be managed in a filesystem specific way.
1254 @@ -549,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping,
1255 !try_to_release_page(page, GFP_KERNEL))
1256 return -EAGAIN;
1257
1258 - return migrate_page(mapping, newpage, page);
1259 + return migrate_page(mapping, newpage, page, mode);
1260 }
1261
1262 /*
1263 @@ -564,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping,
1264 * == 0 - success
1265 */
1266 static int move_to_new_page(struct page *newpage, struct page *page,
1267 - int remap_swapcache, bool sync)
1268 + int remap_swapcache, enum migrate_mode mode)
1269 {
1270 struct address_space *mapping;
1271 int rc;
1272 @@ -585,29 +655,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
1273
1274 mapping = page_mapping(page);
1275 if (!mapping)
1276 - rc = migrate_page(mapping, newpage, page);
1277 - else {
1278 + rc = migrate_page(mapping, newpage, page, mode);
1279 + else if (mapping->a_ops->migratepage)
1280 /*
1281 - * Do not writeback pages if !sync and migratepage is
1282 - * not pointing to migrate_page() which is nonblocking
1283 - * (swapcache/tmpfs uses migratepage = migrate_page).
1284 + * Most pages have a mapping and most filesystems provide a
1285 + * migratepage callback. Anonymous pages are part of swap
1286 + * space which also has its own migratepage callback. This
1287 + * is the most common path for page migration.
1288 */
1289 - if (PageDirty(page) && !sync &&
1290 - mapping->a_ops->migratepage != migrate_page)
1291 - rc = -EBUSY;
1292 - else if (mapping->a_ops->migratepage)
1293 - /*
1294 - * Most pages have a mapping and most filesystems
1295 - * should provide a migration function. Anonymous
1296 - * pages are part of swap space which also has its
1297 - * own migration function. This is the most common
1298 - * path for page migration.
1299 - */
1300 - rc = mapping->a_ops->migratepage(mapping,
1301 - newpage, page);
1302 - else
1303 - rc = fallback_migrate_page(mapping, newpage, page);
1304 - }
1305 + rc = mapping->a_ops->migratepage(mapping,
1306 + newpage, page, mode);
1307 + else
1308 + rc = fallback_migrate_page(mapping, newpage, page, mode);
1309
1310 if (rc) {
1311 newpage->mapping = NULL;
1312 @@ -621,38 +680,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
1313 return rc;
1314 }
1315
1316 -/*
1317 - * Obtain the lock on page, remove all ptes and migrate the page
1318 - * to the newly allocated page in newpage.
1319 - */
1320 -static int unmap_and_move(new_page_t get_new_page, unsigned long private,
1321 - struct page *page, int force, bool offlining, bool sync)
1322 +static int __unmap_and_move(struct page *page, struct page *newpage,
1323 + int force, bool offlining, enum migrate_mode mode)
1324 {
1325 - int rc = 0;
1326 - int *result = NULL;
1327 - struct page *newpage = get_new_page(page, private, &result);
1328 + int rc = -EAGAIN;
1329 int remap_swapcache = 1;
1330 int charge = 0;
1331 struct mem_cgroup *mem;
1332 struct anon_vma *anon_vma = NULL;
1333
1334 - if (!newpage)
1335 - return -ENOMEM;
1336 -
1337 - if (page_count(page) == 1) {
1338 - /* page was freed from under us. So we are done. */
1339 - goto move_newpage;
1340 - }
1341 - if (unlikely(PageTransHuge(page)))
1342 - if (unlikely(split_huge_page(page)))
1343 - goto move_newpage;
1344 -
1345 - /* prepare cgroup just returns 0 or -ENOMEM */
1346 - rc = -EAGAIN;
1347 -
1348 if (!trylock_page(page)) {
1349 - if (!force || !sync)
1350 - goto move_newpage;
1351 + if (!force || mode == MIGRATE_ASYNC)
1352 + goto out;
1353
1354 /*
1355 * It's not safe for direct compaction to call lock_page.
1356 @@ -668,7 +707,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
1357 * altogether.
1358 */
1359 if (current->flags & PF_MEMALLOC)
1360 - goto move_newpage;
1361 + goto out;
1362
1363 lock_page(page);
1364 }
1365 @@ -697,10 +736,12 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
1366
1367 if (PageWriteback(page)) {
1368 /*
1369 - * For !sync, there is no point retrying as the retry loop
1370 - * is expected to be too short for PageWriteback to be cleared
1371 + * Only in the case of a full syncronous migration is it
1372 + * necessary to wait for PageWriteback. In the async case,
1373 + * the retry loop is too short and in the sync-light case,
1374 + * the overhead of stalling is too much
1375 */
1376 - if (!sync) {
1377 + if (mode != MIGRATE_SYNC) {
1378 rc = -EBUSY;
1379 goto uncharge;
1380 }
1381 @@ -771,7 +812,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
1382
1383 skip_unmap:
1384 if (!page_mapped(page))
1385 - rc = move_to_new_page(newpage, page, remap_swapcache, sync);
1386 + rc = move_to_new_page(newpage, page, remap_swapcache, mode);
1387
1388 if (rc && remap_swapcache)
1389 remove_migration_ptes(page, page);
1390 @@ -785,27 +826,53 @@ uncharge:
1391 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
1392 unlock:
1393 unlock_page(page);
1394 +out:
1395 + return rc;
1396 +}
1397
1398 -move_newpage:
1399 +/*
1400 + * Obtain the lock on page, remove all ptes and migrate the page
1401 + * to the newly allocated page in newpage.
1402 + */
1403 +static int unmap_and_move(new_page_t get_new_page, unsigned long private,
1404 + struct page *page, int force, bool offlining,
1405 + enum migrate_mode mode)
1406 +{
1407 + int rc = 0;
1408 + int *result = NULL;
1409 + struct page *newpage = get_new_page(page, private, &result);
1410 +
1411 + if (!newpage)
1412 + return -ENOMEM;
1413 +
1414 + if (page_count(page) == 1) {
1415 + /* page was freed from under us. So we are done. */
1416 + goto out;
1417 + }
1418 +
1419 + if (unlikely(PageTransHuge(page)))
1420 + if (unlikely(split_huge_page(page)))
1421 + goto out;
1422 +
1423 + rc = __unmap_and_move(page, newpage, force, offlining, mode);
1424 +out:
1425 if (rc != -EAGAIN) {
1426 - /*
1427 - * A page that has been migrated has all references
1428 - * removed and will be freed. A page that has not been
1429 - * migrated will have kepts its references and be
1430 - * restored.
1431 - */
1432 - list_del(&page->lru);
1433 + /*
1434 + * A page that has been migrated has all references
1435 + * removed and will be freed. A page that has not been
1436 + * migrated will have kepts its references and be
1437 + * restored.
1438 + */
1439 + list_del(&page->lru);
1440 dec_zone_page_state(page, NR_ISOLATED_ANON +
1441 page_is_file_cache(page));
1442 putback_lru_page(page);
1443 }
1444 -
1445 /*
1446 * Move the new page to the LRU. If migration was not successful
1447 * then this will free the page.
1448 */
1449 putback_lru_page(newpage);
1450 -
1451 if (result) {
1452 if (rc)
1453 *result = rc;
1454 @@ -835,7 +902,8 @@ move_newpage:
1455 */
1456 static int unmap_and_move_huge_page(new_page_t get_new_page,
1457 unsigned long private, struct page *hpage,
1458 - int force, bool offlining, bool sync)
1459 + int force, bool offlining,
1460 + enum migrate_mode mode)
1461 {
1462 int rc = 0;
1463 int *result = NULL;
1464 @@ -848,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1465 rc = -EAGAIN;
1466
1467 if (!trylock_page(hpage)) {
1468 - if (!force || !sync)
1469 + if (!force || mode != MIGRATE_SYNC)
1470 goto out;
1471 lock_page(hpage);
1472 }
1473 @@ -859,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
1474 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
1475
1476 if (!page_mapped(hpage))
1477 - rc = move_to_new_page(new_hpage, hpage, 1, sync);
1478 + rc = move_to_new_page(new_hpage, hpage, 1, mode);
1479
1480 if (rc)
1481 remove_migration_ptes(hpage, hpage);
1482 @@ -902,7 +970,7 @@ out:
1483 */
1484 int migrate_pages(struct list_head *from,
1485 new_page_t get_new_page, unsigned long private, bool offlining,
1486 - bool sync)
1487 + enum migrate_mode mode)
1488 {
1489 int retry = 1;
1490 int nr_failed = 0;
1491 @@ -923,7 +991,7 @@ int migrate_pages(struct list_head *from,
1492
1493 rc = unmap_and_move(get_new_page, private,
1494 page, pass > 2, offlining,
1495 - sync);
1496 + mode);
1497
1498 switch(rc) {
1499 case -ENOMEM:
1500 @@ -953,7 +1021,7 @@ out:
1501
1502 int migrate_huge_pages(struct list_head *from,
1503 new_page_t get_new_page, unsigned long private, bool offlining,
1504 - bool sync)
1505 + enum migrate_mode mode)
1506 {
1507 int retry = 1;
1508 int nr_failed = 0;
1509 @@ -970,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from,
1510
1511 rc = unmap_and_move_huge_page(get_new_page,
1512 private, page, pass > 2, offlining,
1513 - sync);
1514 + mode);
1515
1516 switch(rc) {
1517 case -ENOMEM:
1518 @@ -1099,7 +1167,7 @@ set_status:
1519 err = 0;
1520 if (!list_empty(&pagelist)) {
1521 err = migrate_pages(&pagelist, new_page_node,
1522 - (unsigned long)pm, 0, true);
1523 + (unsigned long)pm, 0, MIGRATE_SYNC);
1524 if (err)
1525 putback_lru_pages(&pagelist);
1526 }
1527 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
1528 index 947a7e9..9177aa3 100644
1529 --- a/mm/page_alloc.c
1530 +++ b/mm/page_alloc.c
1531 @@ -1897,14 +1897,20 @@ static struct page *
1532 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1533 struct zonelist *zonelist, enum zone_type high_zoneidx,
1534 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1535 - int migratetype, unsigned long *did_some_progress,
1536 - bool sync_migration)
1537 + int migratetype, bool sync_migration,
1538 + bool *deferred_compaction,
1539 + unsigned long *did_some_progress)
1540 {
1541 struct page *page;
1542
1543 - if (!order || compaction_deferred(preferred_zone))
1544 + if (!order)
1545 return NULL;
1546
1547 + if (compaction_deferred(preferred_zone)) {
1548 + *deferred_compaction = true;
1549 + return NULL;
1550 + }
1551 +
1552 current->flags |= PF_MEMALLOC;
1553 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1554 nodemask, sync_migration);
1555 @@ -1932,7 +1938,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1556 * but not enough to satisfy watermarks.
1557 */
1558 count_vm_event(COMPACTFAIL);
1559 - defer_compaction(preferred_zone);
1560 +
1561 + /*
1562 + * As async compaction considers a subset of pageblocks, only
1563 + * defer if the failure was a sync compaction failure.
1564 + */
1565 + if (sync_migration)
1566 + defer_compaction(preferred_zone);
1567
1568 cond_resched();
1569 }
1570 @@ -1944,8 +1956,9 @@ static inline struct page *
1571 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1572 struct zonelist *zonelist, enum zone_type high_zoneidx,
1573 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1574 - int migratetype, unsigned long *did_some_progress,
1575 - bool sync_migration)
1576 + int migratetype, bool sync_migration,
1577 + bool *deferred_compaction,
1578 + unsigned long *did_some_progress)
1579 {
1580 return NULL;
1581 }
1582 @@ -2095,6 +2108,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1583 unsigned long pages_reclaimed = 0;
1584 unsigned long did_some_progress;
1585 bool sync_migration = false;
1586 + bool deferred_compaction = false;
1587
1588 /*
1589 * In the slowpath, we sanity check order to avoid ever trying to
1590 @@ -2175,12 +2189,22 @@ rebalance:
1591 zonelist, high_zoneidx,
1592 nodemask,
1593 alloc_flags, preferred_zone,
1594 - migratetype, &did_some_progress,
1595 - sync_migration);
1596 + migratetype, sync_migration,
1597 + &deferred_compaction,
1598 + &did_some_progress);
1599 if (page)
1600 goto got_pg;
1601 sync_migration = true;
1602
1603 + /*
1604 + * If compaction is deferred for high-order allocations, it is because
1605 + * sync compaction recently failed. In this is the case and the caller
1606 + * has requested the system not be heavily disrupted, fail the
1607 + * allocation now instead of entering direct reclaim
1608 + */
1609 + if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
1610 + goto nopage;
1611 +
1612 /* Try direct reclaim and then allocating */
1613 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1614 zonelist, high_zoneidx,
1615 @@ -2243,8 +2267,9 @@ rebalance:
1616 zonelist, high_zoneidx,
1617 nodemask,
1618 alloc_flags, preferred_zone,
1619 - migratetype, &did_some_progress,
1620 - sync_migration);
1621 + migratetype, sync_migration,
1622 + &deferred_compaction,
1623 + &did_some_progress);
1624 if (page)
1625 goto got_pg;
1626 }
1627 @@ -2268,8 +2293,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1628 {
1629 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1630 struct zone *preferred_zone;
1631 - struct page *page;
1632 + struct page *page = NULL;
1633 int migratetype = allocflags_to_migratetype(gfp_mask);
1634 + unsigned int cpuset_mems_cookie;
1635
1636 gfp_mask &= gfp_allowed_mask;
1637
1638 @@ -2288,15 +2314,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1639 if (unlikely(!zonelist->_zonerefs->zone))
1640 return NULL;
1641
1642 - get_mems_allowed();
1643 +retry_cpuset:
1644 + cpuset_mems_cookie = get_mems_allowed();
1645 +
1646 /* The preferred zone is used for statistics later */
1647 first_zones_zonelist(zonelist, high_zoneidx,
1648 nodemask ? : &cpuset_current_mems_allowed,
1649 &preferred_zone);
1650 - if (!preferred_zone) {
1651 - put_mems_allowed();
1652 - return NULL;
1653 - }
1654 + if (!preferred_zone)
1655 + goto out;
1656
1657 /* First allocation attempt */
1658 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1659 @@ -2306,9 +2332,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1660 page = __alloc_pages_slowpath(gfp_mask, order,
1661 zonelist, high_zoneidx, nodemask,
1662 preferred_zone, migratetype);
1663 - put_mems_allowed();
1664
1665 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1666 +
1667 +out:
1668 + /*
1669 + * When updating a task's mems_allowed, it is possible to race with
1670 + * parallel threads in such a way that an allocation can fail while
1671 + * the mask is being updated. If a page allocation is about to fail,
1672 + * check if the cpuset changed during allocation and if so, retry.
1673 + */
1674 + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1675 + goto retry_cpuset;
1676 +
1677 return page;
1678 }
1679 EXPORT_SYMBOL(__alloc_pages_nodemask);
1680 @@ -2532,13 +2568,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1681 bool skip_free_areas_node(unsigned int flags, int nid)
1682 {
1683 bool ret = false;
1684 + unsigned int cpuset_mems_cookie;
1685
1686 if (!(flags & SHOW_MEM_FILTER_NODES))
1687 goto out;
1688
1689 - get_mems_allowed();
1690 - ret = !node_isset(nid, cpuset_current_mems_allowed);
1691 - put_mems_allowed();
1692 + do {
1693 + cpuset_mems_cookie = get_mems_allowed();
1694 + ret = !node_isset(nid, cpuset_current_mems_allowed);
1695 + } while (!put_mems_allowed(cpuset_mems_cookie));
1696 out:
1697 return ret;
1698 }
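
Both page_alloc.c hunks above replace the plain get_mems_allowed()/put_mems_allowed() pair with a seqcount-style cookie: read a cookie, do the work, and retry only if the work failed while the cpuset's mems_allowed was being rewritten underneath it. A rough userspace model of that retry shape; the cookie helpers and the toy allocator are illustrative, not the kernel's cpuset API:

#include <stdbool.h>
#include <stdlib.h>

/* Illustrative stand-ins for the mems_allowed sequence counter. */
static unsigned mems_seq;                 /* bumped when mems_allowed changes */

static unsigned get_mems_cookie(void)     { return mems_seq; }
static bool put_mems_cookie(unsigned c)   { return c == mems_seq; } /* true = unchanged */

static void *try_alloc(void)              /* toy page allocator */
{
        return malloc(4096);
}

/* The shape of the patched __alloc_pages_nodemask() exit path. */
static void *alloc_with_cpuset_retry(void)
{
        void *page;
        unsigned cookie;

retry_cpuset:
        cookie = get_mems_cookie();
        page = try_alloc();

        /*
         * Retry only when the allocation failed *and* mems_allowed changed
         * in parallel; a page obtained during the race is still returned,
         * since that race is harmless.
         */
        if (!put_mems_cookie(cookie) && !page)
                goto retry_cpuset;
        return page;
}

int main(void)
{
        free(alloc_with_cpuset_retry());
        return 0;
}
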
1699 @@ -3418,25 +3456,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
1700 if (page_to_nid(page) != zone_to_nid(zone))
1701 continue;
1702
1703 - /* Blocks with reserved pages will never free, skip them. */
1704 - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
1705 - if (pageblock_is_reserved(pfn, block_end_pfn))
1706 - continue;
1707 -
1708 block_migratetype = get_pageblock_migratetype(page);
1709
1710 - /* If this block is reserved, account for it */
1711 - if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
1712 - reserve--;
1713 - continue;
1714 - }
1715 + /* Only test what is necessary when the reserves are not met */
1716 + if (reserve > 0) {
1717 + /*
1718 + * Blocks with reserved pages will never free, skip
1719 + * them.
1720 + */
1721 + block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
1722 + if (pageblock_is_reserved(pfn, block_end_pfn))
1723 + continue;
1724
1725 - /* Suitable for reserving if this block is movable */
1726 - if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
1727 - set_pageblock_migratetype(page, MIGRATE_RESERVE);
1728 - move_freepages_block(zone, page, MIGRATE_RESERVE);
1729 - reserve--;
1730 - continue;
1731 + /* If this block is reserved, account for it */
1732 + if (block_migratetype == MIGRATE_RESERVE) {
1733 + reserve--;
1734 + continue;
1735 + }
1736 +
1737 + /* Suitable for reserving if this block is movable */
1738 + if (block_migratetype == MIGRATE_MOVABLE) {
1739 + set_pageblock_migratetype(page,
1740 + MIGRATE_RESERVE);
1741 + move_freepages_block(zone, page,
1742 + MIGRATE_RESERVE);
1743 + reserve--;
1744 + continue;
1745 + }
1746 }
1747
1748 /*
1749 diff --git a/mm/slab.c b/mm/slab.c
1750 index d96e223..a67f812 100644
1751 --- a/mm/slab.c
1752 +++ b/mm/slab.c
1753 @@ -3218,12 +3218,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
1754 if (in_interrupt() || (flags & __GFP_THISNODE))
1755 return NULL;
1756 nid_alloc = nid_here = numa_mem_id();
1757 - get_mems_allowed();
1758 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
1759 nid_alloc = cpuset_slab_spread_node();
1760 else if (current->mempolicy)
1761 nid_alloc = slab_node(current->mempolicy);
1762 - put_mems_allowed();
1763 if (nid_alloc != nid_here)
1764 return ____cache_alloc_node(cachep, flags, nid_alloc);
1765 return NULL;
1766 @@ -3246,14 +3244,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
1767 enum zone_type high_zoneidx = gfp_zone(flags);
1768 void *obj = NULL;
1769 int nid;
1770 + unsigned int cpuset_mems_cookie;
1771
1772 if (flags & __GFP_THISNODE)
1773 return NULL;
1774
1775 - get_mems_allowed();
1776 - zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1777 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
1778
1779 +retry_cpuset:
1780 + cpuset_mems_cookie = get_mems_allowed();
1781 + zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1782 +
1783 retry:
1784 /*
1785 * Look through allowed nodes for objects available
1786 @@ -3306,7 +3307,9 @@ retry:
1787 }
1788 }
1789 }
1790 - put_mems_allowed();
1791 +
1792 + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
1793 + goto retry_cpuset;
1794 return obj;
1795 }
1796
1797 diff --git a/mm/slub.c b/mm/slub.c
1798 index 10ab233..ae6e80e 100644
1799 --- a/mm/slub.c
1800 +++ b/mm/slub.c
1801 @@ -1457,6 +1457,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1802 struct zone *zone;
1803 enum zone_type high_zoneidx = gfp_zone(flags);
1804 struct page *page;
1805 + unsigned int cpuset_mems_cookie;
1806
1807 /*
1808 * The defrag ratio allows a configuration of the tradeoffs between
1809 @@ -1480,23 +1481,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1810 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1811 return NULL;
1812
1813 - get_mems_allowed();
1814 - zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1815 - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1816 - struct kmem_cache_node *n;
1817 -
1818 - n = get_node(s, zone_to_nid(zone));
1819 -
1820 - if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1821 - n->nr_partial > s->min_partial) {
1822 - page = get_partial_node(n);
1823 - if (page) {
1824 - put_mems_allowed();
1825 - return page;
1826 + do {
1827 + cpuset_mems_cookie = get_mems_allowed();
1828 + zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1829 + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1830 + struct kmem_cache_node *n;
1831 +
1832 + n = get_node(s, zone_to_nid(zone));
1833 +
1834 + if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1835 + n->nr_partial > s->min_partial) {
1836 + page = get_partial_node(n);
1837 + if (page) {
1838 + /*
1839 + * Return the object even if
1840 + * put_mems_allowed indicated that
1841 + * the cpuset mems_allowed was
1842 + * updated in parallel. It's a
1843 + * harmless race between the alloc
1844 + * and the cpuset update.
1845 + */
1846 + put_mems_allowed(cpuset_mems_cookie);
1847 + return page;
1848 + }
1849 }
1850 }
1851 - }
1852 - put_mems_allowed();
1853 + } while (!put_mems_allowed(cpuset_mems_cookie));
1854 #endif
1855 return NULL;
1856 }
1857 diff --git a/mm/vmscan.c b/mm/vmscan.c
1858 index 1b0ed36..5326f98 100644
1859 --- a/mm/vmscan.c
1860 +++ b/mm/vmscan.c
1861 @@ -248,35 +248,66 @@ unsigned long shrink_slab(struct shrink_control *shrink,
1862
1863 list_for_each_entry(shrinker, &shrinker_list, list) {
1864 unsigned long long delta;
1865 - unsigned long total_scan;
1866 - unsigned long max_pass;
1867 + long total_scan;
1868 + long max_pass;
1869 + int shrink_ret = 0;
1870 + long nr;
1871 + long new_nr;
1872
1873 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
1874 + if (max_pass <= 0)
1875 + continue;
1876 +
1877 + /*
1878 + * copy the current shrinker scan count into a local variable
1879 + * and zero it so that other concurrent shrinker invocations
1880 + * don't also do this scanning work.
1881 + */
1882 + do {
1883 + nr = shrinker->nr;
1884 + } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
1885 +
1886 + total_scan = nr;
1887 delta = (4 * nr_pages_scanned) / shrinker->seeks;
1888 delta *= max_pass;
1889 do_div(delta, lru_pages + 1);
1890 - shrinker->nr += delta;
1891 - if (shrinker->nr < 0) {
1892 + total_scan += delta;
1893 + if (total_scan < 0) {
1894 printk(KERN_ERR "shrink_slab: %pF negative objects to "
1895 "delete nr=%ld\n",
1896 - shrinker->shrink, shrinker->nr);
1897 - shrinker->nr = max_pass;
1898 + shrinker->shrink, total_scan);
1899 + total_scan = max_pass;
1900 }
1901
1902 /*
1903 + * We need to avoid excessive windup on filesystem shrinkers
1904 + * due to large numbers of GFP_NOFS allocations causing the
1905 + * shrinkers to return -1 all the time. This results in a large
1906 + * nr being built up so when a shrink that can do some work
1907 + * comes along it empties the entire cache due to nr >>>
1908 + * max_pass. This is bad for sustaining a working set in
1909 + * memory.
1910 + *
1911 + * Hence only allow the shrinker to scan the entire cache when
1912 + * a large delta change is calculated directly.
1913 + */
1914 + if (delta < max_pass / 4)
1915 + total_scan = min(total_scan, max_pass / 2);
1916 +
1917 + /*
1918 * Avoid risking looping forever due to too large nr value:
1919 * never try to free more than twice the estimate number of
1920 * freeable entries.
1921 */
1922 - if (shrinker->nr > max_pass * 2)
1923 - shrinker->nr = max_pass * 2;
1924 + if (total_scan > max_pass * 2)
1925 + total_scan = max_pass * 2;
1926
1927 - total_scan = shrinker->nr;
1928 - shrinker->nr = 0;
1929 + trace_mm_shrink_slab_start(shrinker, shrink, nr,
1930 + nr_pages_scanned, lru_pages,
1931 + max_pass, delta, total_scan);
1932
1933 while (total_scan >= SHRINK_BATCH) {
1934 long this_scan = SHRINK_BATCH;
1935 - int shrink_ret;
1936 int nr_before;
1937
1938 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
1939 @@ -292,7 +323,19 @@ unsigned long shrink_slab(struct shrink_control *shrink,
1940 cond_resched();
1941 }
1942
1943 - shrinker->nr += total_scan;
1944 + /*
1945 + * move the unused scan count back into the shrinker in a
1946 + * manner that handles concurrent updates. If we exhausted the
1947 + * scan, there is no need to do an update.
1948 + */
1949 + do {
1950 + nr = shrinker->nr;
1951 + new_nr = total_scan + nr;
1952 + if (total_scan <= 0)
1953 + break;
1954 + } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
1955 +
1956 + trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
1957 }
1958 up_read(&shrinker_rwsem);
1959 out:
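
The shrink_slab() hunk above does two things: it claims shrinker->nr atomically so concurrent callers do not all scan the same backlog, and it clamps the total so that a run of GFP_NOFS calls (where shrinkers keep returning -1) cannot wind the count up until one later call empties the whole cache. A compacted userspace model of that arithmetic; it uses the GCC/Clang __sync_val_compare_and_swap builtin in place of the kernel's cmpxchg, and the numbers in main() are made up:

#include <stdio.h>

static long shrinker_nr;                  /* stand-in for shrinker->nr */

/* Claim the pending scan count, leaving zero behind for other callers. */
static long claim_pending(void)
{
        long nr;
        do {
                nr = shrinker_nr;
        } while (__sync_val_compare_and_swap(&shrinker_nr, nr, 0) != nr);
        return nr;
}

/* Hand unused work back, again tolerating concurrent updates. */
static void return_unused(long unused)
{
        long nr;
        if (unused <= 0)
                return;
        do {
                nr = shrinker_nr;
        } while (__sync_val_compare_and_swap(&shrinker_nr, nr, nr + unused) != nr);
}

/* The windup clamps from the patch, in isolation. */
static long clamp_scan(long total_scan, long delta, long max_pass)
{
        if (total_scan < 0)               /* overflow: fall back to max_pass */
                total_scan = max_pass;
        if (delta < max_pass / 4 && total_scan > max_pass / 2)
                total_scan = max_pass / 2;     /* small delta: at most half the cache */
        if (total_scan > max_pass * 2)
                total_scan = max_pass * 2;     /* never more than twice the cache */
        return total_scan;
}

int main(void)
{
        long nr, total;

        shrinker_nr = 1 << 20;            /* pretend a huge backlog wound up */
        nr = claim_pending();
        total = clamp_scan(nr + 10, 10, 1000);
        printf("scan %ld of backlog %ld\n", total, nr);
        return_unused(nr + 10 - total);   /* the rest goes back for next time */
        return 0;
}
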
1960 @@ -683,7 +726,13 @@ static enum page_references page_check_references(struct page *page,
1961 */
1962 SetPageReferenced(page);
1963
1964 - if (referenced_page)
1965 + if (referenced_page || referenced_ptes > 1)
1966 + return PAGEREF_ACTIVATE;
1967 +
1968 + /*
1969 + * Activate file-backed executable pages after first usage.
1970 + */
1971 + if (vm_flags & VM_EXEC)
1972 return PAGEREF_ACTIVATE;
1973
1974 return PAGEREF_KEEP;
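
The page_check_references() hunk above widens the activation criteria for a referenced page: it is activated when the page itself carried the referenced bit, when more than one pte referenced it, or when it belongs to an executable file mapping (VM_EXEC), so shared hot pages and program text stay off the inactive list. The resulting decision ladder, reduced to a small standalone predicate; the VM_EXEC value here is a placeholder, not the kernel's:

#include <stdbool.h>

enum page_ref { PAGEREF_KEEP, PAGEREF_ACTIVATE };

#define VM_EXEC 0x4u                      /* placeholder flag value */

/* Condensed tail of the patched page_check_references(). */
static enum page_ref classify(bool referenced_page, int referenced_ptes,
                              unsigned vm_flags)
{
        if (referenced_page || referenced_ptes > 1)
                return PAGEREF_ACTIVATE;

        /* Activate file-backed executable pages after first use. */
        if (vm_flags & VM_EXEC)
                return PAGEREF_ACTIVATE;

        return PAGEREF_KEEP;
}

int main(void)
{
        /* Program text touched once through a single pte is still activated. */
        return classify(false, 1, VM_EXEC) == PAGEREF_ACTIVATE ? 0 : 1;
}
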
1975 @@ -972,23 +1021,27 @@ keep_lumpy:
1976 *
1977 * returns 0 on success, -ve errno on failure.
1978 */
1979 -int __isolate_lru_page(struct page *page, int mode, int file)
1980 +int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1981 {
1982 + bool all_lru_mode;
1983 int ret = -EINVAL;
1984
1985 /* Only take pages on the LRU. */
1986 if (!PageLRU(page))
1987 return ret;
1988
1989 + all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
1990 + (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1991 +
1992 /*
1993 * When checking the active state, we need to be sure we are
1994 * dealing with comparible boolean values. Take the logical not
1995 * of each.
1996 */
1997 - if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
1998 + if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
1999 return ret;
2000
2001 - if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
2002 + if (!all_lru_mode && !!page_is_file_cache(page) != file)
2003 return ret;
2004
2005 /*
2006 @@ -1001,6 +1054,43 @@ int __isolate_lru_page(struct page *page, int mode, int file)
2007
2008 ret = -EBUSY;
2009
2010 + /*
2011 + * To minimise LRU disruption, the caller can indicate that it only
2012 + * wants to isolate pages it will be able to operate on without
2013 + * blocking - clean pages for the most part.
2014 + *
2015 + * ISOLATE_CLEAN means that only clean pages should be isolated. This
2016 + * is used by reclaim when it cannot write to backing storage
2017 + *
2018 + * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
2019 + * that can be migrated without blocking
2020 + */
2021 + if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
2022 + /* All the caller can do on PageWriteback is block */
2023 + if (PageWriteback(page))
2024 + return ret;
2025 +
2026 + if (PageDirty(page)) {
2027 + struct address_space *mapping;
2028 +
2029 + /* ISOLATE_CLEAN means only clean pages */
2030 + if (mode & ISOLATE_CLEAN)
2031 + return ret;
2032 +
2033 + /*
2034 + * Only pages without mappings or that have a
2035 + * ->migratepage callback are possible to migrate
2036 + * without blocking
2037 + */
2038 + mapping = page_mapping(page);
2039 + if (mapping && !mapping->a_ops->migratepage)
2040 + return ret;
2041 + }
2042 + }
2043 +
2044 + if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
2045 + return ret;
2046 +
2047 if (likely(get_page_unless_zero(page))) {
2048 /*
2049 * Be careful not to clear PageLRU until after we're
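
The ISOLATE_* handling added in the hunk above lets a caller refuse pages it could only deal with by blocking: pages under writeback are always skipped, ISOLATE_CLEAN additionally rejects dirty pages, ISOLATE_ASYNC_MIGRATE rejects dirty pages whose mapping lacks a ->migratepage callback, and ISOLATE_UNMAPPED rejects mapped pages. A condensed predicate over a stripped-down page model; the flag values and struct fields are placeholders for the kernel's:

#include <stdbool.h>

/* Placeholder flag values standing in for the kernel's isolate_mode_t bits. */
#define ISOLATE_CLEAN          0x4u
#define ISOLATE_UNMAPPED       0x8u
#define ISOLATE_ASYNC_MIGRATE  0x10u

struct page_model {
        bool writeback, dirty, mapped;
        bool has_mapping;                 /* page_mapping(page) != NULL */
        bool mapping_can_migrate;         /* ->a_ops->migratepage != NULL */
};

/* Condensed version of the new mode checks in __isolate_lru_page(). */
static bool isolation_allowed(const struct page_model *p, unsigned mode)
{
        if (mode & (ISOLATE_CLEAN | ISOLATE_ASYNC_MIGRATE)) {
                if (p->writeback)
                        return false;     /* all we could do is wait on it */
                if (p->dirty) {
                        if (mode & ISOLATE_CLEAN)
                                return false;   /* caller wants clean pages only */
                        if (p->has_mapping && !p->mapping_can_migrate)
                                return false;   /* migrating it would block */
                }
        }
        if ((mode & ISOLATE_UNMAPPED) && p->mapped)
                return false;
        return true;
}

int main(void)
{
        struct page_model dirty_anon = { .dirty = true };

        /* Async migration may still take a dirty page with no mapping. */
        return isolation_allowed(&dirty_anon, ISOLATE_ASYNC_MIGRATE) ? 0 : 1;
}
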
2050 @@ -1036,7 +1126,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
2051 */
2052 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
2053 struct list_head *src, struct list_head *dst,
2054 - unsigned long *scanned, int order, int mode, int file)
2055 + unsigned long *scanned, int order, isolate_mode_t mode,
2056 + int file)
2057 {
2058 unsigned long nr_taken = 0;
2059 unsigned long nr_lumpy_taken = 0;
2060 @@ -1111,7 +1202,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
2061 * anon page which don't already have a swap slot is
2062 * pointless.
2063 */
2064 - if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
2065 + if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
2066 !PageSwapCache(cursor_page))
2067 break;
2068
2069 @@ -1161,8 +1252,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
2070 static unsigned long isolate_pages_global(unsigned long nr,
2071 struct list_head *dst,
2072 unsigned long *scanned, int order,
2073 - int mode, struct zone *z,
2074 - int active, int file)
2075 + isolate_mode_t mode,
2076 + struct zone *z, int active, int file)
2077 {
2078 int lru = LRU_BASE;
2079 if (active)
2080 @@ -1408,6 +1499,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
2081 unsigned long nr_taken;
2082 unsigned long nr_anon;
2083 unsigned long nr_file;
2084 + isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
2085
2086 while (unlikely(too_many_isolated(zone, file, sc))) {
2087 congestion_wait(BLK_RW_ASYNC, HZ/10);
2088 @@ -1418,15 +1510,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
2089 }
2090
2091 set_reclaim_mode(priority, sc, false);
2092 + if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
2093 + reclaim_mode |= ISOLATE_ACTIVE;
2094 +
2095 lru_add_drain();
2096 +
2097 + if (!sc->may_unmap)
2098 + reclaim_mode |= ISOLATE_UNMAPPED;
2099 + if (!sc->may_writepage)
2100 + reclaim_mode |= ISOLATE_CLEAN;
2101 +
2102 spin_lock_irq(&zone->lru_lock);
2103
2104 if (scanning_global_lru(sc)) {
2105 - nr_taken = isolate_pages_global(nr_to_scan,
2106 - &page_list, &nr_scanned, sc->order,
2107 - sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
2108 - ISOLATE_BOTH : ISOLATE_INACTIVE,
2109 - zone, 0, file);
2110 + nr_taken = isolate_pages_global(nr_to_scan, &page_list,
2111 + &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
2112 zone->pages_scanned += nr_scanned;
2113 if (current_is_kswapd())
2114 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
2115 @@ -1435,12 +1533,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
2116 __count_zone_vm_events(PGSCAN_DIRECT, zone,
2117 nr_scanned);
2118 } else {
2119 - nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
2120 - &page_list, &nr_scanned, sc->order,
2121 - sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
2122 - ISOLATE_BOTH : ISOLATE_INACTIVE,
2123 - zone, sc->mem_cgroup,
2124 - 0, file);
2125 + nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
2126 + &nr_scanned, sc->order, reclaim_mode, zone,
2127 + sc->mem_cgroup, 0, file);
2128 /*
2129 * mem_cgroup_isolate_pages() keeps track of
2130 * scanned pages on its own.
2131 @@ -1542,19 +1637,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
2132 struct page *page;
2133 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
2134 unsigned long nr_rotated = 0;
2135 + isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
2136
2137 lru_add_drain();
2138 +
2139 + if (!sc->may_unmap)
2140 + reclaim_mode |= ISOLATE_UNMAPPED;
2141 + if (!sc->may_writepage)
2142 + reclaim_mode |= ISOLATE_CLEAN;
2143 +
2144 spin_lock_irq(&zone->lru_lock);
2145 if (scanning_global_lru(sc)) {
2146 nr_taken = isolate_pages_global(nr_pages, &l_hold,
2147 &pgscanned, sc->order,
2148 - ISOLATE_ACTIVE, zone,
2149 + reclaim_mode, zone,
2150 1, file);
2151 zone->pages_scanned += pgscanned;
2152 } else {
2153 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
2154 &pgscanned, sc->order,
2155 - ISOLATE_ACTIVE, zone,
2156 + reclaim_mode, zone,
2157 sc->mem_cgroup, 1, file);
2158 /*
2159 * mem_cgroup_isolate_pages() keeps track of
2160 @@ -1747,23 +1849,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
2161 u64 fraction[2], denominator;
2162 enum lru_list l;
2163 int noswap = 0;
2164 - int force_scan = 0;
2165 + bool force_scan = false;
2166 unsigned long nr_force_scan[2];
2167
2168 -
2169 - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
2170 - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
2171 - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
2172 - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
2173 -
2174 - if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
2175 - /* kswapd does zone balancing and need to scan this zone */
2176 - if (scanning_global_lru(sc) && current_is_kswapd())
2177 - force_scan = 1;
2178 - /* memcg may have small limit and need to avoid priority drop */
2179 - if (!scanning_global_lru(sc))
2180 - force_scan = 1;
2181 - }
2182 + /* kswapd does zone balancing and needs to scan this zone */
2183 + if (scanning_global_lru(sc) && current_is_kswapd() &&
2184 + zone->all_unreclaimable)
2185 + force_scan = true;
2186 + /* memcg may have small limit and need to avoid priority drop */
2187 + if (!scanning_global_lru(sc))
2188 + force_scan = true;
2189
2190 /* If we have no swap space, do not bother scanning anon pages. */
2191 if (!sc->may_swap || (nr_swap_pages <= 0)) {
2192 @@ -1776,6 +1871,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
2193 goto out;
2194 }
2195
2196 + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
2197 + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
2198 + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
2199 + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
2200 +
2201 if (scanning_global_lru(sc)) {
2202 free = zone_page_state(zone, NR_FREE_PAGES);
2203 /* If we have very few page cache pages,
2204 @@ -1912,8 +2012,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
2205 * inactive lists are large enough, continue reclaiming
2206 */
2207 pages_for_compaction = (2UL << sc->order);
2208 - inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
2209 - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
2210 + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
2211 + if (nr_swap_pages > 0)
2212 + inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
2213 if (sc->nr_reclaimed < pages_for_compaction &&
2214 inactive_lru_pages > pages_for_compaction)
2215 return true;
2216 @@ -1985,6 +2086,42 @@ restart:
2217 throttle_vm_writeout(sc->gfp_mask);
2218 }
2219
2220 +/* Returns true if compaction should go ahead for a high-order request */
2221 +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2222 +{
2223 + unsigned long balance_gap, watermark;
2224 + bool watermark_ok;
2225 +
2226 + /* Do not consider compaction for orders reclaim is meant to satisfy */
2227 + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2228 + return false;
2229 +
2230 + /*
2231 + * Compaction takes time to run and there are potentially other
2232 + * callers using the pages just freed. Continue reclaiming until
2233 + * there is a buffer of free pages available to give compaction
2234 + * a reasonable chance of completing and allocating the page
2235 + */
2236 + balance_gap = min(low_wmark_pages(zone),
2237 + (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2238 + KSWAPD_ZONE_BALANCE_GAP_RATIO);
2239 + watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2240 + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2241 +
2242 + /*
2243 + * If compaction is deferred, reclaim up to a point where
2244 + * compaction will have a chance of success when re-enabled
2245 + */
2246 + if (compaction_deferred(zone))
2247 + return watermark_ok;
2248 +
2249 + /* If compaction is not ready to start, keep reclaiming */
2250 + if (!compaction_suitable(zone, sc->order))
2251 + return false;
2252 +
2253 + return watermark_ok;
2254 +}
2255 +
2256 /*
2257 * This is the direct reclaim path, for page-allocating processes. We only
2258 * try to reclaim pages from zones which will satisfy the caller's allocation
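
compaction_ready() above is what lets shrink_zones() abort reclaim for a costly order: reclaim keeps going until the zone has high_wmark plus a balance gap plus 2^(order+1) pages free, where the balance gap is the smaller of the low watermark and present_pages divided by the kswapd balance-gap ratio. The sketch below reproduces only that watermark arithmetic with made-up zone sizes; the ratio value and zone numbers are assumptions, and the real function additionally consults compaction_deferred() and compaction_suitable():

#include <stdbool.h>
#include <stdio.h>

#define BALANCE_GAP_RATIO 100             /* models KSWAPD_ZONE_BALANCE_GAP_RATIO */

struct zone_model {                       /* made-up zone sizes, in 4K pages */
        unsigned long present_pages;
        unsigned long low_wmark;
        unsigned long high_wmark;
        unsigned long free_pages;
};

/* Mirrors the watermark arithmetic in the new compaction_ready(). */
static bool compaction_ready_model(const struct zone_model *z, int order)
{
        unsigned long gap_by_size, balance_gap, watermark;

        if (order <= 3)                   /* PAGE_ALLOC_COSTLY_ORDER */
                return false;

        gap_by_size = (z->present_pages + BALANCE_GAP_RATIO - 1) / BALANCE_GAP_RATIO;
        balance_gap = z->low_wmark < gap_by_size ? z->low_wmark : gap_by_size;
        watermark = z->high_wmark + balance_gap + (2UL << order);

        return z->free_pages >= watermark;
}

int main(void)
{
        struct zone_model z = {
                .present_pages = 262144,  /* roughly 1GB worth of pages */
                .low_wmark     = 2048,
                .high_wmark    = 3072,
                .free_pages    = 8192,
        };

        /* order-9 (THP-sized): watermark = 3072 + 2048 + 1024 = 6144 <= 8192. */
        printf("ready: %d\n", compaction_ready_model(&z, 9));
        return 0;
}
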
2259 @@ -2000,14 +2137,20 @@ restart:
2260 *
2261 * If a zone is deemed to be full of pinned pages then just give it a light
2262 * scan then give up on it.
2263 + *
2264 + * This function returns true if a zone is being reclaimed for a costly
2265 + * high-order allocation and compaction is ready to begin. This indicates to
2266 + * the caller that it should consider retrying the allocation instead of
2267 + * further reclaim.
2268 */
2269 -static void shrink_zones(int priority, struct zonelist *zonelist,
2270 +static bool shrink_zones(int priority, struct zonelist *zonelist,
2271 struct scan_control *sc)
2272 {
2273 struct zoneref *z;
2274 struct zone *zone;
2275 unsigned long nr_soft_reclaimed;
2276 unsigned long nr_soft_scanned;
2277 + bool aborted_reclaim = false;
2278
2279 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2280 gfp_zone(sc->gfp_mask), sc->nodemask) {
2281 @@ -2022,6 +2165,21 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2282 continue;
2283 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2284 continue; /* Let kswapd poll it */
2285 + if (COMPACTION_BUILD) {
2286 + /*
2287 + * If we already have plenty of memory free for
2288 + * compaction in this zone, don't free any more.
2289 + * Even though compaction is invoked for any
2290 + * non-zero order, only frequent costly order
2291 + * reclamation is disruptive enough to become a
2292 + * noticeable problem, like transparent huge page
2293 + * allocations.
2294 + */
2295 + if (compaction_ready(zone, sc)) {
2296 + aborted_reclaim = true;
2297 + continue;
2298 + }
2299 + }
2300 /*
2301 * This steals pages from memory cgroups over softlimit
2302 * and returns the number of reclaimed pages and
2303 @@ -2039,6 +2197,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2304
2305 shrink_zone(priority, zone, sc);
2306 }
2307 +
2308 + return aborted_reclaim;
2309 }
2310
2311 static bool zone_reclaimable(struct zone *zone)
2312 @@ -2092,8 +2252,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2313 struct zoneref *z;
2314 struct zone *zone;
2315 unsigned long writeback_threshold;
2316 + bool aborted_reclaim;
2317
2318 - get_mems_allowed();
2319 delayacct_freepages_start();
2320
2321 if (scanning_global_lru(sc))
2322 @@ -2103,7 +2263,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2323 sc->nr_scanned = 0;
2324 if (!priority)
2325 disable_swap_token(sc->mem_cgroup);
2326 - shrink_zones(priority, zonelist, sc);
2327 + aborted_reclaim = shrink_zones(priority, zonelist, sc);
2328 +
2329 /*
2330 * Don't shrink slabs when reclaiming memory from
2331 * over limit cgroups
2332 @@ -2155,7 +2316,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2333
2334 out:
2335 delayacct_freepages_end();
2336 - put_mems_allowed();
2337
2338 if (sc->nr_reclaimed)
2339 return sc->nr_reclaimed;
2340 @@ -2168,6 +2328,10 @@ out:
2341 if (oom_killer_disabled)
2342 return 0;
2343
2344 + /* Aborted reclaim to try compaction? don't OOM, then */
2345 + if (aborted_reclaim)
2346 + return 1;
2347 +
2348 /* top priority shrink_zones still had more to do? don't OOM, then */
2349 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
2350 return 1;
2351 @@ -2459,6 +2623,9 @@ loop_again:
2352 high_wmark_pages(zone), 0, 0)) {
2353 end_zone = i;
2354 break;
2355 + } else {
2356 + /* If balanced, clear the congested flag */
2357 + zone_clear_flag(zone, ZONE_CONGESTED);
2358 }
2359 }
2360 if (i < 0)
2361 @@ -2695,7 +2862,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2362 * them before going back to sleep.
2363 */
2364 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2365 - schedule();
2366 +
2367 + if (!kthread_should_stop())
2368 + schedule();
2369 +
2370 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2371 } else {
2372 if (remaining)
2373 @@ -2722,7 +2892,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2374 static int kswapd(void *p)
2375 {
2376 unsigned long order, new_order;
2377 + unsigned balanced_order;
2378 int classzone_idx, new_classzone_idx;
2379 + int balanced_classzone_idx;
2380 pg_data_t *pgdat = (pg_data_t*)p;
2381 struct task_struct *tsk = current;
2382
2383 @@ -2753,7 +2925,9 @@ static int kswapd(void *p)
2384 set_freezable();
2385
2386 order = new_order = 0;
2387 + balanced_order = 0;
2388 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2389 + balanced_classzone_idx = classzone_idx;
2390 for ( ; ; ) {
2391 int ret;
2392
2393 @@ -2762,7 +2936,8 @@ static int kswapd(void *p)
2394 * new request of a similar or harder type will succeed soon
2395 * so consider going to sleep on the basis we reclaimed at
2396 */
2397 - if (classzone_idx >= new_classzone_idx && order == new_order) {
2398 + if (balanced_classzone_idx >= new_classzone_idx &&
2399 + balanced_order == new_order) {
2400 new_order = pgdat->kswapd_max_order;
2401 new_classzone_idx = pgdat->classzone_idx;
2402 pgdat->kswapd_max_order = 0;
2403 @@ -2777,9 +2952,12 @@ static int kswapd(void *p)
2404 order = new_order;
2405 classzone_idx = new_classzone_idx;
2406 } else {
2407 - kswapd_try_to_sleep(pgdat, order, classzone_idx);
2408 + kswapd_try_to_sleep(pgdat, balanced_order,
2409 + balanced_classzone_idx);
2410 order = pgdat->kswapd_max_order;
2411 classzone_idx = pgdat->classzone_idx;
2412 + new_order = order;
2413 + new_classzone_idx = classzone_idx;
2414 pgdat->kswapd_max_order = 0;
2415 pgdat->classzone_idx = pgdat->nr_zones - 1;
2416 }
2417 @@ -2794,7 +2972,9 @@ static int kswapd(void *p)
2418 */
2419 if (!ret) {
2420 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2421 - order = balance_pgdat(pgdat, order, &classzone_idx);
2422 + balanced_classzone_idx = classzone_idx;
2423 + balanced_order = balance_pgdat(pgdat, order,
2424 + &balanced_classzone_idx);
2425 }
2426 }
2427 return 0;
2428 diff --git a/mm/vmstat.c b/mm/vmstat.c
2429 index 20c18b7..6559013 100644
2430 --- a/mm/vmstat.c
2431 +++ b/mm/vmstat.c
2432 @@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
2433 *
2434 * vm_stat contains the global counters
2435 */
2436 -atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
2437 +atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
2438 EXPORT_SYMBOL(vm_stat);
2439
2440 #ifdef CONFIG_SMP
