File: /Users/paulross/dev/linux/linux-3.13/include/linux/mmzone.h


       1: #ifndef _LINUX_MMZONE_H
       2: #define _LINUX_MMZONE_H
       3: 
       4: #ifndef __ASSEMBLY__
       5: #ifndef __GENERATING_BOUNDS_H
       6: 
       7: #include <linux/spinlock.h>
       8: #include <linux/list.h>
       9: #include <linux/wait.h>
      10: #include <linux/bitops.h>
      11: #include <linux/cache.h>
      12: #include <linux/threads.h>
      13: #include <linux/numa.h>
      14: #include <linux/init.h>
      15: #include <linux/seqlock.h>
      16: #include <linux/nodemask.h>
      17: #include <linux/pageblock-flags.h>
      18: #include <linux/page-flags-layout.h>
      19: #include <linux/atomic.h>
      20: #include <asm/page.h>
      21: 
      22: /* Free memory management - zoned buddy allocator.  */
      23: #ifndef CONFIG_FORCE_MAX_ZONEORDER
      24: #define MAX_ORDER 11
      25: #else
      26: #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
      27: #endif
      28: #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
      29: 
      30: /*
      31:  * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
      32:  * costly to service.  That is between allocation orders which should
      33:  * coalesce naturally under reasonable reclaim pressure and those which
      34:  * will not.
      35:  */
      36: #define PAGE_ALLOC_COSTLY_ORDER 3
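/*
 * Illustrative sketch, not part of mmzone.h: assuming a 4 KiB PAGE_SIZE and
 * the default MAX_ORDER of 11, the buddy allocator manages blocks of order
 * 0..10, so MAX_ORDER_NR_PAGES = 1 << 10 = 1024 pages = 4 MiB.  Orders above
 * PAGE_ALLOC_COSTLY_ORDER (i.e. more than 8 contiguous pages) are the ones
 * treated as expensive to satisfy.  The helper name below is hypothetical.
 */
static inline bool order_is_costly(unsigned int order)
{
	return order > PAGE_ALLOC_COSTLY_ORDER;
}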
      37: 
      38: enum {
      39:     MIGRATE_UNMOVABLE,
      40:     MIGRATE_RECLAIMABLE,
      41:     MIGRATE_MOVABLE,
      42:     MIGRATE_PCPTYPES,    /* the number of types on the pcp lists */
      43:     MIGRATE_RESERVE = MIGRATE_PCPTYPES,
      44: #ifdef CONFIG_CMA
      45:     /*
      46:      * MIGRATE_CMA migration type is designed to mimic the way
      47:      * ZONE_MOVABLE works.  Only movable pages can be allocated
       48:      * from MIGRATE_CMA pageblocks and the page allocator never
       49:      * implicitly changes the migration type of a MIGRATE_CMA pageblock.
      50:      *
      51:      * The way to use it is to change migratetype of a range of
      52:      * pageblocks to MIGRATE_CMA which can be done by
      53:      * __free_pageblock_cma() function.  What is important though
      54:      * is that a range of pageblocks must be aligned to
       55:      * MAX_ORDER_NR_PAGES should the biggest page be bigger than
      56:      * a single pageblock.
      57:      */
      58:     MIGRATE_CMA,
      59: #endif
      60: #ifdef CONFIG_MEMORY_ISOLATION
      61:     MIGRATE_ISOLATE,    /* can't allocate from here */
      62: #endif
      63:     MIGRATE_TYPES
      64: };
      65: 
      66: #ifdef CONFIG_CMA
      67: #  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
      68: #else
      69: #  define is_migrate_cma(migratetype) false
      70: #endif
      71: 
      72: #define for_each_migratetype_order(order, type) \
      73:     for (order = 0; order < MAX_ORDER; order++) \
      74:         for (type = 0; type < MIGRATE_TYPES; type++)
      75: 
      76: extern int page_group_by_mobility_disabled;
      77: 
      78: static inline int get_pageblock_migratetype(struct page *page)
      79: {
      80:     return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end);
      81: }
      82: 
      83: struct free_area {
      84:     struct list_head    free_list[MIGRATE_TYPES];
      85:     unsigned long        nr_free;
      86: };
      87: 
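/*
 * Illustrative sketch, not part of mmzone.h: each order has one free_area,
 * and each free_area keeps one list per migratetype, which is exactly the
 * pair of dimensions for_each_migratetype_order() walks.  The helper below
 * is hypothetical and assumes an array of MAX_ORDER free_area entries.
 */
static inline unsigned int nr_nonempty_free_lists(struct free_area *area)
{
	unsigned int order, type, nonempty = 0;

	for_each_migratetype_order(order, type)
		if (!list_empty(&area[order].free_list[type]))
			nonempty++;
	return nonempty;
}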
      88: struct pglist_data;
      89: 
      90: /*
      91:  * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
      92:  * So add a wild amount of padding here to ensure that they fall into separate
      93:  * cachelines.  There are very few zone structures in the machine, so space
      94:  * consumption is not a concern here.
      95:  */
      96: #if defined(CONFIG_SMP)
      97: struct zone_padding {
      98:     char x[0];
      99: } ____cacheline_internodealigned_in_smp;
     100: #define ZONE_PADDING(name)    struct zone_padding name;
     101: #else
     102: #define ZONE_PADDING(name)
     103: #endif
     104: 
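/*
 * Illustrative sketch, not part of mmzone.h: ZONE_PADDING() expands to a
 * zero-sized, internode-cacheline-aligned member on SMP and to nothing on UP,
 * so it pushes the *next* field onto a fresh cacheline without consuming
 * space.  A hypothetical struct using the same pattern as struct zone below:
 */
struct zone_padding_demo {
	spinlock_t	hot_lock_a;	/* e.g. allocator-side hot data */
	ZONE_PADDING(_demo_pad_)
	spinlock_t	hot_lock_b;	/* e.g. reclaim-side hot data */
};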
     105: enum zone_stat_item {
     106:     /* First 128 byte cacheline (assuming 64 bit words) */
     107:     NR_FREE_PAGES,
     108:     NR_ALLOC_BATCH,
     109:     NR_LRU_BASE,
     110:     NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
     111:     NR_ACTIVE_ANON,        /*  "     "     "   "       "         */
     112:     NR_INACTIVE_FILE,    /*  "     "     "   "       "         */
     113:     NR_ACTIVE_FILE,        /*  "     "     "   "       "         */
     114:     NR_UNEVICTABLE,        /*  "     "     "   "       "         */
     115:     NR_MLOCK,        /* mlock()ed pages found and moved off LRU */
     116:     NR_ANON_PAGES,    /* Mapped anonymous pages */
     117:     NR_FILE_MAPPED,    /* pagecache pages mapped into pagetables.
     118:                only modified from process context */
     119:     NR_FILE_PAGES,
     120:     NR_FILE_DIRTY,
     121:     NR_WRITEBACK,
     122:     NR_SLAB_RECLAIMABLE,
     123:     NR_SLAB_UNRECLAIMABLE,
     124:     NR_PAGETABLE,        /* used for pagetables */
     125:     NR_KERNEL_STACK,
     126:     /* Second 128 byte cacheline */
     127:     NR_UNSTABLE_NFS,    /* NFS unstable pages */
     128:     NR_BOUNCE,
     129:     NR_VMSCAN_WRITE,
     130:     NR_VMSCAN_IMMEDIATE,    /* Prioritise for reclaim when writeback ends */
     131:     NR_WRITEBACK_TEMP,    /* Writeback using temporary buffers */
     132:     NR_ISOLATED_ANON,    /* Temporary isolated pages from anon lru */
     133:     NR_ISOLATED_FILE,    /* Temporary isolated pages from file lru */
      134:     NR_SHMEM,        /* shmem pages (includes tmpfs/GEM pages) */
     135:     NR_DIRTIED,        /* page dirtyings since bootup */
     136:     NR_WRITTEN,        /* page writings since bootup */
     137: #ifdef CONFIG_NUMA
     138:     NUMA_HIT,        /* allocated in intended node */
     139:     NUMA_MISS,        /* allocated in non intended node */
     140:     NUMA_FOREIGN,        /* was intended here, hit elsewhere */
     141:     NUMA_INTERLEAVE_HIT,    /* interleaver preferred this zone */
     142:     NUMA_LOCAL,        /* allocation from local node */
     143:     NUMA_OTHER,        /* allocation from other node */
     144: #endif
     145:     NR_ANON_TRANSPARENT_HUGEPAGES,
     146:     NR_FREE_CMA_PAGES,
     147:     NR_VM_ZONE_STAT_ITEMS };
     148: 
     149: /*
     150:  * We do arithmetic on the LRU lists in various places in the code,
     151:  * so it is important to keep the active lists LRU_ACTIVE higher in
     152:  * the array than the corresponding inactive lists, and to keep
     153:  * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
     154:  *
     155:  * This has to be kept in sync with the statistics in zone_stat_item
     156:  * above and the descriptions in vmstat_text in mm/vmstat.c
     157:  */
     158: #define LRU_BASE 0
     159: #define LRU_ACTIVE 1
     160: #define LRU_FILE 2
     161: 
     162: enum lru_list {
     163:     LRU_INACTIVE_ANON = LRU_BASE,
     164:     LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
     165:     LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
     166:     LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
     167:     LRU_UNEVICTABLE,
     168:     NR_LRU_LISTS
     169: };
     170: 
     171: #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)
     172: 
     173: #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
     174: 
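/*
 * Illustrative sketch, not part of mmzone.h: because of the LRU_BASE /
 * LRU_ACTIVE / LRU_FILE arithmetic above, an LRU index is just the sum of
 * two flag offsets (mm/ builds page_lru() on the same idea).  The helper
 * name below is hypothetical.
 */
static inline enum lru_list lru_index(int file, int active)
{
	enum lru_list lru = LRU_BASE;

	if (file)
		lru += LRU_FILE;	/* ..._ANON  -> ..._FILE */
	if (active)
		lru += LRU_ACTIVE;	/* INACTIVE_ -> ACTIVE_  */
	return lru;			/* (1, 1) -> LRU_ACTIVE_FILE */
}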
     175: static inline int is_file_lru(enum lru_list lru)
     176: {
     177:     return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
     178: }
     179: 
     180: static inline int is_active_lru(enum lru_list lru)
     181: {
     182:     return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
     183: }
     184: 
     185: static inline int is_unevictable_lru(enum lru_list lru)
     186: {
     187:     return (lru == LRU_UNEVICTABLE);
     188: }
     189: 
     190: struct zone_reclaim_stat {
     191:     /*
     192:      * The pageout code in vmscan.c keeps track of how many of the
     193:      * mem/swap backed and file backed pages are referenced.
     194:      * The higher the rotated/scanned ratio, the more valuable
     195:      * that cache is.
     196:      *
     197:      * The anon LRU stats live in [0], file LRU stats in [1]
     198:      */
     199:     unsigned long        recent_rotated[2];
     200:     unsigned long        recent_scanned[2];
     201: };
     202: 
     203: struct lruvec {
     204:     struct list_head lists[NR_LRU_LISTS];
     205:     struct zone_reclaim_stat reclaim_stat;
     206: #ifdef CONFIG_MEMCG
     207:     struct zone *zone;
     208: #endif
     209: };
     210: 
     211: /* Mask used at gathering information at once (see memcontrol.c) */
     212: #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
     213: #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
     214: #define LRU_ALL         ((1 << NR_LRU_LISTS) - 1)
     215: 
     216: /* Isolate clean file */
     217: #define ISOLATE_CLEAN        ((__force isolate_mode_t)0x1)
     218: /* Isolate unmapped file */
     219: #define ISOLATE_UNMAPPED    ((__force isolate_mode_t)0x2)
     220: /* Isolate for asynchronous migration */
     221: #define ISOLATE_ASYNC_MIGRATE    ((__force isolate_mode_t)0x4)
     222: /* Isolate unevictable pages */
     223: #define ISOLATE_UNEVICTABLE    ((__force isolate_mode_t)0x8)
     224: 
     225: /* LRU Isolation modes. */
     226: typedef unsigned __bitwise__ isolate_mode_t;
     227: 
     228: enum zone_watermarks {
     229:     WMARK_MIN,
     230:     WMARK_LOW,
     231:     WMARK_HIGH,
     232:     NR_WMARK
     233: };
     234: 
     235: #define min_wmark_pages(z) (z->watermark[WMARK_MIN])
     236: #define low_wmark_pages(z) (z->watermark[WMARK_LOW])
     237: #define high_wmark_pages(z) (z->watermark[WMARK_HIGH])
     238: 
     239: struct per_cpu_pages {
     240:     int count;        /* number of pages in the list */
     241:     int high;        /* high watermark, emptying needed */
     242:     int batch;        /* chunk size for buddy add/remove */
     243: 
     244:     /* Lists of pages, one per migrate type stored on the pcp-lists */
     245:     struct list_head lists[MIGRATE_PCPTYPES];
     246: };
     247: 
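/*
 * Illustrative sketch, not part of mmzone.h: 'count' tracks how many pages
 * sit on this CPU's pcp lists; once it reaches 'high' the page allocator
 * returns 'batch' pages to the buddy free lists (see free_hot_cold_page()
 * in mm/page_alloc.c).  The predicate name below is hypothetical.
 */
static inline bool pcp_needs_drain(const struct per_cpu_pages *pcp)
{
	return pcp->count >= pcp->high;
}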
     248: struct per_cpu_pageset {
     249:     struct per_cpu_pages pcp;
     250: #ifdef CONFIG_NUMA
     251:     s8 expire;
     252: #endif
     253: #ifdef CONFIG_SMP
     254:     s8 stat_threshold;
     255:     s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
     256: #endif
     257: };
     258: 
      259: #endif /* !__GENERATING_BOUNDS_H */
     260: 
     261: enum zone_type {
     262: #ifdef CONFIG_ZONE_DMA
     263:     /*
     264:      * ZONE_DMA is used when there are devices that are not able
     265:      * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
     266:      * carve out the portion of memory that is needed for these devices.
     267:      * The range is arch specific.
     268:      *
     269:      * Some examples
     270:      *
      271:      * Architecture             Limit
      272:      * ----------------------------------------
      273:      * parisc, ia64, sparc      <4G
      274:      * s390                     <2G
      275:      * arm                      Various
      276:      * alpha                    Unlimited or 0-16MB
      277:      *
      278:      * i386, x86_64 and multiple other arches
      279:      *                          <16M
     280:      */
     281:     ZONE_DMA,
     282: #endif
     283: #ifdef CONFIG_ZONE_DMA32
     284:     /*
     285:      * x86_64 needs two ZONE_DMAs because it supports devices that are
     286:      * only able to do DMA to the lower 16M but also 32 bit devices that
     287:      * can only do DMA areas below 4G.
     288:      */
     289:     ZONE_DMA32,
     290: #endif
     291:     /*
     292:      * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
     293:      * performed on pages in ZONE_NORMAL if the DMA devices support
     294:      * transfers to all addressable memory.
     295:      */
     296:     ZONE_NORMAL,
     297: #ifdef CONFIG_HIGHMEM
     298:     /*
     299:      * A memory area that is only addressable by the kernel through
     300:      * mapping portions into its own address space. This is for example
     301:      * used by i386 to allow the kernel to address the memory beyond
     302:      * 900MB. The kernel will set up special mappings (page
     303:      * table entries on i386) for each page that the kernel needs to
     304:      * access.
     305:      */
     306:     ZONE_HIGHMEM,
     307: #endif
     308:     ZONE_MOVABLE,
     309:     __MAX_NR_ZONES
     310: };
     311: 
     312: #ifndef __GENERATING_BOUNDS_H
     313: 
     314: struct zone {
     315:     /* Fields commonly accessed by the page allocator */
     316: 
     317:     /* zone watermarks, access with *_wmark_pages(zone) macros */
     318:     unsigned long watermark[NR_WMARK];
     319: 
     320:     /*
     321:      * When free pages are below this point, additional steps are taken
     322:      * when reading the number of free pages to avoid per-cpu counter
     323:      * drift allowing watermarks to be breached
     324:      */
     325:     unsigned long percpu_drift_mark;
     326: 
     327:     /*
      328:      * We don't know whether the memory we are going to allocate will be
      329:      * freeable or whether it will eventually be released, so to avoid
      330:      * wasting several GB of RAM we must reserve some of the lower zone
      331:      * memory (otherwise we risk running OOM on the lower zones even though
      332:      * there is plenty of freeable RAM in the higher zones). This array is
      333:      * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl changes.
     334:      */
     335:     unsigned long        lowmem_reserve[MAX_NR_ZONES];
     336: 
     337:     /*
     338:      * This is a per-zone reserve of pages that should not be
     339:      * considered dirtyable memory.
     340:      */
     341:     unsigned long        dirty_balance_reserve;
     342: 
     343: #ifdef CONFIG_NUMA
     344:     int node;
     345:     /*
     346:      * zone reclaim becomes active if more unmapped pages exist.
     347:      */
     348:     unsigned long        min_unmapped_pages;
     349:     unsigned long        min_slab_pages;
     350: #endif
     351:     struct per_cpu_pageset __percpu *pageset;
     352:     /*
     353:      * free areas of different sizes
     354:      */
     355:     spinlock_t        lock;
     356: #if defined CONFIG_COMPACTION || defined CONFIG_CMA
     357:     /* Set to true when the PG_migrate_skip bits should be cleared */
     358:     bool            compact_blockskip_flush;
     359: 
     360:     /* pfns where compaction scanners should start */
     361:     unsigned long        compact_cached_free_pfn;
     362:     unsigned long        compact_cached_migrate_pfn;
     363: #endif
     364: #ifdef CONFIG_MEMORY_HOTPLUG
     365:     /* see spanned/present_pages for more description */
     366:     seqlock_t        span_seqlock;
     367: #endif
     368:     struct free_area    free_area[MAX_ORDER];
     369: 
     370: #ifndef CONFIG_SPARSEMEM
     371:     /*
     372:      * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
     373:      * In SPARSEMEM, this map is stored in struct mem_section
     374:      */
     375:     unsigned long        *pageblock_flags;
     376: #endif /* CONFIG_SPARSEMEM */
     377: 
     378: #ifdef CONFIG_COMPACTION
     379:     /*
     380:      * On compaction failure, 1<<compact_defer_shift compactions
     381:      * are skipped before trying again. The number attempted since
     382:      * last failure is tracked with compact_considered.
     383:      */
     384:     unsigned int        compact_considered;
     385:     unsigned int        compact_defer_shift;
     386:     int            compact_order_failed;
     387: #endif
     388: 
     389:     ZONE_PADDING(_pad1_)
     390: 
     391:     /* Fields commonly accessed by the page reclaim scanner */
     392:     spinlock_t        lru_lock;
     393:     struct lruvec        lruvec;
     394: 
     395:     unsigned long        pages_scanned;       /* since last reclaim */
     396:     unsigned long        flags;           /* zone flags, see below */
     397: 
     398:     /* Zone statistics */
     399:     atomic_long_t        vm_stat[NR_VM_ZONE_STAT_ITEMS];
     400: 
     401:     /*
     402:      * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
     403:      * this zone's LRU.  Maintained by the pageout code.
     404:      */
     405:     unsigned int inactive_ratio;
     406: 
     407: 
     408:     ZONE_PADDING(_pad2_)
     409:     /* Rarely used or read-mostly fields */
     410: 
     411:     /*
     412:      * wait_table        -- the array holding the hash table
     413:      * wait_table_hash_nr_entries    -- the size of the hash table array
     414:      * wait_table_bits    -- wait_table_size == (1 << wait_table_bits)
     415:      *
     416:      * The purpose of all these is to keep track of the people
     417:      * waiting for a page to become available and make them
     418:      * runnable again when possible. The trouble is that this
     419:      * consumes a lot of space, especially when so few things
     420:      * wait on pages at a given time. So instead of using
     421:      * per-page waitqueues, we use a waitqueue hash table.
     422:      *
     423:      * The bucket discipline is to sleep on the same queue when
     424:      * colliding and wake all in that wait queue when removing.
     425:      * When something wakes, it must check to be sure its page is
     426:      * truly available, a la thundering herd. The cost of a
     427:      * collision is great, but given the expected load of the
     428:      * table, they should be so rare as to be outweighed by the
     429:      * benefits from the saved space.
     430:      *
     431:      * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
     432:      * primary users of these fields, and in mm/page_alloc.c
     433:      * free_area_init_core() performs the initialization of them.
     434:      */
     435:     wait_queue_head_t    * wait_table;
     436:     unsigned long        wait_table_hash_nr_entries;
     437:     unsigned long        wait_table_bits;
     438: 
     439:     /*
     440:      * Discontig memory support fields.
     441:      */
     442:     struct pglist_data    *zone_pgdat;
     443:     /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
     444:     unsigned long        zone_start_pfn;
     445: 
     446:     /*
     447:      * spanned_pages is the total pages spanned by the zone, including
     448:      * holes, which is calculated as:
     449:      *     spanned_pages = zone_end_pfn - zone_start_pfn;
     450:      *
     451:      * present_pages is physical pages existing within the zone, which
     452:      * is calculated as:
     453:      *    present_pages = spanned_pages - absent_pages(pages in holes);
     454:      *
     455:      * managed_pages is present pages managed by the buddy system, which
     456:      * is calculated as (reserved_pages includes pages allocated by the
     457:      * bootmem allocator):
     458:      *    managed_pages = present_pages - reserved_pages;
     459:      *
     460:      * So present_pages may be used by memory hotplug or memory power
     461:      * management logic to figure out unmanaged pages by checking
     462:      * (present_pages - managed_pages). And managed_pages should be used
     463:      * by page allocator and vm scanner to calculate all kinds of watermarks
     464:      * and thresholds.
     465:      *
     466:      * Locking rules:
     467:      *
     468:      * zone_start_pfn and spanned_pages are protected by span_seqlock.
     469:      * It is a seqlock because it has to be read outside of zone->lock,
     470:      * and it is done in the main allocator path.  But, it is written
     471:      * quite infrequently.
     472:      *
     473:      * The span_seq lock is declared along with zone->lock because it is
     474:      * frequently read in proximity to zone->lock.  It's good to
     475:      * give them a chance of being in the same cacheline.
     476:      *
     477:      * Write access to present_pages at runtime should be protected by
     478:      * lock_memory_hotplug()/unlock_memory_hotplug().  Any reader who can't
      479:      * tolerate drift of present_pages should hold the memory hotplug lock to
     480:      * get a stable value.
     481:      *
     482:      * Read access to managed_pages should be safe because it's unsigned
      483:      * long. Write access to zone->managed_pages and totalram_pages is
      484:      * protected by managed_page_count_lock at runtime. Ideally only
     485:      * adjust_managed_page_count() should be used instead of directly
     486:      * touching zone->managed_pages and totalram_pages.
     487:      */
     488:     unsigned long        spanned_pages;
     489:     unsigned long        present_pages;
     490:     unsigned long        managed_pages;
     491: 
     492:     /*
     493:      * rarely used fields:
     494:      */
     495:     const char        *name;
     496: } ____cacheline_internodealigned_in_smp;
     497: 
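/*
 * Illustrative sketch, not part of mmzone.h: with struct zone complete, the
 * watermark accessors defined earlier can be used.  This is a deliberately
 * simplified (hypothetical) check in the spirit of zone_watermark_ok(),
 * ignoring lowmem_reserve[] and the per-order free-list accounting:
 */
static inline bool zone_roughly_above_min(struct zone *z, unsigned long free_pages)
{
	return free_pages > min_wmark_pages(z);
}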
     498: typedef enum {
     499:     ZONE_RECLAIM_LOCKED,        /* prevents concurrent reclaim */
     500:     ZONE_OOM_LOCKED,        /* zone is in OOM killer zonelist */
     501:     ZONE_CONGESTED,            /* zone has many dirty pages backed by
     502:                      * a congested BDI
     503:                      */
     504:     ZONE_TAIL_LRU_DIRTY,        /* reclaim scanning has recently found
     505:                      * many dirty file pages at the tail
     506:                      * of the LRU.
     507:                      */
     508:     ZONE_WRITEBACK,            /* reclaim scanning has recently found
     509:                      * many pages under writeback
     510:                      */
     511: } zone_flags_t;
     512: 
     513: static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
     514: {
     515:     set_bit(flag, &zone->flags);
     516: }
     517: 
     518: static inline int zone_test_and_set_flag(struct zone *zone, zone_flags_t flag)
     519: {
     520:     return test_and_set_bit(flag, &zone->flags);
     521: }
     522: 
     523: static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
     524: {
     525:     clear_bit(flag, &zone->flags);
     526: }
     527: 
     528: static inline int zone_is_reclaim_congested(const struct zone *zone)
     529: {
     530:     return test_bit(ZONE_CONGESTED, &zone->flags);
     531: }
     532: 
     533: static inline int zone_is_reclaim_dirty(const struct zone *zone)
     534: {
     535:     return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
     536: }
     537: 
     538: static inline int zone_is_reclaim_writeback(const struct zone *zone)
     539: {
     540:     return test_bit(ZONE_WRITEBACK, &zone->flags);
     541: }
     542: 
     543: static inline int zone_is_reclaim_locked(const struct zone *zone)
     544: {
     545:     return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
     546: }
     547: 
     548: static inline int zone_is_oom_locked(const struct zone *zone)
     549: {
     550:     return test_bit(ZONE_OOM_LOCKED, &zone->flags);
     551: }
     552: 
     553: static inline unsigned long zone_end_pfn(const struct zone *zone)
     554: {
     555:     return zone->zone_start_pfn + zone->spanned_pages;
     556: }
     557: 
     558: static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
     559: {
     560:     return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
     561: }
     562: 
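/*
 * Illustrative sketch, not part of mmzone.h: zone_end_pfn() is exclusive, so
 * a zone spans [zone_start_pfn, zone_end_pfn), holes included.  A
 * hypothetical range check built on zone_spans_pfn() (assumes nr_pages >= 1):
 */
static inline bool pfn_range_within_zone(const struct zone *zone,
					 unsigned long start_pfn,
					 unsigned long nr_pages)
{
	return zone_spans_pfn(zone, start_pfn) &&
	       zone_spans_pfn(zone, start_pfn + nr_pages - 1);
}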
     563: static inline bool zone_is_initialized(struct zone *zone)
     564: {
     565:     return !!zone->wait_table;
     566: }
     567: 
     568: static inline bool zone_is_empty(struct zone *zone)
     569: {
     570:     return zone->spanned_pages == 0;
     571: }
     572: 
     573: /*
     574:  * The "priority" of VM scanning is how much of the queues we will scan in one
     575:  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
     576:  * queues ("queue_length >> 12") during an aging round.
     577:  */
     578: #define DEF_PRIORITY 12
     579: 
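/*
 * Illustrative sketch, not part of mmzone.h: reclaim scans roughly
 * lru_size >> priority pages per pass, so DEF_PRIORITY (12) scans 1/4096th
 * of a list and priority 0 scans all of it (see get_scan_count() in
 * mm/vmscan.c).  The helper name below is hypothetical.
 */
static inline unsigned long scan_target(unsigned long lru_size, int priority)
{
	return lru_size >> priority;	/* e.g. 1M pages at priority 12 -> 256 */
}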
     580: /* Maximum number of zones on a zonelist */
     581: #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
     582: 
     583: #ifdef CONFIG_NUMA
     584: 
     585: /*
     586:  * The NUMA zonelists are doubled because we need zonelists that restrict the
     587:  * allocations to a single node for GFP_THISNODE.
     588:  *
     589:  * [0]    : Zonelist with fallback
     590:  * [1]    : No fallback (GFP_THISNODE)
     591:  */
     592: #define MAX_ZONELISTS 2
     593: 
     594: 
     595: /*
     596:  * We cache key information from each zonelist for smaller cache
     597:  * footprint when scanning for free pages in get_page_from_freelist().
     598:  *
     599:  * 1) The BITMAP fullzones tracks which zones in a zonelist have come
     600:  *    up short of free memory since the last time (last_fullzone_zap)
     601:  *    we zero'd fullzones.
     602:  * 2) The array z_to_n[] maps each zone in the zonelist to its node
     603:  *    id, so that we can efficiently evaluate whether that node is
      604:  *    set in the current task's mems_allowed.
     605:  *
     606:  * Both fullzones and z_to_n[] are one-to-one with the zonelist,
      607:  * indexed by a zone's offset in the zonelist zones[] array.
     608:  *
     609:  * The get_page_from_freelist() routine does two scans.  During the
     610:  * first scan, we skip zones whose corresponding bit in 'fullzones'
     611:  * is set or whose corresponding node in current->mems_allowed (which
     612:  * comes from cpusets) is not set.  During the second scan, we bypass
     613:  * this zonelist_cache, to ensure we look methodically at each zone.
     614:  *
     615:  * Once per second, we zero out (zap) fullzones, forcing us to
     616:  * reconsider nodes that might have regained more free memory.
     617:  * The field last_full_zap is the time we last zapped fullzones.
     618:  *
     619:  * This mechanism reduces the amount of time we waste repeatedly
      620:  * re-examining zones for free memory when they came up short of
      621:  * memory only moments ago.
     622:  *
     623:  * The zonelist_cache struct members logically belong in struct
     624:  * zonelist.  However, the mempolicy zonelists constructed for
     625:  * MPOL_BIND are intentionally variable length (and usually much
     626:  * shorter).  A general purpose mechanism for handling structs with
     627:  * multiple variable length members is more mechanism than we want
     628:  * here.  We resort to some special case hackery instead.
     629:  *
     630:  * The MPOL_BIND zonelists don't need this zonelist_cache (in good
     631:  * part because they are shorter), so we put the fixed length stuff
     632:  * at the front of the zonelist struct, ending in a variable length
     633:  * zones[], as is needed by MPOL_BIND.
     634:  *
     635:  * Then we put the optional zonelist cache on the end of the zonelist
     636:  * struct.  This optional stuff is found by a 'zlcache_ptr' pointer in
     637:  * the fixed length portion at the front of the struct.  This pointer
     638:  * both enables us to find the zonelist cache, and in the case of
     639:  * MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL)
     640:  * to know that the zonelist cache is not there.
     641:  *
     642:  * The end result is that struct zonelists come in two flavors:
     643:  *  1) The full, fixed length version, shown below, and
     644:  *  2) The custom zonelists for MPOL_BIND.
     645:  * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
     646:  *
     647:  * Even though there may be multiple CPU cores on a node modifying
     648:  * fullzones or last_full_zap in the same zonelist_cache at the same
     649:  * time, we don't lock it.  This is just hint data - if it is wrong now
     650:  * and then, the allocator will still function, perhaps a bit slower.
     651:  */
     652: 
     653: 
     654: struct zonelist_cache {
     655:     unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];        /* zone->nid */
     656:     DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);    /* zone full? */
     657:     unsigned long last_full_zap;        /* when last zap'd (jiffies) */
     658: };
     659: #else
     660: #define MAX_ZONELISTS 1
     661: struct zonelist_cache;
     662: #endif
     663: 
     664: /*
     665:  * This struct contains information about a zone in a zonelist. It is stored
     666:  * here to avoid dereferences into large structures and lookups of tables
     667:  */
     668: struct zoneref {
     669:     struct zone *zone;    /* Pointer to actual zone */
     670:     int zone_idx;        /* zone_idx(zoneref->zone) */
     671: };
     672: 
     673: /*
     674:  * One allocation request operates on a zonelist. A zonelist
     675:  * is a list of zones, the first one is the 'goal' of the
     676:  * allocation, the other zones are fallback zones, in decreasing
     677:  * priority.
     678:  *
     679:  * If zlcache_ptr is not NULL, then it is just the address of zlcache,
     680:  * as explained above.  If zlcache_ptr is NULL, there is no zlcache.
      681:  *
     682:  * To speed the reading of the zonelist, the zonerefs contain the zone index
     683:  * of the entry being read. Helper functions to access information given
     684:  * a struct zoneref are
     685:  *
     686:  * zonelist_zone()    - Return the struct zone * for an entry in _zonerefs
     687:  * zonelist_zone_idx()    - Return the index of the zone for an entry
     688:  * zonelist_node_idx()    - Return the index of the node for an entry
     689:  */
     690: struct zonelist {
     691:     struct zonelist_cache *zlcache_ptr;             // NULL or &zlcache
     692:     struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
     693: #ifdef CONFIG_NUMA
     694:     struct zonelist_cache zlcache;                 // optional ...
     695: #endif
     696: };
     697: 
     698: #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
     699: struct node_active_region {
     700:     unsigned long start_pfn;
     701:     unsigned long end_pfn;
     702:     int nid;
     703: };
     704: #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
     705: 
     706: #ifndef CONFIG_DISCONTIGMEM
     707: /* The array of struct pages - for discontigmem use pgdat->lmem_map */
     708: extern struct page *mem_map;
     709: #endif
     710: 
     711: /*
      712:  * The pg_data_t structure is used on machines with CONFIG_DISCONTIGMEM
      713:  * (mostly NUMA machines?) to describe memory at a higher level than a
      714:  * single zone.
      715:  *
      716:  * On NUMA machines, each NUMA node has a pg_data_t to describe
      717:  * its memory layout.
     718:  *
     719:  * Memory statistics and page replacement data structures are maintained on a
     720:  * per-zone basis.
     721:  */
     722: struct bootmem_data;
     723: typedef struct pglist_data {
     724:     struct zone node_zones[MAX_NR_ZONES];
     725:     struct zonelist node_zonelists[MAX_ZONELISTS];
     726:     int nr_zones;
     727: #ifdef CONFIG_FLAT_NODE_MEM_MAP    /* means !SPARSEMEM */
     728:     struct page *node_mem_map;
     729: #ifdef CONFIG_MEMCG
     730:     struct page_cgroup *node_page_cgroup;
     731: #endif
     732: #endif
     733: #ifndef CONFIG_NO_BOOTMEM
     734:     struct bootmem_data *bdata;
     735: #endif
     736: #ifdef CONFIG_MEMORY_HOTPLUG
     737:     /*
     738:      * Must be held any time you expect node_start_pfn, node_present_pages
      739:      * or node_spanned_pages to stay constant.  Holding this will also
     740:      * guarantee that any pfn_valid() stays that way.
     741:      *
     742:      * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
     743:      * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
     744:      *
     745:      * Nests above zone->lock and zone->span_seqlock
     746:      */
     747:     spinlock_t node_size_lock;
     748: #endif
     749:     unsigned long node_start_pfn;
     750:     unsigned long node_present_pages; /* total number of physical pages */
     751:     unsigned long node_spanned_pages; /* total size of physical page
     752:                          range, including holes */
     753:     int node_id;
     754:     nodemask_t reclaim_nodes;    /* Nodes allowed to reclaim from */
     755:     wait_queue_head_t kswapd_wait;
     756:     wait_queue_head_t pfmemalloc_wait;
     757:     struct task_struct *kswapd;    /* Protected by lock_memory_hotplug() */
     758:     int kswapd_max_order;
     759:     enum zone_type classzone_idx;
     760: #ifdef CONFIG_NUMA_BALANCING
     761:     /*
     762:      * Lock serializing the per destination node AutoNUMA memory
     763:      * migration rate limiting data.
     764:      */
     765:     spinlock_t numabalancing_migrate_lock;
     766: 
     767:     /* Rate limiting time interval */
     768:     unsigned long numabalancing_migrate_next_window;
     769: 
     770:     /* Number of pages migrated during the rate limiting time interval */
     771:     unsigned long numabalancing_migrate_nr_pages;
     772: #endif
     773: } pg_data_t;
     774: 
     775: #define node_present_pages(nid)    (NODE_DATA(nid)->node_present_pages)
     776: #define node_spanned_pages(nid)    (NODE_DATA(nid)->node_spanned_pages)
     777: #ifdef CONFIG_FLAT_NODE_MEM_MAP
     778: #define pgdat_page_nr(pgdat, pagenr)    ((pgdat)->node_mem_map + (pagenr))
     779: #else
     780: #define pgdat_page_nr(pgdat, pagenr)    pfn_to_page((pgdat)->node_start_pfn + (pagenr))
     781: #endif
     782: #define nid_page_nr(nid, pagenr)     pgdat_page_nr(NODE_DATA(nid),(pagenr))
     783: 
     784: #define node_start_pfn(nid)    (NODE_DATA(nid)->node_start_pfn)
     785: #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
     786: 
     787: static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
     788: {
     789:     return pgdat->node_start_pfn + pgdat->node_spanned_pages;
     790: }
     791: 
     792: static inline bool pgdat_is_empty(pg_data_t *pgdat)
     793: {
     794:     return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
     795: }
     796: 
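/*
 * Illustrative sketch, not part of mmzone.h: a node's zones live in
 * pgdat->node_zones[], indexed by enum zone_type, and allocation fallback
 * effectively walks them from the highest usable zone downwards.  The helper
 * name below is hypothetical.
 */
static inline struct zone *highest_populated_zone(pg_data_t *pgdat)
{
	int i;

	for (i = MAX_NR_ZONES - 1; i >= 0; i--)
		if (pgdat->node_zones[i].present_pages)
			return &pgdat->node_zones[i];
	return NULL;	/* memoryless node */
}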
     797: #include <linux/memory_hotplug.h>
     798: 
     799: extern struct mutex zonelists_mutex;
     800: void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
     801: void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
     802: bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
     803:         int classzone_idx, int alloc_flags);
     804: bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
     805:         int classzone_idx, int alloc_flags);
     806: enum memmap_context {
     807:     MEMMAP_EARLY,
     808:     MEMMAP_HOTPLUG,
     809: };
     810: extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
     811:                      unsigned long size,
     812:                      enum memmap_context context);
     813: 
     814: extern void lruvec_init(struct lruvec *lruvec);
     815: 
     816: static inline struct zone *lruvec_zone(struct lruvec *lruvec)
     817: {
     818: #ifdef CONFIG_MEMCG
     819:     return lruvec->zone;
     820: #else
     821:     return container_of(lruvec, struct zone, lruvec);
     822: #endif
     823: }
     824: 
     825: #ifdef CONFIG_HAVE_MEMORY_PRESENT
     826: void memory_present(int nid, unsigned long start, unsigned long end);
     827: #else
     828: static inline void memory_present(int nid, unsigned long start, unsigned long end) {}
     829: #endif
     830: 
     831: #ifdef CONFIG_HAVE_MEMORYLESS_NODES
     832: int local_memory_node(int node_id);
     833: #else
     834: static inline int local_memory_node(int node_id) { return node_id; };
     835: #endif
     836: 
     837: #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE
     838: unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
     839: #endif
     840: 
     841: /*
     842:  * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
     843:  */
     844: #define zone_idx(zone)        ((zone) - (zone)->zone_pgdat->node_zones)
     845: 
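/*
 * Illustrative worked example, not part of mmzone.h: zone_idx() works because
 * every zone is embedded in its node's node_zones[] array, so plain pointer
 * subtraction recovers the index.  For an initialized zone (zone_pgdat set
 * up), e.g.:
 *
 *	zone_idx(&NODE_DATA(0)->node_zones[ZONE_NORMAL]) == ZONE_NORMAL
 */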
     846: static inline int populated_zone(struct zone *zone)
     847: {
     848:     return (!!zone->present_pages);
     849: }
     850: 
     851: extern int movable_zone;
     852: 
     853: static inline int zone_movable_is_highmem(void)
     854: {
     855: #if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
     856:     return movable_zone == ZONE_HIGHMEM;
     857: #else
     858:     return 0;
     859: #endif
     860: }
     861: 
     862: static inline int is_highmem_idx(enum zone_type idx)
     863: {
     864: #ifdef CONFIG_HIGHMEM
     865:     return (idx == ZONE_HIGHMEM ||
     866:         (idx == ZONE_MOVABLE && zone_movable_is_highmem()));
     867: #else
     868:     return 0;
     869: #endif
     870: }
     871: 
     872: /**
     873:  * is_highmem - helper function to quickly check if a struct zone is a 
     874:  *              highmem zone or not.  This is an attempt to keep references
     875:  *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
     876:  * @zone - pointer to struct zone variable
     877:  */
     878: static inline int is_highmem(struct zone *zone)
     879: {
     880: #ifdef CONFIG_HIGHMEM
     881:     int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones;
     882:     return zone_off == ZONE_HIGHMEM * sizeof(*zone) ||
     883:            (zone_off == ZONE_MOVABLE * sizeof(*zone) &&
     884:         zone_movable_is_highmem());
     885: #else
     886:     return 0;
     887: #endif
     888: }
     889: 
     890: /* These two functions are used to setup the per zone pages min values */
     891: struct ctl_table;
     892: int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
     893:                     void __user *, size_t *, loff_t *);
     894: extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
     895: int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
     896:                     void __user *, size_t *, loff_t *);
     897: int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
     898:                     void __user *, size_t *, loff_t *);
     899: int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
     900:             void __user *, size_t *, loff_t *);
     901: int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
     902:             void __user *, size_t *, loff_t *);
     903: 
     904: extern int numa_zonelist_order_handler(struct ctl_table *, int,
     905:             void __user *, size_t *, loff_t *);
     906: extern char numa_zonelist_order[];
     907: #define NUMA_ZONELIST_ORDER_LEN 16    /* string buffer size */
     908: 
     909: #ifndef CONFIG_NEED_MULTIPLE_NODES
     910: 
     911: extern struct pglist_data contig_page_data;
     912: #define NODE_DATA(nid)        (&contig_page_data)
     913: #define NODE_MEM_MAP(nid)    mem_map
     914: 
     915: #else /* CONFIG_NEED_MULTIPLE_NODES */
     916: 
     917: #include <asm/mmzone.h>
     918: 
     919: #endif /* !CONFIG_NEED_MULTIPLE_NODES */
     920: 
     921: extern struct pglist_data *first_online_pgdat(void);
     922: extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
     923: extern struct zone *next_zone(struct zone *zone);
     924: 
     925: /**
     926:  * for_each_online_pgdat - helper macro to iterate over all online nodes
     927:  * @pgdat - pointer to a pg_data_t variable
     928:  */
     929: #define for_each_online_pgdat(pgdat)            \
     930:     for (pgdat = first_online_pgdat();        \
     931:          pgdat;                    \
     932:          pgdat = next_online_pgdat(pgdat))
     933: /**
     934:  * for_each_zone - helper macro to iterate over all memory zones
     935:  * @zone - pointer to struct zone variable
     936:  *
     937:  * The user only needs to declare the zone variable, for_each_zone
     938:  * fills it in.
     939:  */
     940: #define for_each_zone(zone)                    \
     941:     for (zone = (first_online_pgdat())->node_zones; \
     942:          zone;                    \
     943:          zone = next_zone(zone))
     944: 
     945: #define for_each_populated_zone(zone)                \
     946:     for (zone = (first_online_pgdat())->node_zones; \
     947:          zone;                    \
     948:          zone = next_zone(zone))            \
     949:         if (!populated_zone(zone))        \
     950:             ; /* do nothing */        \
     951:         else
     952: 
     953: static inline struct zone *zonelist_zone(struct zoneref *zoneref)
     954: {
     955:     return zoneref->zone;
     956: }
     957: 
     958: static inline int zonelist_zone_idx(struct zoneref *zoneref)
     959: {
     960:     return zoneref->zone_idx;
     961: }
     962: 
     963: static inline int zonelist_node_idx(struct zoneref *zoneref)
     964: {
     965: #ifdef CONFIG_NUMA
     966:     /* zone_to_nid not available in this context */
     967:     return zoneref->zone->node;
     968: #else
     969:     return 0;
     970: #endif /* CONFIG_NUMA */
     971: }
     972: 
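/*
 * Illustrative sketch, not part of mmzone.h: _zonerefs[] is terminated by an
 * entry whose zone pointer is NULL, and each entry caches the zone index so
 * the fast path never recomputes zone_idx().  The helper name below is
 * hypothetical.
 */
static inline int count_zonerefs(struct zonelist *zonelist)
{
	struct zoneref *z;
	int n = 0;

	for (z = zonelist->_zonerefs; zonelist_zone(z); z++)
		n++;
	return n;
}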
     973: /**
     974:  * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
     975:  * @z - The cursor used as a starting point for the search
     976:  * @highest_zoneidx - The zone index of the highest zone to return
     977:  * @nodes - An optional nodemask to filter the zonelist with
     978:  * @zone - The first suitable zone found is returned via this parameter
     979:  *
     980:  * This function returns the next zone at or below a given zone index that is
     981:  * within the allowed nodemask using a cursor as the starting point for the
     982:  * search. The zoneref returned is a cursor that represents the current zone
     983:  * being examined. It should be advanced by one before calling
     984:  * next_zones_zonelist again.
     985:  */
     986: struct zoneref *next_zones_zonelist(struct zoneref *z,
     987:                     enum zone_type highest_zoneidx,
     988:                     nodemask_t *nodes,
     989:                     struct zone **zone);
     990: 
     991: /**
     992:  * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
     993:  * @zonelist - The zonelist to search for a suitable zone
     994:  * @highest_zoneidx - The zone index of the highest zone to return
     995:  * @nodes - An optional nodemask to filter the zonelist with
     996:  * @zone - The first suitable zone found is returned via this parameter
     997:  *
     998:  * This function returns the first zone at or below a given zone index that is
     999:  * within the allowed nodemask. The zoneref returned is a cursor that can be
    1000:  * used to iterate the zonelist with next_zones_zonelist by advancing it by
    1001:  * one before calling.
    1002:  */
    1003: static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
    1004:                     enum zone_type highest_zoneidx,
    1005:                     nodemask_t *nodes,
    1006:                     struct zone **zone)
    1007: {
    1008:     return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
    1009:                                 zone);
    1010: }
    1011: 
    1012: /**
    1013:  * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
    1014:  * @zone - The current zone in the iterator
    1015:  * @z - The current pointer within zonelist->zones being iterated
    1016:  * @zlist - The zonelist being iterated
    1017:  * @highidx - The zone index of the highest zone to return
    1018:  * @nodemask - Nodemask allowed by the allocator
    1019:  *
     1020:  * This iterator iterates through all zones at or below a given zone index and
    1021:  * within a given nodemask
    1022:  */
    1023: #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
    1024:     for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone);    \
    1025:         zone;                            \
    1026:         z = next_zones_zonelist(++z, highidx, nodemask, &zone))    \
    1027: 
    1028: /**
    1029:  * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
    1030:  * @zone - The current zone in the iterator
    1031:  * @z - The current pointer within zonelist->zones being iterated
    1032:  * @zlist - The zonelist being iterated
    1033:  * @highidx - The zone index of the highest zone to return
    1034:  *
     1035:  * This iterator iterates through all zones at or below a given zone index.
    1036:  */
    1037: #define for_each_zone_zonelist(zone, z, zlist, highidx) \
    1038:     for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
    1039: 
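/*
 * Illustrative sketch, not part of mmzone.h: typical use of the iterator is
 * to visit every zone usable for an allocation capped at 'high_zoneidx',
 * much like the central loop of get_page_from_freelist().  The helper name
 * below is hypothetical.
 */
static inline struct zone *first_populated_zone_in(struct zonelist *zlist,
						   enum zone_type high_zoneidx)
{
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist(zone, z, zlist, high_zoneidx)
		if (populated_zone(zone))
			return zone;
	return NULL;
}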
    1040: #ifdef CONFIG_SPARSEMEM
    1041: #include <asm/sparsemem.h>
    1042: #endif
    1043: 
    1044: #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
    1045:     !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
    1046: static inline unsigned long early_pfn_to_nid(unsigned long pfn)
    1047: {
    1048:     return 0;
    1049: }
    1050: #endif
    1051: 
    1052: #ifdef CONFIG_FLATMEM
    1053: #define pfn_to_nid(pfn)        (0)
    1054: #endif
    1055: 
    1056: #ifdef CONFIG_SPARSEMEM
    1057: 
    1058: /*
    1059:  * SECTION_SHIFT            #bits space required to store a section #
    1060:  *
    1061:  * PA_SECTION_SHIFT        physical address to/from section number
    1062:  * PFN_SECTION_SHIFT        pfn to/from section number
    1063:  */
    1064: #define PA_SECTION_SHIFT    (SECTION_SIZE_BITS)
    1065: #define PFN_SECTION_SHIFT    (SECTION_SIZE_BITS - PAGE_SHIFT)
    1066: 
    1067: #define NR_MEM_SECTIONS        (1UL << SECTIONS_SHIFT)
    1068: 
    1069: #define PAGES_PER_SECTION       (1UL << PFN_SECTION_SHIFT)
    1070: #define PAGE_SECTION_MASK    (~(PAGES_PER_SECTION-1))
    1071: 
    1072: #define SECTION_BLOCKFLAGS_BITS \
    1073:     ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
    1074: 
    1075: #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
    1076: #error Allocator MAX_ORDER exceeds SECTION_SIZE
    1077: #endif
    1078: 
    1079: #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
    1080: #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
    1081: 
    1082: #define SECTION_ALIGN_UP(pfn)    (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
    1083: #define SECTION_ALIGN_DOWN(pfn)    ((pfn) & PAGE_SECTION_MASK)
    1084: 
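/*
 * Illustrative worked example, not part of mmzone.h, assuming the x86_64
 * defaults SECTION_SIZE_BITS == 27 and PAGE_SHIFT == 12:
 *
 *	PFN_SECTION_SHIFT == 15, PAGES_PER_SECTION == 32768 (128 MiB/section)
 *	pfn_to_section_nr(pfn)      == pfn >> 15
 *	SECTION_ALIGN_DOWN(0x12345) == 0x10000
 *	SECTION_ALIGN_UP(0x12345)   == 0x18000
 */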
    1085: struct page;
    1086: struct page_cgroup;
    1087: struct mem_section {
    1088:     /*
    1089:      * This is, logically, a pointer to an array of struct
    1090:      * pages.  However, it is stored with some other magic.
    1091:      * (see sparse.c::sparse_init_one_section())
    1092:      *
    1093:      * Additionally during early boot we encode node id of
    1094:      * the location of the section here to guide allocation.
    1095:      * (see sparse.c::memory_present())
    1096:      *
    1097:      * Making it a UL at least makes someone do a cast
    1098:      * before using it wrong.
    1099:      */
    1100:     unsigned long section_mem_map;
    1101: 
    1102:     /* See declaration of similar field in struct zone */
    1103:     unsigned long *pageblock_flags;
    1104: #ifdef CONFIG_MEMCG
    1105:     /*
     1106:      * In SPARSEMEM, pgdat does not carry the page_cgroup pointer; it is
     1107:      * kept in the section instead. (see memcontrol.h/page_cgroup.h about this.)
    1108:      */
    1109:     struct page_cgroup *page_cgroup;
    1110:     unsigned long pad;
    1111: #endif
    1112:     /*
    1113:      * WARNING: mem_section must be a power-of-2 in size for the
    1114:      * calculation and use of SECTION_ROOT_MASK to make sense.
    1115:      */
    1116: };
    1117: 
    1118: #ifdef CONFIG_SPARSEMEM_EXTREME
    1119: #define SECTIONS_PER_ROOT       (PAGE_SIZE / sizeof (struct mem_section))
    1120: #else
    1121: #define SECTIONS_PER_ROOT    1
    1122: #endif
    1123: 
    1124: #define SECTION_NR_TO_ROOT(sec)    ((sec) / SECTIONS_PER_ROOT)
    1125: #define NR_SECTION_ROOTS    DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
    1126: #define SECTION_ROOT_MASK    (SECTIONS_PER_ROOT - 1)
    1127: 
    1128: #ifdef CONFIG_SPARSEMEM_EXTREME
    1129: extern struct mem_section *mem_section[NR_SECTION_ROOTS];
    1130: #else
    1131: extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
    1132: #endif
    1133: 
    1134: static inline struct mem_section *__nr_to_section(unsigned long nr)
    1135: {
    1136:     if (!mem_section[SECTION_NR_TO_ROOT(nr)])
    1137:         return NULL;
    1138:     return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
    1139: }
    1140: extern int __section_nr(struct mem_section* ms);
    1141: extern unsigned long usemap_size(void);
    1142: 
    1143: /*
    1144:  * We use the lower bits of the mem_map pointer to store
    1145:  * a little bit of information.  There should be at least
    1146:  * 3 bits here due to 32-bit alignment.
    1147:  */
    1148: #define    SECTION_MARKED_PRESENT    (1UL<<0)
    1149: #define SECTION_HAS_MEM_MAP    (1UL<<1)
    1150: #define SECTION_MAP_LAST_BIT    (1UL<<2)
    1151: #define SECTION_MAP_MASK    (~(SECTION_MAP_LAST_BIT-1))
    1152: #define SECTION_NID_SHIFT    2
    1153: 
    1154: static inline struct page *__section_mem_map_addr(struct mem_section *section)
    1155: {
    1156:     unsigned long map = section->section_mem_map;
    1157:     map &= SECTION_MAP_MASK;
    1158:     return (struct page *)map;
    1159: }
    1160: 
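/*
 * Illustrative sketch, not part of mmzone.h: sparse_init_one_section() packs
 * the mem_map pointer and the low flag bits into section_mem_map, so decoding
 * is pure masking, as __section_mem_map_addr() shows above.  A hypothetical
 * accessor for just the flag bits:
 */
static inline unsigned long section_map_flags(struct mem_section *section)
{
	return section->section_mem_map & ~SECTION_MAP_MASK;
}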
    1161: static inline int present_section(struct mem_section *section)
    1162: {
    1163:     return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
    1164: }
    1165: 
    1166: static inline int present_section_nr(unsigned long nr)
    1167: {
    1168:     return present_section(__nr_to_section(nr));
    1169: }
    1170: 
    1171: static inline int valid_section(struct mem_section *section)
    1172: {
    1173:     return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
    1174: }
    1175: 
    1176: static inline int valid_section_nr(unsigned long nr)
    1177: {
    1178:     return valid_section(__nr_to_section(nr));
    1179: }
    1180: 
    1181: static inline struct mem_section *__pfn_to_section(unsigned long pfn)
    1182: {
    1183:     return __nr_to_section(pfn_to_section_nr(pfn));
    1184: }
    1185: 
    1186: #ifndef CONFIG_HAVE_ARCH_PFN_VALID
    1187: static inline int pfn_valid(unsigned long pfn)
    1188: {
    1189:     if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
    1190:         return 0;
    1191:     return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
    1192: }
    1193: #endif
    1194: 
    1195: static inline int pfn_present(unsigned long pfn)
    1196: {
    1197:     if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
    1198:         return 0;
    1199:     return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
    1200: }
    1201: 
    1202: /*
    1203:  * These are _only_ used during initialisation, therefore they
    1204:  * can use __initdata ...  They could have names to indicate
    1205:  * this restriction.
    1206:  */
    1207: #ifdef CONFIG_NUMA
    1208: #define pfn_to_nid(pfn)                            \
    1209: ({                                    \
    1210:     unsigned long __pfn_to_nid_pfn = (pfn);                \
    1211:     page_to_nid(pfn_to_page(__pfn_to_nid_pfn));            \
    1212: })
    1213: #else
    1214: #define pfn_to_nid(pfn)        (0)
    1215: #endif
    1216: 
    1217: #define early_pfn_valid(pfn)    pfn_valid(pfn)
    1218: void sparse_init(void);
    1219: #else
    1220: #define sparse_init()    do {} while (0)
    1221: #define sparse_index_init(_sec, _nid)  do {} while (0)
    1222: #endif /* CONFIG_SPARSEMEM */
    1223: 
    1224: #ifdef CONFIG_NODES_SPAN_OTHER_NODES
    1225: bool early_pfn_in_nid(unsigned long pfn, int nid);
    1226: #else
    1227: #define early_pfn_in_nid(pfn, nid)    (1)
    1228: #endif
    1229: 
    1230: #ifndef early_pfn_valid
    1231: #define early_pfn_valid(pfn)    (1)
    1232: #endif
    1233: 
    1234: void memory_present(int nid, unsigned long start, unsigned long end);
    1235: unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
    1236: 
    1237: /*
    1238:  * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
     1239:  * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
    1240:  * pfn_valid_within() should be used in this case; we optimise this away
    1241:  * when we have no holes within a MAX_ORDER_NR_PAGES block.
    1242:  */
    1243: #ifdef CONFIG_HOLES_IN_ZONE
    1244: #define pfn_valid_within(pfn) pfn_valid(pfn)
    1245: #else
    1246: #define pfn_valid_within(pfn) (1)
    1247: #endif
    1248: 
    1249: #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
    1250: /*
    1251:  * pfn_valid() is meant to be able to tell if a given PFN has valid memmap
    1252:  * associated with it or not. In FLATMEM, it is expected that holes always
    1253:  * have valid memmap as long as there is valid PFNs either side of the hole.
    1254:  * In SPARSEMEM, it is assumed that a valid section has a memmap for the
    1255:  * entire section.
    1256:  *
     1257:  * However, ARM, and maybe other embedded architectures in the future,
     1258:  * free the memmap backing holes to save memory on the assumption the memmap
     1259:  * is never used. The page_zone linkages are then broken even though pfn_valid()
    1260:  * returns true. A walker of the full memmap must then do this additional
    1261:  * check to ensure the memmap they are looking at is sane by making sure
    1262:  * the zone and PFN linkages are still valid. This is expensive, but walkers
    1263:  * of the full memmap are extremely rare.
    1264:  */
    1265: int memmap_valid_within(unsigned long pfn,
    1266:                     struct page *page, struct zone *zone);
    1267: #else
    1268: static inline int memmap_valid_within(unsigned long pfn,
    1269:                     struct page *page, struct zone *zone)
    1270: {
    1271:     return 1;
    1272: }
    1273: #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
    1274: 
     1275: #endif /* !__GENERATING_BOUNDS_H */
    1276: #endif /* !__ASSEMBLY__ */
    1277: #endif /* _LINUX_MMZONE_H */
    1278: