Android kswapd-->lowmemorykiller启动和扫描过程
16lz
2021-01-23
注册kswapd module,kernel启动时调用@kernel/mm:
/*
 * NOTE(review): excerpt quoted from kernel mm/vmscan.c for this article.
 * "..." marks source elided by the author. Reformatted for readability
 * only; code tokens are unchanged.
 */
module_init(kswapd_init)

/* Boot-time init: start one kswapd kthread for every node with memory. */
static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	for_each_node_state(nid, N_MEMORY)
		kswapd_run(nid);
	if (kswapd_cpu_mask == NULL)
		hotcpu_notifier(cpu_callback, 0);
	return 0;
}

/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
 */
int kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	/* Already running for this node — nothing to do. */
	if (pgdat->kswapd)
		return 0;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state == SYSTEM_BOOTING);
		pr_err("Failed to start kswapd on node %d\n", nid);
		ret = PTR_ERR(pgdat->kswapd);
		pgdat->kswapd = NULL;
	} else if (kswapd_cpu_mask) {
		if (set_kswapd_cpu_mask(pgdat))
			pr_warn("error setting kswapd cpu affinity mask\n");
	}
	return ret;
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
...
		/*
		 * We can speed up thawing tasks if we don't call balance_pgdat
		 * after returning from the refrigerator
		 */
		if (!ret) {
			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
			balanced_classzone_idx = classzone_idx;
			/* Main reclaim work: balance all zones of this node. */
			balanced_order = balance_pgdat(pgdat, order,
						&balanced_classzone_idx);
		}
	}

	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
	current->reclaim_state = NULL;
	lockdep_clear_current_reclaim_state();

	return 0;
}

/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * Returns the final order kswapd was reclaiming at
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim. Mark the zone as
 * dead and from now on, only perform a short scan. Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction. It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
 * lower zones regardless of the number of free pages in the lower zones. This
 * interoperates with the page allocator fallback scheme to ensure that aging
 * of pages is balanced across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
					int *classzone_idx)
{
...
			/*
			 * There should be no need to raise the scanning
			 * priority if enough pages are already being scanned
			 * that that high watermark would be met at 100%
			 * efficiency.
			 */
			if (kswapd_shrink_zone(zone, end_zone, &sc,
					lru_pages, &nr_attempted))
}

/*
 * kswapd shrinks the zone by the number of pages required to reach
 * the high watermark.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
static bool kswapd_shrink_zone(struct zone *zone,
			       int classzone_idx,
			       struct scan_control *sc,
			       unsigned long lru_pages,
			       unsigned long *nr_attempted)
{
...
	/* This is where zone reclaim hands off to the slab shrinkers. */
	reclaim_state->reclaimed_slab = 0;
	shrink_slab(&shrink, sc->nr_scanned, lru_pages);
	sc->nr_reclaimed += reclaim_state->reclaimed_slab;
...
}

/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object. With this in mind we age equal
 * percentages of the lru and ageable caches. This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increase the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt. It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(struct shrink_control *shrinkctl,
			  unsigned long nr_pages_scanned,
			  unsigned long lru_pages)
{
	struct shrinker *shrinker;
...
		/* Walk every online node this shrink_control targets. */
		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
			if (node_online(shrinkctl->nid))
				freed += shrink_slab_node(shrinkctl, shrinker,
						nr_pages_scanned, lru_pages);
		}
...
}

static unsigned long
shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
		 unsigned long nr_pages_scanned, unsigned long lru_pages)
{
...
	/* Ask the shrinker how many objects it could free right now. */
	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0)
		return 0;
...
	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan > min_cache_size || total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		/* Invoke the shrinker callback (e.g. lowmem_scan below). */
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, nr_to_scan);
		total_scan -= nr_to_scan;

		cond_resched();
	}
此处的 shrinker 回调就对应 lowmemorykiller 注册的 shrinker(位于 @kernel/drivers/staging/):
/*
 * NOTE(review): excerpt quoted from the lowmemorykiller driver; "..." marks
 * source elided by the author. Reformatted for readability only.
 */

/* Shrinker scan callback: pick a victim task by oom adj and kill it. */
static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
{
...
	/* A victim task has been selected — mark it and send SIGKILL. */
	set_tsk_thread_flag(selected, TIF_MEMDIE);
	send_sig(SIGKILL, selected, 0);
...
}

/* Shrinker count callback: report all anon+file LRU pages as "freeable". */
static unsigned long lowmem_count(struct shrinker *s,
				  struct shrink_control *sc)
{
	return global_page_state(NR_ACTIVE_ANON) +
		global_page_state(NR_ACTIVE_FILE) +
		global_page_state(NR_INACTIVE_ANON) +
		global_page_state(NR_INACTIVE_FILE);
}

static struct shrinker lowmem_shrinker = {
	.scan_objects = lowmem_scan,
	.count_objects = lowmem_count,
	.seeks = DEFAULT_SEEKS * 16
};

/* Module init: hook into the shrinker chain walked by shrink_slab(). */
static int __init lowmem_init(void)
{
	register_shrinker(&lowmem_shrinker);
	vmpressure_notifier_register(&lmk_vmpr_nb);
	return 0;
}
所以一般在 Android 里遇到 kswapd 占用 CPU 资源较多时,可以优化 lowmemorykiller 里的 scan 过程,尽量让系统保有更多的空闲 memory,减少 kswapd 回收时反复 scan 的开销。
AMS在更新oom adj时则是通过下面的流程@frameworks/base/services/core/java/com/android/server/am:
//通过localsocket 通到native的lmkd daemon private static void writeLmkd(ByteBuffer buf) { for (int i = 0; i < 3; i++) { if (sLmkdSocket == null) { if (openLmkdSocket() == false) { try { Thread.sleep(1000); } catch (InterruptedException ie) { } continue; } } try { sLmkdOutputStream.write(buf.array(), 0, buf.position()); return; } catch (IOException ex) { Slog.w(TAG, "Error writing to lowmemorykiller socket"); try { sLmkdSocket.close(); } catch (IOException ex2) { } sLmkdSocket = null; } } } private static boolean openLmkdSocket() { try { sLmkdSocket = new LocalSocket(LocalSocket.SOCKET_SEQPACKET); sLmkdSocket.connect( new LocalSocketAddress("lmkd", LocalSocketAddress.Namespace.RESERVED)); sLmkdOutputStream = sLmkdSocket.getOutputStream(); } catch (IOException ex) { Slog.w(TAG, "lowmemorykiller daemon socket open failed"); sLmkdSocket = null; return false; } return true; }
native lmkd daemon:
@system/core/lmkd
/*
 * lmkd init (excerpt; "..." marks elided source): obtain the "lmkd"
 * control socket that init created, so AMS can connect to it.
 */
static int init(void) {
...
    ctrl_lfd = android_get_control_socket("lmkd");
    if (ctrl_lfd < 0) {
        ALOGE("get lmkd control socket failed");
        return -1;
    }
...
}
数据处理回调:
/* epoll event callback for the AMS control connection. */
static void ctrl_data_handler(uint32_t events) {
    if (events & EPOLLHUP) {
        /* Peer (ActivityManager) went away; drop the data fd. */
        ALOGI("ActivityManager disconnected");
        if (!ctrl_dfd_reopened)
            ctrl_data_close();
    } else if (events & EPOLLIN) {
        ctrl_command_handler();
    }
}

/*
 * Read and dispatch one command packet from AMS. Packet layout is
 * network-order ints: [cmd, arg0, arg1, ...]; each command validates
 * its own argument count before dispatch.
 */
static void ctrl_command_handler(void) {
    int ibuf[CTRL_PACKET_MAX / sizeof(int)];
    int len;
    int cmd = -1;
    int nargs;
    int targets;

    len = ctrl_data_read((char *)ibuf, CTRL_PACKET_MAX);
    if (len <= 0)
        return;

    /* Everything after the command word counts as an argument. */
    nargs = len / sizeof(int) - 1;
    if (nargs < 0)
        goto wronglen;

    cmd = ntohl(ibuf[0]);
    switch(cmd) {
    case LMK_TARGET:
        /* Arguments come in (minfree, adj) pairs. */
        targets = nargs / 2;
        if (nargs & 0x1 || targets > (int)ARRAY_SIZE(lowmem_adj))
            goto wronglen;
        cmd_target(targets, &ibuf[1]);
        break;
    case LMK_PROCPRIO:
        if (nargs != 3)
            goto wronglen;
        cmd_procprio(ntohl(ibuf[1]), ntohl(ibuf[2]), ntohl(ibuf[3]));
        break;
    case LMK_PROCREMOVE:
        if (nargs != 1)
            goto wronglen;
        cmd_procremove(ntohl(ibuf[1]));
        break;
    default:
        ALOGE("Received unknown command code %d", cmd);
        return;
    }

    return;

wronglen:
    ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
}
最终由 lmkd 写到内核对应的 /sys、/proc 节点:
#define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"#define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj""/proc/%d/oom_score_adj"
更多相关文章
- Android 4.0 gallery2 生成video thumbnail的过程
- Android应用程序绑定服务(bindService)的过程源代码分析(3)
- Android应用程序组件Content Provider的启动过程源代码分析(3)
- Android应用程序组件Content Provider的启动过程源代码分析(5)
- android 9.0 startService启动Service的过程分析
- Android一键锁屏开发全过程【源码】【附图】
- Android Camera调用过程分析
- android系统开发小问题-启动过程中android字符没有显示出来