/*
 * Copyright (c) 1982, 1986 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 *
 *	@(#)vm_page.c	7.1 (Berkeley) 6/5/86
 */

#include "../machine/reg.h"
#include "../machine/pte.h"

#include "param.h"
#include "systm.h"
#include "inode.h"
#include "dir.h"
#include "user.h"
#include "proc.h"
#include "buf.h"
#include "text.h"
#include "cmap.h"
#include "vm.h"
#include "file.h"
#include "trace.h"

int nohash = 0;
/*
 * Handle a page fault.
 *
 * Basic outline:
 *	If page is allocated, but just not valid:
 *		Wait if intransit, else just revalidate.
 *		Done.
 *	Compute <dev,bn> from which the page operation would take place.
 *	If page is a text page, and filling from file system or swap space:
 *		If in free list cache, reattach it and then done.
 *	Allocate memory for the pagein.
 *	If we block here, restart, because we could have swapped, etc.
 *	Lock process from swapping for the duration.
 *	Update ptes to reflect that page is intransit.
 *	If page is zero fill on demand:
 *		Clear pages and flush free list cache of stale caching
 *		for this swap page (e.g. before initializing again due
 *		to 407/410 exec).
 *	If page is fill from file and in buffer cache:
 *		Copy the page from the buffer cache.
 *	If not a fill on demand:
 *		Determine swap address and cluster to page in.
 *		Do the swap to bring the page in.
 *		Instrument the pagein.
 *	After swap, validate the required new page;
 *	leave prepaged pages reclaimable (not valid).
 *	Update shared copies of text page tables.
 *	Complete bookkeeping on pages brought in:
 *		No longer intransit.
 *		Hash text pages into core hash structure.
 *		Unlock pages (modulo raw i/o requirements).
 *		Flush translation buffer.
 *	Process pagein is done.
 */
#ifdef TRACE
#define	pgtrace(e)	trace(e, v, u.u_procp->p_pid)
#else
#define	pgtrace(e)
#endif

int preptofree = 1;	/* send pre-paged pages to free list */
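
/*
 * A sketch of the pte states distinguished below (summarizing the
 * tests made in pagein(), not an exhaustive state table):
 *	pg_v set			valid; a fault here is a bug (panic)
 *	pg_fod == 0, pg_pfnum != 0	reclaimable, or text intransit
 *	pg_fod == 0, pg_pfnum == 0	page is on the swap device
 *	pg_fod != 0, PG_FZERO		zero fill on demand
 *	pg_fod != 0, PG_FTEXT		fill on demand from the text file
 */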

pagein(virtaddr, dlyu)
	unsigned virtaddr;
	int dlyu;
{
	register struct proc *p;
	register struct pte *pte;
	register unsigned v;
	unsigned pf;
	int type, fileno;
	struct pte opte;
	dev_t dev;
	register int i;
	int klsize;
	unsigned vsave;
	struct cmap *c;
	int j;
	daddr_t bn, bncache, bnswap;
	int si, sk;
	int swerror = 0;
#ifdef PGINPROF
#include "../vax/mtpr.h"
	int otime, olbolt, oicr, s;
	long a;

	s = splclock();
	otime = time, olbolt = lbolt, oicr = mfpr(ICR);
#endif
	cnt.v_faults++;
	/*
	 * Classify the faulted page into a segment and get a pte
	 * for the faulted page.
	 */
	vsave = v = clbase(btop(virtaddr));
	p = u.u_procp;
	if (isatsv(p, v))
		type = CTEXT;
	else if (isassv(p, v))
		type = CSTACK;
	else
		type = CDATA;
	pte = vtopte(p, v);
	if (pte->pg_v)
		panic("pagein");

	/*
	 * If the page is reclaimable, reclaim it.
	 * If the page is text and intransit, sleep while it is intransit.
	 * If it is valid after the sleep, we are done.
	 * Otherwise we have to start checking again, since the page could
	 * even be reclaimable now (we may have swapped for a long time).
	 */
restart:
	if (pte->pg_fod == 0 && pte->pg_pfnum) {
		if (type == CTEXT && cmap[pgtocm(pte->pg_pfnum)].c_intrans) {
			pgtrace(TR_INTRANS);
			sleep((caddr_t)p->p_textp, PSWP+1);
			pgtrace(TR_EINTRANS);
			pte = vtopte(p, v);
			if (pte->pg_v) {
valid:
				if (dlyu) {
					c = &cmap[pgtocm(pte->pg_pfnum)];
					if (c->c_lock) {
						c->c_want = 1;
						sleep((caddr_t)c, PSWP+1);
						goto restart;
					}
					c->c_lock = 1;
				}
				newptes(pte, v, CLSIZE);
				cnt.v_intrans++;
				return;
			}
			goto restart;
		}
		/*
		 * If the page is in the free list, take it
		 * back into the resident set, updating
		 * the size recorded for the resident set.
		 */
		si = splimp();
		c = &cmap[pgtocm(pte->pg_pfnum)];
		if (c->c_free) {
			pgtrace(TR_FRECLAIM);
			munlink(c);
			cnt.v_pgfrec++;
			if (type == CTEXT)
				p->p_textp->x_rssize += CLSIZE;
			else
				p->p_rssize += CLSIZE;
		} else
			pgtrace(TR_RECLAIM);
		splx(si);
		pte->pg_v = 1;
		if (anycl(pte, pg_m))
			pte->pg_m = 1;
		distcl(pte);
		if (type == CTEXT)
			distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
		u.u_ru.ru_minflt++;
		cnt.v_pgrec++;
		if (dlyu) {
			c = &cmap[pgtocm(pte->pg_pfnum)];
			if (c->c_lock) {
				c->c_want = 1;
				sleep((caddr_t)c, PSWP+1);
				goto restart;
			}
			c->c_lock = 1;
		}
		newptes(pte, v, CLSIZE);
#ifdef PGINPROF
		a = vmtime(otime, olbolt, oicr);
		rectime += a;
		if (a >= 0)
			vmfltmon(rmon, a, rmonmin, rres, NRMON);
		splx(s);
#endif
		return;
	}
#ifdef PGINPROF
	splx(s);
#endif
	/*
	 * <dev,bn> is where data comes from/goes to.
	 * <dev,bncache> is where data is cached from/to.
	 * <swapdev,bnswap> is where data will eventually go.
	 */
	if (pte->pg_fod == 0) {
		fileno = -1;
		bnswap = bncache = bn = vtod(p, v, &u.u_dmap, &u.u_smap);
		dev = swapdev;
	} else {
		fileno = ((struct fpte *)pte)->pg_fileno;
		bn = ((struct fpte *)pte)->pg_blkno;
		bnswap = vtod(p, v, &u.u_dmap, &u.u_smap);
		if (fileno > PG_FMAX)
			panic("pagein pg_fileno");
		if (fileno == PG_FTEXT) {
			if (p->p_textp == 0)
				panic("pagein PG_FTEXT");
			dev = p->p_textp->x_iptr->i_dev;
			bncache = bn;
		} else if (fileno == PG_FZERO) {
			dev = swapdev;
			bncache = bnswap;
		} else {
			panic("pagein");	/* can't happen */
		}
	}
	klsize = 1;
	opte = *pte;
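
	/*
	 * A sketch of the three cases set up above: a swap-resident
	 * page has dev == swapdev and bn == bncache == bnswap; a text
	 * page filling from the file system has dev == the text file's
	 * device and bn == bncache == the file system block, with
	 * bnswap naming the swap block the page will eventually be
	 * pushed to; a zero-fill page has dev == swapdev and
	 * bncache == bnswap.
	 */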

	/*
	 * Check for text detached but in the free list.
	 * This can happen only if the page is filling
	 * from an inode or from the swap device (e.g. not when reading
	 * in 407/410 execs to a zero fill page).
	 * Honor the lock bit to avoid races with pageouts.
	 */
	if (type == CTEXT && fileno != PG_FZERO && !nohash) {
		si = splimp();
		while ((c = mfind(dev, bncache)) != 0) {
			if (c->c_lock == 0)
				break;
			MLOCK(c);
			MUNLOCK(c);
		}
		if (c) {
			if (c->c_type != CTEXT || c->c_gone == 0 ||
			    c->c_free == 0)
				panic("pagein mfind");
			p->p_textp->x_rssize += CLSIZE;
			/*
			 * Following code mimics memall().
			 */
			munlink(c);
			pf = cmtopg(c - cmap);
			for (j = 0; j < CLSIZE; j++) {
				*(int *)pte = pf++;
				pte->pg_prot = opte.pg_prot;
				pte++;
			}
			pte -= CLSIZE;
			c->c_free = 0;
			c->c_gone = 0;
			if (c->c_intrans || c->c_want)
				panic("pagein intrans|want");
			c->c_lock = 1;
			if (c->c_page != vtotp(p, v))
				panic("pagein c_page chgd");
			c->c_ndx = p->p_textp - &text[0];
			if (dev == swapdev) {
				cnt.v_xsfrec++;
				pgtrace(TR_XSFREC);
			} else {
				cnt.v_xifrec++;
				pgtrace(TR_XIFREC);
			}
			cnt.v_pgrec++;
			u.u_ru.ru_minflt++;
			if (dev != swapdev) {
				c = mfind(swapdev, bnswap);
				if (c)
					munhash(swapdev, bnswap);
				pte->pg_m = 1;
			}
			splx(si);
			goto skipswap;
		}
		splx(si);
	}

	/*
	 * Wasn't reclaimable or reattachable.
	 * Have to prepare to bring the page in.
	 * We allocate the page before locking so we will
	 * be swappable if there is no free memory.
	 * If we block, we have to start over, since anything
	 * could have happened.
	 */
	sk = splimp();		/* lock memalls from here into kluster */
	if (freemem < CLSIZE * KLMAX) {
		pgtrace(TR_WAITMEM);
		while (freemem < CLSIZE * KLMAX)
			sleep((caddr_t)&freemem, PSWP+2);
		pgtrace(TR_EWAITMEM);
		splx(sk);
		pte = vtopte(p, v);
		if (pte->pg_v)
			goto valid;
		goto restart;
	}

	/*
	 * Memory is available, so we are now committed to
	 * bringing in the page.  Lock this process, get a page,
	 * construct the new pte, and increment
	 * the (process or text) resident set size.
	 */
	p->p_flag |= SPAGE;
	if (memall(pte, CLSIZE, p, type) == 0)
		panic("pagein memall");
	pte->pg_prot = opte.pg_prot;
	pf = pte->pg_pfnum;
	cmap[pgtocm(pf)].c_intrans = 1;
	distcl(pte);
	if (type == CTEXT) {
		p->p_textp->x_rssize += CLSIZE;
		distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
	} else
		p->p_rssize += CLSIZE;

	/*
	 * Two cases: either fill on demand (zero, or from file or text)
	 * or fill from swap space.
	 */
	if (opte.pg_fod) {
		pte->pg_m = 1;
		if (fileno == PG_FZERO || fileno == PG_FTEXT) {
			/*
			 * Flush any previous text page use of this
			 * swap device block.
			 */
			si = splimp();
			if (type == CTEXT) {
				c = mfind(swapdev, bnswap);
				if (c)
					munhash(swapdev, bnswap);
			}
			splx(si);
			/*
			 * If zero fill, short-circuit the hard work
			 * by just clearing the pages.
			 */
			if (fileno == PG_FZERO) {
				pgtrace(TR_ZFOD);
				for (i = 0; i < CLSIZE; i++)
					clearseg(pf+i);
				if (type != CTEXT)
					cnt.v_zfod += CLSIZE;
				splx(sk);
				goto skipswap;
			}
			pgtrace(TR_EXFOD);
			cnt.v_exfod += CLSIZE;
		} else
			panic("pagein vread");
		/*
		 * Fill from inode.  Try to find adjacent
		 * pages to bring in also.
		 */
		v = fodkluster(p, v, pte, &klsize, dev, &bn);
		bncache = bn;
		splx(sk);
		/*
		 * Blocks of an executable may still be in the buffer
		 * cache, so we explicitly flush them out to disk
		 * so that the proper data will be paged in.
		 */
		blkflush(dev, bn, (long)CLSIZE*NBPG);
#ifdef TRACE
		if (type != CTEXT)
			trace(TR_XFODMISS, dev, bn);
#endif
	} else {
		if (opte.pg_pfnum)
			panic("pagein pfnum");
		pgtrace(TR_SWAPIN);
		/*
		 * Fill from the swap area.  Try to find adjacent
		 * pages to bring in also.
		 */
		v = kluster(p, v, pte, B_READ, &klsize,
		    (type == CTEXT) ? kltxt :
		    ((p->p_flag & SSEQL) ? klseql : klin), bn);
		splx(sk);
		/* THIS COULD BE COMPUTED INCREMENTALLY... */
		bncache = bn = vtod(p, v, &u.u_dmap, &u.u_smap);
	}
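
	/*
	 * An example of the klustering arithmetic (hypothetical sizes):
	 * with a kluster size of 4, a fault on cluster 13 gives
	 * kloff = 13 & 3 = 1 in kluster(), so at most one cluster
	 * behind (12) and two ahead (14, 15) can join the transfer;
	 * v is then the base of cluster 12 and klsize is at most 4.
	 */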
	distcl(pte);
	swerror = swap(p, bn, ptob(v), klsize * ctob(CLSIZE),
	    B_READ, B_PGIN, dev, 0);
#ifdef TRACE
	trace(TR_PGINDONE, vsave, u.u_procp->p_pid);
#endif

	/*
	 * Instrumentation.
	 */
	u.u_ru.ru_majflt++;
	cnt.v_pgin++;
	cnt.v_pgpgin += klsize * CLSIZE;
#ifdef PGINPROF
	a = vmtime(otime, olbolt, oicr) / 100;
	pgintime += a;
	if (a >= 0)
		vmfltmon(pmon, a, pmonmin, pres, NPMON);
#endif

skipswap:
	/*
	 * Fix the page table entries.
	 *
	 * Only the page requested in is validated; the rest of the pages
	 * are left ``reclaimable''.  This allows the system to reclaim
	 * prepaged pages quickly if they are not used and memory is tight.
	 */
	pte = vtopte(p, vsave);
	pte->pg_v = 1;
	distcl(pte);
	if (type == CTEXT) {
		if (swerror == 0) {
			distpte(p->p_textp, (unsigned)vtotp(p, vsave), pte);
			if (opte.pg_fod)
				p->p_textp->x_flag |= XWRIT;
		}
		wakeup((caddr_t)p->p_textp);
	}

	/*
	 * Memall returned the page(s) locked.  Unlock all
	 * pages in the cluster.  If locking pages for raw i/o,
	 * leave the page which was required to be paged in locked,
	 * but still unlock the others.
	 * Hash text pages into the cmap hash table.
	 */
	pte = vtopte(p, v);
	for (i = 0; i < klsize; i++) {
		c = &cmap[pgtocm(pte->pg_pfnum)];
		c->c_intrans = 0;
		if (type == CTEXT && c->c_blkno == 0 && bncache && !nohash &&
		    !swerror) {
			mhash(c, dev, bncache);
			bncache += btodb(CLBYTES);
		}
		if (v != vsave || !dlyu)
			MUNLOCK(c);
		if (v != vsave && type != CTEXT && preptofree &&
		    opte.pg_fod == 0) {
			/*
			 * Throw pre-paged data/stack pages at the
			 * bottom of the free list.
			 */
			p->p_rssize -= CLSIZE;
			memfree(pte, CLSIZE, 0);
		}
		newptes(pte, v, CLSIZE);
		v += CLSIZE;
		pte += CLSIZE;
	}

	/*
	 * All done.
	 */
	p->p_flag &= ~SPAGE;

	/*
	 * If the process is declared fifo, memory is tight,
	 * and this was a data page-in, free memory
	 * klsdist pagein clusters away from the current fault.
	 */
	if ((p->p_flag&SSEQL) && freemem < lotsfree && type == CDATA) {
		int k = (vtodp(p, vsave) / CLSIZE) / klseql;
#ifdef notdef
		if (vsave > u.u_vsave)
			k -= klsdist;
		else
			k += klsdist;
		dpageout(p, k * klseql * CLSIZE, klout*CLSIZE);
		u.u_vsave = vsave;
#else
		dpageout(p, (k - klsdist) * klseql * CLSIZE, klout*CLSIZE);
		dpageout(p, (k + klsdist) * klseql * CLSIZE, klout*CLSIZE);
#endif
	}
}
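
/*
 * Example of the fifo distance computation above (hypothetical
 * tuning values): with klseql == 4 and klsdist == 2, a fault at
 * data cluster 40 gives k == 10, so dpageout() is asked to free
 * klout clusters starting at clusters 32 and 48, two pagein
 * groups on either side of the current fault.
 */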

/*
 * Take away n pages of data space
 * starting at data page dp.
 * Used to take pages away from sequential processes.
 * Mimics pieces of the code in pageout() below.
 */
dpageout(p, dp, n)
	struct proc *p;
	int dp, n;
{
	register struct cmap *c;
	int i, klsize;
	register struct pte *pte;
	unsigned v;
	daddr_t daddr;

	if (dp < 0) {
		n += dp;
		dp = 0;
	}
	if (dp + n > p->p_dsize)
		n = p->p_dsize - dp;
	for (i = 0; i < n; i += CLSIZE, dp += CLSIZE) {
		pte = dptopte(p, dp);
		if (pte->pg_fod || pte->pg_pfnum == 0)
			continue;
		c = &cmap[pgtocm(pte->pg_pfnum)];
		if (c->c_lock || c->c_free)
			continue;
		if (pte->pg_v) {
			pte->pg_v = 0;
			if (anycl(pte, pg_m))
				pte->pg_m = 1;
			distcl(pte);
		}
		if (dirtycl(pte)) {
			if (bswlist.av_forw == NULL)
				continue;
			MLOCK(c);
			pte->pg_m = 0;
			distcl(pte);
			p->p_poip++;
			v = kluster(p, dptov(p, dp), pte, B_WRITE,
			    &klsize, klout, (daddr_t)0);
			/* THIS ASSUMES THAT p == u.u_procp */
			daddr = vtod(p, v, &u.u_dmap, &u.u_smap);
			(void)swap(p, daddr, ptob(v), klsize * ctob(CLSIZE),
			    B_WRITE, B_DIRTY, swapdev, pte->pg_pfnum);
		} else {
			if (c->c_gone == 0)
				p->p_rssize -= CLSIZE;
			memfree(pte, CLSIZE, 0);
			cnt.v_seqfree += CLSIZE;
		}
	}
}

unsigned maxdmap;
unsigned maxtsize;

/*
 * Set up the paging constants for the clock algorithm.
 * Called after the system is initialized and the amount of memory
 * and number of paging devices is known.
 *
 * Threshold constants are defined in ../machine/vmparam.h.
 */
vminit()
{

	/*
	 * Lotsfree is the threshold at which the paging daemon turns on.
	 */
	if (lotsfree == 0) {
		lotsfree = LOTSFREE / NBPG;
		if (lotsfree > LOOPPAGES / LOTSFREEFRACT)
			lotsfree = LOOPPAGES / LOTSFREEFRACT;
	}
	/*
	 * Desfree is the amount of memory desired free.
	 * If less than this for an extended period, do swapping.
	 */
	if (desfree == 0) {
		desfree = DESFREE / NBPG;
		if (desfree > LOOPPAGES / DESFREEFRACT)
			desfree = LOOPPAGES / DESFREEFRACT;
	}

	/*
	 * Minfree is the minimal amount of free memory which is tolerable.
	 */
	if (minfree == 0) {
		minfree = MINFREE / NBPG;
		if (minfree > desfree / MINFREEFRACT)
			minfree = desfree / MINFREEFRACT;
	}

	/*
	 * Maxpgio thresholds how much paging is acceptable.
	 * This figures that 2/3 busy on an arm is all that is
	 * tolerable for paging.  We assume one operation per disk rev.
	 */
	if (maxpgio == 0)
		maxpgio = (DISKRPM * 2) / 3;

	/*
	 * Clock to scan using a max of ~10% of processor time for
	 * sampling; this is estimated to allow a maximum of 200
	 * samples per second.  This yields a ``fastscan'' of roughly
	 * (with CLSIZE=2):
	 *	<=1m	2m	3m	4m	8m
	 *	5s	10s	15s	20s	40s
	 */
	if (fastscan == 0)
		fastscan = 200;
	if (fastscan > LOOPPAGES / 5)
		fastscan = LOOPPAGES / 5;

	/*
	 * Set the slow scan rate to 1/2 the fast scan rate.
	 */
	if (slowscan == 0)
		slowscan = fastscan / 2;

	/*
	 * Calculate the swap allocation constants.
	 */
	if (dmmin == 0)
		dmmin = DMMIN;
	if (dmmax == 0) {
		dmmax = DMMAX;
		while (dmapsize(dmmin, dmmax / 2) >= MAXDSIZ && dmmax > dmmin)
			dmmax /= 2;
	}
	maxdmap = dmapsize(dmmin, dmmax);
	if (dmtext == 0)
		dmtext = DMTEXT;
	if (dmtext > dmmax)
		dmtext = dmmax;
	if (maxtsize == 0)
		maxtsize = MAXTSIZ;
	if (maxtsize > dtob(NXDAD * dmtext))
		maxtsize = dtob(NXDAD * dmtext);

	/*
	 * Set up the initial limits on process VM.
	 * Set the maximum resident set size to be all
	 * of (reasonably) available memory.  This causes
	 * any single, large process to start random page
	 * replacement once it fills memory.
	 */
	u.u_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
	u.u_rlimit[RLIMIT_STACK].rlim_max = MIN(MAXSSIZ, maxdmap);
	u.u_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ;
	u.u_rlimit[RLIMIT_DATA].rlim_max = MIN(MAXDSIZ, maxdmap);
	u.u_rlimit[RLIMIT_RSS].rlim_cur = u.u_rlimit[RLIMIT_RSS].rlim_max =
	    ctob(LOOPPAGES - desfree);
	proc[0].p_maxrss = LOOPPAGES - desfree;
}
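
/*
 * Compute the amount of swap space mapped by a disk map whose
 * NDMAP block sizes double from dmin up to a limit of dmax.
 * A worked example with hypothetical values (not from
 * ../machine/vmparam.h): with dmin == 16, dmax == 2048 and
 * NDMAP == 12, the entries run 16, 32, ..., 2048 (8 entries)
 * and then stay at 2048 for the remaining 4, so
 *	size = 16 * (2**8 - 1) + 4 * 2048 = 4080 + 8192 = 12272
 * blocks, and dtob(12272) bytes are returned.
 */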
dmapsize(dmin, dmax)
	int dmin, dmax;
{
	register int i, blk, size = 0;

	blk = dmin;
	for (i = 0; i < NDMAP; i++) {
		size += blk;
		if (blk < dmax)
			blk *= 2;
	}
	return (dtob(size));
}

int pushes;

#define	FRONT	1
#define	BACK	2

/*
 * The page out daemon, which runs as process 2.
 *
 * As long as there are at least lotsfree pages,
 * this process is not run.  When the number of free
 * pages stays in the range desfree to lotsfree,
 * this daemon runs through the pages in the loop
 * at a rate determined in vmsched().  Pageout manages
 * two hands on the clock.  The front hand moves through
 * memory, clearing the valid bit (simulating a reference bit),
 * and stealing pages from procs that are over maxrss.
 * The back hand travels a distance behind the front hand,
 * freeing the pages that have not been referenced in the time
 * since the front hand passed.  If modified, they are pushed to
 * swap before being freed.
 */
pageout()
{
	register int count;
	register int maxhand = pgtocm(maxfree);
	register int fronthand, backhand;

	/*
	 * Set the two clock hands to be separated by a reasonable amount,
	 * but no more than 360 degrees apart.
	 */
	backhand = 0 / CLBYTES;		/* i.e. 0, for symmetry with fronthand */
	fronthand = HANDSPREAD / CLBYTES;
	if (fronthand >= maxhand)
		fronthand = maxhand - 1;

loop:
	/*
	 * Before sleeping, look to see if there are any swap I/O headers
	 * in the ``cleaned'' list that correspond to dirty
	 * pages that have been pushed asynchronously.  If so,
	 * empty the list by calling cleanup().
	 *
	 * N.B.: We guarantee never to block while the cleaned list is
	 * nonempty.
	 */
	(void) splbio();
	if (bclnlist != NULL) {
		(void) spl0();
		cleanup();
		goto loop;
	}
	sleep((caddr_t)&proc[2], PSWP+1);
	(void) spl0();
	count = 0;
	pushes = 0;
	while (nscan < desscan && freemem < lotsfree) {
		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if (checkpage(fronthand, FRONT))
			count = 0;
		if (checkpage(backhand, BACK))
			count = 0;
		cnt.v_scan++;
		nscan++;
		if (++fronthand >= maxhand) {
			fronthand = 0;
			cnt.v_rev++;
			if (count > 2) {
				/*
				 * Extremely unlikely, but we went around
				 * the loop twice and didn't get anywhere.
				 * Don't cycle, stop till the next clock tick.
				 */
				goto loop;
			}
			count++;
		}
		if (++backhand >= maxhand)
			backhand = 0;
	}
	goto loop;
}
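
/*
 * A worked example of the hand separation (hypothetical sizes,
 * not from ../machine/vmparam.h): with HANDSPREAD == 2*1024*1024
 * and CLBYTES == 2048, the front hand leads the back hand by
 * 1024 cmap entries, so a page invalidated by the front hand
 * has 1024 hand steps in which to be referenced (and reclaimed
 * by pagein()) before the back hand reaches it.
 */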

/*
 * An iteration of the clock pointer (hand) around the loop.
 * Look at the page at hand.  If it is locked (e.g. for physical
 * i/o), a system page (u., page table) or free, then leave it alone.
 * Otherwise, if we are running the front hand,
 * invalidate the page for simulation of the reference bit.
 * If the proc is over maxrss, we take it.
 * If running the back hand, check whether the page
 * has been reclaimed.  If not, free the page,
 * pushing it to disk first if necessary.
 */
checkpage(hand, whichhand)
	int hand, whichhand;
{
	register struct proc *rp;
	register struct text *xp;
	register struct cmap *c;
	register struct pte *pte;
	swblk_t daddr;
	unsigned v;
	int klsize;

top:
	/*
	 * Find a process and text pointer for the
	 * page, and a virtual page number in either the
	 * process or the text image.
	 */
	c = &cmap[hand];
	if (c->c_lock || c->c_free)
		return (0);
	switch (c->c_type) {

	case CSYS:
		return (0);

	case CTEXT:
		xp = &text[c->c_ndx];
		rp = xp->x_caddr;
		v = tptov(rp, c->c_page);
		pte = tptopte(rp, c->c_page);
		break;

	case CDATA:
	case CSTACK:
		rp = &proc[c->c_ndx];
		while (rp->p_flag & SNOVM)
			rp = rp->p_xlink;
		xp = rp->p_textp;
		if (c->c_type == CDATA) {
			v = dptov(rp, c->c_page);
			pte = dptopte(rp, c->c_page);
		} else {
			v = sptov(rp, c->c_page);
			pte = sptopte(rp, c->c_page);
		}
		break;
	}

	if (pte->pg_pfnum != cmtopg(hand))
		panic("bad c_page");

	/*
	 * If the page is valid, make it invalid but reclaimable.
	 * If this pte is not valid, then it must be reclaimable
	 * and we can add it to the free list.
	 */
	if (pte->pg_v) {
		if (whichhand == BACK)
			return (0);
		pte->pg_v = 0;
		if (anycl(pte, pg_m))
			pte->pg_m = 1;
		distcl(pte);
		if (c->c_type == CTEXT)
			distpte(xp, (unsigned)vtotp(rp, v), pte);
		if ((rp->p_flag & (SSEQL|SUANOM)) == 0 &&
		    rp->p_rssize <= rp->p_maxrss)
			return (0);
	}
	if (c->c_type != CTEXT) {
		/*
		 * Guarantee a minimal investment in data
		 * space for jobs in the balance set.
		 */
		if (rp->p_rssize < saferss - rp->p_slptime)
			return (0);
	}
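
	/*
	 * To reach this point the page either went unreferenced for a
	 * full HANDSPREAD of scanning (the front hand cleared pg_v
	 * above, and a touch in the interim would have driven pagein()
	 * to revalidate the pte), or its process is over maxrss or is
	 * marked sequential/anomalous.  Either way it is now a victim.
	 */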
	/*
	 * If the page is currently dirty, we
	 * have to arrange to have it cleaned before it
	 * can be freed.  We mark it clean immediately.
	 * If it is reclaimed while being pushed, then modified
	 * again, we are assured of the correct order of
	 * writes because we lock the page during the write.
	 * This guarantees that a swap() of this process (and
	 * thus this page), initiated in parallel, will,
	 * in fact, push the page after us.
	 *
	 * The most general worst case here would be for
	 * a reclaim, a modify and a swapout to occur
	 * all before the single page transfer completes.
	 */
	if (dirtycl(pte)) {
		/*
		 * If the process is being swapped out
		 * or about to exit, do not bother with its
		 * dirty pages.
		 */
		if (rp->p_flag & (SLOCK|SWEXIT))
			return (0);
		/*
		 * Limit pushes to avoid saturating
		 * the pageout device.
		 */
		if (pushes > maxpgio / RATETOSCHEDPAGING)
			return (0);
		pushes++;

		/*
		 * Now carefully make sure that there will
		 * be a header available for the push so that
		 * we will not block waiting for a header in
		 * swap().  The reason this is important is
		 * that we (proc[2]) are the one who cleans
		 * dirty swap headers and we could otherwise
		 * deadlock waiting for ourselves to clean
		 * swap headers.  The sleep here on &proc[2]
		 * is actually (effectively) a sleep on both
		 * ourselves and &bswlist, and this is known
		 * to swdone and swap in vm_swp.c.  That is,
		 * &proc[2] will be awakened both when dirty
		 * headers show up and also to get the pageout
		 * daemon moving.
		 */
loop2:
		(void) splbio();
		if (bclnlist != NULL) {
			(void) spl0();
			cleanup();
			goto loop2;
		}
		if (bswlist.av_forw == NULL) {
			bswlist.b_flags |= B_WANTED;
			sleep((caddr_t)&proc[2], PSWP+2);
			(void) spl0();
			/*
			 * Page disposition may have changed
			 * since the process may have exec'ed,
			 * forked, exited or just about
			 * anything else... try this page
			 * frame again, from the top.
			 */
			goto top;
		}
		(void) spl0();

		MLOCK(c);
		uaccess(rp, Pushmap, &pushutl);
		/*
		 * Now committed to pushing the page...
		 */
		pte->pg_m = 0;
		distcl(pte);
		if (c->c_type == CTEXT) {
			xp->x_poip++;
			distpte(xp, (unsigned)vtotp(rp, v), pte);
		} else
			rp->p_poip++;
		v = kluster(rp, v, pte, B_WRITE, &klsize, klout, (daddr_t)0);
		if (klsize == 0)
			panic("pageout klsize");
		daddr = vtod(rp, v, &pushutl.u_dmap, &pushutl.u_smap);
		(void)swap(rp, daddr, ptob(v), klsize * ctob(CLSIZE),
		    B_WRITE, B_DIRTY, swapdev, pte->pg_pfnum);
		/*
		 * The cleaning of this page will be
		 * completed later, in cleanup() called
		 * (synchronously) by us (proc[2]).  In
		 * the meantime, the page frame is locked
		 * so no havoc can result.
		 */
		return (1);	/* well, it'll be free soon */

	}
	/*
	 * Decrement the resident set size of the current
	 * text object/process, and put the page in the
	 * free list.  Note that we don't ask memfree to
	 * detach the pte (the zero third argument), since
	 * we don't want to destroy the pte.  If it hasn't
	 * already been discarded it may yet have a chance
	 * to be reclaimed from the free list.
	 */
	if (c->c_gone == 0)
		if (c->c_type == CTEXT)
			xp->x_rssize -= CLSIZE;
		else
			rp->p_rssize -= CLSIZE;
	memfree(pte, CLSIZE, 0);
	cnt.v_dfree += CLSIZE;
	return (1);	/* freed a page! */
}
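
/*
 * (A sketch of the datapath, per the comments above: swap headers
 * for completed asynchronous pushes are linked onto bclnlist and
 * &proc[2] is awakened from vm_swp.c; cleanup() below consumes
 * that list, unlocking each cluster and returning each header to
 * the free list bswlist.)
 */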

/*
 * Process the ``cleaned'' list.
 *
 * Scan through the linked list of swap I/O headers
 * and free the corresponding pages that have been
 * cleaned by being written back to the paging area.
 * If a page has been reclaimed during this time,
 * we do not free it.  As they are processed,
 * the swap I/O headers are removed from the cleaned
 * list and inserted into the free list.
 */
cleanup()
{
	register struct buf *bp;
	register struct proc *rp;
	register struct text *xp;
	register struct cmap *c;
	register struct pte *pte;
	struct pte *upte;
	unsigned pf;
	register int i;
	int s, center;

	for (;;) {
		s = splbio();
		if ((bp = bclnlist) == 0)
			break;
		bclnlist = bp->av_forw;
		splx(s);
		pte = vtopte(&proc[2], btop(bp->b_un.b_addr));
		center = 0;
		for (i = 0; i < bp->b_bcount; i += CLSIZE * NBPG) {
			pf = pte->pg_pfnum;
			c = &cmap[pgtocm(pf)];
			MUNLOCK(c);
			if (pf != bp->b_pfcent) {
				if (c->c_gone) {
					memfree(pte, CLSIZE, 0);
					cnt.v_dfree += CLSIZE;
				}
				goto skip;
			}
			center++;
			switch (c->c_type) {

			case CSYS:
				panic("cleanup CSYS");

			case CTEXT:
				xp = &text[c->c_ndx];
				xp->x_poip--;
				if (xp->x_poip == 0)
					wakeup((caddr_t)&xp->x_poip);
				break;

			case CDATA:
			case CSTACK:
				rp = &proc[c->c_ndx];
				while (rp->p_flag & SNOVM)
					rp = rp->p_xlink;
				rp->p_poip--;
				if (rp->p_poip == 0)
					wakeup((caddr_t)&rp->p_poip);
				break;
			}
			if (c->c_gone == 0) {
				switch (c->c_type) {

				case CTEXT:
					upte = tptopte(xp->x_caddr, c->c_page);
					break;

				case CDATA:
					upte = dptopte(rp, c->c_page);
					break;

				case CSTACK:
					upte = sptopte(rp, c->c_page);
					break;
				}
				if (upte->pg_v)
					goto skip;
				if (c->c_type == CTEXT)
					xp->x_rssize -= CLSIZE;
				else
					rp->p_rssize -= CLSIZE;
			}
			memfree(pte, CLSIZE, 0);
			cnt.v_dfree += CLSIZE;
skip:
			pte += CLSIZE;
		}
		if (center != 1)
			panic("cleanup center");
		bp->b_flags = 0;
		bp->av_forw = bswlist.av_forw;
		bswlist.av_forw = bp;
		if (bswlist.b_flags & B_WANTED) {
			bswlist.b_flags &= ~B_WANTED;
			wakeup((caddr_t)&bswlist);
		}
	}
	splx(s);
}

/*
 * Kluster locates pages adjacent to the argument pages
 * that are immediately available to include in the pagein/pageout,
 * and, given the availability of memory, includes them.
 * It knows that the process image is contiguous in chunks;
 * an assumption here is that CLSIZE * KLMAX is a divisor of dmmin,
 * so that by looking at KLMAX chunks of pages, all such will
 * necessarily be mapped swap contiguous.
 */
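/*
 * For example (hypothetical values): with CLSIZE == 2 and
 * KLMAX == 8, CLSIZE * KLMAX == 16 divides a dmmin of 32, so
 * any aligned run of KLMAX clusters lies within a single swap
 * extent and is contiguous on the swap device.
 */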
int noklust;
int klicnt[KLMAX];
int klocnt[KLMAX];

kluster(p, v, pte0, rw, pkl, klsize, bn0)
	register struct proc *p;
	unsigned v;
	struct pte *pte0;
	int rw;
	register int *pkl;
	int klsize;
	daddr_t bn0;
{
	int type, cl, clmax;
	int kloff, k, klmax;
	register struct pte *pte;
	int klback, klforw;
	int i;
	unsigned v0;
	daddr_t bn;
	register struct cmap *c;

	if (rw == B_READ)
		klicnt[0]++;
	else
		klocnt[0]++;
	*pkl = 1;
	if (noklust || klsize <= 1 || klsize > KLMAX ||
	    (klsize & (klsize - 1)))
		return (v);
	if (rw == B_READ && freemem < CLSIZE * KLMAX)
		return (v);
	if (isassv(p, v)) {
		type = CSTACK;
		cl = vtosp(p, v) / CLSIZE;
		clmax = p->p_ssize / CLSIZE;
	} else if (isadsv(p, v)) {
		type = CDATA;
		cl = vtodp(p, v) / CLSIZE;
		clmax = p->p_dsize / CLSIZE;
	} else {
		type = CTEXT;
		cl = vtotp(p, v) / CLSIZE;
		clmax = p->p_textp->x_size / CLSIZE;
	}
	kloff = cl & (klsize - 1);
	pte = pte0;
	bn = bn0;
	for (k = kloff; --k >= 0;) {
		if (type == CSTACK)
			pte += CLSIZE;
		else
			pte -= CLSIZE;
		if (type == CTEXT && rw == B_READ && bn) {
			bn -= btodb(CLBYTES);
			if (mfind(swapdev, bn))
				break;
		}
		if (!klok(pte, rw))
			break;
	}
	klback = (kloff - k) - 1;
	pte = pte0;
	if ((cl - kloff) + klsize > clmax)
		klmax = clmax - (cl - kloff);
	else
		klmax = klsize;
	bn = bn0;
	for (k = kloff; ++k < klmax;) {
		if (type == CSTACK)
			pte -= CLSIZE;
		else
			pte += CLSIZE;
		if (type == CTEXT && rw == B_READ && bn) {
			bn += btodb(CLBYTES);
			if (mfind(swapdev, bn))
				break;
		}
		if (!klok(pte, rw))
			break;
	}
	klforw = (k - kloff) - 1;
	if (klforw + klback == 0)
		return (v);
	pte = pte0;
	if (type == CSTACK) {
		pte -= klforw * CLSIZE;
		v -= klforw * CLSIZE;
	} else {
		pte -= klback * CLSIZE;
		v -= klback * CLSIZE;
	}
	*pkl = klforw + klback + 1;
	if (rw == B_READ)
		klicnt[0]--, klicnt[*pkl - 1]++;
	else
		klocnt[0]--, klocnt[*pkl - 1]++;
	v0 = v;
	for (i = 0; i < *pkl; i++) {
		if (pte == pte0)
			goto cont;
		if (rw == B_WRITE) {
			c = &cmap[pgtocm(pte->pg_pfnum)];
			MLOCK(c);
			pte->pg_m = 0;
			distcl(pte);
			if (type == CTEXT)
				distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
		} else {
			struct pte opte;

			opte = *pte;
			if (memall(pte, CLSIZE, p, type) == 0)
				panic("kluster");
			pte->pg_prot = opte.pg_prot;
			cmap[pgtocm(pte->pg_pfnum)].c_intrans = 1;
			distcl(pte);
			if (type == CTEXT) {
				p->p_textp->x_rssize += CLSIZE;
				distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
			} else
				p->p_rssize += CLSIZE;
			distcl(pte);
		}
cont:
		pte += CLSIZE;
		v += CLSIZE;
	}
	return (v0);
}

/*
 * Decide whether the cluster mapped by pte may join a kluster:
 * for a pageout it must be resident, unlocked, not intransit and
 * dirty; for a pagein it must be an untouched swap page (no page
 * frame allocated, not fill on demand).
 */
klok(pte, rw)
	register struct pte *pte;
	int rw;
{
	register struct cmap *c;

	if (rw == B_WRITE) {
		if (pte->pg_fod)
			return (0);
		if (pte->pg_pfnum == 0)
			return (0);
		c = &cmap[pgtocm(pte->pg_pfnum)];
		if (c->c_lock || c->c_intrans)
			return (0);
		if (!dirtycl(pte))
			return (0);
		return (1);
	} else {
		if (pte->pg_fod)
			return (0);
		if (pte->pg_pfnum)
			return (0);
		return (1);
	}
}
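
/*
 * (kluster() above serves both swap pageins and pageouts;
 * fodkluster() below handles only first-touch, fill-on-demand
 * pageins from a file, where adjacency is judged by the file
 * block numbers kept in the fptes rather than by swap layout.)
 */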

/*
 * Fodkluster locates pages adjacent to the argument pages
 * that are immediately available to include in the pagein,
 * and, given the availability of memory, includes them.
 * It wants to page in a file system block if it can.
 */
int nofodklust;
int fodklcnt[KLMAX];

fodkluster(p, v0, pte0, pkl, dev, pbn)
	register struct proc *p;
	unsigned v0;
	struct pte *pte0;
	int *pkl;
	dev_t dev;
	daddr_t *pbn;
{
	register struct pte *pte;
	register struct fpte *fpte;
	struct cmap *c;
	register daddr_t bn;
	daddr_t bnswap;
	unsigned v, vmin, vmax;
	register int klsize;
	int klback, type, i;

	if (nofodklust)
		return (v0);
	fodklcnt[0]++;
	*pkl = 1;
	if (freemem < KLMAX)
		return (v0);
	if (isatsv(p, v0)) {
		type = CTEXT;
		vmin = tptov(p, 0);
		vmax = tptov(p, clrnd(p->p_tsize) - CLSIZE);
	} else {
		type = CDATA;
		vmin = dptov(p, 0);
		vmax = dptov(p, clrnd(p->p_dsize) - CLSIZE);
	}
	fpte = (struct fpte *)pte0;
	bn = *pbn;
	v = v0;
	for (klsize = 1; klsize < KLMAX; klsize++) {
		v -= CLSIZE;
		if (v < vmin)
			break;
		fpte -= CLSIZE;
		if (fpte->pg_fod == 0)
			break;
		bn -= btodb(CLBYTES);
		if (fpte->pg_blkno != bn)
			break;
		if (type == CTEXT) {
			if (mfind(dev, bn))
				break;
			/*
			 * Flush any previous text page use of this
			 * swap device block.
			 */
			bnswap = vtod(p, v, &u.u_dmap, &u.u_smap);
			c = mfind(swapdev, bnswap);
			if (c)
				munhash(swapdev, bnswap);
		}
	}
	klback = klsize - 1;
	fpte = (struct fpte *)pte0;
	bn = *pbn;
	v = v0;
	for (; klsize < KLMAX; klsize++) {
		v += CLSIZE;
		if (v > vmax)
			break;
		fpte += CLSIZE;
		if (fpte->pg_fod == 0)
			break;
		bn += btodb(CLBYTES);
		if (fpte->pg_blkno != bn)
			break;
		if (type == CTEXT) {
			if (mfind(dev, bn))
				break;
			/*
			 * Flush any previous text page use of this
			 * swap device block.
			 */
			bnswap = vtod(p, v, &u.u_dmap, &u.u_smap);
			c = mfind(swapdev, bnswap);
			if (c)
				munhash(swapdev, bnswap);
		}
	}
	if (klsize == 1)
		return (v0);
	pte = pte0;
	pte -= klback * CLSIZE;
	v0 -= klback * CLSIZE;
	*pbn -= klback * btodb(CLBYTES);
	*pkl = klsize;
	fodklcnt[0]--; fodklcnt[klsize - 1]++;
	v = v0;
	for (i = 0; i < klsize; i++) {
		if (pte != pte0) {
			struct pte opte;
			int pf;

			opte = *pte;
			if (memall(pte, CLSIZE, p, type) == 0)
				panic("fodkluster");
			pte->pg_prot = opte.pg_prot;
			pf = pte->pg_pfnum;
			pte->pg_m = 1;
			cmap[pgtocm(pf)].c_intrans = 1;
			distcl(pte);
			if (type == CTEXT) {
				p->p_textp->x_rssize += CLSIZE;
				distpte(p->p_textp, (unsigned)vtotp(p, v), pte);
			} else
				p->p_rssize += CLSIZE;
			distcl(pte);
		}
		pte += CLSIZE;
		v += CLSIZE;
	}
	return (v0);
}