patch-2.4.4 linux/arch/ia64/sn/sn1/mm.c

Next file: linux/arch/ia64/sn/sn1/probe.c
Previous file: linux/arch/ia64/sn/sn1/machvec.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.4.3/linux/arch/ia64/sn/sn1/mm.c linux/arch/ia64/sn/sn1/mm.c
@@ -1,7 +1,7 @@
 /*
- * Copyright, 2000, Silicon Graphics.
+ * Copyright, 2000-2001, Silicon Graphics.
  * Copyright Srinivasa Thirumalachar (sprasad@engr.sgi.com)
- * Copyright 2000 Kanoj Sarcar (kanoj@sgi.com)
+ * Copyright 2000-2001 Kanoj Sarcar (kanoj@sgi.com)
  */
 
 #include <linux/config.h>
@@ -11,32 +11,23 @@
 #include <asm/efi.h>
 #include <asm/sn/mmzone_sn1.h>
 
-#       define MIN(a,b)         ((a) < (b) ? (a) : (b))
-#       define MAX(a,b)         ((a) > (b) ? (a) : (b))
+#define MIN(a,b)	((a) < (b) ? (a) : (b))
+#define MAX(a,b)	((a) > (b) ? (a) : (b))
+
+#define DONE_NOTHING	0
+#define DONE_FINDING	1
+#define DONE_BUILDING	2
 
-/*
- * Note that the nodemem[] data structure does not support arbitrary
- * memory types and memory descriptors inside the node. For example, 
- * you can not have multiple efi-mem-type segments in the node and
- * expect the OS not to use specific mem-types. Currently, the 
- * assumption is that "start" is the start of virtual/physical memory 
- * on the node. PROM can reserve some memory _only_ at the beginning. 
- * This is tracked via the "usable" field, that maintains where the 
- * os can start using memory from on a node (ie end of PROM memory).
- * setup_node_bootmem() is passed the above "usable" value, and is
- * expected to make bootmem calls that ensure lower memory is not used.
- * Note that the bootmem for a node is initialized on the entire node, 
- * without regards to any holes - then we reserve the holes in 
- * setup_sn1_bootmem(), to make sure the holes are not handed out by
- * alloc_bootmem, as well as the corresponding mem_map entries are not
- * considered allocatable by the page_alloc routines.
- */
 struct nodemem_s {
-        u64     start ;
-        u64     end   ;
-        u64 	hole[SN1_MAX_BANK_PER_NODE] ;
-	u64	usable;
-} nodemem[MAXNODES] ;
+        u64     start;	/* start of kernel usable memory */
+        u64     end;	/* end of kernel usable memory */
+	u64	mtot;	/* total kernel usable memory */
+	u64	done;	/* state of bootmem initialization */
+	u64	bstart;	/* where should the bootmem area be */
+	u64	bsize;	/* bootmap size */
+        u64 	hole[SN1_MAX_BANK_PER_NODE];
+} nodemem[MAXNODES];
+
 static int nodemem_valid = 0;
 
 static int __init
@@ -46,7 +37,7 @@
         unsigned long count = 0;
 
 	if (start >= end)
-		return 0 ;
+		return 0;
 
 	/*
 	 * Get the memmap ptrs to the start and end of the holes.
@@ -54,31 +45,33 @@
 	 * Can we do virt_to_page(end), if end is on the next node?
 	 */
 
-	page = virt_to_page(start-1);
-	page++ ;
-	pageend = virt_to_page(end) ;
+	page = virt_to_page(start - 1);
+	page++;
+	pageend = virt_to_page(end);
 
 	printk("hpage=0x%lx, hpageend=0x%lx\n", (u64)page, (u64)pageend) ;
 	free_bootmem_node(NODE_DATA(nid), __pa(page), (u64)pageend - (u64)page);
 
-	return count ;
+	return count;
 }
 
-void
+static void __init
 free_unused_memmap_node(int nid)
 {
-	u64	i = 0 ;
-	u64	holestart = -1 ;
+	u64	i = 0;
+	u64	holestart = -1;
+	u64	start = nodemem[nid].start;
 
+	start = ((start >> SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
 	do {
-		holestart = nodemem[nid].hole[i] ;
-		i++ ;
+		holestart = nodemem[nid].hole[i];
+		i++;
 		while ((i < SN1_MAX_BANK_PER_NODE) && 
-			(nodemem[nid].hole[i] == (u64)-1))
-			i++ ;
+					(nodemem[nid].hole[i] == (u64)-1))
+			i++;
 		if (i < SN1_MAX_BANK_PER_NODE)
 			free_unused_memmap_hole(nid, holestart, 
-				nodemem[nid].start + (i<<SN1_BANK_ADDR_SHIFT));
+				start + (i<<SN1_BANK_ADDR_SHIFT));
 	} while (i<SN1_MAX_BANK_PER_NODE);
 }
 
@@ -98,7 +91,6 @@
 		cnodeid = NASID_TO_CNODEID(nasid);
 		bankid = GetBankId(__pa(vaddr));
 		nodemem[cnodeid].start = MIN(nodemem[cnodeid].start, vaddr);
-		nodemem[cnodeid].usable = MIN(nodemem[cnodeid].usable, vaddr);
 		nvaddr = (unsigned long)__va((unsigned long)(++nasid) << 
 							SN1_NODE_ADDR_SHIFT);
 		nodemem[cnodeid].end = MAX(nodemem[cnodeid].end, MIN(end, nvaddr));
@@ -118,11 +110,14 @@
 pgtbl_size_ok(int nid)
 {
 	unsigned long numpfn, bank0size, nodesize ;
+	unsigned long start = nodemem[nid].start;
+
+	start = ((start >> SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
 	
-	nodesize 	= nodemem[nid].end - nodemem[nid].start ;
+	nodesize 	= nodemem[nid].end - start ;
 	numpfn 		= nodesize >> PAGE_SHIFT;
 
-	bank0size 	= nodemem[nid].hole[0] - nodemem[nid].start ;
+	bank0size 	= nodemem[nid].hole[0] - start ;
 	/* If nid == master node && no kernel text replication */
 	bank0size      -= 0xA00000 ;	/* Kernel text + stuff */
 	bank0size      -= ((numpfn + 7) >> 3);
@@ -163,198 +158,198 @@
 
 #ifdef CONFIG_DISCONTIGMEM
 
-extern bootmem_data_t 	bdata[] ;
-static int	 	curnodeid ;
+extern bootmem_data_t bdata[];
 
+/*
+ * This assumes there will be a hole in kernel-usable memory between nodes
+ * (due to prom). The memory descriptors invoked via efi_memmap_walk are 
+ * in increasing order. It tries to identify first suitable free area to 
+ * put the bootmem for the node in. When presented with the md holding
+ * the kernel, it only searches at the end of the kernel area.
+ */
 static int __init
-setup_node_bootmem(unsigned long start, unsigned long end, unsigned long nodefree)
+find_node_bootmem(unsigned long start, unsigned long end, void *arg)
 {
+	int nasid = GetNasId(__pa(start));
+	int cnodeid = NASID_TO_CNODEID(nasid);
+	unsigned long nodesize;
 	extern char _end;
-	int i;
-	unsigned long kernelend = PAGE_ALIGN((unsigned long)(&_end));
-	unsigned long pkernelend = __pa(kernelend);
-	unsigned long bootmap_start, bootmap_size;
-	unsigned long pstart, pend;
-
-	pstart = __pa(start) ;
-	pend   = __pa(end) ;
-
-	/* If we are past a node mem boundary, on simulated dig numa
-	 * increment current node id. */
-
-	curnodeid = NASID_TO_CNODEID(GetNasId(pstart)) ;
-
-       /*
-        * Make sure we are being passed page aligned addresses.
-        */
-	if ((start & (PAGE_SIZE - 1)) || (end & (PAGE_SIZE - 1)))
-               panic("setup_node_bootmem:align");
+	unsigned long kaddr = (unsigned long)&_end;
 
-
-	/* For now, just go to the lower CHUNK alignment so that 
-	 * chunktonid of 0-8MB and other lower mem pages get initted. */
-
-	pstart &= CHUNKMASK ;
-	pend = (pend+CHUNKSZ-1) & CHUNKMASK;
-
-	/* If pend == 0, both addrs below 8 MB, special case it
-	 * FIX: CHUNKNUM(pend-1) broken if pend == 0 
-	 * both addrs within 8MB */
-
-	if (pend == 0) {
-		chunktonid[0] = 0;
-		return 0;
-	}
-
-	/* Fill up the chunktonid array first. */
-
-        for (i = PCHUNKNUM(pstart); i <= PCHUNKNUM(pend-1); i++)
-               chunktonid[i] = curnodeid;
-
-	/* This check is bogus for now till MAXCHUNKS is properly
-	 * defined to say if it includes holes or not. */
-
-	if ((CHUNKTONID(PCHUNKNUM(pend)) > MAXCHUNKS) || 
-		(PCHUNKNUM(pstart) >= PCHUNKNUM(pend))) {
-		printk("Ign 0x%lx-0x%lx, ", __pa(start), __pa(end));
+	/*
+	 * Track memory available to kernel.
+	 */
+	nodemem[cnodeid].mtot += ((end - start) >> PAGE_SHIFT);
+	if (nodemem[cnodeid].done != DONE_NOTHING)
 		return(0);
-	}
+	nodesize = nodemem[cnodeid].end - ((nodemem[cnodeid].start >> 
+				SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
+	nodesize >>= PAGE_SHIFT;
 
-	/* This routine gets called many times in node 0.
-	 * The first one to reach here would be the one after
-	 * kernelend to end of first node. */
-
-	NODE_DATA(curnodeid)->bdata = &(bdata[curnodeid]);
-
-	if (curnodeid == 0) {
-		/* for master node, forcibly assign these values
-		 * This gets called many times on dig but we
-		 * want these exact values 
-		 * Also on softsdv, the memdesc for 0 is missing */
-		NODE_START(curnodeid) = PAGE_OFFSET;
-		NODE_SIZE(curnodeid) = (end - PAGE_OFFSET);
-	} else {
-		/* This gets called only once for non zero nodes
-		 * If it does not, then NODE_STARt should be 
-		 * LOCAL_BASE(nid) */
+	/*
+	 * Adjust limits for the md holding the kernel.
+	 */
+	if ((start < kaddr) && (end > kaddr))
+		start = PAGE_ALIGN(kaddr);
 
-		NODE_START(curnodeid) = start;
-		NODE_SIZE(curnodeid) = (end - start);
+	/*
+	 * We need space for mem_map, bootmem map plus a few more pages
+	 * to satisfy alloc_bootmems out of node 0.
+	 */
+	if ((end - start) > ((nodesize * sizeof(struct page)) + (nodesize/8)
+						+ (10 * PAGE_SIZE))) {
+		nodemem[cnodeid].bstart = start;
+		nodemem[cnodeid].done = DONE_FINDING;
 	}
+	return(0);
+}
 
-	/* if end < kernelend do not do anything below this */
-	if (pend < pkernelend)
-		return 0 ;
+/*
+ * This assumes there will be a hole in kernel-usable memory between nodes
+ * (due to prom). The memory descriptors invoked via efi_memmap_walk are 
+ * in increasing order.
+ */
+static int __init
+build_node_bootmem(unsigned long start, unsigned long end, void *arg)
+{
+	int nasid = GetNasId(__pa(start));
+	int curnodeid = NASID_TO_CNODEID(nasid);
+	int i;
+	unsigned long pstart, pend;
+	extern char _end, _stext;
+	unsigned long kaddr = (unsigned long)&_end;
 
-       /*
-        * Handle the node that contains kernel text/data. It would
-        * be nice if the loader loads the kernel at a "chunk", ie
-        * not in memory that the kernel will ignore (else free_initmem
-        * has to worry about not freeing memory that the kernel ignores).
-        * Note that we assume the space from the node start to
-        * KERNEL_START can not hold all the bootmem data, but from kernel
-        * end to node end can.
-        */
-
-	/* TBD: This may be bogus in light of the above check. */
-
-	if ((pstart < pkernelend) && (pend >= pkernelend)) {
-               bootmap_start = pkernelend;
-	} else {
-               bootmap_start = __pa(start);    /* chunk & page aligned */
+	if (nodemem[curnodeid].done == DONE_FINDING) {
+		/*
+		 * This is where we come to know the node is present.
+		 * Do node wide tasks.
+		 */
+		nodemem[curnodeid].done = DONE_BUILDING;
+		NODE_DATA(curnodeid)->bdata = &(bdata[curnodeid]);
+
+		/*
+	 	 * Update the chunktonid array as a node wide task. There
+		 * are too many small mds on the first node to do this per md.
+	 	 */
+		pstart = __pa(nodemem[curnodeid].start);
+		pend = __pa(nodemem[curnodeid].end);
+		pstart &= CHUNKMASK;
+		pend = (pend + CHUNKSZ - 1) & CHUNKMASK;
+		/* Possible check point to enforce minimum node size */
+		if (nodemem[curnodeid].bstart == -1) {
+			printk("No valid bootmem area on node %d\n", curnodeid);
+			while(1);
+		}
+		for (i = PCHUNKNUM(pstart); i <= PCHUNKNUM(pend - 1); i++)
+			chunktonid[i] = curnodeid;
+		if ((CHUNKTONID(PCHUNKNUM(pend)) > MAXCHUNKS) || 
+				(PCHUNKNUM(pstart) >= PCHUNKNUM(pend))) {
+			printk("Ign 0x%lx-0x%lx, ", __pa(start), __pa(end));
+			return(0);
+		}
+
+		/*
+		 * NODE_START and NODE_SIZE determine the physical range
+		 * on the node that mem_map array needs to be set up for.
+		 */
+		NODE_START(curnodeid) = ((nodemem[curnodeid].start >> 
+				SN1_NODE_ADDR_SHIFT) << SN1_NODE_ADDR_SHIFT);
+		NODE_SIZE(curnodeid) = (nodemem[curnodeid].end - 
+							NODE_START(curnodeid));
+
+        	nodemem[curnodeid].bsize = 
+			init_bootmem_node(NODE_DATA(curnodeid),
+			(__pa(nodemem[curnodeid].bstart) >> PAGE_SHIFT),
+			(__pa((nodemem[curnodeid].start >> SN1_NODE_ADDR_SHIFT)
+			<< SN1_NODE_ADDR_SHIFT) >> PAGE_SHIFT),
+			(__pa(nodemem[curnodeid].end) >> PAGE_SHIFT));
+
+	} else if (nodemem[curnodeid].done == DONE_NOTHING) {
+		printk("build_node_bootmem: node %d weirdness\n", curnodeid);
+		while(1);		/* Paranoia */
 	}
 
 	/*
-	 * Low memory is reserved for PROM use on SN1. The current node
-	 * memory model is [PROM mem ... kernel ... free], where the 
-	 * first two components are optional on a node.
+	 * Free the entire md.
 	 */
-	if (bootmap_start < __pa(nodefree))
-		bootmap_start = __pa(nodefree);
-
-/* XXX TBD */
-/* For curnodeid of 0, this gets called many times because of many
- * < 8MB segments. start gets bumped each time. We want to fix it
- * to 0 now. 
- */
-	if (curnodeid == 0)
-		start=PAGE_OFFSET;
-/*
- * This makes sure that in free_area_init_core - paging_init
- * idx is the entire node page range and for loop goes thro
- * all pages. test_bit for kernel pages should remain reserved
- * because free available mem takes care of kernel_start and end
- */
-
-        bootmap_size = init_bootmem_node(NODE_DATA(curnodeid),
-			(bootmap_start >> PAGE_SHIFT),
-			(__pa(start) >> PAGE_SHIFT), (__pa(end) >> PAGE_SHIFT));
+	free_bootmem_node(NODE_DATA(curnodeid), __pa(start), (end - start));
 
-	free_bootmem_node(NODE_DATA(curnodeid), bootmap_start + bootmap_size,
-				__pa(end) - (bootmap_start + bootmap_size));
+	/*
+	 * Reclaim back the bootmap and kernel areas.
+	 */
+	if ((start <= nodemem[curnodeid].bstart) && (end >
+						nodemem[curnodeid].bstart))
+		reserve_bootmem_node(NODE_DATA(curnodeid),
+		    __pa(nodemem[curnodeid].bstart), nodemem[curnodeid].bsize);
+	if ((start <= kaddr) && (end > kaddr))
+		reserve_bootmem_node(NODE_DATA(curnodeid),
+		    __pa(&_stext), (&_end - &_stext));
 
 	return(0);
 }
 
-void
+void __init
 setup_sn1_bootmem(int maxnodes)
 {
         int     i;
 
-        for (i=0;i<MAXNODES;i++) {
-                nodemem[i].usable = nodemem[i].start = -1 ;
-                nodemem[i].end   = 0 ;
-		memset(&nodemem[i].hole, -1, sizeof(nodemem[i].hole)) ;
+        for (i = 0; i < MAXNODES; i++) {
+                nodemem[i].start = nodemem[i].bstart = -1;
+                nodemem[i].end = nodemem[i].bsize = nodemem[i].mtot = 0;
+		nodemem[i].done = DONE_NOTHING;
+		memset(&nodemem[i].hole, -1, sizeof(nodemem[i].hole));
         }
-        efi_memmap_walk(build_nodemem_map, 0) ;
+        efi_memmap_walk(build_nodemem_map, 0);
 
-	/*
-	 * Run thru all the nodes, adjusting their starts. This is needed
-	 * because efi_memmap_walk() might not process certain mds that 
-	 * are marked reserved for PROM at node low memory.
-	 */
-	for (i = 0; i < maxnodes; i++)
-		nodemem[i].start = ((nodemem[i].start >> SN1_NODE_ADDR_SHIFT) <<
-					SN1_NODE_ADDR_SHIFT);
-	nodemem_valid = 1 ;
+	nodemem_valid = 1;
 
-	/* After building the nodemem map, check if the page table
+	/* 
+	 * After building the nodemem map, check if the node memmap
 	 * will fit in the first bank of each node. If not change
-	 * the node end addr till it fits. We dont want to do this
-	 * in mm/page_alloc.c
+	 * the node end addr till it fits.
  	 */
 
-        for (i=0;i<maxnodes;i++)
-		check_pgtbl_size(i) ;
-
-        for (i=0;i<maxnodes;i++)
-                setup_node_bootmem(nodemem[i].start, nodemem[i].end, nodemem[i].usable);
+        for (i = 0; i < maxnodes; i++)
+		check_pgtbl_size(i);
 
-	/*
-	 * Mark the holes as reserved, so the corresponding mem_map
-	 * entries will not be marked allocatable in free_all_bootmem*().
-	 */
-	for (i = 0; i < maxnodes; i++) {
-		int j = 0 ;
-		u64 holestart = -1 ;
-
-		do {
-			holestart = nodemem[i].hole[j++];
-			while ((j < SN1_MAX_BANK_PER_NODE) && 
-					(nodemem[i].hole[j] == (u64)-1))
-				j++;
-			if (j < SN1_MAX_BANK_PER_NODE)
-				reserve_bootmem_node(NODE_DATA(i), 
-					__pa(holestart), (nodemem[i].start + 
-					((long)j <<  SN1_BANK_ADDR_SHIFT) - 
-					 holestart));
-		} while (j < SN1_MAX_BANK_PER_NODE);
-	}
+	dump_nodemem_map(maxnodes);
 
-	dump_nodemem_map(maxnodes) ;
+	efi_memmap_walk(find_node_bootmem, 0);
+	efi_memmap_walk(build_node_bootmem, 0);
 }
 #endif
 
+void __init
+discontig_paging_init(void)
+{
+	int i;
+	unsigned long max_dma, zones_size[MAX_NR_ZONES], holes_size[MAX_NR_ZONES];
+	extern void dump_node_data(void);
+
+	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+	for (i = 0; i < numnodes; i++) {
+		unsigned long startpfn = __pa((void *)NODE_START(i)) >> PAGE_SHIFT;
+		unsigned long numpfn = NODE_SIZE(i) >> PAGE_SHIFT;
+		memset(zones_size, 0, sizeof(zones_size));
+		memset(holes_size, 0, sizeof(holes_size));
+		holes_size[ZONE_DMA] = numpfn - nodemem[i].mtot;
+
+		if ((startpfn + numpfn) < max_dma) {
+			zones_size[ZONE_DMA] = numpfn;
+		} else if (startpfn > max_dma) {
+			zones_size[ZONE_NORMAL] = numpfn;
+			panic("discontig_paging_init: %d\n", i);
+		} else {
+			zones_size[ZONE_DMA] = (max_dma - startpfn);
+			zones_size[ZONE_NORMAL] = numpfn - zones_size[ZONE_DMA];
+			panic("discontig_paging_init: %d\n", i);
+		}
+		free_area_init_node(i, NODE_DATA(i), NULL, zones_size, startpfn<<PAGE_SHIFT, holes_size);
+		free_unused_memmap_node(i);
+	}
+	dump_node_data();
+}
+
 /*
  * This used to be invoked from an SN1 specific hack in efi_memmap_walk.
  * It tries to ignore banks which the kernel is ignoring because bank 0 
@@ -386,10 +381,10 @@
 	int	i,j;
 
         printk("NODEMEM_S info ....\n") ;
-        printk("Node         start                end                 usable\n");
+        printk("Node         start                end\n");
         for (i=0;i<maxnodes;i++) {
-                printk("%d      0x%lx   0x%lx   0x%lx\n",
-                       i, nodemem[i].start, nodemem[i].end, nodemem[i].usable);
+                printk("%d      0x%lx   0x%lx\n",
+                       i, nodemem[i].start, nodemem[i].end);
                 printk("Holes -> ") ;
                 for (j=0;j<SN1_MAX_BANK_PER_NODE;j++)
                         printk("0x%lx ", nodemem[i].hole[j]) ;

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)