patch-2.4.20 linux-2.4.20/arch/ia64/kernel/perfmon.c

diff -urN linux-2.4.19/arch/ia64/kernel/perfmon.c linux-2.4.20/arch/ia64/kernel/perfmon.c
@@ -23,6 +23,7 @@
 #include <linux/vmalloc.h>
 #include <linux/wrapper.h>
 #include <linux/mm.h>
+#include <linux/sysctl.h>
 
 #include <asm/bitops.h>
 #include <asm/errno.h>
@@ -42,7 +43,7 @@
  * you must enable the following flag to activate the support for
  * accessing the registers via the perfmonctl() interface.
  */
-#ifdef CONFIG_ITANIUM
+#if defined(CONFIG_ITANIUM) || defined(CONFIG_MCKINLEY)
 #define PFM_PMU_USES_DBR	1
 #endif
 
@@ -68,26 +69,27 @@
 #define PMC_OVFL_NOTIFY(ctx, i)	((ctx)->ctx_soft_pmds[i].flags &  PFM_REGFL_OVFL_NOTIFY)
 #define PFM_FL_INHERIT_MASK	(PFM_FL_INHERIT_NONE|PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)
 
+/* assumes i is unsigned */
 #define PMC_IS_IMPL(i)	  (i<pmu_conf.num_pmcs && pmu_conf.impl_regs[i>>6] & (1UL<< (i) %64))
 #define PMD_IS_IMPL(i)	  (i<pmu_conf.num_pmds &&  pmu_conf.impl_regs[4+(i>>6)] & (1UL<<(i) % 64))
 
-#define PMD_IS_COUNTING(i) (i >=0  && i < 256 && pmu_conf.counter_pmds[i>>6] & (1UL <<(i) % 64))
-#define PMC_IS_COUNTING(i) PMD_IS_COUNTING(i)
+/* XXX: these three assume that register i is implemented */
+#define PMD_IS_COUNTING(i) (pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING)
+#define PMC_IS_COUNTING(i) (pmu_conf.pmc_desc[i].type == PFM_REG_COUNTING)
+#define PMC_IS_MONITOR(c)  (pmu_conf.pmc_desc[c].type == PFM_REG_MONITOR)
 
+/* assumes k is unsigned */
 #define IBR_IS_IMPL(k)	  (k<pmu_conf.num_ibrs)
 #define DBR_IS_IMPL(k)	  (k<pmu_conf.num_dbrs)
 
-#define PMC_IS_BTB(a)	  (((pfm_monitor_t *)(a))->pmc_es == PMU_BTB_EVENT)
-
-#define LSHIFT(x)		(1UL<<(x))
-#define PMM(x)			LSHIFT(x)
-#define PMC_IS_MONITOR(c)	((pmu_conf.monitor_pmcs[0] & PMM((c))) != 0)
-
 #define CTX_IS_ENABLED(c) 	((c)->ctx_flags.state == PFM_CTX_ENABLED)
 #define CTX_OVFL_NOBLOCK(c)	((c)->ctx_fl_block == 0)
 #define CTX_INHERIT_MODE(c)	((c)->ctx_fl_inherit)
 #define CTX_HAS_SMPL(c)		((c)->ctx_psb != NULL)
-#define CTX_USED_PMD(ctx,n) 	(ctx)->ctx_used_pmds[(n)>>6] |= 1UL<< ((n) % 64)
+/* XXX: does not support more than 64 PMDs */
+#define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask)
+#define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL)
+
 
 #define CTX_USED_IBR(ctx,n) 	(ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64)
 #define CTX_USED_DBR(ctx,n) 	(ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64)
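CTX_USED_PMD() now takes a ready-made bitmask instead of a single register index, so a caller can mark a whole dependency set in one statement. A sketch of how later hunks invoke it (RDEP() is presumably just 1UL << (n), supplied by the per-PMU header included further down):

	CTX_USED_PMD(ctx, pmu_conf.pmc_desc[cnum].dep_pmd[0]);	/* every PMD this PMC depends on */
	CTX_USED_PMD(ctx, RDEP(cnum));				/* the written PMD itself */
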
@@ -104,17 +106,29 @@
 
 #define PFM_REG_RETFLAG_SET(flags, val)	do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0)
 
+#ifdef CONFIG_SMP
+#define cpu_is_online(i) (cpu_online_map & (1UL << i))
+#else
+#define cpu_is_online(i)        (i==0)
+#endif
+
 /*
  * debugging
  */
 #define DBprintk(a) \
 	do { \
-		if (pfm_debug_mode >0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
+		if (pfm_sysctl.debug >0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
+	} while (0)
+
+#define DBprintk_ovfl(a) \
+	do { \
+		if (pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
 	} while (0)
 
 
+
 /* 
- * These are some helpful architected PMC and IBR/DBR register layouts
+ * Architected PMC structure
  */
 typedef struct {
 	unsigned long pmc_plm:4;	/* privilege level mask */
@@ -139,41 +153,40 @@
 typedef struct _pfm_smpl_buffer_desc {
 	spinlock_t		psb_lock;	/* protection lock */
 	unsigned long		psb_refcnt;	/* how many users for the buffer */
-	int			psb_flags;	/* bitvector of flags */
+	int			psb_flags;	/* bitvector of flags (not yet used) */
 
 	void			*psb_addr;	/* points to location of first entry */
 	unsigned long		psb_entries;	/* maximum number of entries */
 	unsigned long		psb_size;	/* aligned size of buffer */
 	unsigned long		psb_index;	/* next free entry slot XXX: must use the one in buffer */
 	unsigned long		psb_entry_size;	/* size of each entry including entry header */
+
 	perfmon_smpl_hdr_t	*psb_hdr;	/* points to sampling buffer header */
 
 	struct _pfm_smpl_buffer_desc *psb_next;	/* next psb, used for rvfreeing of psb_hdr */
 
 } pfm_smpl_buffer_desc_t;
 
+/*
+ * psb_flags
+ */
+#define PSB_HAS_VMA	0x1		/* a virtual mapping for the buffer exists */
+
 #define LOCK_PSB(p)	spin_lock(&(p)->psb_lock)
 #define UNLOCK_PSB(p)	spin_unlock(&(p)->psb_lock)
 
-#define PFM_PSB_VMA	0x1			/* a VMA is describing the buffer */
-
 /*
- * This structure is initialized at boot time and contains
- * a description of the PMU main characteristic as indicated
- * by PAL
+ * The possible type of a PMU register
  */
-typedef struct {
-	unsigned long pfm_is_disabled;	/* indicates if perfmon is working properly */
-	unsigned long perf_ovfl_val;	/* overflow value for generic counters   */
-	unsigned long max_counters;	/* upper limit on counter pair (PMC/PMD) */
-	unsigned long num_pmcs ;	/* highest PMC implemented (may have holes) */
-	unsigned long num_pmds;		/* highest PMD implemented (may have holes) */
-	unsigned long impl_regs[16];	/* buffer used to hold implememted PMC/PMD mask */
-	unsigned long num_ibrs;		/* number of instruction debug registers */
-	unsigned long num_dbrs;		/* number of data debug registers */
-	unsigned long monitor_pmcs[4];	/* which pmc are controlling monitors */
-	unsigned long counter_pmds[4];	/* which pmd are used as counters */
-} pmu_config_t;
+typedef enum { 
+	PFM_REG_NOTIMPL, /* not implemented */
+	PFM_REG_NONE, 	 /* end marker */
+	PFM_REG_MONITOR, /* a PMC with a pmc.pm field only */
+	PFM_REG_COUNTING,/* a PMC with a pmc.pm AND pmc.oi, a PMD used as a counter */
+	PFM_REG_CONTROL, /* PMU control register */
+	PFM_REG_CONFIG,  /* refine configuration */
+	PFM_REG_BUFFER	 /* PMD used as buffer */
+} pfm_pmu_reg_type_t;
 
 /*
  * 64-bit software counter structure
@@ -221,9 +234,11 @@
 
 	struct semaphore	ctx_restart_sem;   	/* use for blocking notification mode */
 
-	unsigned long		ctx_used_pmds[4];	/* bitmask of used PMD (speedup ctxsw) */
-	unsigned long		ctx_saved_pmcs[4];	/* bitmask of PMC to save on ctxsw */
-	unsigned long		ctx_reload_pmcs[4];	/* bitmask of PMC to reload on ctxsw (SMP) */
+	unsigned long		ctx_used_pmds[4];	/* bitmask of PMD used                 */
+	unsigned long		ctx_reload_pmds[4];	/* bitmask of PMD to reload on ctxsw   */
+
+	unsigned long		ctx_used_pmcs[4];	/* bitmask PMC used by context         */
+	unsigned long		ctx_reload_pmcs[4];	/* bitmask of PMC to reload on ctxsw   */
 
 	unsigned long		ctx_used_ibrs[4];	/* bitmask of used IBR (speedup ctxsw) */
 	unsigned long		ctx_used_dbrs[4];	/* bitmask of used DBR (speedup ctxsw) */
@@ -235,6 +250,7 @@
 	unsigned long		ctx_cpu;		/* cpu to which perfmon is applied (system wide) */
 
 	atomic_t		ctx_saving_in_progress;	/* flag indicating actual save in progress */
+	atomic_t		ctx_is_busy;		/* context accessed by overflow handler */
 	atomic_t		ctx_last_cpu;		/* CPU id of current or last CPU used */
 } pfm_context_t;
 
@@ -250,16 +266,54 @@
  * mostly used to synchronize between system wide and per-process
  */
 typedef struct {
-	spinlock_t		pfs_lock;		/* lock the structure */
+	spinlock_t		pfs_lock;		   /* lock the structure */
 
-	unsigned long		pfs_task_sessions;	/* number of per task sessions */
-	unsigned long		pfs_sys_sessions;	/* number of per system wide sessions */
-	unsigned long   	pfs_sys_use_dbregs;	  	/* incremented when a system wide session uses debug regs */
-	unsigned long   	pfs_ptrace_use_dbregs;	  /* incremented when a process uses debug regs */
-	struct task_struct	*pfs_sys_session[NR_CPUS];  /* point to task owning a system-wide session */
+	unsigned long		pfs_task_sessions;	   /* number of per task sessions */
+	unsigned long		pfs_sys_sessions;	   /* number of per system wide sessions */
+	unsigned long   	pfs_sys_use_dbregs;	   /* incremented when a system wide session uses debug regs */
+	unsigned long   	pfs_ptrace_use_dbregs;	   /* incremented when a process uses debug regs */
+	struct task_struct	*pfs_sys_session[NR_CPUS]; /* point to task owning a system-wide session */
 } pfm_session_t;
 
 /*
+ * information about a PMC or PMD.
+ * dep_pmd[]: a bitmask of dependent PMD registers 
+ * dep_pmc[]: a bitmask of dependent PMC registers
+ */
+typedef struct {
+	pfm_pmu_reg_type_t	type;
+	int			pm_pos;
+	int			(*read_check)(struct task_struct *task, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
+	int			(*write_check)(struct task_struct *task, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
+	unsigned long		dep_pmd[4];
+	unsigned long		dep_pmc[4];
+} pfm_reg_desc_t;
+/* assume cnum is a valid monitor */
+#define PMC_PM(cnum, val)	(((val) >> (pmu_conf.pmc_desc[cnum].pm_pos)) & 0x1)
+#define PMC_WR_FUNC(cnum)	(pmu_conf.pmc_desc[cnum].write_check)
+#define PMD_WR_FUNC(cnum)	(pmu_conf.pmd_desc[cnum].write_check)
+#define PMD_RD_FUNC(cnum)	(pmu_conf.pmd_desc[cnum].read_check)
+
+/*
+ * This structure is initialized at boot time and contains
+ * a description of the PMU main characteristic as indicated
+ * by PAL along with a list of inter-registers dependencies and configurations.
+ */
+typedef struct {
+	unsigned long pfm_is_disabled;	/* indicates if perfmon is working properly */
+	unsigned long perf_ovfl_val;	/* overflow value for generic counters   */
+	unsigned long max_counters;	/* upper limit on counter pair (PMC/PMD) */
+	unsigned long num_pmcs ;	/* highest PMC implemented (may have holes) */
+	unsigned long num_pmds;		/* highest PMD implemented (may have holes) */
+	unsigned long impl_regs[16];	/* buffer used to hold implemented PMC/PMD mask */
+	unsigned long num_ibrs;		/* number of instruction debug registers */
+	unsigned long num_dbrs;		/* number of data debug registers */
+	pfm_reg_desc_t *pmc_desc;	/* detailed PMC register descriptions */
+	pfm_reg_desc_t *pmd_desc;	/* detailed PMD register descriptions */
+} pmu_config_t;
+
+
+/*
  * structure used to pass argument to/from remote CPU 
  * using IPI to check and possibly save the PMU context on SMP systems.
  *
@@ -301,22 +355,50 @@
 #define PFM_CMD_NARG(cmd)	(pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_narg)
 #define PFM_CMD_ARG_SIZE(cmd)	(pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_argsize)
 
+typedef struct {
+	int	debug;		/* turn on/off debugging via syslog */
+	int	debug_ovfl;	/* turn on/off debug printk in overflow handler */
+	int	fastctxsw;	/* turn on/off fast (unsecure) ctxsw */
+} pfm_sysctl_t;
+
+typedef struct {
+	unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
+	unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */
+	unsigned long pfm_recorded_samples_count;
+	unsigned long pfm_full_smpl_buffer_count; /* how many times the sampling buffer was full */
+} pfm_stats_t;
 
 /*
  * perfmon internal variables
  */
 static pmu_config_t	pmu_conf; 	/* PMU configuration */
-static int		pfm_debug_mode;	/* 0= nodebug, >0= debug output on */
 static pfm_session_t	pfm_sessions;	/* global sessions information */
 static struct proc_dir_entry *perfmon_dir; /* for debug only */
-static unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
-static unsigned long pfm_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
-static unsigned long pfm_recorded_samples_count;
+static pfm_stats_t	pfm_stats;
 
+/* sysctl() controls */
+static pfm_sysctl_t pfm_sysctl;
+
+static ctl_table pfm_ctl_table[]={
+	{1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
+	{2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},
+	{3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},
+	{ 0, },
+};
+static ctl_table pfm_sysctl_dir[] = {
+	{1, "perfmon", NULL, 0, 0755, pfm_ctl_table, },
+ 	{0,},
+};
+static ctl_table pfm_sysctl_root[] = {
+	{1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, },
+ 	{0,},
+};
+static struct ctl_table_header *pfm_sysctl_header;
 
 static unsigned long reset_pmcs[IA64_NUM_PMC_REGS];	/* contains PAL reset values for PMCS */
 
 static void pfm_vm_close(struct vm_area_struct * area);
+
 static struct vm_operations_struct pfm_vm_ops={
 	close: pfm_vm_close
 };
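The three knobs above surface as /proc/sys/kernel/perfmon/{debug,debug_ovfl,fastctxsw}, so debugging output can now be toggled at run time without going through the existing debug command (index 13 in pfm_cmd_tab). The registration of pfm_sysctl_header is not part of this hunk; a minimal sketch, assuming it happens in the perfmon init path with the stock 2.4 sysctl API, would be:

	/* sketch only -- the actual registration call is outside this hunk */
	pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
	if (pfm_sysctl_header == NULL)
		printk("perfmon: cannot register sysctl entries\n");
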
@@ -339,6 +421,14 @@
 #endif
 static void pfm_lazy_save_regs (struct task_struct *ta);
 
+#if   defined(CONFIG_ITANIUM)
+#include "perfmon_itanium.h"
+#elif defined(CONFIG_MCKINLEY)
+#include "perfmon_mckinley.h"
+#else
+#include "perfmon_generic.h"
+#endif
+
 static inline unsigned long
 pfm_read_soft_counter(pfm_context_t *ctx, int i)
 {
@@ -353,7 +443,7 @@
 	 * writing to unimplemented part is ignore, so we do not need to
 	 * mask off top part
 	 */
-	ia64_set_pmd(i, val);
+	ia64_set_pmd(i, val & pmu_conf.perf_ovfl_val);
 }
 
 /*
@@ -424,7 +514,6 @@
 	return pa;
 }
 
-
 static void *
 pfm_rvmalloc(unsigned long size)
 {
@@ -500,7 +589,7 @@
 	 *
 	 * This function cannot remove the buffer from here, because exit_mmap() must first
 	 * complete. Given that there is no other vma related callback in the generic code,
-	 * we have created on own with the linked list of sampling buffer to free which
+	 * we have created our own with the linked list of sampling buffers to free. The list
 	 * is part of the thread structure. In release_thread() we check if the list is
 	 * empty. If not we call into perfmon to free the buffer and psb. That is the only
 	 * way to ensure a safe deallocation of the sampling buffer which works when
@@ -516,16 +605,15 @@
 		psb->psb_next = current->thread.pfm_smpl_buf_list;
 		current->thread.pfm_smpl_buf_list = psb;
 
-		DBprintk(("psb for [%d] smpl @%p size %ld inserted into list\n", 
-			current->pid, psb->psb_hdr, psb->psb_size));
+		DBprintk(("[%d] add smpl @%p size %lu to smpl_buf_list psb_flags=0x%x\n", 
+			current->pid, psb->psb_hdr, psb->psb_size, psb->psb_flags));
 	}
-	DBprintk(("psb vma flag cleared for [%d] smpl @%p size %ld inserted into list\n", 
-			current->pid, psb->psb_hdr, psb->psb_size));
-
+	DBprintk(("[%d] clearing psb_flags=0x%x smpl @%p size %lu\n", 
+			current->pid, psb->psb_flags, psb->psb_hdr, psb->psb_size));
 	/*
-	 * indicate to pfm_context_exit() that the vma has been removed. 
+	 * decrement the number of vmas for the buffer
 	 */
-	psb->psb_flags &= ~PFM_PSB_VMA;
+	psb->psb_flags &= ~PSB_HAS_VMA;
 
 	UNLOCK_PSB(psb);
 }
@@ -548,7 +636,7 @@
 		printk("perfmon: invalid context mm=%p\n", task->mm);
 		return -1;
 	}
-	psb = ctx->ctx_psb;	
+	psb = ctx->ctx_psb;
 
 	down_write(&task->mm->mmap_sem);
 
@@ -559,14 +647,9 @@
 		printk("perfmon: pid %d unable to unmap sampling buffer @0x%lx size=%ld\n", 
 				task->pid, ctx->ctx_smpl_vaddr, psb->psb_size);
 	}
-	DBprintk(("[%d] do_unmap(0x%lx, %ld)=%d\n", 
-		task->pid, ctx->ctx_smpl_vaddr, psb->psb_size, r));
 
-	/* 
-	 * make sure we suppress all traces of this buffer
-	 * (important for pfm_inherit)
-	 */
-	ctx->ctx_smpl_vaddr = 0;
+	DBprintk(("[%d] do_unmap(0x%lx, %ld)=%d refcnt=%lu psb_flags=0x%x\n", 
+		task->pid, ctx->ctx_smpl_vaddr, psb->psb_size, r, psb->psb_refcnt, psb->psb_flags));
 
 	return 0;
 }
@@ -599,7 +682,7 @@
 	while (size > 0) {
 		page = pfm_kvirt_to_pa(buf);
 
-		if (remap_page_range(addr, page, PAGE_SIZE, PAGE_SHARED)) return -ENOMEM;
+		if (remap_page_range(addr, page, PAGE_SIZE, PAGE_READONLY)) return -ENOMEM;
 		
 		addr  += PAGE_SIZE;
 		buf   += PAGE_SIZE;
@@ -638,17 +721,25 @@
 	void *smpl_buf;
 	pfm_smpl_buffer_desc_t *psb;
 
-	regcount = pfm_smpl_entry_size(which_pmds, 1);
 
 	/* note that regcount might be 0, in this case only the header for each
 	 * entry will be recorded.
 	 */
+	regcount = pfm_smpl_entry_size(which_pmds, 1);
+
+	if ((sizeof(perfmon_smpl_hdr_t)+ entries*sizeof(perfmon_smpl_entry_t)) <= entries) {
+		DBprintk(("requested entries %lu is too big\n", entries));
+		return -EINVAL;
+	}
 
 	/*
 	 * 1 buffer hdr and for each entry a header + regcount PMDs to save
 	 */
 	size = PAGE_ALIGN(  sizeof(perfmon_smpl_hdr_t)
 			  + entries * (sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64)));
+
+	DBprintk(("sampling buffer size=%lu bytes\n", size));
+
 	/*
 	 * check requested size to avoid Denial-of-service attacks
 	 * XXX: may have to refine this test	
@@ -688,9 +779,13 @@
 	}
 	/*
 	 * partially initialize the vma for the sampling buffer
+	 *
+	 * The VM_DONTCOPY flag is very important as it ensures that the mapping
+	 * will never be inherited by any child process (via fork()), which is always
+	 * what we want.
 	 */
 	vma->vm_mm	     = mm;
-	vma->vm_flags	     = VM_READ| VM_MAYREAD |VM_RESERVED;
+	vma->vm_flags	     = VM_READ| VM_MAYREAD |VM_RESERVED|VM_DONTCOPY;
 	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
 	vma->vm_ops	     = &pfm_vm_ops; /* necesarry to get the close() callback */
 	vma->vm_pgoff	     = 0;
@@ -708,8 +803,8 @@
 	psb->psb_size    = size; /* aligned size */
 	psb->psb_index   = 0;
 	psb->psb_entries = entries;
-	psb->psb_flags   = PFM_PSB_VMA; /* remember that there is a vma describing the buffer */
 	psb->psb_refcnt  = 1;
+	psb->psb_flags   = PSB_HAS_VMA;
 
 	spin_lock_init(&psb->psb_lock);
 
@@ -719,9 +814,9 @@
 	 */
 	psb->psb_entry_size = sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64);
 
-	DBprintk(("psb @%p entry_size=%ld hdr=%p addr=%p\n", 
+	DBprintk(("psb @%p entry_size=%ld hdr=%p addr=%p refcnt=%lu psb_flags=0x%x\n", 
 		  (void *)psb,psb->psb_entry_size, (void *)psb->psb_hdr, 
-		  (void *)psb->psb_addr));
+		  (void *)psb->psb_addr, psb->psb_refcnt, psb->psb_flags));
 
 	/* initialize some of the fields of user visible buffer header */
 	psb->psb_hdr->hdr_version    = PFM_SMPL_VERSION;
@@ -797,7 +892,6 @@
 	    && (current->uid ^ task->suid) && (current->uid ^ task->uid);
 }
 
-
 static int
 pfx_is_sane(struct task_struct *task, pfarg_context_t *pfx)
 {
@@ -813,6 +907,11 @@
 	}
 	ctx_flags = pfx->ctx_flags;
 
+	if ((ctx_flags & PFM_FL_INHERIT_MASK) == (PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)) {
+		DBprintk(("invalid inherit mask 0x%x\n",ctx_flags & PFM_FL_INHERIT_MASK));
+		return -EINVAL;
+	}
+
 	if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
 		DBprintk(("cpu_mask=0x%lx\n", pfx->ctx_cpu_mask));
 		/*
@@ -832,8 +931,8 @@
 		/*
 		 * and it must be a valid CPU
 		 */
-		cpu = ffs(pfx->ctx_cpu_mask);
-		if (cpu > smp_num_cpus) {
+		cpu = ffz(~pfx->ctx_cpu_mask);
+		if (cpu_is_online(cpu) == 0) {
 			DBprintk(("CPU%d is not online\n", cpu));
 			return -EINVAL;
 		}
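The ffs()-to-ffz() switch above deserves a note: for a one-bit cpu mask m, ffz(~m) gives the 0-based index of the set bit (ffz finds the first zero bit, and complementing m turns its set bit into that zero), whereas ffs() is 1-based, which is why the old code in pfm_context_create() below subtracted 1. The online test now goes through cpu_online_map instead of comparing against smp_num_cpus, which presumably copes with sparse CPU numbering. Illustration only, not part of the patch:

	unsigned long mask = 1UL << 5;	/* caller requested CPU 5 */
	int cpu;

	cpu = ffz(~mask);		/* == 5 (0-based) */
	/* ffs(mask) == 6 (1-based), hence the old "ffs(...) - 1" */
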
@@ -851,7 +950,16 @@
 		 * must provide a target for the signal in blocking mode even when
 		 * no counter is configured with PFM_FL_REG_OVFL_NOTIFY
 		 */
-		if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == 0) return -EINVAL;
+		if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == 0) {
+			DBprintk(("must have notify_pid when blocking for [%d]\n", task->pid));
+			return -EINVAL;
+		}
+#if 0
+		if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == task->pid) {
+			DBprintk(("cannot notify self when blocking for [%d]\n", task->pid));
+			return -EINVAL;
+		}
+#endif
 	}
 	/* probably more to add here */
 
@@ -859,7 +967,7 @@
 }
 
 static int
-pfm_create_context(struct task_struct *task, pfm_context_t *ctx, void *req, int count, 
+pfm_context_create(struct task_struct *task, pfm_context_t *ctx, void *req, int count, 
 		   struct pt_regs *regs)
 {
 	pfarg_context_t tmp;
@@ -890,7 +998,7 @@
 	if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
 
 		/* at this point, we know there is at least one bit set */
-		cpu = ffs(tmp.ctx_cpu_mask) - 1;
+		cpu = ffz(~tmp.ctx_cpu_mask);
 
 		DBprintk(("requesting CPU%d currently on CPU%d\n",cpu, smp_processor_id()));
 
@@ -984,7 +1092,7 @@
 	}
 
 	if (tmp.ctx_smpl_entries) {
-		DBprintk(("sampling entries=%ld\n",tmp.ctx_smpl_entries));
+		DBprintk(("sampling entries=%lu\n",tmp.ctx_smpl_entries));
 
 		ret = pfm_smpl_buffer_alloc(ctx, tmp.ctx_smpl_regs, 
 						 tmp.ctx_smpl_entries, &uaddr);
@@ -1010,20 +1118,12 @@
 
 	atomic_set(&ctx->ctx_last_cpu,-1); /* SMP only, means no CPU */
 
-	/* 
-	 * Keep track of the pmds we want to sample
-	 * XXX: may be we don't need to save/restore the DEAR/IEAR pmds
-	 * but we do need the BTB for sure. This is because of a hardware
-	 * buffer of 1 only for non-BTB pmds.
-	 *
-	 * We ignore the unimplemented pmds specified by the user
-	 */
-	ctx->ctx_used_pmds[0]  = tmp.ctx_smpl_regs[0] & pmu_conf.impl_regs[4];
-	ctx->ctx_saved_pmcs[0] = 1; /* always save/restore PMC[0] */
+	/* may be redundant with memset() but at least it's easier to remember */
+	atomic_set(&ctx->ctx_saving_in_progress, 0); 
+	atomic_set(&ctx->ctx_is_busy, 0); 
 
 	sema_init(&ctx->ctx_restart_sem, 0); /* init this semaphore to locked */
 
-
 	if (copy_to_user(req, &tmp, sizeof(tmp))) {
 		ret = -EFAULT;
 		goto buffer_error;
@@ -1126,21 +1226,22 @@
 			  	current->pid, 
 				flag == PFM_RELOAD_LONG_RESET ? "long" : "short", i, val));
 	}
+	ia64_srlz_d();
 	/* just in case ! */
 	ctx->ctx_ovfl_regs[0] = 0UL;
 }
 
 static int
-pfm_write_pmcs(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
+pfm_write_pmcs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 {
-	struct thread_struct *th = &ta->thread;
+	struct thread_struct *th = &task->thread;
 	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
 	unsigned int cnum;
 	int i;
 	int ret = 0, reg_retval = 0;
 
 	/* we don't quite support this right now */
-	if (ta != current) return -EINVAL;
+	if (task != current) return -EINVAL;
 
 	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
 
@@ -1169,30 +1270,30 @@
 		 * 	- per-task : user monitor
 		 * any other configuration is rejected.
 		 */
-		if (PMC_IS_MONITOR(cnum)) {
-			pfm_monitor_t *p = (pfm_monitor_t *)&tmp.reg_value;
-
-			DBprintk(("pmc[%u].pm = %d\n", cnum, p->pmc_pm));
+		if (PMC_IS_MONITOR(cnum) || PMC_IS_COUNTING(cnum)) {
+			DBprintk(("pmc[%u].pm=%ld\n", cnum, PMC_PM(cnum, tmp.reg_value))); 
 
-			if (ctx->ctx_fl_system ^ p->pmc_pm) {
-			//if ((ctx->ctx_fl_system == 1 && p->pmc_pm == 0)
-			 //  ||(ctx->ctx_fl_system == 0 && p->pmc_pm == 1)) {
+			if (ctx->ctx_fl_system ^ PMC_PM(cnum, tmp.reg_value)) {
+				DBprintk(("pmc_pm=%ld fl_system=%d\n", PMC_PM(cnum, tmp.reg_value), ctx->ctx_fl_system));
 				ret = -EINVAL;
 				goto abort_mission;
 			}
-			/*
-			 * enforce generation of overflow interrupt. Necessary on all
-			 * CPUs which do not implement 64-bit hardware counters.
-			 */
-			p->pmc_oi = 1;
 		}
 
 		if (PMC_IS_COUNTING(cnum)) {
+			pfm_monitor_t *p = (pfm_monitor_t *)&tmp.reg_value;
+			/*
+		 	 * enforce generation of overflow interrupt. Necessary on all
+		 	 * CPUs.
+		 	 */
+			p->pmc_oi = 1;
+
 			if (tmp.reg_flags & PFM_REGFL_OVFL_NOTIFY) {
 				/*
 				 * must have a target for the signal
 				 */
 				if (ctx->ctx_notify_task == NULL) {
+					DBprintk(("no notify_task && PFM_REGFL_OVFL_NOTIFY\n"));
 					ret = -EINVAL;
 					goto abort_mission;
 				}
@@ -1206,14 +1307,11 @@
 			ctx->ctx_soft_pmds[cnum].reset_pmds[1] = tmp.reg_reset_pmds[1];
 			ctx->ctx_soft_pmds[cnum].reset_pmds[2] = tmp.reg_reset_pmds[2];
 			ctx->ctx_soft_pmds[cnum].reset_pmds[3] = tmp.reg_reset_pmds[3];
-
-			/*
-			 * needed in case the user does not initialize the equivalent
-			 * PMD. Clearing is done in reset_pmu() so there is no possible
-			 * leak here.
-			 */
-			CTX_USED_PMD(ctx, cnum);
 		}
+		/*
+		 * execute write checker, if any
+		 */
+		if (PMC_WR_FUNC(cnum)) ret = PMC_WR_FUNC(cnum)(task, cnum, &tmp.reg_value, regs);
 abort_mission:
 		if (ret == -EINVAL) reg_retval = PFM_REG_RETFL_EINVAL;
 
@@ -1233,14 +1331,21 @@
 		 */
 		if (ret != 0) {
 			DBprintk(("[%d] pmc[%u]=0x%lx error %d\n",
-				  ta->pid, cnum, tmp.reg_value, reg_retval));
+				  task->pid, cnum, tmp.reg_value, reg_retval));
 			break;
 		}
 
 		/* 
 		 * We can proceed with this register!
 		 */
-		
+
+		/*
+		 * Needed in case the user does not initialize the equivalent
+		 * PMD. Clearing is done in reset_pmu() so there is no possible
+		 * leak here.
+		 */
+		CTX_USED_PMD(ctx, pmu_conf.pmc_desc[cnum].dep_pmd[0]);
+
 		/* 
 		 * keep copy the pmc, used for register reload
 		 */
@@ -1248,17 +1353,17 @@
 
 		ia64_set_pmc(cnum, tmp.reg_value);
 
-		DBprintk(("[%d] pmc[%u]=0x%lx flags=0x%x save_pmcs=0%lx reload_pmcs=0x%lx\n", 
-			  ta->pid, cnum, tmp.reg_value, 
+		DBprintk(("[%d] pmc[%u]=0x%lx flags=0x%x used_pmds=0x%lx\n", 
+			  task->pid, cnum, tmp.reg_value, 
 			  ctx->ctx_soft_pmds[cnum].flags, 
-			  ctx->ctx_saved_pmcs[0], ctx->ctx_reload_pmcs[0]));
+			  ctx->ctx_used_pmds[0]));
 
 	}
 	return ret;
 }
 
 static int
-pfm_write_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
+pfm_write_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 {
 	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
 	unsigned int cnum;
@@ -1266,7 +1371,7 @@
 	int ret = 0, reg_retval = 0;
 
 	/* we don't quite support this right now */
-	if (ta != current) return -EINVAL;
+	if (task != current) return -EINVAL;
 
 	/* 
 	 * Cannot do anything before PMU is enabled 
@@ -1281,7 +1386,6 @@
 		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
 
 		cnum = tmp.reg_num;
-
 		if (!PMD_IS_IMPL(cnum)) {
 			ret = -EINVAL;
 			goto abort_mission;
@@ -1295,6 +1399,10 @@
 			ctx->ctx_soft_pmds[cnum].short_reset = tmp.reg_short_reset;
 
 		}
+		/*
+		 * execute write checker, if any
+		 */
+		if (PMD_WR_FUNC(cnum)) ret = PMD_WR_FUNC(cnum)(task, cnum, &tmp.reg_value, regs);
 abort_mission:
 		if (ret == -EINVAL) reg_retval = PFM_REG_RETFL_EINVAL;
 
@@ -1311,21 +1419,24 @@
 		 */
 		if (ret != 0) {
 			DBprintk(("[%d] pmc[%u]=0x%lx error %d\n",
-				  ta->pid, cnum, tmp.reg_value, reg_retval));
+				  task->pid, cnum, tmp.reg_value, reg_retval));
 			break;
 		}
 
 		/* keep track of what we use */
-		CTX_USED_PMD(ctx, cnum);
+		CTX_USED_PMD(ctx, pmu_conf.pmd_desc[(cnum)].dep_pmd[0]);
+		/* mark this register as used as well */
+		CTX_USED_PMD(ctx, RDEP(cnum));
 
 		/* writes to unimplemented part is ignored, so this is safe */
-		ia64_set_pmd(cnum, tmp.reg_value);
+		ia64_set_pmd(cnum, tmp.reg_value & pmu_conf.perf_ovfl_val);
 
 		/* to go away */
 		ia64_srlz_d();
+
 		DBprintk(("[%d] pmd[%u]: soft_pmd=0x%lx  short_reset=0x%lx "
 			  "long_reset=0x%lx hw_pmd=%lx notify=%c used_pmds=0x%lx reset_pmds=0x%lx\n",
-				ta->pid, cnum,
+				task->pid, cnum,
 				ctx->ctx_soft_pmds[cnum].val,
 				ctx->ctx_soft_pmds[cnum].short_reset,
 				ctx->ctx_soft_pmds[cnum].long_reset,
@@ -1338,12 +1449,13 @@
 }
 
 static int
-pfm_read_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
+pfm_read_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 {
-	struct thread_struct *th = &ta->thread;
+	struct thread_struct *th = &task->thread;
 	unsigned long val=0;
 	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
-	int i;
+	unsigned int cnum;
+	int i, ret = 0;
 
 	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
 
@@ -1356,14 +1468,25 @@
 
 	/* XXX: ctx locking may be required here */
 
-	DBprintk(("ctx_last_cpu=%d for [%d]\n", atomic_read(&ctx->ctx_last_cpu), ta->pid));
+	DBprintk(("ctx_last_cpu=%d for [%d]\n", atomic_read(&ctx->ctx_last_cpu), task->pid));
 
 	for (i = 0; i < count; i++, req++) {
-		unsigned long reg_val = ~0UL, ctx_val = ~0UL;
+		unsigned long ctx_val = ~0UL;
 
 		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
 
-		if (!PMD_IS_IMPL(tmp.reg_num)) goto abort_mission;
+		cnum = tmp.reg_num;
+
+		if (!PMD_IS_IMPL(cnum)) goto abort_mission;
+		/*
+		 * we can only read the registers that we use. That includes
+		 * the ones we explicitly initialize AND the ones we want included
+		 * in the sampling buffer (smpl_regs).
+		 *
+		 * Having this restriction allows optimization in the ctxsw routine
+		 * without compromising security (leaks)
+		 */
+		if (!CTX_IS_USED_PMD(ctx, cnum)) goto abort_mission;
 
 		/*
 		 * If the task is not the current one, then we check if the
@@ -1372,8 +1495,8 @@
 		 */
 		if (atomic_read(&ctx->ctx_last_cpu) == smp_processor_id()){
 			ia64_srlz_d();
-			val = reg_val = ia64_get_pmd(tmp.reg_num);
-			DBprintk(("reading pmd[%u]=0x%lx from hw\n", tmp.reg_num, val));
+			val = ia64_get_pmd(cnum);
+			DBprintk(("reading pmd[%u]=0x%lx from hw\n", cnum, val));
 		} else {
 #ifdef CONFIG_SMP
 			int cpu;
@@ -1389,30 +1512,35 @@
 			 */
 			cpu = atomic_read(&ctx->ctx_last_cpu);
 			if (cpu != -1) {
-				DBprintk(("must fetch on CPU%d for [%d]\n", cpu, ta->pid));
-				pfm_fetch_regs(cpu, ta, ctx);
+				DBprintk(("must fetch on CPU%d for [%d]\n", cpu, task->pid));
+				pfm_fetch_regs(cpu, task, ctx);
 			}
 #endif
 			/* context has been saved */
-			val = reg_val = th->pmd[tmp.reg_num];
+			val = th->pmd[cnum];
 		}
-		if (PMD_IS_COUNTING(tmp.reg_num)) {
+		if (PMD_IS_COUNTING(cnum)) {
 			/*
 			 * XXX: need to check for overflow
 			 */
 
 			val &= pmu_conf.perf_ovfl_val;
-			val += ctx_val = ctx->ctx_soft_pmds[tmp.reg_num].val;
-		} else {
+			val += ctx_val = ctx->ctx_soft_pmds[cnum].val;
+		} 
 
-			val = reg_val = ia64_get_pmd(tmp.reg_num);
-		}
-		PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);
 		tmp.reg_value = val;
 
-		DBprintk(("read pmd[%u] soft_pmd=0x%lx reg=0x%lx pmc=0x%lx\n", 
-					tmp.reg_num, ctx_val, reg_val, 
-					ia64_get_pmc(tmp.reg_num)));
+		/*
+		 * execute read checker, if any
+		 */
+		if (PMD_RD_FUNC(cnum)) {
+			ret = PMD_RD_FUNC(cnum)(task, cnum, &tmp.reg_value, regs);
+		}
+
+		PFM_REG_RETFLAG_SET(tmp.reg_flags, ret);
+
+		DBprintk(("read pmd[%u] ret=%d value=0x%lx pmc=0x%lx\n", 
+					cnum, ret, val, ia64_get_pmc(cnum)));
 
 		if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
 	}
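For counting PMDs, pfm_read_pmds() rebuilds the full 64-bit value from two pieces: the live hardware register, masked with perf_ovfl_val to keep only the implemented low bits, plus the software accumulator ctx_soft_pmds[cnum].val, which the overflow handler advances so that the sum stays a monotonically increasing 64-bit count. Condensed to its core (a sketch, with the ownership and checker logic stripped):

	/* 64-bit virtualization of an N-bit hardware counter */
	val  = hw_pmd & pmu_conf.perf_ovfl_val;	/* low bits, from hardware       */
	val += ctx->ctx_soft_pmds[cnum].val;	/* overflow count, kept software */
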
@@ -1420,7 +1548,7 @@
 abort_mission:
 	PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
 	/* 
-	 * XXX: if this fails, we stick we the original failure, flag not updated!
+	 * XXX: if this fails, we stick with the original failure, flag not updated!
 	 */
 	copy_to_user(req, &tmp, sizeof(tmp));
 	return -EINVAL;
@@ -1455,15 +1583,11 @@
 	 */
 	if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1;
 
-	/*
-	 * XXX: not pretty
-	 */
 	LOCK_PFS();
 
 	/*
-	 * We only allow the use of debug registers when there is no system
-	 * wide monitoring 
-	 * XXX: we could relax this by 
+	 * We cannot allow setting breakpoints when system wide monitoring
+	 * sessions are using the debug registers.
 	 */
 	if (pfm_sessions.pfs_sys_use_dbregs> 0)
 		ret = -1;
@@ -1516,6 +1640,7 @@
 {
 	return 0;
 }
+
 int
 pfm_release_debug_registers(struct task_struct *task)
 {
@@ -1534,12 +1659,6 @@
 	 */
 	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
 
-
-	if (ctx->ctx_fl_frozen==0) {
-		printk("task %d without pmu_frozen set\n", task->pid);
-		return -EINVAL;
-	}
-
 	if (task == current) {
 		DBprintk(("restarting self %d frozen=%d \n", current->pid, ctx->ctx_fl_frozen));
 
@@ -1656,25 +1775,35 @@
 				current->pid,
 				ctx->ctx_fl_system, PMU_OWNER(),
 				current));
+
 	/* simply stop monitoring but not the PMU */
 	if (ctx->ctx_fl_system) {
 
-		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");
-
 		/* disable dcr pp */
 		ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);
 
+		/* stop monitoring */
+		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");
+
+		ia64_srlz_i();
+
 #ifdef CONFIG_SMP
 		local_cpu_data->pfm_dcr_pp  = 0;
 #else
 		pfm_tasklist_toggle_pp(0);
 #endif
-
 		ia64_psr(regs)->pp = 0;
 
 	} else {
+
+		/* stop monitoring */
 		__asm__ __volatile__ ("rum psr.up;;"::: "memory");
 
+		ia64_srlz_i();
+
+		/*
+		 * clear user level psr.up
+		 */
 		ia64_psr(regs)->up = 0;
 	}
 	return 0;
@@ -1701,7 +1830,7 @@
 		ia64_psr(regs)->up = 0;
 	}
 	/* 
-	 * goes back to default behavior 
+	 * goes back to default behavior: no user level control
 	 * no need to change live psr.sp because useless at the kernel level
 	 */
 	ia64_psr(regs)->sp = 1;
@@ -1713,10 +1842,8 @@
 	return 0;
 }
 
-
-
 static int
-pfm_destroy_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, 
+pfm_context_destroy(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, 
 	 struct pt_regs *regs)
 {
 	/* we don't quite support this right now */
@@ -1742,15 +1869,14 @@
 		ia64_psr(regs)->up = 0;
 	}
 
-	/* restore security level */
-	ia64_psr(regs)->sp = 1;
-
 skipped_stop:
 	/*
 	 * remove sampling buffer mapping, if any
 	 */
-	if (ctx->ctx_smpl_vaddr) pfm_remove_smpl_mapping(task);
-
+	if (ctx->ctx_smpl_vaddr) {
+		pfm_remove_smpl_mapping(task);
+		ctx->ctx_smpl_vaddr = 0UL;
+	}
 	/* now free context and related state */
 	pfm_context_exit(task);
 
@@ -1761,7 +1887,7 @@
  * does nothing at the moment
  */
 static int
-pfm_unprotect_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, 
+pfm_context_unprotect(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, 
 	 struct pt_regs *regs)
 {
 	return 0;
@@ -1791,9 +1917,9 @@
 {
 	unsigned int mode = *(unsigned int *)arg;
 
-	pfm_debug_mode = mode == 0 ? 0 : 1;
+	pfm_sysctl.debug = mode == 0 ? 0 : 1;
 
-	printk("perfmon debugging %s\n", pfm_debug_mode ? "on" : "off");
+	printk("perfmon debugging %s\n", pfm_sysctl.debug ? "on" : "off");
 
 	return 0;
 }
@@ -1863,8 +1989,8 @@
 	if (ctx->ctx_fl_system) {
 		/* we mark ourselves as owner  of the debug registers */
 		ctx->ctx_fl_using_dbreg = 1;
-	} else {
-       		if (ctx->ctx_fl_using_dbreg == 0) {
+		DBprintk(("system-wide setting fl_using_dbreg for [%d]\n", task->pid));
+	} else if (first_time) {
 			ret= -EBUSY;
 			if ((thread->flags & IA64_THREAD_DBG_VALID) != 0) {
 				DBprintk(("debug registers already in use for [%d]\n", task->pid));
@@ -1873,6 +1999,7 @@
 			/* we mark ourselves as owner  of the debug registers */
 			ctx->ctx_fl_using_dbreg = 1;
 
+			DBprintk(("setting fl_using_dbreg for [%d]\n", task->pid));
 			/* 
 			 * Given debug registers cannot be used for both debugging 
 			 * and performance monitoring at the same time, we reuse
@@ -1880,18 +2007,27 @@
 			 */
 			memset(task->thread.dbr, 0, sizeof(task->thread.dbr));
 			memset(task->thread.ibr, 0, sizeof(task->thread.ibr));
+	}
 
-			/*
-			 * clear hardware registers to make sure we don't leak
-			 * information and pick up stale state
-			 */
-			for (i=0; i < pmu_conf.num_ibrs; i++) {
-				ia64_set_ibr(i, 0UL);
-			}
-			for (i=0; i < pmu_conf.num_dbrs; i++) {
-				ia64_set_dbr(i, 0UL);
-			}
+	if (first_time) {
+		DBprintk(("[%d] clearing ibrs,dbrs\n", task->pid));
+		/*
+	 	 * clear hardware registers to make sure we don't
+	 	 * pick up stale state. 
+		 *
+		 * for a system wide session, we do not use
+		 * thread.dbr, thread.ibr because this process
+		 * never leaves the current CPU and the state
+		 * is shared by all processes running on it
+	 	 */
+		for (i=0; i < pmu_conf.num_ibrs; i++) {
+			ia64_set_ibr(i, 0UL);
+		}
+		ia64_srlz_i();
+		for (i=0; i < pmu_conf.num_dbrs; i++) {
+			ia64_set_dbr(i, 0UL);
 		}
+		ia64_srlz_d();
 	}
 
 	ret = -EFAULT;
@@ -1951,6 +2087,7 @@
 			CTX_USED_IBR(ctx, rnum);
 
 			ia64_set_ibr(rnum, dbreg.val);
+			ia64_srlz_i();
 
 			thread->ibr[rnum] = dbreg.val;
 
@@ -1959,6 +2096,7 @@
 			CTX_USED_DBR(ctx, rnum);
 
 			ia64_set_dbr(rnum, dbreg.val);
+			ia64_srlz_d();
 
 			thread->dbr[rnum] = dbreg.val;
 
@@ -2058,27 +2196,35 @@
 
 	if (ctx->ctx_fl_system) {
 		
-		/* enable dcr pp */
-		ia64_set_dcr(ia64_get_dcr()|IA64_DCR_PP);
-
 #ifdef CONFIG_SMP
 		local_cpu_data->pfm_dcr_pp  = 1;
 #else
 		pfm_tasklist_toggle_pp(1);
 #endif
+		/* set user level psr.pp */
 		ia64_psr(regs)->pp = 1;
 
+		/* start monitoring at kernel level */
 		__asm__ __volatile__ ("ssm psr.pp;;"::: "memory");
 
+		/* enable dcr pp */
+		ia64_set_dcr(ia64_get_dcr()|IA64_DCR_PP);
+
+		ia64_srlz_i();
+
 	} else {
 		if ((task->thread.flags & IA64_THREAD_PM_VALID) == 0) {
 			printk("perfmon: pfm_start task flag not set for [%d]\n", task->pid);
 			return -EINVAL;
 		}
+		/* set user level psr.up */
 		ia64_psr(regs)->up = 1;
+
+		/* start monitoring at kernel level */
 		__asm__ __volatile__ ("sum psr.up;;"::: "memory");
+
+		ia64_srlz_i();
 	}
-	ia64_srlz_d();
 
 	return 0;
 }
@@ -2101,7 +2247,9 @@
 		ia64_psr(regs)->pp = 0;
 		ia64_psr(regs)->up = 0; /* just to make sure! */
 
+		/* make sure monitoring is stopped */
 		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");
+		ia64_srlz_i();
 
 #ifdef CONFIG_SMP
 		local_cpu_data->pfm_syst_wide = 1;
@@ -2116,21 +2264,21 @@
 		ia64_psr(regs)->pp = 0; /* just to make sure! */
 		ia64_psr(regs)->up = 0;
 
+		/* make sure monitoring is stopped */
 		__asm__ __volatile__ ("rum psr.up;;"::: "memory");
-		/*
-		 * allow user control (user monitors only)
-		if (task  == ctx->ctx_owner) {
-		 */
-		{
-			DBprintk(("clearing psr.sp for [%d]\n", current->pid));
-			ia64_psr(regs)->sp = 0;
-		}
+		ia64_srlz_i();
+
+		DBprintk(("clearing psr.sp for [%d]\n", current->pid));
+
+		/* allow user level control  */
+		ia64_psr(regs)->sp = 0;
+
+		/* PMU state will be saved/restored on ctxsw */
 		task->thread.flags |= IA64_THREAD_PM_VALID;
 	}
 
 	SET_PMU_OWNER(task);
 
-
 	ctx->ctx_flags.state = PFM_CTX_ENABLED;
 	atomic_set(&ctx->ctx_last_cpu, smp_processor_id());
 
@@ -2141,6 +2289,40 @@
 	return 0;
 }
 
+static int
+pfm_get_pmc_reset(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, 
+	   struct pt_regs *regs)
+{
+	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
+	unsigned int cnum;
+	int i;
+
+	for (i = 0; i < count; i++, req++) {
+
+		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
+
+		cnum = tmp.reg_num;
+
+		if (!PMC_IS_IMPL(cnum)) goto abort_mission;
+
+		tmp.reg_value = reset_pmcs[cnum];
+
+		PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);
+
+		DBprintk(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, tmp.reg_value)); 
+
+		if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
+	}
+	return 0;
+abort_mission:
+	PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
+	/* 
+	 * XXX: if this fails, we stick with the original failure, flag not updated!
+	 */
+	copy_to_user(req, &tmp, sizeof(tmp));
+	return -EINVAL;
+}
+
 /*
  * functions MUST be listed in the increasing order of their index (see permfon.h)
  */
@@ -2148,19 +2330,19 @@
 /* 0  */{ NULL, 0, 0, 0}, /* not used */
 /* 1  */{ pfm_write_pmcs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)}, 
 /* 2  */{ pfm_write_pmds, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
-/* 3  */{ pfm_read_pmds, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
+/* 3  */{ pfm_read_pmds,PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)}, 
 /* 4  */{ pfm_stop, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 5  */{ pfm_start, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 6  */{ pfm_enable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 7  */{ pfm_disable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
-/* 8  */{ pfm_create_context, PFM_CMD_ARG_READ, 1, sizeof(pfarg_context_t)},
-/* 9  */{ pfm_destroy_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
+/* 8  */{ pfm_context_create, PFM_CMD_PID|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, 1, sizeof(pfarg_context_t)},
+/* 9  */{ pfm_context_destroy, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 10 */{ pfm_restart, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_NOCHK, 0, 0},
 /* 11 */{ pfm_protect_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 12 */{ pfm_get_features, PFM_CMD_ARG_WRITE, 0, 0},
 /* 13 */{ pfm_debug, 0, 1, sizeof(unsigned int)},
-/* 14 */{ pfm_unprotect_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
-/* 15 */{ NULL, 0, 0, 0}, /* not used */
+/* 14 */{ pfm_context_unprotect, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
+/* 15 */{ pfm_get_pmc_reset, PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
 /* 16 */{ NULL, 0, 0, 0}, /* not used */
 /* 17 */{ NULL, 0, 0, 0}, /* not used */
 /* 18 */{ NULL, 0, 0, 0}, /* not used */
@@ -2222,9 +2404,9 @@
 {
 	struct pt_regs *regs = (struct pt_regs *)&stack;
 	struct task_struct *task = current;
-	pfm_context_t *ctx = task->thread.pfm_context;
+	pfm_context_t *ctx;
 	size_t sz;
-	int ret = -ESRCH, narg;
+	int ret, narg;
 
 	/* 
 	 * reject any call if perfmon was disabled at initialization time
@@ -2254,6 +2436,8 @@
 
 		if (pid != current->pid) {
 
+			ret = -ESRCH;
+
 			read_lock(&tasklist_lock);
 
 			task = find_task_by_pid(pid);
@@ -2268,10 +2452,11 @@
 				ret = check_task_state(task);
 				if (ret != 0) goto abort_call;
 			}
-			ctx = task->thread.pfm_context;
-		}
+		} 
 	} 
 
+	ctx = task->thread.pfm_context;
+
 	if (PFM_CMD_USE_CTX(cmd)) {
 		ret = -EINVAL;
 	       if (ctx == NULL) {
@@ -2387,9 +2572,9 @@
 	int j;
 
 
-pfm_recorded_samples_count++;
+
 	idx = ia64_fetch_and_add(1, &psb->psb_index);
-	DBprintk(("recording index=%ld entries=%ld\n", idx-1, psb->psb_entries));
+	DBprintk_ovfl(("recording index=%ld entries=%ld\n", idx-1, psb->psb_entries));
 
 	/*
 	* XXX: there is a small chance that we could run out on index before resetting
@@ -2409,7 +2594,7 @@
 	/*
 	 * initialize entry header
 	 */
-	h->pid  = task->pid;
+	h->pid  = current->pid;
 	h->cpu  = smp_processor_id();
 	h->rate = 0; /* XXX: add the sampling rate used here */
 	h->ip   = regs ? regs->cr_iip : 0x0;	/* where did the fault happened */
@@ -2437,24 +2622,27 @@
 		} else {
 			*e = ia64_get_pmd(j); /* slow */
 		}
-		DBprintk(("e=%p pmd%d =0x%lx\n", (void *)e, j, *e));
+		DBprintk_ovfl(("e=%p pmd%d =0x%lx\n", (void *)e, j, *e));
 		e++;
 	}
+	pfm_stats.pfm_recorded_samples_count++;
+
 	/*
 	 * make the new entry visible to user, needs to be atomic
 	 */
 	ia64_fetch_and_add(1, &psb->psb_hdr->hdr_count);
 
-	DBprintk(("index=%ld entries=%ld hdr_count=%ld\n", 
+	DBprintk_ovfl(("index=%ld entries=%ld hdr_count=%ld\n", 
 				idx, psb->psb_entries, psb->psb_hdr->hdr_count));
 	/* 
 	 * sampling buffer full ? 
 	 */
 	if (idx == (psb->psb_entries-1)) {
-		DBprintk(("sampling buffer full\n"));
+		DBprintk_ovfl(("sampling buffer full\n"));
 		/*
 		 * XXX: must reset buffer in blocking mode and lost notified
 		 */
+		pfm_stats.pfm_full_smpl_buffer_count++;
 		return 1;
 	}
 	return 0;
@@ -2467,15 +2655,13 @@
  *	new value of pmc[0]. if 0x0 then unfreeze, else keep frozen
  */
 static unsigned long
-pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)
+pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs)
 {
 	unsigned long mask;
 	struct thread_struct *t;
-	pfm_context_t *ctx;
 	unsigned long old_val;
 	unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL;
 	int i;
-	int my_cpu = smp_processor_id();
 	int ret = 1;
 	struct siginfo si;
 	/*
@@ -2491,18 +2677,7 @@
 	 * valid one, i.e. the one that caused the interrupt.
 	 */
 
-	if (task == NULL) {
-		DBprintk(("owners[%d]=NULL\n", my_cpu));
-		return 0x1;
-	}
 	t   = &task->thread;
-	ctx = task->thread.pfm_context;
-
-	if (!ctx) {
-		printk("perfmon: Spurious overflow interrupt: process %d has no PFM context\n", 
-			task->pid);
-		return 0;
-	}
 
 	/*
 	 * XXX: debug test
@@ -2524,12 +2699,12 @@
 
 	mask = pmc0 >> PMU_FIRST_COUNTER;
 
-	DBprintk(("pmc0=0x%lx pid=%d iip=0x%lx, %s"
-		  " mode used_pmds=0x%lx save_pmcs=0x%lx reload_pmcs=0x%lx\n", 
+	DBprintk_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s"
+		  " mode used_pmds=0x%lx used_pmcs=0x%lx reload_pmcs=0x%lx\n", 
 			pmc0, task->pid, (regs ? regs->cr_iip : 0), 
 			CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking",
 			ctx->ctx_used_pmds[0],
-			ctx->ctx_saved_pmcs[0],
+			ctx->ctx_used_pmcs[0],
 			ctx->ctx_reload_pmcs[0]));
 
 	/*
@@ -2540,7 +2715,7 @@
 		/* skip pmd which did not overflow */
 		if ((mask & 0x1) == 0) continue;
 
-		DBprintk(("PMD[%d] overflowed hw_pmd=0x%lx soft_pmd=0x%lx\n", 
+		DBprintk_ovfl(("pmd[%d] overflowed hw_pmd=0x%lx soft_pmd=0x%lx\n", 
 			  i, ia64_get_pmd(i), ctx->ctx_soft_pmds[i].val));
 
 		/*
@@ -2552,8 +2727,7 @@
 		old_val = ctx->ctx_soft_pmds[i].val;
 		ctx->ctx_soft_pmds[i].val = 1 + pmu_conf.perf_ovfl_val + pfm_read_soft_counter(ctx, i);
 
-
-		DBprintk(("soft_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx\n", 
+		DBprintk_ovfl(("soft_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx\n", 
 			  i, ctx->ctx_soft_pmds[i].val, old_val, 
 			  ia64_get_pmd(i) & pmu_conf.perf_ovfl_val));
 
@@ -2570,7 +2744,7 @@
 
 			ovfl_pmds |= 1UL << i;
 
-			DBprintk(("soft_pmd[%d] overflowed flags=0x%x, ovfl=0x%lx\n", i, ctx->ctx_soft_pmds[i].flags, ovfl_pmds));
+			DBprintk_ovfl(("soft_pmd[%d] overflowed flags=0x%x, ovfl=0x%lx\n", i, ctx->ctx_soft_pmds[i].flags, ovfl_pmds));
 
 			if (PMC_OVFL_NOTIFY(ctx, i)) {
 				ovfl_notify |= 1UL << i;
@@ -2609,7 +2783,8 @@
 	 * No overflow requiring a user level notification
 	 */
 	if (ovfl_notify == 0UL) {
-		pfm_reset_regs(ctx, &ovfl_pmds, PFM_RELOAD_SHORT_RESET);
+		if (ovfl_pmds) 
+			pfm_reset_regs(ctx, &ovfl_pmds, PFM_RELOAD_SHORT_RESET);
 		return 0x0;
 	}
 
@@ -2684,7 +2859,7 @@
 	 	 * necessarily go to the signal handler (if any) when it goes back to
 	 	 * user mode.
 	 	 */
-		DBprintk(("[%d] sending notification to [%d]\n", 
+		DBprintk_ovfl(("[%d] sending notification to [%d]\n", 
 			  task->pid, ctx->ctx_notify_task->pid));
 
 
@@ -2717,7 +2892,7 @@
 		 * before, changing it to NULL will still maintain this invariant.
 		 * Of course, when it is equal to current it cannot change at this point.
 		 */
-		DBprintk(("block=%d notify [%d] current [%d]\n", 
+		DBprintk_ovfl(("block=%d notify [%d] current [%d]\n", 
 			ctx->ctx_fl_block,
 			ctx->ctx_notify_task ? ctx->ctx_notify_task->pid: -1, 
 			current->pid ));
@@ -2728,7 +2903,7 @@
 	} else {
 lost_notify: /* XXX: more to do here, to convert to non-blocking (reset values) */
 
-		DBprintk(("notification task has disappeared !\n"));
+		DBprintk_ovfl(("notification task has disappeared !\n"));
 		/*
 		 * for a non-blocking context, we make sure we do not fall into the 
 		 * pfm_overflow_notify() trap. Also in the case of a blocking context with lost 
@@ -2750,7 +2925,7 @@
 	 */
 	ctx->ctx_fl_frozen = 1;
 
-	DBprintk(("reload pmc0=0x%x must_block=%ld\n",
+	DBprintk_ovfl(("return pmc0=0x%x must_block=%ld\n",
 				ctx->ctx_fl_frozen ? 0x1 : 0x0, t->pfm_ovfl_block_reset));
 
 	return ctx->ctx_fl_frozen ? 0x1 : 0x0;
@@ -2761,8 +2936,9 @@
 {
 	u64 pmc0;
 	struct task_struct *task;
+	pfm_context_t *ctx;
 
-	pfm_ovfl_intr_count++;
+	pfm_stats.pfm_ovfl_intr_count++;
 
 	/* 
 	 * srlz.d done before arriving here
@@ -2776,24 +2952,54 @@
 	 * assumes : if any PM[0].bit[63-1] is set, then PMC[0].fr = 1
 	 */
 	if ((pmc0 & ~0x1UL)!=0UL && (task=PMU_OWNER())!= NULL) {
-
 		/* 
-		 * assumes, PMC[0].fr = 1 at this point 
-		 *
-		 * XXX: change protype to pass &pmc0
+		 * we assume that pmc0.fr is always set here
 		 */
-		pmc0 = pfm_overflow_handler(task, pmc0, regs);
+		ctx = task->thread.pfm_context;
 
-		/* we never explicitely freeze PMU here */
-		if (pmc0 == 0) {
-			ia64_set_pmc(0, 0);
-			ia64_srlz_d();
+		/* sanity check */
+		if (!ctx) {
+			printk("perfmon: Spurious overflow interrupt: process %d has no PFM context\n", 
+				task->pid);
+			return;
 		}
+#ifdef CONFIG_SMP
+		/*
+		 * Because an IPI has higher priority than the PMU overflow interrupt, it is 
+		 * possible that the handler be interrupted by a request from another CPU to fetch 
+		 * the PMU state of the currently active context. The task may have just been 
+		 * migrated to another CPU which is trying to restore the context. If there was
+		 * a pending overflow interrupt when the task left this CPU, it is possible for
+		 * the handler to get interrupted by the IPI. In that case, the fetch request
+		 * MUST be postponed until the interrupt handler is done. The ctx_is_busy
+		 * flag indicates such a condition. The other CPU must busy wait until it's cleared.
+		 */
+		atomic_set(&ctx->ctx_is_busy, 1);
+#endif
+
+		/* 
+		 * assume PMC[0].fr = 1 at this point 
+		 */
+		pmc0 = pfm_overflow_handler(task, ctx, pmc0, regs);
+
+		/*
+		 * We always clear the overflow status bits and either unfreeze
+		 * or keep the PMU frozen.
+		 */
+		ia64_set_pmc(0, pmc0);
+		ia64_srlz_d();
+
+#ifdef CONFIG_SMP
+		/*
+		 * announce that we are done with the context
+		 */
+		atomic_set(&ctx->ctx_is_busy, 0);
+#endif
 	} else {
-		pfm_spurious_ovfl_intr_count++;
+		pfm_stats.pfm_spurious_ovfl_intr_count++;
 
-		DBprintk(("perfmon: Spurious PMU overflow interrupt on CPU%d: pmc0=0x%lx owner=%p\n", 
-			smp_processor_id(), pmc0, (void *)PMU_OWNER()));
+		printk("perfmon: Spurious PMU overflow interrupt on CPU%d: pmc0=0x%lx owner=%p\n", 
+			smp_processor_id(), pmc0, (void *)PMU_OWNER());
 	}
 }
 
@@ -2801,33 +3007,31 @@
 static int
 perfmon_proc_info(char *page)
 {
-#ifdef CONFIG_SMP
-#define cpu_is_online(i) (cpu_online_map & (1UL << i))
-#else
-#define cpu_is_online(i)        1
-#endif
 	char *p = page;
-	u64 pmc0 = ia64_get_pmc(0);
 	int i;
 
-	p += sprintf(p, "perfmon enabled: %s\n", pmu_conf.pfm_is_disabled ? "No": "Yes");
-
-	p += sprintf(p, "monitors_pmcs0]=0x%lx\n", pmu_conf.monitor_pmcs[0]);
-	p += sprintf(p, "counter_pmcds[0]=0x%lx\n", pmu_conf.counter_pmds[0]);
-	p += sprintf(p, "overflow interrupts=%lu\n", pfm_ovfl_intr_count);
-	p += sprintf(p, "spurious overflow interrupts=%lu\n", pfm_spurious_ovfl_intr_count);
-	p += sprintf(p, "recorded samples=%lu\n", pfm_recorded_samples_count);
-
-	p += sprintf(p, "CPU%d.pmc[0]=%lx\nPerfmon debug: %s\n", 
-			smp_processor_id(), pmc0, pfm_debug_mode ? "On" : "Off");
+	p += sprintf(p, "enabled          : %s\n", pmu_conf.pfm_is_disabled ? "No": "Yes");
+	p += sprintf(p, "fastctxsw        : %s\n", pfm_sysctl.fastctxsw > 0 ? "Yes": "No");
+	p += sprintf(p, "ovfl_mask        : 0x%lx\n", pmu_conf.perf_ovfl_val);
+	p += sprintf(p, "overflow intrs   : %lu\n", pfm_stats.pfm_ovfl_intr_count);
+	p += sprintf(p, "spurious intrs   : %lu\n", pfm_stats.pfm_spurious_ovfl_intr_count);
+	p += sprintf(p, "recorded samples : %lu\n", pfm_stats.pfm_recorded_samples_count);
+	p += sprintf(p, "smpl buffer full : %lu\n", pfm_stats.pfm_full_smpl_buffer_count);
 
 #ifdef CONFIG_SMP
-	p += sprintf(p, "CPU%d cpu_data.pfm_syst_wide=%d cpu_data.dcr_pp=%d\n", 
-			smp_processor_id(), local_cpu_data->pfm_syst_wide, local_cpu_data->pfm_dcr_pp);
+	p += sprintf(p, "CPU%d syst_wide   : %d\n"
+			"CPU%d dcr_pp      : %d\n", 
+			smp_processor_id(), 
+			local_cpu_data->pfm_syst_wide, 
+			smp_processor_id(), 
+			local_cpu_data->pfm_dcr_pp);
 #endif
 
 	LOCK_PFS();
-	p += sprintf(p, "proc_sessions=%lu\nsys_sessions=%lu\nsys_use_dbregs=%lu\nptrace_use_dbregs=%lu\n", 
+	p += sprintf(p, "proc_sessions    : %lu\n"
+			"sys_sessions     : %lu\n"
+			"sys_use_dbregs   : %lu\n"
+			"ptrace_use_dbregs: %lu\n", 
 			pfm_sessions.pfs_task_sessions, 
 			pfm_sessions.pfs_sys_sessions,
 			pfm_sessions.pfs_sys_use_dbregs,
@@ -2837,12 +3041,28 @@
 
 	for(i=0; i < NR_CPUS; i++) {
 		if (cpu_is_online(i)) {
-			p += sprintf(p, "CPU%d.pmu_owner: %-6d\n",
+			p += sprintf(p, "CPU%d owner : %-6d\n",
 					i, 
 					pmu_owners[i].owner ? pmu_owners[i].owner->pid: -1);
 		}
 	}
 
+	for(i=0; pmd_desc[i].type != PFM_REG_NONE; i++) {
+		p += sprintf(p, "PMD%-2d: %d 0x%lx 0x%lx\n", 
+				i,
+				pmd_desc[i].type, 
+				pmd_desc[i].dep_pmd[0], 
+				pmd_desc[i].dep_pmc[0]); 
+	}
+
+	for(i=0; pmc_desc[i].type != PFM_REG_NONE; i++) {
+		p += sprintf(p, "PMC%-2d: %d 0x%lx 0x%lx\n", 
+				i, 
+				pmc_desc[i].type, 
+				pmc_desc[i].dep_pmd[0], 
+				pmc_desc[i].dep_pmc[0]); 
+	}
+
 	return p - page;
 }
 
@@ -2901,6 +3121,7 @@
 	 * It will be restored from ipsr when going back to user level
 	 */
 	__asm__ __volatile__ ("rum psr.up;;"::: "memory");
+	ia64_srlz_i();
 
 	ctx->ctx_saved_psr = psr;
 
@@ -2956,13 +3177,9 @@
 	for (i=0; mask; i++, mask>>=1) {
 		if (mask & 0x1) t->pmd[i] =ia64_get_pmd(i);
 	}
-	/*
-	 * XXX: simplify to pmc0 only
-	 */
-	mask = ctx->ctx_saved_pmcs[0];
-	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) t->pmc[i] = ia64_get_pmc(i);
-	}
+
+	/* save pmc0 */
+	t->pmc[0] = ia64_get_pmc(0);
 
 	/* not owned by this CPU */
 	atomic_set(&ctx->ctx_last_cpu, -1);
@@ -3000,6 +3217,12 @@
 		  PMU_OWNER() ? PMU_OWNER()->pid: -1,
 		  atomic_read(&ctx->ctx_saving_in_progress)));
 
+	/* must wait until not busy before retrying the whole request */
+	if (atomic_read(&ctx->ctx_is_busy)) {
+		arg->retval = 2;
+		return;
+	}
+
 	/* must wait if saving was interrupted */
 	if (atomic_read(&ctx->ctx_saving_in_progress)) {
 		arg->retval = 1;
@@ -3012,9 +3235,9 @@
 		return;
 	}
 
-	DBprintk(("saving state for [%d] save_pmcs=0x%lx all_pmcs=0x%lx used_pmds=0x%lx\n", 
+	DBprintk(("saving state for [%d] used_pmcs=0x%lx reload_pmcs=0x%lx used_pmds=0x%lx\n", 
 		arg->task->pid,
-		ctx->ctx_saved_pmcs[0],
+		ctx->ctx_used_pmcs[0],
 		ctx->ctx_reload_pmcs[0],
 		ctx->ctx_used_pmds[0]));
 
@@ -3027,17 +3250,15 @@
 
 	/*
 	 * XXX needs further optimization.
-	 * Also must take holes into account
 	 */
 	mask = ctx->ctx_used_pmds[0];
 	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) t->pmd[i] =ia64_get_pmd(i);
-	}
-	
-	mask = ctx->ctx_saved_pmcs[0];
-	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) t->pmc[i] = ia64_get_pmc(i);
+		if (mask & 0x1) t->pmd[i] = ia64_get_pmd(i);
 	}
+
+	/* save pmc0 */
+	t->pmc[0] = ia64_get_pmc(0);
+
 	/* not owned by this CPU */
 	atomic_set(&ctx->ctx_last_cpu, -1);
 
@@ -3066,11 +3287,17 @@
 	arg.task   = task;
 	arg.retval = -1;
 
+	if (atomic_read(&ctx->ctx_is_busy)) {
+must_wait_busy:
+		while (atomic_read(&ctx->ctx_is_busy));
+	}
+
 	if (atomic_read(&ctx->ctx_saving_in_progress)) {
 		DBprintk(("no IPI, must wait for [%d] to be saved on [%d]\n", task->pid, cpu));
-
+must_wait_saving:
 		/* busy wait */
 		while (atomic_read(&ctx->ctx_saving_in_progress));
+		DBprintk(("done saving for [%d] on [%d]\n", task->pid, cpu));
 		return;
 	}
 	DBprintk(("calling CPU %d from CPU %d\n", cpu, smp_processor_id()));
@@ -3090,11 +3317,8 @@
 	 * This is the case, where we interrupted the saving which started just at the time we sent the
 	 * IPI.
 	 */
-	if (arg.retval == 1) {
-		DBprintk(("must wait for [%d] to be saved on [%d]\n", task->pid, cpu));
-		while (atomic_read(&ctx->ctx_saving_in_progress));
-		DBprintk(("done saving for [%d] on [%d]\n", task->pid, cpu));
-	}
+	if (arg.retval == 1) goto must_wait_saving;
+	if (arg.retval == 2) goto must_wait_busy;
 }
 #endif /* CONFIG_SMP */
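The ctx_is_busy handshake added here closes an SMP window: the PMU overflow handler can itself be interrupted by the IPI that pfm_fetch_regs() sends from another CPU, so it now brackets its work with ctx_is_busy, and the IPI target answers retval=2 instead of touching the context. The requesting CPU then spins until the flag clears and retries the whole request. Schematically (not the literal code):

	/* CPU taking the overflow interrupt */
	atomic_set(&ctx->ctx_is_busy, 1);
	pmc0 = pfm_overflow_handler(task, ctx, pmc0, regs);
	atomic_set(&ctx->ctx_is_busy, 0);

	/* CPU trying to fetch the context (pfm_fetch_regs) */
	if (arg.retval == 2) {
		while (atomic_read(&ctx->ctx_is_busy));	/* busy wait */
		/* ...then re-issue the fetch */
	}
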
 
@@ -3113,6 +3337,30 @@
 
 	owner = PMU_OWNER();
 	ctx   = task->thread.pfm_context;
+	t     = &task->thread;
+
+	/*
+	 * we restore ALL the debug registers to avoid picking up 
+	 * stale state.
+	 *
+	 * This must be done even when the task is still the owner
+	 * as the registers may have been modified via ptrace()
+	 * (not perfmon) by the previous task. 
+	 *
+	 * XXX: dealing with this in a lazy fashion requires modifications
+	 * to the way the debug registers are managed. This will be done
+	 * in the next version of perfmon.
+	 */
+	if (ctx->ctx_fl_using_dbreg) {
+		for (i=0; i < pmu_conf.num_ibrs; i++) {
+			ia64_set_ibr(i, t->ibr[i]);
+		}
+		ia64_srlz_i();
+		for (i=0; i < pmu_conf.num_dbrs; i++) {
+			ia64_set_dbr(i, t->dbr[i]);
+		}
+		ia64_srlz_d();
+	}
 
 	/*
 	 * if we were the last user, then nothing to do except restore psr
@@ -3148,55 +3396,37 @@
 		pfm_fetch_regs(cpu, task, ctx);
 	}
 #endif
-	t   = &task->thread;
 
 	/*
-	 * XXX: will be replaced by assembly routine
-	 * We clear all unused PMDs to avoid leaking information
+	 * To avoid leaking information to the user level when psr.sp=0,
+	 * we must reload ALL implemented pmds (even the ones we don't use).
+	 * In the kernel we only allow PFM_READ_PMDS on registers which
+	 * we initialized or requested (sampling) so there is no risk there.
+	 *
+	 * As an optimization, we will only reload the PMDs that we use when
+	 * the context is in protected mode, i.e. psr.sp=1 because then there
+	 * is no leak possible.
 	 */
-	mask = ctx->ctx_used_pmds[0];
+	mask = pfm_sysctl.fastctxsw || ctx->ctx_fl_protected ?  ctx->ctx_used_pmds[0] : ctx->ctx_reload_pmds[0];
 	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) 
-			ia64_set_pmd(i, t->pmd[i]);
-		else
-			ia64_set_pmd(i, 0UL);
+		if (mask & 0x1) ia64_set_pmd(i, t->pmd[i] & pmu_conf.perf_ovfl_val);
 	}
-	/* XXX: will need to clear all unused pmd, for security */
 
 	/* 
-	 * skip pmc[0] to avoid side-effects, 
-	 * all PMCs are systematically reloaded, unsued get default value
-	 * to avoid picking up stale configuration
+	 * PMC0 is never set in the mask because it is always restored
+	 * separately.  
+	 *
+	 * ALL PMCs are systematically reloaded, unused registers
+	 * get their default (PAL reset) values to avoid picking up 
+	 * stale configuration.
 	 */	
-	mask = ctx->ctx_reload_pmcs[0]>>1;
-	for (i=1; mask; i++, mask>>=1) {
-		if (mask & 0x1) ia64_set_pmc(i, t->pmc[i]);
-	}
-
-	/*
-	 * restore debug registers when used for range restrictions.
-	 * We must restore the unused registers to avoid picking up
-	 * stale information.
-	 */
-	mask = ctx->ctx_used_ibrs[0];
+	mask = ctx->ctx_reload_pmcs[0];
 	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) 
-			ia64_set_ibr(i, t->ibr[i]);
-		else
-			ia64_set_ibr(i, 0UL);
-	}
-
-	mask = ctx->ctx_used_dbrs[0];
-	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) 
-			ia64_set_dbr(i, t->dbr[i]);
-		else
-			ia64_set_dbr(i, 0UL);
+		if (mask & 0x1) ia64_set_pmc(i, t->pmc[i]);
 	}
 
 	if (t->pmc[0] & ~0x1) {
-		ia64_srlz_d();
-		pfm_overflow_handler(task, t->pmc[0], NULL);
+		pfm_overflow_handler(task, ctx, t->pmc[0], NULL);
 	}
 
 	/*
@@ -3249,7 +3479,7 @@
 			 * When restoring context, we must restore ALL pmcs, even the ones 
 			 * that the task does not use to avoid leaks and possibly corruption
 			 * of the sesion because of configuration conflicts. So here, we 
-			 * initializaed the table used in the context switch restore routine.
+			 * initialize the entire set used in the context switch restore routine.
 	 		 */
 			t->pmc[i] = reset_pmcs[i];
 			DBprintk((" pmc[%d]=0x%lx\n", i, reset_pmcs[i]));
@@ -3258,39 +3488,61 @@
 	}
 	/*
 	 * clear reset values for PMD. 
-	 * XX: good up to 64 PMDS. Suppose that zero is a valid value.
+	 * XXX: good up to 64 PMDS. Suppose that zero is a valid value.
 	 */
 	mask = pmu_conf.impl_regs[4];
 	for(i=0; mask; mask>>=1, i++) {
 		if (mask & 0x1) ia64_set_pmd(i, 0UL);
+		t->pmd[i] = 0UL;
 	}
 
 	/*
-	 * On context switched restore, we must restore ALL pmc even
+	 * On context switched restore, we must restore ALL pmc and ALL pmd even
 	 * when they are not actively used by the task. In UP, the incoming process 
-	 * may otherwise pick up left over PMC state from the previous process.
+	 * may otherwise pick up left over PMC, PMD state from the previous process.
 	 * As opposed to PMD, stale PMC can cause harm to the incoming
 	 * process because they may change what is being measured. 
 	 * Therefore, we must systematically reinstall the entire
 	 * PMC state. In SMP, the same thing is possible on the 
-	 * same CPU but also on between 2 CPUs.
+	 * same CPU but also between 2 CPUs.
+	 *
+	 * The problem with PMDs is information leaking, especially
+	 * to the user level when psr.sp=0.
 	 *
 	 * There is unfortunately no easy way to avoid this problem
-	 * on either UP or SMP. This definitively slows down the 
-	 * pfm_load_regs(). 
+	 * on either UP or SMP. This definitely slows down the
+	 * pfm_load_regs() function. 
 	 */
 	
 	 /*
 	  * We must include all the PMC in this mask to make sure we don't
-	  * see any side effect of the stale state, such as opcode matching
+	  * see any side effect of a stale state, such as opcode matching
 	  * or range restrictions, for instance.
+	  *
+	  * We never directly restore PMC0 so we do not include it in the mask.
 	  */
-	ctx->ctx_reload_pmcs[0] = pmu_conf.impl_regs[0];
+	ctx->ctx_reload_pmcs[0] = pmu_conf.impl_regs[0] & ~0x1;
+	/*
+	 * We must include all the PMDs in this mask to avoid picking
+	 * up stale values and leaking information, especially directly
+	 * at the user level when psr.sp=0.
+	 */
+	ctx->ctx_reload_pmds[0] = pmu_conf.impl_regs[4];
+
+	/* 
+	 * Keep track of the pmds we want to sample
+	 * XXX: maybe we don't need to save/restore the DEAR/IEAR pmds,
+	 * but we do need the BTB for sure. This is because of a hardware
+	 * buffer of 1 only for non-BTB pmds.
+	 *
+	 * We ignore the unimplemented pmds specified by the user
+	 */
+	ctx->ctx_used_pmds[0] = ctx->ctx_smpl_regs[0] & pmu_conf.impl_regs[4];
+	ctx->ctx_used_pmcs[0] = 1; /* always save/restore PMC[0] */
 
 	/*
 	 * useful in case of re-enable after disable
 	 */
-	ctx->ctx_used_pmds[0] = 0UL;
 	ctx->ctx_used_ibrs[0] = 0UL;
 	ctx->ctx_used_dbrs[0] = 0UL;
 
@@ -3312,7 +3564,7 @@
 {
 	pfm_context_t *ctx;
 	u64 pmc0;
-	unsigned long mask, mask2, val;
+	unsigned long mask2, val;
 	int i;
 
 	ctx = task->thread.pfm_context;
@@ -3334,22 +3586,28 @@
 	 */
 	if (ctx->ctx_fl_system) {
 
-		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");
 
 		/* disable dcr pp */
 		ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);
 
+		/* stop monitoring */
+		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");
+
+		ia64_srlz_i();
+
 #ifdef CONFIG_SMP
 		local_cpu_data->pfm_syst_wide = 0;
 		local_cpu_data->pfm_dcr_pp    = 0;
 #else
 		pfm_tasklist_toggle_pp(0);
 #endif
-
 	} else  {
 
+		/* stop monitoring */
 		__asm__ __volatile__ ("rum psr.up;;"::: "memory");
 
+		ia64_srlz_i();
+
 		/* no more save/restore on ctxsw */
 		current->thread.flags &= ~IA64_THREAD_PM_VALID;
 	}
@@ -3383,7 +3641,7 @@
 	ia64_srlz_d();
 
 	/*
-	 * We don't need to restore psr, because we are on our way out anyway
+	 * We don't need to restore psr, because we are on our way out
 	 */
 
 	/*
@@ -3399,10 +3657,12 @@
 	if (atomic_read(&ctx->ctx_last_cpu) != smp_processor_id()) 
 		printk("perfmon: [%d] last_cpu=%d\n", task->pid, atomic_read(&ctx->ctx_last_cpu));
 
-	mask  = pmc0 >> PMU_FIRST_COUNTER;
-	mask2 = ctx->ctx_used_pmds[0] >> PMU_FIRST_COUNTER;
-
-	for (i = PMU_FIRST_COUNTER; mask2; i++, mask>>=1, mask2>>=1) {
+	/*
+	 * we save all the used pmds
+	 * we take care of overflows for pmds used as counters
+	 */
+	mask2 = ctx->ctx_used_pmds[0];
+	for (i = 0; mask2; i++, mask2>>=1) {
 
 		/* skip non used pmds */
 		if ((mask2 & 0x1) == 0) continue;
@@ -3410,7 +3670,6 @@
 		val = ia64_get_pmd(i);
 
 		if (PMD_IS_COUNTING(i)) {
-
 			DBprintk(("[%d] pmd[%d] soft_pmd=0x%lx hw_pmd=0x%lx\n", task->pid, i, ctx->ctx_soft_pmds[i].val, val & pmu_conf.perf_ovfl_val));
 
 			/* collect latest results */
@@ -3423,15 +3682,19 @@
 			 */
 			task->thread.pmd[i] = 0;
 
-			/* take care of overflow inline */
-			if (mask & 0x1) {
+			/* 
+			 * take care of overflow inline
+			 */
+			if (pmc0 & (1UL << i)) {
 				ctx->ctx_soft_pmds[i].val += 1 + pmu_conf.perf_ovfl_val;
 				DBprintk(("[%d] pmd[%d] overflowed soft_pmd=0x%lx\n",
 					task->pid, i, ctx->ctx_soft_pmds[i].val));
 			}
 		} else {
 			DBprintk(("[%d] pmd[%d] hw_pmd=0x%lx\n", task->pid, i, val));
-			/* not a counter, just save value as is */
+			/* 
+			 * not a counter, just save value as is
+			 */
 			task->thread.pmd[i] = val;
 		}
 	}
@@ -3449,31 +3712,73 @@
 int
 pfm_inherit(struct task_struct *task, struct pt_regs *regs)
 {
-	pfm_context_t *ctx = current->thread.pfm_context;
+	pfm_context_t *ctx;
 	pfm_context_t *nctx;
-	struct thread_struct *th = &task->thread;
+	struct thread_struct *thread;
 	unsigned long m;
 	int i;
 
 	/*
+	 * the new task was copied from parent and therefore points
+	 * to the parent's context at this point
+	 */
+	ctx    = task->thread.pfm_context;
+	thread = &task->thread;
+
+	/*
 	 * make sure child cannot mess up the monitoring session
 	 */
 	 ia64_psr(regs)->sp = 1;
 	 DBprintk(("enabling psr.sp for [%d]\n", task->pid));
 
-	 /*
-	  * remove any sampling buffer mapping from child user 
-	  * address space. Must be done for all cases of inheritance.
-	  */
-	 if (ctx->ctx_smpl_vaddr) pfm_remove_smpl_mapping(task);
+
+	/*
+	 * if there was a virtual mapping for the sampling buffer,
+	 * the mapping is NOT inherited across fork() (see VM_DONTCOPY),
+	 * so we don't have to explicitly remove it here.
+	 *
+	 * Part of the clearing of fields is also done in
+	 * copy_thread() because the fields are outside the
+	 * pfm_context structure and can affect tasks not
+	 * using perfmon.
+	 */
+
+	/* clear pending notification */
+	task->thread.pfm_ovfl_block_reset = 0;
+
+	/*
+	 * clear cpu pinning restriction for child
+	 */
+	if (ctx->ctx_fl_system) {
+		task->cpus_allowed = ctx->ctx_saved_cpus_allowed;
+		task->need_resched = 1;
+
+	 	DBprintk(("setting cpus_allowed for [%d] to 0x%lx from 0x%lx\n", 
+			task->pid,
+			ctx->ctx_saved_cpus_allowed, 
+			current->cpus_allowed));
+	}
 
 	/*
 	 * takes care of easiest case first
 	 */
 	if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_NONE) {
+
 		DBprintk(("removing PFM context for [%d]\n", task->pid));
-		task->thread.pfm_context     = NULL;
-		task->thread.pfm_ovfl_block_reset  = 0;
+
+		task->thread.pfm_context = NULL;
+
+		/* 
+		 * we must clear psr.up because the new child does
+		 * not have a context and the PM_VALID flag is cleared
+		 * in copy_thread().
+		 *
+		 * we do not clear psr.pp because it is always
+		 * controlled by the system wide logic and we should
+		 * never be here when system wide is running anyway
+		 */
+	 	ia64_psr(regs)->up = 0;
 
 		/* copy_thread() clears IA64_THREAD_PM_VALID */
 		return 0;
@@ -3487,69 +3792,82 @@
 
 	if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_ONCE) {
 		nctx->ctx_fl_inherit = PFM_FL_INHERIT_NONE;
-		atomic_set(&nctx->ctx_last_cpu, -1);
-
-		/*
-		 * task is not yet visible in the tasklist, so we do 
-		 * not need to lock the newly created context.
-		 * However, we must grab the tasklist_lock to ensure
-		 * that the ctx_owner or ctx_notify_task do not disappear
-		 * while we increment their check counters.
-		 */
-		read_lock(&tasklist_lock);
+		DBprintk(("downgrading to INHERIT_NONE for [%d]\n", task->pid));
+	}
+	/*
+	 * task is not yet visible in the tasklist, so we do 
+	 * not need to lock the newly created context.
+	 * However, we must grab the tasklist_lock to ensure
+	 * that the ctx_owner or ctx_notify_task do not disappear
+	 * while we increment their check counters.
+	 */
+	read_lock(&tasklist_lock);
 
-		if (nctx->ctx_notify_task) 
-			atomic_inc(&nctx->ctx_notify_task->thread.pfm_notifiers_check);
+	if (nctx->ctx_notify_task) 
+		atomic_inc(&nctx->ctx_notify_task->thread.pfm_notifiers_check);
 
-		if (nctx->ctx_owner)
-			atomic_inc(&nctx->ctx_owner->thread.pfm_owners_check);
+	if (nctx->ctx_owner)
+		atomic_inc(&nctx->ctx_owner->thread.pfm_owners_check);
 
-		read_unlock(&tasklist_lock);
+	read_unlock(&tasklist_lock);
 
-		DBprintk(("downgrading to INHERIT_NONE for [%d]\n", task->pid));
 
-		LOCK_PFS();
-		pfm_sessions.pfs_task_sessions++;
-		UNLOCK_PFS();
-	}
+	LOCK_PFS();
+	pfm_sessions.pfs_task_sessions++;
+	UNLOCK_PFS();
 
 	/* initialize counters in new context */
-	m = pmu_conf.counter_pmds[0] >> PMU_FIRST_COUNTER;
+	m = nctx->ctx_used_pmds[0] >> PMU_FIRST_COUNTER;
 	for(i = PMU_FIRST_COUNTER ; m ; m>>=1, i++) {
-		if (m & 0x1) {
+		if ((m & 0x1) && pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING) {
 			nctx->ctx_soft_pmds[i].val = nctx->ctx_soft_pmds[i].ival & ~pmu_conf.perf_ovfl_val;
-			th->pmd[i]	      	   = nctx->ctx_soft_pmds[i].ival & pmu_conf.perf_ovfl_val;
+			thread->pmd[i]	      	   = nctx->ctx_soft_pmds[i].ival & pmu_conf.perf_ovfl_val;
 		}
+		/* what about the other pmds? zero or keep as is */
 
 	}
-	/* clear BTB index register */
-	th->pmd[16] = 0;
+	/*
+	 * clear BTB index register
+	 * XXX: CPU-model specific knowledge!
+	 */
+	thread->pmd[16] = 0;
 
-	/* if sampling then increment number of users of buffer */
-	if (nctx->ctx_psb) {
 
-		/*
-		 * XXX: nopt very pretty!
-		 */
+	nctx->ctx_fl_frozen    = 0;
+	nctx->ctx_ovfl_regs[0] = 0UL;
+	atomic_set(&nctx->ctx_last_cpu, -1);
+
+	/*
+	 * here nctx->ctx_psb == ctx->ctx_psb
+	 *
+	 * increment the reference count on the sampling
+	 * buffer, if any. Note that this is independent
+	 * of the virtual mapping. The latter is never
+	 * inherited, while the former will be if the context
+	 * is set up to something different from PFM_FL_INHERIT_NONE.
+	 */
+	if (nctx->ctx_psb) {
 		LOCK_PSB(nctx->ctx_psb);
+
 		nctx->ctx_psb->psb_refcnt++;
+
+	 	DBprintk(("updated smpl @ %p refcnt=%lu psb_flags=0x%x\n", 
+			ctx->ctx_psb->psb_hdr,
+			ctx->ctx_psb->psb_refcnt,
+			ctx->ctx_psb->psb_flags));
+
 		UNLOCK_PSB(nctx->ctx_psb);
+
 		/*
 	 	 * remove any pointer to sampling buffer mapping
 	 	 */
 		nctx->ctx_smpl_vaddr = 0;
 	}
 
-	nctx->ctx_fl_frozen = 0;
-	nctx->ctx_ovfl_regs[0] = 0UL;
-
 	sema_init(&nctx->ctx_restart_sem, 0); /* reset this semaphore to locked */
 
-	/* clear pending notification */
-	th->pfm_ovfl_block_reset = 0;
-
 	/* link with new task */
-	th->pfm_context    = nctx;
+	thread->pfm_context = nctx;
 
 	DBprintk(("nctx=%p for process [%d]\n", (void *)nctx, task->pid));
 
@@ -3559,7 +3877,7 @@
 	 */
 	if (current->thread.flags & IA64_THREAD_PM_VALID) {
 		DBprintk(("setting PM_VALID for [%d]\n", task->pid));
-		th->flags |= IA64_THREAD_PM_VALID;
+		thread->flags |= IA64_THREAD_PM_VALID;
 	}
 
 	return 0;
@@ -3588,9 +3906,9 @@
 
 		LOCK_PSB(psb);
 
-		DBprintk(("sampling buffer from [%d] @%p size %ld vma_flag=0x%x\n",
+		DBprintk(("sampling buffer from [%d] @%p size %ld refcnt=%lu psb_flags=0x%x\n",
 			task->pid,
-			psb->psb_hdr, psb->psb_size, psb->psb_flags));
+			psb->psb_hdr, psb->psb_size, psb->psb_refcnt, psb->psb_flags));
 
 		/*
 		 * in the case where we are the last user, we may be able to free
@@ -3613,7 +3931,7 @@
 			 *
 			 * See pfm_vm_close() and pfm_cleanup_smpl_buf() for more details.
 			 */
-			if ((psb->psb_flags & PFM_PSB_VMA) == 0) {
+			if ((psb->psb_flags & PSB_HAS_VMA) == 0) {
 
 				DBprintk(("cleaning sampling buffer from [%d] @%p size %ld\n",
 					task->pid,
@@ -3645,7 +3963,7 @@
 	 * direct pointer to a task structure thereby bypassing the tasklist. 
 	 * We must make sure that, if we have task!= NULL, the target task is still 
 	 * present and is identical to the initial task specified 
-	 * during pfm_create_context(). It may already be detached from the tasklist but 
+	 * during pfm_context_create(). It may already be detached from the tasklist but 
 	 * that's okay. Note that it is okay if we miss the deadline and the task scans 
 	 * the list for nothing, it will affect performance but not correctness. 
 	 * The correctness is ensured by using the ctx_lock which prevents the 
@@ -3683,7 +4001,8 @@
 		pfm_sessions.pfs_sys_session[ctx->ctx_cpu] = NULL;
 		pfm_sessions.pfs_sys_sessions--;
 		DBprintk(("freeing syswide session on CPU%ld\n", ctx->ctx_cpu));
-		/* update perfmon debug register counter */
+
+		/* update perfmon debug register usage counter */
 		if (ctx->ctx_fl_using_dbreg) {
 			if (pfm_sessions.pfs_sys_use_dbregs == 0) {
 				printk("perfmon: invalid release for [%d] sys_use_dbregs=0\n", task->pid);
@@ -3795,6 +4114,8 @@
 		}
 	}
 	read_unlock(&tasklist_lock);
+
+	atomic_set(&task->thread.pfm_owners_check, 0);
 }
 
 
@@ -3852,6 +4173,8 @@
 		}
 	}
 	read_unlock(&tasklist_lock);
+
+	atomic_set(&task->thread.pfm_notifiers_check, 0);
 }
 
 static struct irqaction perfmon_irqaction = {
@@ -3870,6 +4193,12 @@
 		if (i >= pmu_conf.num_pmcs) break;
 		if (PMC_IS_IMPL(i)) reset_pmcs[i] = ia64_get_pmc(i);
 	}
+#ifdef CONFIG_MCKINLEY
+	/*
+	 * set the 'stupid' enable bit to power the PMU!
+	 */
+	reset_pmcs[4] |= 1UL << 23;
+#endif
 }
 
 /*
@@ -3881,11 +4210,6 @@
 	pal_perf_mon_info_u_t pm_info;
 	s64 status;
 
-	register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
-
-	ia64_set_pmv(IA64_PERFMON_VECTOR);
-	ia64_srlz_d();
-
 	pmu_conf.pfm_is_disabled = 1;
 
 	printk("perfmon: version %u.%u (sampling format v%u.%u) IRQ %u\n", 
@@ -3937,23 +4261,12 @@
 	 */
 	pfm_pmu_snapshot();
 
-	/* 
-	 * list the pmc registers used to control monitors 
-	 * XXX: unfortunately this information is not provided by PAL
-	 *
-	 * We start with the architected minimum and then refine for each CPU model
-	 */
-	pmu_conf.monitor_pmcs[0] = PMM(4)|PMM(5)|PMM(6)|PMM(7);
-
 	/*
-	 * architected counters
+	 * set up the register configuration descriptions for the CPU
 	 */
-	pmu_conf.counter_pmds[0] |= PMM(4)|PMM(5)|PMM(6)|PMM(7);
+	pmu_conf.pmc_desc = pmc_desc;
+	pmu_conf.pmd_desc = pmd_desc;
 
-#ifdef CONFIG_ITANIUM
-	pmu_conf.monitor_pmcs[0] |= PMM(10)|PMM(11)|PMM(12);
-	/* Itanium does not add more counters */
-#endif
 	/* we are all set */
 	pmu_conf.pfm_is_disabled = 0;
 
@@ -3962,6 +4275,8 @@
 	 */
 	perfmon_dir = create_proc_read_entry ("perfmon", 0, 0, perfmon_read_entry, NULL);
 
+	pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
+
 	spin_lock_init(&pfm_sessions.pfs_lock);
 
 	return 0;
@@ -3972,11 +4287,13 @@
 void
 perfmon_init_percpu (void)
 {
+	if (smp_processor_id() == 0)
+		register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
+
 	ia64_set_pmv(IA64_PERFMON_VECTOR);
 	ia64_srlz_d();
 }
 
-
 #else /* !CONFIG_PERFMON */
 
 asmlinkage int
