changelog shortlog tags branches files raw gz bz2 help

Mercurial > hg > plan9front / changeset: bcm: speed up co-processor operations by avoiding i+d cache flush on each operation

changeset 6868: 3a2e3a97af40
parent 6867: b7b7ff7d6823
child 6869: 194ab9426a0c
author: cinap_lenrek@felloff.net
date: Wed, 07 Nov 2018 16:48:14 +0100
files: sys/src/9/bcm/coproc.c sys/src/9/bcm/vfp3.c
description: bcm: speed up co-processor operations by avoiding i+d cache flush on each operation

coproc.c generated the instructions anew each time,
requiring an i+d cache flush for each operation.

instead, we can speed this up like this:

given that the coprocessor registers are per cpu, we can
assume that interrupts have already been disabled by
the caller to prevent a process switch to another cpu.

we cache the generated instructions in a static
append-only buffer and maintain separate end pointers
for each cpu.

the cache flushes only need to be done when new
operations have been added to the buffer.
     1.1--- a/sys/src/9/bcm/coproc.c
     1.2+++ b/sys/src/9/bcm/coproc.c
     1.3@@ -1,1 +1,163 @@
     1.4-#include "../teg2/coproc.c"
     1.5+/*
     1.6+ * arm co-processors
     1.7+ * mainly to cope with arm hard-wiring register numbers into instructions.
     1.8+ *
     1.9+ * CP15 (system control) is the one that gets used the most in practice.
    1.10+ *
    1.11+ * these routines must be callable from KZERO.
    1.12+ *
    1.13+ * on a multiprocessor, process switching to another cpu is assumed
    1.14+ * to be inhibited by the caller as these registers are local to the cpu.
    1.15+ */
    1.16+#include "u.h"
    1.17+#include "../port/lib.h"
    1.18+#include "mem.h"
    1.19+#include "dat.h"
    1.20+#include "fns.h"
    1.21+#include "io.h"
    1.22+
    1.23+#include "arm.h"
    1.24+
    1.25+enum {
    1.26+	/* alternates:	0xe12fff1e	BX (R14); last e is R14 */
    1.27+	/*		0xe28ef000	B 0(R14); second e is R14 (ken) */
    1.28+	Retinst	= 0xe1a0f00e,		/* MOV R14, R15 */
    1.29+
    1.30+	Opmask	= MASK(3),
    1.31+	Regmask	= MASK(4),
    1.32+};
    1.33+
    1.34+static void*
    1.35+mkinstr(ulong wd)
    1.36+{
    1.37+	static ulong ib[256], *ep[MAXMACH+1];
    1.38+	static Lock lk;
    1.39+	ulong *ip, *ie;
    1.40+
    1.41+	ie = ep[m->machno];
    1.42+	for(ip = ib; ip < ie; ip += 2)
    1.43+		if(*ip == wd)
    1.44+			return ip;
    1.45+
    1.46+	ilock(&lk);
    1.47+	ie = ep[MAXMACH];
    1.48+	for(; ip < ie; ip += 2)
    1.49+		if(*ip == wd)
    1.50+			goto Found;
    1.51+	if(ip >= &ib[nelem(ib)])
    1.52+		panic("mkinstr: out of instrucuction buffer");
    1.53+	ip[0] = wd;
    1.54+	ip[1] = Retinst;
    1.55+	ep[MAXMACH] = ie = ip + 2;
    1.56+	cachedwbse(ip, 2*sizeof(*ip));
    1.57+Found:
    1.58+	iunlock(&lk);
    1.59+	cacheiinv();
    1.60+	ep[m->machno] = ie;
    1.61+	return ip;
    1.62+}
    1.63+
    1.64+
    1.65+static void*
    1.66+setupcpop(ulong opcode, int cp, int op1, int crn, int crm,
    1.67+	int op2)
    1.68+{
    1.69+	op1 &= Opmask;
    1.70+	op2 &= Opmask;
    1.71+	crn &= Regmask;
    1.72+	crm &= Regmask;
    1.73+	cp  &= Regmask;
    1.74+	return mkinstr(opcode | op1 << 21 | crn << 16 | cp << 8 | op2 << 5 | crm);
    1.75+}
    1.76+
    1.77+ulong
    1.78+cprd(int cp, int op1, int crn, int crm, int op2)
    1.79+{
    1.80+	/*
    1.81+	 * MRC.  return value will be in R0, which is convenient.
    1.82+	 * Rt will be R0.
    1.83+	 */
    1.84+	ulong (*fp)(void) = setupcpop(0xee100010, cp, op1, crn, crm, op2);
    1.85+	return fp();
    1.86+}
    1.87+
    1.88+void
    1.89+cpwr(int cp, int op1, int crn, int crm, int op2, ulong val)
    1.90+{
    1.91+	/* MCR, Rt is R0 */
    1.92+	void (*fp)(ulong) = setupcpop(0xee000010, cp, op1, crn, crm, op2);
    1.93+	fp(val);
    1.94+}
    1.95+
    1.96+ulong
    1.97+cprdsc(int op1, int crn, int crm, int op2)
    1.98+{
    1.99+	return cprd(CpSC, op1, crn, crm, op2);
   1.100+}
   1.101+
   1.102+void
   1.103+cpwrsc(int op1, int crn, int crm, int op2, ulong val)
   1.104+{
   1.105+	cpwr(CpSC, op1, crn, crm, op2, val);
   1.106+}
   1.107+
   1.108+/* floating point */
   1.109+
   1.110+/* fp coproc control */
   1.111+static void*
   1.112+setupfpctlop(int opcode, int fpctlreg)
   1.113+{
   1.114+	fpctlreg &= Nfpctlregs - 1;
   1.115+	return mkinstr(opcode | fpctlreg << 16 | 0 << 12 | CpFP << 8);
   1.116+}
   1.117+
   1.118+ulong
   1.119+fprd(int fpreg)
   1.120+{
   1.121+	/*
   1.122+	 * VMRS.  return value will be in R0, which is convenient.
   1.123+	 * Rt will be R0.
   1.124+	 */
   1.125+	ulong (*fp)(void) = setupfpctlop(0xeef00010, fpreg);
   1.126+	return fp();
   1.127+}
   1.128+
   1.129+void
   1.130+fpwr(int fpreg, ulong val)
   1.131+{
   1.132+	/*
   1.133+	 * fpu might be off and this VMSR might enable it
   1.134+	 * VMSR, Rt is R0
   1.135+	 */
   1.136+	void (*fp)(ulong) = setupfpctlop(0xeee00010, fpreg);
   1.137+	fp(val);
   1.138+}
   1.139+
   1.140+/* fp register access; don't bother with single precision */
   1.141+static void*
   1.142+setupfpop(int opcode, int fpreg)
   1.143+{
   1.144+	ulong wd = opcode | 0 << 16 | (fpreg & (16 - 1)) << 12;
   1.145+	if (fpreg >= 16)
   1.146+		wd |= 1 << 22;		/* high bit of dfp reg # */
   1.147+	return mkinstr(wd);
   1.148+}
   1.149+
   1.150+ulong
   1.151+fpsavereg(int fpreg, uvlong *fpp)
   1.152+{
   1.153+	/*
   1.154+	 * VSTR.  pointer will be in R0, which is convenient.
   1.155+	 * Rt will be R0.
   1.156+	 */
   1.157+	ulong (*fp)(uvlong *) = setupfpop(0xed000000 | CpDFP << 8, fpreg);
   1.158+	return fp(fpp);
   1.159+}
   1.160+
   1.161+void
   1.162+fprestreg(int fpreg, uvlong val)
   1.163+{
   1.164+	/* VLDR, Rt is R0 */
   1.165+	void (*fp)(uvlong *) = setupfpop(0xed100000 | CpDFP << 8, fpreg);
   1.166+	fp(&val);
   1.167+}
     2.1--- a/sys/src/9/bcm/vfp3.c
     2.2+++ b/sys/src/9/bcm/vfp3.c
     2.3@@ -338,8 +338,12 @@ fpuprocfork(Proc *p)
     2.4 void
     2.5 fpusysprocsetup(Proc *p)
     2.6 {
     2.7+	int s;
     2.8+
     2.9+	s = splhi();
    2.10 	p->fpstate = FPinit;
    2.11 	fpoff();
    2.12+	splx(s);
    2.13 }
    2.14 
    2.15 static void