changelog shortlog tags branches changeset files revisions annotate raw help

Mercurial > hg > plan9front / sys/src/9/port/sysproc.c

changeset 7200: 4b67ffcd2c61
parent: 6b46879a272e
child: 95d2092a1b6a
author: cinap_lenrek@felloff.net
date: Fri, 03 May 2019 23:15:42 +0200
permissions: -rw-r--r--
description: kernel: exec support for arm64 binaries
1 #include "u.h"
2 #include "tos.h"
3 #include "../port/lib.h"
4 #include "mem.h"
5 #include "dat.h"
6 #include "fns.h"
7 #include "../port/error.h"
8 #include "edf.h"
9 
10 #include <a.out.h>
11 
12 uintptr
13 sysr1(va_list)
14 {
15  if(!iseve())
16  error(Eperm);
17  return 0;
18 }
19 
/*
 * Entry point handed to kprocchild() by sysrfork's error path:
 * the half-built child simply exits with "fork aborted".
 */
static void
abortion(void*)
{
	char *why;

	why = "fork aborted";
	pexit(why, 1);
}
25 
/*
 * rfork system call.  Reads one flag word from the argument list.
 * Contradictory flag pairs (RFFDG|RFCFDG, RFNAMEG|RFCNAMEG,
 * RFENVG|RFCENVG) raise Ebadarg before anything is changed.
 *
 * Without RFPROC the caller's own file descriptor, name space,
 * rendezvous, environment and note groups are replaced as requested
 * and 0 is returned.
 *
 * With RFPROC a new process is obtained from newproc(), given copies
 * of or references to the caller's segments and groups according to
 * the flags, made ready, and its pid is returned to the caller.
 */
26 uintptr
27 sysrfork(va_list list)
28 {
29  Proc *p;
30  int n, i;
31  Fgrp *ofg;
32  Pgrp *opg;
33  Rgrp *org;
34  Egrp *oeg;
35  ulong pid, flag;
36  Mach *wm;
37 
38  flag = va_arg(list, ulong);
39  /* Check flags before we commit */
40  if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
41  error(Ebadarg);
42  if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
43  error(Ebadarg);
44  if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
45  error(Ebadarg);
46 
  /* no RFPROC: adjust the calling process in place */
47  if((flag&RFPROC) == 0) {
  /* RFMEM and RFNOWAIT are rejected when no new process is made */
48  if(flag & (RFMEM|RFNOWAIT))
49  error(Ebadarg);
50  if(flag & (RFFDG|RFCFDG)) {
51  ofg = up->fgrp;
52  if(flag & RFFDG)
53  up->fgrp = dupfgrp(ofg);
54  else
55  up->fgrp = dupfgrp(nil);
56  closefgrp(ofg);
57  }
58  if(flag & (RFNAMEG|RFCNAMEG)) {
59  opg = up->pgrp;
60  up->pgrp = newpgrp();
61  if(flag & RFNAMEG)
62  pgrpcpy(up->pgrp, opg);
63  /* inherit noattach */
64  up->pgrp->noattach = opg->noattach;
65  closepgrp(opg);
66  }
67  if(flag & RFNOMNT)
68  up->pgrp->noattach = 1;
69  if(flag & RFREND) {
70  org = up->rgrp;
71  up->rgrp = newrgrp();
72  closergrp(org);
73  }
74  if(flag & (RFENVG|RFCENVG)) {
75  oeg = up->egrp;
76  up->egrp = smalloc(sizeof(Egrp));
77  up->egrp->ref = 1;
78  if(flag & RFENVG)
79  envcpy(up->egrp, oeg);
80  closeegrp(oeg);
81  }
82  if(flag & RFNOTEG)
83  up->noteid = pidalloc(0);
84  return 0;
85  }
86 
  /* RFPROC: build the child, starting from the caller's state */
87  p = newproc();
88 
89  p->scallnr = up->scallnr;
90  p->s = up->s;
91  p->nerrlab = 0;
92  p->slash = up->slash;
93  p->dot = up->dot;
94  incref(p->dot);
95 
96  memmove(p->note, up->note, sizeof(p->note));
97  p->privatemem = up->privatemem;
98  p->noswap = up->noswap;
99  p->nnote = up->nnote;
100  p->notified = 0;
101  p->lastnote = up->lastnote;
102  p->notify = up->notify;
103  p->ureg = up->ureg;
104  p->dbgreg = 0;
105 
106  /* Abort the child process on error */
107  if(waserror()){
108  p->kp = 1;
109  kprocchild(p, abortion, 0);
110  ready(p);
111  nexterror();
112  }
113 
114  /* Make a new set of memory segments */
  /* n is nonzero only when RFMEM was given; it is passed to dupseg as its third argument */
115  n = flag & RFMEM;
116  qlock(&p->seglock);
117  if(waserror()){
118  qunlock(&p->seglock);
119  nexterror();
120  }
121  for(i = 0; i < NSEG; i++)
122  if(up->seg[i] != nil)
123  p->seg[i] = dupseg(up->seg, i, n);
124  qunlock(&p->seglock);
125  poperror();
126 
127  /* File descriptors */
128  if(flag & (RFFDG|RFCFDG)) {
129  if(flag & RFFDG)
130  p->fgrp = dupfgrp(up->fgrp);
131  else
132  p->fgrp = dupfgrp(nil);
133  }
134  else {
135  p->fgrp = up->fgrp;
136  incref(p->fgrp);
137  }
138 
139  /* Process groups */
140  if(flag & (RFNAMEG|RFCNAMEG)) {
141  p->pgrp = newpgrp();
142  if(flag & RFNAMEG)
143  pgrpcpy(p->pgrp, up->pgrp);
144  /* inherit noattach */
145  p->pgrp->noattach = up->pgrp->noattach;
146  }
147  else {
148  p->pgrp = up->pgrp;
149  incref(p->pgrp);
150  }
151  if(flag & RFNOMNT)
152  p->pgrp->noattach = 1;
153 
  /* Rendezvous group: fresh with RFREND, otherwise shared with the caller */
154  if(flag & RFREND)
155  p->rgrp = newrgrp();
156  else {
157  incref(up->rgrp);
158  p->rgrp = up->rgrp;
159  }
160 
161  /* Environment group */
162  if(flag & (RFENVG|RFCENVG)) {
163  p->egrp = smalloc(sizeof(Egrp));
164  p->egrp->ref = 1;
165  if(flag & RFENVG)
166  envcpy(p->egrp, up->egrp);
167  }
168  else {
169  p->egrp = up->egrp;
170  incref(p->egrp);
171  }
172  p->hang = up->hang;
173  p->procmode = up->procmode;
174  if(up->procctl == Proc_tracesyscall)
175  p->procctl = Proc_tracesyscall;
176 
177  poperror(); /* abortion */
178 
179  /* Craft a return frame which will cause the child to pop out of
180  * the scheduler in user mode with the return register zero
181  */
182  forkchild(p, up->dbgreg);
183 
184  p->parent = up;
  /* unless RFNOWAIT, record the parent's pid and count the child under up->exl */
185  if((flag&RFNOWAIT) == 0){
186  p->parentpid = up->pid;
187  lock(&up->exl);
188  up->nchild++;
189  unlock(&up->exl);
190  }
  /* without RFNOTEG the child keeps the caller's noteid */
191  if((flag&RFNOTEG) == 0)
192  p->noteid = up->noteid;
193 
194  pid = p->pid;
195  memset(p->time, 0, sizeof(p->time));
196  p->time[TReal] = MACHP(0)->ticks;
197 
198  kstrdup(&p->text, up->text);
199  kstrdup(&p->user, up->user);
200 
201  procfork(p);
202 
203  /*
204  * since the bss/data segments are now shareable,
205  * any mmu info about this process is now stale
206  * (i.e. has bad properties) and has to be discarded.
207  */
208  flushmmu();
209  p->basepri = up->basepri;
210  p->priority = up->basepri;
211  p->fixedpri = up->fixedpri;
212  p->mp = up->mp;
  /* a caller wired to a processor gets its child wired to the same machno */
213  wm = up->wired;
214  if(wm)
215  procwired(p, wm->machno);
216  ready(p);
217  sched();
218  return pid;
219 }
220 
/*
 * Split the "#!" line held in s[0..n) into words.
 *
 * The two leading bytes are skipped, the text up to the first newline
 * is cut in place into NUL-terminated tokens separated by blanks or
 * tabs, and ap receives a pointer to each token followed by a null
 * entry.  Returns the number of tokens; returns 0 when no newline is
 * found within the n bytes or when the line holds no words.
 */
static int
shargs(char *s, int n, char **ap)
{
	char *p;
	int len, argc;

	/* skip #! */
	p = s + 2;
	n -= 2;

	/* the line must end in a newline inside the buffer */
	for(len = 0; len < n && p[len] != '\n'; len++)
		;
	if(len >= n)
		return 0;
	p[len] = 0;

	argc = 0;
	while(*p != 0){
		if(*p == ' ' || *p == '\t'){
			p++;
			continue;
		}
		/* start of a word: record it, then find its end */
		ap[argc++] = p++;
		while(*p != 0 && *p != ' ' && *p != '\t')
			p++;
		if(*p != 0)
			*p++ = 0;
	}
	ap[argc] = 0;
	return argc;
}
252 
253 static ulong
254 l2be(long l)
255 {
256  uchar *cp;
257 
258  cp = (uchar*)&l;
259  return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3];
260 }
261 
/*
 * exec system call.  Arguments are a file name and an argument vector.
 *
 * The file is opened with OEXEC and up to sizeof(Exec) bytes are read.
 * A full header whose magic equals AOUT_MAGIC is treated as a binary.
 * Anything else must start with "#!"; shargs() splits that line, the
 * named interpreter is opened instead, and the original file name is
 * appended to the interpreter's arguments.  Only one such level of
 * indirection is accepted.
 *
 * The new stack (Tos, argument strings, argv) is assembled in a
 * temporary segment in up->seg[ESEG].  After the "Committed" point the
 * old segments are released, text/data/bss are set up from the file,
 * the stack is moved to USTKTOP and the process state is reset.
 * Returns the value of execregs(entry, ssize, nargs).
 *
 * Errors before the commit point propagate to the caller; an error
 * after it (old stack segment already gone) ends the process.
 */
262 uintptr
263 sysexec(va_list list)
264 {
265  Segment *s, *ts;
266  int i;
267  Chan *tc;
268  char **argv, **argp, **argp0;
269  char *a, *e, *charp, *args, *file, *file0;
270  char *progarg[sizeof(Exec)/2+1], *elem, progelem[64];
271  ulong magic, ssize, nargs, nbytes, n;
272  uintptr t, d, b, entry, bssend, text, data, bss, tstk, align;
273  int indir;
274  Exec exec;
275  char line[sizeof(Exec)];
276  Fgrp *f;
277  Image *img;
278  Tos *tos;
279 
280  args = elem = nil;
281  file0 = va_arg(list, char*);
282  validaddr((uintptr)file0, 1, 0);
283  argp0 = va_arg(list, char**);
284  evenaddr((uintptr)argp0);
285  validaddr((uintptr)argp0, 2*BY2WD, 0);
286  if(*argp0 == nil)
287  error(Ebadarg);
288  file0 = validnamedup(file0, 1);
  /*
   * Cleanup for every error raised below: free the name copy, elem and
   * args; exit the process if the old stack segment is already gone,
   * otherwise drop the temporary ESEG stack and pass the error on.
   */
289  if(waserror()){
290  free(file0);
291  free(elem);
292  free(args);
293  /* Disaster after commit */
294  if(up->seg[SSEG] == nil)
295  pexit(up->errstr, 1);
296  s = up->seg[ESEG];
297  if(s != nil){
298  putseg(s);
299  up->seg[ESEG] = nil;
300  }
301  nexterror();
302  }
303  align = BY2PG;
304  indir = 0;
305  file = file0;
  /* runs at most twice: once for the named file, once more for a #! interpreter */
306  for(;;){
307  tc = namec(file, Aopen, OEXEC, 0);
308  if(waserror()){
309  cclose(tc);
310  nexterror();
311  }
  /* remember up->genbuf as elem for the originally named file only */
312  if(!indir)
313  kstrdup(&elem, up->genbuf);
314 
315  n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0);
316  if(n <= 2)
317  error(Ebadexec);
318  magic = l2be(exec.magic);
319  if(n == sizeof(Exec) && (magic == AOUT_MAGIC)){
320  entry = l2be(exec.entry);
321  text = l2be(exec.text);
322  if(magic & HDR_MAGIC)
323  text += 8;
324  switch(magic){
325  case S_MAGIC: /* 2MB segment alignment for amd64 */
326  align = 0x200000;
327  break;
328  case V_MAGIC: /* 16K segment alignment for mips */
329  align = 0x4000;
330  break;
331  case R_MAGIC: /* 64K segment alignment for arm64 */
332  align = 0x10000;
333  break;
334  }
  /* text must fit below the stack area and entry must lie inside the text */
335  if(text >= (USTKTOP-USTKSIZE)-(UTZERO+sizeof(Exec))
336  || entry < UTZERO+sizeof(Exec)
337  || entry >= UTZERO+sizeof(Exec)+text)
338  error(Ebadexec);
339  break; /* for binary */
340  }
341 
342  /*
343  * Process #! /bin/sh args ...
344  */
345  memmove(line, &exec, n);
346  if(indir || line[0]!='#' || line[1]!='!')
347  error(Ebadexec);
348  n = shargs(line, n, progarg);
349  if(n < 1)
350  error(Ebadexec);
351  indir = 1;
352  /*
353  * First arg becomes complete file name
354  */
355  progarg[n++] = file;
356  progarg[n] = nil;
  /* skip the caller's argv[0]; progarg supplies the leading arguments instead */
357  argp0++;
358  file = progarg[0];
359  if(strlen(elem) >= sizeof progelem)
360  error(Ebadexec);
361  strcpy(progelem, elem);
362  progarg[0] = progelem;
363  poperror();
364  cclose(tc);
365  }
366 
367  data = l2be(exec.data);
368  bss = l2be(exec.bss);
  /*
   * t, d and b are the rounded-up ends of text, data and bss.
   * Text is rounded to the alignment chosen from the magic above;
   * data and bss are rounded to BY2PG.
   */
369  align--;
370  t = (UTZERO+sizeof(Exec)+text+align) & ~align;
371  align = BY2PG-1;
372  d = (t + data + align) & ~align;
373  bssend = t + data + bss;
374  b = (bssend + align) & ~align;
375  if(t >= (USTKTOP-USTKSIZE) || d >= (USTKTOP-USTKSIZE) || b >= (USTKTOP-USTKSIZE))
376  error(Ebadexec);
377 
378  /*
379  * Args: pass 1: count
380  */
381  nbytes = sizeof(Tos); /* hole for profiling clock at top of stack (and more) */
382  nargs = 0;
383  if(indir){
384  argp = progarg;
385  while(*argp != nil){
386  a = *argp++;
387  nbytes += strlen(a) + 1;
388  nargs++;
389  }
390  }
391  argp = argp0;
392  while(*argp != nil){
393  a = *argp++;
  /* argp has just moved to the start of a new page: validate it before the next read */
394  if(((uintptr)argp&(BY2PG-1)) < BY2WD)
395  validaddr((uintptr)argp, BY2WD, 0);
396  validaddr((uintptr)a, 1, 0);
397  e = vmemchr(a, 0, USTKSIZE);
398  if(e == nil)
399  error(Ebadarg);
400  nbytes += (e - a) + 1;
401  if(nbytes >= USTKSIZE)
402  error(Enovmem);
403  nargs++;
404  }
  /* room for nargs+1 argv words plus the word-aligned strings and Tos */
405  ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
406 
407  /*
408  * 8-byte align SP for those (e.g. sparc) that need it.
409  * execregs() will subtract another 4 bytes for argc.
410  */
411  if(BY2WD == 4 && (ssize+4) & 7)
412  ssize += 4;
413 
414  if(PGROUND(ssize) >= USTKSIZE)
415  error(Enovmem);
416 
417  /*
418  * Build the stack segment, putting it in kernel virtual for the moment
419  */
420  qlock(&up->seglock);
421  if(waserror()){
422  qunlock(&up->seglock);
423  nexterror();
424  }
425 
  /*
   * Search downward from the current stack's base for a USTKSIZE range
   * that isoverlap() does not report as taken (presumably it returns
   * the segment overlapping the range - confirm against its definition).
   */
426  s = up->seg[SSEG];
427  do {
428  tstk = s->base;
429  if(tstk <= USTKSIZE)
430  error(Enovmem);
431  } while((s = isoverlap(tstk-USTKSIZE, USTKSIZE)) != nil);
432  up->seg[ESEG] = newseg(SG_STACK, tstk-USTKSIZE, USTKSIZE/BY2PG);
433 
434  /*
435  * Args: pass 2: assemble; the pages will be faulted in
436  */
437  tos = (Tos*)(tstk - sizeof(Tos));
438  tos->cyclefreq = m->cyclefreq;
439  tos->kcycles = 0;
440  tos->pcycles = 0;
441  tos->clock = 0;
442 
443  argv = (char**)(tstk - ssize);
444  charp = (char*)(tstk - nbytes);
445  if(indir)
446  argp = progarg;
447  else
448  argp = argp0;
449 
450  for(i=0; i<nargs; i++){
  /* progarg exhausted: continue with the caller's remaining arguments */
451  if(indir && *argp==nil) {
452  indir = 0;
453  argp = argp0;
454  }
  /* store the address the string will have once the stack is relocated to USTKTOP */
455  *argv++ = charp + (USTKTOP-tstk);
456  a = *argp++;
457  if(indir)
458  e = strchr(a, 0);
459  else {
460  validaddr((uintptr)a, 1, 0);
461  e = vmemchr(a, 0, (char*)tstk - charp);
462  if(e == nil)
463  error(Ebadarg);
464  }
465  n = (e - a) + 1;
466  memmove(charp, a, n);
467  charp += n;
468  }
469 
470  /* copy args; easiest from new process's stack */
471  a = (char*)(tstk - nbytes);
472  n = charp - a;
473  if(n > 128) /* don't waste too much space on huge arg lists */
474  n = 128;
475  args = smalloc(n);
476  memmove(args, a, n);
477  if(n>0 && args[n-1]!='\0'){
478  /* make sure last arg is NUL-terminated */
479  /* put NUL at UTF-8 character boundary */
480  for(i=n-1; i>0; --i)
481  if(fullrune(args+i, n-i))
482  break;
483  args[i] = 0;
484  n = i+1;
485  }
486 
487  /*
488  * Committed.
489  * Free old memory.
490  * Special segments are maintained across exec
491  */
492  for(i = SSEG; i <= BSEG; i++) {
493  putseg(up->seg[i]);
494  /* prevent a second free if we have an error */
495  up->seg[i] = nil;
496  }
  /* of the remaining segments only those marked SG_CEXEC are dropped */
497  for(i = ESEG+1; i < NSEG; i++) {
498  s = up->seg[i];
499  if(s != nil && (s->type&SG_CEXEC) != 0) {
500  putseg(s);
501  up->seg[i] = nil;
502  }
503  }
504 
505  /*
506  * Close on exec
507  */
508  if((f = up->fgrp) != nil) {
509  for(i=0; i<=f->maxfd; i++)
510  fdclose(i, CCEXEC);
511  }
512 
513  /* Text. Shared. Attaches to cache image if possible */
514  /* attachimage returns a locked cache image */
515  img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
516  ts = img->s;
517  up->seg[TSEG] = ts;
518  ts->flushme = 1;
519  ts->fstart = 0;
520  ts->flen = sizeof(Exec)+text;
521  unlock(img);
522 
523  /* Data. Shared. */
524  s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
525  up->seg[DSEG] = s;
526 
527  /* Attached by hand */
528  incref(img);
529  s->image = img;
530  s->fstart = ts->fstart+ts->flen;
531  s->flen = data;
532 
533  /* BSS. Zero fill on demand */
534  up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
535 
536  /*
537  * Move the stack
538  */
539  s = up->seg[ESEG];
540  up->seg[ESEG] = nil;
541  s->base = USTKTOP-USTKSIZE;
542  s->top = USTKTOP;
543  relocateseg(s, USTKTOP-tstk);
544  up->seg[SSEG] = s;
545  qunlock(&up->seglock);
546  poperror(); /* seglock */
547 
548  /*
549  * '/' processes are higher priority (hack to make /ip more responsive).
550  */
551  if(devtab[tc->type]->dc == L'/')
552  up->basepri = PriRoot;
553  up->priority = up->basepri;
554  poperror(); /* tc */
555  cclose(tc);
556  poperror(); /* file0 */
557  free(file0);
558 
  /* under up->debug: install the new text name and args, clear note and memory flags */
559  qlock(&up->debug);
560  free(up->text);
561  up->text = elem;
562  free(up->args);
563  up->args = args;
564  up->nargs = n;
565  up->setargs = 0;
566 
567  up->nnote = 0;
568  up->notify = 0;
569  up->notified = 0;
570  up->privatemem = 0;
571  up->noswap = 0;
572  procsetup(up);
573  qunlock(&up->debug);
574 
575  /*
576  * At this point, the mmu contains info about the old address
577  * space and needs to be flushed
578  */
579  flushmmu();
580 
581  if(up->hang)
582  up->procctl = Proc_stopme;
583  return execregs(entry, ssize, nargs);
584 }
585 
/*
 * Always returns 0, whatever its argument.
 * syssleep passes it to tsleep() as the condition function.
 */
int
return0(void*)
{
	return 0;
}
591 
592 uintptr
593 syssleep(va_list list)
594 {
595  long ms;
596 
597  ms = va_arg(list, long);
598  if(ms <= 0) {
599  if (up->edf != nil && (up->edf->flags & Admitted))
600  edfyield();
601  else
602  yield();
603  } else {
604  tsleep(&up->sleep, return0, 0, ms);
605  }
606  return 0;
607 }
608 
609 uintptr
610 sysalarm(va_list list)
611 {
612  return procalarm(va_arg(list, ulong));
613 }
614 
615 
616 uintptr
617 sysexits(va_list list)
618 {
619  char *status;
620  char *inval = "invalid exit string";
621  char buf[ERRMAX];
622 
623  status = va_arg(list, char*);
624  if(status != nil){
625  if(waserror())
626  status = inval;
627  else{
628  validaddr((uintptr)status, 1, 0);
629  if(vmemchr(status, 0, ERRMAX) == nil){
630  memmove(buf, status, ERRMAX);
631  buf[ERRMAX-1] = 0;
632  status = buf;
633  }
634  poperror();
635  }
636 
637  }
638  pexit(status, 1);
639  return 0; /* not reached */
640 }
641 
642 uintptr
643 sys_wait(va_list list)
644 {
645  ulong pid;
646  Waitmsg w;
647  OWaitmsg *ow;
648 
649  ow = va_arg(list, OWaitmsg*);
650  if(ow == nil)
651  pid = pwait(nil);
652  else {
653  validaddr((uintptr)ow, sizeof(OWaitmsg), 1);
654  evenaddr((uintptr)ow);
655  pid = pwait(&w);
656  }
657  if(ow != nil){
658  readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
659  readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
660  readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
661  readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
662  strncpy(ow->msg, w.msg, sizeof(ow->msg)-1);
663  ow->msg[sizeof(ow->msg)-1] = '\0';
664  }
665  return pid;
666 }
667 
668 uintptr
669 sysawait(va_list list)
670 {
671  char *p;
672  Waitmsg w;
673  uint n;
674 
675  p = va_arg(list, char*);
676  n = va_arg(list, uint);
677  validaddr((uintptr)p, n, 1);
678  pwait(&w);
679  return (uintptr)snprint(p, n, "%d %lud %lud %lud %q",
680  w.pid,
681  w.time[TUser], w.time[TSys], w.time[TReal],
682  w.msg);
683 }
684 
685 void
686 werrstr(char *fmt, ...)
687 {
688  va_list va;
689 
690  if(up == nil)
691  return;
692 
693  va_start(va, fmt);
694  vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
695  va_end(va);
696 }
697 
698 static int
699 generrstr(char *buf, uint nbuf)
700 {
701  char tmp[ERRMAX];
702 
703  if(nbuf == 0)
704  error(Ebadarg);
705  validaddr((uintptr)buf, nbuf, 1);
706  if(nbuf > sizeof tmp)
707  nbuf = sizeof tmp;
708  memmove(tmp, buf, nbuf);
709 
710  /* make sure it's NUL-terminated */
711  tmp[nbuf-1] = '\0';
712  memmove(buf, up->syserrstr, nbuf);
713  buf[nbuf-1] = '\0';
714  memmove(up->syserrstr, tmp, nbuf);
715  return 0;
716 }
717 
718 uintptr
719 syserrstr(va_list list)
720 {
721  char *buf;
722  uint len;
723 
724  buf = va_arg(list, char*);
725  len = va_arg(list, uint);
726  return (uintptr)generrstr(buf, len);
727 }
728 
729 /* compatibility for old binaries */
730 uintptr
731 sys_errstr(va_list list)
732 {
733  return (uintptr)generrstr(va_arg(list, char*), 64);
734 }
735 
736 uintptr
737 sysnotify(va_list list)
738 {
739  int (*f)(void*, char*);
740  f = va_arg(list, void*);
741  if(f != nil)
742  validaddr((uintptr)f, sizeof(void*), 0);
743  up->notify = f;
744  return 0;
745 }
746 
747 uintptr
748 sysnoted(va_list list)
749 {
750  if(va_arg(list, int) != NRSTR && !up->notified)
751  error(Egreg);
752  return 0;
753 }
754 
755 uintptr
756 syssegbrk(va_list list)
757 {
758  int i;
759  uintptr addr;
760  Segment *s;
761 
762  addr = va_arg(list, uintptr);
763  for(i = 0; i < NSEG; i++) {
764  s = up->seg[i];
765  if(s == nil || addr < s->base || addr >= s->top)
766  continue;
767  switch(s->type&SG_TYPE) {
768  case SG_TEXT:
769  case SG_DATA:
770  case SG_STACK:
771  case SG_PHYSICAL:
772  case SG_FIXED:
773  case SG_STICKY:
774  error(Ebadarg);
775  default:
776  return ibrk(va_arg(list, uintptr), i);
777  }
778  }
779  error(Ebadarg);
780  return 0; /* not reached */
781 }
782 
783 uintptr
784 syssegattach(va_list list)
785 {
786  int attr;
787  char *name;
788  uintptr va;
789  ulong len;
790 
791  attr = va_arg(list, int);
792  name = va_arg(list, char*);
793  va = va_arg(list, uintptr);
794  len = va_arg(list, ulong);
795  validaddr((uintptr)name, 1, 0);
796  name = validnamedup(name, 1);
797  if(waserror()){
798  free(name);
799  nexterror();
800  }
801  va = segattach(attr, name, va, len);
802  free(name);
803  poperror();
804  return va;
805 }
806 
807 uintptr
808 syssegdetach(va_list list)
809 {
810  int i;
811  uintptr addr;
812  Segment *s;
813 
814  addr = va_arg(list, uintptr);
815 
816  qlock(&up->seglock);
817  if(waserror()){
818  qunlock(&up->seglock);
819  nexterror();
820  }
821 
822  s = nil;
823  for(i = 0; i < NSEG; i++)
824  if((s = up->seg[i]) != nil) {
825  qlock(s);
826  if((addr >= s->base && addr < s->top) ||
827  (s->top == s->base && addr == s->base))
828  goto found;
829  qunlock(s);
830  }
831 
832  error(Ebadarg);
833 
834 found:
835  /*
836  * Check we are not detaching the initial stack segment.
837  */
838  if(s == up->seg[SSEG]){
839  qunlock(s);
840  error(Ebadarg);
841  }
842  up->seg[i] = nil;
843  qunlock(s);
844  putseg(s);
845  qunlock(&up->seglock);
846  poperror();
847 
848  /* Ensure we flush any entries from the lost segment */
849  flushmmu();
850  return 0;
851 }
852 
853 uintptr
854 syssegfree(va_list list)
855 {
856  Segment *s;
857  uintptr from, to;
858 
859  from = va_arg(list, uintptr);
860  to = va_arg(list, ulong);
861  to += from;
862  if(to < from)
863  error(Ebadarg);
864  s = seg(up, from, 1);
865  if(s == nil)
866  error(Ebadarg);
867  to &= ~(BY2PG-1);
868  from = PGROUND(from);
869  if(from >= to) {
870  qunlock(s);
871  return 0;
872  }
873  if(to > s->top) {
874  qunlock(s);
875  error(Ebadarg);
876  }
877  mfreeseg(s, from, (to - from) / BY2PG);
878  qunlock(s);
879  flushmmu();
880  return 0;
881 }
882 
883 /* For binary compatibility */
884 uintptr
885 sysbrk_(va_list list)
886 {
887  return ibrk(va_arg(list, uintptr), BSEG);
888 }
889 
890 uintptr
891 sysrendezvous(va_list list)
892 {
893  uintptr tag, val, new;
894  Proc *p, **l;
895 
896  tag = va_arg(list, uintptr);
897  new = va_arg(list, uintptr);
898  l = &REND(up->rgrp, tag);
899 
900  lock(up->rgrp);
901  for(p = *l; p != nil; p = p->rendhash) {
902  if(p->rendtag == tag) {
903  *l = p->rendhash;
904  val = p->rendval;
905  p->rendval = new;
906  unlock(up->rgrp);
907 
908  ready(p);
909 
910  return val;
911  }
912  l = &p->rendhash;
913  }
914 
915  /* Going to sleep here */
916  up->rendtag = tag;
917  up->rendval = new;
918  up->rendhash = *l;
919  *l = up;
920  up->state = Rendezvous;
921  unlock(up->rgrp);
922 
923  sched();
924 
925  return up->rendval;
926 }
927 
928 /*
929  * The implementation of semaphores is complicated by needing
930  * to avoid rescheduling in syssemrelease, so that it is safe
931  * to call from real-time processes. This means syssemrelease
932  * cannot acquire any qlocks, only spin locks.
933  *
934  * Semacquire and semrelease must both manipulate the semaphore
935  * wait list. Lock-free linked lists only exist in theory, not
936  * in practice, so the wait list is protected by a spin lock.
937  *
938  * The semaphore value *addr is stored in user memory, so it
939  * cannot be read or written while holding spin locks.
940  *
941  * Thus, we can access the list only when holding the lock, and
942  * we can access the semaphore only when not holding the lock.
943  * This makes things interesting. Note that sleep's condition function
944  * is called while holding two locks - r and up->rlock - so it cannot
945  * access the semaphore value either.
946  *
947  * An acquirer announces its intention to try for the semaphore
948  * by putting a Sema structure onto the wait list and then
949  * setting Sema.waiting. After one last check of semaphore,
950  * the acquirer sleeps until Sema.waiting==0. A releaser of n
951  * must wake up n acquirers who have Sema.waiting set. It does
952  * this by clearing Sema.waiting and then calling wakeup.
953  *
954  * There are three interesting races here.
955 
956  * The first is that in this particular sleep/wakeup usage, a single
957  * wakeup can rouse a process from two consecutive sleeps!
958  * The ordering is:
959  *
960  * (a) set Sema.waiting = 1
961  * (a) call sleep
962  * (b) set Sema.waiting = 0
963  * (a) check Sema.waiting inside sleep, return w/o sleeping
964  * (a) try for semaphore, fail
965  * (a) set Sema.waiting = 1
966  * (a) call sleep
967  * (b) call wakeup(a)
968  * (a) wake up again
969  *
970  * This is okay - semacquire will just go around the loop
971  * again. It does mean that at the top of the for(;;) loop in
972  * semacquire, phore.waiting might already be set to 1.
973  *
974  * The second is that a releaser might wake an acquirer who is
975  * interrupted before he can acquire the lock. Since
976  * release(n) issues only n wakeup calls -- only n can be used
977  * anyway -- if the interrupted process is not going to use his
978  * wakeup call he must pass it on to another acquirer.
979  *
980  * The third race is similar to the second but more subtle. An
981  * acquirer sets waiting=1 and then does a final canacquire()
982  * before going to sleep. The opposite order would result in
983  * missing wakeups that happen between canacquire and
984  * waiting=1. (In fact, the whole point of Sema.waiting is to
985  * avoid missing wakeups between canacquire() and sleep().) But
986  * there can be spurious wakeups between a successful
987  * canacquire() and the following semdequeue(). This wakeup is
988  * not useful to the acquirer, since he has already acquired
989  * the semaphore. Like in the previous case, though, the
990  * acquirer must pass the wakeup call along.
991  *
992  * This is all rather subtle. The code below has been verified
993  * with the spin model /sys/src/9/port/semaphore.p. The
994  * original code anticipated the second race but not the first
995  * or third, which were caught only with spin. The first race
996  * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
997  * It was lucky that my abstract model of sleep/wakeup still managed
998  * to preserve that behavior.
999  *
1000  * I remain slightly concerned about memory coherence
1001  * outside of locks. The spin model does not take
1002  * queued processor writes into account so we have to
1003  * think hard. The only variables accessed outside locks
1004  * are the semaphore value itself and the boolean flag
1005  * Sema.waiting. The value is only accessed with cmpswap,
1006  * whose job description includes doing the right thing as
1007  * far as memory coherence across processors. That leaves
1008  * Sema.waiting. To handle it, we call coherence() before each
1009  * read and after each write. - rsc
1010  */
1011 
1012 /* Add semaphore p with addr a to list in seg. */
1013 static void
1014 semqueue(Segment *s, long *a, Sema *p)
1015 {
1016  memset(p, 0, sizeof *p);
1017  p->addr = a;
1018  lock(&s->sema); /* uses s->sema.Rendez.Lock, but no one else is */
1019  p->next = &s->sema;
1020  p->prev = s->sema.prev;
1021  p->next->prev = p;
1022  p->prev->next = p;
1023  unlock(&s->sema);
1024 }
1025 
1026 /* Remove semaphore p from list in seg. */
1027 static void
1028 semdequeue(Segment *s, Sema *p)
1029 {
1030  lock(&s->sema);
1031  p->next->prev = p->prev;
1032  p->prev->next = p->next;
1033  unlock(&s->sema);
1034 }
1035 
1036 /* Wake up n waiters with addr a on list in seg. */
1037 static void
1038 semwakeup(Segment *s, long *a, long n)
1039 {
1040  Sema *p;
1041 
1042  lock(&s->sema);
1043  for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
1044  if(p->addr == a && p->waiting){
1045  p->waiting = 0;
1046  coherence();
1047  wakeup(p);
1048  n--;
1049  }
1050  }
1051  unlock(&s->sema);
1052 }
1053 
1054 /* Add delta to semaphore and wake up waiters as appropriate. */
1055 static long
1056 semrelease(Segment *s, long *addr, long delta)
1057 {
1058  long value;
1059 
1060  do
1061  value = *addr;
1062  while(!cmpswap(addr, value, value+delta));
1063  semwakeup(s, addr, delta);
1064  return value+delta;
1065 }
1066 
/*
 * Try to take one unit from the semaphore at addr with cmpswap().
 * Returns 1 once a decrement succeeds, or 0 as soon as the value is
 * seen to be zero or negative.
 */
static int
canacquire(long *addr)
{
	long cur;

	for(;;){
		cur = *addr;
		if(cur <= 0)
			return 0;
		if(cmpswap(addr, cur, cur-1))
			return 1;
	}
}
1078 
1079 /* Should we wake up? */
1080 static int
1081 semawoke(void *p)
1082 {
1083  coherence();
1084  return !((Sema*)p)->waiting;
1085 }
1086 
/*
 * Take one unit from the semaphore at addr, which lies in segment s.
 *
 * Returns 1 once the semaphore has been acquired.  Returns 0 without
 * waiting when the first attempt fails and block is 0.  If an error is
 * raised while sleeping, the Sema is dequeued, a wakeup that was already
 * delivered is passed on to another waiter, and the error is re-raised
 * with nexterror().
 */
1087 /* Acquire semaphore (subtract 1). */
1088 static int
1089 semacquire(Segment *s, long *addr, int block)
1090 {
1091  int acquired;
1092  Sema phore;
1093 
  /* fast path: no queueing when the semaphore is immediately available */
1094  if(canacquire(addr))
1095  return 1;
1096  if(!block)
1097  return 0;
1098 
1099  acquired = 0;
1100  semqueue(s, addr, &phore);
1101  for(;;){
  /* announce the wait before the final check, so a release in between is not missed */
1102  phore.waiting = 1;
1103  coherence();
1104  if(canacquire(addr)){
1105  acquired = 1;
1106  break;
1107  }
  /* an error raised inside sleep() lands here and leaves the loop with acquired == 0 */
1108  if(waserror())
1109  break;
1110  sleep(&phore, semawoke, &phore);
1111  poperror();
1112  }
1113  semdequeue(s, &phore);
1114  coherence(); /* not strictly necessary due to lock in semdequeue */
  /* waiting was cleared by a releaser: pass that wakeup on to another waiter */
1115  if(!phore.waiting)
1116  semwakeup(s, addr, 1);
1117  if(!acquired)
1118  nexterror();
1119  return 1;
1120 }
1121 
/*
 * Like semacquire, but waits for at most ms milliseconds.
 *
 * Returns 1 once the semaphore has been acquired.  Returns 0 when the
 * first attempt fails and ms is 0, or when the time spent in tsleep()
 * reaches ms.  If an error is raised while sleeping, the Sema is
 * dequeued, a wakeup that was already delivered is passed on, and the
 * error is re-raised with nexterror().
 */
1122 /* Acquire semaphore or time-out */
1123 static int
1124 tsemacquire(Segment *s, long *addr, ulong ms)
1125 {
1126  int acquired, timedout;
1127  ulong t;
1128  Sema phore;
1129 
1130  if(canacquire(addr))
1131  return 1;
1132  if(ms == 0)
1133  return 0;
1134  acquired = timedout = 0;
1135  semqueue(s, addr, &phore);
1136  for(;;){
  /* announce the wait before the final check, so a release in between is not missed */
1137  phore.waiting = 1;
1138  coherence();
1139  if(canacquire(addr)){
1140  acquired = 1;
1141  break;
1142  }
1143  if(waserror())
1144  break;
  /* measure how long this tsleep() lasted, in ms, and charge it against the budget */
1145  t = MACHP(0)->ticks;
1146  tsleep(&phore, semawoke, &phore, ms);
1147  t = TK2MS(MACHP(0)->ticks - t);
1148  poperror();
1149  if(t >= ms){
1150  timedout = 1;
1151  break;
1152  }
1153  ms -= t;
1154  }
1155  semdequeue(s, &phore);
1156  coherence(); /* not strictly necessary due to lock in semdequeue */
  /* waiting was cleared by a releaser: pass that wakeup on to another waiter */
1157  if(!phore.waiting)
1158  semwakeup(s, addr, 1);
1159  if(timedout)
1160  return 0;
1161  if(!acquired)
1162  nexterror();
1163  return 1;
1164 }
1165 
1166 uintptr
1167 syssemacquire(va_list list)
1168 {
1169  int block;
1170  long *addr;
1171  Segment *s;
1172 
1173  addr = va_arg(list, long*);
1174  block = va_arg(list, int);
1175  evenaddr((uintptr)addr);
1176  s = seg(up, (uintptr)addr, 0);
1177  if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
1178  validaddr((uintptr)addr, sizeof(long), 1);
1179  error(Ebadarg);
1180  }
1181  if(*addr < 0)
1182  error(Ebadarg);
1183  return (uintptr)semacquire(s, addr, block);
1184 }
1185 
1186 uintptr
1187 systsemacquire(va_list list)
1188 {
1189  long *addr;
1190  ulong ms;
1191  Segment *s;
1192 
1193  addr = va_arg(list, long*);
1194  ms = va_arg(list, ulong);
1195  evenaddr((uintptr)addr);
1196  s = seg(up, (uintptr)addr, 0);
1197  if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
1198  validaddr((uintptr)addr, sizeof(long), 1);
1199  error(Ebadarg);
1200  }
1201  if(*addr < 0)
1202  error(Ebadarg);
1203  return (uintptr)tsemacquire(s, addr, ms);
1204 }
1205 
1206 uintptr
1207 syssemrelease(va_list list)
1208 {
1209  long *addr, delta;
1210  Segment *s;
1211 
1212  addr = va_arg(list, long*);
1213  delta = va_arg(list, long);
1214  evenaddr((uintptr)addr);
1215  s = seg(up, (uintptr)addr, 0);
1216  if(s == nil || (s->type&SG_RONLY) != 0 || (uintptr)addr+sizeof(long) > s->top){
1217  validaddr((uintptr)addr, sizeof(long), 1);
1218  error(Ebadarg);
1219  }
1220  /* delta == 0 is a no-op, not a release */
1221  if(delta < 0 || *addr < 0)
1222  error(Ebadarg);
1223  return (uintptr)semrelease(s, addr, delta);
1224 }
1225 
1226 /* For binary compatibility */
1227 uintptr
1228 sys_nsec(va_list list)
1229 {
1230  vlong *v;
1231 
1232  /* return in register on 64bit machine */
1233  if(sizeof(uintptr) == sizeof(vlong)){
1234  USED(list);
1235  return (uintptr)todget(nil);
1236  }
1237 
1238  v = va_arg(list, vlong*);
1239  evenaddr((uintptr)v);
1240  validaddr((uintptr)v, sizeof(vlong), 1);
1241  *v = todget(nil);
1242  return 0;
1243 }