changelog shortlog tags branches changeset files revisions annotate raw help

Mercurial > hg > plan9front / sys/src/9/pc/sdnvme.c

changeset 5847: f0c30306e7d5
child: 30cf99b1c789
author: cinap_lenrek@felloff.net
date: Wed, 29 Mar 2017 00:21:35 +0200
permissions: -rw-r--r--
description: sdnvme: NVMe controller driver (work in progress)

basic NVMe controller driver, reads and writes work.
"namespaces" show up as logical units.
uses pin/msi interrupts (no msi-x support yet).
one submission queue per cpu, shared completion queue.
no recovery from fatal controller errors.
only tested in qemu (no hardware available).

committing this so it can be found by someone who has
hardware.
1 #include "u.h"
2 #include "../port/lib.h"
3 #include "mem.h"
4 #include "dat.h"
5 #include "fns.h"
6 #include "io.h"
7 #include "ureg.h"
8 #include "../port/error.h"
9 
10 #include "../port/sd.h"
11 
/* type names for the structures defined below */
12 typedef struct WS WS;
13 typedef struct CQ CQ;
14 typedef struct SQ SQ;
15 typedef struct Ctlr Ctlr;
16 
/*
 * Wait state for one outstanding command.
 * It is filled in by qcmd(), completed by nvmeintr() and waited on by wcmd();
 * the callers in this file keep it on their stack between those calls.
 */
17 struct WS
18 {
19  u32int cdw0; /* dword 0 of the completion entry, stored by nvmeintr() */
20  ushort status; /* completion dword 3 shifted right by 17; 0 is treated as success */
21  Rendez *sleep; /* rendezvous of the submitting process (&up->sleep) */
22  WS **link; /* slot in queue->wait[] that points at this WS while the command is pending */
23  SQ *queue; /* submission queue the command was placed on */
24 };
25 
/*
 * Completion queue state.
 * Entries are 4 dwords each; nvmeintr() consumes them starting at head.
 */
26 struct CQ
27 {
28  u32int head; /* free-running index of the next entry to examine; masked on use */
29  u32int mask; /* number of entries minus one, (1<<shift)-1 */
30  u32int shift; /* log2 of the number of entries (log2 of byte size minus 4) */
31  u32int *base; /* queue memory from mallocalign(); nil when the queue is not allocated */
32  Ctlr *ctlr; /* owning controller */
33 };
34 
/*
 * Submission queue state.
 * Entries are 16 dwords each; qcmd() fills them in at tail.
 */
35 struct SQ
36 {
37  u32int tail; /* free-running index of the next entry to fill; masked on use */
38  u32int mask; /* number of entries minus one, (1<<shift)-1 */
39  u32int shift; /* log2 of the number of entries (log2 of byte size minus 6) */
40  u32int *base; /* queue memory from mallocalign() */
41  WS **wait; /* one slot per entry: the WS waiting on that entry, or nil when free */
42  Ctlr *ctlr; /* owning controller */
43 };
44 
/* Per-controller driver state; one is allocated per matching PCI device. */
45 struct Ctlr
46 {
47  QLock; /* held from qcmd() to wcmd() for admin commands (sq[0]) */
48 
49  Lock intr; /* taken with ilock() around interrupt processing and mask updates */
50  u32int ints; /* interrupt bits currently enabled; 0 makes nvmeintr() return at once */
51  u32int irqc[2]; /* not referenced by the code in this file */
52 
53  Pcidev *pci;
54  u32int *reg; /* vmap() of BAR0, indexed in dwords by the register enum */
55 
56  u64int cap; /* Cap0 | Cap1<<32 as read in nvmepnpctlrs() */
57  uchar *ident; /* 0x1000 byte buffer filled by the "identify controller" command */
58  u32int *nsid; /* 0x1000 byte buffer filled by the "namespace list" command */
59  int nnsid; /* number of leading non-zero entries in nsid[], at most 1024 */
60 
61  u32int mps; /* mps = 1<<mpsshift */
62  u32int mpsshift;
63  u32int dstrd; /* bits 32..35 of cap; used as a shift when indexing doorbells */
64 
65  CQ cq[1+1]; /* cq[0]: admin queue, cq[1]: shared completion queue */
66  SQ sq[1+MAXMACH]; /* sq[0]: admin queue, sq[1+machno]: one queue per cpu */
67 
68  Ctlr *next; /* link in the list built by nvmepnpctlrs() */
69 };
70 
/*
 * Controller registers, expressed as indices into the u32int
 * register window (byte offset divided by 4).
 */
enum {
	Cap0		= 0x00/4,
	Cap1		= 0x04/4,
	Ver		= 0x08/4,
	IntMs		= 0x0C/4,
	IntMc		= 0x10/4,
	CCfg		= 0x14/4,

	CSts		= 0x1C/4,
	Nssr		= 0x20/4,
	AQAttr		= 0x24/4,
	ASQBase0	= 0x28/4,
	ASQBase1	= 0x2C/4,
	ACQBase0	= 0x30/4,
	ACQBase1	= 0x34/4,

	/* first doorbell; the rest follow at a stride chosen by Ctlr.dstrd */
	DBell		= 0x1000/4,
};
90 
/*
 * Claim the next entry of a submission queue and fill in its common fields.
 *
 * adm != 0: takes the controller QLock and uses the admin queue sq[0].
 * adm == 0: raises to splhi() and uses this cpu's queue sq[1+m->machno].
 * Either way the lock/spl state is left held on return; wcmd() releases it,
 * so every qcmd() must be followed by a wcmd() on the same WS.
 *
 * ws	wait state to register in the queue's wait[] slot
 * opc	command opcode, stored in the low bits of dword 0
 * nsid	stored in dword 1
 * mptr	optional buffer whose physical address goes in dwords 4-5 (nil: zero)
 * data, len	data buffer; its physical address goes in dwords 6-7 when len > 0
 *
 * Returns a pointer to the 16-dword entry; dwords 10..15 are left for the
 * caller to fill in before calling wcmd().
 */
91 static u32int*
92 qcmd(WS *ws, Ctlr *ctlr, int adm, u32int opc, u32int nsid, void *mptr, void *data, ulong len)
93 {
94  u32int cid, *e;
95  u64int pa;
96  SQ *sq;
97 
98  if(!adm){
99  Retry:
100  splhi();
101  sq = &ctlr->sq[1+m->machno];
102  } else {
103  qlock(ctlr);
104  sq = &ctlr->sq[0];
105  }
106  ws->sleep = &up->sleep;
107  ws->queue = sq;
108  ws->link = &sq->wait[sq->tail & sq->mask];
 /* the slot is still owned by an earlier, uncompleted command: queue is full */
109  while(*ws->link != nil){
110  sched();
111  if(!adm){
112  /* should be very rare */
 /* start over: the queue is re-selected from m->machno after sched() */
113  goto Retry;
114  }
115  }
116  *ws->link = ws;
117 
 /* each entry is 16 dwords; the free-running tail doubles as the command id */
118  e = &sq->base[((cid = sq->tail++) & sq->mask)<<4];
119  e[0] = opc | cid<<16;
120  e[1] = nsid;
121  e[2] = 0;
122  e[3] = 0;
123  if(mptr != nil){
124  pa = PADDR(mptr);
125  e[4] = pa;
126  e[5] = pa>>32;
127  } else {
128  e[4] = 0;
129  e[5] = 0;
130  }
131  if(len > 0){
132  pa = PADDR(data);
133  e[6] = pa;
134  e[7] = pa>>32;
 /*
  * if the transfer runs past the end of the first mps-sized page,
  * dwords 8-9 get the address of the next page boundary, else zero.
  * only one such second address is provided, so callers keep len
  * within two pages (see nvmebio).
  */
135  if(len > ctlr->mps - (pa & ctlr->mps-1))
136  pa += ctlr->mps - (pa & ctlr->mps-1);
137  else
138  pa = 0;
139  } else {
140  e[6] = 0;
141  e[7] = 0;
142  pa = 0;
143  }
144  e[8] = pa;
145  e[9] = pa>>32;
146  return e;
147 }
148 
/*
 * Interrupt handler; also called directly from wcmd() to poll.
 * Drains every allocated completion queue, hands each completion to the
 * WS registered in the matching submission queue slot and wakes its owner.
 * arg is the Ctlr; the Ureg argument is unused.
 */
149 static void
150 nvmeintr(Ureg *, void *arg)
151 {
152  u32int phaseshift, *e;
153  WS *ws, **wp;
154  Ctlr *ctlr;
155  SQ *sq;
156  CQ *cq;
157 
158  ctlr = arg;
 /* nothing enabled (controller disabled or not yet set up) */
159  if(ctlr->ints == 0)
160  return;
161 
162  ilock(&ctlr->intr);
 /* IntMs is written here and IntMc on the way out: presumably mask set / mask clear */
163  ctlr->reg[IntMs] = ctlr->ints;
164  for(cq = &ctlr->cq[nelem(ctlr->cq)-1]; cq >= ctlr->cq; cq--){
165  if(cq->base == nil)
166  continue;
 /* aligns the wrap bit of head (bit cq->shift) with bit 16 of completion dword 3 */
167  phaseshift = 16 - cq->shift;
168  for(;; cq->head++){
 /* completion entries are 4 dwords each */
169  e = &cq->base[(cq->head & cq->mask)<<2];
 /* stop when bit 16 of the entry equals head's wrap bit: not a new entry */
170  if(((e[3] ^ (cq->head << phaseshift)) & 0x10000) == 0)
171  break;
172 
173  if(0) iprint("nvmeintr: cq%d [%.4ux] %.8ux %.8ux %.8ux %.8ux\n",
174  (int)(cq - ctlr->cq), cq->head & cq->mask,
175  e[0], e[1], e[2], e[3]);
176 
 /* upper half of dword 2 selects the submission queue, low bits of dword 3 the wait slot */
177  sq = &ctlr->sq[e[2] >> 16];
178  wp = &sq->wait[e[3] & sq->mask];
179  if((ws = *wp) != nil && ws->link == wp){
 /* take the Rendez first: ws is not touched again after its slot is cleared */
180  Rendez *z = ws->sleep;
181  ws->cdw0 = e[0];
182  ws->status = e[3]>>17;
 /* clearing the slot is what makes wdone() true and frees it for qcmd() */
183  *wp = nil;
184  wakeup(z);
185  }
186  }
 /* report the new head through this queue's doorbell (odd doorbell slot) */
187  ctlr->reg[DBell + ((cq-ctlr->cq)*2+1 << ctlr->dstrd)] = cq->head & cq->mask;
188  }
 /* low two status bits other than 01 are only reported; there is no recovery here */
189  if((ctlr->reg[CSts] & 3) != 1)
190  iprint("nvmeintr: fatal controller error\n");
191  ctlr->reg[IntMc] = ctlr->ints;
192  iunlock(&ctlr->intr);
193 }
194 
195 static int
196 wdone(void *arg)
197 {
198  WS *ws = arg;
199  return *ws->link != ws;
200 }
201 
/*
 * Submit the command prepared by qcmd() and wait for its completion.
 * Rings the submission queue doorbell, undoes the splhi()/qlock() taken
 * by qcmd(), then sleeps until nvmeintr() clears the wait slot.
 * Returns the completion status stored in ws->status (0 on success).
 */
202 static u32int
203 wcmd(WS *ws)
204 {
205  SQ *sq = ws->queue;
206  Ctlr *ctlr = sq->ctlr;
207 
 /* order the writes to the queue entry before the doorbell write */
208  coherence();
 /* even doorbell slot of this queue gets the new tail */
209  ctlr->reg[DBell + ((sq-ctlr->sq)*2+0 << ctlr->dstrd)] = sq->tail & sq->mask;
210  if(sq > ctlr->sq) {
 /* i/o queue: qcmd() went splhi(), so we must still be on the same cpu */
211  assert(sq == &ctlr->sq[1+m->machno]);
212  spllo();
213  } else
 /* admin queue: release the QLock taken in qcmd() */
214  qunlock(sq->ctlr);
 /*
  * an error raised while sleeping returns here and is ignored, so the
  * wait is never abandoned (presumably because ws lives on the caller's
  * stack and is still referenced from the queue).
  */
215  while(waserror())
216  ;
217  tsleep(ws->sleep, wdone, ws, 5);
 /* not done after the first sleep: poll the completion queues by hand */
218  while(!wdone(ws)){
219  nvmeintr(nil, ctlr);
220  tsleep(ws->sleep, wdone, ws, 10);
221  }
222  poperror();
223  return ws->status;
224 }
225 
226 void
227 checkstatus(u32int status, char *info)
228 {
229  if(status == 0)
230  return;
231  snprint(up->genbuf, sizeof(up->genbuf), "%s: status %ux", info, status);
232  error(up->genbuf);
233 }
234 
235 static long
236 nvmebio(SDunit *u, int lun, int write, void *a, long count, uvlong lba)
237 {
238  u32int nsid, s, n, m, *e;
239  Ctlr *ctlr;
240  uchar *p;
241  WS ws;
242 
243  USED(lun);
244 
245  ctlr = u->dev->ctlr;
246  nsid = ctlr->nsid[u->subno];
247  s = u->secsize;
248  p = a;
249  while(count > 0){
250  m = (2*ctlr->mps - ((uintptr)p & ctlr->mps-1)) / s;
251  if((n = count) > m)
252  n = m;
253  e = qcmd(&ws, ctlr, 0, write ? 0x01 : 0x02, nsid, nil, p, n*s);
254  e[10] = lba;
255  e[11] = lba>>32;
256  e[12] = n-1;
257  e[13] = (count>n)<<6; /* sequential request */
258  e[14] = 0;
259  e[15] = 0;
260  checkstatus(wcmd(&ws), write ? "write" : "read");
261  p += n*s;
262  count -= n;
263  lba += n;
264  }
265  return p - (uchar*)a;
266 }
267 
268 static int
269 nvmerio(SDreq *r)
270 {
271  int i, count, rw;
272  uvlong lba;
273  SDunit *u;
274 
275  u = r->unit;
276  if(r->cmd[0] == 0x35 || r->cmd[0] == 0x91)
277  return sdsetsense(r, SDok, 0, 0, 0);
278  if((i = sdfakescsi(r)) != SDnostatus)
279  return r->status = i;
280  if((i = sdfakescsirw(r, &lba, &count, &rw)) != SDnostatus)
281  return i;
282  r->rlen = nvmebio(u, r->lun, rw == SDwrite, r->data, count, lba);
283  return r->status = SDok;
284 }
285 
286 static int
287 nvmeverify(SDunit *u)
288 {
289  Ctlr *ctlr = u->dev->ctlr;
290  return u->subno < ctlr->nnsid;
291 }
292 
293 static int
294 nvmeonline(SDunit *u)
295 {
296  u32int *e, lbaf;
297  uchar *info, *p;
298  Ctlr *ctlr;
299  WS ws;
300 
301  if(u->sectors != 0)
302  return 1;
303 
304  ctlr = u->dev->ctlr;
305  if((info = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
306  return 0;
307 
308  e = qcmd(&ws, ctlr, 1, 0x06, ctlr->nsid[u->subno], nil, info, 0x1000);
309  e[10] = 0; // identify namespace
310  if(wcmd(&ws) != 0){
311  free(info);
312  return 0;
313  }
314  p = info;
315  u->sectors = p[0] | p[1]<<8 | p[2]<<16 | p[3]<<24
316  | (u64int)p[4]<<32
317  | (u64int)p[5]<<40
318  | (u64int)p[6]<<48
319  | (u64int)p[7]<<56;
320  p = &info[128 + 4*(info[26]&15)];
321  lbaf = p[0] | p[1]<<8 | p[2]<<16 | p[3]<<24;
322  u->secsize = 1<<((lbaf>>16)&0xFF);
323  free(info);
324 
325  memset(u->inquiry, 0, sizeof u->inquiry);
326  u->inquiry[2] = 2;
327  u->inquiry[3] = 2;
328  u->inquiry[4] = sizeof u->inquiry - 4;
329  memmove(u->inquiry+8, ctlr->ident+24, 20);
330 
331  return 2;
332 }
333 
334 static int
335 nvmerctl(SDunit *u, char *p, int l)
336 {
337  Ctlr *ctlr;
338  char *e, *s;
339 
340  if((ctlr = u->dev->ctlr) == nil || ctlr->ident == nil)
341  return 0;
342 
343  e = p+l;
344  s = p;
345 
346  p = seprint(p, e, "model\t%.20s\n", (char*)ctlr->ident+24);
347  p = seprint(p, e, "serial\t%.10s\n", (char*)ctlr->ident+4);
348  p = seprint(p, e, "firm\t%.6s\n", (char*)ctlr->ident+64);
349  p = seprint(p, e, "geometry %llud %lud\n", u->sectors, u->secsize);
350 
351  return p-s;
352 }
353 
354 static void*
355 cqalloc(Ctlr *ctlr, CQ *cq, u32int lgsize)
356 {
357  cq->ctlr = ctlr;
358  cq->head = 0;
359  cq->shift = lgsize-4;
360  cq->mask = (1<<cq->shift)-1;
361  if((cq->base = mallocalign(1<<lgsize, ctlr->mps, 0, 0)) == nil)
362  error(Enomem);
363  memset(cq->base, 0, 1<<lgsize);
364  return cq->base;
365 }
366 
367 static void*
368 sqalloc(Ctlr *ctlr, SQ *sq, u32int lgsize)
369 {
370  sq->ctlr = ctlr;
371  sq->tail = 0;
372  sq->shift = lgsize-6;
373  sq->mask = (1<<sq->shift)-1;
374  if((sq->base = mallocalign(1<<lgsize, ctlr->mps, 0, 0)) == nil)
375  error(Enomem);
376  if((sq->wait = mallocz(sizeof(WS*)*(sq->mask+1), 1)) == nil)
377  error(Enomem);
378  memset(sq->base, 0, 1<<lgsize);
379  return sq->base;
380 }
381 
382 static void
383 setupqueues(Ctlr *ctlr)
384 {
385  u32int lgsize, *e;
386  CQ *cq;
387  SQ *sq;
388  WS ws;
389  int i;
390 
391  /* Overkill */
392  lgsize = 12-6+4;
393  while(lgsize < 16+4 && lgsize < ctlr->mpsshift && 1<<lgsize < conf.nmach<<12-6+4)
394  lgsize++;
395 
396  /* CQID1: shared completion queue */
397  cq = &ctlr->cq[1];
398  cqalloc(ctlr, cq, lgsize);
399  e = qcmd(&ws, ctlr, 1, 0x05, ~0, nil, cq->base, 1<<lgsize);
400  e[10] = (cq - ctlr->cq) | cq->mask<<16;
401  e[11] = 3; /* IEN | PC */
402  checkstatus(wcmd(&ws), "create completion queue");
403 
404  /* SQID[1..nmach]: submission queue per cpu */
405  for(i=1; i<=conf.nmach; i++){
406  sq = &ctlr->sq[i];
407  sqalloc(ctlr, sq, 12);
408  e = qcmd(&ws, ctlr, 1, 0x01, ~0, nil, sq->base, 0x1000);
409  e[10] = i | sq->mask<<16;
410  e[11] = (cq - ctlr->cq)<<16 | 1; /* CQID<<16 | PC */
411  checkstatus(wcmd(&ws), "create submission queue");
412  }
413 
414  ilock(&ctlr->intr);
415  ctlr->ints |= 1<<(cq - ctlr->cq);
416  ctlr->reg[IntMc] = ctlr->ints;
417  iunlock(&ctlr->intr);
418 }
419 
420 static void
421 identify(Ctlr *ctlr)
422 {
423  u32int *e;
424  WS ws;
425 
426  if(ctlr->ident == nil)
427  if((ctlr->ident = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
428  error(Enomem);
429  if(ctlr->nsid == nil)
430  if((ctlr->nsid = mallocalign(0x1000, ctlr->mps, 0, 0)) == nil)
431  error(Enomem);
432 
433  e = qcmd(&ws, ctlr, 1, 0x06, ~0, nil, ctlr->ident, 0x1000);
434  e[10] = 1; // identify controller
435  checkstatus(wcmd(&ws), "identify controller");
436 
437  e = qcmd(&ws, ctlr, 1, 0x06, 0, nil, ctlr->nsid, 0x1000);
438  e[10] = 2; // namespace list
439  checkstatus(wcmd(&ws), "namespace list");
440 
441  ctlr->nnsid = 0;
442  while(ctlr->nnsid < 1024 && ctlr->nsid[ctlr->nnsid] != 0)
443  ctlr->nnsid++;
444 }
445 
446 static int
447 nvmedisable(SDev *sd)
448 {
449  char name[32];
450  Ctlr *ctlr;
451  int i;
452 
453  ctlr = sd->ctlr;
454 
455  /* mask interrupts */
456  ilock(&ctlr->intr);
457  ctlr->ints = 0;
458  ctlr->reg[IntMs] = ~ctlr->ints;
459  iunlock(&ctlr->intr);
460 
461  /* disable controller */
462  ctlr->reg[CCfg] = 0;
463 
464  for(i = 0; i < 10; i++){
465  if((ctlr->reg[CSts] & 1) == 0)
466  break;
467  tsleep(&up->sleep, return0, nil, 100);
468  }
469 
470  snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
471  intrdisable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name);
472 
473  pciclrbme(ctlr->pci); /* dma disable */
474 
475  for(i=0; i<nelem(ctlr->sq); i++){
476  free(ctlr->sq[i].base);
477  free(ctlr->sq[i].wait);
478  }
479  for(i=0; i<nelem(ctlr->cq); i++)
480  free(ctlr->cq[i].base);
481 
482  memset(ctlr->sq, 0, sizeof(ctlr->sq));
483  memset(ctlr->cq, 0, sizeof(ctlr->cq));
484 
485  free(ctlr->ident);
486  ctlr->ident = nil;
487  free(ctlr->nsid);
488  ctlr->nsid = nil;
489  ctlr->nnsid = 0;
490 
491  return 1;
492 }
493 
494 static int
495 nvmeenable(SDev *sd)
496 {
497  char name[32];
498  Ctlr *ctlr;
499  u64int pa;
500  int to;
501 
502  ctlr = sd->ctlr;
503 
504  snprint(name, sizeof(name), "%s (%s)", sd->name, sd->ifc->name);
505  intrenable(ctlr->pci->intl, nvmeintr, ctlr, ctlr->pci->tbdf, name);
506 
507  if(waserror()){
508  print("%s: %s\n", name, up->errstr);
509  nvmedisable(sd);
510  sd->nunit = 0; /* hack: prevent further probing */
511  return 0;
512  }
513 
514  pa = PADDR(cqalloc(ctlr, &ctlr->cq[0], ctlr->mpsshift));
515  ctlr->reg[ACQBase0] = pa;
516  ctlr->reg[ACQBase1] = pa>>32;
517 
518  pa = PADDR(sqalloc(ctlr, &ctlr->sq[0], ctlr->mpsshift));
519  ctlr->reg[ASQBase0] = pa;
520  ctlr->reg[ASQBase1] = pa>>32;
521 
522  ctlr->reg[AQAttr] = ctlr->sq[0].mask | ctlr->cq[0].mask<<16;
523 
524  /* dma enable */
525  pcisetbme(ctlr->pci);
526 
527  /* enable interrupt */
528  ilock(&ctlr->intr);
529  ctlr->ints = 1;
530  ctlr->reg[IntMc] = ctlr->ints;
531  iunlock(&ctlr->intr);
532 
533  /* enable controller */
534  ctlr->reg[CCfg] = 1 | (ctlr->mpsshift-12)<<7 | 6<<16 | 4<<20;
535 
536  for(to = (ctlr->cap>>24) & 255; to >= 0; to--){
537  tsleep(&up->sleep, return0, nil, 500);
538  if((ctlr->reg[CSts] & 3) == 1)
539  goto Ready;
540  }
541  if(ctlr->reg[CSts] & 2)
542  error("fatal controller status during initialization");
543  error("controller initialization timeout");
544 Ready:
545  identify(ctlr);
546  setupqueues(ctlr);
547 
548  poperror();
549 
550  return 1;
551 }
552 
553 static Ctlr*
554 nvmepnpctlrs(void)
555 {
556  Ctlr *ctlr, *h, *t;
557  Pcidev *p;
558  int i;
559 
560  h = t = nil;
561  for(p = nil; p = pcimatch(p, 0, 0);){
562  if(p->ccrb != 1 || p->ccru != 8 || p->ccrp != 2)
563  continue;
564  if(p->mem[0].size == 0)
565  continue;
566  if((ctlr = malloc(sizeof(*ctlr))) == nil){
567  print("nvme: no memory for Ctlr\n");
568  break;
569  }
570  ctlr->pci = p;
571  ctlr->reg = vmap(p->mem[0].bar & ~0xF, p->mem[0].size);
572  if(ctlr->reg == nil){
573  print("nvme: can't vmap bar0\n");
574  Bad:
575  if(ctlr->reg != nil)
576  vunmap(ctlr->reg, p->mem[0].size);
577  free(ctlr);
578  continue;
579  }
580  ctlr->cap = ctlr->reg[Cap0];
581  ctlr->cap |= (u64int)ctlr->reg[Cap1]<<32;
582 
583  /* mask interrupts */
584  ctlr->ints = 0;
585  ctlr->reg[IntMs] = ~ctlr->ints;
586 
587  /* disable controller */
588  ctlr->reg[CCfg] = 0;
589 
590  if((ctlr->cap&(1ULL<<37)) == 0){
591  print("nvme: doesnt support NVM commactlr set: %ux\n",
592  (u32int)(ctlr->cap>>37) & 0xFF);
593  goto Bad;
594  }
595 
596  /* use 64K page size when possible */
597  ctlr->dstrd = (ctlr->cap >> 32) & 15;
598  for(i = (ctlr->cap >> 48) & 15; i < ((ctlr->cap >> 52) & 15); i++){
599  if(i >= 16-12) /* 64K */
600  break;
601  }
602  ctlr->mpsshift = i+12;
603  ctlr->mps = 1 << ctlr->mpsshift;
604 
605  if(h == nil)
606  h = ctlr;
607  else
608  t->next = ctlr;
609  t = ctlr;
610  }
611 
612  return h;
613 }
614 
615 SDifc sdnvmeifc;
616 
617 static SDev*
618 nvmepnp(void)
619 {
620  SDev *s, *h, *t;
621  Ctlr *ctlr;
622  int id;
623 
624  h = t = nil;
625 
626  id = 'N';
627  for(ctlr = nvmepnpctlrs(); ctlr != nil; ctlr = ctlr->next){
628  if((s = malloc(sizeof(*s))) == nil)
629  break;
630  s->ctlr = ctlr;
631  s->idno = id++;
632  s->ifc = &sdnvmeifc;
633  s->nunit = 1024;
634  if(h)
635  t->next = s;
636  else
637  h = s;
638  t = s;
639  }
640 
641  return h;
642 }
643 
/*
 * Interface table handed to the sd layer.
 * The initializer is positional; nil marks operations this driver
 * does not provide.
 */
644 SDifc sdnvmeifc = {
645  "nvme", /* name */
646 
647  nvmepnp, /* pnp */
648  nil, /* legacy */
649  nvmeenable, /* enable */
650  nvmedisable, /* disable */
651 
652  nvmeverify, /* verify */
653  nvmeonline, /* online */
654  nvmerio, /* rio */
655  nvmerctl, /* rctl */
656  nil, /* wctl */
657 
658  nvmebio, /* bio */
659  nil, /* probe */
660  nil, /* clear */
661  nil, /* rtopctl */
662  nil, /* wtopctl */
663 };