changelog shortlog tags branches files raw gz bz2 help

Mercurial > hg > ventivac / changeset: rabinparams, for testing parameters for the rabin module.

changeset 86: 7bf17e4341ab
parent 85: fce22d6a4f48
child 87: afca2ca2ff3a
author: Mechiel Lukkien <mechiel@ueber.net>
date: Wed, 18 Jul 2007 20:23:35 +0200
files: README appl/cmd/mkfile appl/cmd/rabinparams.b appl/lib/rabin.b
description: rabinparams, for testing parameters for the rabin module.

next is to play with this and find decent values.
     1.1--- a/README	Wed Jul 18 15:34:36 2007 +0200
     1.2+++ b/README	Wed Jul 18 20:23:35 2007 +0200
     1.3@@ -18,3 +18,14 @@
     1.4 3. now just running "mk install" will copy module/vac.m into your /module,
     1.5    and compile all limbo files and install them as well.  note that "mk
     1.6    install" only works on posix systems (i.e. not inside inferno).
     1.7+
     1.8+
     1.9+other notes:
    1.10+
    1.11+- appl/cmd/testrabin.b reads a file and splits it using appl/lib/rabin.b.
    1.12+  the parameters for the rabin algorithm can be set from the command-line,
    1.13+  making it usable for testing.
    1.14+- appl/cmd/rabinparams.b walks a directory and splits all files with rabin
    1.15+  fingerprinting.  when done, it prints a histogram of each occuring
    1.16+  block size, how many blocks were found and what the mean block size was.
    1.17+  it can also print all boundaries to stderr.
     2.1--- a/appl/cmd/mkfile	Wed Jul 18 15:34:36 2007 +0200
     2.2+++ b/appl/cmd/mkfile	Wed Jul 18 20:23:35 2007 +0200
     2.3@@ -12,6 +12,7 @@
     2.4         vtest.dis\
     2.5         ventisrv.dis\
     2.6 	testrabin.dis\
     2.7+	rabinparams.dis\
     2.8 
     2.9 MODULES=\
    2.10 
     3.1--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2+++ b/appl/cmd/rabinparams.b	Wed Jul 18 20:23:35 2007 +0200
     3.3@@ -0,0 +1,200 @@
     3.4+implement Rabinparams;
     3.5+
     3.6+include "sys.m";
     3.7+	sys: Sys;
     3.8+include "draw.m";
     3.9+include "arg.m";
    3.10+include "bufio.m";
    3.11+	bufio: Bufio;
    3.12+	Iobuf: import bufio;
    3.13+include "rabin.m";
    3.14+	rabin: Rabin;
    3.15+
    3.16+print, sprint, fprint, fildes: import sys;
    3.17+Rcfg, Rfile: import rabin;
    3.18+
    3.19+bflag, dflag, vflag: int;
    3.20+prime, mod, width, mask, min, max: int;
    3.21+rcfg: ref Rcfg;
    3.22+berr: ref Iobuf;
    3.23+
    3.24+totalbytes, blocks: big;
    3.25+blocksizes: array of int;
    3.26+bounds: array of array of (array of byte, int);
    3.27+
    3.28+Rabinparams: module {
    3.29+	init:	fn(nil: ref Draw->Context, nil: list of string);
    3.30+};
    3.31+
    3.32+init(nil: ref Draw->Context, args: list of string)
    3.33+{
    3.34+	sys = load Sys Sys->PATH;
    3.35+	arg := load Arg Arg->PATH;
    3.36+	bufio = load Bufio Bufio->PATH;
    3.37+	rabin = load Rabin Rabin->PATH;
    3.38+	if(rabin == nil)
    3.39+		fail(sprint("loading module rabin %s: %r", Rabin->PATH));
    3.40+	rabin->init(bufio);
    3.41+
    3.42+	prime = 269;
    3.43+	mod = 1<<10;
    3.44+	width = 3;
    3.45+	mask = mod-1;
    3.46+	min = 1024;
    3.47+	max = 32*1024;
    3.48+
    3.49+	blocks = big 0;
    3.50+	blocksizes = array[56*1024] of {* => 0};
    3.51+	bounds = array[256] of array of (array of byte, int);
    3.52+
    3.53+	arg->init(args);
    3.54+	arg->setusage(arg->progname()+" [-bv] [-p prime] [-n width] [-m mod] [-k mask] [-s min] [-S max] dir ...");
    3.55+	while((c := arg->opt()) != 0)
    3.56+		case c {
    3.57+		'b' =>	bflag++;
    3.58+		'd' =>	dflag++;
    3.59+		'p' =>	prime = int arg->earg();
    3.60+		'n' =>	width = int arg->earg();
    3.61+		'm' =>	mod = int arg->earg();
    3.62+		'k' =>	mask = int arg->earg();
    3.63+		's' =>	min = int arg->earg();
    3.64+		'S' =>	max = int arg->earg();
    3.65+		'v' =>	vflag++;
    3.66+		* =>	arg->usage();
    3.67+		}
    3.68+	args = arg->argv();
    3.69+
    3.70+	if(len args == 0)
    3.71+		arg->usage();
    3.72+	rabin->debug = dflag;
    3.73+
    3.74+	rerr: string;
    3.75+	(rcfg, rerr) = Rcfg.mk(prime, width, mod, mask);
    3.76+	if(rerr != nil)
    3.77+		fail("rabincfg: "+rerr);
    3.78+
    3.79+	if(vflag > 1) {
    3.80+		berr = bufio->fopen(fildes(2), Bufio->OWRITE);
    3.81+		if(berr == nil)
    3.82+			fail(sprint("bufio open stderr: %r"));
    3.83+	}
    3.84+
    3.85+	for(; args != nil; args = tl args)
    3.86+		walk(hd args);
    3.87+	if(berr != nil)
    3.88+		berr.close();
    3.89+
    3.90+	print("blocksizes:\n");
    3.91+	for(i := 0; i < len blocksizes; i++)
    3.92+		if(blocksizes[i] == 0)
    3.93+			continue;
    3.94+		else
    3.95+			print("%10d  %d\n", i, blocksizes[i]);
    3.96+	if(bflag) {
    3.97+		nbounds := 0;
    3.98+		for(i = 0; i < len bounds; i++) {
    3.99+			for(j := 0; j < len bounds[i]; j++)
   3.100+				print("%-20s %d\n", fmtbound(bounds[i][j].t0), bounds[i][j].t1);
   3.101+			nbounds += len bounds[i];
   3.102+		}
   3.103+		print("nbounds=%d\n", nbounds);
   3.104+	}
   3.105+	print("blocks=%bd\n", blocks);
   3.106+	avg := 0;
   3.107+	if(blocks != big 0)
   3.108+		avg = int (totalbytes/blocks);
   3.109+	print("average blocksize=%d\n", avg);
   3.110+}
   3.111+
   3.112+walk(path: string)
   3.113+{
   3.114+	if(vflag)
   3.115+		print("walk %s\n", path);
   3.116+	fd := sys->open(path, Sys->OREAD);
   3.117+	if(fd == nil)
   3.118+		fail(sprint("opening %s: %r", path));
   3.119+	(ok, dir) := sys->fstat(fd);
   3.120+	if(ok < 0)
   3.121+		fail(sprint("fstat %s: %r", path));
   3.122+	if(dir.mode & Sys->DMDIR) {
   3.123+		for(;;) {
   3.124+			(n, d) := sys->dirread(fd);
   3.125+			if(n == 0)
   3.126+				break;
   3.127+			if(n < 0)
   3.128+				fail(sprint("dirread %s: %r", path));
   3.129+			for(i := 0; i < n; i++)
   3.130+				walk(path+"/"+d[i].name);
   3.131+		}
   3.132+	} else {
   3.133+		b := bufio->fopen(fd, Bufio->OREAD);
   3.134+		if(b == nil)
   3.135+			fail(sprint("bufio open %s: %r", path));
   3.136+		(rfile, oerr) := rabin->open(rcfg, b, min, max);
   3.137+		if(oerr != nil)
   3.138+			fail("rfile open: "+oerr);
   3.139+
   3.140+		i := 0;
   3.141+		for(;;) {
   3.142+			(d, nil, err) := rfile.read();
   3.143+			if(err != nil)
   3.144+				fail("rfile read: "+err);
   3.145+			if(len d == 0)
   3.146+				break;
   3.147+			if(i > 0 && len d > min && len d < max)
   3.148+				note(d[:width], len d);
   3.149+			i++;
   3.150+		}
   3.151+		b.close();
   3.152+	}
   3.153+}
   3.154+
   3.155+eq(d1, d2: array of byte): int
   3.156+{
   3.157+	if(len d1 != len d2)
   3.158+		return 0;
   3.159+	for(i := 0; i < len d1; i++)
   3.160+		if(d1[i] != d2[i])
   3.161+			return 0;
   3.162+	return 1;
   3.163+}
   3.164+
   3.165+note(d: array of byte, n: int)
   3.166+{
   3.167+	totalbytes += big n;
   3.168+	blocks++;
   3.169+	blocksizes[n]++;
   3.170+	buck := int d[0];
   3.171+	if(bflag) {
   3.172+		for(i := 0; i < len bounds[buck]; i++) {
   3.173+			if(eq(bounds[buck][i].t0, d)) {
   3.174+				bounds[buck][i].t1++;
   3.175+				return;
   3.176+			}
   3.177+		}
   3.178+		nb := array[len bounds[buck]+1] of (array of byte, int);
   3.179+		nb[:] = bounds[buck];
   3.180+		nb[len bounds[buck]] = (d, 1);
   3.181+		bounds[buck] = nb;
   3.182+	}
   3.183+	if(berr != nil)
   3.184+		berr.puts(sprint("%-20s\n", fmtbound(d)));
   3.185+}
   3.186+
   3.187+fmtbound(d: array of byte): string
   3.188+{
   3.189+	s := "";
   3.190+	for(i := 0; i < len d; i++) {
   3.191+		if(d[i] >= byte 16r20 && d[i] <= byte 16r7f)
   3.192+			s += sprint("%c", int d[i]);
   3.193+		else
   3.194+			s += sprint("\\x%02x", int d[i]);
   3.195+	}
   3.196+	return "\""+s+"\"";
   3.197+}
   3.198+
   3.199+fail(s: string)
   3.200+{
   3.201+	fprint(fildes(2), "%s\n", s);
   3.202+	raise "fail:"+s;
   3.203+}
     4.1--- a/appl/lib/rabin.b	Wed Jul 18 15:34:36 2007 +0200
     4.2+++ b/appl/lib/rabin.b	Wed Jul 18 20:23:35 2007 +0200
     4.3@@ -89,7 +89,7 @@
     4.4 		new := (prime * (c.state - c.rcfg.tab[int c.buf[c.n-width]]) + ch) & (mod-1);
     4.5 		c.n++;
     4.6 		c.state = new;
     4.7-		if(c.n-width >= c.max || (c.n-width >= c.min && (new & mask) == 0)) {
     4.8+		if(c.n-width >= c.max || (c.n-width >= c.min && (new & mask) == mask)) {
     4.9 			d := array[c.n-width] of byte;
    4.10 			d[:] = c.buf[:len d];
    4.11 			off := c.off;
    4.12@@ -107,9 +107,3 @@
    4.13 	if(debug)
    4.14 		sys->fprint(sys->fildes(2), "%s\n", s);
    4.15 }
    4.16-
    4.17-fail(s: string)
    4.18-{
    4.19-	fprint(fildes(2), "%s\n", s);
    4.20-	raise "fail:"+s;
    4.21-}